import os

import librosa
import numpy as np
import torch
from scipy.io.wavfile import write, read

from model import BigramLanguageModel
from divideStereo import split_stereo, add_stereo


class ProceesAudio:
    audio_data = []
    final_audio = []
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    target_audio = []

    def _split_audio_s(self, audio, sr, segment_length=0.5, overlap=0):
        # Calculate segment and overlap lengths in samples.
        segment_samples = int(segment_length * sr)
        overlap_samples = int(segment_samples * overlap)

        # Split the stereo audio into fixed-length segments while preserving
        # both channels; a trailing segment shorter than segment_samples is dropped.
        segments = []
        for start in range(0, audio.shape[1], segment_samples - overlap_samples):
            segment = audio[:, start:start + segment_samples]
            if segment.shape[1] == segment_samples:
                segments.append(segment)
        return segments, sr

    def _split_audio(self, file_path, segment_length=0.1, overlap=0):
        # Same segmentation as _split_audio_s, but loads a mono file from disk.
        audio, sr = librosa.load(file_path, sr=None)
        segment_samples = int(segment_length * sr)
        overlap_samples = int(segment_samples * overlap)
        segments = []
        for start in range(0, len(audio), segment_samples - overlap_samples):
            segment = audio[start:start + segment_samples]
            if len(segment) == segment_samples:
                segments.append(segment)
        return segments, sr

    def _calculate_average_amplitude(self, segments, sr, n_fft=2048, hop_length=256, num_frequency_bands=100):
        # Per-segment band amplitudes, normalised by the loudest sample across
        # all segments. (n_fft, hop_length and num_frequency_bands are unused
        # leftovers from an earlier STFT-based implementation.)
        ret = []
        audios = []
        max_amp = 0
        for segment in segments:
            max_amp = max(max_amp, np.max(np.abs(segment)))
        for segment in segments:
            amps, _ = split_stereo(segment=segment, max_amp=max_amp, sr=sr, num_parts=10)
            ret.append(amps)
        return ret, audios

    def _get_output_amps(self, input_amps, index):
        # Predict amplitude envelopes for the remaining segments from the first
        # index + 1 observed ones. The checkpoint is loaded onto the CPU, so the
        # input must stay on the CPU as well (moving it to self.device, as the
        # code originally did, fails on CUDA machines).
        model = BigramLanguageModel()
        model.to('cpu', dtype=torch.float64)
        model.load_state_dict(torch.load('amp_net.pth', map_location=torch.device('cpu')))
        context = torch.tensor(np.asarray(input_amps[:index + 1]), dtype=torch.float64)
        context = context.view(1, index + 1, len(input_amps[0]))
        return model.generate(context, len(input_amps) - index + 1)

    def make_smooth(self, audio, gain, prev_gain):
        # Ramp the gain from prev_gain to gain over the first samples to avoid
        # audible clicks at segment boundaries, then apply the full gain.
        smooth_index = min(1000, len(audio))  # guard against segments shorter than the ramp
        audio[:smooth_index] *= np.linspace(prev_gain, gain, num=smooth_index)
        audio[smooth_index:] *= gain
        return audio

    def perform_modulation(self, data, sr, index):
        segments, sr = self._split_audio_s(data, sr=sr)
        max_amp = 0
        for segment in segments:
            max_amp = max(max_amp, np.max(np.abs(segment)))
        x, _ = self._calculate_average_amplitude(segments=segments, sr=sr)
        y = self._get_output_amps(x, index)

        modified_segs = []
        prev_gains = np.ones(10)
        for segment, mod in zip(segments, y[0]):
            _, audios = split_stereo(segment=segment, max_amp=max_amp, sr=sr, num_parts=10)
            final_audios = []
            curr_gains = []
            for i, (audio, target_amp) in enumerate(zip(audios, mod)):
                # Check for a silent band before dividing so the gain
                # computation cannot divide by zero.
                mean_amp = np.mean(np.abs(audio))
                if mean_amp == 0:
                    gain = 0
                else:
                    gain = (target_amp.item() / mean_amp) * max_amp
                    # Map the raw ratio into [0, 1]; anything above 50 is clamped to unity.
                    gain = gain / 50 if gain <= 50 else 1
                audio = self.make_smooth(audio, gain, prev_gains[i])
                curr_gains.append(gain)
                final_audios.append(audio)
            prev_gains = curr_gains
            modified_seg = add_stereo(final_audios, len(final_audios[0]), sample_rate=sr)
            modified_segs.append(modified_seg)
        return np.concatenate(modified_segs).astype(np.float32)

    def get_training_data(self, file_path, data_dir):
        for song in os.listdir(data_dir):
            self.audio_data = []
            # _process_main is expected to be defined elsewhere in the project;
            # it should populate self.audio_data for the given song.
            self._process_main(os.path.join(data_dir, song))
            audio_data = np.array(self.audio_data)
            tensor_data = torch.tensor(audio_data, dtype=torch.float32)
            torch.save(tensor_data, os.path.join(file_path, song + '.data.pt'))
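

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: the file names and
    # the segment index below are hypothetical placeholders. librosa.load with
    # mono=False returns a (channels, samples) array, which is the layout
    # _split_audio_s expects; depending on add_stereo's output layout the
    # result may need to be transposed before writing with scipy's write().
    processor = ProceesAudio()
    stereo, sr = librosa.load('input_song.wav', sr=None, mono=False)
    modulated = processor.perform_modulation(stereo, sr, index=4)
    write('modulated_song.wav', sr, modulated)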