diff --git a/Utils.py b/Utils.py
new file mode 100644
index 0000000..83f147a
--- /dev/null
+++ b/Utils.py
@@ -0,0 +1,124 @@
+import librosa
+import numpy as np
+import torch
+import os
+from scipy.io.wavfile import write, read
+from model import BigramLanguageModel
+from divideStereo import split_stereo, add_stereo
+
+class ProceesAudio():
+    audio_data = []
+
+    final_audio = []
+
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    target_audio = []
+
+    def _split_audio_s(self, audio, sr, segment_length=0.5, overlap=0):
+        # Calculate segment and overlap samples
+        segment_samples = int(segment_length * sr)
+        overlap_samples = int(segment_samples * overlap)
+
+        # Split the stereo audio into segments while preserving stereo channels
+        segments = []
+        for start in range(0, audio.shape[1], segment_samples - overlap_samples):
+            segment = audio[:, start:start + segment_samples]
+            if segment.shape[1] == segment_samples:
+                segments.append(segment)
+
+        return segments, sr
+
+    def _split_audio(self, file_path, segment_length=0.1, overlap=0):
+        # Mono variant: load from disk and split into fixed-length segments,
+        # dropping the trailing partial segment.
+        audio, sr = librosa.load(file_path, sr=None)
+        segment_samples = int(segment_length * sr)
+        overlap_samples = int(segment_samples * overlap)
+        segments = []
+        for start in range(0, len(audio), segment_samples - overlap_samples):
+            segment = audio[start:start + segment_samples]
+            if len(segment) == segment_samples:
+                segments.append(segment)
+        return segments, sr
+
+    def _calculate_average_amplitude(self, segments, sr, num_parts=10):
+        # Normalise against the loudest sample across all segments, then
+        # collect the per-band average amplitudes of each segment.
+        ret = []
+        audios = []
+        max_amp = 0
+        for segment in segments:
+            max_amp = max(max_amp, np.max(np.abs(segment)))
+        for segment in segments:
+            amps, _ = split_stereo(segment=segment, max_amp=max_amp, sr=sr, num_parts=num_parts)
+            ret.append(amps)
+        return ret, audios
+
+    def _get_output_amps(self, input_amps, index):
+        model = BigramLanguageModel()
+        model.to("cpu", dtype=float)
+        model.load_state_dict(torch.load('amp_net.pth', map_location=torch.device('cpu')))
+        # Keep the context on the same device and dtype as the model above.
+        context = torch.tensor(np.array(input_amps[:index + 1])).view(1, index + 1, len(input_amps[0])).to("cpu", dtype=float)
+        return model.generate(context, len(input_amps) - index + 1)
+
+    def make_smooth(self, audio, gain, prev_gain):
+        # Ramp the gain linearly from the previous segment's value over the
+        # first samples to avoid clicks at segment boundaries, then apply
+        # the full gain to the rest.
+        smooth_index = min(1000, len(audio))
+        mult_arr_in = np.linspace(prev_gain, gain, num=smooth_index)
+        audio[:smooth_index] *= mult_arr_in
+        audio[smooth_index:] *= gain
+        return audio
+
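+    # perform_modulation ties the pieces together: split the stereo buffer
+    # into half-second segments, measure per-band amplitudes, ask the model
+    # for target amplitudes for the remaining segments, then rescale each
+    # band with a smoothed gain and reassemble. The /50 rescale and the cap
+    # at 1 below are empirical tuning constants.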
+    def perform_modulation(self, data, sr, index):
+        segments, sr = self._split_audio_s(data, sr=sr)
+        max_amp = 0
+        for segment in segments:
+            max_amp = max(max_amp, np.max(np.abs(segment)))
+
+        x, _ = self._calculate_average_amplitude(segments=segments, sr=sr)
+        y = self._get_output_amps(x, index)
+        modified_segs = []
+        prev_gains = np.ones(10)
+        for segment, mod in zip(segments, y[0]):
+            _, audios = split_stereo(segment=segment, max_amp=max_amp, sr=sr, num_parts=10)
+            final_audios = []
+            curr_gains = []
+
+            for i, (audio, target_amp) in enumerate(zip(audios, mod)):
+                # Guard against silent bands before dividing by the mean amplitude.
+                mean_amp = np.mean(np.abs(audio))
+                if mean_amp == 0:
+                    gain = 0
+                else:
+                    gain = (target_amp.item() / mean_amp) * max_amp
+                    gain = gain / 50 if gain <= 50 else 1
+
+                audio = self.make_smooth(audio, gain, prev_gains[i])
+                curr_gains.append(gain)
+                final_audios.append(audio)
+
+            prev_gains = curr_gains
+            modified_seg = add_stereo(final_audios, len(final_audios[0]), sample_rate=sr)
+            modified_segs.append(modified_seg)
+
+        modified_segs = np.concatenate(modified_segs)
+        return modified_segs.astype(np.float32)
+
+    def get_training_data(self, file_path, data_dir):
+        # _process_main is expected to populate self.audio_data per song;
+        # it is not defined in this file.
+        for song in os.listdir(data_dir):
+            self.audio_data = []
+            self._process_main(os.path.join(data_dir, song))
+            audio_data = np.array(self.audio_data)
+            tensor_data = torch.tensor(audio_data, dtype=torch.float32)
+            torch.save(tensor_data, os.path.join(file_path, song + '.data.pt'))
diff --git a/divideStereo.py b/divideStereo.py
index 9ea12c6..d07e1db 100644
--- a/divideStereo.py
+++ b/divideStereo.py
@@ -2,8 +2,7 @@ from pydub import AudioSegment
 import numpy as np
 from scipy.io.wavfile import write
 import time
 
-# Load the audio file
-audio = AudioSegment.from_file("sitare.wav")
+
 # Ensure the audio is stereo
 if audio.channels != 2:
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..87c706f
--- /dev/null
+++ b/server.py
@@ -0,0 +1,97 @@
+from flask import Flask, request, send_file, jsonify
+import io
+from scipy.io import wavfile
+import numpy as np
+import librosa
+from Utils import ProceesAudio
+import pyrebase
+import soundfile as sf
+import datetime
+from flask_jwt_extended import JWTManager, jwt_required, create_access_token, get_jwt_identity
+
+app = Flask(__name__)
+app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
+
+
+# Firebase configuration
+firebase_config = {
+    "apiKey": "AIzaSyBqDZlqD7UOBvt2zsk9OLWKH1Lc3_f_VJM",
+    "authDomain": "modifier-4088b.firebaseapp.com",
+    "projectId": "modifier-4088b",
+    "storageBucket": "modifier-4088b.appspot.com",
+    "messagingSenderId": "237119475630",
+    "appId": "1:237119475630:web:6c96c38c61285f5fcb823f",
+    "measurementId": "G-6CWLQMT2Q3",
+    "databaseURL": "https://modifier-4088b.firebaseio.com",
+}
+
+firebase = pyrebase.initialize_app(firebase_config)
+storage = firebase.storage()
+
+def upload_to_firebase(processed_data, userId, sample_rate=44100):
+    # Create an in-memory bytes buffer
+    buffer = io.BytesIO()
+
+    # Write processed data to the buffer as a WAV file
+    sf.write(buffer, processed_data, sample_rate, format='WAV')
+    buffer.seek(0)  # Rewind the buffer
+
+    # Upload the buffer to Firebase Storage (put's optional second argument
+    # is an auth token, which is not needed here)
+    storage_path = f'uploads/processed_audio_{userId}.wav'
+    storage.child(storage_path).put(buffer)
+
+    # Get the URL of the uploaded file
+    file_url = storage.child(storage_path).get_url(None)
+
+    return file_url
+
+
+def int16_to_float32(samples):
+    return samples.astype(np.float32) / 32768.0
+
+def process_audio_bytes(audio_bytes):
+    # Read the WAV data from bytes; for stereo int16 input, wavfile.read
+    # returns frames of shape (n_samples, 2).
+    sample_rate, data = wavfile.read(io.BytesIO(audio_bytes))
+    # Convert to float32 and transpose to the (2, n_samples) channel-major
+    # layout that ProceesAudio expects.
+    data = int16_to_float32(data.T)
+
+    pa = ProceesAudio()
+
+    processed_data = pa.perform_modulation(data=data, sr=sample_rate, index=0)
+    file_url = upload_to_firebase(processed_data=processed_data, userId="parth", sample_rate=sample_rate)
+
+    # Downsample the output to roughly 150 mean-amplitude points for display.
+    acc_factor = max(1, len(processed_data) // 150)
+    arr_to_show = [float(np.mean(np.abs(processed_data[i:i + acc_factor])))
+                   for i in range(0, len(processed_data), acc_factor)]
+
+    return jsonify({"file_url": file_url, "array": arr_to_show})
+
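+# Example client call (illustrative):
+#   curl -X POST -F "song=@input.wav" http://localhost:8000/modify
+# The JSON response carries the Firebase URL of the processed file plus a
+# coarse amplitude envelope for display.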
+@app.route('/modify', methods=['POST'])
+def modify():
+    if 'song' not in request.files:
+        return 'No file part', 400
+    file = request.files['song']
+    if file.filename == '':
+        return 'No selected file', 400
+
+    # Read file bytes
+    file_bytes = file.read()
+
+    # Process the audio bytes
+    response = process_audio_bytes(file_bytes)
+    response.headers.add('Access-Control-Allow-Origin', '*')
+    return response
+
+
+if __name__ == '__main__':
+    app.run(debug=True, port=8000)
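+
+# Deployment note: debug=True suits local development only; in production
+# the app would typically run behind a WSGI server instead, e.g.:
+#   gunicorn -w 2 -b 0.0.0.0:8000 server:app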