Ritwika-Das-Gupta committed on
Commit
87ff2a6
1 Parent(s): b1bc9d5

Delete app.py

Files changed (1)
  1. app.py +0 -339
app.py DELETED
@@ -1,339 +0,0 @@
- #!/usr/bin/env python
- # coding: utf-8
-
- # In[3]:
-
-
- #!pip install torchaudio
-
-
- # In[2]:
-
-
- from IPython.display import Audio
- import IPython.display as ipd
- from scipy.io import wavfile
- import numpy as np
- import warnings
- import re
- warnings.filterwarnings("ignore")
- import soundfile as sf
- import librosa
- import torch
- import os
- import noisereduce as nr
- import gradio as gr
- import pyloudnorm as pyln
- # import torchaudio
- from transformers import (
-     Wav2Vec2ForCTC,
-     Wav2Vec2Tokenizer,
-     AutoModelForCTC,
-     AutoProcessor,
-     AutoTokenizer,
-     AutoModelForCausalLM,
-     AutoModelForSpeechSeq2Seq,
-     AutoModelForAudioClassification,
-     pipeline,
- )
- import pandas as pd
-
-
- # In[3]:
-
-
- # Set device and dtype
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
- # Language identification (LID) pipeline
- lid_model_id = "facebook/mms-lid-126"
- lid_pipeline = pipeline("audio-classification", model=lid_model_id, device=device)
-
- # Map MMS LID labels to the languages supported by this app
- language_mapping = {
-     "hin": "hindi",
-     "ben": "bengali",
-     "eng": "english",
-     "guj": "gujarati"
- }
-
- def detect_language_for_audio_file(audio_file_path, lid_pipeline, target_sampling_rate=16000):
-     """
-     Detects the language of a given audio file and returns a DataFrame.
-
-     Parameters:
-     - audio_file_path (str): The path to the audio file.
-     - lid_pipeline: The language identification pipeline.
-     - target_sampling_rate (int): The target sampling rate for the audio file. Default is 16000.
-
-     Returns:
-     - df (pd.DataFrame): A DataFrame containing the detected language, filename, and ASR model name.
-     """
-     detected_languages = []
-     audio_filenames = []
-
-     filename = os.path.basename(audio_file_path)
-     waveform, original_sampling_rate = librosa.load(audio_file_path, sr=None)
-
-     if len(waveform.shape) > 1:
-         waveform = librosa.to_mono(waveform)
-
-     if original_sampling_rate != target_sampling_rate:
-         waveform = librosa.resample(waveform, orig_sr=original_sampling_rate, target_sr=target_sampling_rate)
-
-     # Perform language identification
-     lid_result = lid_pipeline(waveform, sampling_rate=target_sampling_rate)
-     detected_language = lid_result[0]['label'].split('_')[0]
-     print(f"Detected language for {filename}: {detected_language}")
-
-     detected_languages.append(detected_language)
-     audio_filenames.append(filename)
-
-     df = pd.DataFrame({
-         "Detected_Language": detected_languages,
-         "Audio_Filename": audio_filenames
-     })
-
-     # Drop rows whose language is not covered by the mapping
-     df['Detected_Language'] = df['Detected_Language'].map(language_mapping)
-     df.dropna(inplace=True, axis=0)
-
-     # Add an ASR model name for each detected language
-     model_names = []
-     for index, row in df.iterrows():
-         detected_language = row['Detected_Language']
-         model_name = "ai4bharat/indicwav2vec_v1_" + detected_language
-         model_names.append(model_name)
-     df['Model_Name'] = model_names
-
-     return df
-
- # Example usage:
- # audio_file_path = 'processed_audio.wav'
- # df = detect_language_for_audio_file(audio_file_path, lid_pipeline)
- # print(df)
-
-
- # In[4]:
-
-
- # Cache of (model, tokenizer) pairs so each ASR model is loaded only once
- loaded_models = {}
-
- def load_model_and_tokenizer(standardized_language):
-     if standardized_language not in loaded_models:
-         if standardized_language == 'hindi':
-             model_name = "ai4bharat/indicwav2vec-hindi"
-         elif standardized_language == 'odia':
-             model_name = "ai4bharat/indicwav2vec-odia"
-         elif standardized_language == 'english':
-             model_name = "facebook/wav2vec2-large-960h-lv60-self"
-         else:
-             model_name = "ai4bharat/indicwav2vec_v1_" + standardized_language
-         model = Wav2Vec2ForCTC.from_pretrained(model_name)
-         tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
-         loaded_models[standardized_language] = (model, tokenizer)
-     else:
-         model, tokenizer = loaded_models[standardized_language]
-     return model, tokenizer
-
-
- # In[5]:
-
-
- def perform_transcription(df):
-     transcriptions = []
-
-     for index, row in df.iterrows():
-         audio_file_path = row['Audio_Filename']
-         detected_language = row['Detected_Language']
-
-         standardized_language = language_mapping.get(detected_language, detected_language)
-         model, tokenizer = load_model_and_tokenizer(standardized_language)
-
-         input_audio, _ = librosa.load(audio_file_path, sr=16000)
-         input_values = tokenizer(input_audio, return_tensors="pt").input_values
-
-         with torch.no_grad():
-             logits = model(input_values).logits
-
-         predicted_ids = torch.argmax(logits, dim=-1)
-         text = tokenizer.batch_decode(predicted_ids)[0]
-
-         transcriptions.append(text)
-
-     df['Transcription'] = transcriptions
-     return df
-
-
- # In[7]:
-
-
- # Load the chat tokenizer and model from the Hugging Face Hub
- tokenizer = AutoTokenizer.from_pretrained("soketlabs/pragna-1b", token=os.environ.get('HF_TOKEN'))
- model = AutoModelForCausalLM.from_pretrained(
-     "soketlabs/pragna-1b",
-     token=os.environ.get('HF_TOKEN'),
-     revision='3c5b8b1309f7d89710331ba2f164570608af0de7'
- )
- model.load_adapter('soketlabs/pragna-1b-it-v0.1', token=os.environ.get('HF_TOKEN'))
- model = model.to(device)
-
- # Generate a chat response for a transcription
- def generate_response(transcription):
-     try:
-         messages = [
-             {"role": "system", "content": " you are a friendly bot to help the user"},
-             {"role": "user", "content": transcription},
-         ]
-         tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-         input_ids = tokenized_chat[0].to(device)
-         if len(input_ids.shape) == 1:
-             input_ids = input_ids.unsqueeze(0)
-         with torch.no_grad():
-             output = model.generate(
-                 input_ids,
-                 max_new_tokens=300,
-                 do_sample=True,
-                 top_k=5,
-                 num_beams=1,
-                 use_cache=False,
-                 temperature=0.2,
-                 repetition_penalty=1.1,
-             )
-         generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
-         return find_last_sentence(generated_text)
-     except Exception as e:
-         print("Error during response generation:", e)
-         return "Response generation error: " + str(e)
-
- # Trim generated text to the last complete sentence (ends with ।, ? or !)
- def find_last_sentence(text):
-     sentence_endings = re.finditer(r'[।?!]', text)
-     end_positions = [ending.end() for ending in sentence_endings]
-     if end_positions:
-         return text[:end_positions[-1]]
-     return text
-
-
- # In[15]:
-
-
- def generate_text_and_display_audio(row, model, tokenizer):
-     audio_file = row['Audio_Filename']
-     transcription = row['Transcription']
-
-     # Generate text
-     generated_text = generate_response(transcription)
-     generated_text = find_last_sentence(generated_text)
-
-     # Display audio
-     # display(ipd.Audio(audio_path))
-     return transcription, generated_text
-     # Display prompt and generated text
-     # print("Transcribed Text:", transcription)
-     # print("Generated Text:", generated_text)
-
-
- # In[16]:
-
-
- def spectral_subtraction(audio_data, sample_rate):
-     # Compute short-time Fourier transform (STFT)
-     stft = librosa.stft(audio_data)
-
-     # Compute power spectrogram
-     power_spec = np.abs(stft)**2
-
-     # Estimate the noise power spectrum as the per-frequency median over time
-     noise_power = np.median(power_spec, axis=1)
-
-     # Apply spectral subtraction: denoised power = max(power - alpha * noise, 0)
-     alpha = 2.0  # Over-subtraction factor, typically between 1.0 and 2.0
-     denoised_spec = np.maximum(power_spec - alpha * noise_power[:, np.newaxis], 0)
-
-     # Inverse STFT (denoised magnitude, original phase) to obtain denoised audio
-     denoised_audio = librosa.istft(np.sqrt(denoised_spec) * np.exp(1j * np.angle(stft)))
-
-     return denoised_audio
-
- def apply_compression(audio_data, sample_rate):
-     # Apply dynamic range compression via loudness normalization
-     meter = pyln.Meter(sample_rate)  # create BS.1770 meter
-     loudness = meter.integrated_loudness(audio_data)
-
-     # Normalize audio to a target loudness of -24 LUFS
-     loud_norm = pyln.normalize.loudness(audio_data, loudness, -24.0)
-
-     return loud_norm
-
- def process_audio(audio_file_path):
-     try:
-         # Read audio data
-         audio_data, sample_rate = librosa.load(audio_file_path)
-         print(f"Read audio data: {audio_file_path}, Sample Rate: {sample_rate}")
-
-         # Apply noise reduction using noisereduce
-         reduced_noise = nr.reduce_noise(y=audio_data, sr=sample_rate)
-         print("Noise reduction applied")
-
-         # Apply spectral subtraction for additional noise reduction
-         denoised_audio = spectral_subtraction(reduced_noise, sample_rate)
-         print("Spectral subtraction applied")
-
-         # Apply dynamic range compression to make the foreground louder
-         compressed_audio = apply_compression(denoised_audio, sample_rate)
-         print("Dynamic range compression applied")
-
-         # Trim leading and trailing silence
-         final_audio = librosa.effects.trim(compressed_audio)[0]
-         print("Silences trimmed")
-
-         # Save the final processed audio to a file with a fixed name
-         processed_file_path = 'processed_audio.wav'
-         sf.write(processed_file_path, final_audio, sample_rate)
-         print(f"Processed audio saved to: {processed_file_path}")
-
-         # Check that the file exists to confirm it was saved
-         if not os.path.isfile(processed_file_path):
-             raise FileNotFoundError(f"Processed file not found: {processed_file_path}")
-
-         # Load the processed audio for transcription
-         processed_audio_data, _ = librosa.load(processed_file_path)
-         print(f"Processed audio reloaded for transcription: {processed_file_path}")
-
-         # Detect language, transcribe, and generate a response
-         df = detect_language_for_audio_file(processed_file_path, lid_pipeline)
-         print(df)
-         df_transcription = perform_transcription(df)
-         print(df_transcription)
-         for index, row in df_transcription.iterrows():
-             print(index, row)
-             transcription, response = generate_text_and_display_audio(row, model, tokenizer)
-
-         # Transcribe audio
-         # transcription = transcribe_audio(processed_audio_data)
-         # print("Transcription completed")
-
-         # # Generate response
-         # response = generate_response(transcription)
-         # print("Response generated")
-
-         return processed_file_path, transcription, response
-     except Exception as e:
-         print("Error during audio processing:", e)
-         return None, "Error during audio processing", str(e)
-
-
- # Create Gradio interface
- iface = gr.Interface(
-     fn=process_audio,
-     inputs=gr.Audio(label="Record Audio", type="filepath"),
-     outputs=[gr.Audio(label="Processed Audio"), gr.Textbox(label="Transcription"), gr.Textbox(label="Response")]
- )
-
- iface.launch(share=True)
-
-
- # In[ ]:
-
-