ajchri5 committed
Commit 4570d8a · verified · 1 Parent(s): d4d32cf

Update app.py

Files changed (1)
  1. app.py +12 -6
app.py CHANGED
@@ -3,7 +3,6 @@ import gradio as gr
 from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
 import torch
 import numpy as np
-import librosa
 
 # Load Whisper model for transcription
 whisper_model_name = "openai/whisper-large"
@@ -15,16 +14,23 @@ lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-la
 
 # Function to transcribe audio to text using Whisper model
 def transcribe_audio(audio_file):
-    # Ensure the audio is a numpy array (Gradio input type for audio is numpy)
-    audio = np.array(audio_file)
+    # Check if audio_file is a list (Gradio returns a list when multiple clips are recorded)
+    if isinstance(audio_file, list):
+        audio = np.concatenate(audio_file)  # Concatenate the list of arrays into a single 1D array
+    else:
+        audio = np.array(audio_file)  # Ensure it's a 1D array
 
-    # Prepare input features for Whisper
+    # Ensure the shape is 1D (if the shape is (2, N), we flatten it)
+    if len(audio.shape) > 1:
+        audio = audio.flatten()
+
+    # Prepare input features for Whisper (sampling rate should be 16000 for Whisper)
     input_features = processor(audio, return_tensors="pt", sampling_rate=16000)
-
+
     # Generate transcription
     generated_ids = model.generate(input_features["input_features"])
     transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
-
+
     return transcription
 
 # Function to detect the language of the transcription using zero-shot classification
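
For reference, below is a minimal, self-contained sketch of the transcription path as it stands after this commit. Only the body of transcribe_audio matches the diff; the processor/model loading lines live in a part of app.py not shown in these hunks, so the from_pretrained calls here are assumptions inferred from the imports and the whisper_model_name variable.

# Hedged sketch: post-commit transcribe_audio() with plausible surrounding setup.
# The two from_pretrained() calls are assumed, not taken from the diff.
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration

whisper_model_name = "openai/whisper-large"
processor = WhisperProcessor.from_pretrained(whisper_model_name)              # assumed setup
model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)  # assumed setup

def transcribe_audio(audio_file):
    # Gradio may deliver several recorded clips as a list; merge them first
    if isinstance(audio_file, list):
        audio = np.concatenate(audio_file)
    else:
        audio = np.array(audio_file)

    # Collapse stereo (2, N) or other multi-dimensional input to a 1D waveform
    if len(audio.shape) > 1:
        audio = audio.flatten()

    # Whisper's feature extractor expects 16 kHz audio
    input_features = processor(audio, return_tensors="pt", sampling_rate=16000)

    # Decode the generated token ids back to text
    generated_ids = model.generate(input_features["input_features"])
    return processor.decode(generated_ids[0], skip_special_tokens=True)

One caveat on the design: with a microphone input, gr.Audio in numpy mode commonly returns a (sample_rate, data) tuple rather than a bare array, and the commit also drops the librosa import that could have resampled arbitrary rates. If that applies here, the tuple would need unpacking and the data resampling to 16 kHz before the processor call.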