# Spaces status: Sleeping
import sys

import librosa
import numpy as np
import soundfile as sf
import torch
from transformers import pipeline
class WhisperTranscriber:
    """Transcribe audio files to English text with an OpenAI Whisper checkpoint.

    The Hugging Face ``automatic-speech-recognition`` pipeline is built once
    in ``__init__`` and reused for every ``transcribe`` call.
    """

    def __init__(self, model_size="medium"):
        """Build the ASR pipeline for ``openai/whisper-{model_size}``.

        Args:
            model_size: Suffix of the Whisper checkpoint name
                (the pipeline loads ``openai/whisper-<model_size>``).
        """
        self.model_size = model_size
        use_cuda = torch.cuda.is_available()
        self.device = "cuda:0" if use_cuda else "cpu"
        self.model = pipeline(
            "automatic-speech-recognition",
            model=f"openai/whisper-{model_size}",
            chunk_length_s=30,
            device=self.device,
            batch_size=8,
            # Half precision is only requested on GPU; on CPU fall back to
            # float32, where every operator is implemented.
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            return_timestamps=True,
        )

    def preprocess_audio(self, audio_path, target_sr=16000):
        """Load an audio file and condition it for the model.

        Steps, in order: load at the file's native rate, resample to
        ``target_sr``, apply a pre-emphasis filter, peak-normalize, and
        trim leading/trailing silence.

        Args:
            audio_path: Path of the audio file to load.
            target_sr: Sampling rate to resample to (16 kHz by default).

        Returns:
            A ``(samples, target_sr)`` tuple, where ``samples`` is the
            processed signal.
        """
        # sr=None keeps the file's native sampling rate; resampling is done
        # explicitly in the next step.
        y, sr = librosa.load(audio_path, sr=None)
        y_resampled = librosa.resample(y=y, orig_sr=sr, target_sr=target_sr)
        # Pre-emphasis boosts high frequencies; it is a spectral tilt,
        # not a noise-reduction step.
        y_cleaned = librosa.effects.preemphasis(y_resampled)
        y_normalized = librosa.util.normalize(y_cleaned)
        # trim() only strips silence at the start and end of the signal
        # (below 30 dB under the peak); quiet passages in the middle stay.
        y_filtered, _ = librosa.effects.trim(
            y_normalized,
            top_db=30,
            frame_length=2048,
            hop_length=512,
        )
        return y_filtered, target_sr

    @staticmethod
    def _extract_text(result):
        """Return the plain transcription held in a pipeline result.

        Chunk texts are stripped before joining so that chunks carrying
        their own leading space do not produce doubled spaces.
        Non-dict results are returned unchanged.
        """
        if not isinstance(result, dict):
            return result
        if "chunks" in result:
            pieces = (chunk["text"].strip() for chunk in result["chunks"])
            return " ".join(piece for piece in pieces if piece)
        return result["text"].strip()

    def transcribe(self, audio_path):
        """Transcribe the audio file at ``audio_path`` to English text.

        Args:
            audio_path: Path of the audio file to transcribe.

        Returns:
            The transcription text.

        Raises:
            Exception: Any error raised while loading, preprocessing or
                decoding is printed and then re-raised unchanged.
        """
        try:
            audio_data, sample_rate = self.preprocess_audio(audio_path)
            print(f"Audio loaded and preprocessed - Shape: {audio_data.shape}, Sample rate: {sample_rate}")
            result = self.model(
                # Pass the sampling rate along with the samples so the
                # pipeline does not have to assume the array's rate.
                {"raw": audio_data, "sampling_rate": sample_rate},
                generate_kwargs={
                    "task": "transcribe",
                    "language": "en",
                    "max_new_tokens": 256,
                    # No "temperature" entry: a non-zero temperature can
                    # only enable sampling, which makes output
                    # non-deterministic rather than less hallucinated.
                },
            )
            return self._extract_text(result)
        except Exception as e:
            print(f"Error in transcribe: {str(e)}")
            raise
# Example usage: run the script with an optional audio file path argument.
if __name__ == "__main__":
    # Use the path given on the command line; fall back to the placeholder
    # path when no argument is supplied.
    audio_file = sys.argv[1] if len(sys.argv) > 1 else "path_to_your_audio_file.wav"
    transcriber = WhisperTranscriber(model_size="medium")
    transcription = transcriber.transcribe(audio_file)
    print(f"Transcription: {transcription}")