from transformers import pipeline
from accelerate import Accelerator
import spaces
import librosa

model_id = "JacobLinCool/whisper-large-v3-turbo-common_voice_19_0-zh-TW"
pipe = None


def load_model():
    """Lazily build the ASR pipeline on the device selected by Accelerate."""
    global pipe
    device = Accelerator().device
    pipe = pipeline("automatic-speech-recognition", model=model_id, device=device)


def get_gpu_duration(audio: str) -> int:
    """Estimate the ZeroGPU allocation (in seconds) needed for this clip."""
    y, sr = librosa.load(audio)
    duration = librosa.get_duration(y=y, sr=sr) / 60.0  # audio length in minutes
    # Allocate 60 s of GPU time per started hour of audio, with a 60 s floor.
    gpu_duration = max(1.0, (duration + 59.0) // 60.0) * 60.0
    print(f"{duration=}, {gpu_duration=}")
    return int(gpu_duration)


@spaces.GPU(duration=get_gpu_duration)
def transcribe_audio_local(audio: str) -> str:
    """Transcribe an audio file, loading the model on first use."""
    print(f"{audio=}")
    if pipe is None:
        load_model()
    out = pipe(audio, return_timestamps=True)
    print(f"{out=}")
    return out["text"]
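
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file): spaces.GPU-
# decorated handlers are typically exposed through a Gradio app on a ZeroGPU
# Space. The interface below and its launch guard are assumptions about how
# transcribe_audio_local might be wired up, not the author's actual app code.
# ---------------------------------------------------------------------------
import gradio as gr

demo = gr.Interface(
    fn=transcribe_audio_local,
    inputs=gr.Audio(type="filepath", label="Audio"),
    outputs=gr.Textbox(label="Transcript"),
    title="zh-TW Whisper transcription",
)

if __name__ == "__main__":
    demo.launch()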