Spaces:

JacobLinCool
/

TWASR

Sleeping

JacobLinCool committed on Oct 31, 2024

Commit

5ece751

1 Parent(s): 176e214

fix: long form transcription

Files changed (2) hide show

app.py CHANGED Viewed

@@ -20,7 +20,27 @@ def transcribe_audio(audio: str) -> str:
             "Please wait a moment for the audio to be uploaded, then click the button again."
         )
-    b64 = read_file_as_base64(audio)
     url = f"https://api-inference.huggingface.co/models/{model_id}"
     headers = {
         "Authorization": f"Bearer {token}",
@@ -31,11 +51,12 @@ def transcribe_audio(audio: str) -> str:
         "inputs": b64,
         "parameters": {
             "generate_kwargs": {
-                "num_beams": 5,
             }
         },
     }
     response = requests.post(url, headers=headers, json=data)
     out = response.json()
     print(f"{out=}")

             "Please wait a moment for the audio to be uploaded, then click the button again."
         )
+    # resample to 16k mono to reduce file size
+    import subprocess
+    import os
+    audio_resampled = audio.replace(".mp3", "_resampled.mp3")
+    subprocess.run(
+        [
+            "ffmpeg",
+            "-i",
+            audio,
+            "-ac",
+            "1",
+            "-ar",
+            "16000",
+            audio_resampled,
+            "-y",
+        ],
+        check=True,
+    )
+    b64 = read_file_as_base64(audio_resampled)
     url = f"https://api-inference.huggingface.co/models/{model_id}"
     headers = {
         "Authorization": f"Bearer {token}",
         "inputs": b64,
         "parameters": {
             "generate_kwargs": {
+                "return_timestamps": True,
             }
         },
     }
     response = requests.post(url, headers=headers, json=data)
+    print(f"{response.text=}")
     out = response.json()
     print(f"{out=}")

model.py CHANGED Viewed

@@ -20,7 +20,7 @@ def transcribe_audio_local(audio: str) -> str:
     if pipe is None:
         load_model()
-    out = pipe(audio)
     print(f"{out=}")
     return out["text"]

     if pipe is None:
         load_model()
+    out = pipe(audio, return_timestamps=True)
     print(f"{out=}")
     return out["text"]