JacobLinCool committed on
Commit
5ece751
·
1 Parent(s): 176e214

fix: long form transcription

Browse files
Files changed (2) hide show
  1. app.py +23 -2
  2. model.py +1 -1
app.py CHANGED
@@ -20,7 +20,27 @@ def transcribe_audio(audio: str) -> str:
20
  "Please wait a moment for the audio to be uploaded, then click the button again."
21
  )
22
 
23
- b64 = read_file_as_base64(audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  url = f"https://api-inference.huggingface.co/models/{model_id}"
25
  headers = {
26
  "Authorization": f"Bearer {token}",
@@ -31,11 +51,12 @@ def transcribe_audio(audio: str) -> str:
31
  "inputs": b64,
32
  "parameters": {
33
  "generate_kwargs": {
34
- "num_beams": 5,
35
  }
36
  },
37
  }
38
  response = requests.post(url, headers=headers, json=data)
 
39
  out = response.json()
40
  print(f"{out=}")
41
 
 
20
  "Please wait a moment for the audio to be uploaded, then click the button again."
21
  )
22
 
23
+ # resample to 16k mono to reduce file size
24
+ import subprocess
25
+ import os
26
+
27
+ audio_resampled = audio.replace(".mp3", "_resampled.mp3")
28
+ subprocess.run(
29
+ [
30
+ "ffmpeg",
31
+ "-i",
32
+ audio,
33
+ "-ac",
34
+ "1",
35
+ "-ar",
36
+ "16000",
37
+ audio_resampled,
38
+ "-y",
39
+ ],
40
+ check=True,
41
+ )
42
+
43
+ b64 = read_file_as_base64(audio_resampled)
44
  url = f"https://api-inference.huggingface.co/models/{model_id}"
45
  headers = {
46
  "Authorization": f"Bearer {token}",
 
51
  "inputs": b64,
52
  "parameters": {
53
  "generate_kwargs": {
54
+ "return_timestamps": True,
55
  }
56
  },
57
  }
58
  response = requests.post(url, headers=headers, json=data)
59
+ print(f"{response.text=}")
60
  out = response.json()
61
  print(f"{out=}")
62
 
model.py CHANGED
@@ -20,7 +20,7 @@ def transcribe_audio_local(audio: str) -> str:
20
  if pipe is None:
21
  load_model()
22
 
23
- out = pipe(audio)
24
  print(f"{out=}")
25
 
26
  return out["text"]
 
20
  if pipe is None:
21
  load_model()
22
 
23
+ out = pipe(audio, return_timestamps=True)
24
  print(f"{out=}")
25
 
26
  return out["text"]