riteshkr committed on
Commit d23e0cd
1 Parent(s): 3e6fe5d

Update app.py

Files changed (1)
  1. app.py +84 -30
app.py CHANGED
@@ -1,47 +1,101 @@
-import gradio as gr
 from transformers import pipeline
 
-# Load the ASR model using the Hugging Face pipeline
-model_id = "riteshkr/whisper-large-v3-quantized"  # Update with your model path or ID
-pipe = pipeline("automatic-speech-recognition", model=model_id)
-
-# Define the transcription function
-def transcribe_speech(filepath):
-    output = pipe(
-        filepath,
-        max_new_tokens=256,
-        generate_kwargs={
-            "task": "transcribe",
-            "language": "english",
-        },  # Update the language as per your model's fine-tuning
-        chunk_length_s=30,
-        batch_size=8,
-    )
-    return output["text"]
 
-# Define the Gradio interface for microphone input
 mic_transcribe = gr.Interface(
-    fn=transcribe_speech,
-    inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.Textbox(),
 )
 
-# Define the Gradio interface for file upload input
 file_transcribe = gr.Interface(
-    fn=transcribe_speech,
-    inputs=gr.Audio(sources="upload", type="filepath"),
-    outputs=gr.Textbox(),
 )
 
-# Creating the tabbed layout using Blocks
 demo = gr.Blocks()
 
 with demo:
     gr.TabbedInterface(
         [mic_transcribe, file_transcribe],
-        ["Transcribe Microphone", "Transcribe Audio File"],
     )
 
-# Launch the app with debugging enabled
 if __name__ == "__main__":
-    demo.launch(debug=True, share=True)
 
+import torch
 from transformers import pipeline
+from transformers.pipelines.audio_utils import ffmpeg_read
+import gradio as gr
 
+# Define model details
+MODEL_NAME = "riteshkr/whisper-large-v3-quantized"  # Update with your actual model ID
+BATCH_SIZE = 8
+
+# Select device based on availability of CUDA (GPU) or fallback to CPU
+device = 0 if torch.cuda.is_available() else "cpu"
+
+# Load the ASR model pipeline
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,  # Adjust as needed for your application
+    device=device,
+)
+
+# Utility function to format timestamps as [HH:]MM:SS.mmm
+def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
+    if seconds is not None:
+        milliseconds = round(seconds * 1000.0)
+        hours = milliseconds // 3_600_000
+        milliseconds -= hours * 3_600_000
+        minutes = milliseconds // 60_000
+        milliseconds -= minutes * 60_000
+        seconds = milliseconds // 1_000
+        milliseconds -= seconds * 1_000
+        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+    else:
+        return seconds
+
+# Transcription function for a single file or a batch of files
+def transcribe(files, task, return_timestamps):
+    if not files:
+        return "No audio provided."
+    if isinstance(files, str):  # gr.Audio(type="filepath") passes a single path
+        files = [files]
+    transcriptions = []
+    for file in files:  # Process each file in the batch
+        outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
+        text = outputs["text"]
+        if return_timestamps:
+            timestamps = outputs["chunks"]
+            formatted_chunks = [
+                f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
+                for chunk in timestamps
+            ]
+            text = "\n".join(formatted_chunks)
+        transcriptions.append(text)
+    return "\n\n".join(transcriptions)  # Return all transcriptions combined
 
+# Define Gradio interface for microphone input
 mic_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(sources=["microphone"], type="filepath"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Checkbox(value=False, label="Return timestamps"),
+    ],
+    outputs="text",
+    title="Whisper Demo: Transcribe Audio",
+    description=(
+        f"Transcribe long-form microphone inputs with the {MODEL_NAME} model. Supports transcription and translation."
+    ),
+    allow_flagging="never",
 )
 
+# Define Gradio interface for file upload
 file_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Checkbox(value=False, label="Return timestamps"),
+    ],
+    outputs="text",
+    title="Whisper Demo: Transcribe Audio",
+    description=(
+        f"Upload audio files to transcribe or translate them using the {MODEL_NAME} model."
+    ),
+    allow_flagging="never",
+    examples=[
+        ["./example.flac", "transcribe", False],
+        ["./example.flac", "transcribe", True],
+    ],
 )
 
+# Create the Gradio tabbed interface for switching between modes
 demo = gr.Blocks()
 
 with demo:
     gr.TabbedInterface(
         [mic_transcribe, file_transcribe],
+        ["Transcribe Microphone", "Transcribe Audio File"]
     )
 
+# Launch the app with queuing enabled
 if __name__ == "__main__":
+    demo.queue().launch(debug=True, share=True)
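
For a quick sanity check of the new helpers outside Gradio, a minimal sketch (assuming app.py is importable from the working directory; sample.wav is a placeholder path, not a file in this repo, and the commented timestamp outputs are illustrative):

# Minimal local check of the commit's helpers (a sketch, not part of the commit).
from app import format_timestamp, transcribe  # importing app.py loads the pipeline

# format_timestamp renders seconds as [HH:]MM:SS.mmm
print(format_timestamp(62.25))   # 01:02.250
print(format_timestamp(3725.5))  # 01:02:05.500

# "sample.wav" is a hypothetical local file; transcribe() accepts a single
# filepath (as Gradio passes it) or a list of filepaths.
print(transcribe("sample.wav", task="transcribe", return_timestamps=False))
print(transcribe("sample.wav", task="transcribe", return_timestamps=True))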