riteshkr committed on
Commit 7d4a692
1 Parent(s): 42de01f

Update app.py

Files changed (1)
  1. app.py +30 -83
app.py CHANGED
@@ -1,100 +1,47 @@
- import torch
- from transformers import pipeline
  import gradio as gr
+ from transformers import pipeline

- # Define the model details
- MODEL_NAME = "riteshkr/quantized-whisper-large-v3"  # Update with your actual model ID
- BATCH_SIZE = 8
-
- # Select device based on availability of CUDA (GPU) or fallback to CPU
- device = 0 if torch.cuda.is_available() else "cpu"
-
- # Load the ASR model pipeline
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=MODEL_NAME,
-     chunk_length_s=30,
-     device=device,
- )
-
- # Utility function to format timestamps
- def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
-     if seconds is not None:
-         milliseconds = round(seconds * 1000.0)
-         hours = milliseconds // 3_600_000
-         milliseconds -= hours * 3_600_000
-         minutes = milliseconds // 60_000
-         milliseconds -= minutes * 60_000
-         seconds = milliseconds // 1_000
-         milliseconds -= seconds * 1_000
-         hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
-         return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
-     else:
-         return seconds
-
- # Transcription function for batch processing
- def transcribe(files, task, return_timestamps):
-     transcriptions = []
-     for file in files:  # Process each file in the batch
-         outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
-         text = outputs["text"]
-         if return_timestamps:
-             timestamps = outputs["chunks"]
-             formatted_chunks = [
-                 f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
-                 for chunk in timestamps
-             ]
-             text = "\n".join(formatted_chunks)
-         transcriptions.append(text)
-     return "\n\n".join(transcriptions)  # Return all transcriptions combined
+ # Load the ASR model using the Hugging Face pipeline
+ model_id = "riteshkr/quantized-whisper-large-v3"  # Update with your model path or ID
+ pipe = pipeline("automatic-speech-recognition", model=model_id)
+
+ # Define the transcription function
+ def transcribe_speech(filepath):
+     output = pipe(
+         filepath,
+         max_new_tokens=256,
+         generate_kwargs={
+             "task": "transcribe",
+             "language": "english",
+         },  # Update the language as per your model's fine-tuning
+         chunk_length_s=30,
+         batch_size=8,
+     )
+     return output["text"]

- # Define Gradio interface for microphone input
+ # Define the Gradio interface for microphone input
  mic_transcribe = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.Audio(sources="microphone", type="filepath"),
-         gr.Radio(["transcribe", "translate"], label="Task"),
-         gr.Checkbox(label="Return timestamps"),
-     ],
-     outputs="text",
-     layout="horizontal",
-     title="Whisper Demo: Transcribe Audio",
-     description=(
-         f"Transcribe long-form microphone inputs with the {MODEL_NAME} model. Supports transcription and translation."
-     ),
-     allow_flagging="never",
+     fn=transcribe_speech,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
  )

- # Define Gradio interface for file upload
+ # Define the Gradio interface for file upload input
  file_transcribe = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.Audio(sources="upload", type="filepath", label="Upload Audio File"),
-         gr.Radio(["transcribe", "translate"], label="Task"),
-         gr.Checkbox(label="Return timestamps"),
-     ],
-     outputs="text",
-     layout="horizontal",
-     title="Whisper Demo: Transcribe Audio",
-     description=(
-         f"Upload audio files to transcribe or translate them using the {MODEL_NAME} model."
-     ),
-     allow_flagging="never",
-     examples=[
-         ["./example.flac", "transcribe", False],
-         ["./example.flac", "transcribe", True],
-     ],
+     fn=transcribe_speech,
+     inputs=gr.Audio(sources="upload", type="filepath"),
+     outputs=gr.Textbox(),
  )

- # Create the Gradio tabbed interface for switching between modes
+ # Creating the tabbed layout using Blocks
  demo = gr.Blocks()

  with demo:
      gr.TabbedInterface(
          [mic_transcribe, file_transcribe],
-         ["Transcribe Microphone", "Transcribe Audio File"]
+         ["Transcribe Microphone", "Transcribe Audio File"],
      )

- # Launch the app
+ # Launch the app with debugging enabled
  if __name__ == "__main__":
-     demo.launch(debug=True, enable_queue=True, share=True)
+     demo.launch(debug=True, share=True)
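
One side effect of the simplification worth flagging: the old version placed the model on a CUDA device when one was available (device = 0 if torch.cuda.is_available() else "cpu"), while the new pipeline() call omits the device argument, so the model loads on CPU by default. Below is a minimal sketch, not part of this commit, of how GPU placement could be restored under the new layout; the smoke test at the end assumes a local audio file at ./example.flac, the sample path the old version's examples referenced.

import torch
from transformers import pipeline

# Restore the device selection the removed code used:
# first GPU if available, otherwise CPU
device = 0 if torch.cuda.is_available() else "cpu"

model_id = "riteshkr/quantized-whisper-large-3"
model_id = "riteshkr/quantized-whisper-large-v3"
pipe = pipeline("automatic-speech-recognition", model=model_id, device=device)

# Quick smoke test outside Gradio; swap in any local audio file
output = pipe(
    "./example.flac",
    chunk_length_s=30,
    batch_size=8,
    generate_kwargs={"task": "transcribe", "language": "english"},
)
print(output["text"])

Passing device=0 pins the pipeline to the first GPU; leaving it out keeps the commit's CPU-only default.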