riteshkr committed
Commit 59c621c
1 Parent(s): 04dc82f

Update app.py

Files changed (1)
  1. app.py +30 -84
app.py CHANGED
@@ -1,101 +1,47 @@
-import torch
-from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
 import gradio as gr
+from transformers import pipeline
 
-# Define model details
-MODEL_NAME = "riteshkr/whisper-large-v3-quantized"  # Update with your actual model ID
-BATCH_SIZE = 8
-
-# Select device based on availability of CUDA (GPU) or fallback to CPU
-device = 0 if torch.cuda.is_available() else "cpu"
-
-# Load the ASR model pipeline
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,  # Adjust as needed for your application
-    device=device,
-)
-
-# Utility function to format timestamps
-def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
-    if seconds is not None:
-        milliseconds = round(seconds * 1000.0)
-        hours = milliseconds // 3_600_000
-        milliseconds -= hours * 3_600_000
-        minutes = milliseconds // 60_000
-        milliseconds -= minutes * 60_000
-        seconds = milliseconds // 1_000
-        milliseconds -= seconds * 1_000
-        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
-        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
-    else:
-        return seconds
-
-# Transcription function for batch processing
-def transcribe(files, task, return_timestamps):
-    transcriptions = []
-    for file in files:  # Process each file in the batch
-        outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
-        text = outputs["text"]
-        if return_timestamps:
-            timestamps = outputs["chunks"]
-            formatted_chunks = [
-                f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
-                for chunk in timestamps
-            ]
-            text = "\n".join(formatted_chunks)
-        transcriptions.append(text)
-    return "\n\n".join(transcriptions)  # Return all transcriptions combined
+# Load the ASR model using the Hugging Face pipeline
+model_id = "riteshkr/whisper-large-v3-quantized"  # Update with your model path or ID
+pipe = pipeline("automatic-speech-recognition", model=model_id)
+
+# Define the transcription function
+def transcribe_speech(filepath):
+    output = pipe(
+        filepath,
+        max_new_tokens=256,
+        generate_kwargs={
+            "task": "transcribe",
+            "language": "english",
+        },  # Update the language as per your model's fine-tuning
+        chunk_length_s=30,
+        batch_size=8,
+    )
+    return output["text"]
 
-# Define Gradio interface for microphone input
+# Define the Gradio interface for microphone input
 mic_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.Audio(sources="microphone", type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="Task"),
-        gr.Checkbox(label="Return timestamps"),
-    ],
-    outputs="text",
-    layout="horizontal",
-    title="Whisper Demo: Transcribe Audio",
-    description=(
-        f"Transcribe long-form microphone inputs with the {MODEL_NAME} model. Supports transcription and translation."
-    ),
-    allow_flagging="never",
+    fn=transcribe_speech,
+    inputs=gr.Audio(sources="microphone", type="filepath"),
+    outputs=gr.Textbox(),
 )
 
-# Define Gradio interface for file upload
+# Define the Gradio interface for file upload input
 file_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.Audio(sources="upload", type="filepath", label="Upload Audio File"),
-        gr.Radio(["transcribe", "translate"], label="Task"),
-        gr.Checkbox(label="Return timestamps"),
-    ],
-    outputs="text",
-    layout="horizontal",
-    title="Whisper Demo: Transcribe Audio",
-    description=(
-        f"Upload audio files to transcribe or translate them using the {MODEL_NAME} model."
-    ),
-    allow_flagging="never",
-    examples=[
-        ["./example.flac", "transcribe", False],
-        ["./example.flac", "transcribe", True],
-    ],
+    fn=transcribe_speech,
+    inputs=gr.Audio(sources="upload", type="filepath"),
+    outputs=gr.Textbox(),
 )
 
-# Create the Gradio tabbed interface for switching between modes
+# Creating the tabbed layout using Blocks
 demo = gr.Blocks()
 
 with demo:
     gr.TabbedInterface(
         [mic_transcribe, file_transcribe],
-        ["Transcribe Microphone", "Transcribe Audio File"]
+        ["Transcribe Microphone", "Transcribe Audio File"],
     )
 
-# Launch the app
+# Launch the app with debugging enabled
 if __name__ == "__main__":
-    demo.launch(debug=True, enable_queue=True, share=True)
+    demo.launch(debug=True, share=True)
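
Note that the rewrite drops the old device selection, so the new pipeline loads on CPU by default. If GPU inference is still wanted, a minimal sketch that reinstates the device logic removed by this commit into the new loading code:

import torch
from transformers import pipeline

# Reinstate the device selection this commit removed:
# GPU 0 when CUDA is available, otherwise CPU.
device = 0 if torch.cuda.is_available() else "cpu"
model_id = "riteshkr/whisper-large-v3-quantized"
pipe = pipeline("automatic-speech-recognition", model=model_id, device=device)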
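
Because the new transcribe_speech takes only a file path, it can be exercised without the Gradio UI. A quick sketch, assuming a local sample file (the example.flac name reuses the sample path from the removed examples list and is otherwise hypothetical):

# Call the transcription function directly, outside the Gradio UI.
# Importing app builds the interfaces but does not launch them,
# since launch() is guarded by the __main__ check.
from app import transcribe_speech

print(transcribe_speech("example.flac"))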
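
The new launch call also drops enable_queue=True, which newer Gradio releases no longer accept in launch(). If request queueing is still needed, a sketch of the final lines of app.py, assuming Gradio 4.x where queueing is configured on the Blocks object:

# Queueing moved from launch(enable_queue=...) to Blocks.queue(),
# which returns the Blocks instance so launch() can be chained.
if __name__ == "__main__":
    demo.queue().launch(debug=True, share=True)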