Spaces:

riteshkr
/

transcribe-using-q-whi-L-v3

Sleeping

App Files Files Community

transcribe-using-q-whi-L-v3 / app.py

riteshkr

Update app.py

d23e0cd verified 2 months ago

raw

history blame

3.66 kB

	import torch
	from transformers import pipeline
	from transformers.pipelines.audio_utils import ffmpeg_read
	import gradio as gr

	# Model configuration
	MODEL_NAME = "riteshkr/whisper-large-v3-quantized"  # model ID handed to pipeline(); update with your actual model ID
	BATCH_SIZE = 8  # forwarded as batch_size on each pipeline call

	# Use CUDA device 0 when a GPU is available, otherwise fall back to the CPU
	if torch.cuda.is_available():
	    device = 0
	else:
	    device = "cpu"

	# Build the speech-recognition pipeline once at import time so every request reuses it
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=MODEL_NAME,
	    chunk_length_s=30,  # chunk length in seconds; adjust as needed for your application
	    device=device,
	)

	# Render a duration given in seconds as a clock-style timestamp string
	def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
	    """Format ``seconds`` as ``[HH:]MM:SS<marker>mmm``.

	    Args:
	        seconds: Duration in seconds, or ``None``.
	        always_include_hours: Emit the ``HH:`` prefix even when the hour
	            count is not positive.
	        decimal_marker: Separator placed between the seconds and the
	            three-digit millisecond field.

	    Returns:
	        The formatted string, or ``None`` when ``seconds`` is ``None``.
	    """
	    # Missing timestamps are passed through untouched.
	    if seconds is None:
	        return None

	    # Work in whole milliseconds so the split into fields is exact.
	    total_ms = round(seconds * 1000.0)
	    hours, remainder = divmod(total_ms, 3_600_000)
	    minutes, remainder = divmod(remainder, 60_000)
	    secs, millis = divmod(remainder, 1_000)

	    prefix = f"{hours:02d}:" if (always_include_hours or hours > 0) else ""
	    return f"{prefix}{minutes:02d}:{secs:02d}{decimal_marker}{millis:03d}"

	# Transcription callback shared by both Gradio interfaces
	def transcribe(files, task, return_timestamps):
	    """Transcribe or translate one or more audio inputs with the ASR pipeline.

	    Args:
	        files: Either a single audio input (for example the file path string
	            supplied by a ``gr.Audio(type="filepath")`` component) or a
	            list/tuple of such inputs.
	        task: Forwarded to generation as ``generate_kwargs["task"]``; the UI
	            offers ``"transcribe"`` and ``"translate"``.
	        return_timestamps: When truthy, each chunk is rendered on its own
	            line as ``[start -> end] text`` instead of the plain text.

	    Returns:
	        str: One transcription per input, separated by blank lines. An empty
	        list/tuple yields an empty string.

	    Raises:
	        gr.Error: If ``files`` is ``None`` (nothing was recorded or uploaded).
	    """
	    if files is None:
	        raise gr.Error("No audio submitted! Please record or upload an audio file before submitting.")

	    # A single input (e.g. a path string) must be wrapped: iterating over a
	    # string directly would hand the pipeline one character at a time.
	    if not isinstance(files, (list, tuple)):
	        files = [files]

	    transcriptions = []
	    for file in files:
	        outputs = pipe(
	            file,
	            batch_size=BATCH_SIZE,
	            generate_kwargs={"task": task},
	            return_timestamps=return_timestamps,
	        )
	        if return_timestamps:
	            # One "[start -> end] text" line per chunk returned by the pipeline.
	            text = "\n".join(
	                f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
	                for chunk in outputs["chunks"]
	            )
	        else:
	            text = outputs["text"]
	        transcriptions.append(text)
	    return "\n\n".join(transcriptions)

	# Gradio interface for microphone input
	# NOTE(review): `source=` is the Gradio 3.x spelling; Gradio 4+ expects
	# `sources=["microphone"]` - confirm against the installed Gradio version.
	mic_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=[
	        gr.Audio(source="microphone", type="filepath"),
	        # Components take their initial value through `value=`; the old
	        # `default=` keyword is not a component parameter.
	        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
	        gr.Checkbox(value=False, label="Return timestamps"),
	    ],
	    outputs="text",
	    title="Whisper Demo: Transcribe Audio",
	    description=(
	        f"Transcribe long-form microphone inputs with the {MODEL_NAME} model. Supports transcription and translation."
	    ),
	    allow_flagging="never",
	)

	# Gradio interface for file upload
	# NOTE(review): `source=` is the Gradio 3.x spelling; Gradio 4+ expects
	# `sources=["upload"]` - confirm against the installed Gradio version.
	file_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=[
	        gr.Audio(source="upload", type="filepath", label="Upload Audio File"),
	        # Components take their initial value through `value=`; the old
	        # `default=` keyword is not a component parameter.
	        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
	        gr.Checkbox(value=False, label="Return timestamps"),
	    ],
	    outputs="text",
	    title="Whisper Demo: Transcribe Audio",
	    description=(
	        f"Upload audio files to transcribe or translate them using the {MODEL_NAME} model."
	    ),
	    allow_flagging="never",
	    # Each example row supplies one value per input: audio path, task, timestamps flag.
	    examples=[
	        ["./example.flac", "transcribe", False],
	        ["./example.flac", "transcribe", True],
	    ],
	)

	# Top-level app: one tab per input mode (microphone / uploaded file)
	with gr.Blocks() as demo:
	    tab_interfaces = [mic_transcribe, file_transcribe]
	    tab_titles = ["Transcribe Microphone", "Transcribe Audio File"]
	    gr.TabbedInterface(tab_interfaces, tab_titles)

	# Launch the app when run as a script
	if __name__ == "__main__":
	    # Queueing is enabled through the queue() method; the `enable_queue`
	    # keyword of launch() is deprecated.
	    demo.queue()
	    demo.launch(debug=True, share=True)