import os

from pydub import AudioSegment
from transformers import pipeline
import gradio as gr
import ffmpeg
import soundfile as sf
import numpy as np


def process_audio_length(audio_path):
    """Split audio into 30-second chunks, padding the last one with silence.

    Whisper models operate on 30-second windows, so shorter audio is padded
    and longer audio is sliced into fixed-length chunks.
    """
    audio = AudioSegment.from_file(audio_path)
    audio_length_ms = len(audio)
    chunks = []
    target_length_ms = 30000
    if audio_length_ms < target_length_ms:
        padding = AudioSegment.silent(duration=target_length_ms - audio_length_ms)
        chunks.append(audio + padding)
    else:
        for start in range(0, audio_length_ms, target_length_ms):
            end = min(start + target_length_ms, audio_length_ms)
            chunk = audio[start:end]
            if len(chunk) < target_length_ms:
                chunk += AudioSegment.silent(duration=target_length_ms - len(chunk))
            chunks.append(chunk)
    return chunks


def save_chunks(chunks):
    """Export each chunk as a WAV file and return the list of file paths."""
    output_folder = "audio_chunks"
    os.makedirs(output_folder, exist_ok=True)
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_name = f"{output_folder}/chunk_{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_files.append(chunk_name)
    return chunk_files


# Load the model using the pipeline
pipe = pipeline(task="automatic-speech-recognition", model="alisharifi/whisper-farsi")


# Extract the audio track from a video file as 16 kHz mono WAV
def extract_audio_from_video(video_file):
    output_audio = "output_audio.wav"
    stream = ffmpeg.input(video_file)
    stream = ffmpeg.output(stream, output_audio, ac=1, ar="16000")
    ffmpeg.run(stream, overwrite_output=True)
    return output_audio


# ASR function: chunk the audio, transcribe each chunk, and join the results
def process_audio(audio):
    if audio is None:
        return "لطفاً صدایی ضبط کنید."  # "Please record some audio."
    chunks = process_audio_length(audio)
    chunk_files = save_chunks(chunks)
    transcriptions = []
    for chunk_file in chunk_files:
        transcriptions.append(pipe(chunk_file)["text"])
    return " ".join(transcriptions)


# Video handler
def process_video(video_file):
    audio_path = extract_audio_from_video(video_file)
    return process_audio(audio_path)


# Audio handler
def process_audio_file(audio_file):
    return process_audio(audio_file)


# Microphone handler for raw numpy input (currently unused: the mic tab
# below uses type="filepath" together with process_audio_file instead)
def process_microphone(audio_data):
    print(f"Audio data type: {type(audio_data)}")
    print(f"Audio data content: {audio_data}")
    if audio_data is None:
        return "هیچ صدایی ضبط نشد. لطفاً دوباره امتحان کنید."  # "Nothing was recorded. Please try again."
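    # Gradio's type="numpy" microphone component hands the callback a
    # (sample_rate, numpy_array) tuple; anything else indicates a
    # misconfigured input component.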
    # Validate the data format
    if not isinstance(audio_data, tuple) or len(audio_data) != 2:
        return f"فرمت داده صوتی نادرست است: {type(audio_data)}"  # "Invalid audio data format"
    sample_rate, audio_array = audio_data  # Gradio's numpy format is (rate, samples)
    if not isinstance(audio_array, np.ndarray):
        return f"داده صوتی نادرست است: {type(audio_array)}"  # "Invalid audio data"
    # Ensure the audio array is 2-D (samples x channels) for soundfile
    if audio_array.ndim == 1:
        audio_array = audio_array[:, np.newaxis]
    # Save the recording to disk, then reuse the file-based pipeline
    audio_path = "recorded_audio.wav"
    sf.write(audio_path, audio_array, sample_rate)
    return process_audio(audio_path)


# Gradio interface (UI labels are in Farsi, matching the target language)
with gr.Blocks() as demo:
    gr.Markdown("## سامانه تبدیل گفتار به متن")  # "Speech-to-Text System"

    # Video input
    with gr.Tab("آپلود ویدئو"):  # "Upload video"
        video_input = gr.Video(label="آپلود فایل ویدئو")
        video_output = gr.Textbox(label="متن استخراج شده")  # "Extracted text"
        video_button = gr.Button("پردازش")  # "Process"
        video_button.click(process_video, inputs=video_input, outputs=video_output)

    # Audio input
    with gr.Tab("آپلود فایل صوتی"):  # "Upload audio file"
        audio_input = gr.Audio(label="آپلود فایل صوتی", type="filepath")
        audio_output = gr.Textbox(label="متن استخراج شده")
        audio_button = gr.Button("پردازش")
        audio_button.click(process_audio_file, inputs=audio_input, outputs=audio_output)

    # Voice recording
    with gr.Tab("ضبط صدا"):  # "Record audio"
        mic_input = gr.Audio(sources=["microphone"], type="filepath", label="ضبط صدا")
        mic_output = gr.Textbox(label="متن استخراج شده")
        mic_button = gr.Button("پردازش")
        mic_button.click(process_audio_file, inputs=mic_input, outputs=mic_output)

# share=True produces a public shareable link
demo.launch(share=True)
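# A minimal sketch of wiring the otherwise-unused process_microphone handler
# instead, assuming Gradio's type="numpy" convention of passing a
# (sample_rate, array) tuple to the callback; the tab (and the hypothetical
# mic_np_* names) would go inside the gr.Blocks() context above:
#
#     with gr.Tab("ضبط صدا (numpy)"):
#         mic_np_input = gr.Audio(sources=["microphone"], type="numpy", label="ضبط صدا")
#         mic_np_output = gr.Textbox(label="متن استخراج شده")
#         mic_np_button = gr.Button("پردازش")
#         mic_np_button.click(process_microphone, inputs=mic_np_input, outputs=mic_np_output)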