"""Flask app that transcribes uploaded audio with a pretrained Wav2Vec2 CTC model.

Routes:
    GET  /            -- renders ``index.html``.
    POST /transcribe  -- accepts a multipart upload in the ``file`` field and
                         returns ``{"transcription": "<text>"}`` as JSON.
"""

import os
import uuid

import torch
import torchaudio
from flask import Flask, jsonify, render_template, request
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Initialize Flask app
app = Flask(__name__)

# Load the model and processor once at import time so every request reuses
# the same instances instead of reloading the weights.
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Sampling rate (Hz) the audio is resampled to before it reaches the processor.
TARGET_SAMPLING_RATE = 16000

# Define the upload folder
UPLOAD_FOLDER = 'uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Ensure the upload folder exists; exist_ok avoids the check-then-create race.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)


def _upload_path(client_filename):
    """Return a unique path inside the upload folder for an incoming file.

    The client-supplied name is never used as the stored name: it is
    untrusted and could contain path separators or collide with another
    upload. Only a purely alphanumeric extension is carried over, since the
    audio loader may rely on it to pick a decoder.
    """
    extension = os.path.splitext(os.path.basename(client_filename))[1]
    if not extension[1:].isalnum():
        extension = ''
    return os.path.join(
        app.config['UPLOAD_FOLDER'], uuid.uuid4().hex + extension
    )


def _remove_quietly(path):
    """Delete *path* on a best-effort basis, logging instead of raising."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass  # Saving failed before the file was created; nothing to clean.
    except OSError:
        app.logger.warning("Could not remove temporary upload %s", path)


@app.route('/')
def index():
    """Render the landing page template."""
    return render_template('index.html')


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribe the audio file uploaded in the ``file`` form field.

    Returns:
        A JSON response ``{"transcription": <str>}`` on success, or
        ``{"error": <str>}`` with status 400 when the upload is missing,
        unnamed, undecodable, or contains no samples.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    # Covers both an empty filename and a missing (None) one.
    if not file.filename:
        return jsonify({'error': 'No selected file'}), 400

    file_path = _upload_path(file.filename)
    try:
        # Save the uploaded file
        file.save(file_path)
        # Load the audio file
        try:
            speech_array, sampling_rate = torchaudio.load(file_path)
        except (RuntimeError, OSError, ValueError):
            # NOTE(review): the exception types raised for undecodable input
            # depend on the torchaudio backend in use -- confirm this tuple.
            app.logger.exception("Could not decode uploaded audio")
            return jsonify({'error': 'Could not read audio file'}), 400
    finally:
        # The samples are in memory now (or loading failed), so the file on
        # disk is no longer needed; removing it keeps the folder from growing.
        _remove_quietly(file_path)

    if speech_array.numel() == 0:
        return jsonify({'error': 'Audio file is empty'}), 400

    if sampling_rate != TARGET_SAMPLING_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sampling_rate, new_freq=TARGET_SAMPLING_RATE
        )
        speech_array = resampler(speech_array)

    # Downmix to a single channel. Assumes the loader returns
    # (channels, samples); a plain squeeze() would leave multi-channel audio
    # 2-D, and only the first row would end up in the response.
    mono_speech = speech_array.mean(dim=0)

    # Process the audio input
    input_values = processor(
        mono_speech.numpy(),
        return_tensors="pt",
        sampling_rate=TARGET_SAMPLING_RATE,
    ).input_values

    # Perform inference without tracking gradients.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: pick the highest-scoring token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    return jsonify({'transcription': transcription[0]})


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger;
    # keep this for local development only and never expose it publicly.
    app.run(debug=True)