from flask import Flask, render_template, request, jsonify
import torch
from transformers import pipeline

app = Flask(__name__)

# Load the automatic speech recognition model
pipe = pipeline(
    "automatic-speech-recognition",
    "openai/whisper-large-v3",
    torch_dtype=torch.float16,
    device="cuda:0",
)

# Load the emotion classification model
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=True,
)


def transcribe(audio_file, task):
    if audio_file is None:
        return "Please upload or record an audio file."
    if isinstance(audio_file, bytes):
        # Raw bytes (e.g. a drag-and-drop upload) can be fed to the pipeline directly
        audio_input = audio_file
    else:
        # Flask file uploads arrive as Werkzeug FileStorage objects; read the
        # stream into bytes rather than relying on a filesystem path
        audio_input = audio_file.read()
    result = pipe(audio_input, generate_kwargs={"task": task}, return_timestamps=True)
    return result["text"]


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/transcribe', methods=['POST'])
def transcribe_endpoint():
    audio_file = request.files.get('audio_file')
    # Default to transcription when the form does not specify a task
    task = request.form.get('task', 'transcribe')
    text = transcribe(audio_file, task)
    return jsonify({'text': text})


@app.route('/classify_emotion', methods=['POST'])
def classify_emotion_endpoint():
    text = request.form.get('text')
    result = emotion_classifier(text)
    return jsonify(result)


if __name__ == '__main__':
    app.run(debug=True)
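
# A quick way to exercise the two endpoints once the server is running
# (Flask's default port is 5000). This is a sketch: "sample.wav" is a
# placeholder for any audio file on disk, and the text payload is arbitrary.
#
#   curl -X POST -F "audio_file=@sample.wav" -F "task=transcribe" \
#        http://127.0.0.1:5000/transcribe
#
#   curl -X POST -F "text=I am thrilled with these results" \
#        http://127.0.0.1:5000/classify_emotion
#
# Both endpoints expect multipart/form-data (curl's -F flag), which is what
# request.files and request.form parse on the server side.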