"""Gradio demo: transcribe an audio file, label speakers, and summarize it.

Pipeline: Whisper transcription -> one speaker embedding per Whisper segment
-> agglomerative clustering into ``num_speakers`` groups -> speaker-labelled
transcript (also written to ``transcript.txt``) -> abstractive summary.

All models are loaded once at import time, and the Gradio app is launched at
the bottom of the file.
"""
import contextlib
import datetime
import subprocess
import wave

import gradio as gr
import numpy as np
import pyannote.audio
import torch
import whisper
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline

num_speakers = 2
language = 'English'
model_size = 'medium'

# Width of one embedding row in the matrix handed to the clusterer.
# NOTE(review): presumably the output size of the ECAPA model below - confirm
# before swapping in a different embedding model.
EMBEDDING_DIM = 192

# Use the GPU when there is one, but do not crash on CPU-only hosts.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

model = whisper.load_model(model_size)
model_name = model_size
audio = Audio()

summarizer = pipeline(
    "summarization",
    model="kabita-choudhary/finetuned-bart-for-conversation-summary",
)


def segmentembedding(segment, path=None, duration=None):
    """Return the speaker embedding for one transcription segment.

    Args:
        segment: Mapping with ``"start"`` and ``"end"`` times in seconds.
        path: Audio file the segment was transcribed from. Required; it has a
            default only so the parameter could be added without reordering
            the signature.
        duration: Total length of the audio in seconds. When given, the
            segment end is clamped to it so the crop never runs past the end
            of the file.

    Returns:
        Whatever ``embedding_model`` returns for the cropped waveform.

    Raises:
        ValueError: If ``path`` is not supplied.
    """
    if path is None:
        raise ValueError("segmentembedding() needs the path of the audio file")
    start = segment["start"]
    end = segment["end"] if duration is None else min(duration, segment["end"])
    waveform, _ = audio.crop(path, Segment(start, end))
    # The embedding model is called on a batch, hence the added leading axis.
    return embedding_model(waveform[None])


def time(secs):
    """Convert a number of seconds to a ``timedelta`` rounded to whole seconds."""
    return datetime.timedelta(seconds=round(secs))


def _label_speakers(segments, path, duration):
    """Return one integer cluster label per segment."""
    # Clustering needs at least two samples; with fewer there is only one
    # possible speaker anyway.
    if len(segments) < 2:
        return [0] * len(segments)
    embeddings = np.zeros(shape=(len(segments), EMBEDDING_DIM))
    for i, segment in enumerate(segments):
        embeddings[i] = segmentembedding(segment, path, duration)
    # NaN rows would make the clusterer fail, so they are zeroed first.
    embeddings = np.nan_to_num(embeddings)
    n_clusters = min(num_speakers, len(segments))
    return AgglomerativeClustering(n_clusters).fit(embeddings).labels_


def translatetotext(path):
    """Transcribe ``path``, attribute each segment to a speaker, and summarize.

    Non-WAV input is first converted to ``audio.wav`` with ffmpeg. The
    speaker-labelled transcript with timestamps is written to
    ``transcript.txt`` in the working directory.

    Args:
        path: Path of the uploaded audio file.

    Returns:
        A ``(transcript, summary)`` tuple. ``transcript`` is the labelled text
        and ``summary`` is the raw output of the summarization pipeline (an
        empty list when nothing was transcribed).

    Raises:
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
    """
    if not path.lower().endswith('.wav'):
        # check=True so a failed conversion is reported here rather than as a
        # confusing error when 'audio.wav' is opened later.
        subprocess.run(['ffmpeg', '-i', path, 'audio.wav', '-y'], check=True)
        path = 'audio.wav'

    result = model.transcribe(path)
    segments = result["segments"]
    print(segments)

    with contextlib.closing(wave.open(path, 'r')) as f:
        duration = f.getnframes() / float(f.getframerate())

    labels = _label_speakers(segments, path, duration)
    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)

    out_parts = []
    with open("transcript.txt", "w", encoding="utf-8") as f:
        previous_speaker = None
        for segment in segments:
            # Strip leading whitespace only; slicing off the first character
            # would eat a real letter when the text has no leading space.
            text = segment["text"].lstrip()
            if segment["speaker"] != previous_speaker:
                f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
                out_parts.append(segment["speaker"] + ': ')
                previous_speaker = segment["speaker"]
            f.write(text + ' ')
            out_parts.append(text + '\n')
    out = "".join(out_parts)

    if not out:
        # Nothing was transcribed, so there is nothing to summarize.
        return out, []
    # truncation=True keeps long transcripts within the model's input limit.
    summary = summarizer(out, truncation=True)
    return out, summary


def _transcribe_for_ui(path):
    """Adapt ``translatetotext`` for Gradio by returning the summary as text."""
    out, summary = translatetotext(path)
    summary_text = summary[0]["summary_text"] if summary else ""
    return out, summary_text


demo = gr.Interface(
    fn=_transcribe_for_ui,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=["text", "text"],
)
demo.launch(debug=True)