import whisper
import datetime
import subprocess
import contextlib
import wave
import os

import torch
import numpy as np
import gradio as gr
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment

# Speaker-embedding model used to cluster transcript segments by speaker.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Summarization model fine-tuned for conversation summaries.
summarizer = pipeline(
    "summarization",
    model="kabita-choudhary/finetuned-bart-for-conversation-summary",
)


def time(secs):
    """Format a timestamp in seconds as HH:MM:SS."""
    return datetime.timedelta(seconds=round(secs))


def translatetotext(vpath, no_of_speaker):
    num_speakers = no_of_speaker
    model_size = 'small'
    model = whisper.load_model(model_size)

    # Convert the uploaded video to 16 kHz mono 16-bit PCM WAV for Whisper and pyannote.
    _, file_ending = os.path.splitext(vpath)
    print(f'file ending is {file_ending}')
    path = vpath.replace(file_ending, ".wav")
    print("starting conversion to wav")
    subprocess.run(
        ["ffmpeg", "-y", "-i", vpath, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", path],
        check=True,
    )

    # Transcribe with Whisper; each segment carries start/end timestamps and text.
    result = model.transcribe(path)
    segments = result["segments"]
    print(segments)

    # Total audio duration, used to clamp the last segment's end timestamp.
    with contextlib.closing(wave.open(path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)

    def segment_embedding(segment):
        """Return a speaker embedding for one transcript segment."""
        audio = Audio()
        start = segment["start"]
        # Whisper overshoots the end timestamp in the last segment.
        end = min(duration, segment["end"])
        clip = Segment(start, end)
        waveform, sample_rate = audio.crop(path, clip)
        return embedding_model(waveform[None])

    # ECAPA-TDNN embeddings are 192-dimensional.
    embeddings = np.zeros(shape=(len(segments), 192))
    print(duration)
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment)
    embeddings = np.nan_to_num(embeddings)

    # Cluster segment embeddings into the requested number of speakers.
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    labels = clustering.labels_
    print(labels)
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

    # Write the speaker-labelled transcript; emit a speaker header only when the speaker changes.
    out = ""
    with open("transcript.txt", "w") as f:
        for i, segment in enumerate(segments):
            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
                header = (segment["speaker"] + ' ' + str(time(segment["start"])) + ' '
                          + str(time(segment["end"])) + ' ')
                f.write(header)
                out += header
            f.write(segment["text"][1:] + '\n')
            out += segment["text"][1:] + '\n'

    # Summarize the full transcript and save it alongside the transcript file.
    summary = summarizer(out)
    with open("summary.txt", "w") as f:
        f.write(summary[0]["summary_text"])

    return out, summary[0]["summary_text"], "transcript.txt", "summary.txt"


demo = gr.Interface(
    fn=translatetotext,
    inputs=[
        gr.Video(source="upload", type="filepath"),
        gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True),
    ],
    outputs=[
        "text",
        "text",
        gr.File(label="Download transcript"),
        gr.File(label="Download summary"),
    ],
)
demo.launch(debug=True)