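# Gradio app: transcribe an uploaded video with Whisper, assign each transcript
# segment to a speaker using SpeechBrain/pyannote speaker embeddings plus
# agglomerative clustering, and summarize the conversation with a fine-tuned BART model.
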
import contextlib
import datetime
import os
import subprocess
import wave

import gradio as gr
import numpy as np
import torch
import whisper
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline

# Speaker-embedding model used for diarization (GPU if available, otherwise CPU).
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Dialogue summarizer: BART fine-tuned for conversation summaries.
summarizer = pipeline("summarization", model="kabita-choudhary/finetuned-bart-for-conversation-summary")

def time(secs):
    # Render a timestamp in seconds as h:mm:ss.
    return datetime.timedelta(seconds=round(secs))
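
# Core pipeline: convert the video to WAV, transcribe with Whisper, diarize by
# clustering per-segment speaker embeddings, then summarize the labelled transcript.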
def translatetotext(vpath, no_of_speaker):
    num_speakers = no_of_speaker
    language = 'English'
    model_size = 'small'
    model = whisper.load_model(model_size)
    model_name = model_size

    # Convert the upload to 16 kHz mono PCM WAV, the format both Whisper and
    # pyannote expect.
    _, file_ending = os.path.splitext(f'{vpath}')
    print(f'file ending is {file_ending}')
    path = vpath.replace(file_ending, ".wav")
    print("starting conversion to wav")
    subprocess.run(
        ["ffmpeg", "-i", vpath, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", path],
        check=True,
    )

    # Transcribe; Whisper returns timestamped segments of text.
    result = model.transcribe(path)
    segments = result["segments"]
    print(segments)

    duration = 0
    with contextlib.closing(wave.open(path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
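
    # Embed each transcript segment with the speaker-verification model; the
    # ECAPA embedding is 192-dimensional.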
    def segment_embedding(segment):
        audio = Audio()
        start = segment["start"]
        # Whisper overshoots the end timestamp in the last segment
        end = min(duration, segment["end"])
        clip = Segment(start, end)
        waveform, sample_rate = audio.crop(path, clip)
        return embedding_model(waveform[None])

    embeddings = np.zeros(shape=(len(segments), 192))
    print(duration)
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment)
    embeddings = np.nan_to_num(embeddings)

    # Group the segment embeddings into the requested number of speakers.
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    labels = clustering.labels_
    print(labels)
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

    # Write the speaker-labelled transcript; the speaker tag and timestamps are
    # emitted only when the speaker changes.
    out = ""
    with open("transcript.txt", "w") as f:
        for i, segment in enumerate(segments):
            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
                header = segment["speaker"] + ' ' + str(time(segment["start"])) + ' ' + str(time(segment["end"])) + ' '
                f.write(header)
                out = out + header
            f.write(segment["text"][1:] + '\n')
            out = out + segment["text"][1:] + '\n'

    # Summarize the whole conversation and save it next to the transcript.
    summary = summarizer(out)
    with open("summary.txt", "w") as f:
        f.write(summary[0]["summary_text"])
    return out, summary[0]["summary_text"], "transcript.txt", "summary.txt"
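

# Gradio UI: a video upload and a speaker-count field in; the transcript and the
# summary come back as text and as downloadable files.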
demo = gr.Interface(
    fn=translatetotext,
    inputs=[
        gr.Video(source="upload", type="filepath"),
        gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True),
    ],
    outputs=["text", "text", gr.File(label="Download transcript"), gr.File(label="Download summary")],
)
demo.launch(debug=True)