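# Gradio app: transcribe an uploaded video with Whisper, attribute each
# transcript segment to a speaker by clustering ECAPA speaker embeddings,
# and summarize the resulting conversation with a fine-tuned BART model.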
import contextlib
import datetime
import os
import subprocess
import wave

import gradio as gr
import numpy as np
import torch
import whisper
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline

# Speaker-embedding model used for diarization (one 192-dim vector per segment).
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Dialogue summarizer: BART fine-tuned for conversation summaries.
summarizer = pipeline(
    "summarization",
    model="kabita-choudhary/finetuned-bart-for-conversation-summary",
)
def format_time(secs):
    # Round to whole seconds; str() of a timedelta renders as H:MM:SS.
    return datetime.timedelta(seconds=round(secs))
def translatetotext(vpath, no_of_speaker):
    num_speakers = no_of_speaker
    language = 'English'
    model_size = 'small'
    model = whisper.load_model(model_size)
    model_name = model_size
    _, file_ending = os.path.splitext(vpath)
    print(f'file ending is {file_ending}')
    path = vpath.replace(file_ending, ".wav")
    print("starting conversion to wav")
    # Extract the audio track as 16 kHz mono PCM; -y overwrites any previous conversion.
    subprocess.call(['ffmpeg', '-y', '-i', vpath, '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le', path])
    result = model.transcribe(path)
    segments = result["segments"]
    print(segments)
    duration = 0
    # Total duration of the wav file, used to clamp Whisper's last segment end time.
    with contextlib.closing(wave.open(path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
    def segment_embedding(segment):
        # Crop the segment out of the wav file and compute its speaker embedding.
        audio = Audio()
        start = segment["start"]
        # Whisper overshoots the end timestamp in the last segment, so clamp it.
        end = min(duration, segment["end"])
        clip = Segment(start, end)
        waveform, sample_rate = audio.crop(path, clip)
        return embedding_model(waveform[None])
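    # Diarization by clustering: embed every transcript segment, then group the
    # embeddings into num_speakers clusters; each cluster is treated as one speaker.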
    embeddings = np.zeros(shape=(len(segments), 192))
    print(duration)
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment)
    embeddings = np.nan_to_num(embeddings)

    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    labels = clustering.labels_
    print(labels)
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
    # Write the diarized transcript: print the speaker tag and timestamps only
    # when the speaker changes, then append each segment's text.
    out = ""
    with open("transcript.txt", "w") as f:
        for i, segment in enumerate(segments):
            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
                header = segment["speaker"] + ' ' + str(format_time(segment["start"])) + ' ' + str(format_time(segment["end"])) + ' '
                f.write(header)
                out = out + header
            f.write(segment["text"][1:] + '\n')
            out = out + segment["text"][1:] + '\n'

    # Summarize the full transcript and save it next to the transcript file.
    # Note: very long transcripts may exceed the BART summarizer's input limit.
    summary = summarizer(out)
    with open("summary.txt", "w") as f:
        f.write(summary[0]["summary_text"])
    return out, summary[0]["summary_text"], "transcript.txt", "summary.txt"
demo = gr.Interface(
    fn=translatetotext,
    inputs=[
        gr.Video(source="upload", type="filepath"),
        gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True),
    ],
    outputs=["text", "text", gr.File(label="Download transcript"), gr.File(label="Download summary")],
)
demo.launch(debug=True)
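# The pipeline can also be called directly, bypassing the UI (the file name
# "meeting.mp4" and the speaker count here are placeholders for illustration):
#   transcript, summary, transcript_path, summary_path = translatetotext("meeting.mp4", 2)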