kabita-choudhary committed on
Commit 4e9325e
1 parent: ce976bc

Create app.py

Files changed (1):
  app.py (+79, -0)
app.py ADDED

import datetime
import subprocess
import contextlib
import wave

import numpy as np
import torch
import whisper
import gradio as gr
from pyannote.audio import Audio
from pyannote.core import Segment
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline

# Speaker-embedding model used to cluster transcript segments by voice.
# Fall back to CPU when no GPU is available (e.g. on a CPU-only Space).
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

num_speakers = 2
language = 'English'  # informational; Whisper auto-detects the language
model_size = 'medium'

model = whisper.load_model(model_size)
audio = Audio()
summarizer = pipeline("summarization",
                      model="kabita-choudhary/finetuned-bart-for-conversation-summary")


def segment_embedding(path, duration, segment):
    # Crop the audio to the segment boundaries (clamped to the file length)
    # and return its speaker embedding.
    start = segment["start"]
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    waveform, sample_rate = audio.crop(path, clip)
    return embedding_model(waveform[None])


def time(secs):
    return datetime.timedelta(seconds=round(secs))


def translatetotext(path):
    out = ""
    # Whisper and the wave module need WAV input; convert anything else with ffmpeg.
    if path[-3:] != 'wav':
        subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
        path = 'audio.wav'
    result = model.transcribe(path)
    segments = result["segments"]
    print(segments)
    # Total duration in seconds, used to clamp the final segment's end time.
    with contextlib.closing(wave.open(path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
    # Embed every segment, then cluster the embeddings into num_speakers groups.
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(path, duration, segment)
    embeddings = np.nan_to_num(embeddings)
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    labels = clustering.labels_
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
    # Write a speaker-labelled transcript, starting a new block whenever
    # the speaker changes, and accumulate the same text for the UI output.
    with open("transcript.txt", "w") as f:
        for i, segment in enumerate(segments):
            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
                f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
                out = out + segment["speaker"]
            f.write(segment["text"][1:] + ' ')
            out = out + segment["text"][1:] + '\n'
    # The summarization pipeline returns [{"summary_text": ...}].
    summary = summarizer(out)[0]["summary_text"]
    return out, summary


demo = gr.Interface(
    fn=translatetotext,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=["text", "text"],
)
demo.launch(debug=True)
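
The commit ships no requirements file, so below is a minimal sketch of the dependencies this Space would need; the version pin is an assumption rather than anything recorded in the commit (gr.Audio(source=..., type=...) is the pre-4.0 Gradio API, and ffmpeg must additionally be available on the PATH for the format conversion):

# requirements.txt (illustrative; not part of the commit)
gradio<4.0
openai-whisper
pyannote.audio
speechbrain
transformers
torch
scikit-learn
numpy

The pipeline can also be exercised without the UI, e.g. transcript, summary = translatetotext("meeting.wav"), where meeting.wav is a hypothetical two-speaker recording (num_speakers is hard-coded to 2).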