# Source: kabita-choudhary's Hugging Face Space — "Update app.py" (commit 731d7a6)
import os
import tempfile

import gradio as gr
import pandas as pd
from pydub import AudioSegment
from pyannote.audio import Pipeline
import whisper
# Load both models once at import time (first run downloads the weights).
model = whisper.load_model("medium")

# SECURITY: the Hugging Face token was previously hard-coded here. A token
# committed to source is compromised and must be revoked; read it from the
# environment (set HF_TOKEN in the Space's secrets) instead.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
def diarization(inp_audio):
    """Diarize an audio file and transcribe each speaker turn with Whisper.

    Parameters
    ----------
    inp_audio : str
        Path to a WAV file (Gradio supplies the uploaded file's path).

    Returns
    -------
    str
        One "start=..s stop=..s speaker_XX" line per turn, followed by one
        "SPEAKER_XX: <transcribed text>" line per turn.
    """
    # NOTE: the original bound the pipeline result to a local named
    # `diarization`, shadowing this function — renamed for clarity.
    annotation = pipeline(inp_audio)

    turn_rows = []
    timing_lines = []
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        turn_rows.append([turn.start, turn.end, speaker])
        line = f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}"
        print(line)
        timing_lines.append(line)

    df = pd.DataFrame(turn_rows, columns=['start', 'end', 'speaker'])

    # Transcribe each diarized segment of the source audio individually.
    df['text'] = [
        generatetext(inp_audio, row.start, row.end) for row in df.itertuples()
    ]

    transcript_lines = [f"{row.speaker}: {row.text}" for row in df.itertuples()]

    # Persist the transcript; the context manager closes the handle (the
    # original also leaked a second handle via open(...).read() to echo it).
    with open('my_file.txt', 'w') as my_file:
        my_file.writelines(line + '\n' for line in transcript_lines)
    print('\n'.join(transcript_lines))

    # str.join instead of quadratic += concatenation in the loops above.
    return "".join(line + "\n" for line in timing_lines + transcript_lines)
def generatetext(filename, starttime, endtime):
    """Transcribe the [starttime, endtime] slice of a WAV file with Whisper.

    Parameters
    ----------
    filename : str
        Path to the source WAV file.
    starttime, endtime : float
        Segment boundaries in seconds.

    Returns
    -------
    str or None
        The transcribed text for the segment (Whisper's "text" field).
    """
    # pydub slices in milliseconds.
    start_ms = int(starttime * 1000)
    end_ms = int(endtime * 1000)
    segment = AudioSegment.from_wav(filename)[start_ms:end_ms]

    # Export to a unique temporary file instead of a fixed 'audio.wav' in the
    # CWD: the fixed name collided under concurrent requests and the file was
    # never removed. Always clean up, even if transcription fails.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        tmp.close()
        segment.export(tmp.name, format="wav")
        audio = whisper.load_audio(tmp.name)
        result = model.transcribe(audio)
    finally:
        os.remove(tmp.name)

    return result.get("text")
# --- Gradio UI -------------------------------------------------------------
# NOTE(review): gr.Box and Row().style() are deprecated/removed in newer
# Gradio releases — confirm the pinned gradio version still supports them.
block = gr.Blocks()
with block:
    with gr.Group():
        with gr.Box():
            with gr.Row().style():
                # type="filepath" makes Gradio hand diarization() a path string.
                # NOTE(review): mirror_webcam is a gr.Video parameter, not a
                # gr.Audio one — confirm it isn't rejected at runtime.
                inp_audio = gr.Audio(
                    label="Input Audio",
                    type="filepath",
                    mirror_webcam = False
                )
    # Plain textbox that receives the combined timing + transcript report.
    outputdialogs = gr.Textbox()
    btn = gr.Button("Generate Text")
    # Wire the button to the pipeline; also exposed as API endpoint "view_api".
    btn.click(diarization, inputs=[inp_audio], outputs=[outputdialogs], api_name="view_api")
# Queue long-running jobs; debug=True surfaces tracebacks in the Space logs.
block.launch(enable_queue = True, debug=True)