Update app.py
app.py (CHANGED)
@@ -1,7 +1,7 @@
 import datetime
 import os
 os.system('pip install git+https://github.com/openai/whisper.git')
-from whisper.audio import N_SAMPLES
+from whisper.audio import N_SAMPLES
 import gradio as gr
 import wave
 import whisper
@@ -12,8 +12,8 @@ import torchaudio.functional as F
 LOGGING_FORMAT = '%(asctime)s %(message)s'
 logging.basicConfig(format=LOGGING_FORMAT,level=logging.INFO)
 
-
-
+RECOGNITION_INTERVAL = 2
+CNT_PER_CHUNK = 6
 # tmp dir to store audio files.
 if not os.path.isdir('./tmp/'):
     os.mkdir('./tmp')
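Taken together, the two added constants drive the chunking logic in the rest of this diff: CNT_PER_CHUNK groups consecutive streaming callbacks into a single audio file, and RECOGNITION_INTERVAL sets how often Whisper is run within a chunk (every second callback here).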
@@ -68,8 +68,9 @@ def transcribe(audio, state={}):
     print('=====================')
     logging.info(state)
     # Whisper only take maximum 30s of audio as input.
-    # And the gradio streaming does not guarantee each callback is 1s,
-    #
+    # And the gradio streaming does not guarantee each callback is 1s, And I set CNT_PER_CHUNK as 6, it's just a rough guess that 6 callbacks does not sum up an audio longer than 30s.
+    # The logic of determine chunk could be improved by reading exact how many samples in audio files.
+    # After count reach CNT_PER_CHUNK * n, a new audio file is created.
     # However the text should not change.
 
     if not state:
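The second new comment points at the natural refinement: rather than assuming each callback carries roughly one second of audio, read the exact sample count from the WAV files themselves. A minimal sketch of that idea using the stdlib wave module (which app.py already imports); clip_duration_s is an illustrative helper name, not part of this commit:

    import wave

    def clip_duration_s(path):
        # Exact duration of a WAV clip: frame count divided by sample rate.
        with wave.open(path, 'rb') as w:
            return w.getnframes() / w.getframerate()

Chunk boundaries could then be closed once the accumulated duration approaches Whisper's 30-second window, instead of after a fixed number of callbacks.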
@@ -78,8 +79,8 @@ def transcribe(audio, state={}):
         state['chunks'] = {}
         return state['all_chunk_texts'], state
 
-    chunk = state['count'] //
-    chunk_offset = state['count'] %
+    chunk = state['count'] // CNT_PER_CHUNK
+    chunk_offset = state['count'] % CNT_PER_CHUNK
 
     if chunk_offset == 0:
         state['chunks'][chunk] = {}
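With CNT_PER_CHUNK = 6, the floor division and modulo map the running callback count to a (chunk, offset) pair: callbacks 0-5 fill chunk 0 at offsets 0-5, callback 6 opens chunk 1 (offset 0, which takes the new-chunk branch above), and so on. A standalone check of the arithmetic:

    CNT_PER_CHUNK = 6
    for count in (0, 5, 6, 11, 12):
        chunk, chunk_offset = count // CNT_PER_CHUNK, count % CNT_PER_CHUNK
        print(count, '->', (chunk, chunk_offset))  # 0->(0,0), 5->(0,5), 6->(1,0), 11->(1,5), 12->(2,0)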
@@ -91,7 +92,7 @@ def transcribe(audio, state={}):
     state['count'] += 1
 
     # Determin if recognizes current chunk.
-    if (chunk_offset + 1) %
+    if (chunk_offset + 1) % RECOGNITION_INTERVAL == 0 and chunk_offset > 0:
         logging.info(f'start to transcribe chunk: {chunk}, offset: {chunk_offset}')
         result = whisper_model.transcribe_audio_file(state['chunks'][chunk]['concated_audio'])
         logging.info('complete transcribe.......')
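With RECOGNITION_INTERVAL = 2 the restored condition fires at offsets 1, 3 and 5, i.e. on every second callback of a chunk; the chunk_offset > 0 guard presumably keeps a chunk's very first callback from triggering recognition if the interval were ever lowered to 1. Enumerating the trigger offsets:

    RECOGNITION_INTERVAL = 2
    CNT_PER_CHUNK = 6
    triggers = [o for o in range(CNT_PER_CHUNK)
                if (o + 1) % RECOGNITION_INTERVAL == 0 and o > 0]
    print(triggers)  # [1, 3, 5] -- note the chunk's final offset, 5, is included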
@@ -111,7 +112,7 @@ def transcribe(audio, state={}):
     return state['all_chunk_texts'], state
 
 # Make sure not missing any audio clip.
-assert
+assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
 
 gr.Interface(fn=transcribe,
              inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
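The restored assert encodes the invariant behind the "not missing any audio clip" comment: when CNT_PER_CHUNK is divisible by RECOGNITION_INTERVAL, a chunk's last callback (offset CNT_PER_CHUNK - 1) always satisfies the trigger condition above, so every chunk receives a final transcription covering all of its audio before the next file starts. A quick sanity check of that reasoning:

    RECOGNITION_INTERVAL = 2
    CNT_PER_CHUNK = 6
    assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
    last_offset = CNT_PER_CHUNK - 1
    # The final callback of each chunk triggers recognition, so no audio is dropped.
    assert (last_offset + 1) % RECOGNITION_INTERVAL == 0 and last_offset > 0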