supersonictw committed
Commit 52024f3 · verified · 0 Parent(s)

Upload 20221008 artifacts

._mic_vad_streaming.py ADDED
Binary file (4.1 kB).
 
.gitattributes ADDED
@@ -0,0 +1,6 @@
+ deepspeech-0.9.3-models-zh-TW.pbmm filter=lfs diff=lfs merge=lfs -text
+ deepspeech-0.9.3-models-zh-TW.scorer filter=lfs diff=lfs merge=lfs -text
+ deepspeech-0.9.3-models-zh-TW.tflite filter=lfs diff=lfs merge=lfs -text
+ deepspeech-0.9.3-models.pbmm filter=lfs diff=lfs merge=lfs -text
+ deepspeech-0.9.3-models.scorer filter=lfs diff=lfs merge=lfs -text
+ deepspeech-0.9.3-models.tflite filter=lfs diff=lfs merge=lfs -text
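
These six entries route the model binaries through the Git LFS filter, so the repository stores small pointer files instead of the blobs themselves. As a minimal sketch (not part of the commit), the tracked patterns can be read back like this, assuming .gitattributes sits in the current directory:

# Minimal sketch: list which patterns .gitattributes routes through Git LFS.
# The path is an assumption; adjust it to wherever the repo is checked out.
def lfs_patterns(path=".gitattributes"):
    patterns = []
    with open(path) as fh:
        for line in fh:
            parts = line.split()
            # An LFS-tracked entry looks like: <pattern> filter=lfs diff=lfs merge=lfs -text
            if parts and "filter=lfs" in parts[1:]:
                patterns.append(parts[0])
    return patterns

print(lfs_patterns())  # e.g. ['deepspeech-0.9.3-models-zh-TW.pbmm', ...]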
.python-version ADDED
@@ -0,0 +1 @@
+ 3.7
LICENSE ADDED
@@ -0,0 +1 @@
+
deepspeech-0.9.3-models-zh-TW.pbmm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d04351d5526290aa1d18195f92d7f5eb84c0eeab437c804dd241c67df3d3dd1
+ size 190777619
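
Each model file in this commit is stored as a Git LFS pointer like the three lines above: a spec version, a sha256 oid, and the blob size in bytes. A minimal sketch (not part of the commit) that checks a separately downloaded blob against such a pointer; both paths are illustrative:

import hashlib, os

def read_pointer(pointer_path):
    """Parse a Git LFS pointer file into its key/value fields."""
    fields = {}
    with open(pointer_path) as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def verify(pointer_path, blob_path):
    """Compare a downloaded blob's byte size and sha256 digest against the pointer."""
    fields = read_pointer(pointer_path)
    if os.path.getsize(blob_path) != int(fields["size"]):
        return False
    digest = hashlib.sha256()
    with open(blob_path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == fields["oid"].split(":", 1)[1]

# Hypothetical usage:
# verify("pointer.txt", "deepspeech-0.9.3-models-zh-TW.pbmm")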
deepspeech-0.9.3-models-zh-TW.scorer ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2687d968f46189504d9b9edc0f914f6b4b39c4d94c73f75b6a6c180d33f30240
+ size 67141744
deepspeech-0.9.3-models-zh-TW.tflite ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dbd3461aaba5f320b71f3c53e49a0924afb8a8eb383364f07dfb8cc51364e396
+ size 47798728
deepspeech-0.9.3-models.pbmm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebe9b4f64bda3591acd723c27629f101d1bb1ec487730d9f882bcfe03214462d
+ size 188915987
deepspeech-0.9.3-models.scorer ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0cf926ab9cab54a8a7d70003b931b2d62ebd9105ed392d1ec9c840029867799
+ size 953363776
deepspeech-0.9.3-models.tflite ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a88f98ff15c9bf760bf7da035b9dafae240e7eb000af376f87e052aae331203
+ size 47331784
mic_vad_streaming.py ADDED
@@ -0,0 +1,223 @@
+ import time, logging
+ from datetime import datetime
+ import threading, collections, queue, os, os.path
+ import deepspeech
+ import numpy as np
+ import pyaudio
+ import wave
+ import webrtcvad
+ from halo import Halo
+ from scipy import signal
+
+ logging.basicConfig(level=20)  # 20 == logging.INFO
+
+ class Audio(object):
+     """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""
+
+     FORMAT = pyaudio.paInt16
+     # Network/VAD rate-space
+     RATE_PROCESS = 16000
+     CHANNELS = 1
+     BLOCKS_PER_SECOND = 50
+
+     def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS, file=None):
+         def proxy_callback(in_data, frame_count, time_info, status):
+             #pylint: disable=unused-argument
+             if self.chunk is not None:
+                 in_data = self.wf.readframes(self.chunk)
+             callback(in_data)
+             return (None, pyaudio.paContinue)
+         if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
+         self.buffer_queue = queue.Queue()
+         self.device = device
+         self.input_rate = input_rate
+         self.sample_rate = self.RATE_PROCESS
+         self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND))
+         self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND))
+         self.pa = pyaudio.PyAudio()
+
+         kwargs = {
+             'format': self.FORMAT,
+             'channels': self.CHANNELS,
+             'rate': self.input_rate,
+             'input': True,
+             'frames_per_buffer': self.block_size_input,
+             'stream_callback': proxy_callback,
+         }
+
+         self.chunk = None
+         # if not default device
+         if self.device:
+             kwargs['input_device_index'] = self.device
+         elif file is not None:
+             self.chunk = 320
+             self.wf = wave.open(file, 'rb')
+
+         self.stream = self.pa.open(**kwargs)
+         self.stream.start_stream()
+
+     def resample(self, data, input_rate):
+         """
+         Microphone may not support our native processing sampling rate, so
+         resample from input_rate to RATE_PROCESS here for webrtcvad and
+         deepspeech
+         Args:
+             data (binary): Input audio stream
+             input_rate (int): Input audio rate to resample from
+         """
+         data16 = np.frombuffer(data, dtype=np.int16)  # np.fromstring is removed in modern NumPy
+         resample_size = int(len(data16) / self.input_rate * self.RATE_PROCESS)
+         resample = signal.resample(data16, resample_size)
+         resample16 = np.array(resample, dtype=np.int16)
+         return resample16.tobytes()  # tostring() is a deprecated alias
+
+     def read_resampled(self):
+         """Return a block of audio data resampled to 16000 Hz, blocking if necessary."""
+         return self.resample(data=self.buffer_queue.get(),
+                              input_rate=self.input_rate)
+
+     def read(self):
+         """Return a block of audio data, blocking if necessary."""
+         return self.buffer_queue.get()
+
+     def destroy(self):
+         self.stream.stop_stream()
+         self.stream.close()
+         self.pa.terminate()
+
+     frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)
+
+     def write_wav(self, filename, data):
+         logging.info("write wav %s", filename)
+         wf = wave.open(filename, 'wb')
+         wf.setnchannels(self.CHANNELS)
+         # wf.setsampwidth(self.pa.get_sample_size(FORMAT))
+         assert self.FORMAT == pyaudio.paInt16
+         wf.setsampwidth(2)
+         wf.setframerate(self.sample_rate)
+         wf.writeframes(data)
+         wf.close()
+
+
+ class VADAudio(Audio):
+     """Filter & segment audio with voice activity detection."""
+
+     def __init__(self, aggressiveness=3, device=None, input_rate=None, file=None):
+         super().__init__(device=device, input_rate=input_rate, file=file)
+         self.vad = webrtcvad.Vad(aggressiveness)
+
+     def frame_generator(self):
+         """Generator that yields all audio frames from microphone."""
+         if self.input_rate == self.RATE_PROCESS:
+             while True:
+                 yield self.read()
+         else:
+             while True:
+                 yield self.read_resampled()
+
+     def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
+         """Generator that yields series of consecutive audio frames comprising each utterance, separated by yielding a single None.
+         Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
+         Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
+                   |---utterance---|        |---utterance---|
+         """
+         if frames is None: frames = self.frame_generator()
+         num_padding_frames = padding_ms // self.frame_duration_ms
+         ring_buffer = collections.deque(maxlen=num_padding_frames)
+         triggered = False
+
+         for frame in frames:
+             if len(frame) < 640:  # webrtcvad needs a full 20 ms frame (640 bytes of int16 at 16 kHz)
+                 return
+
+             is_speech = self.vad.is_speech(frame, self.sample_rate)
+
+             if not triggered:
+                 ring_buffer.append((frame, is_speech))
+                 num_voiced = len([f for f, speech in ring_buffer if speech])
+                 if num_voiced > ratio * ring_buffer.maxlen:
+                     triggered = True
+                     for f, s in ring_buffer:
+                         yield f
+                     ring_buffer.clear()
+
+             else:
+                 yield frame
+                 ring_buffer.append((frame, is_speech))
+                 num_unvoiced = len([f for f, speech in ring_buffer if not speech])
+                 if num_unvoiced > ratio * ring_buffer.maxlen:
+                     triggered = False
+                     yield None
+                     ring_buffer.clear()
+
+ def main(ARGS):
+     # Load DeepSpeech model
+     if os.path.isdir(ARGS.model):
+         model_dir = ARGS.model
+         ARGS.model = os.path.join(model_dir, 'output_graph.pb')
+         ARGS.scorer = os.path.join(model_dir, ARGS.scorer)
+
+     print('Initializing model...')
+     logging.info("ARGS.model: %s", ARGS.model)
+     model = deepspeech.Model(ARGS.model)
+     if ARGS.scorer:
+         logging.info("ARGS.scorer: %s", ARGS.scorer)
+         model.enableExternalScorer(ARGS.scorer)
+
+     # Start audio with VAD
+     vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
+                          device=ARGS.device,
+                          input_rate=ARGS.rate,
+                          file=ARGS.file)
+     print("Listening (ctrl-C to exit)...")
+     frames = vad_audio.vad_collector()
+
+     # Stream from microphone to DeepSpeech using VAD
+     spinner = None
+     if not ARGS.nospinner:
+         spinner = Halo(spinner='line')
+     stream_context = model.createStream()
+     wav_data = bytearray()
+     for frame in frames:
+         if frame is not None:
+             if spinner: spinner.start()
+             logging.debug("streaming frame")
+             stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
+             if ARGS.savewav: wav_data.extend(frame)
+         else:
+             if spinner: spinner.stop()
+             logging.debug("end utterance")
+             if ARGS.savewav:
+                 vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
+                 wav_data = bytearray()
+             text = stream_context.finishStream()
+             print("Recognized: %s" % text)
+             stream_context = model.createStream()
+
+ if __name__ == '__main__':
+     DEFAULT_SAMPLE_RATE = 16000
+
+     import argparse
+     parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
+
+     parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
+                         help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
+     parser.add_argument('--nospinner', action='store_true',
+                         help="Disable spinner")
+     parser.add_argument('-w', '--savewav',
+                         help="Save .wav files of utterances to given directory")
+     parser.add_argument('-f', '--file',
+                         help="Read from .wav file instead of microphone")
+
+     parser.add_argument('-m', '--model', required=True,
+                         help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
+     parser.add_argument('-s', '--scorer',
+                         help="Path to the external scorer file.")
+     parser.add_argument('-d', '--device', type=int, default=None,
+                         help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
+     parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
+                         help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
+
+     ARGS = parser.parse_args()
+     if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
+     main(ARGS)
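
The script drives DeepSpeech's streaming recognizer (Model, enableExternalScorer, createStream, feedAudioContent, finishStream) from a VAD-gated microphone loop. For reference, a minimal sketch of the same API without the microphone or VAD, assuming a 16 kHz mono 16-bit WAV at an illustrative path:

import wave
import numpy as np
import deepspeech

# Paths are illustrative; the model and scorer names match the files in this commit.
model = deepspeech.Model("deepspeech-0.9.3-models.pbmm")
model.enableExternalScorer("deepspeech-0.9.3-models.scorer")

stream = model.createStream()
with wave.open("test.wav", "rb") as wf:  # assumed to be 16 kHz, mono, 16-bit
    while True:
        chunk = wf.readframes(320)  # 20 ms blocks, matching the script's block size
        if not chunk:
            break
        stream.feedAudioContent(np.frombuffer(chunk, np.int16))
print("Recognized:", stream.finishStream())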
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ deepspeech~=0.9.3
+ pyaudio~=0.2.11
+ halo~=0.0.18
+ numpy>=1.15.1
+ scipy>=1.1.0
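
One caveat: mic_vad_streaming.py also imports webrtcvad, which this requirements.txt does not list, so it must be installed separately. webrtcvad accepts only 10, 20, or 30 ms frames of 16-bit mono PCM, which is why the script uses BLOCKS_PER_SECOND = 50: 20 ms blocks of 320 samples (640 bytes) at 16 kHz, matching the len(frame) < 640 guard in vad_collector. A minimal sketch of that contract:

import webrtcvad

vad = webrtcvad.Vad(3)  # aggressiveness 0-3; the script defaults to 3
frame = b"\x00\x00" * 320  # 20 ms of int16 silence at 16 kHz = 640 bytes
print(vad.is_speech(frame, 16000))  # typically False for pure silence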