Liusuthu committed on
Commit
0a209fb
·
verified ·
1 Parent(s): ef0acdf

Update app_utils.py

Browse files
Files changed (1) hide show
  1. app_utils.py +240 -232
app_utils.py CHANGED
@@ -97,248 +97,256 @@ def text_api(text:str):
97
  #######################################################################
98
  #规范函数,只管值输入输出:
99
  def text_score(text):
100
- string=text_api(text)
101
- part1 = str.partition(string, r"text")
102
- want1 = part1[2]
103
- label = want1[4:6]
104
- part2 = str.partition(string, r"probability")
105
- want2 = part2[2]
106
- prob = float(want2[3:-4])
107
- if label=="正向":
108
- score=-np.log10(prob*10)
109
  else:
110
- score=np.log10(prob*10)
111
- # print("from func:text_score————,text:",text,",score:",score)
112
- return text,score
 
 
 
 
 
 
 
 
 
 
113
 
114
  def speech_score(audio):
115
- print(type(audio))
116
- print(audio)
117
- sample_rate, signal = audio # 这是语音的输入
118
- signal = signal.astype(np.float32)
119
- signal /= np.max(np.abs(signal))
120
- sf.write("data/a.wav", signal, sample_rate)
121
- signal, sample_rate = torchaudio.load("data/a.wav")
122
- signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
123
- signal
124
- )
125
- torchaudio.save("data/out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)
126
- Audio = "data/out.wav"
127
- speech, sample_rate = AudioReader.read_wav_file(Audio)
128
- if signal == "none":
129
- return "none", "none", "haha"
130
  else:
131
- segments = vad.segments_offline(speech)
132
- text_results = ""
133
- for part in segments:
134
- _result = ASR_model.infer_offline(
135
- speech[part[0] * 16 : part[1] * 16], hot_words="任意热词 空格分开"
136
- )
137
- text_results += punc.punctuate(_result)[0]
138
-
139
- out_prob, score, index, text_lab = classifier.classify_batch(signal1)
140
- print("from func:speech_score————type and value of prob:")
141
- print(type(out_prob.squeeze(0).numpy()))
142
- print(out_prob.squeeze(0).numpy())
143
- print("from func:speech_score————type and value of resul_label:")
144
- print(type(text_lab[-1]))
145
- print(text_lab[-1])
146
- #return text_results, out_prob.squeeze(0).numpy(), text_lab[-1], Audio
147
- prob=out_prob.squeeze(0).numpy()
148
- #print(prob)
149
- score2=10*prob[0]-10*prob[1]
150
- if score2>=0:
151
- score2=np.log10(score2)
152
  else:
153
- score2=-np.log10(-score2)
154
- # print("from func:speech_score————score2:",score2)
155
- # print("from func:speech_score————",text_lab[-1])
156
-
157
- text,score1=text_score(text_results)
158
- # # text_emo=str(get_text_score(text_results))
159
- # print("from func:speech_score————text:",text,",score1:",score1)
160
- score=(2/3)*score1+(1/3)*score2
161
-
162
-
163
- return text,score
164
-
165
-
166
- def video_score(video):
167
 
168
- cap = cv2.VideoCapture(video)
169
- w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
170
- h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
171
- fps = np.round(cap.get(cv2.CAP_PROP_FPS))
172
-
173
- path_save_video_face = 'result_face.mp4'
174
- vid_writer_face = cv2.VideoWriter(path_save_video_face, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224))
175
-
176
- # path_save_video_hm = 'result_hm.mp4'
177
- # vid_writer_hm = cv2.VideoWriter(path_save_video_hm, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224))
178
-
179
- lstm_features = []
180
- count_frame = 1
181
- count_face = 0
182
- probs = []
183
- frames = []
184
- last_output = None
185
- last_heatmap = None
186
- cur_face = None
187
-
188
- with mp_face_mesh.FaceMesh(
189
- max_num_faces=1,
190
- refine_landmarks=False,
191
- min_detection_confidence=0.5,
192
- min_tracking_confidence=0.5) as face_mesh:
193
-
194
- while cap.isOpened():
195
- _, frame = cap.read()
196
- if frame is None: break
197
-
198
- frame_copy = frame.copy()
199
- frame_copy.flags.writeable = False
200
- frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB)
201
- results = face_mesh.process(frame_copy)
202
- frame_copy.flags.writeable = True
203
-
204
- if results.multi_face_landmarks:
205
- for fl in results.multi_face_landmarks:
206
- startX, startY, endX, endY = get_box(fl, w, h)
207
- cur_face = frame_copy[startY:endY, startX: endX]
208
-
209
- if count_face%config_data.FRAME_DOWNSAMPLING == 0:
210
- cur_face_copy = pth_processing(Image.fromarray(cur_face))
211
- with torch.no_grad():
212
- features = torch.nn.functional.relu(pth_model_static.extract_features(cur_face_copy)).detach().numpy()
213
-
214
- # grayscale_cam = cam(input_tensor=cur_face_copy)
215
- # grayscale_cam = grayscale_cam[0, :]
216
- # cur_face_hm = cv2.resize(cur_face,(224,224), interpolation = cv2.INTER_AREA)
217
- # cur_face_hm = np.float32(cur_face_hm) / 255
218
- # heatmap = show_cam_on_image(cur_face_hm, grayscale_cam, use_rgb=False)
219
- # last_heatmap = heatmap
220
-
221
- if len(lstm_features) == 0:
222
- lstm_features = [features]*10
223
- else:
224
- lstm_features = lstm_features[1:] + [features]
225
-
226
- lstm_f = torch.from_numpy(np.vstack(lstm_features))
227
- lstm_f = torch.unsqueeze(lstm_f, 0)
228
- with torch.no_grad():
229
- output = pth_model_dynamic(lstm_f).detach().numpy()
230
- last_output = output
231
-
232
- if count_face == 0:
233
- count_face += 1
234
-
235
- else:
236
- if last_output is not None:
237
- output = last_output
238
- # heatmap = last_heatmap
239
-
240
- elif last_output is None:
241
- output = np.empty((1, 7))
242
- output[:] = np.nan
243
-
244
- probs.append(output[0])
245
- frames.append(count_frame)
246
  else:
247
- if last_output is not None:
248
- lstm_features = []
249
- empty = np.empty((7))
250
- empty[:] = np.nan
251
- probs.append(empty)
252
- frames.append(count_frame)
253
-
254
- if cur_face is not None:
255
- # heatmap_f = display_info(heatmap, 'Frame: {}'.format(count_frame), box_scale=.3)
256
-
257
- cur_face = cv2.cvtColor(cur_face, cv2.COLOR_RGB2BGR)
258
- cur_face = cv2.resize(cur_face, (224,224), interpolation = cv2.INTER_AREA)
259
- cur_face = display_info(cur_face, 'Frame: {}'.format(count_frame), box_scale=.3)
260
- vid_writer_face.write(cur_face)
261
- # vid_writer_hm.write(heatmap_f)
262
-
263
- count_frame += 1
264
- if count_face != 0:
265
- count_face += 1
266
-
267
- vid_writer_face.release()
268
- # vid_writer_hm.release()
269
-
270
- stat = statistics_plot(frames, probs)
271
-
272
- if not stat:
273
- return None, None
274
-
275
- #for debug
276
- print("from func:video_score————")
277
- print(type(frames))
278
- print(frames)
279
- print(type(probs))
280
- print(probs)
281
- # to calculate scores
282
- nan=float('nan')
283
- s1 = 0
284
- s2 = 0
285
- s3 = 0
286
- # s4 = 0
287
- # s5 = 0
288
- # s6 = 0
289
- # s7 = 0
290
- frames_len=len(frames)
291
- for i in range(frames_len):
292
- if np.isnan(probs[i][0]):
293
- frames_len=frames_len-1
294
- else:
295
- s1=s1+probs[i][0]
296
- s2=s2+probs[i][1]
297
- s3=s3+probs[i][2]
298
- # s4=s4+probs[i][3]
299
- # s5=s5+probs[i][4]
300
- # s6=s6+probs[i][5]
301
- # s7=s7+probs[i][6]
302
- s1=s1/frames_len
303
- s2=s2/frames_len
304
- s3=s3/frames_len
305
- # s4=s4/frames_len
306
- # s5=s5/frames_len
307
- # s6=s6/frames_len
308
- # s7=s7/frames_len
309
- # scores=[s1,s2,s3,s4,s5,s6,s7]
310
- # scores_str=str(scores)
311
- # score1=0*scores[0]-8*scores[1]+4*scores[2]+0*scores[3]+2*scores[4]+2*scores[5]+4*scores[6]
312
- #print("from func:video_score————score1=",score1)
313
- #print("from func:video_score————logs:")
314
- # with open("local_data/data.txt",'a', encoding="utf8") as f:
315
- # f.write(scores_str+'\n')
316
-
317
- # with open("local_data/data.txt",'r', encoding="utf8") as f:
318
- # for i in f:
319
- # print(i)
320
 
321
 
322
- print(str([s1,s2,s3]))
323
- if s1>=0.4:
324
- score1=0
325
  else:
326
- if s2>=s3:
327
- score1=-1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  else:
329
- score1=+1
330
- #trans the audio file
331
- my_audio_clip = AudioFileClip(video)
332
- my_audio_clip.write_audiofile("data/audio.wav",ffmpeg_params=["-ac","1"])
333
-
334
- audio = wav.read('data/audio.wav')
335
-
336
- text,score2=speech_score(audio)
337
-
338
- #print("from func:video_score————text:",text)
339
-
340
- score=(score1+6*score2)/7
341
- #print("from func:video_score————score:",score)
342
- return text,score
 
 
 
343
  #######################################################################
344
 
 
97
  #######################################################################
98
  #规范函数,只管值输入输出:
99
  def text_score(text):
100
+ if text==None:
101
+ gr.Warning("提交内容为空!")
 
 
 
 
 
 
 
102
  else:
103
+ string=text_api(text)
104
+ part1 = str.partition(string, r"text")
105
+ want1 = part1[2]
106
+ label = want1[4:6]
107
+ part2 = str.partition(string, r"probability")
108
+ want2 = part2[2]
109
+ prob = float(want2[3:-4])
110
+ if label=="正向":
111
+ score=-np.log10(prob*10)
112
+ else:
113
+ score=np.log10(prob*10)
114
+ # print("from func:text_score————,text:",text,",score:",score)
115
+ return text,score
116
 
117
  def speech_score(audio):
118
+ if audio==None:
119
+ gr.Warning("提交内容为空!请等待音频加载完毕后再尝试提交!")
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  else:
121
+ print(type(audio))
122
+ print(audio)
123
+ sample_rate, signal = audio # 这是语音的输入
124
+ signal = signal.astype(np.float32)
125
+ signal /= np.max(np.abs(signal))
126
+ sf.write("data/a.wav", signal, sample_rate)
127
+ signal, sample_rate = torchaudio.load("data/a.wav")
128
+ signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
129
+ signal
130
+ )
131
+ torchaudio.save("data/out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)
132
+ Audio = "data/out.wav"
133
+ speech, sample_rate = AudioReader.read_wav_file(Audio)
134
+ if signal == "none":
135
+ return "none", "none", "haha"
 
 
 
 
 
 
136
  else:
137
+ segments = vad.segments_offline(speech)
138
+ text_results = ""
139
+ for part in segments:
140
+ _result = ASR_model.infer_offline(
141
+ speech[part[0] * 16 : part[1] * 16], hot_words="任意热词 空格分开"
142
+ )
143
+ text_results += punc.punctuate(_result)[0]
 
 
 
 
 
 
 
144
 
145
+ out_prob, score, index, text_lab = classifier.classify_batch(signal1)
146
+ print("from func:speech_score————type and value of prob:")
147
+ print(type(out_prob.squeeze(0).numpy()))
148
+ print(out_prob.squeeze(0).numpy())
149
+ print("from func:speech_score————type and value of resul_label:")
150
+ print(type(text_lab[-1]))
151
+ print(text_lab[-1])
152
+ #return text_results, out_prob.squeeze(0).numpy(), text_lab[-1], Audio
153
+ prob=out_prob.squeeze(0).numpy()
154
+ #print(prob)
155
+ score2=10*prob[0]-10*prob[1]
156
+ if score2>=0:
157
+ score2=np.log10(score2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  else:
159
+ score2=-np.log10(-score2)
160
+ # print("from func:speech_score————score2:",score2)
161
+ # print("from func:speech_score————",text_lab[-1])
162
+
163
+ text,score1=text_score(text_results)
164
+ # # text_emo=str(get_text_score(text_results))
165
+ # print("from func:speech_score————text:",text,",score1:",score1)
166
+ score=(2/3)*score1+(1/3)*score2
167
+
168
+
169
+ return text,score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
 
172
def video_score(video):
    """Score a video from the facial expressions and the speech it contains.

    Every frame is run through MediaPipe FaceMesh. The detected face is
    cropped, and on every ``config_data.FRAME_DOWNSAMPLING``-th face frame it
    is passed through ``pth_model_static`` and ``pth_model_dynamic`` to get
    per-frame class probabilities; other face frames reuse the latest output.
    The face crops are also written to ``result_face.mp4``. The audio track is
    then extracted to ``data/audio.wav`` (mono) and scored by ``speech_score``.

    Args:
        video: Path of the video file, or ``None`` when nothing was submitted.

    Returns:
        A ``(text, score)`` tuple where ``text`` is the speech transcript and
        ``score`` is ``(facial + 6 * speech) / 7``. ``(None, None)`` is
        returned when ``video`` is ``None`` (after a Gradio warning), when
        ``statistics_plot`` yields nothing, or when no frame produced usable
        probabilities.
    """
    # Guard clause: always return a 2-tuple so unpacking callers keep working.
    if video is None:
        gr.Warning("提交内容为空!请等待视频加载完毕后再尝试提交!")
        return None, None

    cap = cv2.VideoCapture(video)
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = np.round(cap.get(cv2.CAP_PROP_FPS))

    path_save_video_face = 'result_face.mp4'
    vid_writer_face = cv2.VideoWriter(path_save_video_face, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224))

    lstm_features = []
    count_frame = 1
    count_face = 0
    probs = []
    frames = []
    last_output = None
    cur_face = None

    # try/finally so the capture and the writer are released even when a
    # frame fails to process.
    try:
        with mp_face_mesh.FaceMesh(
                max_num_faces=1,
                refine_landmarks=False,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5) as face_mesh:

            while cap.isOpened():
                _, frame = cap.read()
                if frame is None:
                    break

                frame_copy = frame.copy()
                frame_copy.flags.writeable = False
                frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB)
                results = face_mesh.process(frame_copy)
                frame_copy.flags.writeable = True

                if results.multi_face_landmarks:
                    for fl in results.multi_face_landmarks:
                        startX, startY, endX, endY = get_box(fl, w, h)
                        cur_face = frame_copy[startY:endY, startX: endX]

                        if count_face % config_data.FRAME_DOWNSAMPLING == 0:
                            cur_face_copy = pth_processing(Image.fromarray(cur_face))
                            with torch.no_grad():
                                features = torch.nn.functional.relu(
                                    pth_model_static.extract_features(cur_face_copy)
                                ).detach().numpy()

                            # Sliding window of the 10 latest feature vectors;
                            # the first vector is repeated to fill the window.
                            if len(lstm_features) == 0:
                                lstm_features = [features] * 10
                            else:
                                lstm_features = lstm_features[1:] + [features]

                            lstm_f = torch.from_numpy(np.vstack(lstm_features))
                            lstm_f = torch.unsqueeze(lstm_f, 0)
                            with torch.no_grad():
                                output = pth_model_dynamic(lstm_f).detach().numpy()
                            last_output = output

                            # count_face only starts advancing once the
                            # first face has been processed.
                            if count_face == 0:
                                count_face += 1

                        elif last_output is not None:
                            # Skipped (down-sampled) frame: reuse latest output.
                            output = last_output
                        else:
                            output = np.empty((1, 7))
                            output[:] = np.nan

                        probs.append(output[0])
                        frames.append(count_frame)

                elif last_output is not None:
                    # Face lost after having been seen: reset the feature
                    # window and record a NaN row for this frame.
                    lstm_features = []
                    empty = np.empty((7))
                    empty[:] = np.nan
                    probs.append(empty)
                    frames.append(count_frame)

                if cur_face is not None:
                    # Render into a separate variable: cur_face must stay the
                    # raw RGB crop, because it is reused on later frames in
                    # which no face is found. Overwriting it would convert and
                    # label the already rendered image a second time.
                    face_out = cv2.cvtColor(cur_face, cv2.COLOR_RGB2BGR)
                    face_out = cv2.resize(face_out, (224, 224), interpolation=cv2.INTER_AREA)
                    face_out = display_info(face_out, 'Frame: {}'.format(count_frame), box_scale=.3)
                    vid_writer_face.write(face_out)

                count_frame += 1
                if count_face != 0:
                    count_face += 1
    finally:
        cap.release()
        vid_writer_face.release()

    stat = statistics_plot(frames, probs)
    if not stat:
        return None, None

    # for debug
    print("from func:video_score————")
    print(type(frames))
    print(frames)
    print(type(probs))
    print(probs)

    score1 = _facial_polarity(probs)
    if score1 is None:
        # No frame carried usable probabilities; nothing to average.
        return None, None

    # Extract the audio track as a mono WAV file and score the speech.
    my_audio_clip = AudioFileClip(video)
    try:
        my_audio_clip.write_audiofile("data/audio.wav", ffmpeg_params=["-ac", "1"])
    finally:
        my_audio_clip.close()

    audio = wav.read('data/audio.wav')
    text, score2 = speech_score(audio)

    score = (score1 + 6 * score2) / 7
    return text, score


def _facial_polarity(probs):
    """Reduce per-frame class probabilities to a polarity of -1, 0 or +1.

    Rows whose first entry is NaN are ignored. The first three columns are
    averaged over the remaining rows: a mean of column 0 of at least 0.4
    gives 0; otherwise -1 when the mean of column 1 is at least the mean of
    column 2, else +1. Returns ``None`` when no row is usable.

    NOTE(review): columns 0/1/2 presumably are the neutral / positive /
    negative expression classes -- confirm against the model's label order.
    """
    valid = [row for row in probs if not np.isnan(row[0])]
    if not valid:
        return None

    count = len(valid)
    s1 = sum(row[0] for row in valid) / count
    s2 = sum(row[1] for row in valid) / count
    s3 = sum(row[2] for row in valid) / count
    print(str([s1, s2, s3]))

    if s1 >= 0.4:
        return 0
    return -1 if s2 >= s3 else +1
351
  #######################################################################
352