Liusuthu committed
Commit 0073adb · verified · 1 Parent(s): 84638ab

Update app_utils.py

Files changed (1):
  1. app_utils.py +226 -160
app_utils.py CHANGED
@@ -1,10 +1,3 @@
- """
- File: app_utils.py
- Author: Elena Ryumina and Dmitry Ryumin
- Description: This module contains utility functions for facial expression recognition application.
- License: MIT License
- """
-
  import torch
  import numpy as np
  import mediapipe as mp
@@ -19,164 +12,74 @@ from config import DICT_EMO, config_data
  from plot import statistics_plot
  from moviepy.editor import AudioFileClip

+ import soundfile as sf
+ import torchaudio
+ from speechbrain.pretrained.interfaces import foreign_class
+ from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline
+ from gradio_client import Client
+
+ ##############################################################################################
+ client = Client("Liusuthu/TextDepression")
+
  mp_face_mesh = mp.solutions.face_mesh


+ classifier = foreign_class(
+     source="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP", # ".\\emotion-recognition-wav2vec2-IEMOCAP"
+     pymodule_file="custom_interface.py",
+     classname="CustomEncoderWav2vec2Classifier",
+     savedir="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
+ )
+ ASR_model = ParaformerOffline()
+ vad = FSMNVad()
+ punc = CttPunctuator()
+
+
+ #########################################################################################
+ def text_api(text:str):
+     result = client.predict(
+         text, # str in '输入文字' Textbox component
+         api_name="/predict",
+     )
+     return result
+
+
+ def classify_continuous(audio):
+     print(type(audio))
+     print(audio)
+     sample_rate, signal = audio # the speech input: (sample rate, waveform)
+     signal = signal.astype(np.float32)
+     signal /= np.max(np.abs(signal))
+     sf.write("data/a.wav", signal, sample_rate)
+     signal, sample_rate = torchaudio.load("data/a.wav")
+     signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
+         signal
+     )
+     torchaudio.save("data/out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)
+     Audio = "data/out.wav"
+     speech, sample_rate = AudioReader.read_wav_file(Audio)
+     if signal == "none":
+         return "none", "none", "haha"
+     else:
+         segments = vad.segments_offline(speech)
+         text_results = ""
+         for part in segments:
+             _result = ASR_model.infer_offline(
+                 speech[part[0] * 16 : part[1] * 16], hot_words="任意热词 空格分开"
+             )
+             text_results += punc.punctuate(_result)[0]
+
+         out_prob, score, index, text_lab = classifier.classify_batch(signal1)
+         print(type(out_prob.squeeze(0).numpy()))
+         print(type(text_lab[-1]))
+         return text_results, out_prob.squeeze(0).numpy(), text_lab[-1], Audio
+
+
+
  def preprocess_image_and_predict(inp):
      return None, None, None
- # inp = np.array(inp)
-
- # if inp is None:
- # return None, None
-
- # try:
- # h, w = inp.shape[:2]
- # except Exception:
- # return None, None
-
- # with mp_face_mesh.FaceMesh(
- # max_num_faces=1,
- # refine_landmarks=False,
- # min_detection_confidence=0.5,
- # min_tracking_confidence=0.5,
- # ) as face_mesh:
- # results = face_mesh.process(inp)
- # if results.multi_face_landmarks:
- # for fl in results.multi_face_landmarks:
- # startX, startY, endX, endY = get_box(fl, w, h)
- # cur_face = inp[startY:endY, startX:endX]
- # cur_face_n = pth_processing(Image.fromarray(cur_face))
- # with torch.no_grad():
- # prediction = (
- # torch.nn.functional.softmax(pth_model_static(cur_face_n), dim=1)
- # .detach()
- # .numpy()[0]
- # )
- # confidences = {DICT_EMO[i]: float(prediction[i]) for i in range(7)}
- # grayscale_cam = cam(input_tensor=cur_face_n)
- # grayscale_cam = grayscale_cam[0, :]
- # cur_face_hm = cv2.resize(cur_face,(224,224))
- # cur_face_hm = np.float32(cur_face_hm) / 255
- # heatmap = show_cam_on_image(cur_face_hm, grayscale_cam, use_rgb=True)
-
- # return cur_face, heatmap, confidences
-
-
- def preprocess_video_and_predict(video):
-
- # cap = cv2.VideoCapture(video)
- # w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
- # h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
- # fps = np.round(cap.get(cv2.CAP_PROP_FPS))
-
- # path_save_video_face = 'result_face.mp4'
- # vid_writer_face = cv2.VideoWriter(path_save_video_face, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224))

- # path_save_video_hm = 'result_hm.mp4'
- # vid_writer_hm = cv2.VideoWriter(path_save_video_hm, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224))
-
- # lstm_features = []
- # count_frame = 1
- # count_face = 0
- # probs = []
- # frames = []
- # last_output = None
- # last_heatmap = None
- # cur_face = None
-
- # with mp_face_mesh.FaceMesh(
- # max_num_faces=1,
- # refine_landmarks=False,
- # min_detection_confidence=0.5,
- # min_tracking_confidence=0.5) as face_mesh:
-
- # while cap.isOpened():
- # _, frame = cap.read()
- # if frame is None: break
-
- # frame_copy = frame.copy()
- # frame_copy.flags.writeable = False
- # frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB)
- # results = face_mesh.process(frame_copy)
- # frame_copy.flags.writeable = True
-
- # if results.multi_face_landmarks:
- # for fl in results.multi_face_landmarks:
- # startX, startY, endX, endY = get_box(fl, w, h)
- # cur_face = frame_copy[startY:endY, startX: endX]
-
- # if count_face%config_data.FRAME_DOWNSAMPLING == 0:
- # cur_face_copy = pth_processing(Image.fromarray(cur_face))
- # with torch.no_grad():
- # features = torch.nn.functional.relu(pth_model_static.extract_features(cur_face_copy)).detach().numpy()
-
- # grayscale_cam = cam(input_tensor=cur_face_copy)
- # grayscale_cam = grayscale_cam[0, :]
- # cur_face_hm = cv2.resize(cur_face,(224,224), interpolation = cv2.INTER_AREA)
- # cur_face_hm = np.float32(cur_face_hm) / 255
- # heatmap = show_cam_on_image(cur_face_hm, grayscale_cam, use_rgb=False)
- # last_heatmap = heatmap
-
- # if len(lstm_features) == 0:
- # lstm_features = [features]*10
- # else:
- # lstm_features = lstm_features[1:] + [features]
-
- # lstm_f = torch.from_numpy(np.vstack(lstm_features))
- # lstm_f = torch.unsqueeze(lstm_f, 0)
- # with torch.no_grad():
- # output = pth_model_dynamic(lstm_f).detach().numpy()
- # last_output = output
-
- # if count_face == 0:
- # count_face += 1
-
- # else:
- # if last_output is not None:
- # output = last_output
- # heatmap = last_heatmap
-
- # elif last_output is None:
- # output = np.empty((1, 7))
- # output[:] = np.nan
-
- # probs.append(output[0])
- # frames.append(count_frame)
- # else:
- # if last_output is not None:
- # lstm_features = []
- # empty = np.empty((7))
- # empty[:] = np.nan
- # probs.append(empty)
- # frames.append(count_frame)
-
- # if cur_face is not None:
- # heatmap_f = display_info(heatmap, 'Frame: {}'.format(count_frame), box_scale=.3)
-
- # cur_face = cv2.cvtColor(cur_face, cv2.COLOR_RGB2BGR)
- # cur_face = cv2.resize(cur_face, (224,224), interpolation = cv2.INTER_AREA)
- # cur_face = display_info(cur_face, 'Frame: {}'.format(count_frame), box_scale=.3)
- # vid_writer_face.write(cur_face)
- # vid_writer_hm.write(heatmap_f)
-
- # count_frame += 1
- # if count_face != 0:
- # count_face += 1
-
- # vid_writer_face.release()
- # vid_writer_hm.release()
-
- # stat = statistics_plot(frames, probs)
-
- # if not stat:
- # return None, None, None, None
-
- # # print(type(frames))
- # # print(frames)
- # # print(type(probs))
- # # print(probs)
-
- # return video, path_save_video_face, path_save_video_hm, stat
+ def preprocess_video_and_predict(video):
      return None, None, None, None


@@ -338,4 +241,167 @@ def preprocess_video_and_rank(video):
      my_audio_clip = AudioFileClip(video)
      my_audio_clip.write_audiofile("data/audio.wav",ffmpeg_params=["-ac","1"])

-     return stat,scores_str,"data/audio.wav"
+     return stat,scores_str,"data/audio.wav"
+
+ ###########################################################################################################################
+ def video_score(video):
+     cap = cv2.VideoCapture(video)
+     w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     fps = np.round(cap.get(cv2.CAP_PROP_FPS))
+
+     path_save_video_face = 'result_face.mp4'
+     vid_writer_face = cv2.VideoWriter(path_save_video_face, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224))
+
+     # path_save_video_hm = 'result_hm.mp4'
+     # vid_writer_hm = cv2.VideoWriter(path_save_video_hm, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224))
+
+     lstm_features = []
+     count_frame = 1
+     count_face = 0
+     probs = []
+     frames = []
+     last_output = None
+     last_heatmap = None
+     cur_face = None
+
+     with mp_face_mesh.FaceMesh(
+         max_num_faces=1,
+         refine_landmarks=False,
+         min_detection_confidence=0.5,
+         min_tracking_confidence=0.5) as face_mesh:
+
+         while cap.isOpened():
+             _, frame = cap.read()
+             if frame is None: break
+
+             frame_copy = frame.copy()
+             frame_copy.flags.writeable = False
+             frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB)
+             results = face_mesh.process(frame_copy)
+             frame_copy.flags.writeable = True
+
+             if results.multi_face_landmarks:
+                 for fl in results.multi_face_landmarks:
+                     startX, startY, endX, endY = get_box(fl, w, h)
+                     cur_face = frame_copy[startY:endY, startX: endX]
+
+                     if count_face%config_data.FRAME_DOWNSAMPLING == 0:
+                         cur_face_copy = pth_processing(Image.fromarray(cur_face))
+                         with torch.no_grad():
+                             features = torch.nn.functional.relu(pth_model_static.extract_features(cur_face_copy)).detach().numpy()
+
+                         # grayscale_cam = cam(input_tensor=cur_face_copy)
+                         # grayscale_cam = grayscale_cam[0, :]
+                         # cur_face_hm = cv2.resize(cur_face,(224,224), interpolation = cv2.INTER_AREA)
+                         # cur_face_hm = np.float32(cur_face_hm) / 255
+                         # heatmap = show_cam_on_image(cur_face_hm, grayscale_cam, use_rgb=False)
+                         # last_heatmap = heatmap
+
+                         if len(lstm_features) == 0:
+                             lstm_features = [features]*10
+                         else:
+                             lstm_features = lstm_features[1:] + [features]
+
+                         lstm_f = torch.from_numpy(np.vstack(lstm_features))
+                         lstm_f = torch.unsqueeze(lstm_f, 0)
+                         with torch.no_grad():
+                             output = pth_model_dynamic(lstm_f).detach().numpy()
+                         last_output = output
+
+                         if count_face == 0:
+                             count_face += 1
+
+                     else:
+                         if last_output is not None:
+                             output = last_output
+                             # heatmap = last_heatmap
+
+                         elif last_output is None:
+                             output = np.empty((1, 7))
+                             output[:] = np.nan
+
+                     probs.append(output[0])
+                     frames.append(count_frame)
+             else:
+                 if last_output is not None:
+                     lstm_features = []
+                 empty = np.empty((7))
+                 empty[:] = np.nan
+                 probs.append(empty)
+                 frames.append(count_frame)
+
+             if cur_face is not None:
+                 # heatmap_f = display_info(heatmap, 'Frame: {}'.format(count_frame), box_scale=.3)
+
+                 cur_face = cv2.cvtColor(cur_face, cv2.COLOR_RGB2BGR)
+                 cur_face = cv2.resize(cur_face, (224,224), interpolation = cv2.INTER_AREA)
+                 cur_face = display_info(cur_face, 'Frame: {}'.format(count_frame), box_scale=.3)
+                 vid_writer_face.write(cur_face)
+                 # vid_writer_hm.write(heatmap_f)
+
+             count_frame += 1
+             if count_face != 0:
+                 count_face += 1
+
+     vid_writer_face.release()
+     # vid_writer_hm.release()
+
+     stat = statistics_plot(frames, probs)
+
+     if not stat:
+         return None, None
+
+     # for debug
+     print(type(frames))
+     print(frames)
+     print(type(probs))
+     print(probs)
+     # to calculate scores
+     nan=float('nan')
+     s1 = 0
+     s2 = 0
+     s3 = 0
+     s4 = 0
+     s5 = 0
+     s6 = 0
+     s7 = 0
+     frames_len=len(frames)
+     for i in range(frames_len):
+         if np.isnan(probs[i][0]):
+             frames_len=frames_len-1
+         else:
+             s1=s1+probs[i][0]
+             s2=s2+probs[i][1]
+             s3=s3+probs[i][2]
+             s4=s4+probs[i][3]
+             s5=s5+probs[i][4]
+             s6=s6+probs[i][5]
+             s7=s7+probs[i][6]
+     s1=s1/frames_len
+     s2=s2/frames_len
+     s3=s3/frames_len
+     s4=s4/frames_len
+     s5=s5/frames_len
+     s6=s6/frames_len
+     s7=s7/frames_len
+     prob=[s1,s2,s3,s4,s5,s6,s7]
+     prob_str=str(prob)
+     with open("local_data/data.txt",'a', encoding="utf8") as f:
+         f.write(prob_str+'\n')
+
+     with open("local_data/data.txt",'r', encoding="utf8") as f:
+         for i in f:
+             print(i)
+     # the balance point is zero: the more positive the score, the more negative the emotion
+     score1=0*prob[0]-8*prob[1]+4*prob[2]+0*prob[3]+2*prob[4]+2*prob[5]+4*prob[6]
+     print("score1=",score1)
+
+     # extract the audio track from the video
+     my_audio_clip = AudioFileClip(video)
+     my_audio_clip.write_audiofile("data/audio.wav",ffmpeg_params=["-ac","1"])
+
+     return score1
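
For reference, a minimal smoke-test sketch of how the helpers introduced by this commit might be called. The sample.wav / sample.mp4 paths and the use of soundfile to build the (sample_rate, waveform) tuple are illustrative assumptions, not part of the commit, and the data/ and local_data/ folders are expected to exist alongside the models.

# Hypothetical usage sketch (not part of this commit): exercises the new helpers.
import soundfile as sf
from app_utils import classify_continuous, text_api, video_score

# classify_continuous expects what a Gradio Audio component with type="numpy"
# delivers: a (sample_rate, waveform ndarray) tuple.
waveform, rate = sf.read("sample.wav", dtype="int16")  # placeholder input file
transcript, emo_probs, emo_label, wav16k = classify_continuous((rate, waveform))
print(transcript, emo_label, wav16k)

# text_api forwards the transcript to the Liusuthu/TextDepression Space.
print(text_api(transcript))

# video_score takes a video file path and returns the weighted emotion score.
print(video_score("sample.mp4"))  # placeholder input file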