vumichien committed on
Commit
443cd8b
·
1 Parent(s): 48cacbd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from ultralyticsplus import YOLO
4
+ import numpy as np
5
+ from sahi.prediction import ObjectPrediction, PredictionScore
6
+ from sahi.utils.cv import (
7
+ get_bool_mask_from_coco_segmentation,
8
+ read_image_as_pil,
9
+ visualize_object_predictions,
10
+ )
11
+
12
+ from base64 import b64encode
13
+ from io import BytesIO
14
+ from gtts import gTTS
15
+ from mtranslate import translate
16
+ from speech_recognition import AudioFile, Recognizer
17
+ import time
18
+
19
# Load the YOLOv8-small checkpoint once at import time so every request
# handled by the demo reuses the same model instance.
model = YOLO("ultralyticsplus/yolov8s")

# Lookup of class id -> class name, taken from the loaded model.
CLASS = model.model.names
21
+
22
def tts(text: str, language="ja") -> str:
    """Convert text into an autoplaying HTML ``<audio>`` snippet.

    The text is synthesized with gTTS into an in-memory buffer and embedded
    inline as a base64 ``data:`` URI, so nothing is written to disk.

    Args:
        text (str): Text to be spoken.
        language (str): Language code forwarded to gTTS as ``lang``.

    Returns:
        str: HTML markup holding an ``<audio controls autoplay>`` element.
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    audio_buffer = BytesIO()
    tts_object.write_to_fp(audio_buffer)
    # getvalue() returns the whole buffer regardless of the stream position.
    b64 = b64encode(audio_buffer.getvalue()).decode()
    # gTTS emits MP3 data, so label it audio/mpeg rather than audio/wav.
    html = f"""
    <audio controls autoplay>
    <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return html
40
+
41
+
42
def yolov8_inference(
    image,
    area_thres=0.2,
    defaul_bot_voice="おはいようございます"
):
    """Detect large class-0 objects in a frame and greet when one is found.

    Runs the module-level YOLOv8 ``model`` on ``image``, keeps only the
    detections whose class id is 0 (presumably "person" for this
    checkpoint -- confirm against ``CLASS``) and whose bounding box covers
    at least ``area_thres`` of the frame, draws them on the frame, and
    synthesizes ``defaul_bot_voice`` when at least one detection was kept.

    Args:
        image: Input frame handed over by the Gradio image component, or
            ``None`` when no frame is available.
        area_thres: Minimum (box area / image area) ratio for a detection
            to be kept.
        defaul_bot_voice: Text passed to ``tts`` and spoken in Japanese.
            NOTE(review): the default looks like a misspelling of
            "おはようございます" -- confirm before changing it.

    Returns:
        tuple: ``(rendered image, html)`` where ``html`` is the audio
        snippet from ``tts`` or ``""`` when nothing qualified. Returns
        ``(None, "")`` when ``image`` is ``None``.
    """
    # Without a frame there is nothing to detect; bail out before the delay
    # and the model call.
    if image is None:
        return None, ""

    # Presumably throttles the live webcam stream -- TODO confirm intent.
    time.sleep(2)

    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image

    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height

    object_predictions = []
    if boxes is not None:
        for xyxy, cls in zip(boxes.xyxy, boxes.cls):
            class_id = int(cls)
            if class_id != 0:
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= area_thres:
                # The displayed score is the area ratio, not the detector
                # confidence.
                object_predictions.append(
                    ObjectPrediction(
                        bbox=box,
                        category_name=CLASS[class_id],
                        category_id=class_id,
                        score=area_rate,
                    )
                )

    # The spoken text does not depend on the detection, so synthesize it
    # once instead of once per kept box.
    html_bot_voice = ""
    if object_predictions:
        html_bot_voice = tts(defaul_bot_voice, language="ja")

    result = visualize_object_predictions(
        image=np_image,
        object_prediction_list=object_predictions,
        rect_th=2,
        text_th=2,
    )

    # read_image_as_pil converts the rendered ndarray to a PIL image; the
    # name ``Image`` is not imported in this module.
    return read_image_as_pil(result["image"]), html_bot_voice
94
+
95
+
96
# Heading displayed at the top of the demo page.
title = "State-of-the-Art YOLO Models for Object detection"

# Output widgets, in the order yolov8_inference returns its values:
# the annotated frame first, then the HTML audio snippet.
outputs = [
    gr.Image(type="filepath", label="Output Image"),
    gr.HTML(),
]

# Wire the webcam stream to the inference function; live=True re-runs the
# function whenever the input changes.
demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)

demo_app.launch(debug=True)