# Aisatsu-robot / app.py
import time
from base64 import b64encode
from io import BytesIO

import gradio as gr
import numpy as np
from gtts import gTTS
from PIL import Image
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import (
    read_image_as_pil,
    visualize_object_predictions,
)
from ultralyticsplus import YOLO

model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names
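# CLASS maps class ids to names; for the COCO-trained YOLOv8 weights used here,
# CLASS[0] is "person", the only class this app reacts to.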


def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplaying HTML audio element.

    Args:
        text (str): text for the bot to speak
        language (str): gTTS language code (defaults to Japanese)

    Returns:
        str: HTML string embedding the speech as base64-encoded autoplay audio
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    b64 = b64encode(bytes_object.getvalue()).decode()
    # gTTS produces MP3, so the data URI must declare an MPEG MIME type, not WAV
    html = f"""
    <audio controls autoplay>
        <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return html
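# A minimal sketch of how tts() plugs into Gradio (hypothetical, not run here):
#   greeting_html = tts("こんにちは", language="ja")
#   gr.HTML(value=greeting_html)  # the browser autoplays the greeting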


def yolov8_inference(
    image,
    area_thres: float = 0.2,
    default_bot_voice: str = "おはようございます",  # "good morning"
):
    """Detect people with YOLOv8 and greet anyone close to the camera.

    Args:
        image: input image (webcam frame) to run detection on
        area_thres (float): minimum ratio of bounding-box area to frame area
            for a person to count as "close enough" to greet
        default_bot_voice (str): Japanese greeting spoken on detection

    Returns:
        tuple: (rendered PIL image with detections, autoplay audio HTML)
    """
    # brief pause so the live webcam stream is not re-processed on every frame
    time.sleep(2)
    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height
    object_predictions = []
    html_bot_voice = ""
    if boxes is not None:
        for xyxy, cls in zip(boxes.xyxy, boxes.cls):
            # keep only the "person" class (COCO id 0)
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # fraction of the frame covered by this person's bounding box
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= area_thres:
                object_predictions.append(
                    ObjectPrediction(
                        bbox=box,
                        category_name=CLASS[int(cls)],
                        category_id=int(cls),
                        score=area_rate,  # show the area ratio as the score
                    )
                )
    # greet once if at least one person is close enough
    if object_predictions:
        html_bot_voice = tts(default_bot_voice, language="ja")
    result = visualize_object_predictions(
        image=np_image,
        object_prediction_list=object_predictions,
        rect_th=2,
        text_th=2,
    )
    return Image.fromarray(result["image"]), html_bot_voice
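# A hypothetical offline check (this script only runs the Gradio app below;
# "sample.jpg" is a placeholder path):
#   img, voice_html = yolov8_inference("sample.jpg", area_thres=0.1)
#   img.save("annotated.jpg")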


outputs = [
    gr.Image(type="pil", label="Output Image"),  # the function returns a PIL image
    gr.HTML(),
]
title = "State-of-the-Art YOLO Models for Object Detection"
demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)
demo_app.launch(debug=True)