import time
from base64 import b64encode
from io import BytesIO

import gradio as gr
import numpy as np
from gtts import gTTS
from PIL import Image
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions
from ultralyticsplus import YOLO

model = YOLO("ultralyticsplus/yolov8s")
CLASS = model.model.names


def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplaying HTML audio snippet.

    Args:
        text (str): text for the bot to speak
        language (str): gTTS language code (default: Japanese)

    Returns:
        str: HTML string containing an autoplaying <audio> element
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    # Embed the MP3 bytes as a base64 data URI so the browser can play the
    # greeting without a separate audio file.
    b64 = b64encode(bytes_object.getvalue()).decode()
    html = f"""
    <audio autoplay="autoplay">
        <source src="data:audio/mp3;base64,{b64}" type="audio/mp3">
    </audio>
    """
    return html


def yolov8_inference(
    image,
    area_thres=0.2,
    default_bot_voice="おはようございます",  # "Good morning" in Japanese
):
    """YOLOv8 inference function.

    Args:
        image: input image (filepath, PIL image, or numpy array)
        area_thres: minimum box-to-frame area ratio that triggers the greeting
        default_bot_voice: text spoken when a close-enough person is detected

    Returns:
        Rendered image and autoplay audio HTML (empty if nobody is close enough)
    """
    time.sleep(2)  # throttle live webcam frames so the greeting is not spammed
    # Set model parameters.
    model.overrides["conf"] = 0.25  # NMS confidence threshold
    model.overrides["iou"] = 0.45  # NMS IoU threshold
    model.overrides["agnostic_nms"] = False  # class-agnostic NMS
    model.overrides["max_det"] = 1000  # maximum number of detections per image

    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height

    object_predictions = []
    html_bot_voice = ""
    if boxes is not None:
        for xyxy, _conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # keep only class 0 ("person" in COCO)
                continue
            box = xyxy.tolist()
            # Fraction of the frame covered by this person's bounding box.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= area_thres:
                object_predictions.append(
                    ObjectPrediction(
                        bbox=box,
                        category_name=CLASS[int(cls)],
                        category_id=int(cls),
                        score=area_rate,  # area ratio is displayed as the score
                    )
                )
                html_bot_voice = tts(default_bot_voice, language="ja")

    result = visualize_object_predictions(
        image=np_image,
        object_prediction_list=object_predictions,
        rect_th=2,
        text_th=2,
    )
    return Image.fromarray(result["image"]), html_bot_voice


# The function returns a PIL image, so the output component must use type="pil".
outputs = [gr.Image(type="pil", label="Output Image"), gr.HTML()]
title = "State-of-the-Art YOLO Models for Object Detection"

demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)
demo_app.launch(debug=True)
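# Usage note (assumptions: the script is run directly, e.g. `python app.py`,
# and Gradio serves on its default local port): launch(debug=True) blocks and
# prints a local URL such as http://127.0.0.1:7860. With area_thres=0.2, a
# detected person must cover at least 20% of the frame before the greeting
# plays; on a 640x480 webcam frame that is a bounding box of at least
# 640 * 480 * 0.2 = 61,440 square pixels.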