import time
from base64 import b64encode
from io import BytesIO

import gradio as gr
import numpy as np
from gtts import gTTS
from PIL import Image
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions
from ultralyticsplus import YOLO

model = YOLO("ultralyticsplus/yolov8s")
CLASS = model.model.names


def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplaying HTML audio snippet.

    Args:
        text (str): text for the bot to speak
        language (str): gTTS language code (default: Japanese)

    Returns:
        str: HTML string containing an autoplaying <audio> element
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    # Embed the MP3 bytes as a base64 data URI so the browser can play the
    # greeting without a separate audio file.
    b64 = b64encode(bytes_object.getvalue()).decode()
    html = f"""
    <audio autoplay="autoplay">
        <source src="data:audio/mp3;base64,{b64}" type="audio/mp3">
    </audio>
    """
    return html


def yolov8_inference(
    image,
    area_thres=0.2,
    default_bot_voice="おはようございます",  # "Good morning" in Japanese
):
    """YOLOv8 inference function.

    Args:
        image: input image (filepath, PIL image, or numpy array)
        area_thres: minimum box-to-frame area ratio that triggers the greeting
        default_bot_voice: text spoken when a close-enough person is detected

    Returns:
        Rendered image and autoplay audio HTML (empty if nobody is close enough)
    """
    time.sleep(2)  # throttle live webcam frames so the greeting is not spammed
    # Set model parameters.
    model.overrides["conf"] = 0.25  # NMS confidence threshold
    model.overrides["iou"] = 0.45  # NMS IoU threshold
    model.overrides["agnostic_nms"] = False  # class-agnostic NMS
    model.overrides["max_det"] = 1000  # maximum number of detections per image

    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height

    object_predictions = []
    html_bot_voice = ""
    if boxes is not None:
        for xyxy, _conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # keep only class 0 ("person" in COCO)
                continue
            box = xyxy.tolist()
            # Fraction of the frame covered by this person's bounding box.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= area_thres:
                object_predictions.append(
                    ObjectPrediction(
                        bbox=box,
                        category_name=CLASS[int(cls)],
                        category_id=int(cls),
                        score=area_rate,  # area ratio is displayed as the score
                    )
                )
                html_bot_voice = tts(default_bot_voice, language="ja")

    result = visualize_object_predictions(
        image=np_image,
        object_prediction_list=object_predictions,
        rect_th=2,
        text_th=2,
    )
    return Image.fromarray(result["image"]), html_bot_voice


# The function returns a PIL image, so the output component must use type="pil".
outputs = [gr.Image(type="pil", label="Output Image"), gr.HTML()]
title = "State-of-the-Art YOLO Models for Object Detection"

demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)
demo_app.launch(debug=True)
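# Usage note (assumptions: the script is run directly, e.g. `python app.py`,
# and Gradio serves on its default local port): launch(debug=True) blocks and
# prints a local URL such as http://127.0.0.1:7860. With area_thres=0.2, a
# detected person must cover at least 20% of the frame before the greeting
# plays; on a 640x480 webcam frame that is a bounding box of at least
# 640 * 480 * 0.2 = 61,440 square pixels.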