"""Gradio demo: detect objects in an uploaded image and describe them in text."""

import functools
import io
import logging
import warnings
from collections import Counter

import gradio as gr
import inflect
import matplotlib.pyplot as plt
import numpy as np  # noqa: F401  (not referenced below; kept from the original import set)
import requests
from PIL import Image
from transformers import logging as hf_logging
from transformers import pipeline


def load_image_from_url(url, timeout=30):
    """Download *url* and return its content as a PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The image opened with ``PIL.Image.open``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an error page to PIL.
    response.raise_for_status()
    # ``content`` is transfer-decoded, unlike the raw socket stream.
    return Image.open(io.BytesIO(response.content))


def render_results_in_image(in_pil_img, in_results):
    """Draw the predicted boxes and labels over an image.

    Args:
        in_pil_img: Image to annotate.
        in_results: Iterable of predictions; each one is indexed with the
            keys ``'label'``, ``'score'`` and ``'box'`` (the latter holding
            ``'xmin'``, ``'ymin'``, ``'xmax'``, ``'ymax'``).

    Returns:
        A PIL image of the rendered figure (PNG-encoded in memory).
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    try:
        ax.imshow(in_pil_img)
        for prediction in in_results:
            box = prediction["box"]
            x, y = box["xmin"], box["ymin"]
            w = box["xmax"] - box["xmin"]
            h = box["ymax"] - box["ymin"]
            ax.add_patch(
                plt.Rectangle((x, y), w, h, fill=False, color="green", linewidth=2)
            )
            ax.text(
                x,
                y,
                f"{prediction['label']}: {round(prediction['score']*100, 1)}%",
                color="red",
            )
        ax.axis("off")

        # Render into memory rather than to a file on disk.
        img_buf = io.BytesIO()
        fig.savefig(img_buf, format="png", bbox_inches="tight", pad_inches=0)
    finally:
        # Always release the figure, even if drawing failed, so it is neither
        # displayed nor accumulated across calls.
        plt.close(fig)

    img_buf.seek(0)
    modified_image = Image.open(img_buf)
    # Decode now so the returned image does not rely on lazy reads later.
    modified_image.load()
    return modified_image


def summarize_predictions_natural_language(predictions):
    """Describe how many objects of each label were predicted.

    Args:
        predictions: Iterable of predictions, each indexed with ``'label'``.

    Returns:
        A sentence such as ``"In this image, there are two cats and one
        dog."``. Labels appear in order of first occurrence. When there are
        no predictions the sentence says that no objects were detected.
    """
    counts = Counter(prediction["label"] for prediction in predictions)
    if not counts:
        return "In this image, no objects were detected."

    p = inflect.engine()
    # ``plural(label, count)`` leaves the word singular when count is 1 and
    # handles irregular nouns that a bare "s" suffix gets wrong.
    phrases = [
        f"{p.number_to_words(count)} {p.plural(label, count)}"
        for label, count in counts.items()
    ]

    if len(phrases) > 2:
        listing = ", ".join(phrases[:-1]) + ", and " + phrases[-1]
    else:
        listing = " and ".join(phrases)

    verb = "is" if sum(counts.values()) == 1 else "are"
    return f"In this image, there {verb} {listing}."


##### To ignore warnings #####
def ignore_warnings():
    """Silence known-noisy warnings and lower library log verbosity."""
    # Ignore specific Python warnings
    warnings.filterwarnings("ignore", message="Some weights of the model checkpoint")
    warnings.filterwarnings("ignore", message="Could not find image processor class")
    warnings.filterwarnings("ignore", message="The `max_size` parameter is deprecated")

    # Adjust logging for libraries using the logging module
    logging.basicConfig(level=logging.ERROR)
    hf_logging.set_verbosity_error()


########
@functools.lru_cache(maxsize=1)
def _get_object_detector():
    """Build the object-detection pipeline once and reuse it afterwards."""
    return pipeline("object-detection", model="facebook/detr-resnet-50")


@functools.lru_cache(maxsize=1)
def _get_tts_pipeline():
    """Build the text-to-speech pipeline once and reuse it afterwards."""
    return pipeline("text-to-speech", model="kakao-enterprise/vits-vctk")


def processed_image(image):
    """Run object detection on an uploaded image.

    Args:
        image: The uploaded PIL image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(annotated_image, summary_text)`` tuple. ``annotated_image`` is
        ``None`` when no image was supplied.
    """
    if image is None:
        return None, "Please upload an image."

    pl_out = _get_object_detector()(image)
    annotated = render_results_in_image(image, pl_out)
    text = summarize_predictions_natural_language(pl_out)
    return annotated, text


def narrate_text(text):
    """Run the text-to-speech pipeline on *text* and return its raw output.

    This is not wired into the interface below; call it with the summary
    string returned by ``processed_image``.
    """
    return _get_tts_pipeline()(text)


iface = gr.Interface(
    processed_image,  # Function to process the image
    inputs=gr.Image(type="pil"),  # Image upload input
    outputs=[gr.Image(type="pil"), "text"],  # Annotated image and summary text
)

if __name__ == "__main__":
    iface.launch()