import base64
import tempfile

import gradio as gr
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the BLIP captioning model once at startup and keep it on CPU.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to("cpu")


def describir(image: Image.Image) -> str:
    """Generate a caption for the uploaded image with BLIP."""
    inputs = processor(image, return_tensors="pt").to("cpu")
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)


def leer(texto: str) -> str:
    """Send the caption to a text-to-speech Space and return a playable audio file."""
    response = requests.post(
        "https://charly-text-to-speech.hf.space/run/predict",
        json={"data": [texto]},
    ).json()
    data = response["data"][0]
    # Extract the base64 payload (drop the 'data:audio/flac;base64,' prefix).
    audio_base64 = data.split(",")[1]
    # Decode the base64 string.
    audio_data = base64.b64decode(audio_base64)
    # Write it to a temporary file that gr.Audio can play.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".flac") as temp_audio_file:
        temp_audio_file.write(audio_data)
        temp_audio_path = temp_audio_file.name
    return temp_audio_path


with gr.Blocks(
    theme=gr.themes.Ocean(
        primary_hue="pink",
        neutral_hue="indigo",
        font=[
            gr.themes.GoogleFont("Montserrat"),
            "Playwrite England SemiJoined",
            "Quicksand",
        ],
    )
) as demo:
    # type="pil" hands the callback a PIL image, which the BLIP processor accepts.
    image = gr.Image(label="Imagen", type="pil", sources=["upload", "clipboard"])
    with gr.Row():
        button = gr.Button("Describir", variant="primary")
        clear = gr.Button("Borrar")
    output = gr.Textbox(label="Resumen")
    with gr.Row():
        button2 = gr.Button("Leer", variant="primary")
        clear2 = gr.Button("Borrar")
    output2 = gr.Audio(label="Audio")

    button.click(describir, [image], output)
    button2.click(leer, [output], output2)
    # Wire up the clear buttons, which previously shadowed each other and did nothing.
    clear.click(lambda: (None, ""), None, [image, output])
    clear2.click(lambda: None, None, output2)

demo.launch(debug=True)
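
# --- Alternative TTS call (sketch, not wired into the UI) ------------------
# The raw POST in `leer` targets the legacy `/run/predict` Gradio REST API,
# which Spaces running Gradio 4+ no longer expose. Below is a minimal sketch
# of the same call through the official `gradio_client` package; the Space ID
# "Charly/text-to-speech" and the "/predict" endpoint name are assumptions
# inferred from the URL above, not verified against the live Space.
def leer_via_client(texto: str) -> str:
    from gradio_client import Client  # local import so the app runs without it

    client = Client("Charly/text-to-speech")  # hypothetical Space ID
    # predict() downloads the result to a local temp file and returns its
    # path, which gr.Audio can play directly, so no base64 decoding is needed.
    return client.predict(texto, api_name="/predict")  # assumed endpoint name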