Update app.py
app.py
CHANGED
@@ -10,15 +10,19 @@ from openai import OpenAI
 inference_api_key = os.environ.get("HF_TOKEN")
 chat_api_key = os.environ.get("HF_TOKEN")

-# Global
+# Global variables to store the generated image (as a data URL) and the prompt used
 global_image_data_url = None
+global_image_prompt = None

 def generate_image_fn(selected_prompt):
     """
     Uses the Hugging Face Inference API to generate an image from the selected prompt.
-    Converts the image to a data URL
+    Converts the image to a data URL for later use, and stores the prompt globally.
     """
-    global global_image_data_url
+    global global_image_data_url, global_image_prompt
+
+    # Store the chosen prompt for later use in detail checking
+    global_image_prompt = selected_prompt

     # Create an inference client for text-to-image (Stable Diffusion)
     image_client = InferenceClient(
@@ -37,46 +41,36 @@ def generate_image_fn(selected_prompt):
     image.save(buffered, format="PNG")
     img_bytes = buffered.getvalue()
     img_b64 = base64.b64encode(img_bytes).decode("utf-8")
-
-    global_image_data_url = data_url
+    global_image_data_url = f"data:image/png;base64,{img_b64}"

     return image

 def chat_about_image_fn(user_input):
     """
-    Sends the user's
-
+    Sends the user's chat message along with the current image (as a data URL)
+    to a vision-chat model, and returns the model's response.
     """
     if not global_image_data_url:
         return "Please generate an image first."

-    # Create the messages payload. The payload contains the user's text
-    # along with the image in a field named "image_url" (using our data URL).
     messages = [
         {
             "role": "user",
             "content": [
-                {
-                    "type": "text",
-                    "text": user_input
-                },
+                {"type": "text", "text": user_input},
                 {
                     "type": "image_url",
-                    "image_url": {
-                        "url": global_image_data_url
-                    }
+                    "image_url": {"url": global_image_data_url}
                 }
             ]
         }
     ]

-    # Create a client for the vision-chat model
     chat_client = OpenAI(
         base_url="https://api-inference.huggingface.co/v1/",
         api_key=chat_api_key # Loaded from env secrets
     )

-    # Call the chat completions API. Here we use streaming to accumulate the full response.
     stream = chat_client.chat.completions.create(
         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
         messages=messages,
@@ -84,7 +78,52 @@ def chat_about_image_fn(user_input):
         stream=True
     )

-
+    response_text = ""
+    for chunk in stream:
+        response_text += chunk.choices[0].delta.content
+
+    return response_text
+
+def check_details_fn(user_details):
+    """
+    Compares the user's description of the generated image with the prompt used to generate it.
+    The function sends both the original prompt and the user description to the vision-chat model,
+    which responds whether the description is correct and (if not) provides a hint.
+    """
+    if not global_image_prompt:
+        return "Please generate an image first."
+
+    # Build a message to instruct the model to evaluate the user's details.
+    # The message asks the model to check whether the description covers the key elements of the prompt.
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": (
+                        f"The image was generated using the prompt: '{global_image_prompt}'.\n"
+                        f"Evaluate the following user description of the image: '{user_details}'.\n"
+                        "If the description is accurate and captures the key elements of the prompt, reply with 'Correct'. "
+                        "If it is inaccurate or missing important details, reply with 'Incorrect' and provide a hint on what is missing."
+                    )
+                }
+            ]
+        }
+    ]
+
+    chat_client = OpenAI(
+        base_url="https://api-inference.huggingface.co/v1/",
+        api_key=chat_api_key # Loaded from env secrets
+    )
+
+    stream = chat_client.chat.completions.create(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        messages=messages,
+        max_tokens=100,
+        stream=True
+    )
+
     response_text = ""
     for chunk in stream:
         response_text += chunk.choices[0].delta.content
@@ -107,8 +146,8 @@ prompt_options = [

 # Define the Gradio interface using Blocks.
 with gr.Blocks() as demo:
-    gr.Markdown("# Image Generation and
-
+    gr.Markdown("# Image Generation, Chat, and Detail Check")
+
     with gr.Row():
         with gr.Column():
             gr.Markdown("## Generate Image")
@@ -117,14 +156,26 @@ with gr.Blocks() as demo:
             img_output = gr.Image(label="Generated Image")
         with gr.Column():
             gr.Markdown("## Chat about the Image")
-            chat_input = gr.Textbox(
+            chat_input = gr.Textbox(
+                label="Enter your message about the image",
+                placeholder="Ask a question or comment about the image..."
+            )
             chat_output = gr.Textbox(label="Chat Response")
-
-
+            chat_input.submit(chat_about_image_fn, inputs=chat_input, outputs=chat_output)
+
+    # Row for checking the user's description of the generated image.
+    with gr.Row():
+        gr.Markdown("## Check Your Description of the Image")
+        details_input = gr.Textbox(
+            label="Enter details about the image",
+            placeholder="Describe the key elements of the image..."
+        )
+        check_details_btn = gr.Button("Check Details")
+        details_output = gr.Textbox(label="Result")
+
+    # Bind the button clicks to functions.
     generate_btn.click(generate_image_fn, inputs=prompt_dropdown, outputs=img_output)
-
-    # When the user submits a message in the chat textbox, call chat_about_image_fn.
-    chat_input.submit(chat_about_image_fn, inputs=chat_input, outputs=chat_output)
+    check_details_btn.click(check_details_fn, inputs=details_input, outputs=details_output)

 # Launch the app. (Hugging Face Spaces will detect and run this.)
 demo.launch()
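
For reference, here is a minimal standalone sketch of the pattern this commit relies on: encode the generated PIL image as a base64 PNG data URL, then pass it as an "image_url" content part to the OpenAI-compatible Inference API endpoint. The helper names (image_to_data_url, ask_about_image) are illustrative only and do not appear in app.py; the model name, base URL, and HF_TOKEN secret are taken from the diff above.

# Illustrative sketch only; helper names are hypothetical and not part of app.py.
import base64
import io
import os

from PIL import Image
from openai import OpenAI


def image_to_data_url(image: Image.Image) -> str:
    # Serialize the PIL image to PNG bytes and wrap them in a base64 data URL,
    # mirroring what generate_image_fn stores in global_image_data_url.
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_b64}"


def ask_about_image(data_url: str, question: str) -> str:
    # Send the text question plus the image (as an image_url part) to the
    # vision-chat model and accumulate the streamed response.
    client = OpenAI(
        base_url="https://api-inference.huggingface.co/v1/",
        api_key=os.environ.get("HF_TOKEN"),
    )
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        max_tokens=100,
        stream=True,
    )
    reply = ""
    for chunk in stream:
        # delta.content can be None on some chunks (e.g. the final one), so guard it.
        reply += chunk.choices[0].delta.content or ""
    return reply

The or "" guard matters because streamed chunks may carry a delta without content; concatenating None onto a string raises a TypeError.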