matjarm committed
Commit 681078d · Parent: d4c78d6
Files changed (5)
  1. app.py +207 -0
  2. index.html +0 -19
  3. main.py +119 -0
  4. requirement.txt +154 -0
  5. style.css +0 -28
app.py ADDED
@@ -0,0 +1,207 @@
+ import os
+ import random
+ import uuid
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ import torch
+ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+ from typing import Tuple
+ 
+ # CSS for Gradio Interface
+ css = '''
+ .gradio-container{max-width: 575px !important}
+ h1{text-align:center}
+ footer {
+     visibility: hidden
+ }
+ '''
+ 
+ DESCRIPTION = """
+ ## Text-to-Image Generator 🚀
+ Create stunning images from text prompts using Stable Diffusion XL. Explore high-quality styles and customizable options.
+ """
+ 
+ # Example Prompts
+ examples = [
+     "A beautiful sunset over the ocean, ultra-realistic, high resolution",
+     "A futuristic cityscape with flying cars, cyberpunk theme, vibrant colors",
+     "A cozy cabin in the woods during winter, detailed and realistic",
+     "A magical forest with glowing plants and creatures, fantasy art",
+ ]
+ 
+ # Model Configurations
+ MODEL_OPTIONS = {
+     "LIGHTNING V5.0": "SG161222/RealVisXL_V5.0_Lightning",
+     "LIGHTNING V4.0": "SG161222/RealVisXL_V4.0_Lightning",
+ }
+ 
+ # Define Styles
+ style_list = [
+     {
+         "name": "Ultra HD",
+         "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
+         "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
+     },
+     {
+         "name": "4K Realistic",
+         "prompt": "realistic 4K image of {prompt}. sharp, detailed, vibrant colors, photorealistic",
+         "negative_prompt": "cartoonish, blurry, low resolution",
+     },
+     {
+         "name": "Minimal Style",
+         "prompt": "{prompt}, clean, minimalistic",
+         "negative_prompt": "",
+     },
+ ]
+ 
+ styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
+ DEFAULT_STYLE_NAME = "Ultra HD"
+ 
+ # Define Global Variables
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ MAX_IMAGE_SIZE = 4096
+ MAX_SEED = np.iinfo(np.int32).max
+ 
+ # Load Model Function
+ def load_and_prepare_model(model_id):
+     pipe = StableDiffusionXLPipeline.from_pretrained(
+         model_id,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     ).to(device)
+     pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+     return pipe
+ 
+ # Load Models
+ models = {key: load_and_prepare_model(value) for key, value in MODEL_OPTIONS.items()}
+ 
+ # Generate Function
+ def generate_image(
+     model_choice: str,
+     prompt: str,
+     negative_prompt: str,
+     style_name: str,
+     width: int,
+     height: int,
+     guidance_scale: float,
+     num_steps: int,
+     num_images: int,
+     randomize_seed: bool,
+     seed: int,
+ ):
+     # Apply Style
+     positive_style, negative_style = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
+     styled_prompt = positive_style.replace("{prompt}", prompt)
+     # Join the style's negative prompt with the user's negative prompt, comma-separated
+     styled_negative_prompt = ", ".join(p for p in (negative_style, negative_prompt) if p)
+ 
+     # Randomize Seed if Enabled
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     generator = torch.Generator(device=device).manual_seed(seed)
+ 
+     # Generate Images
+     pipe = models[model_choice]
+     images = pipe(
+         prompt=[styled_prompt] * num_images,
+         negative_prompt=[styled_negative_prompt] * num_images,
+         width=width,
+         height=height,
+         guidance_scale=guidance_scale,
+         num_inference_steps=num_steps,
+         generator=generator,
+         output_type="pil",
+     ).images
+ 
+     # Save and Return Images
+     image_paths = []
+     for img in images:
+         unique_name = f"{uuid.uuid4()}.png"
+         img.save(unique_name)
+         image_paths.append(unique_name)
+ 
+     return image_paths, seed
+ 
+ # Gradio Interface
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+ 
+     with gr.Row():
+         model_choice = gr.Dropdown(
+             label="Select Model",
+             choices=list(MODEL_OPTIONS.keys()),
+             value="LIGHTNING V5.0",
+         )
+ 
+     prompt = gr.Textbox(
+         label="Prompt",
+         placeholder="Enter your creative prompt here...",
+     )
+ 
+     negative_prompt = gr.Textbox(
+         label="Negative Prompt",
+         placeholder="Optional: Add details you want to avoid...",
+         value="blurry, deformed, low-quality, cartoonish",
+     )
+ 
+     style_name = gr.Radio(
+         label="Style",
+         choices=list(styles.keys()),
+         value=DEFAULT_STYLE_NAME,
+     )
+ 
+     with gr.Accordion("Advanced Options", open=False):
+         width = gr.Slider(label="Width", minimum=512, maximum=2048, step=8, value=1024)
+         height = gr.Slider(label="Height", minimum=512, maximum=2048, step=8, value=1024)
+         guidance_scale = gr.Slider(
+             label="Guidance Scale",
+             minimum=1,
+             maximum=20,
+             step=0.5,
+             value=7.5,
+         )
+         num_steps = gr.Slider(
+             label="Steps",
+             minimum=1,
+             maximum=50,
+             step=1,
+             value=25,
+         )
+         num_images = gr.Slider(
+             label="Number of Images",
+             minimum=1,
+             maximum=5,
+             step=1,
+             value=1,
+         )
+         randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+         seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
+ 
+     with gr.Row():
+         run_button = gr.Button("Generate Images")
+         result_gallery = gr.Gallery(label="Generated Images", show_label=False)
+ 
+     run_button.click(
+         generate_image,
+         inputs=[
+             model_choice,
+             prompt,
+             negative_prompt,
+             style_name,
+             width,
+             height,
+             guidance_scale,
+             num_steps,
+             num_images,
+             randomize_seed,
+             seed,
+         ],
+         outputs=[result_gallery, seed],
+     )
+ 
+     gr.Examples(
+         examples=examples,
+         inputs=prompt,
+     )
+ 
+ if __name__ == "__main__":
+     demo.queue(max_size=50).launch()
index.html DELETED
@@ -1,19 +0,0 @@
- <!doctype html>
- <html>
-   <head>
-     <meta charset="utf-8" />
-     <meta name="viewport" content="width=device-width" />
-     <title>My static Space</title>
-     <link rel="stylesheet" href="style.css" />
-   </head>
-   <body>
-     <div class="card">
-       <h1>Welcome to your static Space!</h1>
-       <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-       <p>
-         Also don't forget to check the
-         <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-       </p>
-     </div>
-   </body>
- </html>
main.py ADDED
@@ -0,0 +1,119 @@
+ import requests
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ import torch
+ from PIL import Image
+ 
+ model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ 
+ device1 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model1.to(device1)
+ 
+ max_length = 16
+ num_beams = 4
+ gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+ 
+ def image_to_text_model_1(image_url):
+     raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+ 
+     pixel_values = feature_extractor1(images=[raw_image], return_tensors="pt").pixel_values
+     pixel_values = pixel_values.to(device1)
+ 
+     output_ids = model1.generate(pixel_values, **gen_kwargs)
+ 
+     preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+     return preds
+ 
+ def bytes_to_text_model_1(bts):
+     pixel_values = feature_extractor1(images=[bts], return_tensors="pt").pixel_values
+     pixel_values = pixel_values.to(device1)
+ 
+     output_ids = model1.generate(pixel_values, **gen_kwargs)
+ 
+     preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+     print(preds[0])
+ 
+ 
+ import requests
+ from PIL import Image
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ import torch
+ 
+ device2 = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
+ model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device2)
+ 
+ 
+ def image_to_text_model_2(img_url):
+     raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
+     text = "a picture of "
+     inputs = processor2(raw_image, text, return_tensors="pt").to(device2)
+ 
+     out = model2.generate(**inputs, num_beams=3)
+     print(processor2.decode(out[0], skip_special_tokens=True))
+ 
+ def bytes_to_text_model_2(byts):
+     text = "a picture of "
+     inputs = processor2(byts, text, return_tensors="pt").to(device2)
+ 
+     out = model2.generate(**inputs, num_beams=3)
+     print(processor2.decode(out[0], skip_special_tokens=True))
+ 
+ 
+ import requests
+ from PIL import Image
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ 
+ processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+ 
+ def image_to_text_model_3(img_url):
+     raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
+     # Unconditional captioning: no text prompt is passed to the processor
+     inputs = processor3(raw_image, return_tensors="pt")
+ 
+     out = model3.generate(**inputs)
+     print(processor3.decode(out[0], skip_special_tokens=True))
+ 
+ def bytes_to_text_model_3(byts):
+     # Unconditional captioning: no text prompt is passed to the processor
+     inputs = processor3(byts, return_tensors="pt")
+ 
+     out = model3.generate(**inputs)
+     print(processor3.decode(out[0], skip_special_tokens=True))
+ 
+ 
+ import cv2
+ 
+ def FrameCapture(path):
+     vidObj = cv2.VideoCapture(path)
+     count = 0
+     success = True
+ 
+     while success:
+         success, image = vidObj.read()
+         if not success:
+             break
+ 
+         # Caption every 20th frame with all three models
+         if count % 20 == 0:
+             # OpenCV yields frames in BGR order; convert to RGB before captioning
+             frame_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ 
+             print("NEW FRAME")
+             print("MODEL 1")
+             bytes_to_text_model_1(frame_rgb)
+             print("MODEL 2")
+             bytes_to_text_model_2(frame_rgb)
+             print("MODEL 3")
+             bytes_to_text_model_3(frame_rgb)
+ 
+             print("\n\n")
+ 
+         count += 1
+ 
+ 
+ FrameCapture("animation.mp4")
requirement.txt ADDED
@@ -0,0 +1,154 @@
+ aiofiles==23.2.1
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ appnope==0.1.4
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ attrs==24.2.0
+ babel==2.16.0
+ beautifulsoup4==4.12.3
+ bleach==6.2.0
+ blinker==1.9.0
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.4.0
+ click==8.1.7
+ comm==0.2.2
+ contourpy==1.3.0
+ cycler==0.12.1
+ debugpy==1.8.8
+ decorator==5.1.1
+ defusedxml==0.7.1
+ diffusers==0.31.0
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ fastapi==0.115.6
+ fastjsonschema==2.20.0
+ ffmpy==0.4.0
+ filelock==3.16.1
+ Flask==3.1.0
+ fonttools==4.55.2
+ fqdn==1.5.1
+ fsspec==2024.10.0
+ gradio==4.44.1
+ gradio_client==1.3.0
+ h11==0.14.0
+ httpcore==1.0.6
+ httpx==0.27.2
+ huggingface-hub==0.26.3
+ idna==3.10
+ importlib_metadata==8.5.0
+ importlib_resources==6.4.5
+ ipykernel==6.29.5
+ ipython==8.18.1
+ isoduration==20.11.0
+ itsdangerous==2.2.0
+ jedi==0.19.2
+ Jinja2==3.1.4
+ joblib==1.4.2
+ json5==0.9.28
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ jupyter-events==0.10.0
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ jupyter_server==2.14.2
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.3.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ kiwisolver==1.4.7
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.3
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mistune==3.0.2
+ mpmath==1.3.0
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ nltk==3.9.1
+ notebook_shim==0.2.4
+ numpy==2.0.2
+ opencv-python==4.10.0.84
+ orjson==3.10.12
+ overrides==7.7.0
+ packaging==24.2
+ pandas==2.2.3
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==10.4.0
+ pipeline==0.1.0
+ platformdirs==4.3.6
+ prometheus_client==0.21.0
+ prompt_toolkit==3.0.48
+ psutil==6.1.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycparser==2.22
+ pydantic==2.10.3
+ pydantic_core==2.27.1
+ pydub==0.25.1
+ Pygments==2.18.0
+ pyparsing==3.2.0
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ python-multipart==0.0.19
+ pytz==2024.2
+ PyYAML==6.0.2
+ pyzmq==26.2.0
+ referencing==0.35.1
+ regex==2024.11.6
+ requests==2.32.3
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==13.9.4
+ rpds-py==0.21.0
+ ruff==0.8.2
+ safetensors==0.4.5
+ scikit-learn==1.5.2
+ scipy==1.13.1
+ semantic-version==2.10.0
+ Send2Trash==1.8.3
+ shellingham==1.5.4
+ six==1.16.0
+ sklearn==0.0
+ sniffio==1.3.1
+ soupsieve==2.6
+ stack-data==0.6.3
+ starlette==0.41.3
+ sympy==1.13.1
+ terminado==0.18.1
+ threadpoolctl==3.5.0
+ tinycss2==1.4.0
+ tokenizers==0.21.0
+ tomli==2.1.0
+ tomlkit==0.12.0
+ torch==2.5.1
+ tornado==6.4.1
+ tqdm==4.67.0
+ traitlets==5.14.3
+ transformers==4.47.0
+ typer==0.15.1
+ types-python-dateutil==2.9.0.20241003
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ uri-template==1.3.0
+ urllib3==2.2.3
+ uvicorn==0.32.1
+ wcwidth==0.2.13
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==12.0
+ Werkzeug==3.1.3
+ zipp==3.21.0
style.css DELETED
@@ -1,28 +0,0 @@
- body {
-     padding: 2rem;
-     font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
- }
- 
- h1 {
-     font-size: 16px;
-     margin-top: 0;
- }
- 
- p {
-     color: rgb(107, 114, 128);
-     font-size: 15px;
-     margin-bottom: 10px;
-     margin-top: 5px;
- }
- 
- .card {
-     max-width: 620px;
-     margin: 0 auto;
-     padding: 16px;
-     border: 1px solid lightgray;
-     border-radius: 16px;
- }
- 
- .card p:last-child {
-     margin-bottom: 0;
- }