import gradio as gr
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
import torch
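# NOTE: loading both checkpoints requires substantial GPU memory (the math
# model alone has 72B parameters); a smaller checkpoint such as
# "Qwen/Qwen2.5-Math-7B-Instruct" can be substituted below if resources are tight.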

# Load the OCR model and processor
ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype="auto",
    device_map="auto",
)

ocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
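# (AutoProcessor for Qwen2-VL bundles the image preprocessor and the chat
# template, so a single call below turns an (image, text) pair into tensors.)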

# Load the Math model and tokenizer
math_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Math-72B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)

math_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-72B-Instruct")

# OCR extraction function
def ocr_and_query(image, question):
    # Build a chat message with an image placeholder; the processor pairs it
    # with the actual PIL image passed in below
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Render the chat template and tokenize text and image together
    text_prompt = ocr_processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = ocr_processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")

    # Run the model to generate OCR results; send inputs to the model's device
    # instead of hard-coding "cuda" (device_map="auto" may place layers elsewhere)
    inputs = inputs.to(ocr_model.device)
    output_ids = ocr_model.generate(**inputs, max_new_tokens=1024)

    # Strip the prompt tokens from each sequence before decoding
    generated_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]

    return output_text
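
# Example call (hypothetical image file):
#   from PIL import Image
#   ocr_and_query(Image.open("receipt.png"), "What is the total amount?")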

# Math problem solving function
def solve_math_problem(prompt):
    # Chain-of-Thought (CoT) system prompt: have the model reason step by step
    # and put the final answer inside \boxed{}
    messages = [
        {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
        {"role": "user", "content": prompt}
    ]

    text = math_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Move inputs to the model's device instead of hard-coding "cuda"
    model_inputs = math_tokenizer([text], return_tensors="pt").to(math_model.device)

    generated_ids = math_model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Strip the prompt tokens from each sequence before decoding
    generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = math_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response
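
# Optional helper (a minimal sketch, not wired into the app): pull the final
# answer out of the model's \boxed{...} marker. Assumes a single boxed span
# with no nested braces.
import re

def extract_boxed_answer(response):
    match = re.search(r"\\boxed\{([^{}]*)\}", response)
    return match.group(1) if match else response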

# Function to clear inputs and output
def clear_inputs():
    return None, "", ""

# Task dispatcher: routes the inputs to the appropriate pipeline
def gradio_app(image, question, task):
    if task == "OCR and Query":
        if image is None:
            return image, question, "Please upload an image."
        return image, question, ocr_and_query(image, question)
    elif task == "Solve Math Problem from Image":
        if image is None:
            return image, question, "Please upload an image."
        # Prompt the OCR model explicitly rather than passing an empty question
        extracted_text = ocr_and_query(image, "Extract the math problem from this image.")
        math_solution = solve_math_problem(extracted_text)
        return image, extracted_text, math_solution
    elif task == "Solve Math Problem from Text":
        if question.strip() == "":
            return image, question, "Please enter a math problem."
        math_solution = solve_math_problem(question)
        return image, question, math_solution
    else:
        return image, question, "Please select a task."

# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# Image OCR and Math Solver")
    gr.Markdown("Upload an image, enter your question or math problem, and select the appropriate task.")
    
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload Image")
        text_input = gr.Textbox(lines=2, placeholder="Enter your question or math problem here...", label="Input")
    
    with gr.Row():
        task_radio = gr.Radio(["OCR and Query", "Solve Math Problem from Image", "Solve Math Problem from Text"], label="Task")
    
    with gr.Row():
        complete_button = gr.Button("Complete")
        clear_button = gr.Button("Clear")
    
    output = gr.Markdown(label="Output")
    
    # Event listeners
    complete_button.click(fn=gradio_app, inputs=[image_input, text_input, task_radio], outputs=[image_input, text_input, output])
    clear_button.click(fn=clear_inputs, outputs=[image_input, text_input, output])

# Launch the app; share=True also creates a temporary public Gradio URL
app.launch(share=True)