wjbmattingly committed on
Commit
f8ba7b0
1 Parent(s): c10450c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
3
+ import torch
4
+ import spaces
5
+ import subprocess
6
+ import json
7
+ from PIL import Image, ImageDraw
8
+ import os
9
+ import tempfile
10
+
11
# Dictionary of model names and their corresponding HuggingFace model IDs
# (UI display label -> Hub repo id; labels feed the Gradio dropdown below).
MODEL_OPTIONS = {
    "Microsoft Handwritten": "microsoft/trocr-base-handwritten",
    "Medieval Base": "medieval-data/trocr-medieval-base",
    "Medieval Latin Caroline": "medieval-data/trocr-medieval-latin-caroline",
    "Medieval Castilian Hybrida": "medieval-data/trocr-medieval-castilian-hybrida",
    "Medieval Humanistica": "medieval-data/trocr-medieval-humanistica",
    "Medieval Textualis": "medieval-data/trocr-medieval-textualis",
    "Medieval Cursiva": "medieval-data/trocr-medieval-cursiva",
    "Medieval Semitextualis": "medieval-data/trocr-medieval-semitextualis",
    "Medieval Praegothica": "medieval-data/trocr-medieval-praegothica",
    "Medieval Semihybrida": "medieval-data/trocr-medieval-semihybrida",
    "Medieval Print": "medieval-data/trocr-medieval-print"
}

# Global variables to store the current model and processor.
# load_model() mutates these so the most recently selected checkpoint is
# cached across requests instead of being re-downloaded on every call.
current_model = None        # cached VisionEncoderDecoderModel (None until first load)
current_processor = None    # cached TrOCRProcessor (None until first load)
current_model_name = None   # MODEL_OPTIONS key of the cached model
30
+
31
def load_model(model_name):
    """Return the (processor, model) pair for *model_name*, caching globally.

    Downloads/loads from the HuggingFace Hub only when the requested name
    differs from the currently cached one. The model is moved to the GPU on
    every call (a no-op when it is already there).

    Parameters
    ----------
    model_name : str
        A key of MODEL_OPTIONS.

    Returns
    -------
    tuple
        (TrOCRProcessor, VisionEncoderDecoderModel) ready for inference.
    """
    global current_model, current_processor, current_model_name

    needs_reload = model_name != current_model_name
    if needs_reload:
        hub_id = MODEL_OPTIONS[model_name]
        current_processor = TrOCRProcessor.from_pretrained(hub_id)
        current_model = VisionEncoderDecoderModel.from_pretrained(hub_id)
        current_model_name = model_name

    # Unconditional, idempotent move — mirrors the original behaviour of
    # placing the cached model on the GPU on every invocation.
    current_model = current_model.to('cuda')

    return current_processor, current_model
44
+
45
@spaces.GPU
def process_image(image, model_name):
    """Segment *image* into text lines with Kraken and transcribe each line.

    Parameters
    ----------
    image : PIL.Image.Image
        Page image uploaded through the Gradio UI.
    model_name : str
        Key of MODEL_OPTIONS selecting the TrOCR checkpoint.

    Returns
    -------
    tuple
        (image with detected baselines drawn in red,
         newline-joined transcription of all detected lines)

    Raises
    ------
    subprocess.CalledProcessError
        If the Kraken CLI exits non-zero.
    """
    # Save the uploaded image to a temporary file for Kraken.
    # JPEG cannot store an alpha channel, so normalise to RGB first
    # (the original crashed on RGBA/PNG uploads).
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_img:
        image.convert("RGB").save(temp_img, format="JPEG")
        temp_img_path = temp_img.name

    # Unique output path instead of a fixed "lines.json" so concurrent
    # requests cannot clobber each other's segmentation results.
    json_fd, lines_json_path = tempfile.mkstemp(suffix=".json")
    os.close(json_fd)

    try:
        # Run Kraken for baseline segmentation. Argument-list form with the
        # default shell=False avoids shell injection / quoting problems that
        # the original f-string + shell=True command had.
        subprocess.run(
            ["kraken", "-i", temp_img_path, lines_json_path,
             "binarize", "segment", "-bl"],
            check=True,
        )

        # Load the detected lines from the JSON file Kraken wrote.
        with open(lines_json_path, 'r') as f:
            lines_data = json.load(f)

        processor, model = load_model(model_name)

        # Transcribe each detected line independently.
        transcriptions = []
        for line in lines_data['lines']:
            # The original cropped with only the first/last baseline points,
            # which produces a degenerate (zero-height) box for horizontal
            # baselines. Prefer the line's boundary polygon when Kraken
            # provides one; otherwise bound all baseline points.
            # NOTE(review): exact schema depends on the Kraken version — confirm.
            points = line.get('boundary') or line['baseline']
            xs = [pt[0] for pt in points]
            ys = [pt[1] for pt in points]
            line_image = image.crop((min(xs), min(ys), max(xs), max(ys)))

            # Prepare the crop for TrOCR and move it to the GPU.
            pixel_values = processor(line_image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to('cuda')

            # Greedy generation (no beam search), no gradient bookkeeping.
            with torch.no_grad():
                generated_ids = model.generate(pixel_values)

            generated_text = processor.batch_decode(
                generated_ids, skip_special_tokens=True
            )[0]
            transcriptions.append(generated_text)
    finally:
        # Always remove the temp files, even if Kraken or the model raised
        # (the original leaked them on any failure).
        for path in (temp_img_path, lines_json_path):
            if os.path.exists(path):
                os.unlink(path)

    # Draw the detected baselines on the image for visual feedback.
    # JSON gives coordinates as lists; ImageDraw expects tuples.
    draw = ImageDraw.Draw(image)
    for line in lines_data['lines']:
        coords = [tuple(pt) for pt in line['baseline']]
        draw.line(coords, fill="red", width=2)

    return image, "\n".join(transcriptions)
96
+
97
# Gradio interface: page image + model picker in, annotated image + text out.
with gr.Blocks() as iface:
    gr.Markdown("# Medieval Document Transcription")
    gr.Markdown("Upload an image of a medieval document and select a model to transcribe it. The tool will detect lines and transcribe each line separately.")

    # Input row: the page to transcribe and the checkpoint to use.
    with gr.Row():
        page_input = gr.Image(type="pil", label="Input Image")
        model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Select Model", value="Medieval Base")

    # Output row: image annotated with detected lines plus the transcription.
    with gr.Row():
        annotated_page = gr.Image(type="pil", label="Detected Lines")
        transcript_box = gr.Textbox(label="Transcription", lines=10)

    transcribe_btn = gr.Button("Transcribe")
    transcribe_btn.click(fn=process_image, inputs=[page_input, model_choice], outputs=[annotated_page, transcript_box])

iface.launch()