File size: 3,514 Bytes
5ebeb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import math
import os
import random
import xml.etree.ElementTree as ET

from PIL import Image, ImageDraw, ImageFont


class XmlParser:
    """Load a PAGE XML file and render or export the text lines it contains.

    The document is parsed once in ``__init__``; ``visualize_xml`` paints the
    text-line polygons and their transcriptions on top of an image, and
    ``xml_to_txt`` dumps the transcriptions to a plain-text file.
    """

    # Namespace used when the root element is not namespace-qualified.
    DEFAULT_NAMESPACE = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"

    def __init__(self, page_xml="./page_xml.xml"):
        """Parse ``page_xml`` and remember its root element and namespace.

        Args:
            page_xml: Path (or file object) handed to ``ElementTree.parse``.
        """
        self.tree = ET.parse(page_xml, parser=ET.XMLParser(encoding="utf-8"))
        self.root = self.tree.getroot()

        # Take the namespace from the root tag ("{uri}PcGts") so that files
        # written against other PAGE schema versions are matched as well.
        root_tag = self.root.tag
        if isinstance(root_tag, str) and root_tag.startswith("{") and "}" in root_tag:
            self.namespace = root_tag[: root_tag.index("}") + 1]
        else:
            self.namespace = self.DEFAULT_NAMESPACE

    def _polygon_points(self, element):
        """Return the ``Coords`` polygon of ``element`` as ``(x, y)`` int tuples.

        An element without a ``Coords`` child or ``points`` attribute yields
        an empty list.
        """
        coords = element.find(f"{self.namespace}Coords")
        if coords is None:
            return []
        raw_points = coords.attrib.get("points", "").split()
        return [tuple(map(int, point.split(","))) for point in raw_points]

    def _line_text(self, textline):
        """Return the ``TextEquiv/Unicode`` text of ``textline``, or ``""`` if absent."""
        unicode_element = textline.find(f"{self.namespace}TextEquiv/{self.namespace}Unicode")
        if unicode_element is None or unicode_element.text is None:
            return ""
        return unicode_element.text

    @staticmethod
    def _text_width(draw, text, font):
        """Return the rendered pixel width of ``text`` as an int.

        ``ImageDraw.textsize`` no longer exists in current Pillow releases,
        so ``textbbox`` is preferred and ``textsize`` is only a fallback for
        old installations that lack ``textbbox``.
        """
        if hasattr(draw, "textbbox"):
            left, _top, right, _bottom = draw.textbbox((0, 0), text, font=font)
            return int(right - left)
        return draw.textsize(text, font=font)[0]

    def visualize_xml(
        self,
        background_image,
        font_size=9,
        text_offset=10,
        font_path_tff="./src/htr_pipeline/utils/templates/arial.ttf",
    ):
        """Overlay every text line's polygon and transcription on an image.

        Args:
            background_image: Array passed to ``Image.fromarray``.
            font_size: Base font size; it is multiplied by
                ``widest_region_width / 400`` before the font is loaded.
            text_offset: Number of pixels the text is placed above the top
                edge of its line polygon.
            font_path_tff: Path of the TrueType font used for the text.

        Returns:
            An RGBA ``PIL.Image`` with one semi-transparent polygon per text
            line (one random colour per text region) and the line text
            centred horizontally over the polygon.
        """
        image = Image.fromarray(background_image).convert("RGBA")
        namespace = self.namespace
        regions = self.root.findall(f".//{namespace}TextRegion")

        # The widest region bounding box drives the font scaling.
        max_bbox_width = 0
        for textregion in regions:
            region_points = self._polygon_points(textregion)
            if not region_points:
                continue
            region_xs = [x for x, _ in region_points]
            max_bbox_width = max(max_bbox_width, max(region_xs) - min(region_xs))

        scaling_factor = max_bbox_width / 400.0
        # Clamp to 1: a size of 0 (no regions, or only very narrow ones)
        # is rejected by ImageFont.truetype.
        font_size_scaled = max(1, int(font_size * scaling_factor))
        font = ImageFont.truetype(font_path_tff, font_size_scaled)

        for textregion in regions:
            fill_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 100)
            for textline in textregion.findall(f".//{namespace}TextLine"):
                points = self._polygon_points(textline)
                if len(points) < 2:
                    # Nothing drawable: Coords missing or degenerate.
                    continue

                # Each line gets its own overlay so that overlapping shapes
                # are alpha-blended instead of overwriting one another.
                poly_image = Image.new("RGBA", image.size)
                poly_draw = ImageDraw.Draw(poly_image)
                poly_draw.polygon(points, fill=fill_color)

                text = self._line_text(textline)
                if text:
                    x_coords, y_coords = zip(*points)
                    min_x, max_x = min(x_coords), max(x_coords)
                    text_width = self._text_width(poly_draw, text, font)
                    text_position = (
                        (min_x + max_x) // 2 - text_width // 2,  # centre horizontally
                        min(y_coords) - text_offset,  # sit above the polygon
                    )
                    poly_draw.text(text_position, text, fill=(0, 0, 0), font=font)

                image = Image.alpha_composite(image, poly_image)

        return image

    def xml_to_txt(self, output_file="page_txt.txt"):
        """Write the transcription of every text line to ``output_file``.

        One line of output is written per ``TextLine`` (an empty line when it
        has no text), and a blank line follows each ``TextRegion``.

        Args:
            output_file: Path of the UTF-8 text file to create or overwrite.
        """
        namespace = self.namespace
        with open(output_file, "w", encoding="utf-8") as f:
            for textregion in self.root.findall(f".//{namespace}TextRegion"):
                for textline in textregion.findall(f".//{namespace}TextLine"):
                    f.write(self._line_text(textline) + "\n")
                f.write("\n")