File size: 2,424 Bytes
3ee750e
 
 
 
 
 
f92f684
 
3ee750e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f92f684
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import json
import polars as pl
import gradio as gr
from gradio_pdf import PDF

from common import Interface
from parser import PDFTableParser


class WebUI:
    """Gradio front end that extracts tables from an uploaded PDF.

    ``run`` builds the page and starts the server; ``process_pdf`` is the
    callback fired whenever the uploaded document changes.
    """

    # File name used when the user leaves the output box empty or enters
    # something that does not reduce to a usable file name.
    DEFAULT_OUTPUT_NAME = "output.csv"

    def __init__(self):
        pass

    @staticmethod
    def _safe_output_name(output_path):
        """Reduce a user-supplied output path to a bare file name.

        The value comes from a free-text box in the web UI, so it must not be
        allowed to steer the write outside the temporary directory: an
        absolute path would make ``os.path.join`` discard the directory
        entirely, and ``..`` segments would climb out of it.
        """
        name = os.path.basename((output_path or "").strip())
        if name in ("", ".", ".."):
            return WebUI.DEFAULT_OUTPUT_NAME
        return name

    @staticmethod
    def process_pdf(pdf_file, output_path, edge_tol, row_tol, pages):
        """Extract tables from ``pdf_file`` and save them as a CSV file.

        Args:
            pdf_file: Path of the uploaded PDF; may be empty/``None`` when no
                document is loaded.
            output_path: Desired name of the CSV file. Only its final path
                component is used; the file is always created inside the
                directory returned by ``Interface.get_tempdir()``.
            edge_tol: Forwarded unchanged to ``PDFTableParser``.
            row_tol: Forwarded unchanged to ``PDFTableParser``.
            pages: Page selection string forwarded to ``PDFTableParser``.

        Returns:
            A ``(dataframe, files, status)`` tuple matching the three output
            components wired up in ``run``. On failure the first two items
            are ``None`` and ``status["status"]`` is ``"error"``.
        """
        # Nothing to parse when no document is loaded; report that instead
        # of handing an empty path to the parser.
        if not pdf_file:
            return None, None, {"status": "error", "message": "No PDF file provided"}

        # get_tempdir() yields a pair; only the directory part is needed here.
        _, tempd = Interface.get_tempdir()
        tempf = os.path.join(tempd, WebUI._safe_output_name(output_path))

        parser = PDFTableParser([pdf_file], [tempf], ',', edge_tol, row_tol, pages)
        tables = parser.read_tables(pdf_file)
        if not tables:
            return None, None, {"status": "error", "message": "Failed to process PDF"}

        parser.save_tables_as_csv(tables, tempf)
        # Stack every extracted table into one frame for the preview grid.
        df = pl.concat([pl.DataFrame(table.df) for table in tables])
        return df, [tempf], {"status": "success", "message": f"Processed PDF and saved as {tempf}"}

    def run(self):
        """Build the Gradio page, wire up the callback and launch the app."""
        with gr.Blocks(title="PDF Table Parser", css="body { font-family: Arial, sans-serif; } footer { visibility: hidden; }") as app:
            gr.Markdown("# PDF Table Parser")
            description = "Upload a PDF file to extract tables"
            gr.Markdown(f"### {description}")
            with gr.Row():
                with gr.Column():
                    pdf_in = PDF(label="Document")
                    with gr.Row():
                        edge_tol = gr.Number(50, label="Edge tol")
                        row_tol = gr.Number(50, label="Row tol")
                        pages = gr.Textbox('1', label="Pages", info="You can pass 'all', '3-end', etc.")
                        output_path = gr.Textbox(self.DEFAULT_OUTPUT_NAME, label="Output Path")
                with gr.Column():
                    status_msg = gr.JSON(label="Status Message")
                    output_files = gr.Files(label="Output Files")

            with gr.Row():
                output_df = gr.Dataframe(label="Extracted Table")
            gr.Examples([["data/demo.pdf"]], inputs=pdf_in)
            # Re-run the extraction every time the loaded document changes.
            pdf_in.change(
                self.process_pdf,
                inputs=[pdf_in, output_path, edge_tol, row_tol, pages],
                outputs=[output_df, output_files, status_msg],
            )

        app.launch()

def main(args):
    """Run the non-interactive conversion described by ``args``.

    ``args`` must expose ``input_files``, ``output_files``, ``delimiter``,
    ``edge_tol``, ``row_tol`` and ``pages`` attributes. They are passed
    positionally, in that order, to ``PDFTableParser``, whose
    ``process_files()`` method is then invoked.
    """
    PDFTableParser(
        args.input_files,
        args.output_files,
        args.delimiter,
        args.edge_tol,
        args.row_tol,
        args.pages,
    ).process_files()

if __name__ == "__main__":
    # Running the module as a script starts the web UI; main() above is not
    # invoked from this entry point.
    WebUI().run()