morisono committed
Commit 3ee750e
1 Parent(s): 22eec52

Upload folder using huggingface_hub

Files changed (7)
  1. .gitattributes +1 -0
  2. README.md +85 -8
  3. data/demo.pdf +3 -0
  4. data/readme.txt +2 -0
  5. data/success.csv +0 -0
  6. requirements.txt +5 -0
  7. src/app/run.py +162 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/demo.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,89 @@
  ---
- title: Camelot Pg
- emoji: 📚
- colorFrom: blue
- colorTo: indigo
+ title: camelot-pg
+ app_file: src/app/run.py
  sdk: gradio
- sdk_version: 4.36.1
- app_file: app.py
- pinned: false
+ sdk_version: 4.32.2
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PDF Table Parser
+
+ This script extracts tables from PDF files and saves them as CSV files. It provides a command-line interface (CLI) for batch processing and an optional web UI for interactive processing.
+
+ ## Features
+
+ - Multi-page PDF support
+ - Progress display per line/row, per page, and per file
+ - CSV output in UTF-8 with BOM encoding
+ - Customizable edge and row tolerances for table detection
+ - Optional web UI for interactive processing using Gradio
+
+ ## Installation
+
+ 1. Clone the repository or download the script.
+ 2. Install the required dependencies:
+ ```bash
+ pip install rich camelot-py polars gradio gradio_pdf
+ ```
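+
+ Alternatively, install the pinned versions from this repository's `requirements.txt`:
+
+ ```bash
+ pip install -r requirements.txt
+ ```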
+
+ ## Usage
+
+ ### Command-Line Interface (CLI)
+
+ To run the script via CLI, use the following command:
+
+ ```bash
+ python src/app/run.py input1.pdf input2.pdf output1.csv output2.csv
+ ```
+
+ #### Arguments:
+
+ - `input_files`: List of input PDF files
+ - `output_files`: List of output CSV files (must match the number of input files)
+
+ #### Optional Arguments:
+
+ - `--delimiter`: Output file delimiter (default: `,`)
+ - `--edge_tol`: Tolerance parameter used to specify the distance between text and table edges (default: `50`)
+ - `--row_tol`: Tolerance parameter used to specify the distance between table rows (default: `50`)
+ - `--pages`: Pages to process, e.g. `1`, `1,3`, `3-end`, or `all` (default: `all`)
+ - `--webui`: Launch the web UI
+
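+ These tolerances are passed straight through to Camelot's `stream` flavor. As a rough sketch of what `src/app/run.py` does internally for a single input/output pair (file paths below are illustrative):
+
+ ```python
+ import camelot
+ import polars as pl
+
+ # Detect tables on page 1 using text-based ("stream") detection.
+ tables = camelot.read_pdf(
+     "data/demo.pdf",
+     flavor="stream",
+     pages="1",     # same syntax as --pages: '1', '1,3', '3-end', 'all'
+     edge_tol=50,   # --edge_tol: distance between text and the table edge
+     row_tol=50,    # --row_tol: how close text rows must be to merge
+ )
+
+ # Each table.df is a pandas DataFrame; stack them and write a single CSV.
+ df = pl.concat([pl.DataFrame(t.df) for t in tables])
+ df.write_csv("data/output.csv", separator=",")
+ ```
+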
+ ### Web UI
+
+ To run the script with the web UI, use the following command:
+
+ ```bash
+ python src/app/run.py data/demo.pdf data/output.csv --webui
+ ```
+
+ This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
+
+ ## Example
+
+ ### CLI Example
+
+ ```bash
+ python src/app/run.py data/demo.pdf data/output.csv --delimiter ";" --edge_tol 60 --row_tol 40
+ ```
+
+ ### Web UI Example
+
+ ```bash
+ python src/app/run.py data/demo.pdf data/output.csv --webui
+ ```
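+
+ ### Programmatic Example
+
+ `PDFTableParser` in `src/app/run.py` can also be used directly from Python. A minimal sketch (the plain `from run import ...` assumes `src/app` is on `PYTHONPATH`):
+
+ ```python
+ from run import PDFTableParser  # assumes src/app is on PYTHONPATH
+
+ parser = PDFTableParser(
+     input_files=["data/demo.pdf"],
+     output_files=["data/output.csv"],
+     delimiter=",",
+     edge_tol=50,
+     row_tol=50,
+     pages="all",
+ )
+ parser.process_files()  # extracts tables from each PDF and writes one CSV per input
+ ```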
+
+ ## Handling Interruptions
+
+ The script handles `SIGINT` and `SIGTERM` signals gracefully, ensuring that processing can be interrupted safely.
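+
+ A minimal sketch of the same pattern (mirroring `handle_signal` in `src/app/run.py`):
+
+ ```python
+ import signal
+ import sys
+
+ def handle_signal(signum, frame):
+     # Report the interruption and exit with a non-zero status.
+     print("Process interrupted.")
+     sys.exit(1)
+
+ signal.signal(signal.SIGINT, handle_signal)   # Ctrl-C
+ signal.signal(signal.SIGTERM, handle_signal)  # termination request
+ ```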
+
+ ## License
+
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgements
+
+ This script uses the following libraries:
+ - [Rich](https://github.com/willmcgugan/rich) for console output and progress bars
+ - [Camelot](https://github.com/camelot-dev/camelot) for PDF table extraction
+ - [Polars](https://github.com/pola-rs/polars) for efficient DataFrame operations
+ - [Gradio](https://github.com/gradio-app/gradio) for the web UI
+ - [gradio_pdf](https://github.com/gradio-app/gradio) for PDF handling in Gradio
data/demo.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cdbced74a67ec85b5891d3ce931edc3345e9251512d7de314b867d2a716211b
+ size 1437530
data/readme.txt ADDED
@@ -0,0 +1,2 @@
+ Demo file from:
+ - https://www.npo-homepage.go.jp/npoportal/certification
data/success.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ camelot_py==0.11.0
+ gradio==4.36.1
+ gradio_pdf==0.0.11
+ polars==0.20.31
+ rich==13.7.1
src/app/run.py ADDED
@@ -0,0 +1,162 @@
+ import argparse
+ import os
+ import signal
+ import sys
+ import time
+ import tempfile
+ import zipfile
+ from rich.console import Console
+ from rich.progress import track
+ import camelot
+ import polars as pl
+ import gradio as gr
+ from gradio_pdf import PDF
+
+ console = Console()
+
+ class Interface:
+     """Small helpers for temporary directories and zip archives."""
+
+     @staticmethod
+     def get_tempdir():
+         timestamp = int(time.time())
+         temp_dir = tempfile.mkdtemp()
+         return timestamp, temp_dir
+
+     @staticmethod
+     def create_zip(file_list, zip_path, password=None):
+         with zipfile.ZipFile(zip_path, "w", allowZip64=True) as zipf:
+             if password:
+                 # Note: ZipFile.setpassword only affects extraction; entries are not encrypted on write.
+                 zipf.setpassword(bytes(password, 'utf-8'))
+             for item in file_list:
+                 if os.path.isdir(item):
+                     for root, _, files in os.walk(item):
+                         for file in files:
+                             file_path = os.path.join(root, file)
+                             arcname = os.path.relpath(file_path, item)
+                             zipf.write(file_path, arcname)
+                 else:
+                     arcname = os.path.basename(item)
+                     zipf.write(item, arcname)
+
+ class PDFTableParser:
+     def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
+         self.input_files = input_files
+         self.output_files = output_files
+         self.delimiter = delimiter
+         self.edge_tol = edge_tol
+         self.row_tol = row_tol
+         self.pages = pages
+
+     def read_tables(self, file_name):
+         try:
+             console.print(f"Reading tables from {file_name}...")
+             tables = camelot.read_pdf(file_name, flavor='stream', edge_tol=self.edge_tol, row_tol=self.row_tol, pages=self.pages)
+             console.print(f"Found {len(tables)} tables in {file_name}.")
+             return tables
+         except Exception as e:
+             console.print(f"[red]Error reading {file_name}: {e}[/red]")
+             return None
+
+     def save_tables_as_csv(self, tables, output_file):
+         try:
+             console.print(f"Saving tables to {output_file}...")
+             df = pl.concat([pl.DataFrame(table.df) for table in tables])
+             df.write_csv(output_file, separator=self.delimiter)
+             console.print(f"Saved tables to {output_file}.")
+         except Exception as e:
+             console.print(f"[red]Error saving to {output_file}: {e}[/red]")
+
+     def estimate_processing_time(self, file_name):
+         # Rough heuristic based on the raw (decoded) file content; not an exact prediction.
+         try:
+             with open(file_name, 'rb') as f:
+                 content = f.read().decode('utf-8', errors='ignore')
+             lines = content.count('\n')
+             words = len(content.split())
+             chars = len(content)
+             estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
+             console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
+             return estimated_time
+         except Exception as e:
+             console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
+             return 0
+
+     def process_files(self):
+         for input_file, output_file in track(zip(self.input_files, self.output_files), description="Processing files"):
+             self.estimate_processing_time(input_file)
+             tables = self.read_tables(input_file)
+             if tables:
+                 self.save_tables_as_csv(tables, output_file)
+
+ class WebUI:
+     def __init__(self):
+         pass
+
+     @staticmethod
+     def process_pdf(pdf_file, output_path, edge_tol, row_tol, pages):
+         ts, tempd = Interface.get_tempdir()
+         tempf = os.path.join(tempd, output_path)
+
+         parser = PDFTableParser([pdf_file], [tempf], ',', edge_tol, row_tol, pages)
+         tables = parser.read_tables(pdf_file)
+         if tables:
+             parser.save_tables_as_csv(tables, tempf)
+             df = pl.concat([pl.DataFrame(table.df) for table in tables])
+
+             return df, [tempf], {"status": "success", "message": f"Processed PDF and saved as {tempf}"}
+         return None, None, {"status": "error", "message": "Failed to process PDF"}
+
+     def run(self):
+         with gr.Blocks(title="PDF Table Parser", css="body { font-family: Arial, sans-serif; } footer { visibility: hidden; }") as app:
+             gr.Markdown("# PDF Table Parser")
+             description = "Upload a PDF file to extract tables"
+             gr.Markdown(f"### {description}")
+             with gr.Row():
+                 with gr.Column():
+                     pdf_in = PDF(label="Document")
+                     with gr.Row():
+                         edge_tol = gr.Number(50, label="Edge tol")
+                         row_tol = gr.Number(50, label="Row tol")
+                     pages = gr.Textbox('1', label="Pages", info="You can pass 'all', '3-end', etc.")
+                     output_path = gr.Textbox("output.csv", label="Output Path")
+                 with gr.Column():
+                     status_msg = gr.JSON(label="Status Message")
+                     output_files = gr.Files(label="Output Files")
+
+             with gr.Row():
+                 output_df = gr.Dataframe(label="Extracted Table")
+             examples = gr.Examples([["data/demo.pdf"]], inputs=pdf_in)
+             pdf_in.change(WebUI.process_pdf,
+                           inputs=[pdf_in, output_path, edge_tol, row_tol, pages],
+                           outputs=[output_df, output_files, status_msg])
+
+         app.launch()
+
+ def handle_signal(signum, frame):
+     console.print("\n[red]Process interrupted.[/red]")
+     sys.exit(1)
+
+ def main(args):
+     parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
+     parser.process_files()
+
+ if __name__ == "__main__":
+     signal.signal(signal.SIGINT, handle_signal)
+     signal.signal(signal.SIGTERM, handle_signal)
+
+     parser = argparse.ArgumentParser(description="PDF Table Parser")
+     parser.add_argument("input_files", nargs='+', help="List of input PDF files")
+     parser.add_argument("output_files", nargs='+', help="List of output CSV files")
+     parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
+     parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
+     parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
+     parser.add_argument("--pages", type=str, default='all', help="Pages to process, e.g. '1', '1,3', '3-end', or 'all' (default: all)")
+     parser.add_argument("--webui", action='store_true', help="Launch the web UI")
+
+     args = parser.parse_args()
+
+     if len(args.input_files) != len(args.output_files):
+         console.print("[red]The number of input files and output files must match.[/red]")
+         sys.exit(1)
+
+     if args.webui:
+         webui = WebUI()
+         webui.run()
+     else:
+         main(args)