morisono committed on
Commit
f92f684
1 Parent(s): b5b4335

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -32,7 +32,7 @@ This script extracts tables from PDF files and saves them as CSV files. It suppo
32
  To run the script via CLI, use the following command:
33
 
34
  ```bash
35
- python src/app/run.py input1.pdf input2.pdf output1.csv output2.csv
36
  ```
37
 
38
  #### Arguments:
@@ -52,7 +52,7 @@ python src/app/run.py input1.pdf input2.pdf output1.csv output2.csv
52
  To run the script with the web UI, use the following command:
53
 
54
  ```bash
55
- python src/app/run.py data/demo.pdf data/output.csv --webui
56
  ```
57
 
58
  This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
@@ -62,28 +62,9 @@ This will launch a Gradio-based web application where you can upload PDFs and vi
62
  ### CLI Example
63
 
64
  ```bash
65
- python src/app/run.py data/demo.pdf data/output.csv --delimiter ";" --edge_tol 60 --row_tol 40
66
  ```
67
 
68
- ### Web UI Example
69
-
70
- ```bash
71
- python src/app/run.py data/demo.pdf data/output.csv --webui
72
- ```
73
-
74
- ## Handling Interruptions
75
-
76
- The script handles `SIGINT` and `SIGTERM` signals gracefully, ensuring that processing can be interrupted safely.
77
-
78
  ## License
79
 
80
- This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
81
-
82
- ## Acknowledgements
83
-
84
- This script uses the following libraries:
85
- - [Rich](https://github.com/willmcgugan/rich) for console output and progress bars
86
- - [Camelot](https://github.com/camelot-dev/camelot) for PDF table extraction
87
- - [Polars](https://github.com/pola-rs/polars) for efficient DataFrame operations
88
- - [Gradio](https://github.com/gradio-app/gradio) for the web UI
89
- - [gradio_pdf](https://github.com/gradio-app/gradio) for PDF handling in Gradio
 
32
  To run the script via CLI, use the following command:
33
 
34
  ```bash
35
+ python src/app/parser.py input1.pdf input2.pdf output1.csv output2.csv
36
  ```
37
 
38
  #### Arguments:
 
52
  To run the script with the web UI, use the following command:
53
 
54
  ```bash
55
+ python src/app/run.py
56
  ```
57
 
58
  This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
 
62
  ### CLI Example
63
 
64
  ```bash
65
+ python src/app/parser.py data/demo.pdf data/output.csv --delimiter ";" --edge_tol 60 --row_tol 40
66
  ```
67
 
 
 
 
 
 
 
 
 
 
 
68
  ## License
69
 
70
+ This project is licensed under the MIT License.
 
 
 
 
 
 
 
 
 
src/app/__pycache__/common.cpython-310.pyc ADDED
Binary file (1.1 kB). View file
 
src/app/__pycache__/parser.cpython-310.pyc ADDED
Binary file (4.1 kB). View file
 
src/app/common.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import tempfile
3
+ import zipfile
4
+
5
class Interface:
    """Filesystem helpers: temporary directories and zip archives.

    Both helpers are static methods, so they can be called either on the
    class (``Interface.get_tempdir()``) or on an instance.
    """

    @staticmethod
    def get_tempdir():
        """Create a fresh temporary directory.

        Returns:
            A ``(timestamp, temp_dir)`` tuple: the current Unix time truncated
            to whole seconds, and the path of a newly created directory.
            ``tempfile.mkdtemp`` does not clean up after itself, so the caller
            is responsible for removing the directory.
        """
        timestamp = int(time.time())
        temp_dir = tempfile.mkdtemp()
        return timestamp, temp_dir

    @staticmethod
    def create_zip(file_list, zip_path, password=None):
        """Write the given files and directories into a zip archive.

        Args:
            file_list: Paths to add. A directory is walked recursively and its
                files are stored relative to that directory; a plain file is
                stored under its base name.
            zip_path: Destination path of the archive (overwritten if present).
            password: Optional string registered as the archive's default
                password. NOTE: the standard-library ``zipfile`` module cannot
                create encrypted archives, so this does NOT protect the
                written content.
        """
        # Imported locally because the module-level imports do not include os.
        import os

        # allowZip64 enables the ZIP64 extensions for archives larger than 4 GiB.
        with zipfile.ZipFile(zip_path, "w", allowZip64=True) as zipf:
            if password:
                zipf.setpassword(bytes(password, 'utf-8'))
            for item in file_list:
                if os.path.isdir(item):
                    for root, _, files in os.walk(item):
                        for file in files:
                            file_path = os.path.join(root, file)
                            arcname = os.path.relpath(file_path, item)
                            zipf.write(file_path, arcname)
                else:
                    arcname = os.path.basename(item)
                    zipf.write(item, arcname)
src/app/parser.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import camelot
3
+ import polars as pl
4
+ import signal
5
+ import argparse
6
+ from rich.console import Console
7
+ from rich.progress import track
8
+
9
+ console = Console()
10
+
11
class PDFTableParser:
    """Extract tables from PDF files with Camelot and save them as CSV.

    Input and output paths are paired positionally: the tables found in
    ``input_files[i]`` are concatenated and written to ``output_files[i]``.
    """

    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
        """Store the file lists and extraction settings.

        Args:
            input_files: Paths of the PDF files to read.
            output_files: Paths of the CSV files to write, one per input.
            delimiter: Field separator used when writing CSV.
            edge_tol: Forwarded to ``camelot.read_pdf`` as ``edge_tol``.
            row_tol: Forwarded to ``camelot.read_pdf`` as ``row_tol``.
            pages: Forwarded to ``camelot.read_pdf`` as ``pages``.
        """
        self.input_files = input_files
        self.output_files = output_files
        self.delimiter = delimiter
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.pages = pages

    def read_tables(self, file_name):
        """Read tables from ``file_name`` using Camelot's ``stream`` flavor.

        Returns:
            The object returned by ``camelot.read_pdf``, or ``None`` if
            reading raised (the error is printed, not propagated).
        """
        try:
            console.print(f"Reading tables from {file_name}...")
            tables = camelot.read_pdf(
                file_name,
                flavor='stream',
                edge_tol=self.edge_tol,
                row_tol=self.row_tol,
                pages=self.pages,
            )
            console.print(f"Found {len(tables)} tables in {file_name}.")
            return tables
        except Exception as e:
            console.print(f"[red]Error reading {file_name}: {e}[/red]")
            return None

    def save_tables_as_csv(self, tables, output_file):
        """Concatenate all ``tables`` and write them to ``output_file``.

        Errors are printed rather than raised, so one failing file does not
        abort a batch run.
        """
        try:
            console.print(f"Saving tables to {output_file}...")
            df = pl.concat([pl.DataFrame(table.df) for table in tables])
            df.write_csv(output_file, separator=self.delimiter)
            console.print(f"Saved tables to {output_file}.")
        except Exception as e:
            console.print(f"[red]Error saving to {output_file}: {e}[/red]")

    def estimate_processing_time(self, file_name):
        """Print and return a rough time estimate (seconds) for ``file_name``.

        The estimate is a heuristic over the raw file bytes decoded as UTF-8
        (undecodable bytes ignored): ``(lines + words + chars) / 1000``.

        Returns:
            The estimate as a float, or ``0`` if the file could not be read.
        """
        try:
            with open(file_name, 'rb') as f:
                content = f.read().decode('utf-8', errors='ignore')
            # The newline count was previously stored under another name,
            # so the formula below always failed with a NameError.
            lines = content.count('\n')
            words = len(content.split())
            chars = len(content)
            estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
            console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
            return estimated_time
        except Exception as e:
            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
            return 0

    def process_files(self):
        """Process every (input, output) pair, showing a progress bar."""
        # Materialise the pairs so the progress bar knows the total count;
        # a bare zip() iterator has no length.
        pairs = list(zip(self.input_files, self.output_files))
        for input_file, output_file in track(pairs, description="Processing files"):
            self.estimate_processing_time(input_file)
            tables = self.read_tables(input_file)
            if tables:
                self.save_tables_as_csv(tables, output_file)
59
+
60
def handle_signal(signum, frame):
    """Signal handler: report the interruption and exit with status 1.

    ``signum`` and ``frame`` are supplied by the ``signal`` module and are
    not used.
    """
    console.print("\n[red]Process interrupted.[/red]")
    raise SystemExit(1)
63
+
64
def parse_cli_args(argv=None):
    """Parse the command line into a namespace with paired file lists.

    All paths are collected through a single positional argument and split in
    half: the first half becomes ``input_files``, the second ``output_files``.
    Declaring two consecutive ``nargs='+'`` positionals does not work for this,
    because argparse hands every path but the last to the first one.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` with ``input_files``,
        ``output_files``, ``delimiter``, ``edge_tol``, ``row_tol``, ``pages``
        and ``webui`` attributes.

    Exits with status 1 when the paths cannot be split into equal halves.
    """
    parser = argparse.ArgumentParser(description="PDF Table Parser")
    parser.add_argument(
        "files",
        nargs='+',
        metavar="FILE",
        help="Input PDF files followed by the same number of output CSV files",
    )
    parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
    parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
    parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
    parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
    parser.add_argument("--webui", action='store_true', help="Launch the web UI")

    args = parser.parse_args(argv)

    if len(args.files) % 2 != 0:
        console.print("[red]The number of input files and output files must match.[/red]")
        sys.exit(1)

    half = len(args.files) // 2
    args.input_files = args.files[:half]
    args.output_files = args.files[half:]
    return args


if __name__ == "__main__":
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    args = parse_cli_args()

    if args.webui:
        # WebUI is defined in run.py, which imports this module; importing it
        # lazily here avoids a circular import at module load time.
        from run import WebUI

        WebUI().run()
    else:
        PDFTableParser(
            args.input_files,
            args.output_files,
            args.delimiter,
            args.edge_tol,
            args.row_tol,
            args.pages,
        ).process_files()
src/app/run.py CHANGED
@@ -1,89 +1,12 @@
1
- import argparse
2
  import os
3
- import signal
4
- import sys
5
  import json
6
- import time
7
- import tempfile
8
- import zipfile
9
- from rich.console import Console
10
- from rich.progress import track
11
- import camelot
12
  import polars as pl
13
  import gradio as gr
14
  from gradio_pdf import PDF
15
 
16
- console = Console()
 
17
 
18
- class Interface:
19
- def get_tempdir():
20
- timestamp = int(time.time())
21
- temp_dir = tempfile.mkdtemp()
22
- return timestamp, temp_dir
23
-
24
- def create_zip(file_list, zip_path, password=None):
25
- with zipfile.ZipFile(zip_path, "w", zipfilep64=True) as zipf:
26
- if password:
27
- zipf.setpassword(bytes(password, 'utf-8'))
28
- for item in file_list:
29
- if os.path.isdir(item):
30
- for root, _, files in os.walk(item):
31
- for file in files:
32
- file_path = os.path.join(root, file)
33
- arcname = os.path.relpath(file_path, item)
34
- zipf.write(file_path, arcname)
35
- else:
36
- arcname = os.path.basename(item)
37
- zipf.write(item, arcname)
38
-
39
- class PDFTableParser:
40
- def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
41
- self.input_files = input_files
42
- self.output_files = output_files
43
- self.delimiter = delimiter
44
- self.edge_tol = edge_tol
45
- self.row_tol = row_tol
46
- self.pages = pages
47
-
48
- def read_tables(self, file_name):
49
- try:
50
- console.print(f"Reading tables from {file_name}...")
51
- tables = camelot.read_pdf(file_name, flavor='stream', edge_tol=self.edge_tol, row_tol=self.row_tol, pages=self.pages)
52
- console.print(f"Found {len(tables)} tables in {file_name}.")
53
- return tables
54
- except Exception as e:
55
- console.print(f"[red]Error reading {file_name}: {e}[/red]")
56
- return None
57
-
58
- def save_tables_as_csv(self, tables, output_file):
59
- try:
60
- console.print(f"Saving tables to {output_file}...")
61
- df = pl.concat([pl.DataFrame(table.df) for table in tables])
62
- df.write_csv(output_file, separator=self.delimiter)
63
- console.print(f"Saved tables to {output_file}.")
64
- except Exception as e:
65
- console.print(f"[red]Error saving to {output_file}: {e}[/red]")
66
-
67
- def estimate_processing_time(self, file_name):
68
- try:
69
- with open(file_name, 'rb') as f:
70
- content = f.read().decode('utf-8', errors='ignore')
71
- pages = content.count('\n')
72
- words = len(content.split())
73
- chars = len(content)
74
- estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
75
- console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
76
- return estimated_time
77
- except Exception as e:
78
- console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
79
- return 0
80
-
81
- def process_files(self):
82
- for input_file, output_file in track(zip(self.input_files, self.output_files), description="Processing files"):
83
- self.estimate_processing_time(input_file)
84
- tables = self.read_tables(input_file)
85
- if tables:
86
- self.save_tables_as_csv(tables, output_file)
87
 
88
  class WebUI:
89
  def __init__(self):
@@ -128,35 +51,10 @@ class WebUI:
128
 
129
  app.launch()
130
 
131
- def handle_signal(signum, frame):
132
- console.print("\n[red]Process interrupted.[/red]")
133
- sys.exit(1)
134
-
135
  def main(args):
136
  parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
137
  parser.process_files()
138
 
139
  if __name__ == "__main__":
140
- signal.signal(signal.SIGINT, handle_signal)
141
- signal.signal(signal.SIGTERM, handle_signal)
142
-
143
- parser = argparse.ArgumentParser(description="PDF Table Parser")
144
- parser.add_argument("input_files", nargs='+', help="List of input PDF files")
145
- parser.add_argument("output_files", nargs='+', help="List of output CSV files")
146
- parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
147
- parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
148
- parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
149
- parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
150
- parser.add_argument("--webui", action='store_true', help="Launch the web UI")
151
-
152
- args = parser.parse_args()
153
-
154
- if len(args.input_files) != len(args.output_files):
155
- console.print("[red]The number of input files and output files must match.[/red]")
156
- sys.exit(1)
157
-
158
- if args.webui:
159
- webui = WebUI()
160
- webui.run()
161
- else:
162
- main(args)
 
 
1
  import os
 
 
2
  import json
 
 
 
 
 
 
3
  import polars as pl
4
  import gradio as gr
5
  from gradio_pdf import PDF
6
 
7
+ from common import Interface
8
+ from parser import PDFTableParser
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  class WebUI:
12
  def __init__(self):
 
51
 
52
  app.launch()
53
 
 
 
 
 
54
  def main(args):
55
  parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
56
  parser.process_files()
57
 
58
  if __name__ == "__main__":
59
+ webui = WebUI()
60
+ webui.run()