Spaces:

zundom
/

camelot-pg

Sleeping

App Files Files Community

morisono committed on Jun 15

Commit

f92f684

•

1 Parent(s): b5b4335

Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

README.md +4 -23
src/app/__pycache__/common.cpython-310.pyc +0 -0
src/app/__pycache__/parser.cpython-310.pyc +0 -0
src/app/common.py +24 -0
src/app/parser.py +87 -0
src/app/run.py +4 -106

README.md CHANGED Viewed

@@ -32,7 +32,7 @@ This script extracts tables from PDF files and saves them as CSV files. It suppo
 To run the script via CLI, use the following command:
 ```bash
-python src/app/run.py input1.pdf input2.pdf output1.csv output2.csv
 ```
 #### Arguments:
@@ -52,7 +52,7 @@ python src/app/run.py input1.pdf input2.pdf output1.csv output2.csv
 To run the script with the web UI, use the following command:
 ```bash
-python src/app/run.py data/demo.pdf data/output.csv --webui
 ```
 This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
@@ -62,28 +62,9 @@ This will launch a Gradio-based web application where you can upload PDFs and vi
 ### CLI Example
 ```bash
-python src/app/run.py data/demo.pdf data/output.csv --delimiter ";" --edge_tol 60 --row_tol 40
 ```
-### Web UI Example
-```bash
-python src/app/run.py data/demo.pdf data/output.csv --webui
-```
-## Handling Interruptions
-The script handles `SIGINT` and `SIGTERM` signals gracefully, ensuring that processing can be interrupted safely.
 ## License
-This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
-## Acknowledgements
-This script uses the following libraries:
-- [Rich](https://github.com/willmcgugan/rich) for console output and progress bars
-- [Camelot](https://github.com/camelot-dev/camelot) for PDF table extraction
-- [Polars](https://github.com/pola-rs/polars) for efficient DataFrame operations
-- [Gradio](https://github.com/gradio-app/gradio) for the web UI
-- [gradio_pdf](https://github.com/gradio-app/gradio) for PDF handling in Gradio

 To run the script via CLI, use the following command:
 ```bash
+python src/app/parser.py input1.pdf input2.pdf output1.csv output2.csv
 ```
 #### Arguments:
 To run the script with the web UI, use the following command:
 ```bash
+python src/app/run.py
 ```
 This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
 ### CLI Example
 ```bash
+python src/app/parser.py data/demo.pdf data/output.csv --delimiter ";" --edge_tol 60 --row_tol 40
 ```
 ## License
+This project is licensed under the MIT License.

src/app/__pycache__/common.cpython-310.pyc ADDED Viewed

Binary file (1.1 kB). View file

src/app/__pycache__/parser.cpython-310.pyc ADDED Viewed

Binary file (4.1 kB). View file

src/app/common.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import os
+import tempfile
+import time
+import zipfile
+class Interface:
+    """Namespace of helpers for temporary directories and zip archives.
+    The helpers take no instance state, so they are declared as static
+    methods and can be called as ``Interface.get_tempdir()`` or on an instance.
+    """
+    @staticmethod
+    def get_tempdir():
+        """Create a fresh temporary directory.
+        Returns:
+            A ``(timestamp, temp_dir)`` tuple: the current Unix time truncated
+            to whole seconds and the path returned by ``tempfile.mkdtemp()``.
+            The caller is responsible for removing the directory.
+        """
+        timestamp = int(time.time())
+        temp_dir = tempfile.mkdtemp()
+        return timestamp, temp_dir
+    @staticmethod
+    def create_zip(file_list, zip_path, password=None):
+        """Write every path in ``file_list`` into a new archive at ``zip_path``.
+        Args:
+            file_list: Paths of files and/or directories to archive.
+            zip_path: Destination of the zip file; an existing file is overwritten.
+            password: Optional string registered via ``ZipFile.setpassword``.
+        Directories are walked recursively and their files stored relative to
+        that directory; plain files are stored under their base name only.
+        """
+        # ``allowZip64`` is the real keyword; the former ``zipfilep64`` raised TypeError.
+        with zipfile.ZipFile(zip_path, "w", allowZip64=True) as zipf:
+            if password:
+                # NOTE: the stdlib only uses this password when *reading*
+                # encrypted members; members written here are NOT encrypted.
+                zipf.setpassword(password.encode('utf-8'))
+            for item in file_list:
+                if os.path.isdir(item):
+                    for root, _, files in os.walk(item):
+                        for file in files:
+                            file_path = os.path.join(root, file)
+                            arcname = os.path.relpath(file_path, item)
+                            zipf.write(file_path, arcname)
+                else:
+                    arcname = os.path.basename(item)
+                    zipf.write(item, arcname)

src/app/parser.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import sys
+import camelot
+import polars as pl
+import signal
+import argparse
+from rich.console import Console
+from rich.progress import track
+console = Console()
+class PDFTableParser:
+    """Extract tables from PDF files with Camelot and write them out as CSV.
+    Each entry of ``input_files`` is paired positionally with the entry of
+    ``output_files`` at the same index.
+    """
+    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
+        """Store the file lists and the extraction/output settings.
+        Args:
+            input_files: Paths of the PDF files to read.
+            output_files: Paths of the CSV files to write, one per input.
+            delimiter: Separator passed to ``DataFrame.write_csv``.
+            edge_tol: ``edge_tol`` value forwarded to ``camelot.read_pdf``.
+            row_tol: ``row_tol`` value forwarded to ``camelot.read_pdf``.
+            pages: ``pages`` value forwarded to ``camelot.read_pdf``.
+        """
+        self.input_files = input_files
+        self.output_files = output_files
+        self.delimiter = delimiter
+        self.edge_tol = edge_tol
+        self.row_tol = row_tol
+        self.pages = pages
+    def read_tables(self, file_name):
+        """Run Camelot's stream parser on ``file_name``.
+        Returns:
+            The object returned by ``camelot.read_pdf``, or ``None`` when
+            reading fails (the error is printed, not raised).
+        """
+        try:
+            console.print(f"Reading tables from {file_name}...")
+            tables = camelot.read_pdf(file_name, flavor='stream', edge_tol=self.edge_tol, row_tol=self.row_tol, pages=self.pages)
+            console.print(f"Found {len(tables)} tables in {file_name}.")
+            return tables
+        except Exception as e:
+            # Best-effort batch processing: report and let the caller skip this file.
+            console.print(f"[red]Error reading {file_name}: {e}[/red]")
+            return None
+    def save_tables_as_csv(self, tables, output_file):
+        """Concatenate all ``tables`` and write them to ``output_file`` as one CSV.
+        Failures are printed to the console rather than raised.
+        """
+        try:
+            console.print(f"Saving tables to {output_file}...")
+            df = pl.concat([pl.DataFrame(table.df) for table in tables])
+            df.write_csv(output_file, separator=self.delimiter)
+            console.print(f"Saved tables to {output_file}.")
+        except Exception as e:
+            console.print(f"[red]Error saving to {output_file}: {e}[/red]")
+    def estimate_processing_time(self, file_name):
+        """Print and return a rough processing-time estimate for ``file_name``.
+        The estimate is a size heuristic only: the raw bytes are decoded as
+        UTF-8 (undecodable bytes dropped) and the newline, word and character
+        counts are each divided by 1000 and summed.
+        Returns:
+            The estimate in seconds, or ``0`` when the file cannot be read.
+        """
+        try:
+            with open(file_name, 'rb') as f:
+                content = f.read().decode('utf-8', errors='ignore')
+            # Previously bound to ``pages`` while the formula read ``lines``,
+            # so every call hit a NameError and reported 0.
+            lines = content.count('\n')
+            words = len(content.split())
+            chars = len(content)
+            estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
+            console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
+            return estimated_time
+        except Exception as e:
+            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
+            return 0
+    def process_files(self):
+        """Estimate, read and save every input/output pair in order.
+        Inputs whose tables cannot be read, or that yield no tables, are skipped.
+        """
+        # Materialise the pairs so ``track`` can infer a total for the progress bar;
+        # a bare ``zip`` has no length. ``zip`` still truncates to the shorter list.
+        file_pairs = list(zip(self.input_files, self.output_files))
+        for input_file, output_file in track(file_pairs, description="Processing files"):
+            self.estimate_processing_time(input_file)
+            tables = self.read_tables(input_file)
+            if tables:
+                self.save_tables_as_csv(tables, output_file)
+def handle_signal(signum, frame):
+    """Signal handler: announce the interruption on the console and exit with status 1."""
+    interrupted_notice = "\n[red]Process interrupted.[/red]"
+    console.print(interrupted_notice)
+    raise SystemExit(1)
+def _pair_files(input_files, output_files):
+    """Re-split the positional paths into equal input and output halves.
+    With two consecutive ``nargs='+'`` positionals argparse hands all but the
+    last path to the first one, so ``in1 in2 out1 out2`` would arrive as three
+    inputs and one output. Pooling the paths and cutting them in the middle
+    restores the documented "inputs first, then outputs" order.
+    Returns:
+        ``(inputs, outputs)`` lists of equal length, or ``None`` when the
+        total number of paths is odd and cannot be paired.
+    """
+    paths = list(input_files) + list(output_files)
+    if len(paths) % 2:
+        return None
+    half = len(paths) // 2
+    return paths[:half], paths[half:]
+def main(args):
+    """Build a ``PDFTableParser`` from the parsed CLI arguments and process all files."""
+    table_parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
+    table_parser.process_files()
+if __name__ == "__main__":
+    # Exit cleanly on Ctrl-C or a termination request.
+    signal.signal(signal.SIGINT, handle_signal)
+    signal.signal(signal.SIGTERM, handle_signal)
+    arg_parser = argparse.ArgumentParser(description="PDF Table Parser")
+    arg_parser.add_argument("input_files", nargs='+', help="List of input PDF files")
+    arg_parser.add_argument("output_files", nargs='+', help="List of output CSV files")
+    arg_parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
+    arg_parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
+    arg_parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
+    arg_parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
+    arg_parser.add_argument("--webui", action='store_true', help="Launch the web UI")
+    args = arg_parser.parse_args()
+    paired = _pair_files(args.input_files, args.output_files)
+    if paired is None:
+        console.print("[red]The number of input files and output files must match.[/red]")
+        sys.exit(1)
+    args.input_files, args.output_files = paired
+    if args.webui:
+        # WebUI is not defined in this module; import it lazily so plain CLI runs
+        # do not need Gradio. NOTE(review): assumes run.py sits next to this
+        # script and exposes WebUI - confirm.
+        from run import WebUI
+        webui = WebUI()
+        webui.run()
+    else:
+        main(args)

src/app/run.py CHANGED Viewed

@@ -1,89 +1,12 @@
-import argparse
 import os
-import signal
-import sys
 import json
-import time
-import tempfile
-import zipfile
-from rich.console import Console
-from rich.progress import track
-import camelot
 import polars as pl
 import gradio as gr
 from gradio_pdf import PDF
-console = Console()
-class Interface:
-    def get_tempdir():
-        timestamp = int(time.time())
-        temp_dir = tempfile.mkdtemp()
-        return timestamp, temp_dir
-    def create_zip(file_list, zip_path, password=None):
-        with zipfile.ZipFile(zip_path, "w", zipfilep64=True) as zipf:
-            if password:
-                zipf.setpassword(bytes(password, 'utf-8'))
-            for item in file_list:
-                if os.path.isdir(item):
-                    for root, _, files in os.walk(item):
-                        for file in files:
-                            file_path = os.path.join(root, file)
-                            arcname = os.path.relpath(file_path, item)
-                            zipf.write(file_path, arcname)
-                else:
-                    arcname = os.path.basename(item)
-                    zipf.write(item, arcname)
-class PDFTableParser:
-    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
-        self.input_files = input_files
-        self.output_files = output_files
-        self.delimiter = delimiter
-        self.edge_tol = edge_tol
-        self.row_tol = row_tol
-        self.pages = pages
-    def read_tables(self, file_name):
-        try:
-            console.print(f"Reading tables from {file_name}...")
-            tables = camelot.read_pdf(file_name, flavor='stream', edge_tol=self.edge_tol, row_tol=self.row_tol, pages=self.pages)
-            console.print(f"Found {len(tables)} tables in {file_name}.")
-            return tables
-        except Exception as e:
-            console.print(f"[red]Error reading {file_name}: {e}[/red]")
-            return None
-    def save_tables_as_csv(self, tables, output_file):
-        try:
-            console.print(f"Saving tables to {output_file}...")
-            df = pl.concat([pl.DataFrame(table.df) for table in tables])
-            df.write_csv(output_file, separator=self.delimiter)
-            console.print(f"Saved tables to {output_file}.")
-        except Exception as e:
-            console.print(f"[red]Error saving to {output_file}: {e}[/red]")
-    def estimate_processing_time(self, file_name):
-        try:
-            with open(file_name, 'rb') as f:
-                content = f.read().decode('utf-8', errors='ignore')
-            pages = content.count('\n')
-            words = len(content.split())
-            chars = len(content)
-            estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
-            console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
-            return estimated_time
-        except Exception as e:
-            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
-            return 0
-    def process_files(self):
-        for input_file, output_file in track(zip(self.input_files, self.output_files), description="Processing files"):
-            self.estimate_processing_time(input_file)
-            tables = self.read_tables(input_file)
-            if tables:
-                self.save_tables_as_csv(tables, output_file)
 class WebUI:
     def __init__(self):
@@ -128,35 +51,10 @@ class WebUI:
         app.launch()
-def handle_signal(signum, frame):
-    console.print("\n[red]Process interrupted.[/red]")
-    sys.exit(1)
 def main(args):
     parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
     parser.process_files()
 if __name__ == "__main__":
-    signal.signal(signal.SIGINT, handle_signal)
-    signal.signal(signal.SIGTERM, handle_signal)
-    parser = argparse.ArgumentParser(description="PDF Table Parser")
-    parser.add_argument("input_files", nargs='+', help="List of input PDF files")
-    parser.add_argument("output_files", nargs='+', help="List of output CSV files")
-    parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
-    parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
-    parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
-    parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
-    parser.add_argument("--webui", action='store_true', help="Launch the web UI")
-    args = parser.parse_args()
-    if len(args.input_files) != len(args.output_files):
-        console.print("[red]The number of input files and output files must match.[/red]")
-        sys.exit(1)
-    if args.webui:
-        webui = WebUI()
-        webui.run()
-    else:
-        main(args)

 import os
 import json
 import polars as pl
 import gradio as gr
 from gradio_pdf import PDF
+from common import Interface
+from parser import PDFTableParser
 class WebUI:
     def __init__(self):
         app.launch()
 def main(args):
     parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
     parser.process_files()
 if __name__ == "__main__":
+    webui = WebUI()
+    webui.run()