improve formatting
Browse files
Signed-off-by: peter szemraj <[email protected]>
- pdf2text.py +8 -7
pdf2text.py
CHANGED
@@ -556,7 +556,7 @@ def postprocess(text: str) -> str:
|
|
556 |
return eval_and_replace(proc)
|
557 |
|
558 |
|
559 |
-
def result2text(result) -> str:
|
560 |
"""Convert OCR result to text"""
|
561 |
|
562 |
full_doc = []
|
@@ -570,8 +570,7 @@ def result2text(result) -> str:
|
|
570 |
text += word.value + " "
|
571 |
full_doc.append(text)
|
572 |
|
573 |
-
|
574 |
-
return full_text
|
575 |
|
576 |
|
577 |
import warnings
|
@@ -603,8 +602,10 @@ def convert_PDF_to_Text(
|
|
603 |
logging.info(f"running OCR on {len(doc)} pages")
|
604 |
result = ocr_model(doc)
|
605 |
raw_text = result2text(result)
|
606 |
-
proc_text = format_ocr_out(raw_text
|
607 |
-
|
|
|
|
|
608 |
|
609 |
fn_rt = time.perf_counter() - st
|
610 |
|
@@ -614,8 +615,8 @@ def convert_PDF_to_Text(
|
|
614 |
"num_pages": len(doc),
|
615 |
"runtime": round(fn_rt, 2),
|
616 |
"date": str(date.today()),
|
617 |
-
"converted_text":
|
618 |
-
"length": len(
|
619 |
}
|
620 |
|
621 |
return results_dict
|
|
|
556 |
return eval_and_replace(proc)
|
557 |
|
558 |
|
559 |
+
def result2text(result, as_text=False) -> str or list:
|
560 |
"""Convert OCR result to text"""
|
561 |
|
562 |
full_doc = []
|
|
|
570 |
text += word.value + " "
|
571 |
full_doc.append(text)
|
572 |
|
573 |
+
return "\n".join(full_doc) if as_text else full_doc
|
|
|
574 |
|
575 |
|
576 |
import warnings
|
|
|
602 |
logging.info(f"running OCR on {len(doc)} pages")
|
603 |
result = ocr_model(doc)
|
604 |
raw_text = result2text(result)
|
605 |
+
proc_text = [format_ocr_out(r) for r in raw_text]
|
606 |
+
fin_text = [postprocess(t) for t in proc_text]
|
607 |
+
|
608 |
+
ocr_results = "\n\n".join(fin_text)
|
609 |
|
610 |
fn_rt = time.perf_counter() - st
|
611 |
|
|
|
615 |
"num_pages": len(doc),
|
616 |
"runtime": round(fn_rt, 2),
|
617 |
"date": str(date.today()),
|
618 |
+
"converted_text": ocr_results,
|
619 |
+
"length": len(ocr_results),
|
620 |
}
|
621 |
|
622 |
return results_dict
|