# -*- coding: utf-8 -*- | |
''' | |
Created by Shengbo.Zhang on 2021/08/13 | |
''' | |
import os
import sys
import time
################################################## | |
############## 算法:PDF2TXT_V3.py ################ | |
############## 测试示例 ################ | |
################################################## | |
from Pdf2Txt.pdf2txt_v1 import find_all_local_file | |
from Pdf2Txt.pdf2txt_v3 import * | |
# Interactive batch driver for the PDF2TXT_V3 converter.
#
# Each pass of the outer loop asks for a directory, converts every PDF that
# find_all_local_file() yields for it into a "<name>_P2T.txt" file placed next
# to the source PDF, and prints per-file timings plus an overall summary.
# Typing "exit" at the prompt terminates the program.
while True:
    count_total = 0
    count_success = 0
    count_failed = 0

    # Strip surrounding whitespace so a pasted path (or "exit ") is still
    # recognised.
    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ').strip()
    if test_file_dir == 'exit':
        sys.exit()

    print('*****************************************************')
    t1 = time.time()
    for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf'), start=1):
        count_total += 1

        pdf_file_path = path
        pdf_dir_path = os.path.dirname(path)
        # splitext removes the extension without assuming it is exactly four
        # characters long.
        pdf_file_name = os.path.splitext(os.path.basename(path))[0]
        # os.path.join avoids producing a root-anchored "//name" path when the
        # directory component is empty.
        output_txt_file_path = os.path.join(pdf_dir_path, f"{pdf_file_name}_P2T.txt")

        print(f'开始处理: 第 {idx} 个文件...')
        print(f'文件名: {pdf_file_name}.pdf')
        tt1 = time.time()
        try:
            txt_string = get_txt_from_pdf(pdf_file_path)
            if txt_string != '':
                output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
                count_success += 1
                print('处理成功.')
            else:
                # An empty result is counted as a failed conversion.
                count_failed += 1
                print('处理失败!')
        except Exception as e:
            # Top-level boundary: report the error and continue with the next
            # file instead of aborting the whole batch.
            print(e)
            count_failed += 1
            print('处理失败!')
        tt2 = time.time()
        print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')

        print('*****************************************************')

    t2 = time.time()
    print('\n所有PDF格式的公告文件已处理完毕!')
    print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
    print('执行耗时:', round(t2 - t1, 3), '秒')
    # Skip the average when no PDF was found; dividing by zero here used to
    # crash the interactive loop.
    if count_total:
        print('平均耗时:', round((t2 - t1) / count_total, 3), '秒/个')
# ################################################## | |
# ############## 算法:PDF2TXT_V2.py ################ | |
# ############## 测试示例 ################ | |
# ################################################## | |
# from Pdf2Txt.pdf2txt_v1 import find_all_local_file | |
# from Pdf2Txt.pdf2txt_v2 import * | |
# while True: | |
# count_total = 0 | |
# count_success = 0 | |
# count_failed = 0 | |
# | |
# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ') | |
# if test_file_dir == 'exit': | |
# sys.exit() | |
# | |
# print('*****************************************************') | |
# t1 = time.time() | |
# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')): | |
# count_total += 1 | |
# | |
# pdf_file_path = path | |
# pdf_dir_path = os.path.dirname(path) | |
# pdf_file_name = os.path.basename(path)[:-4] | |
# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt" | |
# | |
# print(f'开始处理: 第 {idx + 1} 个文件...') | |
# print(f'文件名: {pdf_file_name}.pdf') | |
# tt1 = time.time() | |
# try: | |
# txt_string = get_txt_from_pdf(pdf_file_path) | |
# if txt_string != '': | |
# output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string) | |
# count_success += 1 | |
# print('处理成功.') | |
# else: | |
# count_failed += 1 | |
# print('处理失败!') | |
# except Exception as e: | |
# print(e) | |
# count_failed += 1 | |
# print('处理失败!') | |
# tt2 = time.time() | |
# print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒') | |
# | |
# print('*****************************************************') | |
# | |
# t2 = time.time() | |
# print('\n所有PDF格式的公告文件已处理完毕!') | |
# print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}') | |
# print('执行耗时:', round(t2-t1, 3), '秒') | |
# print('平均耗时:', round((t2-t1)/count_total, 3), '秒/个') | |
# ################################################## | |
# ############## 算法:PDF2TXT_V1.py ################ | |
# ############## 测试示例 ################ | |
# ################################################## | |
# from Pdf2Txt.pdf2txt_v1 import * | |
# while True: | |
# count_total = 0 | |
# count_success = 0 | |
# count_failed = 0 | |
# | |
# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ') | |
# if test_file_dir == 'exit': | |
# sys.exit() | |
# txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符 2. 不带段头段尾标识符(默认,按enter键) ') | |
# if txt_output_mode == '1': | |
# txt_output_mode = True | |
# else: | |
# txt_output_mode = False | |
# | |
# print('*****************************************************') | |
# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')): | |
# count_total += 1 | |
# | |
# pdf_file_path = path | |
# pdf_dir_path = os.path.dirname(path) | |
# pdf_file_name = os.path.basename(pdf_file_path)[:-4] | |
# output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx" | |
# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt" | |
# output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv" | |
# | |
# t1 = time.time() | |
# is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path) | |
# t2 = time.time() | |
# print(f'开始处理: 第 {idx + 1} 个文件...') | |
# print(f'文件名: {pdf_file_name}.pdf') | |
# print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验!') | |
# print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms') | |
# | |
# if not is_success: | |
# | |
# count_failed += 1 | |
# print(f'文件: {pdf_file_path}') | |
# print('错误: 原始pdf与生成的docx文件页数校验失败,拒绝进行下一步转换.') | |
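# # Verification fails because pdf2docx currently cannot handle a small number of PDF files with special layouts; this awaits an update from its author.
# # If verification fails, consider discarding this announcement, or converting it directly with the _get_txt_from_pdf() function.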
# | |
# else: | |
# | |
# document = Document(output_docx_file_path) | |
# | |
# is_success, txt_list = get_txt_from_docx(doc=document) | |
# t3 = time.time() | |
# print('步骤-2: 公告docx文件的段落提取与格式化已完成!') | |
# print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms') | |
# | |
# if not is_success: | |
# count_failed += 1 | |
# print(f'文件: {pdf_file_path}') | |
# print('错误: 原始docx转换为txt文本的过程中出错,拒绝进行下一步转换.') | |
# else: | |
# txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path, | |
# is_out_flag=False) | |
# t4 = time.time() | |
# print('步骤-3: 公告docx文件的表格提取与格式化已完成!') | |
# print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms') | |
# | |
# txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list) | |
# t5 = time.time() | |
# print('步骤-4: 公告txt文件的校对已完成!') | |
# print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms') | |
# | |
# write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode) | |
# str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode) | |
# t6 = time.time() | |
# print('步骤-5: 公告txt文件的输出已完成!') | |
# print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms') | |
# | |
# print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms') | |
# count_success += 1 | |
# | |
# if os.path.exists(output_docx_file_path): | |
# os.remove(output_docx_file_path) | |
# if os.path.exists(output_csv_file_path): | |
# os.remove(output_csv_file_path) | |
# print('*****************************************************') | |
# | |
# print('\n所有PDF格式的公告文件已处理完毕!') | |
# print(f'【文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}】') |