Application / pdf2txt_test.py
FrankWu's picture
Upload 5 files
e2dccf7 verified
# -*- coding: utf-8 -*-
'''
Created by Shengbo.Zhang on 2021/08/13
'''
import os
import sys
import time
##################################################
########## Algorithm: PDF2TXT_V3.py ##############
########## Test example             ##############
##################################################
from Pdf2Txt.pdf2txt_v1 import find_all_local_file
from Pdf2Txt.pdf2txt_v3 import *
# Interactive batch driver: repeatedly ask for a directory, convert every
# '.pdf' file that find_all_local_file() yields for it into a sibling
# '<name>_P2T.txt' file, and print per-file and overall statistics.
# Typing 'exit' at the prompt terminates the program.
while True:
    # Counters are reset for every directory the user enters.
    count_total = 0
    count_success = 0
    count_failed = 0

    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
    if test_file_dir == 'exit':
        sys.exit()

    print('*****************************************************')
    t1 = time.time()
    for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
        count_total += 1

        pdf_file_path = path
        pdf_dir_path = os.path.dirname(path)
        # splitext drops the real extension instead of blindly cutting the
        # last four characters.
        pdf_file_name = os.path.splitext(os.path.basename(path))[0]
        # os.path.join avoids the doubled separator and, when the directory
        # part is empty, keeps the output next to the PDF instead of
        # producing a path that starts at the filesystem root.
        output_txt_file_path = os.path.join(pdf_dir_path, f"{pdf_file_name}_P2T.txt")

        print(f'开始处理: 第 {idx + 1} 个文件...')
        print(f'文件名: {pdf_file_name}.pdf')
        tt1 = time.time()
        try:
            txt_string = get_txt_from_pdf(pdf_file_path)
            # An empty result counts as a failed conversion; the truthiness
            # test also treats None the same way (assumption: the converter
            # returns a str -- TODO confirm).
            if txt_string:
                output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
                count_success += 1
                print('处理成功.')
            else:
                count_failed += 1
                print('处理失败!')
        except Exception as e:
            # Deliberately broad: one bad file must not abort the batch.
            print(e)
            count_failed += 1
            print('处理失败!')
        tt2 = time.time()
        print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
        print('*****************************************************')

    t2 = time.time()
    print('\n所有PDF格式的公告文件已处理完毕!')
    print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
    print('执行耗时:', round(t2-t1, 3), '秒')
    # No files found means there is no average to report; skipping the line
    # avoids a ZeroDivisionError that would otherwise end the prompt loop.
    if count_total:
        print('平均耗时:', round((t2-t1)/count_total, 3), '秒/个')
# ##################################################
# ########## Algorithm: PDF2TXT_V2.py ##############
# ########## Test example (commented out) ##########
# ##################################################
# from Pdf2Txt.pdf2txt_v1 import find_all_local_file
# from Pdf2Txt.pdf2txt_v2 import *
# while True:
# count_total = 0
# count_success = 0
# count_failed = 0
#
# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
# if test_file_dir == 'exit':
# sys.exit()
#
# print('*****************************************************')
# t1 = time.time()
# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
# count_total += 1
#
# pdf_file_path = path
# pdf_dir_path = os.path.dirname(path)
# pdf_file_name = os.path.basename(path)[:-4]
# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
#
# print(f'开始处理: 第 {idx + 1} 个文件...')
# print(f'文件名: {pdf_file_name}.pdf')
# tt1 = time.time()
# try:
# txt_string = get_txt_from_pdf(pdf_file_path)
# if txt_string != '':
# output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
# count_success += 1
# print('处理成功.')
# else:
# count_failed += 1
# print('处理失败!')
# except Exception as e:
# print(e)
# count_failed += 1
# print('处理失败!')
# tt2 = time.time()
# print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
#
# print('*****************************************************')
#
# t2 = time.time()
# print('\n所有PDF格式的公告文件已处理完毕!')
# print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
# print('执行耗时:', round(t2-t1, 3), '秒')
# print('平均耗时:', round((t2-t1)/count_total, 3), '秒/个')
# ##################################################
# ########## Algorithm: PDF2TXT_V1.py ##############
# ########## Test example (commented out) ##########
# ##################################################
# from Pdf2Txt.pdf2txt_v1 import *
# while True:
# count_total = 0
# count_success = 0
# count_failed = 0
#
# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
# if test_file_dir == 'exit':
# sys.exit()
# txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符 2. 不带段头段尾标识符(默认,按enter键) ')
# if txt_output_mode == '1':
# txt_output_mode = True
# else:
# txt_output_mode = False
#
# print('*****************************************************')
# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
# count_total += 1
#
# pdf_file_path = path
# pdf_dir_path = os.path.dirname(path)
# pdf_file_name = os.path.basename(pdf_file_path)[:-4]
# output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx"
# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
# output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv"
#
# t1 = time.time()
# is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path)
# t2 = time.time()
# print(f'开始处理: 第 {idx + 1} 个文件...')
# print(f'文件名: {pdf_file_name}.pdf')
# print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验!')
# print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms')
#
# if not is_success:
#
# count_failed += 1
# print(f'文件: {pdf_file_path}')
# print('错误: 原始pdf与生成的docx文件页数校验失败,拒绝进行下一步转换.')
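# # Verification fails because pdf2docx currently cannot handle a small number of PDF files with special layouts (pending an update from its author);
# # if verification fails, consider either discarding that announcement or converting it directly with _get_txt_from_pdf().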
#
# else:
#
# document = Document(output_docx_file_path)
#
# is_success, txt_list = get_txt_from_docx(doc=document)
# t3 = time.time()
# print('步骤-2: 公告docx文件的段落提取与格式化已完成!')
# print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms')
#
# if not is_success:
# count_failed += 1
# print(f'文件: {pdf_file_path}')
# print('错误: 原始docx转换为txt文本的过程中出错,拒绝进行下一步转换.')
# else:
# txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path,
# is_out_flag=False)
# t4 = time.time()
# print('步骤-3: 公告docx文件的表格提取与格式化已完成!')
# print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms')
#
# txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list)
# t5 = time.time()
# print('步骤-4: 公告txt文件的校对已完成!')
# print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms')
#
# write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode)
# str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode)
# t6 = time.time()
# print('步骤-5: 公告txt文件的输出已完成!')
# print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms')
#
# print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms')
# count_success += 1
#
# if os.path.exists(output_docx_file_path):
# os.remove(output_docx_file_path)
# if os.path.exists(output_csv_file_path):
# os.remove(output_csv_file_path)
# print('*****************************************************')
#
# print('\n所有PDF格式的公告文件已处理完毕!')
# print(f'【文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}】')