Upload 5 files
Browse files- config.py +64 -0
- pdf2txt_test.py +207 -0
- pdf2txt_v1.py +630 -0
- pdf2txt_v2.py +399 -0
- pdf2txt_v3.py +555 -0
config.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
"""
|
4 |
+
Created by Shengbo.Zhang on 2021/10/12
|
5 |
+
"""
|
6 |
+
|
7 |
+
# 定义正文的各段落之间的分隔符
|
8 |
+
SEGMENT_SYMBOL = '\n'
|
9 |
+
|
10 |
+
# 定义表格之间的分隔符(无需添加换行符'\n')
|
11 |
+
TABLE_SYMBOL = '-----表格-----'
|
12 |
+
|
13 |
+
# 定义表格中单元格之间的分隔符
|
14 |
+
TABLE_CELL_SYMBOL = '\t'
|
15 |
+
|
16 |
+
# 定义临时生成的Docx文件的命名后缀
|
17 |
+
TEMP_DOCX_SUFFIX = 'TEMP_DOCX'
|
18 |
+
|
19 |
+
# 筛选可处理的公告文件标题特点
|
20 |
+
ANNOUNCEMENT_TITLE_FEATURE = [['公告', -2],
                              ['通知', -2],
                              ['说明', -2],
                              ['意见', -2],
                              ['预告', -2],
                              ['快报', -2],
                              ['摘要', -2],
                              ['意见函', -3],
                              ['回复函', -3],
                              ['意见书', -3]]


def _check_ann_title_processable(title, exp=0):
    '''
    Tell whether an announcement title ends with one of the keywords listed
    in ANNOUNCEMENT_TITLE_FEATURE.

    Each table entry is [keyword, offset]; the offset is the negative slice
    start used to cut the candidate suffix out of the title.

    :param title: title string to inspect
    :param exp: matching mode
                0 - the keyword must be the very end of the title
                1 - the keyword must be followed by exactly one trailing '\\n'
                2 - the keyword must be followed by exactly two trailing characters
                any other value never matches
    :return: bool
    '''
    if exp == 0:
        def _hit(keyword, offset):
            return title[offset:] == keyword
    elif exp == 1:
        def _hit(keyword, offset):
            return title[offset - 1:] == keyword + '\n'
    elif exp == 2:
        def _hit(keyword, offset):
            return title[-2 + offset:-2] == keyword
    else:
        return False
    return any(_hit(keyword, offset) for keyword, offset in ANNOUNCEMENT_TITLE_FEATURE)
|
44 |
+
|
45 |
+
# 一级专用名词语料库
|
46 |
+
FIRST_PROPER_CORPUS = ['被担保人名称:', '本次担保金额及累计为其担保金额:', '本次是否有反担保:', '对外担保逾期的累计数量:',
|
47 |
+
'企业名称:', '注册资本:', '经营范围:', '法定代表人:', '注册地址:', '财务状况(以下数据未经审计):',
|
48 |
+
'担保方式:', '担保期限:', '担保金额:', '担保额度:',
|
49 |
+
'主体要求:', '成立年限要求:', '客户类型要求:', '商业信用要求:', '反担保要求:', '资金安全性要求:',
|
50 |
+
'住所:', '成立日期:', '统一社会信用代码:', '甲方:', '乙方:', '甲方承诺:', '乙方承诺:', '理由:',
|
51 |
+
'本次会议是否有否决议案:', '审议结果:',
|
52 |
+
'律师事务所:', '律师:', '结论意见:',
|
53 |
+
'股东大会召开日期:', '网络投票系统:', '股东大会类型和届次', '股东大会类型和届次:', '股东大会召集人:',
|
54 |
+
'投票方式:', '召开的日期时间:', '召开地点:', '召开日期:', '起止时间:',
|
55 |
+
'各议案已披露的时间和披露媒体:', '特别决议议案:', '对中小投资者单独计票的议案:', '涉及关联股东回避表决的议案:',
|
56 |
+
'应回避表决的关联股东名称:', '涉及优先股股东参与表决的议案:', '登记地点:', '登记时间:',
|
57 |
+
'联系人:', '联系电话:', '传真:', '地址:', '邮编:',
|
58 |
+
'案件所属的诉讼阶段:', '上市公司子公司所处的当事人地位:', '涉案的金额:', '是否会对上市公司损益产生负面影响:',
|
59 |
+
'原告:', '被告:', '住所地:', '诉讼机构名称:', '上市公司控股子公司所处的当事人地位:',
|
60 |
+
'归属于上市公司股东的净利润:', '归属于上市公司股东的扣除非经常性损益的净利润:', '每股收益:'] + \
|
61 |
+
[f"甲方{i}:" for i in '一二三四五六七八九十'] + [f"乙方{i}:" for i in '一二三四五六七八九十']
|
62 |
+
|
63 |
+
# 二级专用名词语料库(指出现在一级专用名词所在段落中的名词,不应单独成段落)
|
64 |
+
SECOND_PROPER_CORPUS = ['许可经营项目:', '一般经营项目:']
|
pdf2txt_test.py
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
'''
|
4 |
+
Created by Shengbo.Zhang on 2021/08/13
|
5 |
+
'''
|
6 |
+
|
7 |
+
|
8 |
+
import os
import sys
import time


##################################################
##############  Algorithm: PDF2TXT_V3.py  ########
##############        Test example        ########
##################################################
from Pdf2Txt.pdf2txt_v1 import find_all_local_file
from Pdf2Txt.pdf2txt_v3 import *

# Interactive driver: repeatedly ask for a directory, convert every PDF found
# in it to "<name>_P2T.txt" next to the source file, and report statistics.
while True:
    count_total = 0
    count_success = 0
    count_failed = 0

    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
    if test_file_dir == 'exit':
        sys.exit()

    print('*****************************************************')
    t1 = time.time()
    for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
        count_total += 1

        pdf_file_path = path
        pdf_dir_path = os.path.dirname(path)
        # splitext instead of [:-4]: does not assume a 4-character extension.
        pdf_file_name = os.path.splitext(os.path.basename(path))[0]
        # os.path.join instead of "dir//name": an empty dirname must not turn
        # the output into a root-anchored path.
        output_txt_file_path = os.path.join(pdf_dir_path, f"{pdf_file_name}_P2T.txt")

        print(f'开始处理: 第 {idx + 1} 个文件...')
        print(f'文件名: {pdf_file_name}.pdf')
        tt1 = time.time()
        try:
            txt_string = get_txt_from_pdf(pdf_file_path)
            if txt_string != '':
                output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
                count_success += 1
                print('处理成功.')
            else:
                # An empty result is counted as a failed conversion.
                count_failed += 1
                print('处理失败!')
        except Exception as e:
            # Top-level boundary of the batch: report and move on to the next file.
            print(e)
            count_failed += 1
            print('处理失败!')
        tt2 = time.time()
        print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')

        print('*****************************************************')

    t2 = time.time()
    print('\n所有PDF格式的公告文件已处理完毕!')
    print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
    print('执行耗时:', round(t2-t1, 3), '秒')
    # No PDF found -> no average to report (previously a ZeroDivisionError).
    if count_total > 0:
        print('平均耗时:', round((t2-t1)/count_total, 3), '秒/个')
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
# ##################################################
|
67 |
+
# ############## 算法:PDF2TXT_V2.py ################
|
68 |
+
# ############## 测试示例 ################
|
69 |
+
# ##################################################
|
70 |
+
# from Pdf2Txt.pdf2txt_v1 import find_all_local_file
|
71 |
+
# from Pdf2Txt.pdf2txt_v2 import *
|
72 |
+
# while True:
|
73 |
+
# count_total = 0
|
74 |
+
# count_success = 0
|
75 |
+
# count_failed = 0
|
76 |
+
#
|
77 |
+
# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
|
78 |
+
# if test_file_dir == 'exit':
|
79 |
+
# sys.exit()
|
80 |
+
#
|
81 |
+
# print('*****************************************************')
|
82 |
+
# t1 = time.time()
|
83 |
+
# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
|
84 |
+
# count_total += 1
|
85 |
+
#
|
86 |
+
# pdf_file_path = path
|
87 |
+
# pdf_dir_path = os.path.dirname(path)
|
88 |
+
# pdf_file_name = os.path.basename(path)[:-4]
|
89 |
+
# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
|
90 |
+
#
|
91 |
+
# print(f'开始处理: 第 {idx + 1} 个文件...')
|
92 |
+
# print(f'文件名: {pdf_file_name}.pdf')
|
93 |
+
# tt1 = time.time()
|
94 |
+
# try:
|
95 |
+
# txt_string = get_txt_from_pdf(pdf_file_path)
|
96 |
+
# if txt_string != '':
|
97 |
+
# output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
|
98 |
+
# count_success += 1
|
99 |
+
# print('处理成功.')
|
100 |
+
# else:
|
101 |
+
# count_failed += 1
|
102 |
+
# print('处理失败!')
|
103 |
+
# except Exception as e:
|
104 |
+
# print(e)
|
105 |
+
# count_failed += 1
|
106 |
+
# print('处理失败!')
|
107 |
+
# tt2 = time.time()
|
108 |
+
# print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
|
109 |
+
#
|
110 |
+
# print('*****************************************************')
|
111 |
+
#
|
112 |
+
# t2 = time.time()
|
113 |
+
# print('\n所有PDF格式的公告文件已处理完毕!')
|
114 |
+
# print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
|
115 |
+
# print('执行耗时:', round(t2-t1, 3), '秒')
|
116 |
+
# print('平均耗时:', round((t2-t1)/count_total, 3), '秒/个')
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
# ##################################################
|
121 |
+
# ############## 算法:PDF2TXT_V1.py ################
|
122 |
+
# ############## 测试示例 ################
|
123 |
+
# ##################################################
|
124 |
+
# from Pdf2Txt.pdf2txt_v1 import *
|
125 |
+
# while True:
|
126 |
+
# count_total = 0
|
127 |
+
# count_success = 0
|
128 |
+
# count_failed = 0
|
129 |
+
#
|
130 |
+
# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
|
131 |
+
# if test_file_dir == 'exit':
|
132 |
+
# sys.exit()
|
133 |
+
# txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符 2. 不带段头段尾标识符(默认,按enter键) ')
|
134 |
+
# if txt_output_mode == '1':
|
135 |
+
# txt_output_mode = True
|
136 |
+
# else:
|
137 |
+
# txt_output_mode = False
|
138 |
+
#
|
139 |
+
# print('*****************************************************')
|
140 |
+
# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
|
141 |
+
# count_total += 1
|
142 |
+
#
|
143 |
+
# pdf_file_path = path
|
144 |
+
# pdf_dir_path = os.path.dirname(path)
|
145 |
+
# pdf_file_name = os.path.basename(pdf_file_path)[:-4]
|
146 |
+
# output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx"
|
147 |
+
# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
|
148 |
+
# output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv"
|
149 |
+
#
|
150 |
+
# t1 = time.time()
|
151 |
+
# is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path)
|
152 |
+
# t2 = time.time()
|
153 |
+
# print(f'开始处理: 第 {idx + 1} 个文件...')
|
154 |
+
# print(f'文件名: {pdf_file_name}.pdf')
|
155 |
+
# print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验!')
|
156 |
+
# print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms')
|
157 |
+
#
|
158 |
+
# if not is_success:
|
159 |
+
#
|
160 |
+
# count_failed += 1
|
161 |
+
# print(f'文件: {pdf_file_path}')
|
162 |
+
# print('错误: 原始pdf与生成的docx文件页数校验失败,拒绝进行下一步转换.')
|
163 |
+
# # 校验失败的原因在于pdf2docx有暂无法处理少量包含特殊layout的pdf文件,待原作者更新;
|
164 |
+
# # 若发生校验失败,后续可考虑直接丢弃该公告数据,或使用_get_txt_from_pdf()函数作直接转换。
|
165 |
+
#
|
166 |
+
# else:
|
167 |
+
#
|
168 |
+
# document = Document(output_docx_file_path)
|
169 |
+
#
|
170 |
+
# is_success, txt_list = get_txt_from_docx(doc=document)
|
171 |
+
# t3 = time.time()
|
172 |
+
# print('步骤-2: 公告docx文件的段落提取与格式化已完成!')
|
173 |
+
# print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms')
|
174 |
+
#
|
175 |
+
# if not is_success:
|
176 |
+
# count_failed += 1
|
177 |
+
# print(f'文件: {pdf_file_path}')
|
178 |
+
# print('错误: 原始docx转换为txt文本的过程中出错,拒绝进行下一步转换.')
|
179 |
+
# else:
|
180 |
+
# txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path,
|
181 |
+
# is_out_flag=False)
|
182 |
+
# t4 = time.time()
|
183 |
+
# print('步骤-3: 公告docx文件的表格提取与格式化已完成!')
|
184 |
+
# print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms')
|
185 |
+
#
|
186 |
+
# txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list)
|
187 |
+
# t5 = time.time()
|
188 |
+
# print('步骤-4: 公告txt文件的校对已完成!')
|
189 |
+
# print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms')
|
190 |
+
#
|
191 |
+
# write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode)
|
192 |
+
# str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode)
|
193 |
+
# t6 = time.time()
|
194 |
+
# print('步骤-5: 公告txt文件的输出已完成!')
|
195 |
+
# print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms')
|
196 |
+
#
|
197 |
+
# print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms')
|
198 |
+
# count_success += 1
|
199 |
+
#
|
200 |
+
# if os.path.exists(output_docx_file_path):
|
201 |
+
# os.remove(output_docx_file_path)
|
202 |
+
# if os.path.exists(output_csv_file_path):
|
203 |
+
# os.remove(output_csv_file_path)
|
204 |
+
# print('*****************************************************')
|
205 |
+
#
|
206 |
+
# print('\n所有PDF格式的公告文件已处理完毕!')
|
207 |
+
# print(f'【文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}】')
|
pdf2txt_v1.py
ADDED
@@ -0,0 +1,630 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
"""
|
4 |
+
Created by Shengbo.Zhang on 2021/08/13
|
5 |
+
"""
|
6 |
+
|
7 |
+
|
8 |
+
import io
|
9 |
+
import re
|
10 |
+
import os
|
11 |
+
import csv
|
12 |
+
import logging
|
13 |
+
from docx import Document
|
14 |
+
from pdf2docx import Converter
|
15 |
+
from Pdf2Txt.config import *
|
16 |
+
from pdfminer.layout import LAParams
|
17 |
+
from pdfminer.pdfpage import PDFPage
|
18 |
+
from pdfminer.converter import TextConverter
|
19 |
+
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
20 |
+
from Pdf2Txt.config import _check_ann_title_processable
|
21 |
+
|
22 |
+
|
23 |
+
# 关闭pdf2docx模块中Converter的日志输出
|
24 |
+
logging.disable(logging.INFO)
|
25 |
+
logging.disable(logging.WARNING)
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
def _get_txt_from_pdf(pdf_path, out_path):
    '''
    Read a PDF file and convert it straight to a UTF-8 text file.

    All pages are rendered into one in-memory buffer which is read once after
    the last page, so no page text is written twice. Runs of newlines are
    collapsed to one, a newline followed by spaces is removed, and the result
    is written to out_path.

    :param pdf_path: full path of the input PDF announcement file
    :param out_path: full path of the output txt file
    :return: bool (always True; errors propagate as exceptions)
    '''
    manager = PDFResourceManager()
    output = io.StringIO()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(pdf_path, 'rb') as infile:
            for page in PDFPage.get_pages(infile, check_extractable=True):
                interpreter.process_page(page)
        # getvalue() returns everything written so far, so read it only once.
        text = output.getvalue()
    finally:
        # Release the converter and the buffer even if a page fails to parse.
        converter.close()
        output.close()

    text = re.sub('\n+', '\n', text)
    text = re.sub('\n +', '', text)
    # NOTE(review): the character removed here was lost in the visible source
    # (it read replace('', ''), a no-op). Form feed is assumed because
    # pdfminer's TextConverter writes one per page -- confirm.
    text = text.replace('\x0c', '')

    with open(out_path, 'wb') as f:
        f.write(text.encode('utf-8'))
    return True
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
def _get_cleaned_txt(txtPath, out_path):
    '''
    Clean the content of a txt file (currently for testing only).

    Keeps only the text found between '##' markers plus CJK characters
    (U+4E00-U+9FFF), '+', the ideographic full stop and the full-width comma,
    then drops every run of two or more characters from the set
    {full-width comma, '|', ideographic full stop, '+'}.

    :param txtPath: full path of the input txt file (read as UTF-8)
    :param out_path: full path of the output txt file (written as UTF-8)
    :return: bool (always True; errors propagate as exceptions)
    '''
    with open(txtPath, 'rb') as src:
        content = src.read().decode('utf-8')

    p = re.compile(r'(?<=##)\S.+(?=##)|[\u4e00-\u9fff+\u3002\uFF0C]')
    kept = ''.join(re.findall(p, content))
    # NOTE(review): '|' inside the character class is a literal pipe, not an
    # alternation -- kept as in the original, confirm it is intended.
    final_result = re.sub(u"[\uFF0C|\u3002|\u002B]{2,}", "", kept)

    # Explicit encoding: the platform default may be unable to encode CJK
    # text, and the input side is already fixed to UTF-8.
    with open(out_path, "w", encoding='utf-8') as dst:
        dst.write(final_result)
    return True
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
def get_docx_from_pdf(pdf_path, out_path):
    '''
    Read a PDF file, convert it to Docx format and store it locally.

    The conversion counts as successful only if it raises no exception and
    every page of the converter reports `finalized`.

    :param pdf_path: full path of the input PDF announcement file
    :param out_path: full path of the intermediate docx result file
    :return: bool
    '''
    try:
        cv = Converter(pdf_path)
    except Exception:
        # The PDF could not even be opened; there is nothing to close.
        return False

    try:
        try:
            cv.convert(out_path)
        except Exception:
            return False
        return all(page.finalized for page in cv.pages)
    finally:
        # Always release the converter, including when convert() failed
        # (previously it was left open on that path).
        cv.close()
|
100 |
+
|
101 |
+
|
102 |
+
|
103 |
+
def _find_key_indexs(str, key):
    '''
    Locate the occurrences of a substring inside a parent string.

    As many positions as `str.count(key)` reports are collected, each search
    starting one character after the previous hit. The result is ordered from
    the highest index to the lowest, which lets callers insert text at these
    positions back-to-front without invalidating the remaining indices.

    :param str: parent string
    :param key: substring to look for
    :return: list of int
    '''
    positions = []
    cursor = -1
    for _ in range(str.count(key)):
        cursor = str.find(key, cursor + 1)
        positions.append(cursor)
    # Hits were collected in strictly increasing order; flip to descending.
    positions.reverse()
    return positions
|
128 |
+
|
129 |
+
|
130 |
+
|
131 |
+
def _insert_char_into_str(str, idx, char):
    '''
    Build a new string with `char` inserted in front of position `idx`.

    Out-of-range positions are clamped: an index beyond the end appends,
    an index before the start prepends; negative indices count from the end.

    :param str: parent string
    :param idx: insertion index
    :param char: string to insert (may be longer than one character)
    :return: str
    '''
    head, tail = str[:idx], str[idx:]
    return head + char + tail
|
142 |
+
|
143 |
+
|
144 |
+
|
145 |
+
def _is_chinese(str):
    '''
    Tell whether the string contains at least one character in the range
    U+4E00..U+9FFF. An empty string yields False.

    :param str: string to inspect
    :return: bool
    '''
    return any('\u4e00' <= ch <= '\u9fff' for ch in str)
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
+
def _get_table_row_feat(str):
    '''
    Compute the feature string of a whitespace-separated table row: one
    character per token, '1' if the token parses as a float, '0' otherwise.

    :param str: table row text
    :return: str made of '0' and '1', one per token
    '''
    def _flag(token):
        try:
            float(token)
        except Exception:
            return '0'
        return '1'

    return ''.join(_flag(token) for token in str.split())
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
def _check_if_include_first_proper(s, corpus):
    '''
    Look for the first corpus entry that occurs inside the string `s`.

    :param s: string to inspect
    :param corpus: list of strings, checked in order
    :return: [True, entry] for the first entry found in s, else [False, '']
    '''
    found = next((term for term in corpus if term in s), None)
    if found is None:
        return [False, '']
    return [True, found]
|
187 |
+
|
188 |
+
|
189 |
+
|
190 |
+
def _check_if_include_second_proper(s, corpus):
    '''
    Report, for every corpus entry, whether it occurs inside the string `s`.

    :param s: string to inspect
    :param corpus: list of strings
    :return: list of [bool, entry] pairs, one per corpus entry, in corpus order
    '''
    return [[term in s, term] for term in corpus]
|
204 |
+
|
205 |
+
|
206 |
+
|
207 |
+
def _match_and_insert(string, pattern, substring):
    '''
    Insert `substring` in front of every match of `pattern` in `string`.

    Matches are processed from the last one to the first so that earlier
    positions stay valid. A match is skipped when it starts at index 0 or
    when the character right before it is a left double quotation mark.

    :param string: string to search
    :param pattern: regular expression pattern
    :param substring: text to insert before each qualifying match
    :return: the string after insertion
    '''
    starts = [hit.start() for hit in re.finditer(pattern, string)]
    # finditer yields matches left to right; walk them right to left.
    for pos in reversed(starts):
        if pos <= 0 or string[pos - 1] == '“':
            continue
        string = string[:pos] + substring + string[pos:]
    return string
|
225 |
+
|
226 |
+
|
227 |
+
|
228 |
+
def _match_and_delete(string, pattern):
    '''
    Find the first match of `pattern` in `string` and, if the two characters
    immediately before it are both newlines, remove those two newlines.

    :param string: string to search
    :param pattern: regular expression pattern
    :return: the string, with the '\\n\\n' preceding the first match removed
             when present, otherwise unchanged
    '''
    found = re.search(pattern, string)
    if found is None:
        return string
    start = found.start()
    if start < 2 or string[start - 2:start] != '\n\n':
        return string
    return string[:start - 2] + string[start:]
|
241 |
+
|
242 |
+
|
243 |
+
|
244 |
+
def get_txt_from_docx(doc):
|
245 |
+
'''
|
246 |
+
读取Docx文件中每个自然行的材料内容
|
247 |
+
:param doc: 一个Document对象实例
|
248 |
+
:param out_path: 输出的txt结果文件的完整路径
|
249 |
+
:return: bool(转换是否成功), list(格式化修正后的文本列表)
|
250 |
+
'''
|
251 |
+
# 公告中的编号符号的集合,例如:'(1)', '1、'
|
252 |
+
NUMBER_1 = '123456789一二三四五六七八九十'
|
253 |
+
# 数字与大小写的英文字母的集合
|
254 |
+
NUMBER_2 = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
255 |
+
# 提取docx文件中的单行文本至初始文本列表paras中
|
256 |
+
paras = [para.text+'\n' for i, para in enumerate(doc.paragraphs)]
|
257 |
+
# 存储首轮格式化修正文本的列表new_paras
|
258 |
+
new_paras = []
|
259 |
+
# new_paras中各字符串的长度
|
260 |
+
new_paras_len_cnt = []
|
261 |
+
|
262 |
+
try:
|
263 |
+
# 遍历paras文本列表中的各字符串
|
264 |
+
for val in paras:
|
265 |
+
# 若该行文本为空,或者是页面号,或者是’单位‘,则跳过
|
266 |
+
if val == '\n' or re.search('^[0-9]+ \n$', val) or val[:2] == '单位':
|
267 |
+
continue
|
268 |
+
# 否则,将该行文本添加进new_paras文本列表中
|
269 |
+
new_paras.append(val.lstrip())
|
270 |
+
# 计算该行文本的字符长度
|
271 |
+
new_paras_len_cnt.append(len(val))
|
272 |
+
|
273 |
+
# 正文标识符,指示正文开始的行号
|
274 |
+
line_mark = 0
|
275 |
+
# 遍历new_paras的前10行,目的是处理公告的头部信息,例如:证券代码、证券简称、公告编号、公告标题等
|
276 |
+
for i, val in enumerate(new_paras[:10]):
|
277 |
+
# 如果出现制表符,或者空格数大于1,则仅保留一个空格
|
278 |
+
if '\t' in val or val.count(' ') > 2:
|
279 |
+
new_paras[i] = ' '.join(val.split()) + '\n'
|
280 |
+
if '证券代码:' in new_paras[i]:
|
281 |
+
continue
|
282 |
+
# 如果行末尾是'有限公司',则去掉可能的空格
|
283 |
+
if val.replace(' ', '')[-5:] == '有限公司\n':
|
284 |
+
new_paras[i] = val.replace(' ', '')
|
285 |
+
continue
|
286 |
+
# 循环检查下一行,直到行末尾是'公告',或'股东大会的通知',或'董事意见函'(应与mongo.py中process_files()相对应)
|
287 |
+
# 此时认为抵达正文的起始位置,后续处理将从第line_mark行开始
|
288 |
+
if _check_ann_title_processable(val.replace(' ', ''), exp=1):
|
289 |
+
new_paras[i] = val.replace(' ', '')
|
290 |
+
line_mark = i + 1
|
291 |
+
break
|
292 |
+
else:
|
293 |
+
new_paras[i] = val.replace('\n', '').replace(' ', '')
|
294 |
+
|
295 |
+
# 计算new_paras中各行的平均字符长度
|
296 |
+
mean_len = sum(new_paras_len_cnt)//len(new_paras_len_cnt)
|
297 |
+
|
298 |
+
# 遍历new_paras
|
299 |
+
for i, _ in enumerate(new_paras):
|
300 |
+
# 如果是正文部分
|
301 |
+
if i >= line_mark:
|
302 |
+
# 去掉该行中的一些符号(空格、特殊符号、英文逗号)
|
303 |
+
new_paras[i] = new_paras[i]\
|
304 |
+
.replace(' ', '')\
|
305 |
+
.replace(' ', '')\
|
306 |
+
.replace('', '')\
|
307 |
+
.replace(',', '')
|
308 |
+
|
309 |
+
# 如果该行长度大于平均长度,并且(下一行首部为非编号,或者下一行首部是编号且以换行结尾),则认为该行在段落中,故去掉该行换行符
|
310 |
+
if i < len(new_paras)-1 and \
|
311 |
+
len(new_paras[i]) >= mean_len and \
|
312 |
+
((new_paras[i + 1].replace('(','').replace('(','')[0] not in NUMBER_1) or
|
313 |
+
new_paras[i + 1][-1] == '\n'):
|
314 |
+
new_paras[i] = new_paras[i].replace('\n', '')
|
315 |
+
|
316 |
+
# 如果该行的下一行长度大于等于3,并且该行的下一行首部是非编号,且不包含关键字'年',则认为该行在段落中,故去掉该行换行符
|
317 |
+
if i < len(new_paras)-2 and \
|
318 |
+
len(new_paras[i + 1]) >= 3 and \
|
319 |
+
new_paras[i + 1].replace('(','').replace('(','')[0] in NUMBER_2 and \
|
320 |
+
(not '.' in new_paras[i+1][:3]) and \
|
321 |
+
(not '、' in new_paras[i+1][:3]) and \
|
322 |
+
(not '年' in new_paras[i+1]):
|
323 |
+
new_paras[i] = new_paras[i].replace('\n', '')
|
324 |
+
|
325 |
+
# 查找该行中,中文冒号符号':'的所有索引位置j
|
326 |
+
for j in _find_key_indexs(new_paras[i], ':'):
|
327 |
+
# 如果该行的j+1位置不是换行,并且该行不包括中文括号'()'与书名号'《》',且该行不包含一级专用名词,则认为该行应独立成
|
328 |
+
# 段落,故在该行j+1位置插入换行符
|
329 |
+
# 注:执行插入操作时,若有多个位置进行插入,则总是从后往前插入,确保插入后索引仍然正确
|
330 |
+
if j < len(new_paras[i])-1 and new_paras[i][j+1] != '\n' and \
|
331 |
+
('(' not in new_paras[i]) and ('《' not in new_paras[i]) and \
|
332 |
+
(')' not in new_paras[i]) and ('》' not in new_paras[i]) and \
|
333 |
+
(not _check_if_include_first_proper(new_paras[i], FIRST_PROPER_CORPUS)[0]):
|
334 |
+
new_paras[i] = _insert_char_into_str(new_paras[i], j+1, '\n')
|
335 |
+
# 查找该行中,中文左括号符号'('的所有索引位置j
|
336 |
+
for j in _find_key_indexs(new_paras[i], '('):
|
337 |
+
# 如果该行的j+1位置是编号,并且该行的上一行末尾不是换行,且该行j-1位置为非中文和非书名号,则认为该行的下一行应独立成
|
338 |
+
# 段落,故在该行j位置插入换行符
|
339 |
+
if new_paras[i][j+1] in NUMBER_1 and new_paras[i-1][-1] != '\n' and \
|
340 |
+
(not _is_chinese(new_paras[i][j-1])) and new_paras[i][j-1] != '》':
|
341 |
+
new_paras[i] = _insert_char_into_str(new_paras[i], j, '\n')
|
342 |
+
# 查找该行中,英文左括号符号'('的所有索引位置j
|
343 |
+
for j in _find_key_indexs(new_paras[i], '('):
|
344 |
+
# 如果该行的j+1位置是编号,并且该行的上一行末尾不是换行,且该行j-1位置为非中文和非书名号,则认为该行的下一行应独立成
|
345 |
+
# 段落,故在该行j位置插入换行符
|
346 |
+
if new_paras[i][j + 1] in NUMBER_1 and new_paras[i - 1][-1] != '\n' and \
|
347 |
+
(not _is_chinese(new_paras[i][j - 1])) and new_paras[i][j - 1] != '》':
|
348 |
+
new_paras[i] = _insert_char_into_str(new_paras[i], j, '\n')
|
349 |
+
# 查找该行中,中文顿号符号'、'的所有索引位置j
|
350 |
+
for j in _find_key_indexs(new_paras[i], '、'):
|
351 |
+
# 如果该行的j-1位置是编号(不超过9或十),并且该行的上一行末尾不是换行,则认为该行的下一行应独立成段落,故在该行j-1
|
352 |
+
# 位置插入换行符
|
353 |
+
if (j-2) < len(new_paras[i]) and new_paras[i][j-1] in NUMBER_1 and new_paras[i][j-2] not in NUMBER_1 \
|
354 |
+
and new_paras[i][j-2] in '。;.;' and new_paras[i-1][-1] != '\n':
|
355 |
+
new_paras[i] = _insert_char_into_str(new_paras[i], j-1, '\n')
|
356 |
+
continue
|
357 |
+
# 如果该行的j-1与j-2位置都是编号(超过9或十),并且该行的上一行末尾不是换行,则认为该行的下一行应独立成段落,故在该
|
358 |
+
# 行j-2位置插入换行符
|
359 |
+
if (j-3) < len(new_paras[i]) and new_paras[i][j-1] in NUMBER_1 and new_paras[i][j-2] in NUMBER_1 \
|
360 |
+
and new_paras[i][j-3] in '。;.;' and new_paras[i-1][-1] != '\n':
|
361 |
+
new_paras[i] = _insert_char_into_str(new_paras[i], j-2, '\n')
|
362 |
+
|
363 |
+
# 修正某些情形下'特此公告。'未自成段落的情况
|
364 |
+
if new_paras[i] == '特此公告。\n':
|
365 |
+
if new_paras[i-1][-1] != '\n':
|
366 |
+
new_paras[i] = '\n特此公告。\n'
|
367 |
+
if new_paras[i+1][-1] != '\n':
|
368 |
+
new_paras[i+1] += '\n'
|
369 |
+
|
370 |
+
# 如果该行的下一行中含有独立的一级专用名词,则认为该行的下一行应独立成段落,故在该行的末尾插入缺省的换行符
|
371 |
+
if (i+1) < len(new_paras):
|
372 |
+
tmp_flag, tmp_str = _check_if_include_first_proper(new_paras[i+1], FIRST_PROPER_CORPUS)
|
373 |
+
if tmp_flag:
|
374 |
+
tmp_idx = new_paras[i+1].index(tmp_str) - 1
|
375 |
+
if tmp_idx >= 0 and new_paras[i+1][tmp_idx] != '(':
|
376 |
+
if new_paras[i][-1] != '\n':
|
377 |
+
new_paras[i] += '\n'
|
378 |
+
|
379 |
+
# 将new_paras中的若干字符串连接成为一个字符串str_sum
|
380 |
+
str_sum = ''.join(new_paras)
|
381 |
+
# 将str_num字符串按照换行符进行分割,生成次轮格式化修正文本的列表final_paras
|
382 |
+
final_paras = str_sum.split('\n')
|
383 |
+
# 遍历final_paras
|
384 |
+
for i, val in enumerate(final_paras):
|
385 |
+
# 每一自然段落的末尾符号,这里为两个换行符,便于清晰地查看最终生成的txt文本
|
386 |
+
end_flag = '\n\n'
|
387 |
+
# 给final_paras中的每一行添加一个末尾符号
|
388 |
+
final_paras[i] += end_flag
|
389 |
+
|
390 |
+
# 在该行中查找匹配到的所有形如: '(1)', '(2)' 的模式
|
391 |
+
# 此处认为该模式的起始位置应独立成段落,例如'\n(1)XXX...', '\n(1)XXX...'
|
392 |
+
if '(' in final_paras[i]:
|
393 |
+
final_paras[i] = _match_and_insert(final_paras[i], '[\(\(]+[0-9]{1,2}[\)\)]+', end_flag)
|
394 |
+
|
395 |
+
# 在该行中,查找所有的中文左括号符号'('与中文右括号符号')',计算它们各自的数量
|
396 |
+
# 如果两符号的数量不相等,则认为该行处在段落中,故应去掉该行末尾的end_flag
|
397 |
+
if len(_find_key_indexs(final_paras[i], '(')) != len(_find_key_indexs(final_paras[i], ')')):
|
398 |
+
final_paras[i] = final_paras[i][:-2]
|
399 |
+
|
400 |
+
# 将final_paras中的若干字符串连接成为一个字符串str_sum
|
401 |
+
str_sum = ''.join(final_paras)
|
402 |
+
# 将str_num字符串按照换行符进行分割,生成终轮格式化修正文本的列表,覆盖掉之前的final_paras
|
403 |
+
final_paras = str_sum.split('\n\n')
|
404 |
+
# 遍历final_paras
|
405 |
+
for i, val in enumerate(final_paras):
|
406 |
+
# 每一自然段落的末尾符号,这里为两个换行符,便于清晰地查看最终生成的txt文本
|
407 |
+
end_flag = '\n\n'
|
408 |
+
# 给final_paras中的每一行再次添加上一个末尾符号
|
409 |
+
final_paras[i] += end_flag
|
410 |
+
|
411 |
+
# 修正某些情形下'重要内容提示:'未自成段落的情况
|
412 |
+
if '重要内容提示:' in final_paras[i]:
|
413 |
+
idx = final_paras[i].index('重要内容提示:')
|
414 |
+
if final_paras[i][idx+7] != '\n':
|
415 |
+
final_paras[i] = _insert_char_into_str(final_paras[i], idx+7, '\n\n')
|
416 |
+
if idx > 0:
|
417 |
+
if final_paras[i][idx-1] != '\n':
|
418 |
+
final_paras[i] = _insert_char_into_str(final_paras[i], idx, '\n\n')
|
419 |
+
|
420 |
+
# 修正某些情形下'表决结果:'及其跟随的结果未自成段落的情况
|
421 |
+
if '表决结果:' in final_paras[i]:
|
422 |
+
if final_paras[i][:5] == '表决结果:':
|
423 |
+
final_paras[i] = final_paras[i][:-2]
|
424 |
+
elif final_paras[i][-7:] == '表决结果:\n\n':
|
425 |
+
idx = final_paras[i].find('表决结果:')
|
426 |
+
final_paras[i] = _insert_char_into_str(final_paras[i], idx, '\n\n')
|
427 |
+
final_paras[i] = final_paras[i][:-2]
|
428 |
+
else:
|
429 |
+
idx = final_paras[i].find('表决结果:')
|
430 |
+
final_paras[i] = _insert_char_into_str(final_paras[i], idx, '\n\n')
|
431 |
+
|
432 |
+
# 检查该行中的所有二级专用名词(指一级专用名词后所在段落中出现的专用名词,不应独立成段落)
|
433 |
+
for is_include, s_include in _check_if_include_second_proper(final_paras[i], SECOND_PROPER_CORPUS):
|
434 |
+
if is_include:
|
435 |
+
# 如果该行中含有某一二级专用名词,并且名词后有换行符,则去掉该行的换行符
|
436 |
+
if final_paras[i][final_paras[i].index(s_include)+len(s_include)] == '\n':
|
437 |
+
final_paras[i] = final_paras[i].replace('\n', '')
|
438 |
+
|
439 |
+
# 在该行中查找匹配到的所有形如: '(一)', '(1)', '(一)', '(1)' 的模式
|
440 |
+
# 此处认为该模式的起始位置应独立成段落,例如'\n(一)XXX...', '\n(1)XXX...'
|
441 |
+
if '(' in final_paras[i]:
|
442 |
+
final_paras[i] = _match_and_insert(final_paras[i], '[\(\(]+[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}[\)\)]+', end_flag)
|
443 |
+
final_paras[i] = _match_and_insert(final_paras[i], '[\(\(]+[0-9]{1,2}[\)\)]+', end_flag)
|
444 |
+
|
445 |
+
# 在该行中查找匹配到的所有形如: '一、', '1、' 的模式
|
446 |
+
# 此处认为该模式的起始位置应独立成段落,例如'\n一、XXX...', '\n1、XXX...'
|
447 |
+
if '、' in final_paras[i]:
|
448 |
+
final_paras[i] = _match_and_insert(final_paras[i], '[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、', end_flag)
|
449 |
+
final_paras[i] = _match_and_insert(final_paras[i], '[0-9]{1,2}、', end_flag)
|
450 |
+
# 这里对形如: 'XXX第一、二组、三组的XXX' 的特例进行处理,即去掉前序错误添加的换行符
|
451 |
+
final_paras[i] = _match_and_delete(final_paras[i], '[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、[\S]+、[\S]+')
|
452 |
+
final_paras[i] = _match_and_delete(final_paras[i], '[0-9]+、[0-9]+')
|
453 |
+
|
454 |
+
# 再次检查'●'符号,若非独立成行则在该符号前添加换行符
|
455 |
+
for j in _find_key_indexs(final_paras[i], '●'):
|
456 |
+
if j > 0:
|
457 |
+
final_paras[i] = _insert_char_into_str(final_paras[i], j, end_flag)
|
458 |
+
|
459 |
+
# 如果上述处理流程出现任何异常抛掷,则返回(False, []),标志转换失败
|
460 |
+
except Exception:
|
461 |
+
return False, []
|
462 |
+
# 返回(True, final_paras),标志转换成功
|
463 |
+
return True, final_paras
|
464 |
+
|
465 |
+
|
466 |
+
|
467 |
+
def get_table_from_docx(doc, txt, out_path="", is_out_flag=False):
    '''
    Read every table of a Docx document and append its rows to the text list.

    Each cell has its newlines, spaces, tabs and commas removed. Rows are then
    routed by the content of their first cell:
    - contains '本公司': the row is not table data; its joined text is stored
      under key '000' of the returned dict;
    - contains '证券代码': the row is rendered as one header line and inserted
      at the front of ``txt``;
    - anything else: the cells are tab-joined and collected between table
      separator lines, which are finally appended to ``txt``.

    :param doc: a Document instance (anything exposing tables -> rows -> cells -> text)
    :param txt: list of strings holding the body text of the PDF; modified in place
    :param out_path: full path of the csv result file
    :param is_out_flag: whether to write the csv result file (default: no)
    :return: list, dict - the extended ``txt`` and the body text that was mis-detected as a table
    '''
    table_tag = '-----表格-----\n'
    empty_row = '^$\n'
    data = []
    table_txt = []
    attach_txt = {}
    for table in doc.tables:
        table_txt.append(table_tag)
        for row in table.rows:
            row_content = [
                cell.text.replace('\n', '').replace(' ', '').replace('\t', '').replace(',', '')
                for cell in row.cells
            ]
            if not row_content:
                continue
            if '本公司' in row_content[0]:
                attach_txt['000'] = ''.join(c.strip() for c in row_content) + '\n\n'
                continue
            if '证券代码' in row_content[0]:
                header = '^' + ''.join(c.strip() + ' ' for c in row_content) + '$\n'
                # list.insert takes (index, object). The original call passed
                # them the other way round and raised TypeError on such rows.
                txt.insert(0, header)
                continue
            data.append(row_content)
            new_row = '^' + '\t'.join(row_content) + '$\n'
            # Rows whose cells are all empty are not kept.
            if new_row.replace('\t', '') != empty_row:
                table_txt.append(new_row)
        data.append(table_tag)
        table_txt.append(table_tag)

    # Collapse each run of consecutive separators into a single separator.
    previous_was_tag = False
    for i, val in enumerate(table_txt):
        if val == table_tag:
            if previous_was_tag:
                table_txt[i] = empty_row
            previous_was_tag = True
        else:
            previous_was_tag = False
    table_txt = [val for val in table_txt if val != empty_row]

    # Blank a separator whose neighbouring rows share the same number/text
    # pattern, so that the two tables are emitted as one.
    for i, val in enumerate(table_txt):
        if val == table_tag and 0 < i < len(table_txt) - 1:
            feat1 = _get_table_row_feat(table_txt[i-1].replace('\n', ''))
            feat2 = _get_table_row_feat(table_txt[i+1].replace('\n', ''))
            if feat1 == feat2:
                table_txt[i] = empty_row

    if len(table_txt) == 1 and table_txt[0] == table_tag:
        table_txt[0] = empty_row

    # Strip the leading '^' and the trailing '$' + newline from every entry.
    # Separator lines go through the same trimming (the original guard compared
    # against the tag without its newline and therefore never matched), so they
    # are emitted with one dash fewer on each side; that output is kept as is.
    for i, val in enumerate(table_txt):
        if val == empty_row:
            table_txt[i] = ''
            continue
        table_txt[i] = val[1:][:-2] + '\n'

    txt.extend(table_txt)

    if is_out_flag:
        # 'with' guarantees the csv file is closed even if a row fails to write.
        with open(out_path, 'w+', newline='') as f:
            writer = csv.writer(f)
            for i, val in enumerate(data):
                if i == 0 and val == '\n':
                    continue
                writer.writerow(val)

    return txt, attach_txt
|
551 |
+
|
552 |
+
|
553 |
+
|
554 |
+
def refine_pdf2txt_list_result(txt, att_txt):
    '''
    Final proofreading pass over the text list: restore or attach body text
    that was mis-detected as a table.

    Only the first 11 entries are inspected. An entry whose characters at
    positions [-6:-2] are '有限公司' loses its last two characters. Otherwise,
    if ``att_txt`` has a '000' entry and the line is accepted by
    _check_ann_title_processable(..., exp=2), that entry is inserted right
    after the line and the scan stops.

    :param txt: list of strings holding the body text of the PDF; modified in place
    :param att_txt: body text that was mis-detected as a table
    :return: list
    '''
    for pos, line in enumerate(txt):
        if pos > 10:
            break
        if line[-6:-2] == '有限公司':
            txt[pos] = line[:-2]
        elif '000' in att_txt and _check_ann_title_processable(line, exp=2):
            txt.insert(pos + 1, att_txt['000'])
            break
    return txt
|
571 |
+
|
572 |
+
|
573 |
+
|
574 |
+
def write_pdf2txt_list_result(out_path, txt, out_mode_flag=True):
    '''
    Write the list of text strings to a UTF-8 text file.

    With ``out_mode_flag`` set, the strings are concatenated, split on
    newlines, and every non-empty paragraph is written wrapped in the
    paragraph markers '^' and '$'. Without it, the strings are written as they
    are, except that empty marker lines are skipped.

    :param out_path: path of the txt file to create
    :param txt: list of strings holding the body text and tables of the PDF
    :param out_mode_flag: whether to add the paragraph start marker '^' and end marker '$'
    :return: bool
    '''
    with open(out_path, "w", encoding='utf-8') as f:
        if out_mode_flag:
            paragraphs = ''.join(txt).split('\n')
            for paragraph in paragraphs:
                if paragraph != '':
                    f.write('^' + paragraph + '$\n')
        else:
            for piece in txt:
                if piece != '^$\n':
                    f.write(piece)
    return True
|
594 |
+
|
595 |
+
|
596 |
+
|
597 |
+
def get_pdf2txt_str_result(txt, out_mode_flag=True):
    '''
    Join the elements of the text list into the complete txt content.

    With ``out_mode_flag`` set, the strings are concatenated, split on
    newlines, and every non-empty paragraph is wrapped in the paragraph
    markers '^' and '$'. Without it, the strings are concatenated as they are,
    except that empty marker lines are skipped.

    The original wrapped this logic in an extra loop over ``txt``, which
    repeated the whole content once per list element; the content is now
    produced exactly once.

    :param txt: list of strings holding the body text and tables of the PDF
    :param out_mode_flag: whether to add the paragraph start marker '^' and end marker '$'
    :return: str
    '''
    if not out_mode_flag:
        return ''.join(line for line in txt if line != '^$\n')
    paragraphs = ''.join(txt).split('\n')
    return ''.join('^' + paragraph + '$\n' for paragraph in paragraphs if paragraph != '')
|
617 |
+
|
618 |
+
|
619 |
+
def find_all_local_file(base, extension):
    '''
    Yield the path of every file under a directory tree that has the given suffix.

    A file matches when its name ends with the all-lowercase or the
    all-uppercase form of ``extension``. In each yielded path every '/' and
    every backslash is replaced by '//'.

    :param base: directory path
    :param extension: suffix, for example: '.pdf'
    :return: generator of str
    '''
    suffixes = (extension.lower(), extension.upper())
    for folder, _subdirs, names in os.walk(base):
        for name in names:
            if name.endswith(suffixes):
                yield os.path.join(folder, name).replace('/', '//').replace('\\', '//')
|
pdf2txt_v2.py
ADDED
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
"""
|
4 |
+
Created by Shengbo.Zhang on 2021/09/20
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import re
|
9 |
+
import logging
|
10 |
+
import pdfplumber
|
11 |
+
from docx import Document
|
12 |
+
from Pdf2Txt.config import *
|
13 |
+
from Pdf2Txt.config import _check_ann_title_processable
|
14 |
+
from pdf2docx import Converter
|
15 |
+
from collections import Counter
|
16 |
+
from pdfminer.pdfpage import PDFPage
|
17 |
+
from pdfminer.layout import LAParams, LTTextBox
|
18 |
+
from pdfminer.converter import PDFPageAggregator
|
19 |
+
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
20 |
+
|
21 |
+
|
22 |
+
# Silence the log output of pdf2docx's Converter. Disabling at WARNING
# suppresses every record of severity WARNING and below (INFO included).
logging.disable(logging.WARNING)
|
25 |
+
|
26 |
+
|
27 |
+
def get_string_list_from_pdf(pdf_path):
    '''
    Read the text of a PDF line by line, leaving out everything inside tables.

    Characters whose centre point lies inside the bounding box of a detected
    table are filtered out before the text is extracted. Spaces, tabs and
    newlines are removed from every line, a single newline is appended, and
    lines that are empty after that are dropped.

    :param pdf_path: string, path of the PDF file
    :return: two lists: string_list, ann_info_list. The former holds the cleaned lines of all pages,
             the latter the first 10 raw lines of the first page (announcement header candidates,
             e.g. security code, short name, announcement number)
    '''
    string_list = []
    ann_info_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages):
            bboxes = [table.bbox for table in page.find_tables()]

            def _not_within_bboxes(obj):
                # True when the centre of obj lies in none of this page's table boxes.
                v_mid = (obj["top"] + obj["bottom"]) / 2
                h_mid = (obj["x0"] + obj["x1"]) / 2
                return not any(
                    (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
                    for x0, top, x1, bottom in bboxes
                )

            # NOTE(review): extract_text() is understood to return None rather
            # than '' for a page without text in some pdfplumber releases;
            # treat both as "no lines" instead of crashing on .split().
            page_text = page.filter(_not_within_bboxes).extract_text() or ''
            lines = page_text.split('\n')
            if page_no == 0:
                ann_info_list = lines[:10]
            cleaned = (line.replace(' ', '').replace('\n', '').replace('\t', '') + '\n' for line in lines)
            string_list.extend(line for line in cleaned if line != '\n')
    return string_list, ann_info_list
|
54 |
+
|
55 |
+
|
56 |
+
def get_ann_info_from_pdf(pdf_path):
    '''
    Get the header information of a PDF announcement file.

    The first 10 text lines of the first page are returned unfiltered; they
    may include lines that are not header data, which refine_txt_list()
    processes further.

    :param pdf_path: string, path of the PDF file
    :return: list of lines holding the announcement header information (e.g. security code,
             short name, announcement number); an empty list when the file cannot be read
    '''
    try:
        with pdfplumber.open(pdf_path) as pdf:
            return pdf.pages[0].extract_text().split('\n')[:10]
    except Exception:
        # Best effort by design: any read/parse failure means "no header".
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        return []
|
70 |
+
|
71 |
+
|
72 |
+
def get_string_list_from_pdf_converted_docx(pdf_path, docx_path):
    '''
    Convert a PDF to Docx and read the body text (paragraphs, not tables) line by line.

    Spaces, tabs and newlines are removed from every line and a single newline
    is appended. The leading lines (at most 11) that come before the first
    line ending in '有限公司' are treated as announcement header lines: every
    entry of the result equal to one of them is replaced by an empty string.
    The intermediate Docx file is deleted before returning.

    :param pdf_path: string, path of the PDF file
    :param docx_path: path for the intermediate Docx file; when it is an empty string the file is
                      created next to the PDF as <pdf name>_<TEMP_DOCX_SUFFIX>.docx
    :return: a list, string_list, with the text lines of the PDF; and the Document instance loaded
             from the Docx file (None when the conversion failed)
    '''
    if docx_path == '':
        # os.path.join copes with an empty dirname; the former "dir//name"
        # f-string produced a root-anchored "//name" for bare file names.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        output_docx_file_path = os.path.join(os.path.dirname(pdf_path), f"{stem}_{TEMP_DOCX_SUFFIX}.docx")
    else:
        output_docx_file_path = docx_path

    document = None
    string_list = []
    try:
        if get_docx_from_pdf(pdf_path=pdf_path, out_path=output_docx_file_path):
            document = Document(output_docx_file_path)
            for paragraph in document.paragraphs:
                for piece in paragraph.text.strip().split('\n'):
                    piece = piece.strip()
                    if piece == '':
                        continue
                    string_list.append(piece.replace(' ', '').replace('\n', '').replace('\t', '') + '\n')
    finally:
        # The Docx file is only an intermediate artefact: remove it even when
        # loading it failed.
        if os.path.exists(output_docx_file_path):
            os.remove(output_docx_file_path)

    ann_headers = set()
    for i, val in enumerate(string_list):
        if i > 10 or val.strip()[-4:] == '有限公司':
            break
        ann_headers.add(val)
    string_list = ['' if val in ann_headers else val for val in string_list]
    return string_list, document
|
106 |
+
|
107 |
+
|
108 |
+
def get_abscissa_dict_from_pdf(pdf_path):
    '''
    Map the text of every text box in a PDF to the x-coordinate of its left edge.

    The key is the text of an LTTextBox with spaces, tabs and newlines removed
    and one newline appended; the value is the first bbox coordinate truncated
    to an int. Boxes that are empty after cleaning are skipped, and a later box
    with the same text overwrites an earlier one.

    :param pdf_path: string, path of the PDF file
    :return: a dict, abscissa_dict, holding the starting x-coordinate of each text block
    '''
    abscissa_dict = {}
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr=rsrcmgr, device=device)
    # 'with' closes the file even when parsing a page raises.
    with open(pdf_path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            for lobj in device.get_result():
                if isinstance(lobj, LTTextBox):
                    x, text = int(lobj.bbox[0]), lobj.get_text()
                    key = text.replace(' ', '').replace('\n', '').replace('\t', '') + '\n'
                    if key != '\n':
                        abscissa_dict[key] = x
    return abscissa_dict
|
132 |
+
|
133 |
+
|
134 |
+
def get_min_abscissa_value(abscissa_dict, string_list_length):
    '''
    Pick the x-coordinate taken to be the left edge of the body text.

    Distinct x values are visited in ascending order and the first one that
    occurs at least ``string_list_length // 4`` times is returned. When no
    value is that frequent, the overall minimum is returned.

    :param abscissa_dict: dict holding the starting x-coordinate of each text block
    :param string_list_length: int, number of text lines of the PDF
    :return: int, the minimum x-coordinate of the body text blocks
    '''
    x_values = list(abscissa_dict.values())
    x_threshold = string_list_length // 4
    fallback = min(x_values)
    for x, occurrences in sorted(Counter(x_values).items()):
        if occurrences >= x_threshold:
            return x
    return fallback
|
151 |
+
|
152 |
+
|
153 |
+
def refine_txt_list(txt, ann_info):
|
154 |
+
'''
|
155 |
+
The PDF text list (body text) has already been through a first round of processing; this function applies the final formatting refinements to it.
|
156 |
+
:param txt: the PDF text list, holding the body text of the PDF; modified in place
|
157 |
+
:param ann_info: header information of the PDF announcement
|
158 |
+
:return: the refined PDF text list (the same list object as txt)
|
159 |
+
'''
|
160 |
+
# NOTE(review): this view of the file carries no indentation, so the nesting of the statements below cannot be read off here - confirm against the original file before restructuring.
# Format the PDF's [announcement header information]
|
161 |
+
if ann_info != []:
|
162 |
+
new_ann_info_list = []
|
163 |
+
for i, val in enumerate(ann_info):
|
164 |
+
if val.strip() == '': continue
|
165 |
+
if val.strip()[-4:] == '有限公司': break
|
166 |
+
else: new_ann_info_list.append(' '.join(val.split()) + SEGMENT_SYMBOL)
|
167 |
+
if new_ann_info_list != []:
|
168 |
+
new_ann_info_list[-1] = new_ann_info_list[-1].replace(SEGMENT_SYMBOL, '')
|
169 |
+
if txt[0].strip()[-4:] == '有限公司':
|
170 |
+
for i in range(len(new_ann_info_list)):
|
171 |
+
txt.insert(0, '')
|
172 |
+
for i, val in enumerate(new_ann_info_list):
|
173 |
+
txt[i] = val
|
174 |
+
# Format the PDF's [announcement title] and [board of directors' statement]; only the first 11 entries are examined
|
175 |
+
for i, val in enumerate(txt):
|
176 |
+
if i > 10: break
|
177 |
+
else:
|
178 |
+
val = val.strip()
|
179 |
+
if _check_ann_title_processable(val):
|
180 |
+
if SEGMENT_SYMBOL not in val:
|
181 |
+
txt[i] = (SEGMENT_SYMBOL + val)
|
182 |
+
if val[-4:] == '有限公司':
|
183 |
+
if SEGMENT_SYMBOL not in txt[i]:
|
184 |
+
txt[i] = (SEGMENT_SYMBOL + val)
|
185 |
+
if _check_ann_title_processable(txt[i+1]):
|
186 |
+
txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
|
187 |
+
if txt[i+2].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
|
188 |
+
if SEGMENT_SYMBOL not in txt[i+2]:
|
189 |
+
txt[i+2] = (SEGMENT_SYMBOL + txt[i+2])
|
190 |
+
txt[i+3] = txt[i+3].replace(SEGMENT_SYMBOL, '')
|
191 |
+
break
|
192 |
+
if _check_ann_title_processable(txt[i+2]):
|
193 |
+
txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
|
194 |
+
txt[i+2] = txt[i+2].replace(SEGMENT_SYMBOL, '')
|
195 |
+
if txt[i+3].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
|
196 |
+
if SEGMENT_SYMBOL not in txt[i+3]:
|
197 |
+
txt[i+3] = (SEGMENT_SYMBOL + txt[i+3])
|
198 |
+
txt[i+4] = txt[i+4].replace(SEGMENT_SYMBOL, '')
|
199 |
+
break
|
200 |
+
# Second pass over the PDF text list
|
201 |
+
for i, _ in enumerate(txt):
|
202 |
+
# Format the [section and subsection numbering]: a line that starts with a numbering pattern gets SEGMENT_SYMBOL prepended
|
203 |
+
if (SEGMENT_SYMBOL not in txt[i]):
|
204 |
+
match_check = [1, 1, 1, 1, 1]
|
205 |
+
# Pattern such as '一、' (one or two Chinese numerals followed by '、')
|
206 |
+
match_1 = re.match('[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、', txt[i])
|
207 |
+
# Pattern such as '1、'
|
208 |
+
match_2 = re.match('[0-9]{1,2}、', txt[i])
|
209 |
+
# Pattern such as '1.'
|
210 |
+
match_3 = re.match('[0-9]{1,2}\.', txt[i])
|
211 |
+
# Pattern such as '(一)': Chinese numerals in brackets (NOTE(review): the bracket class lists the same character twice in this view - presumably half- and full-width brackets originally; verify)
|
212 |
+
match_4 = re.match('[\(\(]+[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}[\)\)]+', txt[i])
|
213 |
+
# Pattern such as '(1)': digits in brackets (same bracket caveat as above)
|
214 |
+
match_5 = re.match('[\(\(]+[0-9]{1,2}[\)\)]+', txt[i])
|
215 |
+
if match_1: match_check[0] = match_1.start()
|
216 |
+
if match_2: match_check[1] = match_2.start()
|
217 |
+
if match_3: match_check[2] = match_3.start()
|
218 |
+
if match_4: match_check[3] = match_4.start()
|
219 |
+
if match_5: match_check[4] = match_5.start()
|
220 |
+
if 0 in match_check:
|
221 |
+
txt[i] = SEGMENT_SYMBOL + txt[i]
|
222 |
+
# Fix cases where the [重要内容提示] field did not start a paragraph of its own
|
223 |
+
if ('重要内容提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('重要内容提示') == 0):
|
224 |
+
txt[i] = SEGMENT_SYMBOL + txt[i]
|
225 |
+
# Fix cases where the [单位:元] field was not removed
|
226 |
+
if (txt[i] == '单位:元') or (txt[i] == SEGMENT_SYMBOL + '单位:元'):
|
227 |
+
txt[i] = ''
|
228 |
+
# Fix cases where the [特别提示] field did not start a paragraph of its own
|
229 |
+
if ('特别提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特别提示') == 0):
|
230 |
+
txt[i] = SEGMENT_SYMBOL + txt[i]
|
231 |
+
# Fix cases where the [特此公告] field did not start a paragraph of its own
|
232 |
+
if ('特此公告' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特此公告') == 0):
|
233 |
+
txt[i] = SEGMENT_SYMBOL + txt[i]
|
234 |
+
# Fix cases where this line duplicates the next line (only one of them is kept)
|
235 |
+
if (i+1) < len(txt) and (txt[i] == txt[i+1]):
|
236 |
+
txt[i] = ''
|
237 |
+
return txt
|
238 |
+
|
239 |
+
|
240 |
+
def get_docx_from_pdf(pdf_path, out_path):
    '''
    Read a PDF file, convert it to Docx format and store the result locally.

    The conversion counts as successful only when convert() raises nothing
    and every page of the converter reports ``finalized``.

    :param pdf_path: full path of the input PDF announcement file
    :param out_path: full path of the intermediate Docx result file
    :return: bool, whether the conversion succeeded
    '''
    cv = Converter(pdf_path)
    try:
        try:
            cv.convert(out_path)
        except Exception:
            return False
        return all(page.finalized for page in cv.pages)
    finally:
        # Single exit point for releasing the converter, also reached when
        # inspecting the pages raises.
        cv.close()
|
259 |
+
|
260 |
+
|
261 |
+
def _get_table_row_feat(str):
|
262 |
+
'''
|
263 |
+
给定一个空格分割的表格行字符串,计算它的特征(01组成的字符串)
|
264 |
+
:param str: 字符串
|
265 |
+
:return: 字符串
|
266 |
+
'''
|
267 |
+
s = str.split()
|
268 |
+
r = ''
|
269 |
+
for c in s:
|
270 |
+
try:
|
271 |
+
_ = float(c)
|
272 |
+
r += '1'
|
273 |
+
except Exception:
|
274 |
+
r += '0'
|
275 |
+
return r
|
276 |
+
|
277 |
+
|
278 |
+
def append_table_from_docx(doc, txt):
    '''
    Read every table of the Docx file, format it and append it to the PDF text list.

    Each cell has its newlines, spaces, tabs and commas removed. Rows are then
    routed by the content of their first cell:
    - contains '本公司': the row is body text; it is joined, prefixed with
      SEGMENT_SYMBOL (with another SEGMENT_SYMBOL right after '特别提示' when
      present) and inserted after the first of the leading 11 entries of
      ``txt`` accepted by _check_ann_title_processable();
    - contains '证券代码': the row is dropped;
    - anything else: the cells are joined with TABLE_CELL_SYMBOL and collected
      between table separator lines, which are finally appended to ``txt``.

    :param doc: a Document instance
    :param txt: list of strings holding the body text of the PDF; modified in place
    :return: the extended PDF text list
    '''
    # The separator carries one extra dash on each side because the final
    # trimming step removes the first character and the last two (dash +
    # newline) of every entry, leaving exactly TABLE_SYMBOL + newline.
    tag_line = '-' + TABLE_SYMBOL + '-' + '\n'
    empty_row = '^$\n'
    table_txt = []
    for table in doc.tables:
        table_txt.append(tag_line)
        for row in table.rows:
            row_content = [
                cell.text.replace('\n', '').replace(' ', '').replace('\t', '').replace(',', '')
                for cell in row.cells
            ]
            if not row_content:
                continue
            if '本公司' in row_content[0]:
                tmp = SEGMENT_SYMBOL + ''.join(c.strip() for c in row_content)
                if '特别提示' in tmp:
                    cut = tmp.index('特别提示') + 4
                    tmp = tmp[:cut] + SEGMENT_SYMBOL + tmp[cut:]
                for pos, val in enumerate(txt):
                    if pos > 10:
                        break
                    if _check_ann_title_processable(val):
                        txt.insert(pos + 1, tmp)
                        break
                continue
            if '证券代码' in row_content[0]:
                continue
            new_row = '^' + TABLE_CELL_SYMBOL.join(row_content) + '$\n'
            # Rows whose cells are all empty are not kept.
            if new_row.replace(TABLE_CELL_SYMBOL, '') != empty_row:
                table_txt.append(new_row)
        table_txt.append(tag_line)

    # Collapse each run of consecutive separators into a single separator.
    previous_was_tag = False
    for i, val in enumerate(table_txt):
        if val == tag_line:
            if previous_was_tag:
                table_txt[i] = empty_row
            previous_was_tag = True
        else:
            previous_was_tag = False
    table_txt = [val for val in table_txt if val != empty_row]

    # Blank a separator whose neighbouring rows share the same number/text
    # pattern, so that the two tables are emitted as one.
    for i, val in enumerate(table_txt):
        if val == tag_line and 0 < i < len(table_txt) - 1:
            feat1 = _get_table_row_feat(table_txt[i-1].replace('\n', ''))
            feat2 = _get_table_row_feat(table_txt[i+1].replace('\n', ''))
            if feat1 == feat2:
                table_txt[i] = empty_row

    if len(table_txt) == 1 and table_txt[0] == tag_line:
        table_txt[0] = empty_row

    # Strip the leading '^' and trailing '$' + newline of rows; separators are
    # trimmed the same way (see the note on tag_line above).
    for i, val in enumerate(table_txt):
        if val == empty_row:
            table_txt[i] = ''
            continue
        table_txt[i] = val[1:][:-2] + '\n'

    txt.extend(table_txt)
    return txt
|
344 |
+
|
345 |
+
|
346 |
+
def output_txt_string(txt_path, txt_string):
    '''
    Write the formatted text of a PDF announcement to a plain .txt file (UTF-8).

    :param txt_path: path of the plain text file
    :param txt_string: plain text string of the PDF announcement
    :return: bool, whether the file was written successfully
    '''
    try:
        with open(txt_path, "w", encoding='utf-8') as f:
            f.write(txt_string)
    except Exception:
        # Failure is reported through the return value. Unlike a bare except,
        # this lets KeyboardInterrupt/SystemExit propagate.
        return False
    return True
|
364 |
+
|
365 |
+
|
366 |
+
def get_txt_from_pdf(pdf_path, docx_path=''):
|
367 |
+
'''
|
368 |
+
Convert a PDF announcement file into a formatted TXT string.
|
369 |
+
:param pdf_path: string, path of the PDF file
:param docx_path: passed on to get_string_list_from_pdf_converted_docx() as the path of the intermediate Docx file
|
370 |
+
:return: string, the plain text converted from the PDF (formatted; body text first, tables after); empty when the header, the body lines or the Docx document could not be obtained
|
371 |
+
'''
|
372 |
+
txt_string = ''
|
373 |
+
ann_info_list = get_ann_info_from_pdf(pdf_path)
|
374 |
+
string_list, document = get_string_list_from_pdf_converted_docx(pdf_path, docx_path)
|
375 |
+
if ann_info_list != [] and string_list != [] and document is not None:
|
376 |
+
abscissa_dict = get_abscissa_dict_from_pdf(pdf_path)
|
377 |
+
min_abscissa_value = get_min_abscissa_value(abscissa_dict, len(string_list))
|
378 |
+
# Leading lines (at most 11, up to the first one ending in '有限公司') are given the body-text x value, so they are never treated as indented
for i, val in enumerate(string_list):
|
379 |
+
if i > 10: break
|
380 |
+
if val.replace('\n', '')[-4:] == '有限公司': break
|
381 |
+
else: abscissa_dict[val] = min_abscissa_value
|
382 |
+
txt_list = []
|
383 |
+
for id, string in enumerate(string_list):
|
384 |
+
# NOTE(review): as rendered here the second ' ' literal and the '' literal are no-ops; presumably they held a non-ASCII space and an invisible character in the original file - verify before touching this line
new_string = string.replace('\n', '').replace('\t', '').replace(' ', '').replace(' ', '').replace('', '').replace(',', '')
|
385 |
+
# Skip empty entries and entries that are just a number of up to 3 digits
if (not (len(new_string) <= 3 and new_string.isdigit())) and string != '':
|
386 |
+
try:
|
387 |
+
# A line starting to the right of the body-text x value gets SEGMENT_SYMBOL prepended
if abscissa_dict[string] > min_abscissa_value:
|
388 |
+
txt_list.append(SEGMENT_SYMBOL + new_string)
|
389 |
+
else:
|
390 |
+
txt_list.append(new_string)
|
391 |
+
# Reached e.g. when the line has no entry in abscissa_dict: keep it without a prefix
except:
|
392 |
+
txt_list.append(new_string)
|
393 |
+
txt_list = refine_txt_list(txt_list, ann_info_list)
|
394 |
+
if document is not None:
|
395 |
+
txt_list.append(SEGMENT_SYMBOL)
|
396 |
+
txt_list = append_table_from_docx(doc=document, txt=txt_list)
|
397 |
+
for val in txt_list:
|
398 |
+
txt_string += val
|
399 |
+
return txt_string
|
pdf2txt_v3.py
ADDED
@@ -0,0 +1,555 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
"""
|
4 |
+
Created by Shengbo.Zhang on 2021/10/08
|
5 |
+
"""
|
6 |
+
|
7 |
+
|
8 |
+
import os
|
9 |
+
import re
|
10 |
+
import logging
|
11 |
+
import pdfplumber
|
12 |
+
from docx import Document
|
13 |
+
from Pdf2Txt.config import *
|
14 |
+
from pdf2docx import Converter
|
15 |
+
from collections import Counter
|
16 |
+
from Pdf2Txt.config import _check_ann_title_processable
|
17 |
+
|
18 |
+
|
19 |
+
# Silence the log output of pdf2docx's Converter. Disabling at WARNING
# suppresses every record of severity WARNING and below (INFO included).
logging.disable(logging.WARNING)
|
22 |
+
|
23 |
+
|
24 |
+
def get_string_and_abscissa_list_from_pdf(pdf_path):
|
25 |
+
'''
|
26 |
+
Read the text of a PDF line by line (body text only, tables excluded) together with the left x-coordinate of the first word of each line; the result is stored in a list.
|
27 |
+
:param pdf_path: string, path of the PDF file
|
28 |
+
:return: a list, string_abscissa_list, whose elements are [i, j]: i is the text of a whole line, j is the x-coordinate (distance from the left edge) of that line
|
29 |
+
'''
|
30 |
+
# NOTE(review): this view of the file carries no indentation; presumably the line-merging steps further down run once, after the words of all pages have been collected - confirm against the original file.
string_abscissa_list = []
|
31 |
+
temp_list = []
|
32 |
+
temp_string_list = []
|
33 |
+
temp_abscissa_list = []
|
34 |
+
with pdfplumber.open(pdf_path) as pdf:
|
35 |
+
for id, page in enumerate(pdf.pages):
|
36 |
+
bboxes = [table.bbox for table in page.find_tables()]
|
37 |
+
# Filter predicate: keep an object only if its centre point lies in none of the table bounding boxes
def _not_within_bboxes(obj):
|
38 |
+
def _obj_in_bbox(_bbox):
|
39 |
+
v_mid = (obj["top"] + obj["bottom"]) / 2
|
40 |
+
h_mid = (obj["x0"] + obj["x1"]) / 2
|
41 |
+
x0, top, x1, bottom = _bbox
|
42 |
+
return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
|
43 |
+
return not any(_obj_in_bbox(__bbox) for __bbox in bboxes)
|
44 |
+
new_page = page.filter(_not_within_bboxes)
|
45 |
+
words_list = new_page.extract_words()
|
46 |
+
for item in words_list:
|
47 |
+
# NOTE(review): the two ' ' literals read identically in this view; presumably one of them was a non-ASCII space in the original file - verify
text = item['text'].replace('\n', '').replace('\t', '').replace(' ', '').replace(' ', '').replace(',', '')
|
48 |
+
# Keep only the integer part of the word's x0 and top coordinates
x0 = int(str(item['x0']).split('.')[0])
|
49 |
+
y0 = int(str(item['top']).split('.')[0])
|
50 |
+
if text != '':
|
51 |
+
temp_list.append([text, x0, y0])
|
52 |
+
|
53 |
+
# Snap a word whose top differs from the previous word's top by at most 3 onto that value, so both count as the same line
for id, _ in enumerate(temp_list):
|
54 |
+
if id < len(temp_list)-1 and temp_list[id+1][2] != temp_list[id][2] and abs(temp_list[id+1][2] - temp_list[id][2]) <= 3:
|
55 |
+
temp_list[id+1][2] = temp_list[id][2]
|
56 |
+
|
57 |
+
# Merge runs of consecutive words with the same top value into one string; i marks the first word of the run, j scans ahead
i = 0
|
58 |
+
j = 1
|
59 |
+
while True:
|
60 |
+
if i < len(temp_list):
|
61 |
+
temp_str = temp_list[i][0]
|
62 |
+
while j < len(temp_list):
|
63 |
+
if temp_list[i][2] == temp_list[j][2]:
|
64 |
+
temp_str += temp_list[j][0]
|
65 |
+
else:
|
66 |
+
break
|
67 |
+
j += 1
|
68 |
+
# The scan reached the end of the word list: store the last run and stop
if i < len(temp_list)-1 and j == len(temp_list):
|
69 |
+
temp_string_list.append(temp_str)
|
70 |
+
temp_abscissa_list.append(temp_list[i][1])
|
71 |
+
break
|
72 |
+
temp_string_list.append(temp_str)
|
73 |
+
temp_abscissa_list.append(temp_list[i][1])
|
74 |
+
i = j
|
75 |
+
j += 1
|
76 |
+
# The next run consists of the final word only: store it and stop
if i == len(temp_list)-1 and j == len(temp_list):
|
77 |
+
temp_string_list.append(temp_list[i][0])
|
78 |
+
temp_abscissa_list.append(temp_list[i][1])
|
79 |
+
break
|
80 |
+
else:
|
81 |
+
break
|
82 |
+
|
83 |
+
# Pair every merged line with the x-coordinate of its first word
for i, j in zip(temp_string_list, temp_abscissa_list):
|
84 |
+
string_abscissa_list.append([i, j])
|
85 |
+
|
86 |
+
return string_abscissa_list
|
87 |
+
|
88 |
+
|
89 |
+
def get_ann_info_from_pdf(pdf_path):
    '''
    Get the header information of a PDF announcement file.

    The first 10 text lines of the first page are returned unfiltered; they
    may include lines that are not header data, which refine_txt_list()
    processes further.

    :param pdf_path: string, path of the PDF file
    :return: list of lines holding the announcement header information (e.g. security code,
             short name, announcement number); an empty list when the file cannot be read
    '''
    try:
        with pdfplumber.open(pdf_path) as pdf:
            return pdf.pages[0].extract_text().split('\n')[:10]
    except Exception:
        # Best effort by design: any read/parse failure means "no header".
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        return []
|
103 |
+
|
104 |
+
|
105 |
+
def get_document_from_pdf_converted_docx(pdf_path, docx_path):
    '''
    Convert a PDF to an intermediate Docx file and load it as a Document.

    The intermediate Docx file is deleted before returning.

    :param pdf_path: string, path of the PDF file
    :param docx_path: path for the intermediate Docx file; when it is an empty string the file is
                      created next to the PDF as <pdf name>_<TEMP_DOCX_SUFFIX>.docx
    :return: the Document instance loaded from the Docx file, or None when the conversion failed
    '''
    if docx_path == '':
        # os.path.join copes with an empty dirname; the former "dir//name"
        # f-string produced a root-anchored "//name" for bare file names.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        output_docx_file_path = os.path.join(os.path.dirname(pdf_path), f"{stem}_{TEMP_DOCX_SUFFIX}.docx")
    else:
        output_docx_file_path = docx_path

    document = None
    try:
        if get_docx_from_pdf(pdf_path=pdf_path, out_path=output_docx_file_path):
            document = Document(output_docx_file_path)
    finally:
        # The Docx file is only an intermediate artefact: remove it even when
        # loading it failed.
        if os.path.exists(output_docx_file_path):
            os.remove(output_docx_file_path)
    return document
|
122 |
+
|
123 |
+
|
124 |
+
def get_min_abscissa_value(abscissa_list, string_list_length):
    '''
    Pick the x-coordinate taken to be the left edge of the body text.

    Distinct x values are visited in ascending order and the first one that
    occurs at least ``string_list_length // 4`` times is returned. When no
    value is that frequent, the overall minimum is returned.

    :param abscissa_list: list of the starting x-coordinates of the text lines
    :param string_list_length: int, number of text lines of the PDF
    :return: int, the minimum x-coordinate of the body text blocks
    '''
    x_threshold = string_list_length // 4
    fallback = min(abscissa_list)
    for x, occurrences in sorted(Counter(abscissa_list).items()):
        if occurrences >= x_threshold:
            return x
    return fallback
|
141 |
+
|
142 |
+
|
143 |
+
def refine_txt_list(txt, ann_info, string_abscissa_dict):
    '''
    Apply the final formatting pass to the body-text line list of a PDF.

    The list has already been through a first round of processing; this pass
    adds or removes SEGMENT_SYMBOL around certain lines and blanks out lines
    that should be dropped. ``txt`` is modified in place and also returned.

    NOTE(review): the nesting below was reconstructed from a rendering that had
    lost its indentation -- confirm it against the original file.

    :param txt: list of str, the body-text lines of the PDF
    :param ann_info: list of str, the header lines of the announcement
    :param string_abscissa_dict: dict looked up by line text (with
                                 SEGMENT_SYMBOL removed); get_txt_from_pdf
                                 fills it with the x-coordinate of each line
    :return: the same list object ``txt`` after formatting
    '''
    # Format the announcement header information
    if ann_info != []:
        new_ann_info_list = []
        # Keep the non-blank header lines that come before the first line
        # ending in '有限公司'; inner whitespace runs collapse to one space.
        for i, val in enumerate(ann_info):
            if val.strip() == '': continue
            if val.strip()[-4:] == '有限公司': break
            else: new_ann_info_list.append(' '.join(val.split()) + SEGMENT_SYMBOL)
        # The last kept header line carries no trailing SEGMENT_SYMBOL.
        if new_ann_info_list != []:
            new_ann_info_list[-1] = new_ann_info_list[-1].replace(SEGMENT_SYMBOL, '')
        # When the body already starts with the '有限公司' line, make room at the
        # front so the header lines do not overwrite it.
        # NOTE(review): txt[0] raises IndexError when txt is empty.
        if txt[0].strip()[-4:] == '有限公司':
            for i in range(len(new_ann_info_list)):
                txt.insert(0, '')
        # Write the header lines over the first entries of txt.
        for i, val in enumerate(new_ann_info_list):
            txt[i] = val
    # Format the announcement title and the board-of-directors statement
    # (only indices 0..10 are inspected)
    for i, val in enumerate(txt):
        if i > 10: break
        else:
            val = val.strip()
            # A line that looks like a title gets a leading SEGMENT_SYMBOL.
            if _check_ann_title_processable(val):
                if SEGMENT_SYMBOL not in val:
                    txt[i] = (SEGMENT_SYMBOL + val)
            if val[-4:] == '有限公司':
                if SEGMENT_SYMBOL not in txt[i]:
                    txt[i] = (SEGMENT_SYMBOL + val)
                # NOTE(review): txt[i+1] .. txt[i+4] below are not bounds-checked.
                # Title is on the next line: join it to the '有限公司' line.
                if _check_ann_title_processable(txt[i+1]):
                    txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
                    # A following line starting with '本公司' begins its own segment.
                    if txt[i+2].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
                        if SEGMENT_SYMBOL not in txt[i+2]:
                            txt[i+2] = (SEGMENT_SYMBOL + txt[i+2])
                        txt[i+3] = txt[i+3].replace(SEGMENT_SYMBOL, '')
                    break
                # Title ends two lines further down: join both following lines.
                if _check_ann_title_processable(txt[i+2]):
                    txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
                    txt[i+2] = txt[i+2].replace(SEGMENT_SYMBOL, '')
                    if txt[i+3].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
                        if SEGMENT_SYMBOL not in txt[i+3]:
                            txt[i+3] = (SEGMENT_SYMBOL + txt[i+3])
                        txt[i+4] = txt[i+4].replace(SEGMENT_SYMBOL, '')
                    break
    # Second traversal of the text line list
    for i, _ in enumerate(txt):
        # Format section / sub-section numbering: a line that starts with a
        # numbering pattern gets a leading SEGMENT_SYMBOL
        if (SEGMENT_SYMBOL not in txt[i]):
            # An entry becomes 0 when the corresponding pattern matches;
            # re.match only matches at position 0, so .start() is 0 on success.
            match_check = [1, 1, 1, 1, 1]
            # Pattern: Chinese numeral followed by '、'
            match_1 = re.match('[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、', txt[i])
            # Pattern: one or two digits followed by '、'
            match_2 = re.match('[0-9]{1,2}、', txt[i])
            # Pattern: one or two digits followed by '.'
            match_3 = re.match('[0-9]{1,2}\.', txt[i])
            # Pattern: Chinese numeral in parentheses
            match_4 = re.match('[\(\(]+[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}[\)\)]+', txt[i])
            # Pattern: one or two digits in parentheses
            match_5 = re.match('[\(\(]+[0-9]{1,2}[\)\)]+', txt[i])
            if match_1: match_check[0] = match_1.start()
            if match_2: match_check[1] = match_2.start()
            if match_3: match_check[2] = match_3.start()
            if match_4: match_check[3] = match_4.start()
            if match_5: match_check[4] = match_5.start()
            if 0 in match_check:
                txt[i] = SEGMENT_SYMBOL + txt[i]
        # Make a line starting with '重要内容提示' stand on its own
        if ('重要内容提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('重要内容提示') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
            if txt[i][-1] != '\n':
                txt[i] += SEGMENT_SYMBOL
        # Drop leftover unit lines ('单位:元', '单位:人民币元')
        if (txt[i] == '单位:元') or (txt[i] == SEGMENT_SYMBOL + '单位:元'):
            txt[i] = ''
        if (txt[i] == '单位:人民币元') or (txt[i] == SEGMENT_SYMBOL + '单位:人民币元'):
            txt[i] = ''
        # Make a line starting with '特别提示' stand on its own
        if ('特别提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特别提示') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
            if txt[i][-1] != '\n':
                txt[i] += SEGMENT_SYMBOL
        # Make a line starting with '特此公告' stand on its own
        if ('特此公告' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特此公告') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
            if txt[i][-1] != '\n':
                txt[i] += SEGMENT_SYMBOL
        # Break the line before an attachment heading ('附件:', '附件1:')
        match_6 = re.match('附件[0-9]{0,2}:', txt[i])
        if match_6:
            if match_6.start() == 0:
                txt[i] = SEGMENT_SYMBOL + txt[i]
        # When this line equals the next one, blank this one (keep a single copy)
        if (i+1) < len(txt) and (txt[i] == txt[i+1]):
            txt[i] = ''
        # Bullet-style paragraphs: remove the bullet and join the continuation
        # lines that follow it.
        # NOTE(review): the first literal in these tests is an empty string in
        # this rendering, which makes ('' in txt[i]) always true; presumably it
        # held a non-printable bullet glyph -- confirm against the original file.
        if ('' in txt[i]) or ('●' in txt[i]):
            txt[i] = txt[i].replace('', '').replace('●', '')
            for idx in range(i+1, len(txt)-1):
                if ('' in txt[idx]) or ('●' in txt[idx]):
                    break
                txt[idx] = txt[idx].replace(SEGMENT_SYMBOL, '')
                # Stop once the next line starts further left than this one.
                # NOTE(review): raises KeyError when either text is not a key
                # of string_abscissa_dict.
                if string_abscissa_dict[txt[idx+1].replace(SEGMENT_SYMBOL, '')] < string_abscissa_dict[txt[idx].replace(SEGMENT_SYMBOL, '')]:
                    break
        # Blank a later line whose text (ignoring SEGMENT_SYMBOL and spaces) is
        # contained in the first line -- a page header repeated on later pages
        if i != 0 and txt[i].replace(SEGMENT_SYMBOL, '').replace(' ', '') in txt[0].replace(SEGMENT_SYMBOL, '').replace(' ', ''):
            txt[i] = ''
        # Blank page-number markers of the forms 'N/M', '第N页' and '-N-'
        # NOTE(review): replace('', '') is a no-op as written; presumably the
        # first argument was a character lost in this rendering.
        if (re.match('^[0-9]{1,2}/[0-9]{1,2}', txt[i].strip().replace('', ''))) or \
            (re.match('^第[0-9]{1,2}页', txt[i].strip().replace('', ''))) or \
            (re.match(r'^-[0-9]{1,2}-', txt[i].strip().replace('', ''))):
            txt[i] = ''
    return txt
|
258 |
+
|
259 |
+
|
260 |
+
def get_docx_from_pdf(pdf_path, out_path):
    '''
    Read a PDF file, convert it to Docx and store the result locally.

    Errors raised while constructing the Converter propagate to the caller;
    errors raised by the conversion itself are reported as False.

    :param pdf_path: full path of the input PDF announcement file
    :param out_path: full path of the intermediate Docx file to write
    :return: bool, True only when the conversion ran without raising and every
             page of the converter is marked as finalized
    '''
    cv = Converter(pdf_path)
    try:
        try:
            cv.convert(out_path)
        except Exception:
            return False
        # A page that was not finalized means the conversion is incomplete.
        return all(p.finalized for p in cv.pages)
    finally:
        # Single release point: the converter is closed on every exit path.
        cv.close()
|
279 |
+
|
280 |
+
|
281 |
+
def _get_table_row_feat(str):
|
282 |
+
'''
|
283 |
+
给定一个空格分割的表格行字符串,计算它的特征(01组成的字符串)
|
284 |
+
:param str: 字符串
|
285 |
+
:return: 字符串
|
286 |
+
'''
|
287 |
+
s = str.split()
|
288 |
+
r = ''
|
289 |
+
for c in s:
|
290 |
+
try:
|
291 |
+
_ = float(c)
|
292 |
+
r += '1'
|
293 |
+
except Exception:
|
294 |
+
r += '0'
|
295 |
+
return r
|
296 |
+
|
297 |
+
|
298 |
+
def append_table_from_docx(doc, txt):
    '''
    Read every table of the Docx document, format its rows and append them to
    the text line list of the PDF.

    ``txt`` is modified in place (rows are appended, and a statement row may be
    inserted near the title) and also returned.

    NOTE(review): the nesting below was reconstructed from a rendering that had
    lost its indentation -- confirm it against the original file.

    :param doc: a Document instance
    :param txt: list of str, the body-text lines of the PDF
    :return: the same list object ``txt`` with the table lines appended
    '''
    # Only ever appended to; never read inside this function.
    data = []
    table_txt = []
    # The marker is padded with one '-' on each side so that the final slice
    # val[1:][:-2] (which strips '^' and '$\n' from data rows) turns it into
    # TABLE_SYMBOL followed by '\n'.
    table_tag = '-' + TABLE_SYMBOL + '-'
    for table in doc.tables[:]:
        # Every table is wrapped in a marker line before and after its rows.
        table_txt.append(f'{table_tag}\n')
        for i, row in enumerate(table.rows[:]):
            row_content = []
            for cell in row.cells[:]:
                c = cell.text
                # Strip line breaks, spaces, tabs and commas from the cell text.
                new_c = c.replace('\n', '').replace(' ','').replace('\t','').replace(',','')
                row_content.append(new_c)
            if row_content == []: continue
            # A row whose first cell contains '本公司' is handled as body text
            # rather than table data.
            if '本公司' in row_content[0]:
                # Only insert it when none of the first 10 body lines already
                # contains '本公司'.
                local_flag = True
                for val in txt[:10]:
                    if '本公司' in val:
                        local_flag = False
                        break
                if local_flag:
                    tmp = SEGMENT_SYMBOL
                    for line in row_content:
                        tmp += line.strip()
                    # Put a SEGMENT_SYMBOL right after '特别提示'.
                    if '特别提示' in tmp:
                        tmp = tmp[:tmp.index('特别提示')+4]+SEGMENT_SYMBOL+tmp[tmp.index('特别提示')+4:]
                    # Insert right after the first title-like line among
                    # indices 0..10 of txt.
                    for id, val in enumerate(txt):
                        if id > 10: break
                        else:
                            if _check_ann_title_processable(val):
                                txt.insert(id+1, tmp)
                                break
                continue
            # Rows whose first cell contains '证券代码' are skipped.
            if '证券代码' in row_content[0]:
                continue
            data.append(row_content)
            # Rows are temporarily wrapped as '^...$\n'; a row with no cell
            # content is not emitted.
            new_row = '^' + TABLE_CELL_SYMBOL.join(row_content) + '$\n'
            if new_row.replace(TABLE_CELL_SYMBOL, '') != '^$\n':
                table_txt.append(new_row)
        data.append(f'{table_tag}\n')
        table_txt.append(f'{table_tag}\n')
    # Collapse runs of consecutive marker lines into a single marker by turning
    # the extra ones into the '^$\n' placeholder, which is filtered out below.
    flag = False
    for i, val in enumerate(table_txt):
        if val == f'{table_tag}\n':
            if not flag:
                flag = True
            else:
                table_txt[i] = '^$\n'
        else:
            flag = False
    table_txt = list(filter(lambda x: x != '^$\n', table_txt))
    # Remove a marker when the rows on both sides of it share the same
    # numeric/non-numeric signature, which merges the two tables.
    for i, val in enumerate(table_txt):
        if val == f'{table_tag}\n' and (i > 0) and (i < len(table_txt)-1):
            feat1 = _get_table_row_feat(table_txt[i-1].replace('\n', ''))
            feat2 = _get_table_row_feat(table_txt[i+1].replace('\n', ''))
            if feat1 == feat2:
                table_txt[i] = '^$\n'
    # A lone marker with no rows is discarded.
    if len(table_txt) == 1 and table_txt[0] == f'{table_tag}\n':
        table_txt[0] = '^$\n'
    for i, val in enumerate(table_txt):
        # Never true here: every marker entry was appended with a trailing
        # '\n', so markers fall through to the slice at the end of this loop.
        if val == table_tag:
            continue
        if val == '^$\n':
            table_txt[i] = ''
            continue
        # Drop the first character and the last two characters, then re-add
        # the line break.
        table_txt[i] = val[1:][:-2] + '\n'
    txt.extend(table_txt)
    return txt
|
371 |
+
|
372 |
+
|
373 |
+
def output_txt_string(txt_path, txt_string):
    '''
    Write the formatted text of a PDF announcement to a plain .txt file.

    The file is written as UTF-8 and an existing file is overwritten.

    :param txt_path: path of the text file to write
    :param txt_string: the plain-text string of the PDF announcement
    :return: bool, True when the text was written, False when opening or
             writing the file raised an error
    '''
    try:
        with open(txt_path, "w", encoding='utf-8') as f:
            f.write(txt_string)
    except Exception:
        # Failures are reported through the return value; unlike a bare
        # "except:", KeyboardInterrupt and SystemExit are not swallowed.
        return False
    return True
|
391 |
+
|
392 |
+
|
393 |
+
def refine_table_txt(txt):
    '''
    Post-process the table part of the text list: clean up the first rows of
    each table and merge header rows that were split over several lines.

    Everything before the first table marker line (TABLE_SYMBOL + '\\n') is
    copied unchanged. When there is no marker line at all, the result is a
    plain copy of ``txt``.

    NOTE(review): the nesting was reconstructed from a rendering that had lost
    its indentation -- confirm it against the original file.

    :param txt: list of str, body-text lines followed by the appended table lines
    :return: a new list of str; rows that were merged away remain as ''
    '''
    table_marker = f'{TABLE_SYMBOL}\n'

    # Split at the first table marker. Without a marker the table part is
    # empty, so the last body line is no longer emitted twice.
    try:
        first_table = txt.index(table_marker)
    except ValueError:
        first_table = len(txt)
    new_txt_list = list(txt[:first_table])
    table_txt = list(filter(None, txt[first_table:]))

    # Pass 1: clean up the row that directly follows each table marker.
    for i, _ in enumerate(table_txt):
        if table_txt[i] != table_marker or i + 2 >= len(table_txt):
            continue
        pre_cut = table_txt[i + 1].split(TABLE_CELL_SYMBOL)
        # Drop a first row that consists of a single cell (optionally preceded
        # by one empty cell).
        if (len(pre_cut) == 1) or (len(pre_cut) == 2 and pre_cut[0] == ''):
            table_txt[i + 1] = ''
        # Drop a first row that contains '公司及董事会'.
        if '公司及董事会' in table_txt[i + 1]:
            table_txt[i + 1] = ''
        # A table with exactly one row: move that row in front of the marker.
        if table_txt[i + 2] == table_marker:
            table_txt[i] = ''
            table_txt[i + 2] = table_txt[i + 1]
            table_txt[i + 1] = table_marker

    # Pass 2: merge header rows that span several lines.
    table_txt = list(filter(None, table_txt))
    for i, _ in enumerate(table_txt):
        if table_txt[i] != table_marker or i + 2 >= len(table_txt):
            continue

        # Voting tables: each vote label ('同意', '反对', '弃权') appears in two
        # adjacent cells; rename them '<label>票数' / '<label>比例' and drop the
        # second header row.
        if '同意' in table_txt[i + 1] and table_txt[i + 1].count('同意') == 2:
            cut = table_txt[i + 1].split(TABLE_CELL_SYMBOL)
            for k, val in enumerate(cut):
                if val in ('同意', '反对', '弃权'):
                    cut[k] += '票数'
                    # Guard against a vote label sitting in the last cell.
                    if k + 1 < len(cut):
                        cut[k + 1] += '比例'
            table_txt[i + 1] = TABLE_CELL_SYMBOL.join(cut).replace(SEGMENT_SYMBOL, '') + SEGMENT_SYMBOL
            table_txt[i + 2] = ''
            continue

        set_cut1 = _ordered_unique_cells(table_txt[i + 1])
        set_cut2 = _ordered_unique_cells(table_txt[i + 2])

        # The first two rows are merged when a pair of cells at the same
        # position is equal and at least one more pair follows it; the longer
        # of the two rows becomes the header.
        head_cut = []
        counter = 0
        for val1, val2 in zip(set_cut1, set_cut2):
            if counter:
                head_cut = set_cut1 if len(set_cut1) > len(set_cut2) else set_cut2
                break
            if val1 == val2:
                counter += 1
        if counter and head_cut:
            table_txt[i + 1] = TABLE_CELL_SYMBOL.join(head_cut)
            table_txt[i + 2] = ''

        if not counter or i + 4 >= len(table_txt):
            continue

        # Look at rows 3 and 4 as possible further header lines, unless row 3
        # contains a number-like cell (pattern kept exactly as before; note
        # that its '.' is unescaped and matches any character).
        set_cut3 = _ordered_unique_cells(table_txt[i + 3])
        if any(re.match(r'^[0-9]+(|.)[0-9]+(|%)$', val3) for val3 in set_cut3):
            continue

        set_cut4 = _ordered_unique_cells(table_txt[i + 4])
        counter_2 = 0
        for val3, val4 in zip(set_cut3, set_cut4):
            if counter_2:
                head_cut = set_cut4 if len(set_cut4) > len(set_cut3) else set_cut3
                break
            if val3 == val4:
                counter_2 += 1
        if counter_2 and head_cut:
            table_txt[i + 1] = TABLE_CELL_SYMBOL.join(head_cut)
            table_txt[i + 2] = ''
            table_txt[i + 3] = ''
            table_txt[i + 4] = ''

    new_txt_list.extend(table_txt)
    return new_txt_list


def _ordered_unique_cells(row):
    '''Return the distinct non-empty cells of a row, in first-seen order.'''
    return [cell for cell in dict.fromkeys(row.split(TABLE_CELL_SYMBOL)) if cell]
|
505 |
+
|
506 |
+
|
507 |
+
def get_txt_from_pdf(pdf_path, docx_path=''):
    '''
    Turn a PDF announcement file into a formatted plain-text string.

    :param pdf_path: str, path of the PDF file
    :param docx_path: str, path for the intermediate Docx file ('' lets
                      get_document_from_pdf_converted_docx choose one)
    :return: str, the formatted text (body text first, tables after it);
             '' when the header lines, the text lines or the Docx document
             could not be obtained
    '''
    ann_info_list = get_ann_info_from_pdf(pdf_path)
    string_abscissa_list = get_string_and_abscissa_list_from_pdf(pdf_path)
    document = get_document_from_pdf_converted_docx(pdf_path, docx_path)

    # Nothing to format unless all three sources are available.
    if ann_info_list == [] or string_abscissa_list == [] or document is None:
        return ''

    body_x = get_min_abscissa_value([entry[1] for entry in string_abscissa_list],
                                    len(string_abscissa_list))

    # Among the first entries (index 0..10), everything before the line ending
    # in '有限公司' is given the body x-coordinate.
    for position, entry in enumerate(string_abscissa_list):
        if position > 10 or entry[0].replace('\n', '')[-4:] == '有限公司':
            break
        string_abscissa_list[position][1] = body_x

    string_abscissa_dict = {}
    txt_list = []
    for entry in string_abscissa_list:
        text, x_value = entry[0], entry[1]
        # Skip entries that are just a number of up to three digits.
        if len(text) <= 3 and text.isdigit():
            continue
        string_abscissa_dict[text] = x_value
        # A line starting more than 8 units right of the body edge opens a
        # new segment.
        if x_value > body_x and not abs(x_value - body_x) <= 8:
            txt_list.append(SEGMENT_SYMBOL + text)
        else:
            txt_list.append(text)

    txt_list = refine_txt_list(txt_list, ann_info_list, string_abscissa_dict)

    # Separate the body text from the tables that follow it.
    txt_list.append(SEGMENT_SYMBOL)
    txt_list = append_table_from_docx(doc=document, txt=txt_list)
    txt_list = refine_table_txt(txt_list)

    return ''.join(txt_list)
|