FrankWu
/

Application

Model card Files Files and versions Community

FrankWu committed on Mar 31, 2024

Commit

e2dccf7

verified ·

1 Parent(s): b583704

Upload 5 files

Browse files

Files changed (5) hide show

config.py +64 -0
pdf2txt_test.py +207 -0
pdf2txt_v1.py +630 -0
pdf2txt_v2.py +399 -0
pdf2txt_v3.py +555 -0

config.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# -*- coding: utf-8 -*-
+"""
+Created by Shengbo.Zhang on 2021/10/12
+"""
+# Separator placed between the paragraphs of the body text
+SEGMENT_SYMBOL = '\n'
+# Separator placed between tables (no trailing newline '\n' needs to be added)
+TABLE_SYMBOL = '-----表格-----'
+# Separator placed between the cells of a table
+TABLE_CELL_SYMBOL = '\t'
+# Name suffix used for temporarily generated Docx files
+TEMP_DOCX_SUFFIX = 'TEMP_DOCX'
+# 筛选可处理的公告文件标题特点
+ANNOUNCEMENT_TITLE_FEATURE = [['公告', -2],
+                              ['通知', -2],
+                              ['说明', -2],
+                              ['意见', -2],
+                              ['预告', -2],
+                              ['快报', -2],
+                              ['摘要', -2],
+                              ['意见函', -3],
+                              ['回复函', -3],
+                              ['意见书', -3]]
+def _check_ann_title_processable(title, exp=0):
+    if exp == 0:
+        for item in ANNOUNCEMENT_TITLE_FEATURE:
+            if title[item[1]:] == item[0]:
+                return True
+    elif exp == 1:
+        for item in ANNOUNCEMENT_TITLE_FEATURE:
+            if title[item[1]-1:] == item[0]+'\n':
+                return True
+    elif exp == 2:
+        for item in ANNOUNCEMENT_TITLE_FEATURE:
+            if title[-2+item[1]:-2] == item[0]:
+                return True
+    return False
+# Corpus of first-level proper terms (field labels ending with a full-width colon)
+FIRST_PROPER_CORPUS = ['被担保人名称：', '本次担保金额及累计为其担保金额：', '本次是否有反担保：', '对外担保逾期的累计数量：',
+                       '企业名称：', '注册资本：', '经营范围：', '法定代表人：', '注册地址：', '财务状况（以下数据未经审计）：',
+                       '担保方式：', '担保期限：', '担保金额：', '担保额度：',
+                       '主体要求：', '成立年限要求：', '客户类型要求：', '商业信用要求：', '反担保要求：', '资金安全性要求：',
+                       '住所：', '成立日期：', '统一社会信用代码：', '甲方：', '乙方：', '甲方承诺：', '乙方承诺：', '理由：',
+                       '本次会议是否有否决议案：', '审议结果：',
+                       '律师事务所：', '律师：', '结论意见：',
+                       '股东大会召开日期：', '网络投票系统：', '股东大会类型和届次', '股东大会类型和届次：', '股东大会召集人：',
+                       '投票方式：', '召开的日期时间：', '召开地点：', '召开日期：', '起止时间：',
+                       '各议案已披露的时间和披露媒体：', '特别决议议案：', '对中小投资者单独计票的议案：', '涉及关联股东回避表决的议案：',
+                       '应回避表决的关联股东名称：', '涉及优先股股东参与表决的议案：', '登记地点：', '登记时间：',
+                       '联系人：', '联系电话：', '传真：', '地址：', '邮编：',
+                       '案件所属的诉讼阶段：', '上市公司子公司所处的当事人地位：', '涉案的金额：', '是否会对上市公司损益产生负面影响：',
+                       '原告：', '被告：', '住所地：', '诉讼机构名称：', '上市公司控股子公司所处的当事人地位：',
+                       '归属于上市公司股东的净利润：', '归属于上市公司股东的扣除非经常性损益的净利润：', '每股收益：'] + \
+                      [f"甲方{i}：" for i in '一二三四五六七八九十'] + [f"乙方{i}：" for i in '一二三四五六七八九十']
+# Corpus of second-level proper terms (terms that occur inside the paragraph of a first-level term
+# and should not start a paragraph of their own)
+SECOND_PROPER_CORPUS = ['许可经营项目：', '一般经营项目：']

pdf2txt_test.py ADDED Viewed

	@@ -0,0 +1,207 @@

+# -*- coding: utf-8 -*-
+'''
+Created by Shengbo.Zhang on 2021/08/13
+'''
+import sys
+import time
+##################################################
+############## 算法：PDF2TXT_V3.py ################
+##############       测试示例      ################
+##################################################
+from Pdf2Txt.pdf2txt_v1 import find_all_local_file
+from Pdf2Txt.pdf2txt_v3 import *
+while True:
+    count_total = 0
+    count_success = 0
+    count_failed = 0
+    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
+    if test_file_dir == 'exit':
+        sys.exit()
+    print('*****************************************************')
+    t1 = time.time()
+    for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
+        count_total += 1
+        pdf_file_path = path
+        pdf_dir_path = os.path.dirname(path)
+        pdf_file_name = os.path.basename(path)[:-4]
+        output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
+        print(f'开始处理: 第 {idx + 1} 个文件...')
+        print(f'文件名: {pdf_file_name}.pdf')
+        tt1 = time.time()
+        try:
+            txt_string = get_txt_from_pdf(pdf_file_path)
+            if txt_string != '':
+                output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
+                count_success += 1
+                print('处理成功.')
+            else:
+                count_failed += 1
+                print('处理失败！')
+        except Exception as e:
+            print(e)
+            count_failed += 1
+            print('处理失败！')
+        tt2 = time.time()
+        print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
+        print('*****************************************************')
+    t2 = time.time()
+    print('\n所有PDF格式的公告文件已处理完毕！')
+    print(f'文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}')
+    print('执行耗时：', round(t2-t1, 3), '秒')
+    print('平均耗时：', round((t2-t1)/count_total, 3), '秒/个')
+# ##################################################
+# ############## 算法：PDF2TXT_V2.py ################
+# ##############       测试示例      ################
+# ##################################################
+# from Pdf2Txt.pdf2txt_v1 import find_all_local_file
+# from Pdf2Txt.pdf2txt_v2 import *
+# while True:
+#     count_total = 0
+#     count_success = 0
+#     count_failed = 0
+#
+#     test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
+#     if test_file_dir == 'exit':
+#         sys.exit()
+#
+#     print('*****************************************************')
+#     t1 = time.time()
+#     for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
+#         count_total += 1
+#
+#         pdf_file_path = path
+#         pdf_dir_path = os.path.dirname(path)
+#         pdf_file_name = os.path.basename(path)[:-4]
+#         output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
+#
+#         print(f'开始处理: 第 {idx + 1} 个文件...')
+#         print(f'文件名: {pdf_file_name}.pdf')
+#         tt1 = time.time()
+#         try:
+#             txt_string = get_txt_from_pdf(pdf_file_path)
+#             if txt_string != '':
+#                 output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
+#                 count_success += 1
+#                 print('处理成功.')
+#             else:
+#                 count_failed += 1
+#                 print('处理失败！')
+#         except Exception as e:
+#             print(e)
+#             count_failed += 1
+#             print('处理失败！')
+#         tt2 = time.time()
+#         print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
+#
+#         print('*****************************************************')
+#
+#     t2 = time.time()
+#     print('\n所有PDF格式的公告文件已处理完毕！')
+#     print(f'文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}')
+#     print('执行耗时：', round(t2-t1, 3), '秒')
+#     print('平均耗时：', round((t2-t1)/count_total, 3), '秒/个')
+# ##################################################
+# ############## 算法：PDF2TXT_V1.py ################
+# ##############       测试示例      ################
+# ##################################################
+# from Pdf2Txt.pdf2txt_v1 import *
+# while True:
+#     count_total = 0
+#     count_success = 0
+#     count_failed = 0
+#
+#     test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
+#     if test_file_dir == 'exit':
+#         sys.exit()
+#     txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符  2. 不带段头段尾标识符（默认，按enter键） ')
+#     if txt_output_mode == '1':
+#         txt_output_mode = True
+#     else:
+#         txt_output_mode = False
+#
+#     print('*****************************************************')
+#     for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
+#         count_total += 1
+#
+#         pdf_file_path = path
+#         pdf_dir_path = os.path.dirname(path)
+#         pdf_file_name = os.path.basename(pdf_file_path)[:-4]
+#         output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx"
+#         output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
+#         output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv"
+#
+#         t1 = time.time()
+#         is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path)
+#         t2 = time.time()
+#         print(f'开始处理: 第 {idx + 1} 个文件...')
+#         print(f'文件名: {pdf_file_name}.pdf')
+#         print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验！')
+#         print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms')
+#
+#         if not is_success:
+#
+#             count_failed += 1
+#             print(f'文件: {pdf_file_path}')
+#             print('错误: 原始pdf与生成的docx文件页数校验失败，拒绝进行下一步转换.')
+#             # 校验失败的原因在于pdf2docx有暂无法处理少量包含特殊layout的pdf文件，待原作者更新；
+#             # 若发生校验失败，后续可考虑直接丢弃该公告数据，或使用_get_txt_from_pdf()函数作直接转换。
+#
+#         else:
+#
+#             document = Document(output_docx_file_path)
+#
+#             is_success, txt_list = get_txt_from_docx(doc=document)
+#             t3 = time.time()
+#             print('步骤-2: 公告docx文件的段落提取与格式化已完成！')
+#             print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms')
+#
+#             if not is_success:
+#                 count_failed += 1
+#                 print(f'文件: {pdf_file_path}')
+#                 print('错误: 原始docx转换为txt文本的过程中出错，拒绝进行下一步转换.')
+#             else:
+#                 txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path,
+#                                                             is_out_flag=False)
+#                 t4 = time.time()
+#                 print('步骤-3: 公告docx文件的表格提取与格式化已完成！')
+#                 print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms')
+#
+#                 txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list)
+#                 t5 = time.time()
+#                 print('步骤-4: 公告txt文件的校对已完成！')
+#                 print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms')
+#
+#                 write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode)
+#                 str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode)
+#                 t6 = time.time()
+#                 print('步骤-5: 公告txt文件的输出已完成！')
+#                 print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms')
+#
+#                 print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms')
+#                 count_success += 1
+#
+#         if os.path.exists(output_docx_file_path):
+#             os.remove(output_docx_file_path)
+#         if os.path.exists(output_csv_file_path):
+#             os.remove(output_csv_file_path)
+#         print('*****************************************************')
+#
+#     print('\n所有PDF格式的公告文件已处理完毕！')
+#     print(f'【文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}】')

pdf2txt_v1.py ADDED Viewed

	@@ -0,0 +1,630 @@

+# -*- coding: utf-8 -*-
+"""
+Created by Shengbo.Zhang on 2021/08/13
+"""
+import io
+import re
+import os
+import csv
+import logging
+from docx import Document
+from pdf2docx import Converter
+from Pdf2Txt.config import *
+from pdfminer.layout import LAParams
+from pdfminer.pdfpage import PDFPage
+from pdfminer.converter import TextConverter
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from Pdf2Txt.config import _check_ann_title_processable
+# Turn off the log output of the Converter in the pdf2docx module.
+# NOTE(review): logging.disable() is called twice; the later WARNING call presumably
+# supersedes the INFO one - confirm against the logging documentation.
+logging.disable(logging.INFO)
+logging.disable(logging.WARNING)
+def _get_txt_from_pdf(pdf_path, out_path):
+    '''
+    读取Pdf文件，直接将其转换为Txt文本格式
+    :param pdf_path: 输入的pdf公告文件的完整路径
+    :param out_path: 输出的txt结果文件的完整路径
+    :return: bool
+    '''
+    manager = PDFResourceManager()
+    output = io.StringIO()
+    converter = TextConverter(manager, output, laparams=LAParams())
+    interpreter = PDFPageInterpreter(manager, converter)
+    with open(pdf_path, 'rb') as infile:
+        content = []
+        for page in PDFPage.get_pages(infile, check_extractable=True):
+            interpreter.process_page(page)
+            convertedPDF = output.getvalue()
+        # print(convertedPDF)
+        content.append(convertedPDF)
+    # print(len(content))
+    # print(content)
+    for idx, val in enumerate(content):
+        val = re.sub('\n+','\n', val)
+        val = re.sub('\n +', '', val)
+        val = val.replace('', '')
+        content[idx] = val
+    with open(out_path, 'wb') as f:
+        f.write(''.join(content).encode('utf-8'))
+    output.close()
+    converter.close()
+    f.close()
+    return True
+def _get_cleaned_txt(txtPath, out_path):
+    '''
+    对Txt文件进行内容格式清洗（暂时仅供测试）
+    :param txtPath: 输入的txt文件的完整路径
+    :param out_path: 输出的txt文件的完整路径
+    :return: bool
+    '''
+    with open(txtPath, 'rb')as f:
+        content = f.read().decode('utf-8')
+    p = re.compile(r'(?<=##)\S.+(?=##)|[\u4e00-\u9fff+\u3002\uFF0C]')
+    x = ''.join(re.findall(p, content))
+    final_result = re.sub(u"[\uFF0C|\u3002|\u002B]{2,}", "", x)
+    with open(out_path, "w")as txtPath:
+        txtPath.write(final_result)
+    # print(final_result)
+    return True
+def get_docx_from_pdf(pdf_path, out_path):
+    '''
+    读取Pdf文件，将其转换为Docx格式并存在本地
+    :param pdf_path: 输入的pdf公告文件的完整路径
+    :param out_path: 输出的中间docx结果文件的完整路径
+    :return: bool
+    '''
+    try:
+        cv = Converter(pdf_path)
+        cv.convert(out_path)
+    except Exception:
+        return False
+    for p in cv.pages:
+        if not p.finalized:
+            cv.close()
+            return False
+    cv.close()
+    return True
+def _find_key_indexs(str, key):
+    '''
+    给定一个父字符串和子串，在父串中查找子串的所有索引位置，并返回一个包含所有下标的列表
+    :param str: 父字符串
+    :param key: 子字符串
+    :return: list
+    '''
+    lstKey = []
+    countStr = str.count(key)
+    if countStr < 1:
+        return []
+    elif countStr == 1:
+        indexKey = str.find(key)
+        return [indexKey]
+    else:
+        indexKey = str.find(key)
+        lstKey.append(indexKey)
+        while countStr > 1:
+            str_new = str[indexKey + 1:len(str) + 1]
+            indexKey_new = str_new.find(key)
+            indexKey = indexKey + 1 + indexKey_new
+            lstKey.append(indexKey)
+            countStr -= 1
+        lstKey.sort(reverse=True)
+        return lstKey
+def _insert_char_into_str(str, idx, char):
+    '''
+    给定一个父字符串、下标位置、子串，在父串中的下标位置插入子串，并返回一个新的字符串
+    :param str: 父字符串
+    :param idx: 插入位置索引
+    :param char: 子字符串
+    :return: str
+    '''
+    tmp = list(str)
+    tmp.insert(idx, char)
+    return ''.join(tmp)
+def _is_chinese(str):
+    '''
+    给定一个字符串，判断该字符串是否全是中文
+    :param str: 字符串
+    :return: bool
+    '''
+    for ch in str:
+        if '\u4e00' <= ch <= '\u9fff':
+            return True
+    return False
+def _get_table_row_feat(str):
+    '''
+    给定一个空格分割的表格行字符串，计算它的特征（01组成的字符串）
+    :param str: 字符串
+    :return: 字符串
+    '''
+    s = str.split()
+    r = ''
+    for c in s:
+        try:
+            _ = float(c)
+            r += '1'
+        except Exception:
+            r += '0'
+    return r
+def _check_if_include_first_proper(s, corpus):
+    '''
+    检查字符串s中是否包含语料列表first_corpus中的某一内容
+    :param s: 字符串
+    :param corpus: 字符串列表
+    :return: [bool, str]
+    '''
+    for i in corpus:
+        if i in s:
+            return [True, i]
+    return [False, '']
+def _check_if_include_second_proper(s, corpus):
+    '''
+    检查字符串s中是否包含语料列表first_corpus中的某一内容
+    :param s: 字符串
+    :param corpus: 字符串列表
+    :return: list
+    '''
+    res = []
+    for i in corpus:
+        if i in s:
+            res.append([True, i])
+        else:
+            res.append([False, i])
+    return res
+def _match_and_insert(string, pattern, substring):
+    '''
+    匹配string字符串中的pattern，计算所有pattern在string中的首个字符索引位置，并在string从后向前插入substring至这些位置
+    :param string: 待匹配的字符串
+    :param pattern: 匹配模式
+    :param substring: 待插入的子字符串
+    :return: 插入后的字符串
+    '''
+    idx_list = []
+    for j in re.finditer(pattern, string):
+        idx_list.append(j.span()[0])
+    # 将匹配模式的所有索引下标进行倒序排列，方便后续插入end_flag
+    idx_list.sort(reverse=True)
+    if idx_list != []:
+        for k in idx_list:
+            if k > 0 and string[k-1] != '“':
+                string = _insert_char_into_str(string, k, substring)
+    return string
+def _match_and_delete(string, pattern):
+    '''
+    匹配string字符串中的pattern，计算pattern在string中的首个字符索引位置，删除该索引前2个位置的换行符\n
+    :param string: 待匹配的字符串
+    :param pattern: 匹配模式
+    :return: 删除'\n\n'子字符串后的字符串
+    '''
+    matcher = re.search(pattern, string)
+    if matcher:
+        k = matcher.span()[0]
+        if k >= 2 and string[k-1] == '\n' and string[k-2] == '\n':
+            string = string[:k-2] + string[k:]
+    return string
+def get_txt_from_docx(doc):
+    '''
+    Read the text of every paragraph of a Docx document and re-flow it into formatted paragraphs.
+    Any exception raised while formatting is swallowed and reported through the bool result.
+    :param doc: a Document instance (only its `paragraphs` are read)
+    :return: bool (whether the conversion succeeded), list (the formatted text; empty on failure)
+    '''
+    # Characters used as numbering marks in announcements, e.g. '（1）', '1、'
+    NUMBER_1 = '123456789一二三四五六七八九十'
+    # Digits plus upper- and lower-case English letters
+    NUMBER_2 = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+    # Collect each paragraph's text, terminated by a newline, into the initial list paras
+    paras = [para.text+'\n' for i, para in enumerate(doc.paragraphs)]
+    # new_paras holds the text after the first formatting pass
+    new_paras = []
+    # Length of each line kept in new_paras (measured before its leading whitespace is stripped)
+    new_paras_len_cnt = []
+    try:
+        # Walk over every string in paras
+        for val in paras:
+            # Skip lines that are empty, look like a page number (digits followed by a space), or start with '单位'
+            if val == '\n' or re.search('^[0-9]+ \n$', val) or val[:2] == '单位':
+                continue
+            # Otherwise keep the line (leading whitespace removed) in new_paras
+            new_paras.append(val.lstrip())
+            # Record the character length of the line
+            new_paras_len_cnt.append(len(val))
+        # Body marker: index of the line where the body text starts
+        line_mark = 0
+        # Walk over the first 10 lines of new_paras to handle the announcement header,
+        # e.g. security code, security abbreviation, announcement number, title
+        for i, val in enumerate(new_paras[:10]):
+            # If the line contains a tab or more than two spaces, collapse each whitespace run to one space
+            if '\t' in val or val.count(' ') > 2:
+                new_paras[i] = ' '.join(val.split()) + '\n'
+                if '证券代码：' in new_paras[i]:
+                    continue
+            # If the line ends with '有限公司', remove any spaces it contains
+            if val.replace(' ', '')[-5:] == '有限公司\n':
+                new_paras[i] = val.replace(' ', '')
+                continue
+            # Keep checking lines until one ends with a processable title suffix (see
+            # _check_ann_title_processable; the original note says this should stay in step with
+            # process_files() in mongo.py). That line is taken as the title, and later processing
+            # starts from line line_mark.
+            if _check_ann_title_processable(val.replace(' ', ''), exp=1):
+                new_paras[i] = val.replace(' ', '')
+                line_mark = i + 1
+                break
+            else:
+                # header line before the title: drop its newline and spaces so it joins the next line
+                new_paras[i] = val.replace('\n', '').replace(' ', '')
+        # Mean character length of the kept lines (integer division)
+        mean_len = sum(new_paras_len_cnt)//len(new_paras_len_cnt)
+        # Walk over new_paras
+        for i, _ in enumerate(new_paras):
+            # Only lines belonging to the body are processed
+            if i >= line_mark:
+                # Remove some characters from the line (spaces, tabs, English commas).
+                # NOTE(review): the third replace has an empty first argument as shown, which makes it
+                # a no-op; presumably it once held a special symbol - confirm against the original file.
+                new_paras[i] = new_paras[i]\
+                    .replace(' ', '')\
+                    .replace('	', '')\
+                    .replace('', '')\
+                    .replace(',', '')
+                # If the line is at least as long as the mean length and (the next line does not start
+                # with a numbering mark, or the next line ends with a newline), the line is treated as
+                # being inside a paragraph, so its newline is removed
+                if i < len(new_paras)-1 and \
+                        len(new_paras[i]) >= mean_len and \
+                        ((new_paras[i + 1].replace('(','').replace('（','')[0] not in NUMBER_1) or
+                         new_paras[i + 1][-1] == '\n'):
+                    new_paras[i] = new_paras[i].replace('\n', '')
+                # If the next line has at least 3 characters, starts with a digit or English letter,
+                # has neither '.' nor '、' within its first 3 characters and does not contain '年',
+                # the line is treated as being inside a paragraph, so its newline is removed
+                if i < len(new_paras)-2 and \
+                        len(new_paras[i + 1]) >= 3 and \
+                        new_paras[i + 1].replace('(','').replace('（','')[0] in NUMBER_2 and \
+                        (not '.' in new_paras[i+1][:3]) and \
+                        (not '、' in new_paras[i+1][:3]) and \
+                        (not '年' in new_paras[i+1]):
+                    new_paras[i] = new_paras[i].replace('\n', '')
+                # Find every index j of the full-width colon '：' in the line
+                for j in _find_key_indexs(new_paras[i], '：'):
+                    # If position j+1 is not a newline, the line contains no full-width parentheses
+                    # '（）' or title marks '《》', and the line contains no first-level proper term,
+                    # the text after the colon is made a separate paragraph by inserting a newline at j+1.
+                    # Note: when several positions receive an insertion, they are processed from back
+                    # to front so the remaining indexes stay valid
+                    if j < len(new_paras[i])-1 and new_paras[i][j+1] != '\n' and \
+                            ('（' not in new_paras[i]) and ('《' not in new_paras[i]) and \
+                            ('）' not in new_paras[i]) and ('》' not in new_paras[i]) and \
+                            (not _check_if_include_first_proper(new_paras[i], FIRST_PROPER_CORPUS)[0]):
+                        new_paras[i] = _insert_char_into_str(new_paras[i], j+1, '\n')
+                # Find every index j of the full-width left parenthesis '（' in the line
+                for j in _find_key_indexs(new_paras[i], '（'):
+                    # If position j+1 is a numbering mark, the previous line does not end with a newline,
+                    # and position j-1 is neither a Chinese character nor '》', a new paragraph is
+                    # started by inserting a newline at position j
+                    if new_paras[i][j+1] in NUMBER_1 and new_paras[i-1][-1] != '\n' and \
+                            (not _is_chinese(new_paras[i][j-1])) and new_paras[i][j-1] != '》':
+                        new_paras[i] = _insert_char_into_str(new_paras[i], j, '\n')
+                # Find every index j of the ASCII left parenthesis '(' in the line
+                for j in _find_key_indexs(new_paras[i], '('):
+                    # Same rule as for the full-width parenthesis above: numbering mark at j+1, previous
+                    # line not ending with a newline, and j-1 neither Chinese nor '》' - insert a newline at j
+                    if new_paras[i][j + 1] in NUMBER_1 and new_paras[i - 1][-1] != '\n' and \
+                            (not _is_chinese(new_paras[i][j - 1])) and new_paras[i][j - 1] != '》':
+                        new_paras[i] = _insert_char_into_str(new_paras[i], j, '\n')
+                # Find every index j of the enumeration comma '、' in the line
+                for j in _find_key_indexs(new_paras[i], '、'):
+                    # Single-character numbering: position j-1 is a numbering mark, j-2 is one of
+                    # '。；.;' and the previous line does not end with a newline - insert a newline at j-1
+                    if (j-2) < len(new_paras[i]) and new_paras[i][j-1] in NUMBER_1 and new_paras[i][j-2] not in NUMBER_1 \
+                            and new_paras[i][j-2] in '。；.;' and new_paras[i-1][-1] != '\n':
+                        new_paras[i] = _insert_char_into_str(new_paras[i], j-1, '\n')
+                        continue
+                    # Two-character numbering: positions j-1 and j-2 are both numbering marks, j-3 is one
+                    # of '。；.;' and the previous line does not end with a newline - insert a newline at j-2
+                    if (j-3) < len(new_paras[i]) and  new_paras[i][j-1] in NUMBER_1 and new_paras[i][j-2] in NUMBER_1 \
+                            and new_paras[i][j-3] in '。；.;' and new_paras[i-1][-1] != '\n':
+                        new_paras[i] = _insert_char_into_str(new_paras[i], j-2, '\n')
+                # Fix the cases where '特此公告。' does not form a paragraph of its own
+                if new_paras[i] == '特此公告。\n':
+                    if new_paras[i-1][-1] != '\n':
+                        new_paras[i] = '\n特此公告。\n'
+                    if new_paras[i+1][-1] != '\n':
+                        new_paras[i+1] += '\n'
+                # If the next line contains a first-level proper term that is not at the very start of
+                # that line and is not directly preceded by '（', the next line should start its own
+                # paragraph, so a missing newline is appended to the current line
+                if (i+1) < len(new_paras):
+                    tmp_flag, tmp_str = _check_if_include_first_proper(new_paras[i+1], FIRST_PROPER_CORPUS)
+                    if tmp_flag:
+                        tmp_idx = new_paras[i+1].index(tmp_str) - 1
+                        if tmp_idx >= 0 and new_paras[i+1][tmp_idx] != '（':
+                            if new_paras[i][-1] != '\n':
+                                new_paras[i] += '\n'
+        # Join the strings in new_paras into one string str_sum
+        str_sum = ''.join(new_paras)
+        # Split str_sum on newlines to build final_paras, the list for the second formatting pass
+        final_paras = str_sum.split('\n')
+        # Walk over final_paras
+        for i, val in enumerate(final_paras):
+            # Terminator of each natural paragraph: two newlines, which keeps the generated txt easy to read
+            end_flag = '\n\n'
+            # Append the terminator to every line of final_paras
+            final_paras[i] += end_flag
+            # Find every pattern such as '（1）' or '(2)' in the line.
+            # The start of such a pattern should begin its own paragraph, e.g. '\n（1）XXX...', '\n(1)XXX...'
+            if '（' in final_paras[i]:
+                final_paras[i] = _match_and_insert(final_paras[i], '[\（\(]+[0-9]{1,2}[\）\)]+', end_flag)
+            # Count the full-width left parentheses '（' and right parentheses '）' in the line.
+            # If the counts differ, the line is treated as being inside a paragraph, so the
+            # terminator at its end is removed again
+            if len(_find_key_indexs(final_paras[i], '（')) != len(_find_key_indexs(final_paras[i], '）')):
+                final_paras[i] = final_paras[i][:-2]
+        # Join the strings in final_paras into one string str_sum
+        str_sum = ''.join(final_paras)
+        # Split str_sum on the paragraph terminator to build the list for the last formatting pass,
+        # overwriting the previous final_paras
+        final_paras = str_sum.split('\n\n')
+        # Walk over final_paras
+        for i, val in enumerate(final_paras):
+            # Terminator of each natural paragraph: two newlines, which keeps the generated txt easy to read
+            end_flag = '\n\n'
+            # Append the terminator to every line of final_paras once more
+            final_paras[i] += end_flag
+            # Fix the cases where '重要内容提示：' does not form a paragraph of its own
+            if '重要内容提示：' in final_paras[i]:
+                idx = final_paras[i].index('重要内容提示：')
+                # idx+7 is the position right after the 7-character phrase
+                if final_paras[i][idx+7] != '\n':
+                    final_paras[i] = _insert_char_into_str(final_paras[i], idx+7, '\n\n')
+                if idx > 0:
+                    if final_paras[i][idx-1] != '\n':
+                        final_paras[i] = _insert_char_into_str(final_paras[i], idx, '\n\n')
+            # Fix the cases where '表决结果：' and the result following it do not form their own paragraph
+            if '表决结果：' in final_paras[i]:
+                if final_paras[i][:5] == '表决结果：':
+                    # phrase at the start: drop the terminator so the following text stays attached
+                    final_paras[i] = final_paras[i][:-2]
+                elif final_paras[i][-7:] == '表决结果：\n\n':
+                    # phrase at the end: break before it and drop the terminator after it
+                    idx = final_paras[i].find('表决结果：')
+                    final_paras[i] = _insert_char_into_str(final_paras[i], idx, '\n\n')
+                    final_paras[i] = final_paras[i][:-2]
+                else:
+                    # phrase in the middle: break before it
+                    idx = final_paras[i].find('表决结果：')
+                    final_paras[i] = _insert_char_into_str(final_paras[i], idx, '\n\n')
+            # Check the line for every second-level proper term (terms that appear in the paragraph
+            # following a first-level term and should not start a paragraph of their own)
+            for is_include, s_include in _check_if_include_second_proper(final_paras[i], SECOND_PROPER_CORPUS):
+                if is_include:
+                    # If the line contains a second-level term directly followed by a newline,
+                    # remove every newline from the line
+                    if final_paras[i][final_paras[i].index(s_include)+len(s_include)] == '\n':
+                        final_paras[i] = final_paras[i].replace('\n', '')
+            # Find every pattern such as '（一）', '（1）', '(一)', '(1)' in the line.
+            # The start of such a pattern should begin its own paragraph, e.g. '\n（一）XXX...', '\n（1）XXX...'
+            if '（' in final_paras[i]:
+                final_paras[i] = _match_and_insert(final_paras[i], '[\（\(]+[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}[\）\)]+', end_flag)
+                final_paras[i] = _match_and_insert(final_paras[i], '[\（\(]+[0-9]{1,2}[\）\)]+', end_flag)
+            # Find every pattern such as '一、' or '1、' in the line.
+            # The start of such a pattern should begin its own paragraph, e.g. '\n一、XXX...', '\n1、XXX...'
+            if '、' in final_paras[i]:
+                final_paras[i] = _match_and_insert(final_paras[i], '[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、', end_flag)
+                final_paras[i] = _match_and_insert(final_paras[i], '[0-9]{1,2}、', end_flag)
+                # Special case such as 'XXX第一、二组、三组的XXX': remove the newlines wrongly added just above
+                final_paras[i] = _match_and_delete(final_paras[i], '[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、[\S]+、[\S]+')
+                final_paras[i] = _match_and_delete(final_paras[i], '[0-9]+、[0-9]+')
+            # Check the '●' symbol again: when it is not at the start of the line, insert the terminator before it
+            for j in _find_key_indexs(final_paras[i], '●'):
+                if j > 0:
+                    final_paras[i] = _insert_char_into_str(final_paras[i], j, end_flag)
+    # If anything in the processing above raises, return (False, []) to signal a failed conversion
+    except Exception:
+        return False, []
+    # Return (True, final_paras) to signal a successful conversion
+    return True, final_paras
+def get_table_from_docx(doc, txt, out_path="", is_out_flag=False):
+    '''
+    读取Docx文件中每个表格的材料内容
+    :param doc: 一个Document对象实例
+    :param txt: 一个字符串列表，包含PDF的正文文本内容
+    :param out_path: 输出的csv结果文件的完整路径
+    :param is_out_flag: 是否输出csv结果文件，默认不输出
+    :return: list, list
+    '''
+    data = []
+    table_txt = []
+    attach_txt = {}
+    for table in doc.tables[:]:
+        table_txt.append('-----表格-----\n')
+        for i, row in enumerate(table.rows[:]):
+            row_content = []
+            for cell in row.cells[:]:
+                c = cell.text
+                new_c = c.replace('\n', '').replace(' ','').replace('\t','').replace(',','')
+                row_content.append(new_c)
+            if row_content == []:
+                continue
+            if '本公司' in row_content[0]:
+                tmp = ''
+                for line in row_content:
+                    tmp += line.strip()
+                tmp += '\n\n'
+                attach_txt['000'] = tmp
+                continue
+            if '证券代码' in row_content[0]:
+                tmp = '^'
+                for line in row_content:
+                    tmp += line.strip()+' '
+                tmp += '$\n'
+                txt.insert(tmp, 0)
+                continue
+            data.append(row_content)
+            new_row = '^' + '\t'.join(row_content) + '$\n'
+            if new_row.replace('\t','') != '^$\n':
+                table_txt.append(new_row)
+        data.append('-----表格-----\n')
+        table_txt.append('-----表格-----\n')
+    flag = False
+    for i, val in enumerate(table_txt):
+        if val == '-----表格-----\n':
+            if not flag:
+                flag = True
+            else:
+                table_txt[i] = '^$\n'
+        else:
+            flag = False
+    table_txt = list(filter(lambda x: x != '^$\n', table_txt))
+    for i, val in enumerate(table_txt):
+        if val == '-----表格-----\n' and (i > 0) and (i < len(table_txt)-1):
+            feat1 = _get_table_row_feat(table_txt[i-1].replace('\n', ''))
+            feat2 = _get_table_row_feat(table_txt[i+1].replace('\n', ''))
+            if feat1 == feat2:
+                table_txt[i] = '^$\n'
+    if len(table_txt) == 1 and table_txt[0] == '-----表格-----\n':
+        table_txt[0] = '^$\n'
+    for i, val in enumerate(table_txt):
+        if val == '-----表格-----':
+            continue
+        if val == '^$\n':
+            table_txt[i] = ''
+            continue
+        table_txt[i] = val[1:][:-2] + '\n'
+    txt.extend(table_txt)
+    if is_out_flag:
+        f = open(out_path, 'w+', newline='')
+        writer = csv.writer(f)
+        for i, val in enumerate(data):
+            if i == 0 and val == '\n':
+                continue
+            writer.writerow(val)
+        f.close()
+    return txt, attach_txt
+def refine_pdf2txt_list_result(txt, att_txt):
+    '''
+    对txt字符串列表进行最后的校对，还原或附加误识别为表格的正文内容
+    :param txt: 一个字符串列表，包含PDF的正文文本内容
+    :param att_txt: 一些误识别为表格的正文内容
+    :return: list
+    '''
+    for id, val in enumerate(txt):
+        if id > 10: break
+        else:
+            if val[-6:-2] == '有限公司':
+                txt[id] = val[:-2]
+                continue
+            if '000' in att_txt and _check_ann_title_processable(val, exp=2):
+                txt.insert(id+1, att_txt['000'])
+                break
+    return txt
+def write_pdf2txt_list_result(out_path, txt, out_mode_flag=True):
+    '''
+    将txt字符��列表写为txt文本文件
+    :param out_path: 生成的txt文本文件的路径
+    :param txt: 一个字符串列表，包含PDF的正文和表格
+    :param out_mode_flag: 是否添加段头标识'^'和段尾标识'$'
+    :return: bool
+    '''
+    with open(out_path, "w", encoding='utf-8') as f:
+        if not out_mode_flag:
+            for line in txt:
+                if line != '^$\n':
+                    f.write(line)
+        else:
+            strs = ''.join(txt)
+            paras = strs.split('\n')
+            for line in paras:
+                if line != '':
+                    f.write('^' + line + '$\n')
+    return True
def get_pdf2txt_str_result(txt, out_mode_flag=True):
    '''
    Join the entries of the text list into the complete text content.

    With ``out_mode_flag`` set, the entries are joined, split on newlines, and
    every non-empty paragraph is emitted as '^' + paragraph + '$' on its own
    line. Without it, the entries are concatenated unchanged, except that
    entries equal to the empty marker line ('^$' followed by a newline) are dropped.

    The result is built exactly once; an earlier version wrapped the logic in
    an extra loop over ``txt`` and so repeated the whole output len(txt) times.

    :param txt: list of strings holding the PDF body text and tables
    :param out_mode_flag: whether to add the paragraph start mark '^' and end mark '$'
    :return: str
    '''
    if not out_mode_flag:
        return ''.join(line for line in txt if line != '^$\n')
    paragraphs = ''.join(txt).split('\n')
    return ''.join('^' + para + '$\n' for para in paragraphs if para != '')
def find_all_local_file(base, extension):
    '''
    Walk a directory tree and yield the path of every file with the given suffix.

    The suffix comparison ignores case, so '.pdf' also matches 'a.PDF' and
    'a.Pdf'. In every yielded path each '/' and each backslash is replaced by '//'.

    :param base: directory path to search recursively
    :param extension: file suffix, e.g. '.pdf'
    :return: generator of str paths
    '''
    wanted = extension.lower()
    for root, _dirs, files in os.walk(base):
        for name in files:
            if name.lower().endswith(wanted):
                yield os.path.join(root, name).replace('/', '//').replace('\\', '//')

pdf2txt_v2.py ADDED Viewed

	@@ -0,0 +1,399 @@

+# -*- coding: utf-8 -*-
+"""
+Created by Shengbo.Zhang on 2021/09/20
+"""
+import os
+import re
+import logging
+import pdfplumber
+from docx import Document
+from Pdf2Txt.config import *
+from Pdf2Txt.config import _check_ann_title_processable
+from pdf2docx import Converter
+from collections import Counter
+from pdfminer.pdfpage import PDFPage
+from pdfminer.layout import LAParams, LTTextBox
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
# Silence the log output of pdf2docx's Converter. logging.disable(WARNING)
# suppresses every record of severity WARNING and below (INFO included),
# so a single call is enough. The setting is process-wide.
logging.disable(logging.WARNING)
def get_string_list_from_pdf(pdf_path):
    '''
    Read the text of a PDF line by line with pdfplumber, leaving out everything
    that lies inside a detected table.

    :param pdf_path: str, path of the PDF file
    :return: two lists ``(string_list, ann_info_list)``. ``string_list`` holds every
             non-empty text line with spaces and tabs removed and a newline appended;
             ``ann_info_list`` holds the first 10 raw lines of the first page
             (header candidates such as stock code, abbreviation, announcement number)
    '''
    string_list = []
    ann_info_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages):
            bboxes = [table.bbox for table in page.find_tables()]

            def _not_within_bboxes(obj):
                # Keep an object only when its centre point lies in none of the table boxes.
                def _obj_in_bbox(_bbox):
                    v_mid = (obj["top"] + obj["bottom"]) / 2
                    h_mid = (obj["x0"] + obj["x1"]) / 2
                    x0, top, x1, bottom = _bbox
                    return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
                return not any(_obj_in_bbox(__bbox) for __bbox in bboxes)

            # extract_text() may yield None for a page without text; treat that as empty.
            page_text = page.filter(_not_within_bboxes).extract_text() or ''
            raw_lines = page_text.split('\n')
            if page_no == 0:
                ann_info_list = raw_lines[:10]
            cleaned = (raw.replace(' ', '').replace('\n', '').replace('\t', '') for raw in raw_lines)
            string_list.extend(line + '\n' for line in cleaned if line != '')
    return string_list, ann_info_list
def get_ann_info_from_pdf(pdf_path):
    '''
    Get the header candidates of a PDF announcement: the first 10 text lines of
    its first page. Lines that are not header data may be included; the caller
    is expected to filter them.

    :param pdf_path: str, path of the PDF file
    :return: list of str (e.g. stock code, abbreviation, announcement number);
             an empty list when the file cannot be opened or read
    '''
    try:
        with pdfplumber.open(pdf_path) as pdf:
            first_page_text = pdf.pages[0].extract_text()
            return first_page_text.split('\n')[:10]
    except Exception:
        # Best effort: any read/parse failure means "no header information".
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return []
def get_string_list_from_pdf_converted_docx(pdf_path, docx_path):
    '''
    Convert a PDF file to Docx and read the paragraph text (tables excluded)
    of the Docx line by line.

    The Docx file is only an intermediate artefact: it is deleted before
    returning, whether it was auto-named or given through ``docx_path``, and
    also when loading or reading it raises.

    Lines that appear before the first line ending in '有限公司' (at most the
    first 11 lines) are treated as header lines; every entry equal to one of
    them is replaced by ''.

    :param pdf_path: str, path of the PDF file
    :param docx_path: str, path for the intermediate Docx file; when '' a name
                      derived from ``pdf_path`` and TEMP_DOCX_SUFFIX is used,
                      next to the PDF
    :return: ``(string_list, document)``: the list of text lines (spaces and tabs
             removed, newline appended) and the Document instance, or
             ``([], None)`` when the conversion failed
    '''
    document = None
    string_list = []
    if docx_path == '':
        # os.path.join keeps a bare file name relative to the working directory;
        # gluing with '//' would point at the filesystem root when dirname is ''.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        output_docx_file_path = os.path.join(os.path.dirname(pdf_path), f"{stem}_{TEMP_DOCX_SUFFIX}.docx")
    else:
        output_docx_file_path = docx_path
    try:
        if get_docx_from_pdf(pdf_path=pdf_path, out_path=output_docx_file_path):
            document = Document(output_docx_file_path)
            for paragraph in document.paragraphs:
                for piece in paragraph.text.strip().split('\n'):
                    piece = piece.strip()
                    if piece == '':
                        continue
                    string_list.append(piece.replace(' ', '').replace('\n', '').replace('\t', '') + '\n')
    finally:
        if os.path.exists(output_docx_file_path):
            os.remove(output_docx_file_path)
    ann_headers = set()
    for i, val in enumerate(string_list):
        if i > 10: break
        if val.strip()[-4:] == '有限公司': break
        ann_headers.add(val)
    string_list = ['' if val in ann_headers else val for val in string_list]
    return string_list, document
def get_abscissa_dict_from_pdf(pdf_path):
    '''
    Build a dictionary that maps the text of every pdfminer text box of a PDF
    to the integer x-coordinate of the box's left edge (``bbox[0]``).

    The key is the box text with spaces, tabs and newlines removed and one
    newline appended. Boxes with no remaining text are skipped. When two boxes
    produce the same key, the later one wins.

    :param pdf_path: str, path of the PDF file
    :return: dict mapping text to int x-coordinate
    '''
    abscissa_dict = {}
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr=rsrcmgr, device=device)
    # The context manager closes the file even when parsing a page raises.
    with open(pdf_path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            for lobj in device.get_result():
                if isinstance(lobj, LTTextBox):
                    key = lobj.get_text().replace(' ', '').replace('\n', '').replace('\t', '') + '\n'
                    if key != '\n':
                        abscissa_dict[key] = int(lobj.bbox[0])
    return abscissa_dict
def get_min_abscissa_value(abscissa_dict, string_list_length):
    '''
    Pick the x-coordinate that is taken to be the left edge of the body text.

    The result is the smallest x-coordinate that occurs at least
    ``string_list_length // 4`` times among the dictionary values. When no
    value is that frequent, the overall minimum is returned.

    :param abscissa_dict: dict whose values are the starting x-coordinates of text blocks
    :param string_list_length: int, number of text lines of the PDF
    :return: int, the chosen x-coordinate
    :raises ValueError: when ``abscissa_dict`` is empty
    '''
    xs = list(abscissa_dict.values())
    occurrences = Counter(xs)
    required = string_list_length // 4
    fallback = min(xs)
    frequent = (x for x in sorted(occurrences) if occurrences[x] >= required)
    return next(frequent, fallback)
def refine_txt_list(txt, ann_info):
    '''
    Apply the final formatting pass to the body-text list of a PDF announcement.

    The list has already been through a first round of processing. This pass
    writes the cleaned header lines into the front of the list, prefixes
    SEGMENT_SYMBOL to titles, company-name lines, numbered headings and a few
    fixed phrases, and blanks out some leftover or duplicated lines.

    :param txt: list of body-text strings of the PDF; modified in place
    :param ann_info: list of raw header lines of the announcement
    :return: the same list object as ``txt`` after refinement
    '''
    # Format the PDF's [announcement header information]
    if ann_info != []:
        new_ann_info_list = []
        for i, val in enumerate(ann_info):
            if val.strip() == '': continue
            # Header lines stop at the first line that ends with '有限公司'.
            if val.strip()[-4:] == '有限公司': break
            # Collapse whitespace runs into single spaces and terminate the line.
            else: new_ann_info_list.append(' '.join(val.split()) + SEGMENT_SYMBOL)
        if new_ann_info_list != []:
            # The last header line carries no SEGMENT_SYMBOL.
            new_ann_info_list[-1] = new_ann_info_list[-1].replace(SEGMENT_SYMBOL, '')
            # When the body already starts with the '有限公司' line, make room for the
            # header lines by inserting empty slots in front of it.
            # NOTE(review): txt[0] is read without checking that txt is non-empty.
            if txt[0].strip()[-4:] == '有限公司':
                for i in range(len(new_ann_info_list)):
                    txt.insert(0, '')
            # Overwrite the leading entries with the header lines.
            for i, val in enumerate(new_ann_info_list):
                txt[i] = val
    # Format the PDF's [announcement title] and [board of directors' statement]
    for i, val in enumerate(txt):
        # Only the first 11 entries are examined.
        if i > 10: break
        else:
            val = val.strip()
            if _check_ann_title_processable(val):
                if SEGMENT_SYMBOL not in val:
                    txt[i] = (SEGMENT_SYMBOL + val)
            if val[-4:] == '有限公司':
                if SEGMENT_SYMBOL not in txt[i]:
                    txt[i] = (SEGMENT_SYMBOL + val)
                # NOTE(review): txt[i+1] .. txt[i+4] are indexed without a length check.
                # Title on the line right after the company name.
                if _check_ann_title_processable(txt[i+1]):
                    txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
                    if txt[i+2].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
                        if SEGMENT_SYMBOL not in txt[i+2]:
                            txt[i+2] = (SEGMENT_SYMBOL + txt[i+2])
                        txt[i+3] = txt[i+3].replace(SEGMENT_SYMBOL, '')
                    break
                # Title two lines after the company name.
                if _check_ann_title_processable(txt[i+2]):
                    txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
                    txt[i+2] = txt[i+2].replace(SEGMENT_SYMBOL, '')
                    if txt[i+3].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
                        if SEGMENT_SYMBOL not in txt[i+3]:
                            txt[i+3] = (SEGMENT_SYMBOL + txt[i+3])
                        txt[i+4] = txt[i+4].replace(SEGMENT_SYMBOL, '')
                    break
    # Second pass over the text list of the PDF
    for i, _ in enumerate(txt):
        # Format [section and subsection numbering]
        if (SEGMENT_SYMBOL not in txt[i]):
            # Each slot stays 1 unless its pattern matches; re.match anchors at the
            # start of the string, so a successful match always reports start() == 0.
            match_check = [1, 1, 1, 1, 1]
            # Pattern like '一、' (Chinese numeral followed by an enumeration comma)
            match_1 = re.match('[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、', txt[i])
            # Pattern like '1、'
            match_2 = re.match('[0-9]{1,2}、', txt[i])
            # Pattern like '1.'
            match_3 = re.match('[0-9]{1,2}\.', txt[i])
            # Pattern like '（一）' or '(一)'
            match_4 = re.match('[\（\(]+[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}[\）\)]+', txt[i])
            # Pattern like '（1）' or '(1)'
            match_5 = re.match('[\（\(]+[0-9]{1,2}[\）\)]+', txt[i])
            if match_1: match_check[0] = match_1.start()
            if match_2: match_check[1] = match_2.start()
            if match_3: match_check[2] = match_3.start()
            if match_4: match_check[3] = match_4.start()
            if match_5: match_check[4] = match_5.start()
            if 0 in match_check:
                txt[i] = SEGMENT_SYMBOL + txt[i]
        # Fix cases where a line starting with '重要内容提示' does not begin its own paragraph
        if ('重要内容提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('重要内容提示') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
        # Fix cases where the '单位：元' (unit) line was not removed
        if (txt[i] == '单位：元') or (txt[i] == SEGMENT_SYMBOL + '单位：元'):
            txt[i] = ''
        # Fix cases where a line starting with '特别提示' does not begin its own paragraph
        if ('特别提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特别提示') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
        # Fix cases where a line starting with '特此公告' does not begin its own paragraph
        if ('特此公告' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特此公告') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
        # Fix cases where a line is identical to the next line (only one copy is kept)
        if (i+1) < len(txt) and (txt[i] == txt[i+1]):
            txt[i] = ''
    return txt
def get_docx_from_pdf(pdf_path, out_path):
    '''
    Convert a PDF file to Docx with pdf2docx and store the result at ``out_path``.

    :param pdf_path: full path of the input PDF announcement file
    :param out_path: full path of the intermediate Docx file to write
    :return: bool, True only when the conversion raised no exception and every
             page of the converter reports ``finalized``
    '''
    cv = Converter(pdf_path)
    try:
        try:
            cv.convert(out_path)
        except Exception:
            return False
        return all(p.finalized for p in cv.pages)
    finally:
        # Always release the converter, also when inspecting the pages raises.
        cv.close()
def _get_table_row_feat(str):
    '''
    Compute the feature string of a whitespace-separated table row: one
    character per token, '1' when ``float(token)`` succeeds and '0' otherwise.

    :param str: the row string
    :return: a string made of the characters '0' and '1'
    '''
    def _flag(token):
        try:
            float(token)
        except Exception:
            return '0'
        return '1'

    return ''.join(_flag(token) for token in str.split())
def append_table_from_docx(doc, txt):
    '''
    Read the content of every table in a Docx document, format it and append
    it to the text list of the PDF.

    Rows whose first cell contains '本公司' are not treated as table rows: their
    cells are concatenated and inserted into ``txt`` after the first title line
    found among the first 11 entries. Rows whose first cell contains '证券代码'
    are dropped.

    :param doc: a Document instance
    :param txt: list of strings holding the PDF body text; modified in place
    :return: the same list with the table lines appended
    '''
    # NOTE(review): `data` is filled below but never read inside this function.
    data = []
    table_txt = []
    # The tag is TABLE_SYMBOL padded with one '-' on each side. The last loop
    # strips the first character and the last two characters of every line it
    # keeps, which turns a padded tag line back into TABLE_SYMBOL + '\n'.
    table_tag = '-' + TABLE_SYMBOL + '-'
    for table in doc.tables[:]:
        table_txt.append(f'{table_tag}\n')
        for i, row in enumerate(table.rows[:]):
            row_content = []
            for cell in row.cells[:]:
                c = cell.text
                # Remove newlines, spaces, tabs and ASCII commas from the cell text.
                new_c = c.replace('\n', '').replace(' ','').replace('\t','').replace(',','')
                row_content.append(new_c)
            if row_content == []: continue
            if '本公司' in row_content[0]:
                tmp = SEGMENT_SYMBOL
                for line in row_content:
                    tmp += line.strip()
                # Put a SEGMENT_SYMBOL right after the 4-character phrase '特别提示'.
                if '特别提示' in tmp:
                    tmp = tmp[:tmp.index('特别提示')+4]+SEGMENT_SYMBOL+tmp[tmp.index('特别提示')+4:]
                # Insert after the first processable title among the first 11 entries.
                for id, val in enumerate(txt):
                    if id > 10: break
                    else:
                        if _check_ann_title_processable(val):
                            txt.insert(id+1, tmp)
                            break
                continue
            if '证券代码' in row_content[0]: continue
            data.append(row_content)
            # Rows are wrapped in '^' ... '$' while being processed.
            new_row = '^' + TABLE_CELL_SYMBOL.join(row_content) + '$\n'
            # Skip rows whose cells are all empty.
            if new_row.replace(TABLE_CELL_SYMBOL,'') != '^$\n':
                table_txt.append(new_row)
        data.append(f'{table_tag}\n')
        table_txt.append(f'{table_tag}\n')
    # Collapse runs of consecutive tag lines: only the first tag of a run is kept,
    # the others are marked with '^$\n' and filtered out.
    flag = False
    for i, val in enumerate(table_txt):
        if val == f'{table_tag}\n':
            if not flag:
                flag = True
            else:
                table_txt[i] = '^$\n'
        else:
            flag = False
    table_txt = list(filter(lambda x: x != '^$\n', table_txt))
    # Mark a tag for removal when the lines before and after it have the same
    # feature string (presumably one table split in two - TODO confirm).
    for i, val in enumerate(table_txt):
        if val == f'{table_tag}\n' and (i > 0) and (i < len(table_txt)-1):
            feat1 = _get_table_row_feat(table_txt[i-1].replace('\n', ''))
            feat2 = _get_table_row_feat(table_txt[i+1].replace('\n', ''))
            if feat1 == feat2:
                table_txt[i] = '^$\n'
    # A lone tag without any row is dropped as well.
    if len(table_txt) == 1 and table_txt[0] == f'{table_tag}\n':
        table_txt[0] = '^$\n'
    for i, val in enumerate(table_txt):
        # NOTE(review): tag lines are stored with a trailing '\n', so this
        # comparison without '\n' does not match them; they fall through to the
        # stripping statement at the end of the loop body.
        if val == table_tag:
            continue
        if val == '^$\n':
            table_txt[i] = ''
            continue
        # Drop the first character and the last two characters, then re-terminate.
        table_txt[i] = val[1:][:-2] + '\n'
    txt.extend(table_txt)
    return txt
def output_txt_string(txt_path, txt_string):
    '''
    Write the formatted text of a PDF announcement to a UTF-8 plain-text file.

    :param txt_path: path of the plain-text file to create
    :param txt_string: the plain-text string of the PDF announcement
    :return: bool, True when the file was written, False when writing failed
    '''
    try:
        with open(txt_path, "w", encoding='utf-8') as f:
            f.write(txt_string)
    except Exception:
        # Best effort: report the failure through the return value.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return False
    return True
def get_txt_from_pdf(pdf_path, docx_path=''):
    '''
    Convert a PDF announcement file into one formatted text string: the body
    text first, the table content after it.

    :param pdf_path: str, path of the PDF file
    :param docx_path: str, path for the intermediate Docx file; '' lets the
                      Docx helper choose a name next to the PDF
    :return: str, the formatted text; '' when the header lines, the body lines
             or the Docx document could not be obtained
    '''
    txt_string = ''
    ann_info_list = get_ann_info_from_pdf(pdf_path)
    string_list, document = get_string_list_from_pdf_converted_docx(pdf_path, docx_path)
    if ann_info_list != [] and string_list != [] and document is not None:
        abscissa_dict = get_abscissa_dict_from_pdf(pdf_path)
        min_abscissa_value = get_min_abscissa_value(abscissa_dict, len(string_list))
        # Lines before the first '有限公司' line (at most the first 11) are pinned
        # to the body's left edge so that they never get a SEGMENT_SYMBOL below.
        for i, val in enumerate(string_list):
            if i > 10: break
            if val.replace('\n', '')[-4:] == '有限公司': break
            else: abscissa_dict[val] = min_abscissa_value
        txt_list = []
        for string in string_list:
            # NOTE(review): the fifth replace() argument appears as an empty string
            # here, which makes that call a no-op - presumably a special glyph was
            # lost from this literal; confirm against the original file.
            new_string = string.replace('\n', '').replace('\t', '').replace(' ', '').replace('	', '').replace('', '').replace(',', '')
            # Skip blanked lines and short all-digit lines (at most 3 digits).
            if (not (len(new_string) <= 3 and new_string.isdigit())) and string != '':
                # A line that starts to the right of the body's left edge opens a new
                # paragraph; a line with no known x-coordinate is kept as it is.
                if abscissa_dict.get(string, min_abscissa_value) > min_abscissa_value:
                    txt_list.append(SEGMENT_SYMBOL + new_string)
                else:
                    txt_list.append(new_string)
        txt_list = refine_txt_list(txt_list, ann_info_list)
        txt_list.append(SEGMENT_SYMBOL)
        txt_list = append_table_from_docx(doc=document, txt=txt_list)
        txt_string = ''.join(txt_list)
    return txt_string

pdf2txt_v3.py ADDED Viewed

	@@ -0,0 +1,555 @@

+# -*- coding: utf-8 -*-
+"""
+Created by Shengbo.Zhang on 2021/10/08
+"""
+import os
+import re
+import logging
+import pdfplumber
+from docx import Document
+from Pdf2Txt.config import *
+from pdf2docx import Converter
+from collections import Counter
+from Pdf2Txt.config import _check_ann_title_processable
# Silence the log output of pdf2docx's Converter. logging.disable(WARNING)
# suppresses every record of severity WARNING and below (INFO included),
# so a single call is enough. The setting is process-wide.
logging.disable(logging.WARNING)
def get_string_and_abscissa_list_from_pdf(pdf_path):
    '''
    Read the text of a PDF line by line with pdfplumber, leaving out everything
    inside detected tables, together with the left x-coordinate of every line.

    Words are taken in the order pdfplumber returns them. Neighbouring words
    whose integer ``top`` values differ by at most 3 are snapped to the same
    value, and consecutive words with the same ``top`` are concatenated into
    one line.

    NOTE(review): ``top`` is compared across page boundaries too, so the last
    line of a page and the first line of the next one are merged when their
    ``top`` values coincide.

    :param pdf_path: str, path of the PDF file
    :return: list of ``[text, x0]`` items, where ``text`` is the text of a whole
             line and ``x0`` the integer x-coordinate of its first word
    '''
    words = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            bboxes = [table.bbox for table in page.find_tables()]

            def _not_within_bboxes(obj):
                # Keep an object only when its centre point lies in none of the table boxes.
                def _obj_in_bbox(_bbox):
                    v_mid = (obj["top"] + obj["bottom"]) / 2
                    h_mid = (obj["x0"] + obj["x1"]) / 2
                    x0, top, x1, bottom = _bbox
                    return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
                return not any(_obj_in_bbox(__bbox) for __bbox in bboxes)

            for item in page.filter(_not_within_bboxes).extract_words():
                text = item['text'].replace('\n', '').replace('\t', '').replace(' ', '').replace('	', '').replace(',', '')
                if text != '':
                    # int() truncates towards zero like cutting the text at the decimal
                    # point does, but also copes with exponent notation.
                    words.append([text, int(item['x0']), int(item['top'])])
    # Snap nearly equal vertical positions. The words are mutable lists, so a
    # snapped value carries on to the next comparison.
    for previous, current in zip(words, words[1:]):
        if current[2] != previous[2] and abs(current[2] - previous[2]) <= 3:
            current[2] = previous[2]
    # Concatenate consecutive words that share the same vertical position.
    string_abscissa_list = []
    line_top = None
    for text, x0, top in words:
        if string_abscissa_list and top == line_top:
            string_abscissa_list[-1][0] += text
        else:
            string_abscissa_list.append([text, x0])
            line_top = top
    return string_abscissa_list
def get_ann_info_from_pdf(pdf_path):
    '''
    Get the header candidates of a PDF announcement: the first 10 text lines of
    its first page. Lines that are not header data may be included; the caller
    is expected to filter them.

    :param pdf_path: str, path of the PDF file
    :return: list of str (e.g. stock code, abbreviation, announcement number);
             an empty list when the file cannot be opened or read
    '''
    try:
        with pdfplumber.open(pdf_path) as pdf:
            first_page_text = pdf.pages[0].extract_text()
            return first_page_text.split('\n')[:10]
    except Exception:
        # Best effort: any read/parse failure means "no header information".
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return []
def get_document_from_pdf_converted_docx(pdf_path, docx_path):
    '''
    Convert a PDF file to Docx and load the result as a Document.

    The Docx file is only an intermediate artefact: it is deleted before
    returning, whether it was auto-named or given through ``docx_path``, and
    also when loading it raises.

    :param pdf_path: str, path of the PDF file
    :param docx_path: str, path for the intermediate Docx file; when '' a name
                      derived from ``pdf_path`` and TEMP_DOCX_SUFFIX is used,
                      next to the PDF
    :return: a Document instance, or None when the conversion failed
    '''
    if docx_path == '':
        # os.path.join keeps a bare file name relative to the working directory;
        # gluing with '//' would point at the filesystem root when dirname is ''.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        output_docx_file_path = os.path.join(os.path.dirname(pdf_path), f"{stem}_{TEMP_DOCX_SUFFIX}.docx")
    else:
        output_docx_file_path = docx_path
    document = None
    try:
        if get_docx_from_pdf(pdf_path=pdf_path, out_path=output_docx_file_path):
            document = Document(output_docx_file_path)
    finally:
        if os.path.exists(output_docx_file_path):
            os.remove(output_docx_file_path)
    return document
def get_min_abscissa_value(abscissa_list, string_list_length):
    '''
    Pick the x-coordinate that is taken to be the left edge of the body text.

    The result is the smallest x-coordinate that occurs at least
    ``string_list_length // 4`` times in ``abscissa_list``. When no value is
    that frequent, the overall minimum is returned.

    :param abscissa_list: list of the starting x-coordinates of the text lines
    :param string_list_length: int, number of text lines of the PDF
    :return: int, the chosen x-coordinate
    :raises ValueError: when ``abscissa_list`` is empty
    '''
    occurrences = Counter(abscissa_list)
    required = string_list_length // 4
    chosen = min(abscissa_list)
    for x in sorted(occurrences):
        if occurrences[x] >= required:
            chosen = x
            break
    return chosen
def refine_txt_list(txt, ann_info, string_abscissa_dict):
    '''
    Apply the final formatting pass to the body-text list of a PDF announcement.

    The list has already been through a first round of processing. This pass
    writes the cleaned header lines into the front of the list, prefixes
    SEGMENT_SYMBOL to titles, company-name lines, numbered headings and a few
    fixed phrases, re-joins the lines that follow a bullet mark, and blanks
    out repeated headers, page-number lines and duplicated lines.

    :param txt: list of body-text strings of the PDF; modified in place
    :param ann_info: list of raw header lines of the announcement
    :param string_abscissa_dict: mapping indexed with a line's text (SEGMENT_SYMBOL
                                 removed) whose values are compared with ``<`` -
                                 presumably the line's left x-coordinate; verify
                                 against the caller
    :return: the same list object as ``txt`` after refinement
    '''
    # Format the PDF's [announcement header information]
    if ann_info != []:
        new_ann_info_list = []
        for i, val in enumerate(ann_info):
            if val.strip() == '': continue
            # Header lines stop at the first line that ends with '有限公司'.
            if val.strip()[-4:] == '有限公司': break
            # Collapse whitespace runs into single spaces and terminate the line.
            else: new_ann_info_list.append(' '.join(val.split()) + SEGMENT_SYMBOL)
        if new_ann_info_list != []:
            # The last header line carries no SEGMENT_SYMBOL.
            new_ann_info_list[-1] = new_ann_info_list[-1].replace(SEGMENT_SYMBOL, '')
            # When the body already starts with the '有限公司' line, make room for the
            # header lines by inserting empty slots in front of it.
            # NOTE(review): txt[0] is read without checking that txt is non-empty.
            if txt[0].strip()[-4:] == '有限公司':
                for i in range(len(new_ann_info_list)):
                    txt.insert(0, '')
            # Overwrite the leading entries with the header lines.
            for i, val in enumerate(new_ann_info_list):
                txt[i] = val
    # Format the PDF's [announcement title] and [board of directors' statement]
    for i, val in enumerate(txt):
        # Only the first 11 entries are examined.
        if i > 10: break
        else:
            val = val.strip()
            if _check_ann_title_processable(val):
                if SEGMENT_SYMBOL not in val:
                    txt[i] = (SEGMENT_SYMBOL + val)
            if val[-4:] == '有限公司':
                if SEGMENT_SYMBOL not in txt[i]:
                    txt[i] = (SEGMENT_SYMBOL + val)
                # NOTE(review): txt[i+1] .. txt[i+4] are indexed without a length check.
                # Title on the line right after the company name.
                if _check_ann_title_processable(txt[i+1]):
                    txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
                    if txt[i+2].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
                        if SEGMENT_SYMBOL not in txt[i+2]:
                            txt[i+2] = (SEGMENT_SYMBOL + txt[i+2])
                        txt[i+3] = txt[i+3].replace(SEGMENT_SYMBOL, '')
                    break
                # Title two lines after the company name.
                if _check_ann_title_processable(txt[i+2]):
                    txt[i+1] = txt[i+1].replace(SEGMENT_SYMBOL, '')
                    txt[i+2] = txt[i+2].replace(SEGMENT_SYMBOL, '')
                    if txt[i+3].replace(SEGMENT_SYMBOL, '')[:3] == '本公司':
                        if SEGMENT_SYMBOL not in txt[i+3]:
                            txt[i+3] = (SEGMENT_SYMBOL + txt[i+3])
                        txt[i+4] = txt[i+4].replace(SEGMENT_SYMBOL, '')
                    break
    # Second pass over the text list of the PDF
    for i, _ in enumerate(txt):
        # Format [section and subsection numbering]
        if (SEGMENT_SYMBOL not in txt[i]):
            # Each slot stays 1 unless its pattern matches; re.match anchors at the
            # start of the string, so a successful match always reports start() == 0.
            match_check = [1, 1, 1, 1, 1]
            # Pattern like '一、' (Chinese numeral followed by an enumeration comma)
            match_1 = re.match('[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}、', txt[i])
            # Pattern like '1、'
            match_2 = re.match('[0-9]{1,2}、', txt[i])
            # Pattern like '1.'
            match_3 = re.match('[0-9]{1,2}\.', txt[i])
            # Pattern like '（一）' or '(一)'
            match_4 = re.match('[\（\(]+[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]{1,2}[\）\)]+', txt[i])
            # Pattern like '（1）' or '(1)'
            match_5 = re.match('[\（\(]+[0-9]{1,2}[\）\)]+', txt[i])
            if match_1: match_check[0] = match_1.start()
            if match_2: match_check[1] = match_2.start()
            if match_3: match_check[2] = match_3.start()
            if match_4: match_check[3] = match_4.start()
            if match_5: match_check[4] = match_5.start()
            if 0 in match_check:
                txt[i] = SEGMENT_SYMBOL + txt[i]
        # Fix cases where a line starting with '重要内容提示' does not form its own paragraph
        if ('重要内容提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('重要内容提示') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
            if txt[i][-1] != '\n':
                txt[i] += SEGMENT_SYMBOL
        # Fix cases where the unit lines '单位：元' / '单位：人民币元' were not removed
        if (txt[i] == '单位：元') or (txt[i] == SEGMENT_SYMBOL + '单位：元'):
            txt[i] = ''
        if (txt[i] == '单位：人民币元') or (txt[i] == SEGMENT_SYMBOL + '单位：人民币元'):
            txt[i] = ''
        # Fix cases where a line starting with '特别提示' does not form its own paragraph
        if ('特别提示' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特别提示') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
            if txt[i][-1] != '\n':
                txt[i] += SEGMENT_SYMBOL
        # Fix cases where a line starting with '特此公告' does not form its own paragraph
        if ('特此公告' in txt[i]) and (SEGMENT_SYMBOL not in txt[i]) and (txt[i].index('特此公告') == 0):
            txt[i] = SEGMENT_SYMBOL + txt[i]
            if txt[i][-1] != '\n':
                txt[i] += SEGMENT_SYMBOL
        # Fix cases where no break precedes an attachment heading such as '附件：' or '附件1：'
        match_6 = re.match('附件[0-9]{0,2}：', txt[i])
        if match_6:
            if match_6.start() == 0:
                txt[i] = SEGMENT_SYMBOL + txt[i]
        # Fix cases where a line is identical to the next line (only one copy is kept)
        if (i+1) < len(txt) and (txt[i] == txt[i+1]):
            txt[i] = ''
        # Fix cases where content introduced by a bullet mark does not form its own paragraph
        # NOTE(review): the first string literal in the tests and replace() calls
        # below appears empty here; as written, '' in s is always True and
        # replace('', '') is a no-op. Presumably a special bullet glyph was lost
        # from these literals - confirm against the original file.
        if ('' in txt[i]) or ('●' in txt[i]):
            txt[i] = txt[i].replace('', '').replace('●', '')
            # Join the following lines to the bullet paragraph until the next bullet
            # or until the next line's mapped value is smaller than this line's.
            for idx in range(i+1, len(txt)-1):
                if ('' in txt[idx]) or ('●' in txt[idx]):
                    break
                txt[idx] = txt[idx].replace(SEGMENT_SYMBOL, '')
                # NOTE(review): raises KeyError when a line's text is not a key of the mapping.
                if string_abscissa_dict[txt[idx+1].replace(SEGMENT_SYMBOL, '')] < string_abscissa_dict[txt[idx].replace(SEGMENT_SYMBOL, '')]:
                    break
        # Fix cases where the page header is repeated on later pages: blank any
        # later line whose text is contained in the first entry.
        if i != 0 and txt[i].replace(SEGMENT_SYMBOL, '').replace(' ', '') in txt[0].replace(SEGMENT_SYMBOL, '').replace(' ', ''):
            txt[i] = ''
        # Fix cases where page-number marks of special forms were not removed:
        # lines starting like '3/12', '第3页' or '-3-'.
        if (re.match('^[0-9]{1,2}/[0-9]{1,2}', txt[i].strip().replace('', ''))) or \
                (re.match('^第[0-9]{1,2}页', txt[i].strip().replace('', ''))) or \
                (re.match(r'^-[0-9]{1,2}-', txt[i].strip().replace('', ''))):
            txt[i] = ''
    return txt
def get_docx_from_pdf(pdf_path, out_path):
    '''
    Convert a PDF file to Docx with pdf2docx and store the result at ``out_path``.

    :param pdf_path: full path of the input PDF announcement file
    :param out_path: full path of the intermediate Docx file to write
    :return: bool, True only when the conversion raised no exception and every
             page of the converter reports ``finalized``
    '''
    cv = Converter(pdf_path)
    try:
        try:
            cv.convert(out_path)
        except Exception:
            return False
        return all(p.finalized for p in cv.pages)
    finally:
        # Always release the converter, also when inspecting the pages raises.
        cv.close()
def _get_table_row_feat(str):
    '''
    Compute the feature string of a whitespace-separated table row: one
    character per token, '1' when ``float(token)`` succeeds and '0' otherwise.

    :param str: the row string
    :return: a string made of the characters '0' and '1'
    '''
    def _flag(token):
        try:
            float(token)
        except Exception:
            return '0'
        return '1'

    return ''.join(_flag(token) for token in str.split())
def append_table_from_docx(doc, txt):
    '''
    Read the content of every table in a Docx document, format it and append
    it to the text list of the PDF.

    Rows whose first cell contains '本公司' are not treated as table rows. When
    none of the first 10 entries of ``txt`` already contains '本公司', their
    cells are concatenated and inserted into ``txt`` after the first title line
    found among the first 11 entries. Rows whose first cell contains '证券代码'
    are dropped.

    :param doc: a Document instance
    :param txt: list of strings holding the PDF body text; modified in place
    :return: the same list with the table lines appended
    '''
    # NOTE(review): `data` is filled below but never read inside this function.
    data = []
    table_txt = []
    # The tag is TABLE_SYMBOL padded with one '-' on each side. The last loop
    # strips the first character and the last two characters of every line it
    # keeps, which turns a padded tag line back into TABLE_SYMBOL + '\n'.
    table_tag = '-' + TABLE_SYMBOL + '-'
    for table in doc.tables[:]:
        table_txt.append(f'{table_tag}\n')
        for i, row in enumerate(table.rows[:]):
            row_content = []
            for cell in row.cells[:]:
                c = cell.text
                # Remove newlines, spaces, tabs and ASCII commas from the cell text.
                new_c = c.replace('\n', '').replace(' ','').replace('\t','').replace(',','')
                row_content.append(new_c)
            if row_content == []: continue
            if '本公司' in row_content[0]:
                # Only restore the statement when the body text does not already have it.
                local_flag = True
                for val in txt[:10]:
                    if '本公司' in val:
                        local_flag = False
                        break
                if local_flag:
                    tmp = SEGMENT_SYMBOL
                    for line in row_content:
                        tmp += line.strip()
                    # Put a SEGMENT_SYMBOL right after the 4-character phrase '特别提示'.
                    if '特别提示' in tmp:
                        tmp = tmp[:tmp.index('特别提示')+4]+SEGMENT_SYMBOL+tmp[tmp.index('特别提示')+4:]
                    # Insert after the first processable title among the first 11 entries.
                    for id, val in enumerate(txt):
                        if id > 10: break
                        else:
                            if _check_ann_title_processable(val):
                                txt.insert(id+1, tmp)
                                break
                continue
            if '证券代码' in row_content[0]:
                continue
            data.append(row_content)
            # Rows are wrapped in '^' ... '$' while being processed.
            new_row = '^' + TABLE_CELL_SYMBOL.join(row_content) + '$\n'
            # Skip rows whose cells are all empty.
            if new_row.replace(TABLE_CELL_SYMBOL, '') != '^$\n':
                table_txt.append(new_row)
        data.append(f'{table_tag}\n')
        table_txt.append(f'{table_tag}\n')
    # Collapse runs of consecutive tag lines: only the first tag of a run is kept,
    # the others are marked with '^$\n' and filtered out.
    flag = False
    for i, val in enumerate(table_txt):
        if val == f'{table_tag}\n':
            if not flag:
                flag = True
            else:
                table_txt[i] = '^$\n'
        else:
            flag = False
    table_txt = list(filter(lambda x: x != '^$\n', table_txt))
    # Mark a tag for removal when the lines before and after it have the same
    # feature string (presumably one table split in two - TODO confirm).
    for i, val in enumerate(table_txt):
        if val == f'{table_tag}\n' and (i > 0) and (i < len(table_txt)-1):
            feat1 = _get_table_row_feat(table_txt[i-1].replace('\n', ''))
            feat2 = _get_table_row_feat(table_txt[i+1].replace('\n', ''))
            if feat1 == feat2:
                table_txt[i] = '^$\n'
    # A lone tag without any row is dropped as well.
    if len(table_txt) == 1 and table_txt[0] == f'{table_tag}\n':
        table_txt[0] = '^$\n'
    for i, val in enumerate(table_txt):
        # NOTE(review): tag lines are stored with a trailing '\n', so this
        # comparison without '\n' does not match them; they fall through to the
        # stripping statement at the end of the loop body.
        if val == table_tag:
            continue
        if val == '^$\n':
            table_txt[i] = ''
            continue
        # Drop the first character and the last two characters, then re-terminate.
        table_txt[i] = val[1:][:-2] + '\n'
    txt.extend(table_txt)
    return txt
def output_txt_string(txt_path, txt_string):
    '''
    Write the formatted text of a PDF announcement to a UTF-8 plain-text file.

    :param txt_path: path of the plain-text file to create
    :param txt_string: the plain-text string of the PDF announcement
    :return: bool, True when the file was written, False when writing failed
    '''
    try:
        with open(txt_path, "w", encoding='utf-8') as f:
            f.write(txt_string)
    except Exception:
        # Best effort: report the failure through the return value.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return False
    return True
+def refine_table_txt(txt):
+    '''
+    对传入的txt_list再进行针对表头和跨页的优化
+    :param txt: PDF的文本列表，包含PDF的正文文本内容和追加的表格文本内容
+    :return: 一个新的文本列表
+    '''
+    new_txt_list = []
+    j = -1
+    for i, _ in enumerate(txt):
+        if txt[i] == f'{TABLE_SYMBOL}\n':
+            j = i
+            break
+        else:
+            new_txt_list.append(txt[i])
+    table_txt = txt[j:]
+    table_txt = list(filter(None, table_txt))
+    for i, _ in enumerate(table_txt):
+        if table_txt[i] == f'{TABLE_SYMBOL}\n' and i + 2 < len(table_txt):
+            pre_cut = table_txt[i + 1].split(TABLE_CELL_SYMBOL)
+            if (len(pre_cut) == 1) or (len(pre_cut) == 2 and pre_cut[0] == ''):
+                table_txt[i + 1] = ''
+            if '公司及董事会' in table_txt[i + 1]:
+                table_txt[i + 1] = ''
+            if table_txt[i + 2] == f'{TABLE_SYMBOL}\n':
+                table_txt[i] = ''
+                table_txt[i + 2] = table_txt[i + 1]
+                table_txt[i + 1] = f'{TABLE_SYMBOL}\n'
+    table_txt = list(filter(None, table_txt))
+    for i, _ in enumerate(table_txt):
+        if table_txt[i] == f'{TABLE_SYMBOL}\n' and i + 2 < len(table_txt):
+            if '同意' in table_txt[i + 1] and table_txt[i + 1].count('同意') == 2:
+                cut = table_txt[i + 1].split(TABLE_CELL_SYMBOL)
+                for k, val in enumerate(cut):
+                    if val == '同意':
+                        cut[k] += '票数'
+                        cut[k+1] += '比例'
+                    if val == '反对':
+                        cut[k] += '票数'
+                        cut[k+1] += '比例'
+                    if val == '弃权':
+                        cut[k] += '票数'
+                        cut[k+1] += '比例'
+                table_txt[i + 1] = TABLE_CELL_SYMBOL.join(cut).replace(SEGMENT_SYMBOL, '')+SEGMENT_SYMBOL
+                table_txt[i + 2] = ''
+                continue
+            cut1 = table_txt[i + 1].split(TABLE_CELL_SYMBOL)
+            set_cut1 = list(set(cut1))
+            set_cut1.sort(key=cut1.index)
+            set_cut1 = list(filter(None, set_cut1))
+            cut2 = table_txt[i + 2].split(TABLE_CELL_SYMBOL)
+            set_cut2 = list(set(cut2))
+            set_cut2.sort(key=cut2.index)
+            set_cut2 = list(filter(None, set_cut2))
+            head_cut = []
+            counter = 0
+            for val1, val2 in zip(set_cut1, set_cut2):
+                if counter:
+                    if len(set_cut1) > len(set_cut2):
+                        head_cut = set_cut1
+                    else:
+                        head_cut = set_cut2
+                    break
+                if val1 == val2:
+                    counter += 1
+            if counter and head_cut:
+                table_txt[i + 1] = TABLE_CELL_SYMBOL.join(head_cut)
+                table_txt[i + 2] = ''
+            if counter:
+                if i+4 < len(table_txt):
+                    cut3 = table_txt[i + 3].split(TABLE_CELL_SYMBOL)
+                    set_cut3 = list(set(cut3))
+                    set_cut3.sort(key=cut3.index)
+                    set_cut3 = list(filter(None, set_cut3))
+                    flag = False
+                    for val3 in set_cut3:
+                        if re.match(r'^[0-9]+(|.)[0-9]+(|%)$', val3):
+                            flag = True
+                            break
+                    if not flag:
+                        cut4 = table_txt[i + 4].split(TABLE_CELL_SYMBOL)
+                        set_cut4 = list(set(cut4))
+                        set_cut4.sort(key=cut4.index)
+                        set_cut4 = list(filter(None, set_cut4))
+                        counter_2 = 0
+                        for val3, val4 in zip(set_cut3, set_cut4):
+                            if counter_2:
+                                if len(set_cut4) > len(set_cut3):
+                                    head_cut = set_cut4
+                                else:
+                                    head_cut = set_cut3
+                                break
+                            if val3 == val4:
+                                counter_2 += 1
+                        if counter_2 and head_cut:
+                            table_txt[i + 1] = TABLE_CELL_SYMBOL.join(head_cut)
+                            table_txt[i + 2] = ''
+                            table_txt[i + 3] = ''
+                            table_txt[i + 4] = ''
+    for val in table_txt:
+        new_txt_list.append(val)
+    return new_txt_list
+def get_txt_from_pdf(pdf_path, docx_path=''):
+    '''
+    给定一个PDF格式的公告文件，将其转化为格式化的TXT文本字符串
+    :param pdf_path: 一个字符串，PDF文件的路径地址
+    :return: 一个字符串，PDF经转换后的纯文本（已格式化，前部正文，后部表格）
+    '''
+    txt_string = ''
+    ann_info_list = get_ann_info_from_pdf(pdf_path)
+    string_abscissa_list = get_string_and_abscissa_list_from_pdf(pdf_path)
+    document = get_document_from_pdf_converted_docx(pdf_path, docx_path)
+    string_abscissa_dict = {}
+    if ann_info_list != [] and string_abscissa_list != [] and document is not None:
+        abscissa_list = [x[1] for x in string_abscissa_list]
+        min_abscissa_value = get_min_abscissa_value(abscissa_list, len(abscissa_list))
+        for id, item in enumerate(string_abscissa_list):
+            if id > 10:
+                break
+            if item[0].replace('\n', '')[-4:] == '有限公司':
+                break
+            else:
+                string_abscissa_list[id][1] = min_abscissa_value
+        txt_list = []
+        for id, item in enumerate(string_abscissa_list):
+            if (not (len(item[0]) <= 3 and item[0].isdigit())):
+                string_abscissa_dict[item[0]] = item[1]
+                if item[1] > min_abscissa_value:
+                    if abs(item[1]-min_abscissa_value) <= 8:
+                        txt_list.append(item[0])
+                    else:
+                        txt_list.append(SEGMENT_SYMBOL + item[0])
+                else:
+                    txt_list.append(item[0])
+        txt_list = refine_txt_list(txt_list, ann_info_list, string_abscissa_dict)
+        if document is not None:
+            txt_list.append(SEGMENT_SYMBOL)
+            txt_list = append_table_from_docx(doc=document, txt=txt_list)
+        txt_list = refine_table_txt(txt_list)
+        for val in txt_list:
+            txt_string += val
+    return txt_string