File size: 3,259 Bytes
e0169c8 10ddae5 360f505 e0169c8 d7fdb42 eeafaaa d7fdb42 eeafaaa d7fdb42 eeafaaa d7fdb42 eeafaaa d7fdb42 eeafaaa d7fdb42 eeafaaa d7fdb42 eeafaaa d7fdb42 eeafaaa d7fdb42 e0169c8 10ddae5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import os
import re
from bs4 import BeautifulSoup
from markdown import markdown
from settings import *
def split_path(path):
    """Split a filesystem path into its individual components.

    A leading root (e.g. ``/``) is kept as the first component. Trailing
    separators are ignored, so ``"a/b/"`` and ``"a/b"`` give the same result.

    Args:
        path: Path string to split.

    Returns:
        A list of components ordered from the outermost directory to the
        final name. An empty path yields an empty list.
    """
    components = []
    while True:
        head, tail = os.path.split(path)
        if head == path:
            # os.path.split can no longer shorten the path: this is either
            # the root (kept as a component) or the empty string (dropped).
            if head:
                components.append(head)
            break
        # An empty tail comes from a trailing separator; skip it and keep
        # splitting instead of treating the whole remainder as one component.
        if tail:
            components.append(tail)
        path = head
    components.reverse()
    return components
def remove_comments(md):
    """Remove every HTML comment (``<!-- ... -->``) from a markdown string.

    Args:
        md: Markdown (or HTML) text.

    Returns:
        The text with each comment removed; text between comments is kept.
    """
    # Non-greedy so that each comment is removed on its own; a greedy match
    # would swallow everything between the first '<!--' and the last '-->'.
    # DOTALL lets '.' cross newlines for multi-line comments.
    return re.sub(r'<!--.*?-->', '', md, flags=re.DOTALL)
# Matches a markdown header line: one to three '#', one whitespace character,
# then the title text, with at least one blank (whitespace-only) line before
# and after it. Group 1 is the run of '#', group 2 is the title text.
# NOTE(review): the match consumes the blank lines on both sides, so with
# non-overlapping scanning (finditer) a header separated from the previous
# header by only a single blank line appears not to be matched - confirm
# whether that is intended.
header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n')
def split_content(content, text_chunk_size=None):
    """Split text into chunks whose length stays around a size limit.

    The text is first cut at blank lines. A paragraph that is not shorter
    than the limit is cut further into single lines. The resulting pieces are
    then packed greedily: a piece joins the current chunk while the combined
    length stays below the limit, otherwise it starts a new chunk. A single
    line longer than the limit is kept whole rather than cut mid-line.

    Args:
        content: Text to split.
        text_chunk_size: Maximum chunk length in characters. Defaults to
            ``context_lengths[EMBED_NAME] - 32``.

    Returns:
        A non-empty list of chunks. Every piece inside a chunk is followed by
        a blank line (``'\\n\\n'``).
    """
    if text_chunk_size is None:
        text_chunk_size = context_lengths[EMBED_NAME] - 32

    pieces = []
    for paragraph in content.split('\n\n'):
        if len(paragraph) < text_chunk_size:
            pieces.append(paragraph)
        else:
            pieces.extend(paragraph.split('\n'))

    chunks = []
    for piece in pieces:
        # Start from an empty list instead of [''] so that no spurious empty
        # chunk is returned when the very first piece does not fit.
        if chunks and len(chunks[-1]) + len(piece) < text_chunk_size:
            chunks[-1] += piece + '\n\n'
        else:
            chunks.append(piece + '\n\n')
    return chunks
def split_markdown(md):
    """Split a markdown document into chunks prefixed with their headers.

    The document is cut at the headers found by ``header_pattern``. Each
    section body is divided with ``split_content`` and every non-empty piece
    is prefixed with the names of the headers it sits under.

    Args:
        md: Markdown source text.

    Returns:
        A list of stripped chunk strings in document order.
    """
    def construct_chunks(content):
        # Break one section body into pieces and emit a chunk for each.
        for part in split_content(content):
            construct_chunk(part)

    def construct_chunk(content):
        # Emit one chunk: the current header names followed by the content.
        content = content.strip()
        if not content:
            return
        chunk = ''
        for i in sorted(name_hierarchy):
            name = name_hierarchy[i]
            if not name:
                continue
            # Skip a header whose name is already contained in a deeper
            # header's name, so the same words are not repeated.
            if any(name in name_hierarchy[j] for j in name_hierarchy if j > i):
                continue
            # NOTE(review): a level-i header is rendered with i + 1 '#' and
            # no space before the name; kept as-is, confirm it is intended.
            chunk += f'{"#" * (i + 1)}{name}\n\n'
        chunk += content
        chunk = chunk.strip()
        res.append(chunk)

    # to find a header at the top of a file
    md = f'\n\n{md}'
    headers = list(header_pattern.finditer(md))
    # only first header can be first-level
    headers = [h for i, h in enumerate(headers) if i == 0 or len(h.group(1)) > 1]
    name_hierarchy = {i: '' for i in (1, 2, 3)}
    res = []
    for i, header in enumerate(headers):
        level = len(header.group(1))
        name_hierarchy[level] = header.group(2).strip()
        # A new header ends every deeper sub-section; clear their names so a
        # stale sub-header from the previous section is not attached to (or
        # used to suppress headers of) the chunks of this section.
        for deeper in name_hierarchy:
            if deeper > level:
                name_hierarchy[deeper] = ''
        # Text before the first header is emitted with that header's name
        # already recorded in the hierarchy.
        if i == 0 and header.start() != 0:
            construct_chunks(md[:header.start()])
        start = header.end()
        end = headers[i + 1].start() if i + 1 < len(headers) else None
        construct_chunks(md[start:end])
    if not headers:
        construct_chunks(md)
    return res
def markdown_to_text(markdown_string):
    """Convert a markdown string to plain text.

    The markdown is rendered to HTML, the text nodes are extracted with
    BeautifulSoup, and leftover markup is cleaned up with regular expressions.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The extracted text, with runs of blank lines collapsed to one.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)
    # Non-greedy + DOTALL: remove each (possibly multi-line) comment on its
    # own instead of everything between the first '<!--' and the last '-->'.
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    html = re.sub('<code>bash', '<code>', html)
    # extract text
    soup = BeautifulSoup(html, "html.parser")
    # find_all is the current name of the deprecated findAll alias.
    text = ''.join(soup.find_all(string=True))
    # 'python' must come before 'py' in the alternation, otherwise the
    # shorter alternative matches first and leaves 'thon' behind.
    text = re.sub('```(python|py|diff)', '', text)
    text = re.sub('```\n', '\n', text)
    # Drops everything from a '- ' to the end of that line.
    text = re.sub('- .*', '', text)
    text = text.replace('...', '')
    # Collapse two or more consecutive newlines into exactly one blank line.
    text = re.sub('\n(\n)+', '\n\n', text)
    return text
def md2txt_then_split(md):
    """Convert markdown to plain text, then split that text into chunks.

    Args:
        md: Markdown source text.

    Returns:
        The list of chunks produced by ``split_content`` for the plain text
        returned by ``markdown_to_text``.
    """
    return split_content(markdown_to_text(md))
|