AlexanderKazakov
configurable chunking and embedding
10ddae5
import os
import re
from bs4 import BeautifulSoup
from markdown import markdown
from settings import *
def split_path(path):
    """Split a filesystem path into its individual components.

    A root (e.g. ``'/'``) is kept as the first component. Trailing
    separators are ignored, so ``'a/b/'`` yields ``['a', 'b']``.

    Args:
        path: Path string to split.

    Returns:
        List of path components ordered from the outermost to the innermost.
        An empty path gives an empty list.
    """
    components = []
    while path:
        head, tail = os.path.split(path)
        if head == path:
            # os.path.split can no longer shorten the path: this is the root.
            components.append(head)
            break
        # An empty tail only means the path ended with a separator; keep going.
        if tail:
            components.append(tail)
        path = head
    components.reverse()
    return components
def remove_comments(md):
    """Remove HTML comments (``<!-- ... -->``) from a markdown string.

    Each comment is removed individually, so text located between two
    separate comments is preserved. Comments may span several lines.

    Args:
        md: Markdown (or HTML) text.

    Returns:
        The text with every complete comment removed.
    """
    # Non-greedy match: stop at the first closing marker instead of the last.
    return re.sub(r'<!--.*?-->', '', md, flags=re.DOTALL)
# Matches a markdown header of level 1-3 that has a blank (whitespace-only)
# line both before and after it. group(1) is the run of '#' characters and
# group(2) is the rest of the header line. The surrounding blank lines are
# part of the match, so match.start()/match.end() include them.
header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n')
def split_content(content):
    """Group the paragraphs of a text into chunks bounded by the embedder's limit.

    The limit is ``context_lengths[EMBED_NAME] - 32`` characters. The text is
    cut on blank lines; a paragraph that reaches the limit is cut further on
    single newlines. Pieces are then appended to the current chunk while the
    accumulated length stays under the limit, otherwise a new chunk is started.

    Args:
        content: Text to split.

    Returns:
        List of chunk strings; every piece in a chunk is followed by ``'\\n\\n'``.
        The first element may be an empty string when the first piece does
        not fit under the limit.
    """
    limit = context_lengths[EMBED_NAME] - 32

    # First pass: paragraphs, with oversized ones broken into single lines.
    pieces = []
    for paragraph in content.split('\n\n'):
        if len(paragraph) >= limit:
            pieces.extend(paragraph.split('\n'))
        else:
            pieces.append(paragraph)

    # Second pass: pack the pieces into chunks.
    chunks = ['']
    for piece in pieces:
        terminated = piece + '\n\n'
        if len(chunks[-1]) + len(piece) >= limit:
            chunks.append(terminated)
        else:
            chunks[-1] += terminated
    return chunks
def split_markdown(md):
    """Split a markdown document into chunks prefixed with their section titles.

    The document is cut at every header matched by ``header_pattern``. The
    text of each section is divided by ``split_content`` and every non-empty
    piece is emitted with the titles of the currently active headers in front.

    Args:
        md: Markdown source text.

    Returns:
        List of stripped chunk strings in document order.
    """
    def construct_chunks(content):
        # A section can be longer than one chunk, so let split_content cut it.
        for part in split_content(content):
            construct_chunk(part)

    def construct_chunk(content):
        content = content.strip()
        if not content:
            return

        chunk = ''
        for level in sorted(name_hierarchy):
            name = name_hierarchy[level]
            if not name:
                continue
            # Skip a title whose text already appears inside a deeper title.
            if any(name in name_hierarchy[deeper]
                   for deeper in name_hierarchy if deeper > level):
                continue
            chunk += f'{"#" * (level + 1)}{name}\n\n'

        res.append((chunk + content).strip())

    # to find a header at the top of a file
    md = f'\n\n{md}'
    headers = list(header_pattern.finditer(md))
    # only first header can be first-level
    headers = [h for i, h in enumerate(headers) if i == 0 or len(h.group(1)) > 1]
    name_hierarchy = {i: '' for i in (1, 2, 3)}
    res = []

    if not headers:
        construct_chunks(md)
        return res

    for i, header in enumerate(headers):
        level = len(header.group(1))
        name_hierarchy[level] = header.group(2).strip()
        # A new header ends every deeper section; without this reset, chunks
        # would keep the title of a subsection from the previous section.
        for deeper in name_hierarchy:
            if deeper > level:
                name_hierarchy[deeper] = ''

        # Text located before the first header is emitted as well (the first
        # header's title is already registered at this point).
        if i == 0 and header.start() != 0:
            construct_chunks(md[:header.start()])

        start = header.end()
        end = headers[i + 1].start() if i + 1 < len(headers) else None
        construct_chunks(md[start:end])

    return res
def markdown_to_text(markdown_string):
    """Convert a markdown string to plain text.

    The markdown is rendered to HTML, HTML comments are removed, and the text
    nodes are extracted with BeautifulSoup. The result is then cleaned up:
    leftover code-fence markers, everything from ``'- '`` to the end of a
    line, and ``'...'`` are removed, and runs of blank lines are collapsed
    into a single blank line.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The plain-text rendering of the input.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)
    # Non-greedy so each comment is removed on its own; a greedy match would
    # also delete the content between the first and the last comment.
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    html = re.sub('<code>bash', '<code>', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    # find_all is the current name of the deprecated findAll alias.
    text = ''.join(soup.find_all(string=True))

    text = re.sub('```(py|diff|python)', '', text)
    text = re.sub('```\n', '\n', text)
    text = re.sub('- .*', '', text)
    text = text.replace('...', '')
    text = re.sub('\n(\n)+', '\n\n', text)
    return text
def md2txt_then_split(md):
    """Convert markdown to plain text and split the result into chunks.

    Args:
        md: Markdown source text.

    Returns:
        The list of chunks produced by ``split_content`` for the plain text
        returned by ``markdown_to_text``.
    """
    return split_content(markdown_to_text(md))