Spaces:

akazakov
/

rag-gradio-sample-project

Paused

File size: 3,259 Bytes

e0169c8
 
 
10ddae5
 
 
360f505
e0169c8
 
d7fdb42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeafaaa
d7fdb42
 
 
eeafaaa
d7fdb42
 
 
eeafaaa
d7fdb42
 
 
 
 
 
eeafaaa
d7fdb42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeafaaa
 
 
 
 
 
 
d7fdb42
 
 
 
 
eeafaaa
 
d7fdb42
eeafaaa
 
 
d7fdb42
 
 
 
 
eeafaaa
d7fdb42
 
 
 
 
 
 
 
 
 
 
 
e0169c8
10ddae5

import os
import re

from bs4 import BeautifulSoup
from markdown import markdown

from settings import *


def split_path(path):
    """Split a filesystem path into its individual components.

    ``os.path.split`` is applied repeatedly and the pieces are returned in
    left-to-right order. A root prefix (for example ``"/"``) is kept as the
    first component; a trailing separator does not produce a component.

    Args:
        path: The path string to split.

    Returns:
        A list of path components, e.g. ``"/a/b/"`` -> ``["/", "a", "b"]``.
        An empty path yields an empty list.
    """
    components = []
    while True:
        head, tail = os.path.split(path)
        if head == path:
            # os.path.split made no progress: ``path`` is either a root
            # such as "/" or the empty string.
            if head:
                components.append(head)
            break
        # An empty tail only means ``path`` ended with a separator ("a/b/");
        # keep walking instead of emitting "a/b" as a single component.
        if tail:
            components.append(tail)
        path = head
    components.reverse()
    return components


def remove_comments(md):
    """Remove HTML comments (``<!-- ... -->``) from a markdown string.

    Each comment is matched non-greedily, so text located between two
    separate comments is preserved. Comments may span several lines.

    Args:
        md: Markdown (or HTML) source text.

    Returns:
        The text with every complete ``<!-- ... -->`` block removed.
    """
    # DOTALL lets '.' cross newlines; '*?' stops at the first closing '-->'.
    return re.sub(r'<!--.*?-->', '', md, flags=re.DOTALL)


# Matches a markdown header of level 1-3 that is preceded and followed by a
# blank (whitespace-only) line. Group 1 is the run of '#' characters, group 2
# is the rest of the header line.
# NOTE(review): the match consumes the blank line after the header, so with
# non-overlapping iteration a second header separated from the first by only
# one blank line cannot match -- confirm whether that is intended.
header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n')


def split_content(content, text_chunk_size=None):
    """Split text into chunks of roughly ``text_chunk_size`` characters.

    The text is first cut at blank lines (``'\\n\\n'``); a paragraph that is
    not shorter than the limit is cut again at single newlines. The pieces
    are then packed greedily: a piece is added to the current chunk while
    ``len(chunk) + len(piece)`` stays below the limit, otherwise a new chunk
    is started. Every piece is followed by ``'\\n\\n'`` in the output.

    A single line that is longer than the limit is not broken up, so a chunk
    can still exceed the limit.

    Args:
        content: The text to split.
        text_chunk_size: Size limit measured with ``len()``. When omitted it
            defaults to ``context_lengths[EMBED_NAME] - 32`` from settings.

    Returns:
        A list of chunk strings in document order.
    """
    if text_chunk_size is None:
        text_chunk_size = context_lengths[EMBED_NAME] - 32

    pieces = []
    for paragraph in content.split('\n\n'):
        if len(paragraph) < text_chunk_size:
            pieces.append(paragraph)
        else:
            # Paragraph too long for one chunk: fall back to single lines.
            pieces.extend(paragraph.split('\n'))

    chunks = []
    for piece in pieces:
        # Start from an empty list (not ['']) so that an oversized first
        # piece does not leave a spurious empty chunk at the front.
        if chunks and len(chunks[-1]) + len(piece) < text_chunk_size:
            chunks[-1] += piece + '\n\n'
        else:
            chunks.append(piece + '\n\n')

    return chunks


def split_markdown(md):
    """Split a markdown document into header-prefixed text chunks.

    The document is cut at the headers found by ``header_pattern`` (levels
    1-3; a first-level header is only honoured if it is the first header).
    The text of each section is split further by ``split_content`` and every
    non-empty piece is prefixed with the names of the headers it sits under.

    Args:
        md: Markdown source text.

    Returns:
        A list of stripped, non-empty chunk strings in document order.
    """
    def construct_chunks(content):
        # Cut one section's text into size-limited pieces and emit each.
        for part in split_content(content):
            construct_chunk(part)

    def construct_chunk(content):
        content = content.strip()
        if not content:
            return

        chunk = ''
        for level in sorted(name_hierarchy):
            name = name_hierarchy[level]
            if not name:
                continue
            # Leave out a header whose name already appears inside the name
            # of a deeper header, to avoid repeating it.
            repeated = any(
                name in name_hierarchy[deeper]
                for deeper in name_hierarchy
                if deeper > level
            )
            if not repeated:
                # NOTE(review): this writes level + 1 '#' characters and no
                # space before the name -- confirm this format is intended.
                chunk += f'{"#" * (level + 1)}{name}\n\n'

        chunk += content
        chunk = chunk.strip()
        res.append(chunk)

    # to find a header at the top of a file
    md = f'\n\n{md}'
    headers = list(header_pattern.finditer(md))
    # only first header can be first-level
    headers = [h for i, h in enumerate(headers) if i == 0 or len(h.group(1)) > 1]

    name_hierarchy = {i: '' for i in (1, 2, 3)}
    res = []
    for i, header in enumerate(headers):
        level = len(header.group(1))
        name_hierarchy[level] = header.group(2).strip()
        # A new header closes every deeper section; without this reset the
        # chunks below would be labelled with a stale sub-header name.
        for deeper in name_hierarchy:
            if deeper > level:
                name_hierarchy[deeper] = ''

        # Text before the first header is emitted as well; it is labelled
        # with that header's name because the hierarchy is already updated.
        if i == 0 and header.start() != 0:
            construct_chunks(md[:header.start()])

        start = header.end()
        end = headers[i + 1].start() if i + 1 < len(headers) else None
        construct_chunks(md[start:end])

    # No headers at all: the whole document is one section.
    if not headers:
        construct_chunks(md)

    return res


def markdown_to_text(markdown_string):
    """Convert a markdown string to plain text.

    The markdown is rendered to HTML, HTML comments are removed, and the text
    nodes are extracted with BeautifulSoup. Leftover code-fence markers and a
    few other artefacts are then stripped, and runs of blank lines are
    collapsed to a single blank line.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The extracted plain text.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Non-greedy match so that text between two separate comments survives;
    # DOTALL lets a comment span several lines.
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    html = re.sub('<code>bash', '<code>', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.find_all(string=True))

    # Drop opening code fences tagged py/diff/python, then remaining fences
    # that are followed by a newline.
    text = re.sub('```(py|diff|python)', '', text)
    text = re.sub('```\n', '\n', text)
    # Remove everything from a '-' followed by a run of spaces to the end of
    # the line (presumably removed lines of a diff -- TODO confirm).
    text = re.sub('-         .*', '', text)
    text = text.replace('...', '')
    # Collapse two or more consecutive newlines into exactly two.
    text = re.sub('\n(\n)+', '\n\n', text)

    return text


def md2txt_then_split(md):
    """Convert markdown to plain text and split the result into chunks.

    Args:
        md: Markdown source text.

    Returns:
        The list produced by ``split_content`` for the text that
        ``markdown_to_text`` extracts from ``md``.
    """
    return split_content(markdown_to_text(md))