File size: 3,259 Bytes
e0169c8
 
 
10ddae5
 
 
360f505
e0169c8
 
d7fdb42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeafaaa
d7fdb42
 
 
eeafaaa
d7fdb42
 
 
eeafaaa
d7fdb42
 
 
 
 
 
eeafaaa
d7fdb42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeafaaa
 
 
 
 
 
 
d7fdb42
 
 
 
 
eeafaaa
 
d7fdb42
eeafaaa
 
 
d7fdb42
 
 
 
 
eeafaaa
d7fdb42
 
 
 
 
 
 
 
 
 
 
 
e0169c8
10ddae5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import re

from bs4 import BeautifulSoup
from markdown import markdown

from settings import *


def split_path(path):
    """Split a filesystem path into a list of its components.

    The path is peeled apart from the right with ``os.path.split`` until
    nothing more can be split off. A remaining root (for example ``"/"``)
    is kept as the first component. Trailing separators are ignored, so
    ``"a/b/"`` yields ``["a", "b"]``.

    Args:
        path: Path string to split.

    Returns:
        List of components ordered from outermost to innermost; an empty
        list for an empty path.
    """
    components = []
    while True:
        head, tail = os.path.split(path)
        if head == path:
            # Nothing more can be split off: this is a root or an empty string.
            if head:
                components.append(head)
            break
        # An empty tail means the path ended with a separator; skip it
        # rather than treating the whole remaining head as one component.
        if tail:
            components.append(tail)
        path = head
    components.reverse()
    return components


def remove_comments(md):
    """Remove HTML comments (``<!-- ... -->``) from a markdown string.

    Each comment is matched non-greedily, so text located between two
    separate comments is preserved. Comments may span several lines.

    Args:
        md: Markdown source text.

    Returns:
        The text with every HTML comment removed.
    """
    return re.sub(r'<!--.*?-->', '', md, flags=re.DOTALL)


# Matches a level 1-3 header ("#", "##" or "###" followed by a whitespace
# character) that has a blank line before and after it. group(1) is the run
# of "#" characters, group(2) is the text that follows on the header line.
# The trailing blank line is only asserted with a lookahead (not consumed) so
# that it can also act as the leading blank line of a header that follows
# immediately, e.g. "# Title\n\n## Section".
header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)(?=\n\s*\n)')


def split_content(content, text_chunk_size=None):
    """Split text into chunks whose length stays around a size limit.

    The text is first cut at blank lines (``'\\n\\n'``). Any paragraph that
    is not shorter than the limit is cut further at single newlines. The
    resulting pieces are then packed greedily, in order, into chunks; every
    piece is followed by ``'\\n\\n'`` inside its chunk.

    A single line that is itself longer than the limit is not broken up and
    ends up in a chunk of its own that exceeds the limit.

    Args:
        content: Text to split.
        text_chunk_size: Maximum chunk length, measured with ``len()``.
            Defaults to ``context_lengths[EMBED_NAME] - 32``.

    Returns:
        Non-empty list of chunk strings.
    """
    if text_chunk_size is None:
        text_chunk_size = context_lengths[EMBED_NAME] - 32

    pieces = []
    for paragraph in content.split('\n\n'):
        if len(paragraph) < text_chunk_size:
            pieces.append(paragraph)
        else:
            # Paragraph too long for one chunk: fall back to line granularity.
            pieces.extend(paragraph.split('\n'))

    chunks = []
    for piece in pieces:
        # Start a new chunk when there is none yet or the piece does not fit;
        # this avoids emitting an empty leading chunk for an oversized piece.
        if chunks and len(chunks[-1]) + len(piece) < text_chunk_size:
            chunks[-1] += piece + '\n\n'
        else:
            chunks.append(piece + '\n\n')

    return chunks


def split_markdown(md):
    """Split a markdown document into chunks prefixed with their headers.

    The document is cut at the headers found by ``header_pattern``. The body
    of every section is size-limited with ``split_content`` and each
    resulting piece becomes one chunk, prefixed with the names of the
    headers it is nested under.

    Only the first header found may be first-level; later first-level
    matches are ignored. When a header starts, the remembered names of all
    deeper levels are cleared so that a section is never labelled with a
    sub-header of a previous section.

    Args:
        md: Markdown source text.

    Returns:
        List of chunk strings; whitespace-only pieces are skipped.
    """
    def construct_chunks(content):
        # Size-limit the section body, then emit one chunk per piece.
        for part in split_content(content):
            construct_chunk(part)

    def construct_chunk(content):
        content = content.strip()
        if not content:
            return

        levels = sorted(name_hierarchy)
        prefix = ''
        for level in levels:
            name = name_hierarchy[level]
            if not name:
                continue
            # Skip a header name that is repeated inside a deeper header
            # name, so the same words are not emitted twice.
            repeated_deeper = any(
                name in name_hierarchy[deeper]
                for deeper in levels
                if deeper > level
            )
            if not repeated_deeper:
                prefix += f'{"#" * (level + 1)}{name}\n\n'

        res.append((prefix + content).strip())

    # to find a header at the top of a file
    md = f'\n\n{md}'
    # only first header can be first-level
    headers = [
        h for idx, h in enumerate(header_pattern.finditer(md))
        if idx == 0 or len(h.group(1)) > 1
    ]

    name_hierarchy = {1: '', 2: '', 3: ''}
    res = []

    if not headers:
        construct_chunks(md)
        return res

    for idx, header in enumerate(headers):
        level = len(header.group(1))
        name_hierarchy[level] = header.group(2).strip()
        # A new header closes every deeper section opened before it.
        for deeper in name_hierarchy:
            if deeper > level:
                name_hierarchy[deeper] = ''

        # NOTE: text before the first header is emitted after that header's
        # name has been recorded, so it is prefixed with that name too.
        if idx == 0 and header.start() != 0:
            construct_chunks(md[:header.start()])

        start = header.end()
        end = headers[idx + 1].start() if idx + 1 < len(headers) else None
        construct_chunks(md[start:end])

    return res


def markdown_to_text(markdown_string):
    """Convert a markdown string to plain text.

    The markdown is rendered to HTML, HTML comments and a ``bash`` language
    tag directly after ``<code>`` are removed, and the text nodes are then
    extracted with BeautifulSoup. Leftover code-fence markers, ``...``
    ellipses and runs of blank lines are cleaned up afterwards.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The plain-text rendering of the input.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove each comment on its own (non-greedy, may span lines) so that
    # text between two separate comments is kept.
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    html = html.replace('<code>bash', '<code>')

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.find_all(string=True))

    # "python" must come before "py": alternatives are tried in order, and
    # matching "py" first would leave "thon" behind.
    text = re.sub(r'```(?:python|py|diff)', '', text)
    text = re.sub('```\n', '\n', text)
    # Drop everything from a dash followed by a long run of spaces to the
    # end of that line.
    text = re.sub('-         .*', '', text)
    text = text.replace('...', '')
    # Collapse two or more consecutive newlines into exactly one blank line.
    text = re.sub(r'\n{2,}', '\n\n', text)

    return text


def md2txt_then_split(md):
    """Convert markdown to plain text, then split that text into chunks."""
    return split_content(markdown_to_text(md))