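"""
Build embeddings for the Blender manual and run a semantic search over them.

The script walks the manual's reStructuredText sources starting at
index.rst, strips RST markup, splits pages that exceed the embedding
model's maximum sequence length into chunks, encodes every chunk with the
EMBEDDING_CTX model imported from routers.tool_find_related and, when run
directly, prints the manual pages that best match an example query.
"""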
import os
import sys
import re
from sentence_transformers import util

# Make the parent directory importable so that `routers.tool_find_related`
# below can be resolved.
script_dir = os.path.dirname(os.path.realpath(__file__))
parent_dir = os.path.dirname(script_dir)
sys.path.append(parent_dir)

# autopep8: off
from routers.tool_find_related import EMBEDDING_CTX
# autopep8: on

# Local checkout of the Blender manual's RST sources (adjust to your setup)
# and the base URL of the published manual.
MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
BASE_URL = "https://docs.blender.org/manual/en/dev"


def process_text(text):
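    """
    Strip common reStructuredText markup from `text`: runs of punctuation
    used for heading underlines, directive/target markers (".. name") and
    roles (":name:"), then collapse consecutive blank lines into single
    newlines.
    """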
    # Remove repeated characters
    text = re.sub(r'%{2,}', '', text)
    text = re.sub(r'#{2,}', '', text)
    text = re.sub(r'={3,}', '', text)
    text = re.sub(r'\*{3,}', '', text)
    text = re.sub(r'\^{3,}', '', text)
    text = re.sub(r'-{3,}', '', text)

    # Remove patterns ".. word:: " and ":word:"
    text = re.sub(r'\.\. \S+', '', text)
    text = re.sub(r':\w+:', '', text)

    text = re.sub(r'(\s*\n\s*)+', '\n', text)
    return text


def parse_file(filedir, filename):
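    """
    Parse one RST file into a dict with its cleaned text ('processed_text')
    and token count ('n_tokens'). For index.rst files, everything after the
    first ".. toctree::" directive is treated as a table of contents and
    each referenced page is parsed recursively into the 'toctree' entry.
    """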
    with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
        content = file.read()

    parsed_data = {}

    if not filename.endswith('index.rst'):
        body = content.strip()
    else:
        parts = content.split(".. toctree::")
        body = parts[0].strip()

        if len(parts) > 1:
            parsed_data["toctree"] = {}
            for part in parts[1:]:
                toctree_entries = part.split('\n')
                # The first element is the remainder of the ".. toctree::"
                # line itself; the actual entries follow on the next lines.
                for entry in toctree_entries[1:]:
                    entry = entry.strip()
                    if not entry:
                        continue

                    if entry.startswith('/'):
                        # Path given from the manual root rather than the
                        # current directory; not handled here.
                        continue

                    if not entry.endswith('.rst'):
                        continue

                    if entry.endswith('/index.rst'):
                        entry_name = entry[:-10]
                        filedir_ = os.path.join(filedir, entry_name)
                        filename_ = 'index.rst'
                    else:
                        entry_name = entry[:-4]
                        filedir_ = filedir
                        filename_ = entry

                    parsed_data['toctree'][entry_name] = parse_file(
                        filedir_, filename_)

    processed_text = process_text(body)
    tokens = EMBEDDING_CTX.model.tokenizer.tokenize(processed_text)
    # Texts longer than the model's maximum sequence length are not
    # truncated here; `get_texts` splits them into chunks later.
    # parsed_data['body'] = body
    parsed_data['processed_text'] = processed_text
    parsed_data['n_tokens'] = len(tokens)

    return parsed_data


# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens):
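    """
    Split `text` into chunks of at most `max_tokens` tokens, breaking on
    paragraph boundaries ('.' followed by a newline). Returns a list of
    (chunk_text, n_tokens) tuples; paragraphs that alone exceed
    `max_tokens` are skipped.
    """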

    # Split the text into paragraphs ('.' followed by a newline)
    paragraphs = text.split('.\n')

    # Get the number of tokens for each paragraph
    n_tokens = [len(EMBEDDING_CTX.model.tokenizer.tokenize(" " + paragraph))
                for paragraph in paragraphs]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the paragraphs and their token counts joined in a tuple
    for paragraph, token_count in zip(paragraphs, n_tokens):

        # If the number of tokens so far plus the number of tokens in the
        # current paragraph is greater than the max number of tokens, then
        # add the chunk to the list of chunks and reset the chunk and the
        # running token count
        if tokens_so_far + token_count > max_tokens:
            chunks.append((".\n".join(chunk) + ".", tokens_so_far))
            chunk = []
            tokens_so_far = 0

        # If the current paragraph alone is greater than the max number of
        # tokens, skip it
        if token_count > max_tokens:
            continue

        # Otherwise, add the paragraph to the chunk and add its token count
        # (plus one for the '.\n' separator) to the running total
        chunk.append(paragraph)
        tokens_so_far += token_count + 1

    if chunk:
        chunks.append((".\n".join(chunk) + ".", tokens_so_far))

    return chunks


def get_texts(data, path):
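    """
    Return [path, text] pairs for the page reached by `path` and for all of
    its toctree descendants. Pages longer than the model's maximum sequence
    length are split into chunks, and each chunk is prefixed with as much
    ancestor page text as still fits within that limit.
    """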
    result = []
    processed_texts = [data['processed_text']]
    processed_tokens = [data['n_tokens']]
    max_tokens = EMBEDDING_CTX.model.max_seq_length

    data_ = data
    for key in path:
        data_ = data_['toctree'][key]
        processed_texts.append(data_['processed_text'])
        processed_tokens.append(data_['n_tokens'])

    if processed_tokens[-1] > max_tokens:
        chunks = split_into_many(processed_texts[-1], max_tokens)
    else:
        chunks = [(processed_texts[-1], processed_tokens[-1])]

    for text, n_tokens in chunks:
        # Add context to the text if we have space
        for i in range(len(processed_texts) - 2, -1, -1):
            n_tokens_parent = processed_tokens[i]
            if n_tokens + n_tokens_parent >= max_tokens:
                break

            text_parent = processed_texts[i]
            text = text_parent + '\n' + text
            n_tokens += n_tokens_parent

        result.append([path, text])

    try:
        for key in data_['toctree'].keys():
            result.extend(get_texts(data, path + [key]))
    except KeyError:
        pass

    return result


def _sort_similarity(chunks, embeddings, text_to_search, limit):
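    """
    Encode `text_to_search` and rank the precomputed chunk `embeddings`
    against it with a dot-product semantic search, returning the paths of
    the `limit` best-matching chunks.
    """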
    results = []

    query_emb = EMBEDDING_CTX.encode([text_to_search])
    ret = util.semantic_search(
        query_emb, embeddings, top_k=limit, score_function=util.dot_score)

    for match in ret[0]:
        corpus_id = match['corpus_id']
        chunk = chunks[corpus_id]
        path = chunk[0]
        results.append(path)

    return results


if __name__ == '__main__':
    # path = 'addons/3d_view'
    data = parse_file(MANUAL_DIR, 'index.rst')
    data['toctree']["copyright"] = parse_file(MANUAL_DIR, 'copyright.rst')

    # Collect [path, text] chunks for every page in the manual tree
    chunks = []
    chunks.extend(get_texts(data, []))

    embeddings = EMBEDDING_CTX.encode([text for path, text in chunks])

    result = _sort_similarity(chunks, embeddings, "Set Snap Base", 50)
    print(result)