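"""
Build embeddings for the Blender manual and run a semantic search over them.

The script walks the manual's reStructuredText sources starting at
index.rst, strips RST markup, splits pages that exceed the embedding
model's maximum sequence length into chunks, encodes every chunk with the
EMBEDDING_CTX model imported from routers.tool_find_related and, when run
directly, prints the manual pages that best match an example query.
"""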
import os
import sys
import re
from sentence_transformers import util

# Make the parent directory importable so that `routers.tool_find_related`
# below can be resolved.
script_dir = os.path.dirname(os.path.realpath(__file__))
parent_dir = os.path.dirname(script_dir)
sys.path.append(parent_dir)

# autopep8: off
from routers.tool_find_related import EMBEDDING_CTX
# autopep8: on

# Local checkout of the Blender manual's RST sources (adjust to your setup)
# and the base URL of the published manual.
MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
BASE_URL = "https://docs.blender.org/manual/en/dev"


def process_text(text):
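    """
    Strip common reStructuredText markup from `text`: runs of punctuation
    used for heading underlines, directive/target markers (".. name") and
    roles (":name:"), then collapse consecutive blank lines into single
    newlines.
    """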
    # Remove repeated characters
    text = re.sub(r'%{2,}', '', text)
    text = re.sub(r'#{2,}', '', text)
    text = re.sub(r'={3,}', '', text)
    text = re.sub(r'\*{3,}', '', text)
    text = re.sub(r'\^{3,}', '', text)
    text = re.sub(r'-{3,}', '', text)

    # Remove patterns ".. word:: " and ":word:"
    text = re.sub(r'\.\. \S+', '', text)
    text = re.sub(r':\w+:', '', text)

    text = re.sub(r'(\s*\n\s*)+', '\n', text)
    return text


def parse_file(filedir, filename):
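    """
    Parse one RST file into a dict with its cleaned text ('processed_text')
    and token count ('n_tokens'). For index.rst files, everything after the
    first ".. toctree::" directive is treated as a table of contents and
    each referenced page is parsed recursively into the 'toctree' entry.
    """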
    with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
        content = file.read()

    parsed_data = {}

    if not filename.endswith('index.rst'):
        body = content.strip()
    else:
        parts = content.split(".. toctree::")
        body = parts[0].strip()

        if len(parts) > 1:
            parsed_data["toctree"] = {}
            for part in parts[1:]:
                toctree_entries = part.split('\n')
                # The first element is the remainder of the ".. toctree::"
                # line itself; the actual entries follow on the next lines.
                for entry in toctree_entries[1:]:
                    entry = entry.strip()
                    if not entry:
                        continue

                    if entry.startswith('/'):
                        # Path given from the manual root rather than the
                        # current directory; not handled here.
                        continue

                    if not entry.endswith('.rst'):
                        continue

                    if entry.endswith('/index.rst'):
                        entry_name = entry[:-10]
                        filedir_ = os.path.join(filedir, entry_name)
                        filename_ = 'index.rst'
                    else:
                        entry_name = entry[:-4]
                        filedir_ = filedir
                        filename_ = entry

                    parsed_data['toctree'][entry_name] = parse_file(
                        filedir_, filename_)

    processed_text = process_text(body)
    tokens = EMBEDDING_CTX.model.tokenizer.tokenize(processed_text)
    # Texts longer than the model's maximum sequence length are not
    # truncated here; `get_texts` splits them into chunks later.
    # parsed_data['body'] = body
    parsed_data['processed_text'] = processed_text
    parsed_data['n_tokens'] = len(tokens)

    return parsed_data


# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens):
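    """
    Split `text` into chunks of at most `max_tokens` tokens, breaking on
    paragraph boundaries ('.' followed by a newline). Returns a list of
    (chunk_text, n_tokens) tuples; paragraphs that alone exceed
    `max_tokens` are skipped.
    """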

    # Split the text into paragraphs ('.' followed by a newline)
    paragraphs = text.split('.\n')

    # Get the number of tokens for each paragraph
    n_tokens = [len(EMBEDDING_CTX.model.tokenizer.tokenize(" " + paragraph))
                for paragraph in paragraphs]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the paragraphs and their token counts joined in a tuple
    for paragraph, token_count in zip(paragraphs, n_tokens):

        # If the number of tokens so far plus the number of tokens in the
        # current paragraph is greater than the max number of tokens, then
        # add the chunk to the list of chunks and reset the chunk and the
        # running token count
        if tokens_so_far + token_count > max_tokens:
            chunks.append((".\n".join(chunk) + ".", tokens_so_far))
            chunk = []
            tokens_so_far = 0

        # If the current paragraph alone is greater than the max number of
        # tokens, skip it
        if token_count > max_tokens:
            continue

        # Otherwise, add the paragraph to the chunk and add its token count
        # (plus one for the '.\n' separator) to the running total
        chunk.append(paragraph)
        tokens_so_far += token_count + 1

    if chunk:
        chunks.append((".\n".join(chunk) + ".", tokens_so_far))

    return chunks


def get_texts(data, path):
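    """
    Return [path, text] pairs for the page reached by `path` and for all of
    its toctree descendants. Pages longer than the model's maximum sequence
    length are split into chunks, and each chunk is prefixed with as much
    ancestor page text as still fits within that limit.
    """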
    result = []
    processed_texts = [data['processed_text']]
    processed_tokens = [data['n_tokens']]
    max_tokens = EMBEDDING_CTX.model.max_seq_length

    data_ = data
    for key in path:
        data_ = data_['toctree'][key]
        processed_texts.append(data_['processed_text'])
        processed_tokens.append(data_['n_tokens'])

    if processed_tokens[-1] > max_tokens:
        chunks = split_into_many(processed_texts[-1], max_tokens)
    else:
        chunks = [(processed_texts[-1], processed_tokens[-1])]

    for text, n_tokens in chunks:
        # Add context to the text if we have space
        for i in range(len(processed_texts) - 2, -1, -1):
            n_tokens_parent = processed_tokens[i]
            if n_tokens + n_tokens_parent >= max_tokens:
                break

            text_parent = processed_texts[i]
            text = text_parent + '\n' + text
            n_tokens += n_tokens_parent

        result.append([path, text])

    try:
        for key in data_['toctree'].keys():
            result.extend(get_texts(data, path + [key]))
    except KeyError:
        pass

    return result


def _sort_similarity(chunks, embeddings, text_to_search, limit):
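    """
    Encode `text_to_search` and rank the precomputed chunk `embeddings`
    against it with a dot-product semantic search, returning the paths of
    the `limit` best-matching chunks.
    """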
    results = []

    query_emb = EMBEDDING_CTX.encode([text_to_search])
    ret = util.semantic_search(
        query_emb, embeddings, top_k=limit, score_function=util.dot_score)

    for match in ret[0]:
        corpus_id = match['corpus_id']
        chunk = chunks[corpus_id]
        path = chunk[0]
        results.append(path)

    return results


if __name__ == '__main__':
    # path = 'addons/3d_view'
    data = parse_file(MANUAL_DIR, 'index.rst')
    data['toctree']["copyright"] = parse_file(MANUAL_DIR, 'copyright.rst')

    # Collect [path, text] chunks for every page in the manual tree
    chunks = []
    chunks.extend(get_texts(data, []))

    embeddings = EMBEDDING_CTX.encode([text for path, text in chunks])

    result = _sort_similarity(chunks, embeddings, "Set Snap Base", 50)
    print(result)