tools/utils/generate_blender_doc.py
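"""Utility script to build and query embeddings for the Blender manual.

Walks the manual's reStructuredText sources starting at ``index.rst``
(following ``.. toctree::`` directives), strips RST markup, splits pages
that exceed the embedding model's sequence length into smaller chunks,
encodes every chunk with the shared ``EMBEDDING_CTX`` model and runs a
sample semantic search ("Set Snap Base") over the result.

Point ``MANUAL_DIR`` at a local copy of the manual sources before running.
"""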
import os
import sys
import re

from sentence_transformers import util

script_dir = os.path.dirname(os.path.realpath(__file__))
parent_dir = os.path.dirname(script_dir)
sys.path.append(parent_dir)

# autopep8: off
from routers.tool_find_related import EMBEDDING_CTX
# autopep8: on

MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
BASE_URL = "https://docs.blender.org/manual/en/dev"


def process_text(text):
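    """Strip common RST markup (repeated ruler characters, ``.. directive``
    patterns and ``:role:`` markers) from ``text`` and collapse runs of
    blank lines into a single newline."""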
    # Remove repeated characters
    text = re.sub(r'%{2,}', '', text)
    text = re.sub(r'#{2,}', '', text)
    text = re.sub(r'={3,}', '', text)
    text = re.sub(r'\*{3,}', '', text)
    text = re.sub(r'\^{3,}', '', text)
    text = re.sub(r'-{3,}', '', text)

    # Remove patterns ".. word:: " and ":word:"
    text = re.sub(r'\.\. \S+', '', text)
    text = re.sub(r':\w+:', '', text)
    text = re.sub(r'(\s*\n\s*)+', '\n', text)
    return text


def parse_file(filedir, filename):
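    """Parse a manual ``.rst`` file into a dict with its processed text and
    token count.

    For ``index.rst`` files, every entry listed under a ``.. toctree::``
    directive is parsed recursively into ``parsed_data['toctree']``.
    Pages longer than the model's sequence length are not split here;
    that happens later in ``get_texts``.
    """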
    with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
        content = file.read()

    parsed_data = {}

    if not filename.endswith('index.rst'):
        body = content.strip()
    else:
        parts = content.split(".. toctree::")
        body = parts[0].strip()

        if len(parts) > 1:
            parsed_data["toctree"] = {}
            for part in parts[1:]:
                toctree_entries = part.split('\n')
                line = toctree_entries[0]
                for entry in toctree_entries[1:]:
                    entry = entry.strip()
                    if not entry:
                        continue
                    if entry.startswith('/'):
                        # relative path.
                        continue
                    if not entry.endswith('.rst'):
                        continue
                    if entry.endswith('/index.rst'):
                        entry_name = entry[:-10]
                        filedir_ = os.path.join(filedir, entry_name)
                        filename_ = 'index.rst'
                    else:
                        entry_name = entry[:-4]
                        filedir_ = filedir
                        filename_ = entry
                    parsed_data['toctree'][entry_name] = parse_file(
                        filedir_, filename_)

    processed_text = process_text(body)
    tokens = EMBEDDING_CTX.model.tokenizer.tokenize(processed_text)
    if len(tokens) > EMBEDDING_CTX.model.max_seq_length:
        pass

    # parsed_data['body'] = body
    parsed_data['processed_text'] = processed_text
    parsed_data['n_tokens'] = len(tokens)

    return parsed_data


# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens):
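    """Split ``text`` into chunks of at most ``max_tokens`` tokens, cutting
    at paragraph boundaries (a ``.`` followed by a newline). Returns a list
    of ``(chunk_text, n_tokens)`` tuples; paragraphs that exceed
    ``max_tokens`` on their own are skipped."""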
    # Split the text into sentences
    paragraphs = text.split('.\n')

    # Get the number of tokens for each sentence
    n_tokens = [len(EMBEDDING_CTX.model.tokenizer.tokenize(" " + sentence))
                for sentence in paragraphs]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(paragraphs, n_tokens):
        # If the number of tokens so far plus the number of tokens in the
        # current sentence is greater than the max number of tokens, then
        # add the chunk to the list of chunks and reset the chunk and
        # tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append((".\n".join(chunk) + ".", tokens_so_far))
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than
        # the max number of tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of
        # tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    if chunk:
        chunks.append((".\n".join(chunk) + ".", tokens_so_far))

    return chunks


def get_texts(data, path):
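    """Recursively collect ``[path, text]`` pairs for the page addressed by
    ``path`` (a list of toctree keys) and for all of its toctree children.
    Oversized pages are split into chunks, and each chunk is prefixed with
    as much ancestor-page text as still fits in the model's sequence
    length."""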
    result = []
    processed_texts = [data['processed_text']]
    processed_tokens = [data['n_tokens']]
    max_tokens = EMBEDDING_CTX.model.max_seq_length

    data_ = data
    for key in path:
        data_ = data_['toctree'][key]
        processed_texts.append(data_['processed_text'])
        processed_tokens.append(data_['n_tokens'])

    if processed_tokens[-1] > max_tokens:
        chunks = split_into_many(processed_texts[-1], max_tokens)
    else:
        chunks = [(processed_texts[-1], processed_tokens[-1])]

    for text, n_tokens in chunks:
        # Add context to the text if we have space
        for i in range(len(processed_texts) - 2, -1, -1):
            n_tokens_parent = processed_tokens[i]
            if n_tokens + n_tokens_parent >= max_tokens:
                break

            text_parent = processed_texts[i]
            text = text_parent + '\n' + text
            n_tokens += n_tokens_parent

        result.append([path, text])

    try:
        for key in data_['toctree'].keys():
            result.extend(get_texts(data, path + [key]))
    except KeyError:
        pass

    return result


def _sort_similarity(chunks, embeddings, text_to_search, limit):
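    """Return the paths of the ``limit`` chunks whose embeddings are most
    similar to ``text_to_search``, using dot-product semantic search."""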
    results = []

    query_emb = EMBEDDING_CTX.encode([text_to_search])
    ret = util.semantic_search(
        query_emb, embeddings, top_k=limit, score_function=util.dot_score)

    for score in ret[0]:
        corpus_id = score['corpus_id']
        chunk = chunks[corpus_id]
        path = chunk[0]
        results.append(path)

    return results


if __name__ == '__main__':
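    # Parse the whole manual starting at the top-level index, embed every
    # chunk and run a sample query.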
    # path = 'addons/3d_view'
    data = parse_file(MANUAL_DIR, 'index.rst')
    data['toctree']["copyright"] = parse_file(MANUAL_DIR, 'copyright.rst')

    # Create a list to store the text files
    chunks = []
    chunks.extend(get_texts(data, []))
    embeddings = EMBEDDING_CTX.encode([text for path, text in chunks])

    result = _sort_similarity(chunks, embeddings, "Set Snap Base", 50)
    print(result)