Germano Cavalcante committed
Commit af4d94e
1 Parent(s): 5974bb1

Add Blender docs to search

routers/embedding/embeddings_manual_wiki.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3c4a71f60f1878e528b190c3c43f744611c90efdea4c2ef333962773fd2fd637
-size 19670346
+oid sha256:e12f37bd8b14982fa070b5db9d9c468c0bc858fc65ab136cc714bf8fcce48d69
+size 31873812
routers/tool_wiki_search.py CHANGED
@@ -6,7 +6,7 @@ import pickle
 import re
 import torch
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Tuple, Set
 from sentence_transformers import util
 from fastapi import APIRouter
 from fastapi.responses import PlainTextResponse
@@ -19,14 +19,106 @@ except:
 from utils_gitea import gitea_wiki_page_get, gitea_wiki_pages_get
 
 MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
+DOCS_DIR = "D:/BlenderDev/blender-developer-docs/docs"
 
 
 class Group(str, Enum):
+    docs = "docs"
     wiki = "wiki"
     manual = "manual"
     all = "all"
 
 
+class Split:
+    filedir = None
+    filetype = '.md'
+
+    def __init__(self, filedir=None, filetype='.md'):
+        self.filedir = filedir
+        self.filetype = filetype
+
+    def split_in_topics(self) -> List[Tuple[str, str]]:
+        for root, _dirs, files in os.walk(self.filedir):
+            for name in files:
+                if not name.endswith(self.filetype) or name == 'navigation.md':
+                    continue
+
+                full_path = os.path.join(root, name)
+                with open(full_path, 'r', encoding='utf-8') as file:
+                    content = file.read()
+
+                prefix = full_path.replace(self.filedir, '')
+                prefix = re.sub(r'(index)?.md', '', prefix)
+                prefix = prefix.replace('\\', '/')
+
+                # Protect code parts
+                parts = ['']
+                is_first = True
+                is_in_code_block = False
+                for line in content.splitlines():
+                    if not line:
+                        continue
+                    line += '\n'
+                    is_in_code_block = is_in_code_block != line.strip().startswith('```')
+                    if not is_in_code_block and line.startswith('## '):
+                        if not is_first:
+                            parts.append(line)
+                            continue
+
+                    is_first = False
+                    parts[-1] += line
+
+                title_main = ''
+                for topic in parts:
+                    topic = topic.strip()
+                    if not topic or topic.startswith('---\nhide'):
+                        continue
+
+                    try:
+                        title, body = topic.split('\n', 1)
+                    except ValueError:
+                        # ignore non content
+                        continue
+
+                    if not title_main:
+                        title_main = title
+                    else:
+                        title = title_main + ' | ' + title
+
+                    yield (prefix + '\n' + title, body)
+
+    def reduce_text(_self, text):
+        text = re.sub(r'<.*?>', '', text)        # Remove HTML tags
+        text = re.sub(r':\S*: ', '', text)       # Remove [:...:] patterns
+        text = re.sub(r'(index)?.md', '', text)  # Remove .md
+        return re.sub(r'(\s*\n\s*)+', '\n', text)
+
+    def split_for_embedding(self):
+        tokenizer = EMBEDDING_CTX.model.tokenizer
+        max_tokens = EMBEDDING_CTX.model.max_seq_length
+        texts = []
+
+        for prefix, content in self.split_in_topics():
+            prefix += '\n\n'
+            tokens_prefix_len = len(tokenizer.tokenize(prefix))
+            tokens_so_far = tokens_prefix_len
+            text_so_far = prefix
+            for part in self.reduce_text(content).splitlines():
+                part += '\n'
+                part_tokens_len = len(tokenizer.tokenize(part))
+                if tokens_so_far + part_tokens_len > max_tokens:
+                    texts.append(text_so_far)
+                    text_so_far = prefix
+                    tokens_so_far = tokens_prefix_len
+                text_so_far += part
+                tokens_so_far += part_tokens_len
+
+            if tokens_so_far != tokens_prefix_len:
+                texts.append(text_so_far)
+
+        return texts
+
+
 class _Data(dict):
     cache_path = "routers/embedding/embeddings_manual_wiki.pkl"
 
@@ -44,8 +136,12 @@ class _Data(dict):
             self[grp.name] = {}
 
             # Create a list to store the text files
-            texts = self.manual_get_texts_to_embed(
-            ) if grp == Group.manual else self.wiki_get_texts_to_embed()
+            if grp is Group.docs:
+                texts = self.docs_get_texts_to_embed()
+            elif grp is Group.wiki:
+                texts = self.wiki_get_texts_to_embed()
+            else:
+                texts = self.manual_get_texts_to_embed()
 
             self[grp]['texts'] = texts
             self[grp]['embeddings'] = EMBEDDING_CTX.encode(texts)
@@ -57,19 +153,6 @@ class _Data(dict):
 
             pickle.dump(dict(self), file, protocol=pickle.HIGHEST_PROTOCOL)
 
-    @staticmethod
-    def reduce_text(text):
-        # Remove repeated characters
-        text = re.sub(r'%{2,}', '', text)  # Title
-        text = re.sub(r'#{2,}', '', text)  # Title
-        text = re.sub(r'\*{3,}', '', text)  # Title
-        text = re.sub(r'={3,}', '', text)  # Topic
-        text = re.sub(r'\^{3,}', '', text)
-        text = re.sub(r'-{3,}', '', text)
-
-        text = re.sub(r'(\s*\n\s*)+', '\n', text)
-        return text
-
     @classmethod
     def parse_file_recursive(cls, filepath):
         with open(filepath, 'r', encoding='utf-8') as file:
@@ -105,211 +188,108 @@ class _Data(dict):
 
         return parsed_data
 
-    @staticmethod
-    def split_into_topics(text: str, prefix: str = '') -> Dict[str, List[str]]:
-        """
-        Splits a text into sections based on titles and subtitles, and organizes them into a dictionary.
-
-        Args:
-            text (str): The input text to be split. The text should contain titles marked by asterisks (***)
-                or subtitles marked by equal signs (===).
-            prefix (str): prefix to titles and subtitles
-
-        Returns:
-            Dict[str, List[str]]: A dictionary where keys are section titles or subtitles, and values are lists of
-                strings corresponding to the content under each title or subtitle.
-
-        Example:
-            text = '''
-            *********************
-            The Blender Community
-            *********************
-
-            Being freely available from the start.
-
-            Independent Sites
-            =================
-
-            There are `several independent websites.
-
-            Getting Support
-            ===============
-
-            Blender's community is one of its greatest features.
-            '''
-
-            result = split_in_topics(text)
-            # result will be:
-            # {
-            #     "# The Blender Community": [
-            #         "Being freely available from the start."
-            #     ],
-            #     "# The Blender Community | Independent Sites": [
-            #         "There are `several independent websites."
-            #     ],
-            #     "# The Blender Community | Getting Support": [
-            #         "Blender's community is one of its greatest features."
-            #     ]
-            # }
-        """
-
-        # Remove patterns ".. word::" and ":word:"
-        text = re.sub(r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', text)
-
-        # Regular expression to find titles and subtitles
-        pattern = r'([\*|#|%]{3,}\n[^\n]+\n[\*|#|%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
-
-        # Split text by found patterns
-        sections = re.split(pattern, text)
-
-        # Remove possible white spaces at the beginning and end of each section
-        sections = [section for section in sections if section.strip()]
-
-        # Separate sections into a dictionary
-        topics = {}
-        current_title = ''
-        current_topic = prefix
-
-        for section in sections:
-            if match := re.match(r'[\*|#|%]{3,}\n([^\n]+)\n[\*|#|%]{3,}', section):
-                current_topic = current_title = f'{prefix}# {match.group(1)}'
-                topics[current_topic] = []
-            elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
-                current_topic = current_title + ' | ' + match.group(1)
-                topics[current_topic] = []
-            else:
-                if current_topic == prefix:
-                    raise
-                topics[current_topic].append(section)
-
-        return topics
-
-    @classmethod
-    def split_into_many(cls, page_body, prefix=''):
-        """
-        # Function to split the text into chunks of a maximum number of tokens
-        """
-        tokenizer = EMBEDDING_CTX.model.tokenizer
-        max_tokens = EMBEDDING_CTX.model.max_seq_length
-        topics = cls.split_into_topics(page_body, prefix)
-
-        for topic, content_list in topics.items():
-            title = topic + ':\n'
-            title_tokens_len = len(tokenizer.tokenize(title))
-            content_list_new = []
-            for content in content_list:
-                content_reduced = cls.reduce_text(content)
-                content_tokens_len = len(tokenizer.tokenize(content_reduced))
-                if title_tokens_len + content_tokens_len <= max_tokens:
-                    content_list_new.append(content_reduced)
-                    continue
-
-                # Split the text into sentences
-                paragraphs = content_reduced.split('.\n')
-                sentences = ''
-                tokens_so_far = title_tokens_len
-
-                # Loop through the sentences and tokens joined together in a tuple
-                for sentence in paragraphs:
-                    sentence += '.\n'
-
-                    # Get the number of tokens for each sentence
-                    n_tokens = len(tokenizer.tokenize(sentence))
-
-                    # If the number of tokens so far plus the number of tokens in the current sentence is greater
-                    # than the max number of tokens, then add the chunk to the list of chunks and reset
-                    # the chunk and tokens so far
-                    if tokens_so_far + n_tokens > max_tokens:
-                        content_list_new.append(sentences)
-                        sentences = ''
-                        tokens_so_far = title_tokens_len
-
-                    sentences += sentence
-                    tokens_so_far += n_tokens
-
-                if sentences:
-                    content_list_new.append(sentences)
-
-            # Replace content_list
-            content_list.clear()
-            content_list.extend(content_list_new)
-
-        result = []
-        for topic, content_list in topics.items():
-            for content in content_list:
-                result.append(topic + ':\n' + content)
-
-        return result
-
-    @classmethod
-    def get_texts_recursive(cls, page, path='index'):
-        result = cls.split_into_many(page['body'], path)
-
-        try:
-            for key in page['toctree'].keys():
-                page_child = page['toctree'][key]
-                result.extend(cls.get_texts_recursive(
-                    page_child, path.replace('index', key)))
-        except KeyError:
-            pass
-
-        return result
-
     @classmethod
     def manual_get_texts_to_embed(cls):
-        manual = cls.parse_file_recursive(
-            os.path.join(MANUAL_DIR, 'index.rst'))
-        manual['toctree']["copyright"] = cls.parse_file_recursive(
-            os.path.join(MANUAL_DIR, 'copyright.rst'))
-
-        return cls.get_texts_recursive(manual)
-
-    @classmethod
-    def wiki_get_texts_to_embed(cls):
-        tokenizer = EMBEDDING_CTX.model.tokenizer
-        max_tokens = EMBEDDING_CTX.model.max_seq_length
+        class SplitManual(Split):
+            def split_in_topics(_self):
+                def get_topics_recursive(page, path='/index.html'):
+                    # Remove patterns ".. word::" and ":word:"
+                    text = re.sub(
+                        r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', page['body'])
+
+                    # Regular expression to find titles and subtitles
+                    pattern = r'([\*|#|%]{3,}\n[^\n]+\n[\*|#|%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
+
+                    # Split text by found patterns
+                    sections = re.split(pattern, text)
+
+                    # Remove possible white spaces at the beginning and end of each section
+                    sections = [
+                        section for section in sections if section.strip()]
+
+                    # Separate sections into a dictionary
+                    topics = []
+                    current_title = ''
+                    current_topic = path
+
+                    for section in sections:
+                        if match := re.match(r'[\*|#|%]{3,}\n([^\n]+)\n[\*|#|%]{3,}', section):
+                            current_topic = current_title = f'{path}\n# {match.group(1)}:'
+                        elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
+                            current_topic = f'{current_title} | {match.group(1)}'
+                        else:
+                            if current_topic == path:
+                                raise
+                            topics.append((current_topic, section))
+
+                    try:
+                        for key in page['toctree'].keys():
+                            page_child = page['toctree'][key]
+                            topics.extend(get_topics_recursive(
+                                page_child, path.replace('index', key)))
+                    except KeyError:
+                        pass
+
+                    return topics
+
+                manual = cls.parse_file_recursive(
+                    os.path.join(MANUAL_DIR, 'index.rst'))
+                manual['toctree']["copyright"] = cls.parse_file_recursive(
+                    os.path.join(MANUAL_DIR, 'copyright.rst'))
+
+                return get_topics_recursive(manual)
+
+            def reduce_text(_self, text):
+                # Remove repeated characters
+                text = re.sub(r'%{2,}', '', text)  # Title
+                text = re.sub(r'#{2,}', '', text)  # Title
+                text = re.sub(r'\*{3,}', '', text)  # Title
+                text = re.sub(r'={3,}', '', text)  # Topic
+                text = re.sub(r'\^{3,}', '', text)
+                text = re.sub(r'-{3,}', '', text)
+
+                text = re.sub(r'(\s*\n\s*)+', '\n', text)
+                return text
+
+        return SplitManual().split_for_embedding()
 
-        texts = []
-        owner = "blender"
-        repo = "blender"
-        pages = gitea_wiki_pages_get(owner, repo)
-        for page_name in pages:
-            page_name_title = page_name["title"]
-            page = gitea_wiki_page_get(owner, repo, page_name_title)
-            prefix = f'/{page["sub_url"]}\n# {page_name_title}:'
-            text = base64.b64decode(page["content_base64"]).decode('utf-8')
-            text = text.replace(
-                'https://projects.blender.org/blender/blender', '')
-            tokens_prefix_len = len(tokenizer.tokenize(prefix))
-            tokens_so_far = tokens_prefix_len
-            text_so_far = prefix
-            text_parts = text.split('\n#')
-            for part in text_parts:
-                part = '\n#' + part
-                part_tokens_len = len(tokenizer.tokenize(part))
-                if tokens_so_far + part_tokens_len > max_tokens:
-                    texts.append(text_so_far)
-                    text_so_far = prefix
-                    tokens_so_far = tokens_prefix_len
-                text_so_far += part
-                tokens_so_far += part_tokens_len
-
-            if tokens_so_far != tokens_prefix_len:
-                texts.append(text_so_far)
+    @staticmethod
+    def wiki_get_texts_to_embed():
+        class SplitWiki(Split):
+            def split_in_topics(_self):
+                owner = "blender"
+                repo = "blender"
+                pages = gitea_wiki_pages_get(owner, repo)
+                for page_name in pages:
+                    page_name_title = page_name["title"]
+                    page = gitea_wiki_page_get(owner, repo, page_name_title)
+                    prefix = f'/{owner}/{repo}/{page["sub_url"]}\n# {page_name_title}:\n'
+                    text = base64.b64decode(
+                        page["content_base64"]).decode('utf-8')
+                    yield (prefix, text)
+
+            def reduce_text(_self, text):
+                return super().reduce_text(text).replace('https://projects.blender.org', '')
+
+        return SplitWiki().split_for_embedding()
 
-        return texts
+    @staticmethod
+    def docs_get_texts_to_embed():
+        return Split(DOCS_DIR).split_for_embedding()
 
-    def _sort_similarity(self, text_to_search, group: Group = Group.all, limit=4):
+    def _sort_similarity(self, text_to_search, groups: Set[Group] = {Group.docs, Group.wiki, Group.manual}, limit=5):
         result = []
 
         query_emb = EMBEDDING_CTX.encode([text_to_search])
 
         ret = {}
 
-        for grp in list(Group)[:-1]:
-            if group in {grp, Group.all}:
-                ret[grp] = util.semantic_search(
-                    query_emb, self[grp]['embeddings'], top_k=limit, score_function=util.dot_score)
+        for grp in groups:
+            if not grp in self:
+                continue
+
+            ret[grp] = util.semantic_search(
+                query_emb, self[grp]['embeddings'], top_k=limit, score_function=util.dot_score)
 
         score_best = 0.0
         group_best = None
@@ -336,28 +316,28 @@ router = APIRouter()
 @router.get("/wiki_search", response_class=PlainTextResponse)
 def wiki_search(query: str = "", group: Group = Group.all) -> str:
     base_url = {
-        Group.wiki: "https://projects.blender.org/blender/blender",
-        Group.manual: "https://docs.blender.org/manual/en/dev"
+        "docs": "https://developer.blender.org/docs",
+        "wiki": "https://projects.blender.org",
+        "manual": "https://docs.blender.org/manual/en/dev"
     }
-    texts, group_best = G_data._sort_similarity(query, group)
+
+    if group is Group.all:
+        groups = {Group.docs, Group.wiki, Group.manual}
+    elif group is Group.wiki:
+        groups = {Group.docs, Group.wiki}
+    else:
+        groups = {group}
+
+    texts, group_best = G_data._sort_similarity(query, groups)
 
     result = f'BASE_URL: {base_url[group_best]}\n'
     for text in texts:
-        if group_best == Group.wiki:
-            result += f'''---
-{text}
-'''
-        else:
-            index = text.find('#')
-            result += f'''---
-{text[:index] + '.html'}
-{text[index:]}
-'''
+        result += f'\n---\n{text}'
     return result
 
 
 if __name__ == '__main__':
    tests = ["Set Snap Base", "Building the Manual",
-             "Bisect Object", "Who are the Triagers"]
-    result = wiki_search(tests[1], Group.all)
+             "Bisect Object", "Who are the Triagers", "4.3 Release Notes Motion Paths"]
+    result = wiki_search(tests[4], Group.wiki)
     print(result)
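
For reference, a minimal sketch of how the updated endpoint could be exercised after this change. The app wiring and import path below are assumptions for illustration (the commit itself only defines the router); the query string is taken from the tests list in the diff above.

# Hypothetical usage sketch -- not part of this commit. It assumes the router
# defined in routers/tool_wiki_search.py is mounted on a bare FastAPI app.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from routers.tool_wiki_search import Group, router

app = FastAPI()
app.include_router(router)
client = TestClient(app)

# "docs" is the group added by this commit; Group.all searches docs, wiki and manual together.
response = client.get(
    "/wiki_search",
    params={"query": "4.3 Release Notes Motion Paths", "group": Group.docs.value})
print(response.text)  # plain text beginning with "BASE_URL: ..."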