Germano Cavalcante committed
Commit 9a6a74b
Parent: ed15883

API changes
routers/tool_bpy_doc.py CHANGED
@@ -2,6 +2,7 @@
 
 import pickle
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 
 router = APIRouter()
@@ -53,10 +54,10 @@ def bpy_doc_get_documentation(api):
     return documentation
 
 
-@router.get("/bpy_doc")
+@router.get("/bpy_doc", response_class=PlainTextResponse)
 def bpy_doc(api: str = ""):
     message = bpy_doc_get_documentation(api)
-    return {"message": message}
+    return message
 
 
 if __name__ == "__main__":
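
The core API change is visible here: with response_class=PlainTextResponse in the route decorator, FastAPI sends the returned string verbatim as text/plain instead of wrapping it in a JSON object. A minimal sketch of the before/after behavior (the app and route names are illustrative, not part of the commit):

# Hypothetical standalone app illustrating the response change.
from fastapi import FastAPI
from fastapi.responses import PlainTextResponse

app = FastAPI()

@app.get("/doc_json")
def doc_json():
    # Default JSONResponse: clients receive {"message":"bpy.ops.mesh"}
    return {"message": "bpy.ops.mesh"}

@app.get("/doc_text", response_class=PlainTextResponse)
def doc_text():
    # PlainTextResponse: clients receive the bare string bpy.ops.mesh
    return "bpy.ops.mesh"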
routers/tool_find_related.py CHANGED
@@ -10,6 +10,7 @@ from datetime import datetime, timedelta
 from enum import Enum
 from sentence_transformers import util
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 try:
     from .embedding import EMBEDDING_CTX
@@ -24,9 +25,6 @@ router = APIRouter()
 issue_attr_filter = {'number', 'title', 'body',
                      'state', 'updated_at', 'created_at'}
 
-G_cache_path = "routers/embedding/embeddings_issues.pkl"
-G_data = {}
-
 
 class State(str, Enum):
     opened = "opened"
@@ -34,263 +32,265 @@ class State(str, Enum):
     all = "all"
 
 
-def _create_issue_string(title, body):
-    cleaned_body = body.replace('\r', '')
-    cleaned_body = cleaned_body.replace('**System Information**\n', '')
-    cleaned_body = cleaned_body.replace('**Blender Version**\n', '')
-    cleaned_body = cleaned_body.replace(
-        'Worked: (newest version of Blender that worked as expected)\n', '')
-    cleaned_body = cleaned_body.replace('**Short description of error**\n', '')
-    cleaned_body = cleaned_body.replace('**Addon Information**\n', '')
-    cleaned_body = cleaned_body.replace(
-        '**Exact steps for others to reproduce the error**\n', '')
-    cleaned_body = cleaned_body.replace(
-        '[Please describe the exact steps needed to reproduce the issue]\n', '')
-    cleaned_body = cleaned_body.replace(
-        '[Please fill out a short description of the error here]\n', '')
-    cleaned_body = cleaned_body.replace(
-        '[Based on the default startup or an attached .blend file (as simple as possible)]\n', '')
-    cleaned_body = re.sub(
-        r', branch: .+?, commit date: \d{4}-\d{2}-\d{2} \d{2}:\d{2}, hash: `.+?`', '', cleaned_body)
-    cleaned_body = re.sub(
-        r'\/?attachments\/[a-zA-Z0-9\-]+', 'attachment', cleaned_body)
-    cleaned_body = re.sub(
-        r'https?:\/\/[^\s/]+(?:\/[^\s/]+)*\/([^\s/]+)', lambda match: match.group(1), cleaned_body)
-
-    return title + '\n' + cleaned_body
-
-
-def _find_latest_date(issues, default_str=None):
-    # Handle the case where 'issues' is empty
-    if not issues:
-        return default_str
-
-    return max((issue['updated_at'] for issue in issues), default=default_str)
-
-
-def _create_strings_to_embbed(issues):
-    texts_to_embed = [_create_issue_string(
-        issue['title'], issue['body']) for issue in issues]
-
-    return texts_to_embed
-
-
-def _data_ensure_size(repo, size_new):
-    global G_data
-
-    ARRAY_CHUNK_SIZE = 4096
-
-    updated_at_old = None
-    arrays_size_old = 0
-    titles_old = []
-    try:
-        arrays_size_old = G_data[repo]['arrays_size']
-        if size_new <= arrays_size_old:
-            return
-    except:
-        pass
-
-    arrays_size_new = ARRAY_CHUNK_SIZE * (int(size_new / ARRAY_CHUNK_SIZE) + 1)
-
-    data_new = {
-        'updated_at': updated_at_old,
-        'arrays_size': arrays_size_new,
-        'titles': titles_old + [None] * (arrays_size_new - arrays_size_old),
-        'embeddings': torch.empty((arrays_size_new, *EMBEDDING_CTX.embedding_shape),
-                                  dtype=EMBEDDING_CTX.embedding_dtype,
-                                  device=EMBEDDING_CTX.embedding_device),
-        'opened': torch.zeros(arrays_size_new, dtype=torch.bool),
-        'closed': torch.zeros(arrays_size_new, dtype=torch.bool),
-    }
-
-    try:
-        data_new['embeddings'][:arrays_size_old] = G_data[repo]['embeddings']
-        data_new['opened'][:arrays_size_old] = G_data[repo]['opened']
-        data_new['closed'][:arrays_size_old] = G_data[repo]['closed']
-    except:
-        pass
-
-    G_data[repo] = data_new
-
-
-def _embeddings_generate(repo):
-    global G_data
-
-    if os.path.exists(G_cache_path):
-        with open(G_cache_path, 'rb') as file:
-            G_data = pickle.load(file)
-        if repo in G_data:
-            return
-
-    issues = gitea_fetch_issues('blender', repo, state='all', since=None,
-                                issue_attr_filter=issue_attr_filter)
-
-    # issues = sorted(issues, key=lambda issue: int(issue['number']))
-
-    print("Embedding Issues...")
-    texts_to_embed = _create_strings_to_embbed(issues)
-    embeddings = EMBEDDING_CTX.encode(texts_to_embed)
-
-    _data_ensure_size(repo, int(issues[0]['number']))
-    G_data[repo]['updated_at'] = _find_latest_date(issues)
-
-    titles = G_data[repo]['titles']
-    embeddings_new = G_data[repo]['embeddings']
-    opened = G_data[repo]['opened']
-    closed = G_data[repo]['closed']
-
-    for i, issue in enumerate(issues):
-        number = int(issue['number'])
-        titles[number] = issue['title']
-        embeddings_new[number] = embeddings[i]
-        if issue['state'] == 'open':
-            opened[number] = True
-        if issue['state'] == 'closed':
-            closed[number] = True
-
-
-def _embeddings_updated_get(repo):
-    global G_data
-
-    with EMBEDDING_CTX.lock:
-        try:
-            data_repo = G_data[repo]
-        except:
-            _embeddings_generate(repo)
-            data_repo = G_data[repo]
-
-        date_old = data_repo['updated_at']
-
-        issues = gitea_fetch_issues(
-            'blender', repo, since=date_old, issue_attr_filter=issue_attr_filter)
-
-        # Get the most recent date
-        date_new = _find_latest_date(issues, date_old)
-
-        if date_new == date_old:
-            # Nothing changed
-            return data_repo
-
-        data_repo['updated_at'] = date_new
-
-        # autopep8: off
-        # Consider that if the time hasn't changed, it's the same issue.
-        issues = [issue for issue in issues if issue['updated_at'] != date_old]
-
-        _data_ensure_size(repo, int(issues[0]['number']))
-
-        updated_at = gitea_issues_body_updated_at_get(issues)
-        issues_to_embed = []
-
-        for i, issue in enumerate(issues):
-            number = int(issue['number'])
-            if issue['state'] == 'open':
-                data_repo['opened'][number] = True
-            if issue['state'] == 'closed':
-                data_repo['closed'][number] = True
-
-            title_old = data_repo['titles'][number]
-            if title_old != issue['title']:
-                data_repo['titles'][number] = issue['title']
-                issues_to_embed.append(issue)
-            elif updated_at[i] >= date_old:
-                issues_to_embed.append(issue)
-
-        if issues_to_embed:
-            print(f"Embedding {len(issues_to_embed)} issue{'s' if len(issues_to_embed) > 1 else ''}")
-            texts_to_embed = _create_strings_to_embbed(issues_to_embed)
-            embeddings = EMBEDDING_CTX.encode(texts_to_embed)
-
-            for i, issue in enumerate(issues_to_embed):
-                number = int(issue['number'])
-                data_repo['embeddings'][number] = embeddings[i]
-
-    # autopep8: on
-    return data_repo
-
-
-def _sort_similarity(data: dict,
-                     query_emb: List[torch.Tensor],
-                     limit: int,
-                     state: State = State.opened) -> list:
-    duplicates = []
-    embeddings = data['embeddings']
-    mask_opened = data["opened"]
-
-    if state == State.all:
-        mask = mask_opened | data["closed"]
-    else:
-        mask = data[state.value]
-
-    embeddings = embeddings[mask]
-    true_indices = mask.nonzero(as_tuple=True)[0]
-
-    ret = util.semantic_search(
-        query_emb, embeddings, top_k=limit, score_function=util.dot_score)
-
-    for score in ret[0]:
-        corpus_id = score['corpus_id']
-        number = true_indices[corpus_id].item()
-        closed_char = "" if mask_opened[number] else "~~"
-        text = f"{closed_char}#{number}{closed_char}: {data['titles'][number]}"
-        duplicates.append(text)
-
-    return duplicates
-
-
-def find_relatedness(repo: str, number: int, limit: int = 20, state: State = State.opened):
-    data = _embeddings_updated_get(repo)
-
-    # Check if the embedding already exists.
-    if data['titles'][number] is not None:
-        new_embedding = data['embeddings'][number]
-    else:
-        gitea_issue = gitea_json_issue_get('blender', repo, number)
-        text_to_embed = _create_issue_string(
-            gitea_issue['title'], gitea_issue['body'])
-
-        new_embedding = EMBEDDING_CTX.encode([text_to_embed])
-
-    duplicates = _sort_similarity(
-        data, new_embedding, limit=limit, state=state)
-
-    if not duplicates:
-        return ''
-
-    if match := re.search(r'(~~)?#(\d+)(~~)?:', duplicates[0]):
-        number_cached = int(match.group(2))
-        if number_cached == number:
-            return '\n'.join(duplicates[1:])
-
-    return '\n'.join(duplicates)
+class _Data(dict):
+    cache_path = "routers/embedding/embeddings_issues.pkl"
+
+    @staticmethod
+    def _create_issue_string(title, body):
+        cleaned_body = body.replace('\r', '')
+        cleaned_body = cleaned_body.replace('**System Information**\n', '')
+        cleaned_body = cleaned_body.replace('**Blender Version**\n', '')
+        cleaned_body = cleaned_body.replace(
+            'Worked: (newest version of Blender that worked as expected)\n', '')
+        cleaned_body = cleaned_body.replace(
+            '**Short description of error**\n', '')
+        cleaned_body = cleaned_body.replace('**Addon Information**\n', '')
+        cleaned_body = cleaned_body.replace(
+            '**Exact steps for others to reproduce the error**\n', '')
+        cleaned_body = cleaned_body.replace(
+            '[Please describe the exact steps needed to reproduce the issue]\n', '')
+        cleaned_body = cleaned_body.replace(
+            '[Please fill out a short description of the error here]\n', '')
+        cleaned_body = cleaned_body.replace(
+            '[Based on the default startup or an attached .blend file (as simple as possible)]\n', '')
+        cleaned_body = re.sub(
+            r', branch: .+?, commit date: \d{4}-\d{2}-\d{2} \d{2}:\d{2}, hash: `.+?`', '', cleaned_body)
+        cleaned_body = re.sub(
+            r'\/?attachments\/[a-zA-Z0-9\-]+', 'attachment', cleaned_body)
+        cleaned_body = re.sub(
+            r'https?:\/\/[^\s/]+(?:\/[^\s/]+)*\/([^\s/]+)', lambda match: match.group(1), cleaned_body)
+
+        return title + '\n' + cleaned_body
+
+    @staticmethod
+    def _find_latest_date(issues, default_str=None):
+        # Handle the case where 'issues' is empty
+        if not issues:
+            return default_str
+
+        return max((issue['updated_at'] for issue in issues), default=default_str)
+
+    @classmethod
+    def _create_strings_to_embbed(cls, issues):
+        texts_to_embed = [cls._create_issue_string(
+            issue['title'], issue['body']) for issue in issues]
+
+        return texts_to_embed
+
+    def _data_ensure_size(self, repo, size_new):
+        ARRAY_CHUNK_SIZE = 4096
+
+        updated_at_old = None
+        arrays_size_old = 0
+        titles_old = []
+        try:
+            arrays_size_old = self[repo]['arrays_size']
+            if size_new <= arrays_size_old:
+                return
+        except:
+            pass
+
+        arrays_size_new = ARRAY_CHUNK_SIZE * \
+            (int(size_new / ARRAY_CHUNK_SIZE) + 1)
+
+        data_new = {
+            'updated_at': updated_at_old,
+            'arrays_size': arrays_size_new,
+            'titles': titles_old + [None] * (arrays_size_new - arrays_size_old),
+            'embeddings': torch.empty((arrays_size_new, *EMBEDDING_CTX.embedding_shape),
+                                      dtype=EMBEDDING_CTX.embedding_dtype,
+                                      device=EMBEDDING_CTX.embedding_device),
+            'opened': torch.zeros(arrays_size_new, dtype=torch.bool),
+            'closed': torch.zeros(arrays_size_new, dtype=torch.bool),
+        }
+
+        try:
+            data_new['embeddings'][:arrays_size_old] = self[repo]['embeddings']
+            data_new['opened'][:arrays_size_old] = self[repo]['opened']
+            data_new['closed'][:arrays_size_old] = self[repo]['closed']
+        except:
+            pass
+
+        self[repo] = data_new
+
+    def _embeddings_generate(self, repo):
+        if os.path.exists(self.cache_path):
+            with open(self.cache_path, 'rb') as file:
+                data = pickle.load(file)
+                self.update(data)
+            if repo in self:
+                return
+
+        issues = gitea_fetch_issues('blender', repo, state='all', since=None,
+                                    issue_attr_filter=issue_attr_filter)
+
+        # issues = sorted(issues, key=lambda issue: int(issue['number']))
+
+        print("Embedding Issues...")
+        texts_to_embed = self._create_strings_to_embbed(issues)
+        embeddings = EMBEDDING_CTX.encode(texts_to_embed)
+
+        self._data_ensure_size(repo, int(issues[0]['number']))
+        self[repo]['updated_at'] = self._find_latest_date(issues)
+
+        titles = self[repo]['titles']
+        embeddings_new = self[repo]['embeddings']
+        opened = self[repo]['opened']
+        closed = self[repo]['closed']
+
+        for i, issue in enumerate(issues):
+            number = int(issue['number'])
+            titles[number] = issue['title']
+            embeddings_new[number] = embeddings[i]
+            if issue['state'] == 'open':
+                opened[number] = True
+            if issue['state'] == 'closed':
+                closed[number] = True
+
+    def _embeddings_updated_get(self, repo):
+        with EMBEDDING_CTX.lock:
+            try:
+                data_repo = self[repo]
+            except:
+                self._embeddings_generate(repo)
+                data_repo = self[repo]
+
+            date_old = data_repo['updated_at']
+
+            issues = gitea_fetch_issues(
+                'blender', repo, since=date_old, issue_attr_filter=issue_attr_filter)
+
+            # Get the most recent date
+            date_new = self._find_latest_date(issues, date_old)
+
+            if date_new == date_old:
+                # Nothing changed
+                return data_repo
+
+            data_repo['updated_at'] = date_new
+
+            # autopep8: off
+            # Consider that if the time hasn't changed, it's the same issue.
+            issues = [issue for issue in issues if issue['updated_at'] != date_old]
+
+            self._data_ensure_size(repo, int(issues[0]['number']))
+
+            updated_at = gitea_issues_body_updated_at_get(issues)
+            issues_to_embed = []
+
+            for i, issue in enumerate(issues):
+                number = int(issue['number'])
+                if issue['state'] == 'open':
+                    data_repo['opened'][number] = True
+                if issue['state'] == 'closed':
+                    data_repo['closed'][number] = True
+
+                title_old = data_repo['titles'][number]
+                if title_old != issue['title']:
+                    data_repo['titles'][number] = issue['title']
+                    issues_to_embed.append(issue)
+                elif updated_at[i] >= date_old:
+                    issues_to_embed.append(issue)
+
+            if issues_to_embed:
+                print(f"Embedding {len(issues_to_embed)} issue{'s' if len(issues_to_embed) > 1 else ''}")
+                texts_to_embed = self._create_strings_to_embbed(issues_to_embed)
+                embeddings = EMBEDDING_CTX.encode(texts_to_embed)
+
+                for i, issue in enumerate(issues_to_embed):
+                    number = int(issue['number'])
+                    data_repo['embeddings'][number] = embeddings[i]
+
+        # autopep8: on
+        return data_repo
+
+    def _sort_similarity(self,
+                         repo: str,
+                         query_emb: List[torch.Tensor],
+                         limit: int,
+                         state: State = State.opened) -> list:
+        duplicates = []
+
+        data = self[repo]
+        embeddings = data['embeddings']
+        mask_opened = data["opened"]
+
+        if state == State.all:
+            mask = mask_opened | data["closed"]
+        else:
+            mask = data[state.value]
+
+        embeddings = embeddings[mask]
+        true_indices = mask.nonzero(as_tuple=True)[0]
+
+        ret = util.semantic_search(
+            query_emb, embeddings, top_k=limit, score_function=util.dot_score)
+
+        for score in ret[0]:
+            corpus_id = score['corpus_id']
+            number = true_indices[corpus_id].item()
+            closed_char = "" if mask_opened[number] else "~~"
+            text = f"{closed_char}#{number}{closed_char}: {data['titles'][number]}"
+            duplicates.append(text)
+
+        return duplicates
+
+    def find_relatedness(self, repo: str, number: int, limit: int = 20, state: State = State.opened):
+        data = self._embeddings_updated_get(repo)
+
+        # Check if the embedding already exists.
+        if data['titles'][number] is not None:
+            new_embedding = data['embeddings'][number]
+        else:
+            gitea_issue = gitea_json_issue_get('blender', repo, number)
+            text_to_embed = self._create_issue_string(
+                gitea_issue['title'], gitea_issue['body'])
+
+            new_embedding = EMBEDDING_CTX.encode([text_to_embed])
+
+        duplicates = self._sort_similarity(
+            repo, new_embedding, limit=limit, state=state)
+
+        if not duplicates:
+            return ''
+
+        if match := re.search(r'(~~)?#(\d+)(~~)?:', duplicates[0]):
+            number_cached = int(match.group(2))
+            if number_cached == number:
+                return '\n'.join(duplicates[1:])
+
+        return '\n'.join(duplicates)
+
+
+G_data = _Data()
 
 
-@router.get("/find_related/{repo}/{number}")
+@router.get("/find_related/{repo}/{number}", response_class=PlainTextResponse)
 def find_related(repo: str = 'blender', number: int = 104399, limit: int = 15, state: State = State.opened) -> str:
-    related = find_relatedness(repo, number, limit=limit, state=state)
+    related = G_data.find_relatedness(repo, number, limit=limit, state=state)
     return related
 
 
 if __name__ == "__main__":
     update_cache = True
     if update_cache:
-        _embeddings_updated_get('blender')
-        _embeddings_updated_get('blender-addons')
-        with open(G_cache_path, "wb") as file:
+        G_data._embeddings_updated_get('blender')
+        G_data._embeddings_updated_get('blender-addons')
+        with open(G_data.cache_path, "wb") as file:
            # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
            for val in G_data.values():
                val['embeddings'] = val['embeddings'].to(torch.device('cpu'))
 
-            pickle.dump(G_data, file, protocol=pickle.HIGHEST_PROTOCOL)
+            pickle.dump(dict(G_data), file, protocol=pickle.HIGHEST_PROTOCOL)
 
     # Converting the embeddings to be GPU.
     for val in G_data.values():
         val['embeddings'] = val['embeddings'].to(torch.device('cuda'))
 
     # 'blender/blender/111434' must print #96153, #83604 and #79762
-    related1 = find_relatedness(
+    related1 = G_data.find_relatedness(
         'blender', 111434, limit=20, state=State.all)
-    related2 = find_relatedness('blender-addons', 104399, limit=20)
+    related2 = G_data.find_relatedness('blender-addons', 104399, limit=20)
 
     print("These are the 20 most related issues:")
     print(related1)
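
Besides the PlainTextResponse switch, this file folds the module-level G_cache_path/G_data globals and free functions into a _Data dict subclass that owns its cache path and methods. Note pickle.dump(dict(G_data), ...): downcasting to a plain dict keeps the _Data class out of the pickle, so the cache file can presumably be loaded even where _Data is not importable. A small sketch of the pattern (class and file names here are illustrative, not from the commit):

# Hypothetical cut-down version of the dict-subclass cache pattern.
import pickle

class _Cache(dict):
    cache_path = "cache.pkl"  # class attribute shared by all instances

    def save(self):
        with open(self.cache_path, "wb") as file:
            # dict(self) stores a plain dict, so unpickling does not
            # require the _Cache class to be importable.
            pickle.dump(dict(self), file, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self):
        with open(self.cache_path, "rb") as file:
            self.update(pickle.load(file))  # merge the plain dict back in

cache = _Cache()
cache["blender"] = {"titles": ["Example issue title"]}
cache.save()
cache.load()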
routers/tool_gpu_checker.py CHANGED
@@ -2,6 +2,7 @@
 
 import re
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 
 router = APIRouter()
@@ -45,6 +46,7 @@ def _check_amd(graphics_card_info):
         r"(Radeon\s*)?RX\s*560\b": "it has Baffin XT chip that belongs to GCN 4th gen architecture",
         r"(Radeon\s*)?5(40X|50X)\b": "it has Polaris 23 XT chip that belongs to GCN 4th gen architecture",
         r"(Radeon\s*)?RX\s*5(40|50)\b": "it has Lexa Pro chip that belongs to GCN 4th gen architecture",
+        r"(Radeon\s*)?RX\s*480\b": "it has Arctic Islands chip that belongs to GCN 4th gen architecture",
         r"(Radeon\s*)?(\(TM\)\s*)?RX\s*4[6-8]0(\b|D)": "it has Ellesmere chip that belongs to GCN 4st gen architecture",
         r"(Radeon\s*)?5(30X|35)\b": "it has Polaris 24 XT chip that belongs to GCN 3rd gen architecture",
         r"(Radeon\s*)?530\b": "it has Weston chip that belongs to GCN 3rd gen architecture",
@@ -192,10 +194,10 @@ If that doesn't help, you can use Blender 2.79: https://www.blender.org/download
 
 
 @router.get("/gpu_checker")
-def gpu_checker(gpu_info: str = ""):
+def gpu_checker(gpu_info: str = "", response_class=PlainTextResponse):
     message = gpu_checker_get_message(gpu_info)
 
-    return {"message": message}
+    return message
 
 
 if __name__ == "__main__":
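
One apparent slip in this file: unlike the other routers, response_class=PlainTextResponse lands in the function signature rather than in the @router.get decorator, where FastAPI would treat it as an ordinary parameter default with no effect on the response type, so the endpoint would still return JSON-encoded output. A sketch of the decorator form used elsewhere in this commit (the helper is stubbed for illustration):

from fastapi import APIRouter
from fastapi.responses import PlainTextResponse

router = APIRouter()

def gpu_checker_get_message(gpu_info: str) -> str:
    # Stand-in for the real helper defined in tool_gpu_checker.py.
    return f"GPU info received: {gpu_info}"

@router.get("/gpu_checker", response_class=PlainTextResponse)
def gpu_checker(gpu_info: str = ""):
    message = gpu_checker_get_message(gpu_info)
    return message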
routers/tool_wiki_search.py CHANGED
@@ -3,9 +3,11 @@
 import os
 import pickle
 import re
+import torch
 from typing import Dict, List
 from sentence_transformers import util
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 try:
     from .embedding import EMBEDDING_CTX
@@ -16,267 +18,272 @@ router = APIRouter()
 
 MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
 BASE_URL = "https://docs.blender.org/manual/en/dev"
-G_cache_path = "routers/embedding/embeddings_manual.pkl"
 G_data = None
 
 
-def _embeddings_generate():
-    global G_data
-
-    if os.path.exists(G_cache_path):
-        with open(G_cache_path, 'rb') as file:
-            G_data = pickle.load(file)
-        return G_data
-
-    # path = 'addons/3d_view'
-    G_data = parse_file_recursive(MANUAL_DIR, 'index.rst')
-    G_data['toctree']["copyright"] = parse_file_recursive(
-        MANUAL_DIR, 'copyright.rst')
-
-    # Create a list to store the text files
-    texts = get_texts_recursive(data)
-
-    print("Embedding Texts...")
-    G_data['texts'] = texts
-    G_data['embeddings'] = EMBEDDING_CTX.encode(texts)
-
-    with open(self.cache_path, "wb") as file:
-        # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
-        G_data['embeddings'] = G_data['embeddings'].to(
-            torch.device('cpu'))
-
-        pickle.dump(G_data, file, protocol=pickle.HIGHEST_PROTOCOL)
-
-    return G_data
-
-
-def reduce_text(text):
-    # Remove repeated characters
-    text = re.sub(r'%{2,}', '', text)  # Title
-    text = re.sub(r'#{2,}', '', text)  # Title
-    text = re.sub(r'\*{3,}', '', text)  # Title
-    text = re.sub(r'={3,}', '', text)  # Topic
-    text = re.sub(r'\^{3,}', '', text)
-    text = re.sub(r'-{3,}', '', text)
-
-    text = re.sub(r'(\s*\n\s*)+', '\n', text)
-    return text
-
-
-def parse_file_recursive(filedir, filename):
-    with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
-        content = file.read()
-
-    parsed_data = {}
-
-    if not filename.endswith('index.rst'):
-        body = content.strip()
-    else:
-        parts = content.split(".. toctree::")
-        body = parts[0].strip()
-
-        if len(parts) > 1:
-            parsed_data["toctree"] = {}
-            for part in parts[1:]:
-                toctree_entries = part.split('\n')
-                line = toctree_entries[0]
-                for entry in toctree_entries[1:]:
-                    entry = entry.strip()
-                    if not entry:
-                        continue
-
-                    if entry.startswith('/'):
-                        # relative path.
-                        continue
-
-                    if not entry.endswith('.rst'):
-                        continue
-
-                    if entry.endswith('/index.rst'):
-                        entry_name = entry[:-10]
-                        filedir_ = os.path.join(filedir, entry_name)
-                        filename_ = 'index.rst'
-                    else:
-                        entry_name = entry[:-4]
-                        filedir_ = filedir
-                        filename_ = entry
-
-                    parsed_data['toctree'][entry_name] = parse_file_recursive(
-                        filedir_, filename_)
-
-    # The '\n' at the end of the file resolves regex patterns
-    parsed_data['body'] = body + '\n'
-
-    return parsed_data
-
-
-def split_into_topics(text: str, prefix: str = '') -> Dict[str, List[str]]:
-    """
-    Splits a text into sections based on titles and subtitles, and organizes them into a dictionary.
-
-    Args:
-        text (str): The input text to be split. The text should contain titles marked by asterisks (***)
-                    or subtitles marked by equal signs (===).
-        prefix (str): prefix to titles and subtitles
-
-    Returns:
-        Dict[str, List[str]]: A dictionary where keys are section titles or subtitles, and values are lists of
-                              strings corresponding to the content under each title or subtitle.
-
-    Example:
-        text = '''
-    *********************
-    The Blender Community
-    *********************
-
-    Being freely available from the start.
-
-    Independent Sites
-    =================
-
-    There are `several independent websites.
-
-    Getting Support
-    ===============
-
-    Blender's community is one of its greatest features.
-    '''
-
-        result = split_in_topics(text)
-        # result will be:
-        # {
-        #     "# The Blender Community": [
-        #         "Being freely available from the start."
-        #     ],
-        #     "# The Blender Community | Independent Sites": [
-        #         "There are `several independent websites."
-        #     ],
-        #     "# The Blender Community | Getting Support": [
-        #         "Blender's community is one of its greatest features."
-        #     ]
-        # }
-    """
-
-    # Remove patterns ".. word::" and ":word:"
-    text = re.sub(r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', text)
-
-    # Regular expression to find titles and subtitles
-    pattern = r'([\*|#|%]{3,}\n[^\n]+\n[\*|#|%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
-
-    # Split text by found patterns
-    sections = re.split(pattern, text)
-
-    # Remove possible white spaces at the beginning and end of each section
-    sections = [section for section in sections if section.strip()]
-
-    # Separate sections into a dictionary
-    topics = {}
-    current_title = ''
-    current_topic = prefix
-
-    for section in sections:
-        if match := re.match(r'[\*|#|%]{3,}\n([^\n]+)\n[\*|#|%]{3,}', section):
-            current_topic = current_title = f'{prefix}# {match.group(1)}'
-            topics[current_topic] = []
-        elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
-            current_topic = current_title + ' | ' + match.group(1)
-            topics[current_topic] = []
-        else:
-            if current_topic == prefix:
-                raise
-            topics[current_topic].append(section)
-
-    return topics
-
-
-# Function to split the text into chunks of a maximum number of tokens
-def split_into_many(page_body, prefix=''):
-    tokenizer = EMBEDDING_CTX.model.tokenizer
-    max_tokens = EMBEDDING_CTX.model.max_seq_length
-    topics = split_into_topics(page_body, prefix)
-
-    for topic, content_list in topics.items():
-        title = topic + ':\n'
-        title_tokens_len = len(tokenizer.tokenize(title))
-        content_list_new = []
-        for content in content_list:
-            content_reduced = reduce_text(content)
-            content_tokens_len = len(tokenizer.tokenize(content_reduced))
-            if title_tokens_len + content_tokens_len <= max_tokens:
-                content_list_new.append(content_reduced)
-                continue
-
-            # Split the text into sentences
-            paragraphs = content_reduced.split('.\n')
-            sentences = ''
-            tokens_so_far = title_tokens_len
-
-            # Loop through the sentences and tokens joined together in a tuple
-            for sentence in paragraphs:
-                sentence += '.\n'
-
-                # Get the number of tokens for each sentence
-                n_tokens = len(tokenizer.tokenize(sentence))
-
-                # If the number of tokens so far plus the number of tokens in the current sentence is greater
-                # than the max number of tokens, then add the chunk to the list of chunks and reset
-                # the chunk and tokens so far
-                if tokens_so_far + n_tokens > max_tokens:
-                    content_list_new.append(sentences)
-                    sentences = ''
-                    tokens_so_far = title_tokens_len
-
-                sentences += sentence
-                tokens_so_far += n_tokens
-
-            if sentences:
-                content_list_new.append(sentences)
-
-        # Replace content_list
-        content_list.clear()
-        content_list.extend(content_list_new)
-
-    result = []
-    for topic, content_list in topics.items():
-        for content in content_list:
-            result.append(topic + ':\n' + content)
-
-    return result
-
-
-def get_texts_recursive(page, path=''):
-    result = split_into_many(page['body'], path)
-
-    try:
-        for key in page['toctree'].keys():
-            page_child = page['toctree'][key]
-            result.extend(get_texts_recursive(page_child, f'{path}/{key}'))
-    except KeyError:
-        pass
-
-    return result
-
-
-def _sort_similarity(data, text_to_search, limit):
-    results = []
-
-    query_emb = EMBEDDING_CTX.encode([text_to_search])
-    ret = util.semantic_search(
-        query_emb, data['embeddings'], top_k=limit, score_function=util.dot_score)
-
-    texts = data['texts']
-    for score in ret[0]:
-        corpus_id = score['corpus_id']
-        text = texts[corpus_id]
-        results.append(text)
-
-    return results
-
-
-@router.get("/wiki_search")
+class _Data(dict):
+    cache_path = "routers/embedding/embeddings_manual.pkl"
+
+    @staticmethod
+    def reduce_text(text):
+        # Remove repeated characters
+        text = re.sub(r'%{2,}', '', text)  # Title
+        text = re.sub(r'#{2,}', '', text)  # Title
+        text = re.sub(r'\*{3,}', '', text)  # Title
+        text = re.sub(r'={3,}', '', text)  # Topic
+        text = re.sub(r'\^{3,}', '', text)
+        text = re.sub(r'-{3,}', '', text)
+
+        text = re.sub(r'(\s*\n\s*)+', '\n', text)
+        return text
+
+    @classmethod
+    def parse_file_recursive(cls, filedir, filename):
+        with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
+            content = file.read()
+
+        parsed_data = {}
+
+        if not filename.endswith('index.rst'):
+            body = content.strip()
+        else:
+            parts = content.split(".. toctree::")
+            body = parts[0].strip()
+
+            if len(parts) > 1:
+                parsed_data["toctree"] = {}
+                for part in parts[1:]:
+                    toctree_entries = part.split('\n')
+                    line = toctree_entries[0]
+                    for entry in toctree_entries[1:]:
+                        entry = entry.strip()
+                        if not entry:
+                            continue
+
+                        if entry.startswith('/'):
+                            # relative path.
+                            continue
+
+                        if not entry.endswith('.rst'):
+                            continue
+
+                        if entry.endswith('/index.rst'):
+                            entry_name = entry[:-10]
+                            filedir_ = os.path.join(filedir, entry_name)
+                            filename_ = 'index.rst'
+                        else:
+                            entry_name = entry[:-4]
+                            filedir_ = filedir
+                            filename_ = entry
+
+                        parsed_data['toctree'][entry_name] = cls.parse_file_recursive(
+                            filedir_, filename_)
+
+        # The '\n' at the end of the file resolves regex patterns
+        parsed_data['body'] = body + '\n'
+
+        return parsed_data
+
+    @staticmethod
+    def split_into_topics(text: str, prefix: str = '') -> Dict[str, List[str]]:
+        """
+        Splits a text into sections based on titles and subtitles, and organizes them into a dictionary.
+
+        Args:
+            text (str): The input text to be split. The text should contain titles marked by asterisks (***)
+                        or subtitles marked by equal signs (===).
+            prefix (str): prefix to titles and subtitles
+
+        Returns:
+            Dict[str, List[str]]: A dictionary where keys are section titles or subtitles, and values are lists of
+                                  strings corresponding to the content under each title or subtitle.
+
+        Example:
+            text = '''
+        *********************
+        The Blender Community
+        *********************
+
+        Being freely available from the start.
+
+        Independent Sites
+        =================
+
+        There are `several independent websites.
+
+        Getting Support
+        ===============
+
+        Blender's community is one of its greatest features.
+        '''
+
+            result = split_in_topics(text)
+            # result will be:
+            # {
+            #     "# The Blender Community": [
+            #         "Being freely available from the start."
+            #     ],
+            #     "# The Blender Community | Independent Sites": [
+            #         "There are `several independent websites."
+            #     ],
+            #     "# The Blender Community | Getting Support": [
+            #         "Blender's community is one of its greatest features."
+            #     ]
+            # }
+        """
+
+        # Remove patterns ".. word::" and ":word:"
+        text = re.sub(r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', text)
+
+        # Regular expression to find titles and subtitles
+        pattern = r'([\*|#|%]{3,}\n[^\n]+\n[\*|#|%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
+
+        # Split text by found patterns
+        sections = re.split(pattern, text)
+
+        # Remove possible white spaces at the beginning and end of each section
+        sections = [section for section in sections if section.strip()]
+
+        # Separate sections into a dictionary
+        topics = {}
+        current_title = ''
+        current_topic = prefix
+
+        for section in sections:
+            if match := re.match(r'[\*|#|%]{3,}\n([^\n]+)\n[\*|#|%]{3,}', section):
+                current_topic = current_title = f'{prefix}# {match.group(1)}'
+                topics[current_topic] = []
+            elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
+                current_topic = current_title + ' | ' + match.group(1)
+                topics[current_topic] = []
+            else:
+                if current_topic == prefix:
+                    raise
+                topics[current_topic].append(section)
+
+        return topics
+
+    @classmethod
+    def split_into_many(cls, page_body, prefix=''):
+        """
+        # Function to split the text into chunks of a maximum number of tokens
+        """
+        tokenizer = EMBEDDING_CTX.model.tokenizer
+        max_tokens = EMBEDDING_CTX.model.max_seq_length
+        topics = cls.split_into_topics(page_body, prefix)
+
+        for topic, content_list in topics.items():
+            title = topic + ':\n'
+            title_tokens_len = len(tokenizer.tokenize(title))
+            content_list_new = []
+            for content in content_list:
+                content_reduced = cls.reduce_text(content)
+                content_tokens_len = len(tokenizer.tokenize(content_reduced))
+                if title_tokens_len + content_tokens_len <= max_tokens:
+                    content_list_new.append(content_reduced)
+                    continue
+
+                # Split the text into sentences
+                paragraphs = content_reduced.split('.\n')
+                sentences = ''
+                tokens_so_far = title_tokens_len
+
+                # Loop through the sentences and tokens joined together in a tuple
+                for sentence in paragraphs:
+                    sentence += '.\n'
+
+                    # Get the number of tokens for each sentence
+                    n_tokens = len(tokenizer.tokenize(sentence))
+
+                    # If the number of tokens so far plus the number of tokens in the current sentence is greater
+                    # than the max number of tokens, then add the chunk to the list of chunks and reset
+                    # the chunk and tokens so far
+                    if tokens_so_far + n_tokens > max_tokens:
+                        content_list_new.append(sentences)
+                        sentences = ''
+                        tokens_so_far = title_tokens_len
+
+                    sentences += sentence
+                    tokens_so_far += n_tokens
+
+                if sentences:
+                    content_list_new.append(sentences)
+
+            # Replace content_list
+            content_list.clear()
+            content_list.extend(content_list_new)
+
+        result = []
+        for topic, content_list in topics.items():
+            for content in content_list:
+                result.append(topic + ':\n' + content)
+
+        return result
+
+    @classmethod
+    def get_texts_recursive(cls, page, path=''):
+        result = cls.split_into_many(page['body'], path)
+
+        try:
+            for key in page['toctree'].keys():
+                page_child = page['toctree'][key]
+                result.extend(cls.get_texts_recursive(
+                    page_child, f'{path}/{key}'))
+        except KeyError:
+            pass
+
+        return result
+
+    def _embeddings_generate(self):
+        if os.path.exists(self.cache_path):
+            with open(self.cache_path, 'rb') as file:
+                data = pickle.load(file)
+                self.update(data)
+            return self
+
+        # Generate
+
+        manual = self.parse_file_recursive(MANUAL_DIR, 'index.rst')
+        manual['toctree']["copyright"] = self.parse_file_recursive(
+            MANUAL_DIR, 'copyright.rst')
+
+        # Create a list to store the text files
+        texts = self.get_texts_recursive(manual)
+
+        print("Embedding Texts...")
+        self['texts'] = texts
+        self['embeddings'] = EMBEDDING_CTX.encode(texts)
+
+        with open(self.cache_path, "wb") as file:
+            # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
+            self['embeddings'] = self['embeddings'].to(torch.device('cpu'))
+            pickle.dump(dict(self), file, protocol=pickle.HIGHEST_PROTOCOL)
+
+        return G_data
+
+    def _sort_similarity(self, text_to_search, limit):
+        results = []
+
+        query_emb = EMBEDDING_CTX.encode([text_to_search])
+        ret = util.semantic_search(
+            query_emb, self['embeddings'], top_k=limit, score_function=util.dot_score)
+
+        texts = self['texts']
+        for score in ret[0]:
+            corpus_id = score['corpus_id']
+            text = texts[corpus_id]
+            results.append(text)
+
+        return results
+
+
+G_data = _Data()
 
 
+@router.get("/wiki_search", response_class=PlainTextResponse)
 def wiki_search(query: str = "") -> str:
-    data = _embeddings_generate()
-    texts = _sort_similarity(data, query, 5)
+    data = G_data._embeddings_generate()
+    texts = G_data._sort_similarity(query, 5)
 
     result = f'BASE_URL: {BASE_URL}\n'
     for text in texts:
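
Here too the helpers move into a _Data dict subclass and the endpoint switches to PlainTextResponse. One apparent leftover: _embeddings_generate ends with return G_data rather than return self, which only works because the single instance is bound to that module-level name. For reference, the util.semantic_search call in _sort_similarity ranks corpus embeddings against the query embedding; a self-contained sketch of that call pattern (the model name and corpus are illustrative, not from this repository):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")
corpus = ["Sculpt mode brush settings", "Rendering with Cycles on the GPU"]
corpus_emb = model.encode(corpus, convert_to_tensor=True)
query_emb = model.encode(["GPU rendering in Cycles"], convert_to_tensor=True)

# top_k bounds the number of hits; dot_score matches the routers' scoring.
hits = util.semantic_search(query_emb, corpus_emb,
                            top_k=2, score_function=util.dot_score)
for hit in hits[0]:
    print(corpus[hit["corpus_id"]], hit["score"])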