Germano Cavalcante committed
Commit: 9a6a74b
Parent(s): ed15883
API changes
Files changed:
- routers/tool_bpy_doc.py (+3 -2)
- routers/tool_find_related.py (+202 -202)
- routers/tool_gpu_checker.py (+4 -2)
- routers/tool_wiki_search.py (+238 -231)
routers/tool_bpy_doc.py
CHANGED
@@ -2,6 +2,7 @@
 
 import pickle
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 
 router = APIRouter()
@@ -53,10 +54,10 @@ def bpy_doc_get_documentation(api):
     return documentation
 
 
-@router.get("/bpy_doc")
+@router.get("/bpy_doc", response_class=PlainTextResponse)
 def bpy_doc(api: str = ""):
     message = bpy_doc_get_documentation(api)
-    return
+    return message
 
 
 if __name__ == "__main__":
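Two user-visible fixes here: the handler previously ended with a bare return (so every request got a JSON null body), and without a response class FastAPI would serialize the returned string as JSON. A minimal sketch of exercising the fixed endpoint — the app wiring below is hypothetical, not part of the commit:

# Sketch only: mounts the router on a throwaway app and checks that
# the endpoint now answers with raw text rather than a JSON string.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from routers.tool_bpy_doc import router

app = FastAPI()
app.include_router(router)
client = TestClient(app)

response = client.get("/bpy_doc", params={"api": "bpy.context"})
assert response.headers["content-type"].startswith("text/plain")
print(response.text)  # the documentation string, unquoted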
routers/tool_find_related.py
CHANGED
@@ -10,6 +10,7 @@ from datetime import datetime, timedelta
 from enum import Enum
 from sentence_transformers import util
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 try:
     from .embedding import EMBEDDING_CTX
@@ -24,9 +25,6 @@ router = APIRouter()
 issue_attr_filter = {'number', 'title', 'body',
                      'state', 'updated_at', 'created_at'}
 
-G_cache_path = "routers/embedding/embeddings_issues.pkl"
-G_data = {}
-
 
 class State(str, Enum):
     opened = "opened"
@@ -34,263 +32,265 @@ class State(str, Enum):
     all = "all"
 
 
-            pass
-
-    arrays_size_new = ARRAY_CHUNK_SIZE * (int(size_new / ARRAY_CHUNK_SIZE) + 1)
-
-    data_new = {
-        'updated_at': updated_at_old,
-        'arrays_size': arrays_size_new,
-        'titles': titles_old + [None] * (arrays_size_new - arrays_size_old),
-        'embeddings': torch.empty((arrays_size_new, *EMBEDDING_CTX.embedding_shape),
-                                  dtype=EMBEDDING_CTX.embedding_dtype,
-                                  device=EMBEDDING_CTX.embedding_device),
-        'opened': torch.zeros(arrays_size_new, dtype=torch.bool),
-        'closed': torch.zeros(arrays_size_new, dtype=torch.bool),
-    }
-
-    try:
-        data_new['embeddings'][:arrays_size_old] = G_data[repo]['embeddings']
-        data_new['opened'][:arrays_size_old] = G_data[repo]['opened']
-        data_new['closed'][:arrays_size_old] = G_data[repo]['closed']
-    except:
-        pass
-
-    G_data[repo] = data_new
-
-
-def _embeddings_generate(repo):
-    global G_data
-
-    if os.path.exists(G_cache_path):
-        with open(G_cache_path, 'rb') as file:
-            G_data = pickle.load(file)
-        if repo in G_data:
-            return
-
-        embeddings_new = G_data[repo]['embeddings']
-        opened = G_data[repo]['opened']
-        closed = G_data[repo]['closed']
-
-            embeddings_new[number] = embeddings[i]
-            if issue['state'] == 'open':
-                opened[number] = True
-            if issue['state'] == 'closed':
-                closed[number] = True
-
-            'blender', repo, since=date_old, issue_attr_filter=issue_attr_filter)
-
-        return data_repo
-
-    # Consider that if the time hasn't changed, it's the same issue.
-    issues = [issue for issue in issues if issue['updated_at'] != date_old]
-
-    issues_to_embed = []
-
-        if issue['state'] == 'open':
-            data_repo['opened'][number] = True
-        if issue['state'] == 'closed':
-            data_repo['closed'][number] = True
-
-    duplicates = []
-    embeddings = data['embeddings']
-    mask_opened = data["opened"]
-
-    else:
-        mask = data[state.value]
-
-        corpus_id = score['corpus_id']
-        number = true_indices[corpus_id].item()
-        closed_char = "" if mask_opened[number] else "~~"
-        text = f"{closed_char}#{number}{closed_char}: {data['titles'][number]}"
-        duplicates.append(text)
-
-    data = _embeddings_updated_get(repo)
-
-        new_embedding = data['embeddings'][number]
-    else:
-        gitea_issue = gitea_json_issue_get('blender', repo, number)
-        text_to_embed = _create_issue_string(
-            gitea_issue['title'], gitea_issue['body'])
-
-        return ''
-
-    if match := re.search(r'(~~)?#(\d+)(~~)?:', duplicates[0]):
-        number_cached = int(match.group(2))
-        if number_cached == number:
-            return '\n'.join(duplicates[1:])
+class _Data(dict):
+    cache_path = "routers/embedding/embeddings_issues.pkl"
+
+    @staticmethod
+    def _create_issue_string(title, body):
+        cleaned_body = body.replace('\r', '')
+        cleaned_body = cleaned_body.replace('**System Information**\n', '')
+        cleaned_body = cleaned_body.replace('**Blender Version**\n', '')
+        cleaned_body = cleaned_body.replace(
+            'Worked: (newest version of Blender that worked as expected)\n', '')
+        cleaned_body = cleaned_body.replace(
+            '**Short description of error**\n', '')
+        cleaned_body = cleaned_body.replace('**Addon Information**\n', '')
+        cleaned_body = cleaned_body.replace(
+            '**Exact steps for others to reproduce the error**\n', '')
+        cleaned_body = cleaned_body.replace(
+            '[Please describe the exact steps needed to reproduce the issue]\n', '')
+        cleaned_body = cleaned_body.replace(
+            '[Please fill out a short description of the error here]\n', '')
+        cleaned_body = cleaned_body.replace(
+            '[Based on the default startup or an attached .blend file (as simple as possible)]\n', '')
+        cleaned_body = re.sub(
+            r', branch: .+?, commit date: \d{4}-\d{2}-\d{2} \d{2}:\d{2}, hash: `.+?`', '', cleaned_body)
+        cleaned_body = re.sub(
+            r'\/?attachments\/[a-zA-Z0-9\-]+', 'attachment', cleaned_body)
+        cleaned_body = re.sub(
+            r'https?:\/\/[^\s/]+(?:\/[^\s/]+)*\/([^\s/]+)', lambda match: match.group(1), cleaned_body)
+
+        return title + '\n' + cleaned_body
+
+    @staticmethod
+    def _find_latest_date(issues, default_str=None):
+        # Handle the case where 'issues' is empty
+        if not issues:
+            return default_str
+
+        return max((issue['updated_at'] for issue in issues), default=default_str)
+
+    @classmethod
+    def _create_strings_to_embbed(cls, issues):
+        texts_to_embed = [cls._create_issue_string(
+            issue['title'], issue['body']) for issue in issues]
+
+        return texts_to_embed
+
+    def _data_ensure_size(self, repo, size_new):
+        ARRAY_CHUNK_SIZE = 4096
+
+        updated_at_old = None
+        arrays_size_old = 0
+        titles_old = []
+        try:
+            arrays_size_old = self[repo]['arrays_size']
+            if size_new <= arrays_size_old:
+                return
+        except:
+            pass
+
+        arrays_size_new = ARRAY_CHUNK_SIZE * \
+            (int(size_new / ARRAY_CHUNK_SIZE) + 1)
+
+        data_new = {
+            'updated_at': updated_at_old,
+            'arrays_size': arrays_size_new,
+            'titles': titles_old + [None] * (arrays_size_new - arrays_size_old),
+            'embeddings': torch.empty((arrays_size_new, *EMBEDDING_CTX.embedding_shape),
+                                      dtype=EMBEDDING_CTX.embedding_dtype,
+                                      device=EMBEDDING_CTX.embedding_device),
+            'opened': torch.zeros(arrays_size_new, dtype=torch.bool),
+            'closed': torch.zeros(arrays_size_new, dtype=torch.bool),
+        }
+
+        try:
+            data_new['embeddings'][:arrays_size_old] = self[repo]['embeddings']
+            data_new['opened'][:arrays_size_old] = self[repo]['opened']
+            data_new['closed'][:arrays_size_old] = self[repo]['closed']
+        except:
+            pass
+
+        self[repo] = data_new
+
+    def _embeddings_generate(self, repo):
+        if os.path.exists(self.cache_path):
+            with open(self.cache_path, 'rb') as file:
+                data = pickle.load(file)
+                self.update(data)
+            if repo in self:
+                return
+
+        issues = gitea_fetch_issues('blender', repo, state='all', since=None,
+                                    issue_attr_filter=issue_attr_filter)
+
+        # issues = sorted(issues, key=lambda issue: int(issue['number']))
+
+        print("Embedding Issues...")
+        texts_to_embed = self._create_strings_to_embbed(issues)
+        embeddings = EMBEDDING_CTX.encode(texts_to_embed)
+
+        self._data_ensure_size(repo, int(issues[0]['number']))
+        self[repo]['updated_at'] = self._find_latest_date(issues)
+
+        titles = self[repo]['titles']
+        embeddings_new = self[repo]['embeddings']
+        opened = self[repo]['opened']
+        closed = self[repo]['closed']
+
+        for i, issue in enumerate(issues):
+            number = int(issue['number'])
+            titles[number] = issue['title']
+            embeddings_new[number] = embeddings[i]
+            if issue['state'] == 'open':
+                opened[number] = True
+            if issue['state'] == 'closed':
+                closed[number] = True
+
+    def _embeddings_updated_get(self, repo):
+        with EMBEDDING_CTX.lock:
+            try:
+                data_repo = self[repo]
+            except:
+                self._embeddings_generate(repo)
+                data_repo = self[repo]
+
+            date_old = data_repo['updated_at']
+
+            issues = gitea_fetch_issues(
+                'blender', repo, since=date_old, issue_attr_filter=issue_attr_filter)
+
+            # Get the most recent date
+            date_new = self._find_latest_date(issues, date_old)
+
+            if date_new == date_old:
+                # Nothing changed
+                return data_repo
+
+            data_repo['updated_at'] = date_new
+
+            # autopep8: off
+            # Consider that if the time hasn't changed, it's the same issue.
+            issues = [issue for issue in issues if issue['updated_at'] != date_old]
+
+            self._data_ensure_size(repo, int(issues[0]['number']))
+
+            updated_at = gitea_issues_body_updated_at_get(issues)
+            issues_to_embed = []
+
+            for i, issue in enumerate(issues):
+                number = int(issue['number'])
+                if issue['state'] == 'open':
+                    data_repo['opened'][number] = True
+                if issue['state'] == 'closed':
+                    data_repo['closed'][number] = True
+
+                title_old = data_repo['titles'][number]
+                if title_old != issue['title']:
+                    data_repo['titles'][number] = issue['title']
+                    issues_to_embed.append(issue)
+                elif updated_at[i] >= date_old:
+                    issues_to_embed.append(issue)
+
+            if issues_to_embed:
+                print(f"Embedding {len(issues_to_embed)} issue{'s' if len(issues_to_embed) > 1 else ''}")
+                texts_to_embed = self._create_strings_to_embbed(issues_to_embed)
+                embeddings = EMBEDDING_CTX.encode(texts_to_embed)
+
+                for i, issue in enumerate(issues_to_embed):
+                    number = int(issue['number'])
+                    data_repo['embeddings'][number] = embeddings[i]
+
+        # autopep8: on
+        return data_repo
+
+    def _sort_similarity(self,
+                         repo: str,
+                         query_emb: List[torch.Tensor],
+                         limit: int,
+                         state: State = State.opened) -> list:
+        duplicates = []
+
+        data = self[repo]
+        embeddings = data['embeddings']
+        mask_opened = data["opened"]
+
+        if state == State.all:
+            mask = mask_opened | data["closed"]
+        else:
+            mask = data[state.value]
+
+        embeddings = embeddings[mask]
+        true_indices = mask.nonzero(as_tuple=True)[0]
+
+        ret = util.semantic_search(
+            query_emb, embeddings, top_k=limit, score_function=util.dot_score)
+
+        for score in ret[0]:
+            corpus_id = score['corpus_id']
+            number = true_indices[corpus_id].item()
+            closed_char = "" if mask_opened[number] else "~~"
+            text = f"{closed_char}#{number}{closed_char}: {data['titles'][number]}"
+            duplicates.append(text)
+
+        return duplicates
+
+    def find_relatedness(self, repo: str, number: int, limit: int = 20, state: State = State.opened):
+        data = self._embeddings_updated_get(repo)
+
+        # Check if the embedding already exists.
+        if data['titles'][number] is not None:
+            new_embedding = data['embeddings'][number]
+        else:
+            gitea_issue = gitea_json_issue_get('blender', repo, number)
+            text_to_embed = self._create_issue_string(
+                gitea_issue['title'], gitea_issue['body'])
+
+            new_embedding = EMBEDDING_CTX.encode([text_to_embed])
+
+        duplicates = self._sort_similarity(
+            repo, new_embedding, limit=limit, state=state)
+
+        if not duplicates:
+            return ''
+
+        if match := re.search(r'(~~)?#(\d+)(~~)?:', duplicates[0]):
+            number_cached = int(match.group(2))
+            if number_cached == number:
+                return '\n'.join(duplicates[1:])
+
+        return '\n'.join(duplicates)
+
+
+G_data = _Data()
 
 
-@router.get("/find_related/{repo}/{number}")
+@router.get("/find_related/{repo}/{number}", response_class=PlainTextResponse)
 def find_related(repo: str = 'blender', number: int = 104399, limit: int = 15, state: State = State.opened) -> str:
-    related = find_relatedness(repo, number, limit=limit, state=state)
+    related = G_data.find_relatedness(repo, number, limit=limit, state=state)
     return related
 
 
 if __name__ == "__main__":
     update_cache = True
     if update_cache:
-        _embeddings_updated_get('blender')
-        _embeddings_updated_get('blender-addons')
-        with open(G_cache_path, "wb") as file:
+        G_data._embeddings_updated_get('blender')
+        G_data._embeddings_updated_get('blender-addons')
+        with open(G_data.cache_path, "wb") as file:
             # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
             for val in G_data.values():
                 val['embeddings'] = val['embeddings'].to(torch.device('cpu'))
 
-            pickle.dump(G_data, file, protocol=pickle.HIGHEST_PROTOCOL)
+            pickle.dump(dict(G_data), file, protocol=pickle.HIGHEST_PROTOCOL)
 
     # Converting the embeddings to be GPU.
     for val in G_data.values():
         val['embeddings'] = val['embeddings'].to(torch.device('cuda'))
 
     # 'blender/blender/111434' must print #96153, #83604 and #79762
-    related1 = find_relatedness(
+    related1 = G_data.find_relatedness(
         'blender', 111434, limit=20, state=State.all)
-    related2 = find_relatedness('blender-addons', 104399, limit=20)
+    related2 = G_data.find_relatedness('blender-addons', 104399, limit=20)
 
     print("These are the 20 most related issues:")
     print(related1)
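Besides the PlainTextResponse change, this file folds the old module-level globals into a _Data dict subclass, but the retrieval trick in _sort_similarity is unchanged: filter the embedding matrix with a boolean mask (opened, closed, or both), search only the filtered rows, then map hits back to issue numbers through the mask's nonzero indices. A self-contained toy sketch of that mask-and-remap step, with random vectors standing in for real issue embeddings:

# Toy sketch (not part of the commit): six fake issue embeddings,
# where the mask plays the role of data['opened'].
import torch
from sentence_transformers import util

embeddings = torch.nn.functional.normalize(torch.randn(6, 8), dim=1)
mask = torch.tensor([True, False, True, True, False, True])

filtered = embeddings[mask]                    # restricted search space
true_indices = mask.nonzero(as_tuple=True)[0]  # filtered row -> issue number

query_emb = embeddings[3:4]  # pretend issue #3 is the query
hits = util.semantic_search(query_emb, filtered, top_k=3,
                            score_function=util.dot_score)
for hit in hits[0]:
    number = true_indices[hit['corpus_id']].item()
    print(f"#{number}: score={hit['score']:.3f}")

Because corpus ids returned by util.semantic_search index the filtered matrix, the nonzero lookup is what keeps the reported issue numbers stable regardless of which state filter is active.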
routers/tool_gpu_checker.py
CHANGED
@@ -2,6 +2,7 @@
 
 import re
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 
 router = APIRouter()
@@ -45,6 +46,7 @@ def _check_amd(graphics_card_info):
         r"(Radeon\s*)?RX\s*560\b": "it has Baffin XT chip that belongs to GCN 4th gen architecture",
         r"(Radeon\s*)?5(40X|50X)\b": "it has Polaris 23 XT chip that belongs to GCN 4th gen architecture",
         r"(Radeon\s*)?RX\s*5(40|50)\b": "it has Lexa Pro chip that belongs to GCN 4th gen architecture",
+        r"(Radeon\s*)?RX\s*480\b": "it has Arctic Islands chip that belongs to GCN 4th gen architecture",
         r"(Radeon\s*)?(\(TM\)\s*)?RX\s*4[6-8]0(\b|D)": "it has Ellesmere chip that belongs to GCN 4st gen architecture",
         r"(Radeon\s*)?5(30X|35)\b": "it has Polaris 24 XT chip that belongs to GCN 3rd gen architecture",
         r"(Radeon\s*)?530\b": "it has Weston chip that belongs to GCN 3rd gen architecture",
@@ -192,10 +194,10 @@ If that doesn't help, you can use Blender 2.79: https://www.blender.org/download
 
 
 @router.get("/gpu_checker")
-def gpu_checker(gpu_info: str = ""):
+def gpu_checker(gpu_info: str = "", response_class=PlainTextResponse):
     message = gpu_checker_get_message(gpu_info)
 
-    return
+    return message
 
 
 if __name__ == "__main__":
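One detail worth flagging in this hunk: unlike the other endpoints in this commit, response_class=PlainTextResponse is added here as a keyword parameter of gpu_checker itself rather than as an argument to @router.get. In FastAPI the response class is configured on the route decorator; a default-valued function parameter is instead treated as part of the endpoint's signature. A sketch of the presumably intended form, matching the /bpy_doc and /find_related routes above:

# Sketch of the decorator-level form (assumed intent, not what was committed):
@router.get("/gpu_checker", response_class=PlainTextResponse)
def gpu_checker(gpu_info: str = ""):
    message = gpu_checker_get_message(gpu_info)
    return message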
routers/tool_wiki_search.py
CHANGED
@@ -3,9 +3,11 @@
 import os
 import pickle
 import re
+import torch
 from typing import Dict, List
 from sentence_transformers import util
 from fastapi import APIRouter
+from fastapi.responses import PlainTextResponse
 
 try:
     from .embedding import EMBEDDING_CTX
@@ -16,267 +18,272 @@ router = APIRouter()
 
 MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
 BASE_URL = "https://docs.blender.org/manual/en/dev"
-G_cache_path = "routers/embedding/embeddings_manual.pkl"
 G_data = None
 
 
-    G_data['toctree']["copyright"] = parse_file_recursive(
-        MANUAL_DIR, 'copyright.rst')
-
-    G_data['texts'] = texts
-    G_data['embeddings'] = EMBEDDING_CTX.encode(texts)
-
-    """
-    Splits a text into sections based on titles and subtitles, and organizes them into a dictionary.
-
-        prefix (str): prefix to titles and subtitles
-
-        The Blender Community
-        *********************
-
-        Being freely available from the start.
-
-        Independent Sites
-        =================
-
-        There are `several independent websites.
-
-        Getting Support
-        ===============
-
-        Blender's community is one of its greatest features.
-        '''
-
-        result = split_in_topics(text)
-        # result will be:
-        # {
-        #     "# The Blender Community": [
-        #         "Being freely available from the start."
-        #     ],
-        #     "# The Blender Community | Independent Sites": [
-        #         "There are `several independent websites."
-        #     ],
-        #     "# The Blender Community | Getting Support": [
-        #         "Blender's community is one of its greatest features."
-        #     ]
-        # }
-    """
-
-    # Remove patterns ".. word::" and ":word:"
-    text = re.sub(r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', text)
-
-    # Regular expression to find titles and subtitles
-    pattern = r'([\*|#|%]{3,}\n[^\n]+\n[\*|#|%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
-
-    # Split text by found patterns
-    sections = re.split(pattern, text)
-
-    # Remove possible white spaces at the beginning and end of each section
-    sections = [section for section in sections if section.strip()]
-
-    # Separate sections into a dictionary
-    topics = {}
-    current_title = ''
-    current_topic = prefix
-
-    for section in sections:
-        if match := re.match(r'[\*|#|%]{3,}\n([^\n]+)\n[\*|#|%]{3,}', section):
-            current_topic = current_title = f'{prefix}# {match.group(1)}'
-            topics[current_topic] = []
-        elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
-            current_topic = current_title + ' | ' + match.group(1)
-            topics[current_topic] = []
-        else:
-            if current_topic == prefix:
-                raise
-            topics[current_topic].append(section)
-
-    return topics
-
-
-# Function to split the text into chunks of a maximum number of tokens
-def split_into_many(page_body, prefix=''):
-    tokenizer = EMBEDDING_CTX.model.tokenizer
-    max_tokens = EMBEDDING_CTX.model.max_seq_length
-    topics = split_into_topics(page_body, prefix)
-
-    for topic, content_list in topics.items():
-        title = topic + ':\n'
-        title_tokens_len = len(tokenizer.tokenize(title))
-        content_list_new = []
-        for content in content_list:
-            content_reduced = reduce_text(content)
-            content_tokens_len = len(tokenizer.tokenize(content_reduced))
-            if title_tokens_len + content_tokens_len <= max_tokens:
-                content_list_new.append(content_reduced)
-                continue
-
-            # Split the text into sentences
-            paragraphs = content_reduced.split('.\n')
-            sentences = ''
-            tokens_so_far = title_tokens_len
-
-            # Loop through the sentences and tokens joined together in a tuple
-            for sentence in paragraphs:
-                sentence += '.\n'
-
-                # Get the number of tokens for each sentence
-                n_tokens = len(tokenizer.tokenize(sentence))
-
-                # If the number of tokens so far plus the number of tokens in the current sentence is greater
-                # than the max number of tokens, then add the chunk to the list of chunks and reset
-                # the chunk and tokens so far
-                if tokens_so_far + n_tokens > max_tokens:
-                    content_list_new.append(sentences)
-                    sentences = ''
-                    tokens_so_far = title_tokens_len
-
-                tokens_so_far += n_tokens
-
-        #
-        content_list.clear()
-        content_list.extend(content_list_new)
-
-            result.append(topic + ':\n' + content)
-
-        for key in page['toctree'].keys():
-            page_child = page['toctree'][key]
-            result.extend(get_texts_recursive(page_child, f'{path}/{key}'))
-    except KeyError:
-        pass
-
-    ret = util.semantic_search(
-        query_emb, data['embeddings'], top_k=limit, score_function=util.dot_score)
-
-    texts = data['texts']
-    for score in ret[0]:
-        corpus_id = score['corpus_id']
-        text = texts[corpus_id]
-        results.append(text)
-
+class _Data(dict):
+    cache_path = "routers/embedding/embeddings_manual.pkl"
+
+    @staticmethod
+    def reduce_text(text):
+        # Remove repeated characters
+        text = re.sub(r'%{2,}', '', text)  # Title
+        text = re.sub(r'#{2,}', '', text)  # Title
+        text = re.sub(r'\*{3,}', '', text)  # Title
+        text = re.sub(r'={3,}', '', text)  # Topic
+        text = re.sub(r'\^{3,}', '', text)
+        text = re.sub(r'-{3,}', '', text)
+
+        text = re.sub(r'(\s*\n\s*)+', '\n', text)
+        return text
+
+    @classmethod
+    def parse_file_recursive(cls, filedir, filename):
+        with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
+            content = file.read()
+
+        parsed_data = {}
+
+        if not filename.endswith('index.rst'):
+            body = content.strip()
+        else:
+            parts = content.split(".. toctree::")
+            body = parts[0].strip()
+
+            if len(parts) > 1:
+                parsed_data["toctree"] = {}
+                for part in parts[1:]:
+                    toctree_entries = part.split('\n')
+                    line = toctree_entries[0]
+                    for entry in toctree_entries[1:]:
+                        entry = entry.strip()
+                        if not entry:
+                            continue
+
+                        if entry.startswith('/'):
+                            # relative path.
+                            continue
+
+                        if not entry.endswith('.rst'):
+                            continue
+
+                        if entry.endswith('/index.rst'):
+                            entry_name = entry[:-10]
+                            filedir_ = os.path.join(filedir, entry_name)
+                            filename_ = 'index.rst'
+                        else:
+                            entry_name = entry[:-4]
+                            filedir_ = filedir
+                            filename_ = entry
+
+                        parsed_data['toctree'][entry_name] = cls.parse_file_recursive(
+                            filedir_, filename_)
+
+        # The '\n' at the end of the file resolves regex patterns
+        parsed_data['body'] = body + '\n'
+
+        return parsed_data
+
+    @staticmethod
+    def split_into_topics(text: str, prefix: str = '') -> Dict[str, List[str]]:
+        """
+        Splits a text into sections based on titles and subtitles, and organizes them into a dictionary.
+
+        Args:
+            text (str): The input text to be split. The text should contain titles marked by asterisks (***)
+                        or subtitles marked by equal signs (===).
+            prefix (str): prefix to titles and subtitles
+
+        Returns:
+            Dict[str, List[str]]: A dictionary where keys are section titles or subtitles, and values are lists of
+                                  strings corresponding to the content under each title or subtitle.
+
+        Example:
+            text = '''
+            *********************
+            The Blender Community
+            *********************
+
+            Being freely available from the start.
+
+            Independent Sites
+            =================
+
+            There are `several independent websites.
+
+            Getting Support
+            ===============
+
+            Blender's community is one of its greatest features.
+            '''
+
+            result = split_in_topics(text)
+            # result will be:
+            # {
+            #     "# The Blender Community": [
+            #         "Being freely available from the start."
+            #     ],
+            #     "# The Blender Community | Independent Sites": [
+            #         "There are `several independent websites."
+            #     ],
+            #     "# The Blender Community | Getting Support": [
+            #         "Blender's community is one of its greatest features."
+            #     ]
+            # }
+        """
+
+        # Remove patterns ".. word::" and ":word:"
+        text = re.sub(r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', text)
+
+        # Regular expression to find titles and subtitles
+        pattern = r'([\*|#|%]{3,}\n[^\n]+\n[\*|#|%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
+
+        # Split text by found patterns
+        sections = re.split(pattern, text)
+
+        # Remove possible white spaces at the beginning and end of each section
+        sections = [section for section in sections if section.strip()]
+
+        # Separate sections into a dictionary
+        topics = {}
+        current_title = ''
+        current_topic = prefix
+
+        for section in sections:
+            if match := re.match(r'[\*|#|%]{3,}\n([^\n]+)\n[\*|#|%]{3,}', section):
+                current_topic = current_title = f'{prefix}# {match.group(1)}'
+                topics[current_topic] = []
+            elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
+                current_topic = current_title + ' | ' + match.group(1)
+                topics[current_topic] = []
+            else:
+                if current_topic == prefix:
+                    raise
+                topics[current_topic].append(section)
+
+        return topics
+
+    @classmethod
+    def split_into_many(cls, page_body, prefix=''):
+        """
+        # Function to split the text into chunks of a maximum number of tokens
+        """
+        tokenizer = EMBEDDING_CTX.model.tokenizer
+        max_tokens = EMBEDDING_CTX.model.max_seq_length
+        topics = cls.split_into_topics(page_body, prefix)
+
+        for topic, content_list in topics.items():
+            title = topic + ':\n'
+            title_tokens_len = len(tokenizer.tokenize(title))
+            content_list_new = []
+            for content in content_list:
+                content_reduced = cls.reduce_text(content)
+                content_tokens_len = len(tokenizer.tokenize(content_reduced))
+                if title_tokens_len + content_tokens_len <= max_tokens:
+                    content_list_new.append(content_reduced)
+                    continue
+
+                # Split the text into sentences
+                paragraphs = content_reduced.split('.\n')
+                sentences = ''
+                tokens_so_far = title_tokens_len
+
+                # Loop through the sentences and tokens joined together in a tuple
+                for sentence in paragraphs:
+                    sentence += '.\n'
+
+                    # Get the number of tokens for each sentence
+                    n_tokens = len(tokenizer.tokenize(sentence))
+
+                    # If the number of tokens so far plus the number of tokens in the current sentence is greater
+                    # than the max number of tokens, then add the chunk to the list of chunks and reset
+                    # the chunk and tokens so far
+                    if tokens_so_far + n_tokens > max_tokens:
+                        content_list_new.append(sentences)
+                        sentences = ''
+                        tokens_so_far = title_tokens_len
+
+                    sentences += sentence
+                    tokens_so_far += n_tokens
+
+                if sentences:
+                    content_list_new.append(sentences)
+
+            # Replace content_list
+            content_list.clear()
+            content_list.extend(content_list_new)
+
+        result = []
+        for topic, content_list in topics.items():
+            for content in content_list:
+                result.append(topic + ':\n' + content)
+
+        return result
+
+    @classmethod
+    def get_texts_recursive(cls, page, path=''):
+        result = cls.split_into_many(page['body'], path)
+
+        try:
+            for key in page['toctree'].keys():
+                page_child = page['toctree'][key]
+                result.extend(cls.get_texts_recursive(
+                    page_child, f'{path}/{key}'))
+        except KeyError:
+            pass
+
+        return result
+
+    def _embeddings_generate(self):
+        if os.path.exists(self.cache_path):
+            with open(self.cache_path, 'rb') as file:
+                data = pickle.load(file)
+                self.update(data)
+            return self
+
+        # Generate
+
+        manual = self.parse_file_recursive(MANUAL_DIR, 'index.rst')
+        manual['toctree']["copyright"] = self.parse_file_recursive(
+            MANUAL_DIR, 'copyright.rst')
+
+        # Create a list to store the text files
+        texts = self.get_texts_recursive(manual)
+
+        print("Embedding Texts...")
+        self['texts'] = texts
+        self['embeddings'] = EMBEDDING_CTX.encode(texts)
+
+        with open(self.cache_path, "wb") as file:
+            # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
+            self['embeddings'] = self['embeddings'].to(torch.device('cpu'))
+            pickle.dump(dict(self), file, protocol=pickle.HIGHEST_PROTOCOL)
+
+        return G_data
+
+    def _sort_similarity(self, text_to_search, limit):
+        results = []
+
+        query_emb = EMBEDDING_CTX.encode([text_to_search])
+        ret = util.semantic_search(
+            query_emb, self['embeddings'], top_k=limit, score_function=util.dot_score)
+
+        texts = self['texts']
+        for score in ret[0]:
+            corpus_id = score['corpus_id']
+            text = texts[corpus_id]
+            results.append(text)
+
+        return results
+
+
+G_data = _Data()
 
 
-@router.get("/wiki_search")
+@router.get("/wiki_search", response_class=PlainTextResponse)
 def wiki_search(query: str = "") -> str:
-    data = _embeddings_generate()
-    texts = _sort_similarity(
+    data = G_data._embeddings_generate()
+    texts = G_data._sort_similarity(query, 5)
 
     result = f'BASE_URL: {BASE_URL}\n'
     for text in texts:
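The load-bearing piece of this file is the chunking in split_into_many: each topic's text must fit the embedding model's sequence limit, and the topic title's token cost is re-counted against every chunk's budget so the title can later be prepended to each chunk. A stripped-down sketch of the same token-budget loop, with a hypothetical tokenize callable standing in for EMBEDDING_CTX.model.tokenizer.tokenize:

# Sketch only: same flush-and-restart budget logic as split_into_many,
# minus the manual-specific parsing.
def chunk_by_tokens(title, paragraphs, tokenize, max_tokens):
    title_len = len(tokenize(title))
    chunks, current, used = [], '', title_len
    for paragraph in paragraphs:
        n = len(tokenize(paragraph))
        # Flush when this paragraph would exceed the budget, then
        # restart the count from the title's token cost.
        if used + n > max_tokens and current:
            chunks.append(current)
            current, used = '', title_len
        current += paragraph
        used += n
    if current:
        chunks.append(current)
    return [title + chunk for chunk in chunks]

# Example with a whitespace "tokenizer":
print(chunk_by_tokens('T:\n', ['a b.\n', 'c d e.\n', 'f.\n'],
                      str.split, max_tokens=5))

Note that the commit also fixes a real bug in this loop: the old version incremented tokens_so_far without ever doing sentences += sentence, so oversized topics produced empty chunks; the new method accumulates the sentence and flushes the remainder after the loop.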