Spaces:
Running
Running
Germano Cavalcante
committed on
Commit
•
5def575
1
Parent(s):
566fa33
Find Related: Optimize: Avoid creating embeddings for text that has not changed
Browse files- routers/tool_find_related.py +21 -8
- routers/tool_find_related_cache.pkl +2 -2
- routers/utils_gitea.py +21 -0
routers/tool_find_related.py
CHANGED
@@ -12,9 +12,9 @@ from sentence_transformers import SentenceTransformer, util
|
|
12 |
from fastapi import APIRouter
|
13 |
|
14 |
try:
|
15 |
-
from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get
|
16 |
except:
|
17 |
-
from utils_gitea import gitea_fetch_issues, gitea_json_issue_get
|
18 |
|
19 |
|
20 |
def _create_issue_string(title, body):
|
@@ -56,7 +56,8 @@ class EmbeddingContext:
|
|
56 |
TOKEN_LEN_MAX_FOR_EMBEDDING = 512
|
57 |
TOKEN_LEN_MAX_BALCKLIST = 2 * TOKEN_LEN_MAX_FOR_EMBEDDING
|
58 |
ARRAY_CHUNK_SIZE = 4096
|
59 |
-
issue_attr_filter = {'number', 'title', 'body',
|
|
|
60 |
cache_path = "routers/tool_find_related_cache.pkl"
|
61 |
|
62 |
# Set when creating the object
|
@@ -278,24 +279,36 @@ class EmbeddingContext:
|
|
278 |
data['updated_at'] = date_new
|
279 |
|
280 |
# autopep8: off
|
281 |
-
# WORKAROUND:
|
282 |
# Consider that if the time hasn't changed, it's the same issue.
|
283 |
issues = [issue for issue in issues if issue['updated_at'] != date_old]
|
284 |
|
285 |
self.data_ensure_size(repo, int(issues[0]['number']))
|
286 |
|
287 |
-
|
288 |
-
|
289 |
|
290 |
for i, issue in enumerate(issues):
|
291 |
number = int(issue['number'])
|
292 |
-
data['titles'][number] = issue['title']
|
293 |
-
data['embeddings'][number] = embeddings[i]
|
294 |
if issue['state'] == 'open':
|
295 |
data['opened'][number] = True
|
296 |
if issue['state'] == 'closed':
|
297 |
data['closed'][number] = True
|
298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
# autopep8: on
|
300 |
return data
|
301 |
|
|
|
12 |
from fastapi import APIRouter
|
13 |
|
14 |
try:
|
15 |
+
from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
|
16 |
except:
|
17 |
+
from utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
|
18 |
|
19 |
|
20 |
def _create_issue_string(title, body):
|
|
|
56 |
TOKEN_LEN_MAX_FOR_EMBEDDING = 512
|
57 |
TOKEN_LEN_MAX_BALCKLIST = 2 * TOKEN_LEN_MAX_FOR_EMBEDDING
|
58 |
ARRAY_CHUNK_SIZE = 4096
|
59 |
+
issue_attr_filter = {'number', 'title', 'body',
|
60 |
+
'state', 'updated_at', 'created_at'}
|
61 |
cache_path = "routers/tool_find_related_cache.pkl"
|
62 |
|
63 |
# Set when creating the object
|
|
|
279 |
data['updated_at'] = date_new
|
280 |
|
281 |
# autopep8: off
|
|
|
282 |
# Consider that if the time hasn't changed, it's the same issue.
|
283 |
issues = [issue for issue in issues if issue['updated_at'] != date_old]
|
284 |
|
285 |
self.data_ensure_size(repo, int(issues[0]['number']))
|
286 |
|
287 |
+
updated_at = gitea_issues_body_updated_at_get(issues)
|
288 |
+
issues_to_embed = []
|
289 |
|
290 |
for i, issue in enumerate(issues):
|
291 |
number = int(issue['number'])
|
|
|
|
|
292 |
if issue['state'] == 'open':
|
293 |
data['opened'][number] = True
|
294 |
if issue['state'] == 'closed':
|
295 |
data['closed'][number] = True
|
296 |
|
297 |
+
title_old = data['titles'][number]
|
298 |
+
if title_old != issue['title']:
|
299 |
+
data['titles'][number] = issue['title']
|
300 |
+
issues_to_embed.append(issue)
|
301 |
+
elif updated_at[i] >= date_old:
|
302 |
+
issues_to_embed.append(issue)
|
303 |
+
|
304 |
+
if issues_to_embed:
|
305 |
+
texts_to_embed = self.create_strings_to_embbed(issues_to_embed, black_list)
|
306 |
+
embeddings = self.encode(texts_to_embed)
|
307 |
+
|
308 |
+
for i, issue in enumerate(issues_to_embed):
|
309 |
+
number = int(issue['number'])
|
310 |
+
data['embeddings'][number] = embeddings[i]
|
311 |
+
|
312 |
# autopep8: on
|
313 |
return data
|
314 |
|
routers/tool_find_related_cache.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a181cc69d535d6502588e4c14bea367d74dfaca17a5602a23a72def479f592cc
|
3 |
+
size 723433353
|
routers/utils_gitea.py
CHANGED
@@ -87,3 +87,24 @@ def gitea_fetch_issues(owner, repo, state='all', labels='', issue_attr_filter=No
|
|
87 |
encoded_query_params = urllib.parse.urlencode(query_params)
|
88 |
issues_url = f"{base_url}?{encoded_query_params}"
|
89 |
return url_json_get_all_pages(issues_url, item_filter=issue_attr_filter, exclude=exclude, verbose=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
encoded_query_params = urllib.parse.urlencode(query_params)
|
88 |
issues_url = f"{base_url}?{encoded_query_params}"
|
89 |
return url_json_get_all_pages(issues_url, item_filter=issue_attr_filter, exclude=exclude, verbose=True)
|
90 |
+
|
91 |
+
|
92 |
+
def gitea_issues_body_updated_at_get(issues, verbose=True, *, owner='blender', repo='blender'):
    """Return, for each issue, the timestamp of its latest content edit.

    For every issue the ``content-history/list`` endpoint of
    ``projects.blender.org`` is queried. When the response has a non-empty
    ``results`` list, the value of the ``datetime="..."`` attribute embedded
    in the first entry's ``name`` field is returned; otherwise the issue's
    own ``created_at`` value is used.
    NOTE(review): this presumes the first entry is the most recent edit --
    confirm against the Gitea content-history response.

    The requests run concurrently in a thread pool. The returned list is in
    the SAME ORDER as ``issues`` (``result[i]`` belongs to ``issues[i]``),
    so callers may index it positionally.

    :param issues: iterable of issue dicts providing ``'number'`` and
        ``'created_at'``.
    :param verbose: when true, print one progress line per issue.
    :param owner: repository owner used in the URL (keyword-only).
    :param repo: repository name used in the URL (keyword-only).
    :return: list of timestamp strings, one per input issue.
    """
    def fetch_issue(issue):
        number = issue['number']
        if verbose:
            print(f"Fetched issue #{number}")

        json_data = url_json_get(
            f"https://projects.blender.org/{owner}/{repo}/issues/{number}/content-history/list")
        # Verify that the response contains the expected data before trying to access it
        if json_data and json_data['results']:
            return json_data['results'][0]['name'].split('datetime="')[1].split('"')[0]
        return issue['created_at']

    with ThreadPoolExecutor() as executor:
        # `Executor.map` yields results in input order. Collecting futures
        # with `as_completed` would return them in completion order, which
        # breaks callers that pair `result[i]` with `issues[i]`.
        return list(executor.map(fetch_issue, issues))
|