Spaces: Sleeping
romnatall committed · Commit 0514b29 · 1 Parent(s): e633090
Prediction with three models (прогнозирование по трем моделям)
Browse files:
- app.py +50 -23
- data/books_model (2).ipynb +620 -0
- data/data.csv +2 -2
- data/{embeddings.npy → embeddings_bert.npy} +2 -2
- data/tf_idf_vectorizer.pkl +3 -0
app.py
CHANGED
@@ -7,17 +7,27 @@ import torch
 from transformers import AutoTokenizer, AutoModel
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.metrics.pairwise import pairwise_distances
+import faiss
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pickle
 
-movies = pd.read_csv('data/data.csv')
-
+movies = pd.read_csv('data/data.csv')
 toggle_state = st.sidebar.checkbox("режим разметки")  # "annotation mode"
 input_search = st.text_input('Search')
 
 
 
-data = np.load('data/embeddings.npy')
-
+data = np.load('data/embeddings_bert.npy')
 
+def top_indices(array, n, upsc=False):
+    # Get the element indices sorted in descending order (ascending if upsc)
+    st.session_state["pred"] = array
+    sorted_indices = np.argsort(array)[::1 if upsc else -1]
+    # Keep the first n indices
+    top_n_indices = sorted_indices[:n]
+    return top_n_indices
 
 @st.cache_resource
 def get_embeddings():
@@ -36,27 +46,50 @@ def embed_bert_cls(text, ):
     embeddings = torch.nn.functional.normalize(embeddings)
     return embeddings[0].cpu().numpy()
 
+@st.cache_resource
+def getmodels():
+    with open('data/logreg.pkl', 'rb') as f:
+        logreg = pickle.load(f)
+    with open('data/tf_idf_vectorizer.pkl', 'rb') as f:
+        vectorizer = pickle.load(f)
+    return logreg, vectorizer
 
 @st.cache_data
 def predict_rating(input_search):
+    logreg, vectorizer = getmodels()
+
     emb = embed_bert_cls(input_search)
     X = np.column_stack((data, np.tile(emb, (data.shape[0], 1))))
-    st.session_state["X"] = X
 
-    # import pickle
-    # with open('logreg.pkl', 'rb') as f:
-    #     logreg = pickle.load(f)
-    #     y = logreg.predict(X)
+    user_tfidf = vectorizer.transform([input_search])
+    tfidf_matrix = vectorizer.transform(movies['description'])
+    tfidf_matrix2 = vectorizer.transform(movies['name'])
 
-    y = cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)
+    similarity_scores_desc = cosine_similarity(user_tfidf, tfidf_matrix)
+    similarity_scores_name = cosine_similarity(user_tfidf, tfidf_matrix2)
+
+    y_log = logreg.predict(X)
+    y_emb = cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)
+
+    y = (similarity_scores_desc*0.9 + similarity_scores_name*0.035 + y_emb*.4 + y_log*0.4).reshape(-1)
+    st.session_state["pred"] = y
+
+    return top_indices(y, 10, upsc=False)
 
 
 def saverank(index, new_X, new_y):
@@ -174,19 +207,13 @@ def getnums(df,size=0,text=''):
     return reqs[text]
 
 
-def top_indices(array, n):
-    # Get the element indices sorted in descending order
-    st.session_state["pred"] = array
-    sorted_indices = np.argsort(array)[::-1]
-    # Keep the first n indices
-    top_n_indices = sorted_indices[:n]
-    return top_n_indices
 
 
+if input_search:
+    for i in predict_rating(input_search):
+        display_movie_card(movies, i)
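The rewritten predict_rating is where the commit's "three models" come together: TF-IDF cosine similarity against descriptions and names, cosine similarity between rubert-tiny2 embeddings, and a logistic-regression score over the stacked embedding features, blended with fixed weights. Below is a minimal sketch of that blend; the corpus, query, random embeddings, and toy labels are stand-ins of my own, while the weights (0.9 / 0.035 / 0.4 / 0.4) are the ones in the diff. In the Space itself the embeddings come from rubert-tiny2 and the fitted vectorizer/classifier are loaded from data/*.pkl.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

descriptions = ["space crew travels through a wormhole",
                "a detective story in an American school",
                "fire and water creatures share a city"]
names = ["Interstellar", "Wednesday", "Elemental"]
query = "scientists travel through a wormhole"

# Signal 1: TF-IDF cosine similarity against descriptions and names.
vectorizer = TfidfVectorizer().fit(descriptions)
q = vectorizer.transform([query])
sim_desc = cosine_similarity(q, vectorizer.transform(descriptions)).ravel()
sim_name = cosine_similarity(q, vectorizer.transform(names)).ravel()

# Signal 2: cosine similarity in a dense embedding space (random vectors
# here stand in for the precomputed BERT embeddings).
rng = np.random.default_rng(0)
emb_corpus = rng.normal(size=(3, 8))
emb_query = rng.normal(size=(1, 8))
sim_emb = cosine_similarity(emb_corpus, emb_query).ravel()

# Signal 3: a relevance classifier over [item_embedding | query_embedding],
# mirroring np.column_stack((data, np.tile(emb, ...))) in app.py.
X = np.column_stack((emb_corpus, np.tile(emb_query, (3, 1))))
clf = LogisticRegression().fit(X, [1, 0, 1])  # toy labels, for illustration only
score_clf = clf.predict(X)

# Weighted blend with the commit's coefficients, then rank descending.
y = sim_desc * 0.9 + sim_name * 0.035 + sim_emb * 0.4 + score_clf * 0.4
print(np.argsort(y)[::-1])  # indices of the best matches first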
data/books_model (2).ipynb
ADDED
@@ -0,0 +1,620 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
+
"metadata": {
|
7 |
+
"id": "IlvYwT4VD8Bd"
|
8 |
+
},
|
9 |
+
"outputs": [
|
10 |
+
{
|
11 |
+
"name": "stdout",
|
12 |
+
"output_type": "stream",
|
13 |
+
"text": [
|
14 |
+
"Collecting sentence_transformers\n",
|
15 |
+
" Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)\n",
|
16 |
+
"Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (4.39.3)\n",
|
17 |
+
"Requirement already satisfied: tqdm in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (4.66.2)\n",
|
18 |
+
"Requirement already satisfied: torch>=1.11.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (2.2.2)\n",
|
19 |
+
"Requirement already satisfied: numpy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (1.26.4)\n",
|
20 |
+
"Requirement already satisfied: scikit-learn in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (1.4.1.post1)\n",
|
21 |
+
"Requirement already satisfied: scipy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (1.13.0)\n",
|
22 |
+
"Requirement already satisfied: huggingface-hub>=0.15.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (0.22.2)\n",
|
23 |
+
"Requirement already satisfied: Pillow in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (10.3.0)\n",
|
24 |
+
"Requirement already satisfied: filelock in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (3.13.3)\n",
|
25 |
+
"Requirement already satisfied: fsspec>=2023.5.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2024.3.1)\n",
|
26 |
+
"Requirement already satisfied: packaging>=20.9 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (24.0)\n",
|
27 |
+
"Requirement already satisfied: pyyaml>=5.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (6.0.1)\n",
|
28 |
+
"Requirement already satisfied: requests in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2.31.0)\n",
|
29 |
+
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (4.10.0)\n",
|
30 |
+
"Requirement already satisfied: sympy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (1.12)\n",
|
31 |
+
"Requirement already satisfied: networkx in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (3.2.1)\n",
|
32 |
+
"Requirement already satisfied: jinja2 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (3.1.3)\n",
|
33 |
+
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
|
34 |
+
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
|
35 |
+
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
|
36 |
+
"Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (8.9.2.26)\n",
|
37 |
+
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.3.1)\n",
|
38 |
+
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (11.0.2.54)\n",
|
39 |
+
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (10.3.2.106)\n",
|
40 |
+
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (11.4.5.107)\n",
|
41 |
+
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.0.106)\n",
|
42 |
+
"Requirement already satisfied: nvidia-nccl-cu12==2.19.3 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (2.19.3)\n",
|
43 |
+
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
|
44 |
+
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.11.0->sentence_transformers) (12.4.127)\n",
|
45 |
+
"Requirement already satisfied: regex!=2019.12.17 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (2023.12.25)\n",
|
46 |
+
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.15.2)\n",
|
47 |
+
"Requirement already satisfied: safetensors>=0.4.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.4.2)\n",
|
48 |
+
"Requirement already satisfied: joblib>=1.2.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from scikit-learn->sentence_transformers) (1.3.2)\n",
|
49 |
+
"Requirement already satisfied: threadpoolctl>=2.0.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from scikit-learn->sentence_transformers) (3.4.0)\n",
|
50 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from jinja2->torch>=1.11.0->sentence_transformers) (2.1.5)\n",
|
51 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.3.2)\n",
|
52 |
+
"Requirement already satisfied: idna<4,>=2.5 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.6)\n",
|
53 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (1.26.18)\n",
|
54 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2024.2.2)\n",
|
55 |
+
"Requirement already satisfied: mpmath>=0.19 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sympy->torch>=1.11.0->sentence_transformers) (1.3.0)\n",
|
56 |
+
"Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)\n",
|
57 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m171.5/171.5 kB\u001b[0m \u001b[31m566.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
58 |
+
"\u001b[?25hInstalling collected packages: sentence_transformers\n",
|
59 |
+
"Successfully installed sentence_transformers-2.7.0\n"
|
60 |
+
]
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"source": [
|
64 |
+
"!pip install sentence_transformers"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "code",
|
69 |
+
"execution_count": 6,
|
70 |
+
"metadata": {
|
71 |
+
"id": "sdvkA3cwEVoZ"
|
72 |
+
},
|
73 |
+
"outputs": [
|
74 |
+
{
|
75 |
+
"name": "stdout",
|
76 |
+
"output_type": "stream",
|
77 |
+
"text": [
|
78 |
+
"Collecting faiss-cpu\n",
|
79 |
+
" Downloading faiss_cpu-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n",
|
80 |
+
"Requirement already satisfied: numpy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from faiss-cpu) (1.26.4)\n",
|
81 |
+
"Downloading faiss_cpu-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)\n",
|
82 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.0/27.0 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
83 |
+
"\u001b[?25hInstalling collected packages: faiss-cpu\n",
|
84 |
+
"Successfully installed faiss-cpu-1.8.0\n"
|
85 |
+
]
|
86 |
+
}
|
87 |
+
],
|
88 |
+
"source": [
|
89 |
+
"!pip install faiss-cpu\n"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"cell_type": "code",
|
94 |
+
"execution_count": 7,
|
95 |
+
"metadata": {
|
96 |
+
"id": "Hv0VlA_ZAtjH"
|
97 |
+
},
|
98 |
+
"outputs": [],
|
99 |
+
"source": [
|
100 |
+
"import pandas as pd\n",
|
101 |
+
"from sentence_transformers import SentenceTransformer\n",
|
102 |
+
"import faiss\n",
|
103 |
+
"import torch\n",
|
104 |
+
"import numpy as np"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"cell_type": "code",
|
109 |
+
"execution_count": 2,
|
110 |
+
"metadata": {
|
111 |
+
"colab": {
|
112 |
+
"base_uri": "https://localhost:8080/",
|
113 |
+
"height": 701
|
114 |
+
},
|
115 |
+
"id": "ENfI_teQBvxa",
|
116 |
+
"outputId": "a23427df-f5aa-40f0-ac35-2974803c66b4"
|
117 |
+
},
|
118 |
+
"outputs": [],
|
119 |
+
"source": [
|
120 |
+
"data = pd.read_csv('data.csv')\n"
|
121 |
+
]
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"cell_type": "code",
|
125 |
+
"execution_count": 26,
|
126 |
+
"metadata": {
|
127 |
+
"id": "uLOmQB8VP_rH"
|
128 |
+
},
|
129 |
+
"outputs": [],
|
130 |
+
"source": [
|
131 |
+
"data = data.sample(frac=1)"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "code",
|
136 |
+
"execution_count": null,
|
137 |
+
"metadata": {
|
138 |
+
"id": "6eRXlMBBOyjr"
|
139 |
+
},
|
140 |
+
"outputs": [],
|
141 |
+
"source": [
|
142 |
+
"model = SentenceTransformer('distiluse-base-multilingual-cased')"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"cell_type": "code",
|
147 |
+
"execution_count": 48,
|
148 |
+
"metadata": {
|
149 |
+
"id": "HkhTwHXtO5xk"
|
150 |
+
},
|
151 |
+
"outputs": [],
|
152 |
+
"source": [
|
153 |
+
"data['annotation_len'] = data['description'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)\n",
|
154 |
+
"\n",
|
155 |
+
"embedings =np.load('embeddings.npy')[data['annotation_len'] > 10]\n",
|
156 |
+
"data = data[data['annotation_len'] > 10] # Отсечение слишком коротких аннотаций\n",
|
157 |
+
"\n",
|
158 |
+
"\n",
|
159 |
+
"data.to_csv('data.csv')\n",
|
160 |
+
"np.save('embeddings.npy',embedings)\n"
|
161 |
+
]
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"cell_type": "code",
|
165 |
+
"execution_count": 45,
|
166 |
+
"metadata": {},
|
167 |
+
"outputs": [
|
168 |
+
{
|
169 |
+
"ename": "AttributeError",
|
170 |
+
"evalue": "'numpy.ndarray' object has no attribute 'to_csv'",
|
171 |
+
"output_type": "error",
|
172 |
+
"traceback": [
|
173 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
174 |
+
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
175 |
+
"Cell \u001b[0;32mIn[45], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_csv\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 2\u001b[0m np\u001b[38;5;241m.\u001b[39msave(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124membeddings.npy\u001b[39m\u001b[38;5;124m'\u001b[39m,embedings)\n",
|
176 |
+
"\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'to_csv'"
|
177 |
+
]
|
178 |
+
}
|
179 |
+
],
|
180 |
+
"source": [
|
181 |
+
"data.to_csv('data.csv')\n",
|
182 |
+
"np.save('embeddings.npy',embedings)"
|
183 |
+
]
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"cell_type": "code",
|
187 |
+
"execution_count": 50,
|
188 |
+
"metadata": {},
|
189 |
+
"outputs": [
|
190 |
+
{
|
191 |
+
"data": {
|
192 |
+
"text/html": [
|
193 |
+
"<div>\n",
|
194 |
+
"<style scoped>\n",
|
195 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
196 |
+
" vertical-align: middle;\n",
|
197 |
+
" }\n",
|
198 |
+
"\n",
|
199 |
+
" .dataframe tbody tr th {\n",
|
200 |
+
" vertical-align: top;\n",
|
201 |
+
" }\n",
|
202 |
+
"\n",
|
203 |
+
" .dataframe thead th {\n",
|
204 |
+
" text-align: right;\n",
|
205 |
+
" }\n",
|
206 |
+
"</style>\n",
|
207 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
208 |
+
" <thead>\n",
|
209 |
+
" <tr style=\"text-align: right;\">\n",
|
210 |
+
" <th></th>\n",
|
211 |
+
" <th>name</th>\n",
|
212 |
+
" <th>description</th>\n",
|
213 |
+
" <th>link</th>\n",
|
214 |
+
" <th>year</th>\n",
|
215 |
+
" <th>imdb</th>\n",
|
216 |
+
" <th>kp</th>\n",
|
217 |
+
" <th>country</th>\n",
|
218 |
+
" <th>age</th>\n",
|
219 |
+
" <th>actors</th>\n",
|
220 |
+
" <th>genres</th>\n",
|
221 |
+
" <th>poster</th>\n",
|
222 |
+
" <th>annotation_len</th>\n",
|
223 |
+
" </tr>\n",
|
224 |
+
" </thead>\n",
|
225 |
+
" <tbody>\n",
|
226 |
+
" <tr>\n",
|
227 |
+
" <th>0</th>\n",
|
228 |
+
" <td>Уэнсдэй</td>\n",
|
229 |
+
" <td>В американской хоррор-комедии показана детект...</td>\n",
|
230 |
+
" <td>https://www.lordfilm.bot/48211-ujensdjej-2022....</td>\n",
|
231 |
+
" <td>2022.0</td>\n",
|
232 |
+
" <td>8.1</td>\n",
|
233 |
+
" <td>8.0</td>\n",
|
234 |
+
" <td>США</td>\n",
|
235 |
+
" <td>0+</td>\n",
|
236 |
+
" <td>Дженна Ортега, Гвендолин Кристи, Рики Линдхоум...</td>\n",
|
237 |
+
" <td>Сериалы, Фильмы про подростков</td>\n",
|
238 |
+
" <td>https://www.lordfilm.bot/uploads/posts/2022-12...</td>\n",
|
239 |
+
" <td>157</td>\n",
|
240 |
+
" </tr>\n",
|
241 |
+
" <tr>\n",
|
242 |
+
" <th>1</th>\n",
|
243 |
+
" <td>Слово пацана. Кровь на асфальте</td>\n",
|
244 |
+
" <td>Перестройка уже шагнула с кремлевских трибун ...</td>\n",
|
245 |
+
" <td>https://www.lordfilm.bot/50219-slovo-pacana-kr...</td>\n",
|
246 |
+
" <td>2023.0</td>\n",
|
247 |
+
" <td>NaN</td>\n",
|
248 |
+
" <td>NaN</td>\n",
|
249 |
+
" <td>Россия</td>\n",
|
250 |
+
" <td>18+</td>\n",
|
251 |
+
" <td>Иван Янковский, Елизавета Базыкина, Ольга Лапш...</td>\n",
|
252 |
+
" <td>Сериалы, ru</td>\n",
|
253 |
+
" <td>https://www.lordfilm.bot/uploads/posts/2023-11...</td>\n",
|
254 |
+
" <td>150</td>\n",
|
255 |
+
" </tr>\n",
|
256 |
+
" <tr>\n",
|
257 |
+
" <th>2</th>\n",
|
258 |
+
" <td>Элементарно</td>\n",
|
259 |
+
" <td>В Городе Стихий обитатели огня, воды, земли и...</td>\n",
|
260 |
+
" <td>https://www.lordfilm.bot/48863-jelementarno-20...</td>\n",
|
261 |
+
" <td>2023.0</td>\n",
|
262 |
+
" <td>7.0</td>\n",
|
263 |
+
" <td>7.7</td>\n",
|
264 |
+
" <td>США</td>\n",
|
265 |
+
" <td>6+</td>\n",
|
266 |
+
" <td>Леа Льюис, Мамуду Ати, Роналдо Дель Кармен, Ши...</td>\n",
|
267 |
+
" <td>Мультфильмы</td>\n",
|
268 |
+
" <td>https://www.lordfilm.bot/uploads/posts/2023-06...</td>\n",
|
269 |
+
" <td>34</td>\n",
|
270 |
+
" </tr>\n",
|
271 |
+
" <tr>\n",
|
272 |
+
" <th>3</th>\n",
|
273 |
+
" <td>Лука</td>\n",
|
274 |
+
" <td>Свои незабываемые каникулы, в которых есть ме...</td>\n",
|
275 |
+
" <td>https://www.lordfilm.bot/27172-luka-11-12-2021...</td>\n",
|
276 |
+
" <td>2021.0</td>\n",
|
277 |
+
" <td>7.4</td>\n",
|
278 |
+
" <td>7.8</td>\n",
|
279 |
+
" <td>США</td>\n",
|
280 |
+
" <td>6+</td>\n",
|
281 |
+
" <td>Джейкоб Тремблей, Джек Дилан Грейзер, Саша Бар...</td>\n",
|
282 |
+
" <td>Мультфильмы</td>\n",
|
283 |
+
" <td>https://www.lordfilm.bot/uploads/posts/2021-06...</td>\n",
|
284 |
+
" <td>68</td>\n",
|
285 |
+
" </tr>\n",
|
286 |
+
" <tr>\n",
|
287 |
+
" <th>4</th>\n",
|
288 |
+
" <td>Локи</td>\n",
|
289 |
+
" <td>Увлекательные приключения скандинавского бога...</td>\n",
|
290 |
+
" <td>https://www.lordfilm.bot/27119-loki-g1.html</td>\n",
|
291 |
+
" <td>2021.0</td>\n",
|
292 |
+
" <td>8.2</td>\n",
|
293 |
+
" <td>7.7</td>\n",
|
294 |
+
" <td>США</td>\n",
|
295 |
+
" <td>0+</td>\n",
|
296 |
+
" <td>Том Хиддлстон, Софи Ди Мартино, Ричард Э. Гран...</td>\n",
|
297 |
+
" <td>Сериалы, Фильмы Marvel</td>\n",
|
298 |
+
" <td>https://www.lordfilm.bot/uploads/posts/2023-10...</td>\n",
|
299 |
+
" <td>162</td>\n",
|
300 |
+
" </tr>\n",
|
301 |
+
" </tbody>\n",
|
302 |
+
"</table>\n",
|
303 |
+
"</div>"
|
304 |
+
],
|
305 |
+
"text/plain": [
|
306 |
+
" name \\\n",
|
307 |
+
"0 Уэнсдэй \n",
|
308 |
+
"1 Слово пацана. Кровь на асфальте \n",
|
309 |
+
"2 Элементарно \n",
|
310 |
+
"3 Лука \n",
|
311 |
+
"4 Локи \n",
|
312 |
+
"\n",
|
313 |
+
" description \\\n",
|
314 |
+
"0 В американской хоррор-комедии показана детект... \n",
|
315 |
+
"1 Перестройка уже шагнула с кремлевских трибун ... \n",
|
316 |
+
"2 В Городе Стихий обитатели огня, воды, земли и... \n",
|
317 |
+
"3 Свои незабываемые каникулы, в которых есть ме... \n",
|
318 |
+
"4 Увлекательные приключения скандинавского бога... \n",
|
319 |
+
"\n",
|
320 |
+
" link year imdb kp \\\n",
|
321 |
+
"0 https://www.lordfilm.bot/48211-ujensdjej-2022.... 2022.0 8.1 8.0 \n",
|
322 |
+
"1 https://www.lordfilm.bot/50219-slovo-pacana-kr... 2023.0 NaN NaN \n",
|
323 |
+
"2 https://www.lordfilm.bot/48863-jelementarno-20... 2023.0 7.0 7.7 \n",
|
324 |
+
"3 https://www.lordfilm.bot/27172-luka-11-12-2021... 2021.0 7.4 7.8 \n",
|
325 |
+
"4 https://www.lordfilm.bot/27119-loki-g1.html 2021.0 8.2 7.7 \n",
|
326 |
+
"\n",
|
327 |
+
" country age actors \\\n",
|
328 |
+
"0 США 0+ Дженна Ортега, Гвендолин Кристи, Рики Линдхоум... \n",
|
329 |
+
"1 Россия 18+ Иван Янковский, Елизавета Базыкина, Ольга Лапш... \n",
|
330 |
+
"2 США 6+ Леа Льюис, Мамуду Ати, Роналдо Дель Кармен, Ши... \n",
|
331 |
+
"3 США 6+ Джейкоб Тремблей, Джек Дилан Грейзер, Саша Бар... \n",
|
332 |
+
"4 США 0+ Том Хиддлстон, Софи Ди Мартино, Ричард Э. Гран... \n",
|
333 |
+
"\n",
|
334 |
+
" genres \\\n",
|
335 |
+
"0 Сериалы, Фильмы про подростков \n",
|
336 |
+
"1 Сериалы, ru \n",
|
337 |
+
"2 Мультфильмы \n",
|
338 |
+
"3 Мультфильмы \n",
|
339 |
+
"4 Сериалы, Фильмы Marvel \n",
|
340 |
+
"\n",
|
341 |
+
" poster annotation_len \n",
|
342 |
+
"0 https://www.lordfilm.bot/uploads/posts/2022-12... 157 \n",
|
343 |
+
"1 https://www.lordfilm.bot/uploads/posts/2023-11... 150 \n",
|
344 |
+
"2 https://www.lordfilm.bot/uploads/posts/2023-06... 34 \n",
|
345 |
+
"3 https://www.lordfilm.bot/uploads/posts/2021-06... 68 \n",
|
346 |
+
"4 https://www.lordfilm.bot/uploads/posts/2023-10... 162 "
|
347 |
+
]
|
348 |
+
},
|
349 |
+
"execution_count": 50,
|
350 |
+
"metadata": {},
|
351 |
+
"output_type": "execute_result"
|
352 |
+
}
|
353 |
+
],
|
354 |
+
"source": [
|
355 |
+
"data.head()"
|
356 |
+
]
|
357 |
+
},
|
358 |
+
{
|
359 |
+
"cell_type": "code",
|
360 |
+
"execution_count": 3,
|
361 |
+
"metadata": {},
|
362 |
+
"outputs": [
|
363 |
+
{
|
364 |
+
"name": "stdout",
|
365 |
+
"output_type": "stream",
|
366 |
+
"text": [
|
367 |
+
"Recommended Movies:\n",
|
368 |
+
"Интерстеллар\n",
|
369 |
+
"Летящие сквозь ночь\n",
|
370 |
+
"Спящая сквозь время\n",
|
371 |
+
"Любовь сквозь время\n",
|
372 |
+
"Лагерь Холодный Ручей\n",
|
373 |
+
"Парадокс Кловерфилда\n",
|
374 |
+
"Путешествие сквозь ночь\n",
|
375 |
+
"Сквозь огонь\n",
|
376 |
+
"Живое\n",
|
377 |
+
"Моцзинь: Долина червя\n"
|
378 |
+
]
|
379 |
+
}
|
380 |
+
],
|
381 |
+
"source": [
|
382 |
+
"import pandas as pd\n",
|
383 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
384 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
385 |
+
"\n",
|
386 |
+
"\n",
|
387 |
+
"\n",
|
388 |
+
"# Vectorize the movie descriptions using TF-IDF\n",
|
389 |
+
"vectorizer = TfidfVectorizer()\n",
|
390 |
+
"tfidf_matrix = vectorizer.fit_transform(data['description'])\n",
|
391 |
+
"tfidf_matrix2 = vectorizer.transform(data['name'])\n",
|
392 |
+
"\n",
|
393 |
+
"# Function to recommend movies based on user input\n",
|
394 |
+
"def recommend_movies(user_input, df, vectorizer, tfidf_matrix, top_n=10):\n",
|
395 |
+
" # Vectorize the user input\n",
|
396 |
+
" user_tfidf = vectorizer.transform([user_input])\n",
|
397 |
+
"\n",
|
398 |
+
" # Calculate cosine similarity between user input and movie descriptions\n",
|
399 |
+
" similarity_scores_desc = cosine_similarity(user_tfidf, tfidf_matrix)\n",
|
400 |
+
" similarity_scores_name = cosine_similarity(user_tfidf, tfidf_matrix2)\n",
|
401 |
+
" similarity_scores=0.7*similarity_scores_desc+0.3*similarity_scores_name\n",
|
402 |
+
"\n",
|
403 |
+
" # Get indices of top N most similar movies\n",
|
404 |
+
" top_indices = similarity_scores.argsort(axis=1)[:, ::-1][:, :top_n]\n",
|
405 |
+
"\n",
|
406 |
+
" # Get movie names based on indices\n",
|
407 |
+
" recommended_movies = df.iloc[top_indices.ravel()]['name'].values\n",
|
408 |
+
"\n",
|
409 |
+
" return recommended_movies\n",
|
410 |
+
"\n",
|
411 |
+
"# Example usage\n",
|
412 |
+
"user_input = \"коллектив исследователей и учёных отправляется сквозь червоточину\" #input(\"Enter some words to get movie recommendations: \")\n",
|
413 |
+
"recommended_movies = recommend_movies(user_input, data, vectorizer, tfidf_matrix)\n",
|
414 |
+
"print(\"Recommended Movies:\")\n",
|
415 |
+
"for movie in recommended_movies:\n",
|
416 |
+
" print(movie)\n"
|
417 |
+
]
|
418 |
+
},
|
419 |
+
{
|
420 |
+
"cell_type": "code",
|
421 |
+
"execution_count": 56,
|
422 |
+
"metadata": {},
|
423 |
+
"outputs": [],
|
424 |
+
"source": [
|
425 |
+
"import pickle\n",
|
426 |
+
"with open('vectorizer.pkl', 'wb') as f:\n",
|
427 |
+
" pickle.dump(vectorizer, f)"
|
428 |
+
]
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"cell_type": "code",
|
432 |
+
"execution_count": 49,
|
433 |
+
"metadata": {},
|
434 |
+
"outputs": [
|
435 |
+
{
|
436 |
+
"ename": "KeyboardInterrupt",
|
437 |
+
"evalue": "",
|
438 |
+
"output_type": "error",
|
439 |
+
"traceback": [
|
440 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
441 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
442 |
+
"Cell \u001b[0;32mIn[49], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Создание эмбеддингов текстовых аннотаций\u001b[39;00m\n\u001b[1;32m 2\u001b[0m annotations \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mtolist()\n\u001b[0;32m----> 3\u001b[0m annotation_embeddings \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mannotations\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Инициализация поискового индекса\u001b[39;00m\n",
|
443 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py:371\u001b[0m, in \u001b[0;36mSentenceTransformer.encode\u001b[0;34m(self, sentences, prompt_name, prompt, batch_size, show_progress_bar, output_value, precision, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)\u001b[0m\n\u001b[1;32m 368\u001b[0m features\u001b[38;5;241m.\u001b[39mupdate(extra_features)\n\u001b[1;32m 370\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 371\u001b[0m out_features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 372\u001b[0m out_features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence_embedding\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m truncate_embeddings(\n\u001b[1;32m 373\u001b[0m out_features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence_embedding\u001b[39m\u001b[38;5;124m\"\u001b[39m], \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtruncate_dim\n\u001b[1;32m 374\u001b[0m )\n\u001b[1;32m 376\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_value \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_embeddings\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
|
444 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/container.py:217\u001b[0m, in \u001b[0;36mSequential.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m):\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m:\n\u001b[0;32m--> 217\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28minput\u001b[39m\n",
|
445 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
446 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
447 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/sentence_transformers/models/Transformer.py:98\u001b[0m, in \u001b[0;36mTransformer.forward\u001b[0;34m(self, features)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_type_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m features:\n\u001b[1;32m 96\u001b[0m trans_features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_type_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_type_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m---> 98\u001b[0m output_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mauto_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mtrans_features\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 99\u001b[0m output_tokens \u001b[38;5;241m=\u001b[39m output_states[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 101\u001b[0m features\u001b[38;5;241m.\u001b[39mupdate({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_embeddings\u001b[39m\u001b[38;5;124m\"\u001b[39m: output_tokens, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m: features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m]})\n",
|
448 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
449 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
450 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:822\u001b[0m, in \u001b[0;36mDistilBertModel.forward\u001b[0;34m(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attention_mask \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 820\u001b[0m attention_mask \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mones(input_shape, device\u001b[38;5;241m=\u001b[39mdevice) \u001b[38;5;66;03m# (bs, seq_length)\u001b[39;00m\n\u001b[0;32m--> 822\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransformer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 823\u001b[0m \u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43membeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 824\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 825\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 826\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 827\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 828\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 829\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
451 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
452 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
453 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:587\u001b[0m, in \u001b[0;36mTransformer.forward\u001b[0;34m(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 579\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 580\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 581\u001b[0m hidden_state,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 584\u001b[0m output_attentions,\n\u001b[1;32m 585\u001b[0m )\n\u001b[1;32m 586\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 587\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 589\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 590\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 592\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 594\u001b[0m hidden_state \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 596\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
|
454 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
455 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
456 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:531\u001b[0m, in \u001b[0;36mTransformerBlock.forward\u001b[0;34m(self, x, attn_mask, head_mask, output_attentions)\u001b[0m\n\u001b[1;32m 528\u001b[0m sa_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msa_layer_norm(sa_output \u001b[38;5;241m+\u001b[39m x) \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m 530\u001b[0m \u001b[38;5;66;03m# Feed Forward Network\u001b[39;00m\n\u001b[0;32m--> 531\u001b[0m ffn_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mffn\u001b[49m\u001b[43m(\u001b[49m\u001b[43msa_output\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m 532\u001b[0m ffn_output: torch\u001b[38;5;241m.\u001b[39mTensor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_layer_norm(ffn_output \u001b[38;5;241m+\u001b[39m sa_output) \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m 534\u001b[0m output \u001b[38;5;241m=\u001b[39m (ffn_output,)\n",
|
457 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
458 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
459 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:466\u001b[0m, in \u001b[0;36mFFN.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: torch\u001b[38;5;241m.\u001b[39mTensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[0;32m--> 466\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mapply_chunking_to_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mff_chunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk_size_feed_forward\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mseq_len_dim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
460 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/pytorch_utils.py:237\u001b[0m, in \u001b[0;36mapply_chunking_to_forward\u001b[0;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;66;03m# concatenate output at same dimension\u001b[39;00m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mcat(output_chunks, dim\u001b[38;5;241m=\u001b[39mchunk_dim)\n\u001b[0;32m--> 237\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minput_tensors\u001b[49m\u001b[43m)\u001b[49m\n",
|
461 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:471\u001b[0m, in \u001b[0;36mFFN.ff_chunk\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 469\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlin1(\u001b[38;5;28minput\u001b[39m)\n\u001b[1;32m 470\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mactivation(x)\n\u001b[0;32m--> 471\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlin2\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 472\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(x)\n\u001b[1;32m 473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n",
|
462 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
463 |
+
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/linear.py:116\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"# Создание эмбеддингов текстовых аннотаций\n",
|
471 |
+
"annotations = data['description'].tolist()\n",
|
472 |
+
"annotation_embeddings = model.encode(annotations)\n",
|
473 |
+
"\n",
|
474 |
+
"# Инициализация поискового индекса\n"
|
475 |
+
]
|
476 |
+
},
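The KeyboardInterrupt above shows the cell was cut off while encoding the whole annotation list in a single call. A minimal sketch of a more interrupt-tolerant variant, assuming model exposes a SentenceTransformer-style encode as the cell source suggests (the helper name and batch size are illustrative):

import numpy as np

def encode_in_batches(texts, encode_fn, batch_size=256):
    # Encode in chunks so an interrupt loses at most one batch, not the run.
    chunks = [encode_fn(texts[i:i + batch_size])
              for i in range(0, len(texts), batch_size)]
    return np.vstack(chunks)

# annotation_embeddings = encode_in_batches(annotations, model.encode)
# np.save('embeddings_bert.npy', annotation_embeddings)  # checkpoint to disk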
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"id": "MOjSw4ahPMmh"
},
"outputs": [],
"source": [
"index = faiss.IndexFlatL2(annotation_embeddings.shape[1])\n",
"index.add(np.array(annotation_embeddings))"
]
},
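IndexFlatL2 ranks neighbors by squared Euclidean distance. For unit-norm vectors that ordering coincides with cosine similarity, since ||a - b||^2 = 2 - 2*cos(a, b); embed_bert_cls below normalizes the query, so this holds as long as the stored annotation embeddings are normalized too. A quick numeric check of the identity:

import numpy as np

rng = np.random.default_rng(0)
a = rng.standard_normal(312).astype('float32'); a /= np.linalg.norm(a)
b = rng.standard_normal(312).astype('float32'); b /= np.linalg.norm(b)

# Squared L2 distance vs. cosine similarity for unit vectors
assert np.isclose(np.sum((a - b) ** 2), 2 - 2 * float(a @ b), atol=1e-5)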
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import numpy as np\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import torch\n",
"import pandas as pd\n",
"\n",
"\n",
"data = np.load('embeddings_bert.npy')\n",
"movies = pd.read_csv('data.csv')\n",
"\n",
"def get_embeddings():\n",
"    tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
"    model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
"    # model.cuda()\n",
"    return model, tokenizer\n",
"\n",
"def embed_bert_cls(text):\n",
"    model, tokenizer = get_embeddings()\n",
"    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')\n",
"    with torch.no_grad():\n",
"        model_output = model(**{k: v.to(model.device) for k, v in t.items()})\n",
"    embeddings = model_output.last_hidden_state[:, 0, :]\n",
"    embeddings = torch.nn.functional.normalize(embeddings)\n",
"    return embeddings[0].cpu().numpy()"
]
},
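embed_bert_cls pools the first ([CLS]) token of the last hidden state and L2-normalizes it, one text at a time. A hypothetical batched variant under the same assumptions (rubert-tiny2 loaded via transformers):

import torch

def embed_bert_cls_batch(texts, model, tokenizer):
    # Tokenize the whole list at once; pad/truncate to a common length
    t = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        out = model(**{k: v.to(model.device) for k, v in t.items()})
    emb = out.last_hidden_state[:, 0, :]      # [CLS] vector per text
    return torch.nn.functional.normalize(emb).cpu().numpy()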
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"id": "CMruSPejPUBu"
},
"outputs": [],
"source": [
"def search_books(query, k=5):\n",
"    query_embedding = embed_bert_cls(query)\n",
"\n",
"    # Search the index for the k nearest neighbors\n",
"    D, I = index.search(np.array([query_embedding]), k)\n",
"\n",
"    return D, I\n",
"    # for i, idx in enumerate(I[0]):\n",
"    #     book = data.iloc[idx]\n",
"    #     print(f'Recommended book {i + 1}: {book[\"title\"]} by {book[\"author\"]}')\n",
"    #     print(f'Genre: {book[\"genre\"]}')\n",
"    #     print(f'Book page URL: {book[\"page_url\"]}')\n",
"    #     print(f'Description: {book[\"annotation\"]}')\n",
"    #     print(f'Relevance to the query: {1/(D[0][i]+1):.2f}\\n')"
]
},
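search_books returns the raw FAISS distance and index arrays, and the commented-out block hints at the intended presentation. A sketch of that step as a standalone helper; the column names and the 1/(d+1) distance-to-score mapping are lifted from the commented code and are assumptions about the final schema:

def show_results(D, I, books):
    # books: DataFrame assumed to have 'title' and 'author' columns
    for rank, (dist, idx) in enumerate(zip(D[0], I[0]), start=1):
        book = books.iloc[int(idx)]
        print(f"{rank}. {book['title']} by {book['author']} "
              f"(score {1 / (dist + 1):.2f})")

# D, I = search_books("страшные заклинания", k=3)
# show_results(D, I, movies)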
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(312,)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embed_bert_cls(\"query\").shape"
]
},
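(312,) is rubert-tiny2's hidden size, so queries land in the same 312-dimensional space as the stored annotation embeddings.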
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "krz9AxlqPZBl",
"outputId": "d4428884-f616-4963-b82d-1907c11f2304"
},
"outputs": [
{
"data": {
"text/plain": [
"(array([[0.8288138 , 0.84766877, 0.8478927 ]], dtype=float32),\n",
" array([[ 2967, 1486, 19329]]))"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"страшные заклинания\"  # \"scary spells\"\n",
"search_books(query, k=3)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
},
"nbformat": 4,
"nbformat_minor": 0
}
data/data.csv
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4aaeb836325f8966ed7b0ed5e18ea4a29ec24300ea8afa615f201d571843e358
+size 34361210
data/{embeddings.npy → embeddings_bert.npy}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1a2243510b8892ac2f353478d28fc4d9707f2a3e0aec4fb4c17639f4a861ec1c
+size 35503232
data/tf_idf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7ce22b069723ecfaecb88e16129a29ab56074106d076679703666a8648240dc
+size 5236615
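The pickled TF-IDF vectorizer added here is the lexical counterpart to the BERT embeddings. A minimal sketch of how it might score a query against the book descriptions, assuming the vectorizer was fit on those same descriptions:

import pickle
from sklearn.metrics.pairwise import cosine_similarity

with open('data/tf_idf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

def tfidf_scores(query, descriptions):
    q = vectorizer.transform([query])           # 1 x vocab sparse row
    docs = vectorizer.transform(descriptions)   # n x vocab sparse matrix
    return cosine_similarity(docs, q).reshape(-1)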