romnatall commited on
Commit
0514b29
·
1 Parent(s): e633090

прогнозирование по трем моделям

Browse files
app.py CHANGED
@@ -7,17 +7,27 @@ import torch
7
  from transformers import AutoTokenizer, AutoModel
8
  import numpy as np
9
  from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
10
 
11
- movies = pd.read_csv('data/data.csv')
12
 
 
13
  toggle_state = st.sidebar.checkbox("режим разметки")
14
  input_search = st.text_input('Search')
15
 
16
 
17
 
18
- data = np.load('data/embeddings.npy')
19
-
20
 
 
 
 
 
 
 
 
21
 
22
  @st.cache_resource
23
  def get_embeddings():
@@ -36,27 +46,50 @@ def embed_bert_cls(text, ):
36
  embeddings = torch.nn.functional.normalize(embeddings)
37
  return embeddings[0].cpu().numpy()
38
 
 
 
 
 
 
 
 
 
39
 
40
  @st.cache_data
41
  def predict_rating(input_search):
42
 
 
 
 
43
  emb = embed_bert_cls(input_search)
44
  X=np.column_stack((data, np.tile(emb, (data.shape[0], 1))))
45
- st.session_state["X"]=X
46
 
47
- # from catboost import CatBoostRanker
48
- # cb= CatBoostRanker()
49
- # cb.load_model('model.cbm')
50
- # y = cb.predict(X)
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # import pickle
53
- # with open('logreg.pkl', 'rb') as f:
54
- # logreg = pickle.load(f)
55
- # y = logreg.predict(X)
56
 
57
- y= cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)
58
 
59
- return top_indices(y, 10)
 
 
 
 
 
60
 
61
 
62
  def saverank(index, new_X,new_y):
@@ -174,19 +207,13 @@ def getnums(df,size=0,text=''):
174
  return reqs[text]
175
 
176
 
177
- def top_indices(array, n):
178
- # Получаем индексы элементов, отсортированных по убыванию
179
- st.session_state["pred"] = array
180
- sorted_indices = np.argsort(array)[::-1]
181
- # Выбираем первые n индексов
182
- top_n_indices = sorted_indices[:n]
183
- return top_n_indices
184
 
185
 
186
 
187
 
188
- for i in predict_rating(input_search):
189
- display_movie_card(movies, i )
 
190
 
191
 
192
 
 
7
  from transformers import AutoTokenizer, AutoModel
8
  import numpy as np
9
  from sklearn.metrics.pairwise import cosine_similarity
10
+ from sklearn.metrics.pairwise import pairwise_distances
11
+ import faiss
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ import pickle
14
 
 
15
 
16
+ movies = pd.read_csv('data/data.csv')
17
  toggle_state = st.sidebar.checkbox("режим разметки")
18
  input_search = st.text_input('Search')
19
 
20
 
21
 
22
+ data = np.load('data/embeddings_bert.npy')
 
23
 
24
+ def top_indices(array, n,upsc=False):
25
+ # Получаем индексы элементов, отсортированных по убыванию
26
+ st.session_state["pred"] = array
27
+ sorted_indices = np.argsort(array)[::1 if upsc else -1]
28
+ # Выбираем первые n индексов
29
+ top_n_indices = sorted_indices[:n]
30
+ return top_n_indices
31
 
32
  @st.cache_resource
33
  def get_embeddings():
 
46
  embeddings = torch.nn.functional.normalize(embeddings)
47
  return embeddings[0].cpu().numpy()
48
 
49
+ @st.cache_resource
50
+ def getmodels():
51
+
52
+ with open('data/logreg.pkl', 'rb') as f:
53
+ logreg = pickle.load(f)
54
+ with open('data/tf_idf_vectorizer.pkl', 'rb') as f:
55
+ vectorizer = pickle.load(f)
56
+ return logreg, vectorizer
57
 
58
  @st.cache_data
59
  def predict_rating(input_search):
60
 
61
+
62
+ logreg, vectorizer=getmodels()
63
+
64
  emb = embed_bert_cls(input_search)
65
  X=np.column_stack((data, np.tile(emb, (data.shape[0], 1))))
 
66
 
67
+ user_tfidf = vectorizer.transform([input_search])
68
+ tfidf_matrix = vectorizer.transform(movies['description'])
69
+ tfidf_matrix2 = vectorizer.transform(movies['name'])
70
+
71
+ similarity_scores_desc = cosine_similarity(user_tfidf, tfidf_matrix)
72
+ similarity_scores_name = cosine_similarity(user_tfidf, tfidf_matrix2)
73
+
74
+ y_log = logreg.predict(X)
75
+ y_emb = cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)
76
+
77
+
78
+ y=(similarity_scores_desc*0.9+similarity_scores_name*0.035+y_emb*.4+y_log*0.4).reshape(-1)
79
+ st.session_state["pred"]=y
80
+
81
+ return top_indices(y, 10,upsc=False)
82
+
83
+
84
 
 
 
 
 
85
 
 
86
 
87
+
88
+
89
+
90
+
91
+
92
+
93
 
94
 
95
  def saverank(index, new_X,new_y):
 
207
  return reqs[text]
208
 
209
 
 
 
 
 
 
 
 
210
 
211
 
212
 
213
 
214
+ if input_search:
215
+ for i in predict_rating(input_search):
216
+ display_movie_card(movies, i )
217
 
218
 
219
 
data/books_model (2).ipynb ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {
7
+ "id": "IlvYwT4VD8Bd"
8
+ },
9
+ "outputs": [
10
+ {
11
+ "name": "stdout",
12
+ "output_type": "stream",
13
+ "text": [
14
+ "Collecting sentence_transformers\n",
15
+ " Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)\n",
16
+ "Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (4.39.3)\n",
17
+ "Requirement already satisfied: tqdm in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (4.66.2)\n",
18
+ "Requirement already satisfied: torch>=1.11.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (2.2.2)\n",
19
+ "Requirement already satisfied: numpy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (1.26.4)\n",
20
+ "Requirement already satisfied: scikit-learn in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (1.4.1.post1)\n",
21
+ "Requirement already satisfied: scipy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (1.13.0)\n",
22
+ "Requirement already satisfied: huggingface-hub>=0.15.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (0.22.2)\n",
23
+ "Requirement already satisfied: Pillow in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sentence_transformers) (10.3.0)\n",
24
+ "Requirement already satisfied: filelock in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (3.13.3)\n",
25
+ "Requirement already satisfied: fsspec>=2023.5.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2024.3.1)\n",
26
+ "Requirement already satisfied: packaging>=20.9 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (24.0)\n",
27
+ "Requirement already satisfied: pyyaml>=5.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (6.0.1)\n",
28
+ "Requirement already satisfied: requests in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2.31.0)\n",
29
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (4.10.0)\n",
30
+ "Requirement already satisfied: sympy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (1.12)\n",
31
+ "Requirement already satisfied: networkx in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (3.2.1)\n",
32
+ "Requirement already satisfied: jinja2 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (3.1.3)\n",
33
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
34
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
35
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
36
+ "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (8.9.2.26)\n",
37
+ "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.3.1)\n",
38
+ "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (11.0.2.54)\n",
39
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (10.3.2.106)\n",
40
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (11.4.5.107)\n",
41
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.0.106)\n",
42
+ "Requirement already satisfied: nvidia-nccl-cu12==2.19.3 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (2.19.3)\n",
43
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (12.1.105)\n",
44
+ "Requirement already satisfied: nvidia-nvjitlink-cu12 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.11.0->sentence_transformers) (12.4.127)\n",
45
+ "Requirement already satisfied: regex!=2019.12.17 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (2023.12.25)\n",
46
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.15.2)\n",
47
+ "Requirement already satisfied: safetensors>=0.4.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.4.2)\n",
48
+ "Requirement already satisfied: joblib>=1.2.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from scikit-learn->sentence_transformers) (1.3.2)\n",
49
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from scikit-learn->sentence_transformers) (3.4.0)\n",
50
+ "Requirement already satisfied: MarkupSafe>=2.0 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from jinja2->torch>=1.11.0->sentence_transformers) (2.1.5)\n",
51
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.3.2)\n",
52
+ "Requirement already satisfied: idna<4,>=2.5 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.6)\n",
53
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (1.26.18)\n",
54
+ "Requirement already satisfied: certifi>=2017.4.17 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2024.2.2)\n",
55
+ "Requirement already satisfied: mpmath>=0.19 in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from sympy->torch>=1.11.0->sentence_transformers) (1.3.0)\n",
56
+ "Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)\n",
57
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m171.5/171.5 kB\u001b[0m \u001b[31m566.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
58
+ "\u001b[?25hInstalling collected packages: sentence_transformers\n",
59
+ "Successfully installed sentence_transformers-2.7.0\n"
60
+ ]
61
+ }
62
+ ],
63
+ "source": [
64
+ "!pip install sentence_transformers"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 6,
70
+ "metadata": {
71
+ "id": "sdvkA3cwEVoZ"
72
+ },
73
+ "outputs": [
74
+ {
75
+ "name": "stdout",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "Collecting faiss-cpu\n",
79
+ " Downloading faiss_cpu-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n",
80
+ "Requirement already satisfied: numpy in /home/roma/anaconda3/envs/cv/lib/python3.12/site-packages (from faiss-cpu) (1.26.4)\n",
81
+ "Downloading faiss_cpu-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)\n",
82
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.0/27.0 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
83
+ "\u001b[?25hInstalling collected packages: faiss-cpu\n",
84
+ "Successfully installed faiss-cpu-1.8.0\n"
85
+ ]
86
+ }
87
+ ],
88
+ "source": [
89
+ "!pip install faiss-cpu\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 7,
95
+ "metadata": {
96
+ "id": "Hv0VlA_ZAtjH"
97
+ },
98
+ "outputs": [],
99
+ "source": [
100
+ "import pandas as pd\n",
101
+ "from sentence_transformers import SentenceTransformer\n",
102
+ "import faiss\n",
103
+ "import torch\n",
104
+ "import numpy as np"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 2,
110
+ "metadata": {
111
+ "colab": {
112
+ "base_uri": "https://localhost:8080/",
113
+ "height": 701
114
+ },
115
+ "id": "ENfI_teQBvxa",
116
+ "outputId": "a23427df-f5aa-40f0-ac35-2974803c66b4"
117
+ },
118
+ "outputs": [],
119
+ "source": [
120
+ "data = pd.read_csv('data.csv')\n"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 26,
126
+ "metadata": {
127
+ "id": "uLOmQB8VP_rH"
128
+ },
129
+ "outputs": [],
130
+ "source": [
131
+ "data = data.sample(frac=1)"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {
138
+ "id": "6eRXlMBBOyjr"
139
+ },
140
+ "outputs": [],
141
+ "source": [
142
+ "model = SentenceTransformer('distiluse-base-multilingual-cased')"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": 48,
148
+ "metadata": {
149
+ "id": "HkhTwHXtO5xk"
150
+ },
151
+ "outputs": [],
152
+ "source": [
153
+ "data['annotation_len'] = data['description'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)\n",
154
+ "\n",
155
+ "embedings =np.load('embeddings.npy')[data['annotation_len'] > 10]\n",
156
+ "data = data[data['annotation_len'] > 10] # Отсечение слишком коротких аннотаций\n",
157
+ "\n",
158
+ "\n",
159
+ "data.to_csv('data.csv')\n",
160
+ "np.save('embeddings.npy',embedings)\n"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": 45,
166
+ "metadata": {},
167
+ "outputs": [
168
+ {
169
+ "ename": "AttributeError",
170
+ "evalue": "'numpy.ndarray' object has no attribute 'to_csv'",
171
+ "output_type": "error",
172
+ "traceback": [
173
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
174
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
175
+ "Cell \u001b[0;32mIn[45], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_csv\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 2\u001b[0m np\u001b[38;5;241m.\u001b[39msave(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124membeddings.npy\u001b[39m\u001b[38;5;124m'\u001b[39m,embedings)\n",
176
+ "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'to_csv'"
177
+ ]
178
+ }
179
+ ],
180
+ "source": [
181
+ "data.to_csv('data.csv')\n",
182
+ "np.save('embeddings.npy',embedings)"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 50,
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>name</th>\n",
212
+ " <th>description</th>\n",
213
+ " <th>link</th>\n",
214
+ " <th>year</th>\n",
215
+ " <th>imdb</th>\n",
216
+ " <th>kp</th>\n",
217
+ " <th>country</th>\n",
218
+ " <th>age</th>\n",
219
+ " <th>actors</th>\n",
220
+ " <th>genres</th>\n",
221
+ " <th>poster</th>\n",
222
+ " <th>annotation_len</th>\n",
223
+ " </tr>\n",
224
+ " </thead>\n",
225
+ " <tbody>\n",
226
+ " <tr>\n",
227
+ " <th>0</th>\n",
228
+ " <td>Уэнсдэй</td>\n",
229
+ " <td>В американской хоррор-комедии показана детект...</td>\n",
230
+ " <td>https://www.lordfilm.bot/48211-ujensdjej-2022....</td>\n",
231
+ " <td>2022.0</td>\n",
232
+ " <td>8.1</td>\n",
233
+ " <td>8.0</td>\n",
234
+ " <td>США</td>\n",
235
+ " <td>0+</td>\n",
236
+ " <td>Дженна Ортега, Гвендолин Кристи, Рики Линдхоум...</td>\n",
237
+ " <td>Сериалы, Фильмы про подростков</td>\n",
238
+ " <td>https://www.lordfilm.bot/uploads/posts/2022-12...</td>\n",
239
+ " <td>157</td>\n",
240
+ " </tr>\n",
241
+ " <tr>\n",
242
+ " <th>1</th>\n",
243
+ " <td>Слово пацана. Кровь на асфальте</td>\n",
244
+ " <td>Перестройка уже шагнула с кремлевских трибун ...</td>\n",
245
+ " <td>https://www.lordfilm.bot/50219-slovo-pacana-kr...</td>\n",
246
+ " <td>2023.0</td>\n",
247
+ " <td>NaN</td>\n",
248
+ " <td>NaN</td>\n",
249
+ " <td>Россия</td>\n",
250
+ " <td>18+</td>\n",
251
+ " <td>Иван Янковский, Елизавета Базыкина, Ольга Лапш...</td>\n",
252
+ " <td>Сериалы, ru</td>\n",
253
+ " <td>https://www.lordfilm.bot/uploads/posts/2023-11...</td>\n",
254
+ " <td>150</td>\n",
255
+ " </tr>\n",
256
+ " <tr>\n",
257
+ " <th>2</th>\n",
258
+ " <td>Элементарно</td>\n",
259
+ " <td>В Городе Стихий обитатели огня, воды, земли и...</td>\n",
260
+ " <td>https://www.lordfilm.bot/48863-jelementarno-20...</td>\n",
261
+ " <td>2023.0</td>\n",
262
+ " <td>7.0</td>\n",
263
+ " <td>7.7</td>\n",
264
+ " <td>США</td>\n",
265
+ " <td>6+</td>\n",
266
+ " <td>Леа Льюис, Мамуду Ати, Роналдо Дель Кармен, Ши...</td>\n",
267
+ " <td>Мультфильмы</td>\n",
268
+ " <td>https://www.lordfilm.bot/uploads/posts/2023-06...</td>\n",
269
+ " <td>34</td>\n",
270
+ " </tr>\n",
271
+ " <tr>\n",
272
+ " <th>3</th>\n",
273
+ " <td>Лука</td>\n",
274
+ " <td>Свои незабываемые каникулы, в которых есть ме...</td>\n",
275
+ " <td>https://www.lordfilm.bot/27172-luka-11-12-2021...</td>\n",
276
+ " <td>2021.0</td>\n",
277
+ " <td>7.4</td>\n",
278
+ " <td>7.8</td>\n",
279
+ " <td>США</td>\n",
280
+ " <td>6+</td>\n",
281
+ " <td>Джейкоб Тремблей, Джек Дилан Грейзер, Саша Бар...</td>\n",
282
+ " <td>Мультфильмы</td>\n",
283
+ " <td>https://www.lordfilm.bot/uploads/posts/2021-06...</td>\n",
284
+ " <td>68</td>\n",
285
+ " </tr>\n",
286
+ " <tr>\n",
287
+ " <th>4</th>\n",
288
+ " <td>Локи</td>\n",
289
+ " <td>Увлекательные приключения скандинавского бога...</td>\n",
290
+ " <td>https://www.lordfilm.bot/27119-loki-g1.html</td>\n",
291
+ " <td>2021.0</td>\n",
292
+ " <td>8.2</td>\n",
293
+ " <td>7.7</td>\n",
294
+ " <td>США</td>\n",
295
+ " <td>0+</td>\n",
296
+ " <td>Том Хиддлстон, Софи Ди Мартино, Ричард Э. Гран...</td>\n",
297
+ " <td>Сериалы, Фильмы Marvel</td>\n",
298
+ " <td>https://www.lordfilm.bot/uploads/posts/2023-10...</td>\n",
299
+ " <td>162</td>\n",
300
+ " </tr>\n",
301
+ " </tbody>\n",
302
+ "</table>\n",
303
+ "</div>"
304
+ ],
305
+ "text/plain": [
306
+ " name \\\n",
307
+ "0 Уэнсдэй \n",
308
+ "1 Слово пацана. Кровь на асфальте \n",
309
+ "2 Элементарно \n",
310
+ "3 Лука \n",
311
+ "4 Локи \n",
312
+ "\n",
313
+ " description \\\n",
314
+ "0 В американской хоррор-комедии показана детект... \n",
315
+ "1 Перестройка уже шагнула с кремлевских трибун ... \n",
316
+ "2 В Городе Стихий обитатели огня, воды, земли и... \n",
317
+ "3 Свои незабываемые каникулы, в которых есть ме... \n",
318
+ "4 Увлекательные приключения скандинавского бога... \n",
319
+ "\n",
320
+ " link year imdb kp \\\n",
321
+ "0 https://www.lordfilm.bot/48211-ujensdjej-2022.... 2022.0 8.1 8.0 \n",
322
+ "1 https://www.lordfilm.bot/50219-slovo-pacana-kr... 2023.0 NaN NaN \n",
323
+ "2 https://www.lordfilm.bot/48863-jelementarno-20... 2023.0 7.0 7.7 \n",
324
+ "3 https://www.lordfilm.bot/27172-luka-11-12-2021... 2021.0 7.4 7.8 \n",
325
+ "4 https://www.lordfilm.bot/27119-loki-g1.html 2021.0 8.2 7.7 \n",
326
+ "\n",
327
+ " country age actors \\\n",
328
+ "0 США 0+ Дженна Ортега, Гвендолин Кристи, Рики Линдхоум... \n",
329
+ "1 Россия 18+ Иван Янковский, Елизавета Базыкина, Ольга Лапш... \n",
330
+ "2 США 6+ Леа Льюис, Мамуду Ати, Роналдо Дель Кармен, Ши... \n",
331
+ "3 США 6+ Джейкоб Тремблей, Джек Дилан Грейзер, Саша Бар... \n",
332
+ "4 США 0+ Том Хиддлстон, Софи Ди Мартино, Ричард Э. Гран... \n",
333
+ "\n",
334
+ " genres \\\n",
335
+ "0 Сериалы, Фильмы про подростков \n",
336
+ "1 Сериалы, ru \n",
337
+ "2 Мультфильмы \n",
338
+ "3 Мультфильмы \n",
339
+ "4 Сериалы, Фильмы Marvel \n",
340
+ "\n",
341
+ " poster annotation_len \n",
342
+ "0 https://www.lordfilm.bot/uploads/posts/2022-12... 157 \n",
343
+ "1 https://www.lordfilm.bot/uploads/posts/2023-11... 150 \n",
344
+ "2 https://www.lordfilm.bot/uploads/posts/2023-06... 34 \n",
345
+ "3 https://www.lordfilm.bot/uploads/posts/2021-06... 68 \n",
346
+ "4 https://www.lordfilm.bot/uploads/posts/2023-10... 162 "
347
+ ]
348
+ },
349
+ "execution_count": 50,
350
+ "metadata": {},
351
+ "output_type": "execute_result"
352
+ }
353
+ ],
354
+ "source": [
355
+ "data.head()"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": 3,
361
+ "metadata": {},
362
+ "outputs": [
363
+ {
364
+ "name": "stdout",
365
+ "output_type": "stream",
366
+ "text": [
367
+ "Recommended Movies:\n",
368
+ "Интерстеллар\n",
369
+ "Летящие сквозь ночь\n",
370
+ "Спящая сквозь время\n",
371
+ "Любовь сквозь время\n",
372
+ "Лагерь Холодный Ручей\n",
373
+ "Парадокс Кловерфилда\n",
374
+ "Путешествие сквозь ночь\n",
375
+ "Сквозь огонь\n",
376
+ "Живое\n",
377
+ "Моцзинь: Долина червя\n"
378
+ ]
379
+ }
380
+ ],
381
+ "source": [
382
+ "import pandas as pd\n",
383
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
384
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
385
+ "\n",
386
+ "\n",
387
+ "\n",
388
+ "# Vectorize the movie descriptions using TF-IDF\n",
389
+ "vectorizer = TfidfVectorizer()\n",
390
+ "tfidf_matrix = vectorizer.fit_transform(data['description'])\n",
391
+ "tfidf_matrix2 = vectorizer.transform(data['name'])\n",
392
+ "\n",
393
+ "# Function to recommend movies based on user input\n",
394
+ "def recommend_movies(user_input, df, vectorizer, tfidf_matrix, top_n=10):\n",
395
+ " # Vectorize the user input\n",
396
+ " user_tfidf = vectorizer.transform([user_input])\n",
397
+ "\n",
398
+ " # Calculate cosine similarity between user input and movie descriptions\n",
399
+ " similarity_scores_desc = cosine_similarity(user_tfidf, tfidf_matrix)\n",
400
+ " similarity_scores_name = cosine_similarity(user_tfidf, tfidf_matrix2)\n",
401
+ " similarity_scores=0.7*similarity_scores_desc+0.3*similarity_scores_name\n",
402
+ "\n",
403
+ " # Get indices of top N most similar movies\n",
404
+ " top_indices = similarity_scores.argsort(axis=1)[:, ::-1][:, :top_n]\n",
405
+ "\n",
406
+ " # Get movie names based on indices\n",
407
+ " recommended_movies = df.iloc[top_indices.ravel()]['name'].values\n",
408
+ "\n",
409
+ " return recommended_movies\n",
410
+ "\n",
411
+ "# Example usage\n",
412
+ "user_input = \"коллектив исследователей и учёных отправляется сквозь червоточину\" #input(\"Enter some words to get movie recommendations: \")\n",
413
+ "recommended_movies = recommend_movies(user_input, data, vectorizer, tfidf_matrix)\n",
414
+ "print(\"Recommended Movies:\")\n",
415
+ "for movie in recommended_movies:\n",
416
+ " print(movie)\n"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "code",
421
+ "execution_count": 56,
422
+ "metadata": {},
423
+ "outputs": [],
424
+ "source": [
425
+ "import pickle\n",
426
+ "with open('vectorizer.pkl', 'wb') as f:\n",
427
+ " pickle.dump(vectorizer, f)"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 49,
433
+ "metadata": {},
434
+ "outputs": [
435
+ {
436
+ "ename": "KeyboardInterrupt",
437
+ "evalue": "",
438
+ "output_type": "error",
439
+ "traceback": [
440
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
441
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
442
+ "Cell \u001b[0;32mIn[49], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Создание эмбеддингов текстовых аннотаций\u001b[39;00m\n\u001b[1;32m 2\u001b[0m annotations \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mtolist()\n\u001b[0;32m----> 3\u001b[0m annotation_embeddings \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mannotations\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Инициализация поискового индекса\u001b[39;00m\n",
443
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py:371\u001b[0m, in \u001b[0;36mSentenceTransformer.encode\u001b[0;34m(self, sentences, prompt_name, prompt, batch_size, show_progress_bar, output_value, precision, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)\u001b[0m\n\u001b[1;32m 368\u001b[0m features\u001b[38;5;241m.\u001b[39mupdate(extra_features)\n\u001b[1;32m 370\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 371\u001b[0m out_features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 372\u001b[0m out_features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence_embedding\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m truncate_embeddings(\n\u001b[1;32m 373\u001b[0m out_features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence_embedding\u001b[39m\u001b[38;5;124m\"\u001b[39m], \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtruncate_dim\n\u001b[1;32m 374\u001b[0m )\n\u001b[1;32m 376\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_value \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_embeddings\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
444
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/container.py:217\u001b[0m, in \u001b[0;36mSequential.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m):\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m:\n\u001b[0;32m--> 217\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28minput\u001b[39m\n",
445
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
446
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
447
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/sentence_transformers/models/Transformer.py:98\u001b[0m, in \u001b[0;36mTransformer.forward\u001b[0;34m(self, features)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_type_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m features:\n\u001b[1;32m 96\u001b[0m trans_features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_type_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_type_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m---> 98\u001b[0m output_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mauto_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mtrans_features\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 99\u001b[0m output_tokens \u001b[38;5;241m=\u001b[39m output_states[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 101\u001b[0m features\u001b[38;5;241m.\u001b[39mupdate({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken_embeddings\u001b[39m\u001b[38;5;124m\"\u001b[39m: output_tokens, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m: features[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m]})\n",
448
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
449
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
450
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:822\u001b[0m, in \u001b[0;36mDistilBertModel.forward\u001b[0;34m(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attention_mask \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 820\u001b[0m attention_mask \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mones(input_shape, device\u001b[38;5;241m=\u001b[39mdevice) \u001b[38;5;66;03m# (bs, seq_length)\u001b[39;00m\n\u001b[0;32m--> 822\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransformer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 823\u001b[0m \u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43membeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 824\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 825\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 826\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 827\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 828\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 829\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
451
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
452
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
453
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:587\u001b[0m, in \u001b[0;36mTransformer.forward\u001b[0;34m(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 579\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 580\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 581\u001b[0m hidden_state,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 584\u001b[0m output_attentions,\n\u001b[1;32m 585\u001b[0m )\n\u001b[1;32m 586\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 587\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 589\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 590\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 592\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 594\u001b[0m hidden_state \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 596\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
454
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
455
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
456
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:531\u001b[0m, in \u001b[0;36mTransformerBlock.forward\u001b[0;34m(self, x, attn_mask, head_mask, output_attentions)\u001b[0m\n\u001b[1;32m 528\u001b[0m sa_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msa_layer_norm(sa_output \u001b[38;5;241m+\u001b[39m x) \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m 530\u001b[0m \u001b[38;5;66;03m# Feed Forward Network\u001b[39;00m\n\u001b[0;32m--> 531\u001b[0m ffn_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mffn\u001b[49m\u001b[43m(\u001b[49m\u001b[43msa_output\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m 532\u001b[0m ffn_output: torch\u001b[38;5;241m.\u001b[39mTensor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_layer_norm(ffn_output \u001b[38;5;241m+\u001b[39m sa_output) \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m 534\u001b[0m output \u001b[38;5;241m=\u001b[39m (ffn_output,)\n",
457
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
458
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
459
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:466\u001b[0m, in \u001b[0;36mFFN.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: torch\u001b[38;5;241m.\u001b[39mTensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[0;32m--> 466\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mapply_chunking_to_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mff_chunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk_size_feed_forward\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mseq_len_dim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n",
460
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/pytorch_utils.py:237\u001b[0m, in \u001b[0;36mapply_chunking_to_forward\u001b[0;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;66;03m# concatenate output at same dimension\u001b[39;00m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mcat(output_chunks, dim\u001b[38;5;241m=\u001b[39mchunk_dim)\n\u001b[0;32m--> 237\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minput_tensors\u001b[49m\u001b[43m)\u001b[49m\n",
461
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py:471\u001b[0m, in \u001b[0;36mFFN.ff_chunk\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 469\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlin1(\u001b[38;5;28minput\u001b[39m)\n\u001b[1;32m 470\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mactivation(x)\n\u001b[0;32m--> 471\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlin2\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 472\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(x)\n\u001b[1;32m 473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n",
462
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
463
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
464
+ "File \u001b[0;32m~/anaconda3/envs/cv/lib/python3.12/site-packages/torch/nn/modules/linear.py:116\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
465
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
466
+ ]
467
+ }
468
+ ],
469
+ "source": [
470
+ "# Создание эмбеддингов текстовых аннотаций\n",
471
+ "annotations = data['description'].tolist()\n",
472
+ "annotation_embeddings = model.encode(annotations)\n",
473
+ "\n",
474
+ "# Инициализация поискового индекса\n"
475
+ ]
476
+ },
477
+ {
478
+ "cell_type": "code",
479
+ "execution_count": 35,
480
+ "metadata": {
481
+ "id": "MOjSw4ahPMmh"
482
+ },
483
+ "outputs": [],
484
+ "source": [
485
+ "index = faiss.IndexFlatL2(embedings.shape[1])\n",
486
+ "index.add(np.array(embedings))"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 40,
492
+ "metadata": {},
493
+ "outputs": [],
494
+ "source": [
495
+ "\n",
496
+ "from transformers import AutoTokenizer, AutoModel\n",
497
+ "import numpy as np\n",
498
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
499
+ "import torch\n",
500
+ "import pandas as pd\n",
501
+ "\n",
502
+ "\n",
503
+ "data = np.load('embeddings.npy')\n",
504
+ "movies = pd.read_csv('data.csv')\n",
505
+ "\n",
506
+ "def get_embeddings():\n",
507
+ " tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
508
+ " model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
509
+ " # model.cuda() \n",
510
+ " return model, tokenizer\n",
511
+ "\n",
512
+ "def embed_bert_cls(text ):\n",
513
+ " model, tokenizer = get_embeddings()\n",
514
+ " t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')\n",
515
+ " with torch.no_grad():\n",
516
+ " model_output = model(**{k: v.to(model.device) for k, v in t.items()})\n",
517
+ " embeddings = model_output.last_hidden_state[:, 0, :]\n",
518
+ " embeddings = torch.nn.functional.normalize(embeddings)\n",
519
+ " return embeddings[0].cpu().numpy()"
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 43,
525
+ "metadata": {
526
+ "id": "CMruSPejPUBu"
527
+ },
528
+ "outputs": [],
529
+ "source": [
530
+ "def search_books(query, k=5):\n",
531
+ " query_embedding = embed_bert_cls(query)\n",
532
+ "\n",
533
+ " # Поиск ближайших соседей в индексе\n",
534
+ " D, I = index.search(np.array([query_embedding]), k)\n",
535
+ "\n",
536
+ " return D,I\n",
537
+ " # for i, idx in enumerate(I[0]):\n",
538
+ " # book = data.iloc[idx]\n",
539
+ " # print(f'Рекомендуемая книга {i + 1}: {book[\"title\"]} by {book[\"author\"]}')\n",
540
+ " # print(f'Жанр: {book[\"genre\"]}')\n",
541
+ " # print(f'URL страницы книги: {book[\"page_url\"]}')\n",
542
+ " # print(f'Описание: {book[\"annotation\"]}')\n",
543
+ " # print(f'Мера подходящести под запрос: {1/(D[0][i]+1):.2f}\\n')"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": 42,
549
+ "metadata": {},
550
+ "outputs": [
551
+ {
552
+ "data": {
553
+ "text/plain": [
554
+ "(312,)"
555
+ ]
556
+ },
557
+ "execution_count": 42,
558
+ "metadata": {},
559
+ "output_type": "execute_result"
560
+ }
561
+ ],
562
+ "source": [
563
+ "embed_bert_cls(\"query\").shape"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": 44,
569
+ "metadata": {
570
+ "colab": {
571
+ "base_uri": "https://localhost:8080/"
572
+ },
573
+ "id": "krz9AxlqPZBl",
574
+ "outputId": "d4428884-f616-4963-b82d-1907c11f2304"
575
+ },
576
+ "outputs": [
577
+ {
578
+ "data": {
579
+ "text/plain": [
580
+ "(array([[0.8288138 , 0.84766877, 0.8478927 ]], dtype=float32),\n",
581
+ " array([[ 2967, 1486, 19329]]))"
582
+ ]
583
+ },
584
+ "execution_count": 44,
585
+ "metadata": {},
586
+ "output_type": "execute_result"
587
+ }
588
+ ],
589
+ "source": [
590
+ "query = \"страшные заклинания\"\n",
591
+ "search_books(query, k=3)"
592
+ ]
593
+ }
594
+ ],
595
+ "metadata": {
596
+ "accelerator": "GPU",
597
+ "colab": {
598
+ "gpuType": "T4",
599
+ "provenance": []
600
+ },
601
+ "kernelspec": {
602
+ "display_name": "Python 3",
603
+ "name": "python3"
604
+ },
605
+ "language_info": {
606
+ "codemirror_mode": {
607
+ "name": "ipython",
608
+ "version": 3
609
+ },
610
+ "file_extension": ".py",
611
+ "mimetype": "text/x-python",
612
+ "name": "python",
613
+ "nbconvert_exporter": "python",
614
+ "pygments_lexer": "ipython3",
615
+ "version": "3.12.2"
616
+ }
617
+ },
618
+ "nbformat": 4,
619
+ "nbformat_minor": 0
620
+ }
data/data.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3880998a33fa7f246482272f6c0e8270c6d759ee594a94030cf9d722373f604
3
- size 34515511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4aaeb836325f8966ed7b0ed5e18ea4a29ec24300ea8afa615f201d571843e358
3
+ size 34361210
data/{embeddings.npy → embeddings_bert.npy} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f6ebe9af14012e5d2572f995ef84a2f43f07f0235a09e79312ade95b02179d0
3
- size 36520352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a2243510b8892ac2f353478d28fc4d9707f2a3e0aec4fb4c17639f4a861ec1c
3
+ size 35503232
data/tf_idf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7ce22b069723ecfaecb88e16129a29ab56074106d076679703666a8648240dc
3
+ size 5236615