Norgan97 committed on
Commit
b8d0a69
·
1 Parent(s): feb85d5

final 1 task

Browse files
Dataset/embeddingsbooks.txt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f8f45afeee67807066143b91d360a9612d22e24bd71857020a3e32c271292d2
3
- size 22172986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53781aa6a45e47ac88bdc1dfa593f09d835d51c3d9eb366bc01ab89dea13559c
3
+ size 27330999
Dataset/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:422b6dabe40fb0b12a2b62cc06bffcf61ca5294e9da279dd6c4abc56c1310881
3
+ size 7260909
Dataset/parcedbooks.csv CHANGED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -4,19 +4,30 @@ import torch
4
  from PIL import Image
5
  from io import BytesIO
6
  import requests
 
7
 
8
 
9
  from transformers import AutoTokenizer, AutoModel
10
  import numpy as np
 
11
 
12
  @st.cache_resource()
13
  def load_model():
14
  model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
15
- return model
 
16
 
17
- model = load_model()
18
- tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
19
 
 
 
 
 
 
 
 
 
 
20
 
21
  def embed_bert_cls(text, model, tokenizer):
22
  t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
@@ -26,38 +37,29 @@ def embed_bert_cls(text, model, tokenizer):
26
  embeddings = torch.nn.functional.normalize(embeddings)
27
  return embeddings[0].cpu().numpy()
28
 
29
- df = pd.read_csv('Dataset/parcedbooks.csv')
30
-
31
-
32
 
33
  text = st.text_input('Введите ваше предпочтение для рекомендации')
34
 
35
  button = st.button('Отправить запрос')
36
- num = st.number_input('Укажите количество книг для рекомендации', step=1, value=None)
37
- with open('Dataset/embeddingsbooks.txt', 'r') as file:
38
- embeddings_list = [list(map(float, line.split())) for line in file.readlines()]
39
 
40
  if text and button:
41
  decode_text = embed_bert_cls(text, model, tokenizer) # Получение вектора для введенного текста
42
- cosine_similarities = []
43
- for annotation in embeddings_list:
44
- similarity = np.dot(decode_text, annotation) / (np.linalg.norm(decode_text) * np.linalg.norm(annotation)) # Расчет косинусного сходства
45
- cosine_similarities.append(similarity)
46
 
47
- if num:
48
-
49
- k = num # Задайте количество выводимых результатов
50
- top_similar_indices = np.argsort(cosine_similarities)[-k:][::-1] # Получение индексов наиболее похожих предложений
51
- top_similar_annotations = [df['annotation'].iloc[i] for i in top_similar_indices] # Получение самих предложений
52
-
53
- top_similar_images = [df['image_url'].iloc[i] for i in top_similar_indices]
54
- images = [Image.open(BytesIO(requests.get(url).content)) for url in top_similar_images]
55
- top_similar_authors = [df['author'].iloc[i] for i in top_similar_indices]
56
- top_similar_title = [df['title'].iloc[i] for i in top_similar_indices]
57
- top_cosine_similarities = [cosine_similarities[i] for i in top_similar_indices]
58
-
59
- # Отображение изображений и названий
60
- for similarity, image, author, annotation, title in zip(top_cosine_similarities, images, top_similar_authors, top_similar_annotations,top_similar_title):
61
  col1, col2 = st.columns([3, 4])
62
  with col1:
63
  st.image(image, width=300)
@@ -67,6 +69,7 @@ if text and button:
67
  st.write(f"***Аннотация:*** {annotation}")
68
  similarity = float(similarity)
69
  st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
 
70
 
71
  st.markdown(
72
  "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
@@ -76,3 +79,4 @@ if text and button:
76
 
77
 
78
 
 
 
4
  from PIL import Image
5
  from io import BytesIO
6
  import requests
7
+ import faiss
8
 
9
 
10
  from transformers import AutoTokenizer, AutoModel
11
  import numpy as np
12
+ st.set_page_config(layout="wide")
13
 
14
  @st.cache_resource()
15
  def load_model():
16
  model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
17
+ tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
18
+ return model , tokenizer
19
 
20
+ model, tokenizer = load_model()
 
21
 
22
+ @st.cache_data()
23
+ def load_data():
24
+ df = pd.read_csv('Dataset/parcedbooks.csv')
25
+ with open('Dataset/embeddingsbooks.txt', 'r') as file:
26
+ embeddings_list = [list(map(float, line.split())) for line in file.readlines()]
27
+ index = faiss.read_index('Dataset/faiss.index')
28
+ return df, embeddings_list, index
29
+
30
+ df, embeddings_list, index = load_data()
31
 
32
  def embed_bert_cls(text, model, tokenizer):
33
  t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
 
37
  embeddings = torch.nn.functional.normalize(embeddings)
38
  return embeddings[0].cpu().numpy()
39
 
 
 
 
40
 
41
  text = st.text_input('Введите ваше предпочтение для рекомендации')
42
 
43
  button = st.button('Отправить запрос')
44
+ num = st.number_input('Укажите количество книг для рекомендации', step=1, value=1)
45
+
 
46
 
47
  if text and button:
48
  decode_text = embed_bert_cls(text, model, tokenizer) # Получение вектора для введенного текста
49
+ k = num
50
+ D, I = index.search(decode_text.reshape(1, -1), k)
 
 
51
 
52
+ top_similar_indices = I[0]
53
+ top_similar_annotations = [df['annotation'].iloc[i] for i in top_similar_indices]
54
+ top_similar_images = [df['image_url'].iloc[i] for i in top_similar_indices]
55
+ images = [Image.open(BytesIO(requests.get(url).content)) for url in top_similar_images]
56
+ top_similar_authors = [df['author'].iloc[i] for i in top_similar_indices]
57
+ top_similar_title = [df['title'].iloc[i] for i in top_similar_indices]
58
+ top_similar_url = [df['page_url'].iloc[i] for i in top_similar_indices]
59
+ top_cosine_similarities = [1 - d / 2 for d in D[0]] # Преобразование расстояний в косинусное сходство
60
+
61
+ # Отображение изображений и названий
62
+ for similarity, image, author, annotation, title, url in zip(top_cosine_similarities, images, top_similar_authors, top_similar_annotations, top_similar_title, top_similar_url):
 
 
 
63
  col1, col2 = st.columns([3, 4])
64
  with col1:
65
  st.image(image, width=300)
 
69
  st.write(f"***Аннотация:*** {annotation}")
70
  similarity = float(similarity)
71
  st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
72
+ st.write(f"***Ссылка на книгу : {url}***")
73
 
74
  st.markdown(
75
  "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
 
79
 
80
 
81
 
82
+
parcing_faiss.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -7,6 +7,7 @@ cachetools==5.3.2
7
  certifi==2023.7.22
8
  charset-normalizer==3.3.2
9
  click==8.1.7
 
10
  filelock==3.13.1
11
  fsspec==2023.10.0
12
  gitdb==4.0.11
 
7
  certifi==2023.7.22
8
  charset-normalizer==3.3.2
9
  click==8.1.7
10
+ faiss-cpu==1.7.2
11
  filelock==3.13.1
12
  fsspec==2023.10.0
13
  gitdb==4.0.11