hatim00101 committed on
Commit
c834b91
1 Parent(s): 9dfe3e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -15
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
 
4
  from sentence_transformers import SentenceTransformer
5
- from sklearn.metrics.pairwise import cosine_similarity
6
 
7
  model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
8
 
@@ -11,19 +11,18 @@ embeddings_ar = np.load("normalized_embeddings_ar.pkl", allow_pickle=True)
11
  df_hotels = pd.read_csv("hotel_dataset_processed.csv")
12
  df_ar = pd.read_csv("df_ar_1.csv")
13
 
14
- def search_in_combined(query_text, model, k=5):
15
- query_embedding = model.encode(query_text, convert_to_tensor=True).cpu().numpy().reshape(1, -1)
16
- similarities_hotels = cosine_similarity(query_embedding, embeddings_hotels).flatten()
17
- similarities_ar = cosine_similarity(query_embedding, embeddings_ar).flatten()
18
- top_indices_hotels = np.argsort(similarities_hotels)[::-1][:k]
19
- top_indices_ar = np.argsort(similarities_ar)[::-1][:k]
20
- top_hotels = df_hotels.iloc[top_indices_hotels].copy()
21
- top_ar = df_ar.iloc[top_indices_ar].copy()
22
- top_hotels["similarity"] = similarities_hotels[top_indices_hotels]
23
- top_ar["similarity"] = similarities_ar[top_indices_ar]
24
- combined_top_results = pd.concat([top_hotels, top_ar], ignore_index=True)
25
- combined_top_results = combined_top_results.sort_values(by="similarity", ascending=False)
26
- return combined_top_results.head(k)
27
 
28
  def format_results(results):
29
  formatted_results = []
@@ -63,7 +62,7 @@ def format_results(results):
63
  return "<br><br>".join(formatted_results)
64
 
65
  def search_interface(query_text):
66
- results = search_in_combined(query_text, model, 7)
67
  return format_results(results)
68
 
69
  iface = gr.Interface(
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
+ import faiss
5
  from sentence_transformers import SentenceTransformer
 
6
 
7
  model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
8
 
 
11
  df_hotels = pd.read_csv("hotel_dataset_processed.csv")
12
  df_ar = pd.read_csv("df_ar_1.csv")
13
 
14
+ embeddings_combined = np.vstack((embeddings_hotels, embeddings_ar))
15
+ df_combined = pd.concat([df_hotels, df_ar], ignore_index=True)
16
+
17
+ dimension = embeddings_combined.shape[1]
18
+ index = faiss.IndexFlatL2(dimension)
19
+ index.add(embeddings_combined)
20
+
21
+ def search_in_faiss(query_text, model, k=5):
22
+ query_embedding = model.encode(query_text).reshape(1, -1).astype("float32")
23
+ _, indices = index.search(query_embedding, k)
24
+ top_results = df_combined.iloc[indices[0]]
25
+ return top_results
 
26
 
27
  def format_results(results):
28
  formatted_results = []
 
62
  return "<br><br>".join(formatted_results)
63
 
64
  def search_interface(query_text):
65
+ results = search_in_faiss(query_text, model, 7)
66
  return format_results(results)
67
 
68
  iface = gr.Interface(