amoldwalunj committed on
Commit
fd7fdc2
1 Parent(s): 4d85c55

Create app.py

Files changed (1)
app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
+ import streamlit as st
+ import pandas as pd
+ import json
+ import numpy as np
+ import faiss
+ from sentence_transformers import SentenceTransformer
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+
+ def process_string(s):
+     # Normalize ingredient text before embedding: lowercase and expand '&'.
+     return s.lower().replace('&', 'and')
+
+ #@st.cache
+ @st.cache_resource  # cache_resource keeps one model instance instead of pickling it like cache_data
+ def load_model():
+     return SentenceTransformer(r"finetiuned_model")
+
+ def process_embedding(ingredient, model):
+     processed_ingredient = process_string(ingredient)
+     return model.encode([processed_ingredient]).tolist()
+
+ def faiss_query(xq, index, top_k=1):
+     distances, indices = index.search(np.array(xq).astype('float32'), top_k)
+     return distances[0], indices[0]
+
+ def get_top_matches(ingredients_flat, ingredients, loaded_model, index):
+     matches = []
+     scores = []
+
+     # Generate embeddings in parallel
+     with ThreadPoolExecutor() as executor:
+         embeddings = list(executor.map(lambda ing: process_embedding(ing, loaded_model), ingredients))
+
+     # Query Faiss in parallel
+     with ThreadPoolExecutor() as executor:
+         results = list(executor.map(lambda xq: faiss_query(xq, index), embeddings))
+
+     # Extract matches and scores
+     for distances, indices in results:
+         if indices.size > 0:
+             match = ingredients_flat[indices[0]]
+             matches.append(match)
+             # Map L2 distance to a similarity score (equals cosine similarity when the indexed embeddings are normalized).
+             scores.append(round(1 - distances[0] / 2, 2))
+
+     return matches, scores
+
+ # Load the Faiss index from disk
+ index = faiss.read_index('faiss_index.bin')
+
+ # Load the metadata from the JSON file
+ with open('metadata_faiss.json', 'r') as f:
+     metadata = json.load(f)
+
+ ingredients_flat = [item["Ingredient"] for item in metadata]
+ loaded_model = load_model()
+
+ def main():
+     #st.set_page_config(page_title="Ingredients Matching App", page_icon=":smiley:", layout="wide")
+     st.title("Ingredient Name Matching App :smiley:")
+
+     st.header("Matches using embeddings (semantic search)")
+     st.write("Enter the JSON input:")
+     json_input = st.text_area("JSON input", label_visibility="collapsed")
+
+     if st.button("Process"):
+         start_time = time.time()
+         with st.spinner("Processing..."):
+             try:
+                 input_data = json.loads(json_input)
+
+                 # Match each menu item's ingredient list against the index.
+                 for menu_item in input_data:
+                     ing_list = menu_item.get("ingredients", [])
+                     matches, scores = get_top_matches(ingredients_flat, ing_list, loaded_model, index)
+                     menu_item["Ingredients_matched"] = matches
+                     menu_item["scores"] = scores
+
+                 #st.write("Processed JSON:")
+                 #st.write("<pre>" + json.dumps(input_data, indent=4) + "</pre>", unsafe_allow_html=True)
+                 output_df = pd.DataFrame(input_data)
+                 st.write("Processed Data:")
+                 st.write(output_df)
+
+             except json.JSONDecodeError:
+                 st.error("Invalid JSON input. Please check and try again.")
+
+         end_time = time.time()
+         st.write(f"Processing time: {end_time - start_time:.2f} seconds")
+
+ if __name__ == "__main__":
+     main()
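
For reference, the app expects the pasted JSON to be a list of menu items, each carrying an "ingredients" list; any other fields pass straight through to the output table. A hypothetical example payload (the "menu_item" key and the ingredient names are illustrative, not taken from the commit):

import json

# Hypothetical input for the app's text area; only the "ingredients" key is read,
# everything else is carried through to the output DataFrame unchanged.
example_input = [
    {"menu_item": "Margherita Pizza", "ingredients": ["Mozzarella & Basil", "Tomato Sauce", "Olive Oil"]},
    {"menu_item": "Caesar Salad", "ingredients": ["Romaine Lettuce", "Parmesan", "Croutons"]},
]
print(json.dumps(example_input, indent=2))  # paste the printed JSON into the text area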
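
The app reads faiss_index.bin and metadata_faiss.json from disk, but the commit does not include the script that builds them. A minimal sketch of how they could be produced, assuming an IndexFlatL2 index over L2-normalized embeddings (which is what makes the app's 1 - distance / 2 score behave like cosine similarity) and a hypothetical ingredients_master.csv listing the canonical ingredient names:

import json
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

def process_string(s):
    # Same normalization the app applies at query time.
    return s.lower().replace('&', 'and')

model = SentenceTransformer("finetiuned_model")  # same checkpoint the app loads
df = pd.read_csv("ingredients_master.csv")       # hypothetical source of canonical ingredient names
ingredients = df["Ingredient"].dropna().unique().tolist()

# Encode with L2 normalization so squared L2 distance maps onto cosine similarity.
embeddings = model.encode(
    [process_string(i) for i in ingredients],
    normalize_embeddings=True,
).astype("float32")

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, "faiss_index.bin")

# Metadata keyed the way the app reads it: item["Ingredient"].
with open("metadata_faiss.json", "w") as f:
    json.dump([{"Ingredient": name} for name in ingredients], f)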