ashwinpatti committed on
Commit
6213b0d
·
1 Parent(s): 38434f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -45
app.py CHANGED
@@ -57,50 +57,55 @@ def search(query,top_k=15):
57
  rankedResult = search_cross_encoder(query,candidates)
58
 
59
  return rankedResult
60
-
61
-
62
  st.title("Semantic Indian Movie Search");
63
- st.markdown(
64
- """
65
- - Search for movie names based on the plot.
66
- - The corpus is made up of Hindi, Telugu, Tamil, Kannada, Bengali, Malayalam, Odiya, Marathi, Punjabi & Gujarathi movies released between 1950 and 2023.
67
- - The app understands the context of the query and returns the results from the datastore.""")
68
-
69
- top_k = st.slider("Number of Top Hits Generated",min_value=1,max_value=100,value=15)
70
-
71
- #st.subheader(f"Search Query: {query}")
72
- search_query = st.text_input("Please enter your search query here",value="",key="text_input")
73
-
74
- st.divider()
75
- st.subheader("Results:")
76
- st.caption(f"Search Query: {search_query}")
77
-
78
- ranked_hits = search(search_query,top_k)
79
- st.markdown("\n-------------------------\n")
80
-
81
-
82
- st.divider()
83
- st.header("App details")
84
-
85
- st.markdown(
86
- """
87
- - The app supports Semantic search which seeks to improve search accuracy by understanding the content of the search query in contrast to traditional search engines which only find documents based on lexical matches.
88
- - The corpus consists of movie plots from Hindi, Telugu, Tamil, Kannada, Bengali, Malayalam, Odiya, Marathi, Punjabi & Gujarathi languages.
89
- - The core idea of the retrieval:
90
- - Use Bi-Encoder (Retrieval) and Cross-encoder (Re-ranker) to retrieve the search results.
91
- - The Bi-encoder is responsible for independently embedding the sentences and search queries into a vector space. The result is then passed to the cross-encoder for checking the relevance/similarity between the query and sentences.
92
- - All plot entries in the corpus is embedded into a vector space. At search time, the query is embedded into the same vector space.
93
- - Corpus embeddings and search query embedding are passed into bi-encoder and it would return the closest embeddings from the corpus.
94
- - Cosine similarity is used to find the similar embeddings.
95
- - The result is then passed to cross-encoder to re-rank the results based on the relevance to the search query.
96
- """
97
- )
98
-
99
- st.image(Image.open('semantic_search.png'), caption='Semantic search using Retrieval and Re-Rank')
100
-
101
- st.markdown(
102
- """
103
- Model Source:
104
- - Bi-Encoder - [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)
105
- - Cross-Encoder - [cross-encoder/ms-marco-MiniLM-L-2-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-2-v2)""")
 
 
 
 
 
 
106
 
 
57
  rankedResult = search_cross_encoder(query,candidates)
58
 
59
  return rankedResult
60
+
 
61
  st.title("Semantic Indian Movie Search");
62
+ appTab, detailsTab = st.tabs(["App", "Details"])
63
+ with appTab:
64
+
65
+ st.markdown(
66
+ """
67
+ - Search for movie names based on the plot.
68
+ - The corpus is made up of Hindi, Telugu, Tamil, Kannada, Bengali, Malayalam, Odiya, Marathi, Punjabi & Gujarathi movies released between 1950 and 2023.
69
+ - The app understands the context of the query and returns the results from the datastore.""")
70
+
71
+ top_k = st.slider("Number of Top Hits Generated",min_value=1,max_value=100,value=15)
72
+
73
+ #st.subheader(f"Search Query: {query}")
74
+ search_query = st.text_input("Please enter your search query here",value="",key="text_input")
75
+
76
+ st.divider()
77
+ st.subheader("Results:")
78
+ st.caption(f"Search Query: {search_query}")
79
+
80
+ ranked_hits = search(search_query,top_k)
81
+ if(len(ranked_hits)>0):
82
+ st.success("Matches found!!")
83
+ st.markdown("\n-------------------------\n")
84
+
85
+ st.divider()
86
+
87
+ with detailsTab:
88
+ st.header("App details")
89
+
90
+ st.markdown(
91
+ """
92
+ - The app supports Semantic search which seeks to improve search accuracy by understanding the content of the search query in contrast to traditional search engines which only find documents based on lexical matches.
93
+ - The corpus consists of movie plots from Hindi, Telugu, Tamil, Kannada, Bengali, Malayalam, Odiya, Marathi, Punjabi & Gujarathi languages.
94
+ - The core idea of the retrieval:
95
+ - Use Bi-Encoder (Retrieval) and Cross-encoder (Re-ranker) to retrieve the search results.
96
+ - The Bi-encoder is responsible for independently embedding the sentences and search queries into a vector space. The result is then passed to the cross-encoder for checking the relevance/similarity between the query and sentences.
97
+ - All plot entries in the corpus is embedded into a vector space. At search time, the query is embedded into the same vector space.
98
+ - Corpus embeddings and search query embedding are passed into bi-encoder and it would return the closest embeddings from the corpus.
99
+ - Cosine similarity is used to find the similar embeddings.
100
+ - The result is then passed to cross-encoder to re-rank the results based on the relevance to the search query.
101
+ """
102
+ )
103
+
104
+ st.image(Image.open('semantic_search.png'), caption='Semantic search using Retrieval and Re-Rank')
105
+
106
+ st.markdown(
107
+ """
108
+ Model Source:
109
+ - Bi-Encoder - [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)
110
+ - Cross-Encoder - [cross-encoder/ms-marco-MiniLM-L-2-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-2-v2)""")
111