mrchtr committed on
Commit
181e8c5
1 Parent(s): 01628bb

Fixing typos

Browse files
app.py CHANGED
@@ -11,7 +11,8 @@ def local_css(file_name):
11
 
12
 
13
  def render_retrieved_content(content, score):
14
- print_score = ''
 
15
  if score is not None:
16
  score = round(score, 3)
17
  print_score = f'<b> Similarity Score: {score}</b>'
@@ -24,40 +25,46 @@ st.markdown('✨ Imagine you have a bunch of text documents and looking for one
24
  '💡 This demo compares different search approaches that can help you to find the right '
25
  'information.', unsafe_allow_html=True)
26
 
27
- option = st.selectbox(
28
- 'Choose a dataset',
29
- ('CDU election program 2021', 'Partisan news 2019 (dutch)'))
30
 
31
 
32
 
33
- search = st.text_input('Enter your search query')
 
 
 
 
 
 
34
  if search:
35
  result = do_search(search)
36
 
37
  st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
38
  st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
39
  'in your collection. Only documents will be found that contain one of the words of '
40
- 'the given search query. You still have to remember on exact terms that are in the'
41
  'searched phrase.')
42
- st.markdown(render_retrieved_content(result[0][0].content, None),
43
  unsafe_allow_html=True)
44
 
45
- st.markdown('### 🧠 Semantic search')
46
- st.markdown('An alternative approach is semantic search. Instead of using words of the '
47
- 'documents to calculate the score, we use a neural network that calculate the '
48
- 'similarity between the query and the documents of the collection. In other words, '
49
- 'the chance is high to find topic related documents without knowing the exact '
50
- 'terms.')
51
- st.markdown(render_retrieved_content(result[1][0].content, result[1][0].score),
 
 
 
52
  unsafe_allow_html=True)
53
 
54
- st.markdown('### 🚀 Domain adapted semantic search')
55
- st.markdown('If our document collection contains a lot of domain specific documents, '
56
  'we can not use standard models. These models were trained on a large amount of '
57
- 'public available data, that covers probably not your domain specific words. To '
58
  'improve the search results, we could fine-tune the network to calculate more '
59
  'accurate similarities between queries and document regarding to your domain.')
60
- st.markdown(render_retrieved_content(result[2][0].content, result[2][0].score),
61
  unsafe_allow_html=True)
62
 
63
 
 
11
 
12
 
13
  def render_retrieved_content(content, score):
14
+ if score is not None and score == 0.0:
15
+ return f'<blockquote> No result </blockquote>'
16
  if score is not None:
17
  score = round(score, 3)
18
  print_score = f'<b> Similarity Score: {score}</b>'
 
25
  '💡 This demo compares different search approaches that can help you to find the right '
26
  'information.', unsafe_allow_html=True)
27
 
 
 
 
28
 
29
 
30
 
31
+ with st.form('search-input'):
32
+ option = st.selectbox(
33
+ 'Choose a dataset',
34
+ ('CDU election program 2021', 'Partisan news 2019 (dutch)'))
35
+ search = st.text_input('Enter your search query')
36
+ button = st.form_submit_button('Search')
37
+
38
  if search:
39
  result = do_search(search)
40
 
41
  st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
42
  st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
43
  'in your collection. Only documents will be found that contain one of the words of '
44
+ 'the given search query. You still have to remember exact terms that are in the '
45
  'searched phrase.')
46
+ st.markdown(render_retrieved_content(result[0].content, result[0].score),
47
  unsafe_allow_html=True)
48
 
49
+ st.markdown('### 🧠 Semantic Search')
50
+ st.markdown('An alternative approach is semantic search. Instead of using words of the'
51
+ 'documents to calculate the score, we use a neural network which calculates '
52
+ 'sentence embeddings. Sentences and documents that are similar will be close to '
53
+ 'each other in the embedding space. We use this behavior to find topic related '
54
+ 'documents without knowing the exact terms. If you want learn more about this '
55
+ 'topic check out one of our recent <a '
56
+ 'href="https://blog.ml6.eu/decoding-sentence-encoders-37e63244ae00?source=collection_detail----1e091bbd5262-----2-----------------------">blogposts</a>.',
57
+ unsafe_allow_html=True)
58
+ st.markdown(render_retrieved_content(result[1].content, result[1].score),
59
  unsafe_allow_html=True)
60
 
61
+ st.markdown('### 🚀 Domain Adapted Semantic Search')
62
+ st.markdown('If our document collection contains a lot of domain-specific documents, '
63
  'we can not use standard models. These models were trained on a large amount of '
64
+ 'publicly available data, which probably not covers your domain-specific words. To '
65
  'improve the search results, we could fine-tune the network to calculate more '
66
  'accurate similarities between queries and document regarding to your domain.')
67
+ st.markdown(render_retrieved_content(result[2].content, result[2].score),
68
  unsafe_allow_html=True)
69
 
70
 
documentstore_german-election-idx_adapted.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:240da0dc8d623928b064900b3c1525e785aefb5cb07a471171d1af2aae0704c8
3
+ size 4874683
retriever.py CHANGED
@@ -26,6 +26,9 @@ class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
26
  document_store = ExportableInMemoryDocumentStore(similarity='cosine')
27
  document_store.load_data('documentstore_german-election-idx.pkl')
28
 
 
 
 
29
  retriever = TfidfRetriever(document_store=document_store)
30
  base_dense_retriever = EmbeddingRetriever(
31
  document_store=document_store,
@@ -34,15 +37,18 @@ base_dense_retriever = EmbeddingRetriever(
34
  )
35
 
36
  fine_tuned_retriever = EmbeddingRetriever(
37
- document_store=document_store,
38
  embedding_model='./adapted-retriever',
39
  model_format='sentence_transformers'
40
  )
41
 
42
  def sparse_retrieval(query):
43
  """Sparse retrieval pipeline"""
 
44
  p_retrieval = DocumentSearchPipeline(retriever)
45
- return p_retrieval.run(query=query)
 
 
46
 
47
  def dense_retrieval(query, retriever='base'):
48
  if retriever == 'base':
@@ -56,13 +62,13 @@ def dense_retrieval(query, retriever='base'):
56
 
57
 
58
  def do_search(query):
59
- sparse_result = sparse_retrieval(query)['documents']
60
- dense_base_result =dense_retrieval(query, retriever='base')['documents']
61
- dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents']
62
  return sparse_result, dense_base_result, dense_adapted_result
63
 
64
  if __name__ == '__main__':
65
- query = 'Klimawandel stoppen?'
66
  result = do_search(query)
67
  pprint(result)
68
 
 
26
  document_store = ExportableInMemoryDocumentStore(similarity='cosine')
27
  document_store.load_data('documentstore_german-election-idx.pkl')
28
 
29
+ document_store_adapted = ExportableInMemoryDocumentStore(similarity='cosine')
30
+ document_store_adapted.load_data('documentstore_german-election-idx.pkl')
31
+
32
  retriever = TfidfRetriever(document_store=document_store)
33
  base_dense_retriever = EmbeddingRetriever(
34
  document_store=document_store,
 
37
  )
38
 
39
  fine_tuned_retriever = EmbeddingRetriever(
40
+ document_store=document_store_adapted,
41
  embedding_model='./adapted-retriever',
42
  model_format='sentence_transformers'
43
  )
44
 
45
  def sparse_retrieval(query):
46
  """Sparse retrieval pipeline"""
47
+ scores = retriever._calc_scores(query)
48
  p_retrieval = DocumentSearchPipeline(retriever)
49
+ documents = p_retrieval.run(query=query)
50
+ documents['documents'][0].score = list(scores[0].values())[0]
51
+ return documents
52
 
53
  def dense_retrieval(query, retriever='base'):
54
  if retriever == 'base':
 
62
 
63
 
64
  def do_search(query):
65
+ sparse_result = sparse_retrieval(query)['documents'][0]
66
+ dense_base_result =dense_retrieval(query, retriever='base')['documents'][0]
67
+ dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents'][0]
68
  return sparse_result, dense_base_result, dense_adapted_result
69
 
70
  if __name__ == '__main__':
71
+ query = 'Frauen'
72
  result = do_search(query)
73
  pprint(result)
74