Spaces:
Runtime error
Runtime error
Fixing typos
Browse files
- app.py +25 -18
- documentstore_german-election-idx_adapted.pkl +3 -0
- retriever.py +12 -6
app.py
CHANGED
@@ -11,7 +11,8 @@ def local_css(file_name):
|
|
11 |
|
12 |
|
13 |
def render_retrieved_content(content, score):
|
14 |
-
|
|
|
15 |
if score is not None:
|
16 |
score = round(score, 3)
|
17 |
print_score = f'<b> Similarity Score: {score}</b>'
|
@@ -24,40 +25,46 @@ st.markdown('✨ Imagine you have a bunch of text documents and looking for one
|
|
24 |
'💡 This demo compares different search approaches that can help you to find the right '
|
25 |
'information.', unsafe_allow_html=True)
|
26 |
|
27 |
-
option = st.selectbox(
|
28 |
-
'Choose a dataset',
|
29 |
-
('CDU election program 2021', 'Partisan news 2019 (dutch)'))
|
30 |
|
31 |
|
32 |
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
if search:
|
35 |
result = do_search(search)
|
36 |
|
37 |
st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
|
38 |
st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
|
39 |
'in your collection. Only documents will be found that contain one of the words of '
|
40 |
-
'the given search query. You still have to remember
|
41 |
'searched phrase.')
|
42 |
-
st.markdown(render_retrieved_content(result[0][0].
|
43 |
unsafe_allow_html=True)
|
44 |
|
45 |
-
st.markdown('### 🧠 Semantic
|
46 |
-
st.markdown('An alternative approach is semantic search. Instead of using words of the
|
47 |
-
'documents to calculate the score, we use a neural network
|
48 |
-
'
|
49 |
-
'the
|
50 |
-
'terms.'
|
51 |
-
|
|
|
|
|
|
|
52 |
unsafe_allow_html=True)
|
53 |
|
54 |
-
st.markdown('### 🚀 Domain
|
55 |
-
st.markdown('If our document collection contains a lot of domain
|
56 |
'we can not use standard models. These models were trained on a large amount of '
|
57 |
-
'
|
58 |
'improve the search results, we could fine-tune the network to calculate more '
|
59 |
'accurate similarities between queries and document regarding to your domain.')
|
60 |
-
st.markdown(render_retrieved_content(result[2]
|
61 |
unsafe_allow_html=True)
|
62 |
|
63 |
|
|
|
11 |
|
12 |
|
13 |
def render_retrieved_content(content, score):
|
14 |
+
if score is not None and score == 0.0:
|
15 |
+
return f'<blockquote> No result </blockquote>'
|
16 |
if score is not None:
|
17 |
score = round(score, 3)
|
18 |
print_score = f'<b> Similarity Score: {score}</b>'
|
|
|
25 |
'💡 This demo compares different search approaches that can help you to find the right '
|
26 |
'information.', unsafe_allow_html=True)
|
27 |
|
|
|
|
|
|
|
28 |
|
29 |
|
30 |
|
31 |
+
with st.form('search-input'):
|
32 |
+
option = st.selectbox(
|
33 |
+
'Choose a dataset',
|
34 |
+
('CDU election program 2021', 'Partisan news 2019 (dutch)'))
|
35 |
+
search = st.text_input('Enter your search query')
|
36 |
+
button = st.form_submit_button('Search')
|
37 |
+
|
38 |
if search:
|
39 |
result = do_search(search)
|
40 |
|
41 |
st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
|
42 |
st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
|
43 |
'in your collection. Only documents will be found that contain one of the words of '
|
44 |
+
'the given search query. You still have to remember exact terms that are in the '
|
45 |
'searched phrase.')
|
46 |
+
st.markdown(render_retrieved_content(result[0].content, result[0].score),
|
47 |
unsafe_allow_html=True)
|
48 |
|
49 |
+
st.markdown('### 🧠 Semantic Search')
|
50 |
+
st.markdown('An alternative approach is semantic search. Instead of using words of the '
|
51 |
+
'documents to calculate the score, we use a neural network which calculates '
|
52 |
+
'sentence embeddings. Sentences and documents that are similar will be close to '
|
53 |
+
'each other in the embedding space. We use this behavior to find topic related '
|
54 |
+
'documents without knowing the exact terms. If you want to learn more about this '
|
55 |
+
'topic check out one of our recent <a '
|
56 |
+
'href="https://blog.ml6.eu/decoding-sentence-encoders-37e63244ae00?source=collection_detail----1e091bbd5262-----2-----------------------">blogposts</a>.',
|
57 |
+
unsafe_allow_html=True)
|
58 |
+
st.markdown(render_retrieved_content(result[1].content, result[1].score),
|
59 |
unsafe_allow_html=True)
|
60 |
|
61 |
+
st.markdown('### 🚀 Domain Adapted Semantic Search')
|
62 |
+
st.markdown('If our document collection contains a lot of domain-specific documents, '
|
63 |
'we can not use standard models. These models were trained on a large amount of '
|
64 |
+
'publicly available data, which probably does not cover your domain-specific words. To '
|
65 |
'improve the search results, we could fine-tune the network to calculate more '
|
66 |
'accurate similarities between queries and document regarding to your domain.')
|
67 |
+
st.markdown(render_retrieved_content(result[2].content, result[2].score),
|
68 |
unsafe_allow_html=True)
|
69 |
|
70 |
|
documentstore_german-election-idx_adapted.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:240da0dc8d623928b064900b3c1525e785aefb5cb07a471171d1af2aae0704c8
|
3 |
+
size 4874683
|
retriever.py
CHANGED
@@ -26,6 +26,9 @@ class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
|
|
26 |
document_store = ExportableInMemoryDocumentStore(similarity='cosine')
|
27 |
document_store.load_data('documentstore_german-election-idx.pkl')
|
28 |
|
|
|
|
|
|
|
29 |
retriever = TfidfRetriever(document_store=document_store)
|
30 |
base_dense_retriever = EmbeddingRetriever(
|
31 |
document_store=document_store,
|
@@ -34,15 +37,18 @@ base_dense_retriever = EmbeddingRetriever(
|
|
34 |
)
|
35 |
|
36 |
fine_tuned_retriever = EmbeddingRetriever(
|
37 |
-
document_store=
|
38 |
embedding_model='./adapted-retriever',
|
39 |
model_format='sentence_transformers'
|
40 |
)
|
41 |
|
42 |
def sparse_retrieval(query):
|
43 |
"""Sparse retrieval pipeline"""
|
|
|
44 |
p_retrieval = DocumentSearchPipeline(retriever)
|
45 |
-
|
|
|
|
|
46 |
|
47 |
def dense_retrieval(query, retriever='base'):
|
48 |
if retriever == 'base':
|
@@ -56,13 +62,13 @@ def dense_retrieval(query, retriever='base'):
|
|
56 |
|
57 |
|
58 |
def do_search(query):
|
59 |
-
sparse_result = sparse_retrieval(query)['documents']
|
60 |
-
dense_base_result =dense_retrieval(query, retriever='base')['documents']
|
61 |
-
dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents']
|
62 |
return sparse_result, dense_base_result, dense_adapted_result
|
63 |
|
64 |
if __name__ == '__main__':
|
65 |
-
query = '
|
66 |
result = do_search(query)
|
67 |
pprint(result)
|
68 |
|
|
|
26 |
document_store = ExportableInMemoryDocumentStore(similarity='cosine')
|
27 |
document_store.load_data('documentstore_german-election-idx.pkl')
|
28 |
|
29 |
+
document_store_adapted = ExportableInMemoryDocumentStore(similarity='cosine')
|
30 |
+
document_store_adapted.load_data('documentstore_german-election-idx.pkl')
|
31 |
+
|
32 |
retriever = TfidfRetriever(document_store=document_store)
|
33 |
base_dense_retriever = EmbeddingRetriever(
|
34 |
document_store=document_store,
|
|
|
37 |
)
|
38 |
|
39 |
fine_tuned_retriever = EmbeddingRetriever(
|
40 |
+
document_store=document_store_adapted,
|
41 |
embedding_model='./adapted-retriever',
|
42 |
model_format='sentence_transformers'
|
43 |
)
|
44 |
|
45 |
def sparse_retrieval(query):
|
46 |
"""Sparse retrieval pipeline"""
|
47 |
+
scores = retriever._calc_scores(query)
|
48 |
p_retrieval = DocumentSearchPipeline(retriever)
|
49 |
+
documents = p_retrieval.run(query=query)
|
50 |
+
documents['documents'][0].score = list(scores[0].values())[0]
|
51 |
+
return documents
|
52 |
|
53 |
def dense_retrieval(query, retriever='base'):
|
54 |
if retriever == 'base':
|
|
|
62 |
|
63 |
|
64 |
def do_search(query):
|
65 |
+
sparse_result = sparse_retrieval(query)['documents'][0]
|
66 |
+
dense_base_result =dense_retrieval(query, retriever='base')['documents'][0]
|
67 |
+
dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents'][0]
|
68 |
return sparse_result, dense_base_result, dense_adapted_result
|
69 |
|
70 |
if __name__ == '__main__':
|
71 |
+
query = 'Frauen'
|
72 |
result = do_search(query)
|
73 |
pprint(result)
|
74 |
|