Add dutch partisan news dataset
Files changed:
- .gitattributes +15 -0
- app.py +5 -4
- dutch-article-idx.pkl +3 -0
- dutch-article-idx_adapted.pkl +3 -0
- dutch-article-retriever/1_Pooling/config.json +7 -0
- dutch-article-retriever/README.md +3 -0
- dutch-article-retriever/config.json +3 -0
- dutch-article-retriever/config_sentence_transformers.json +3 -0
- dutch-article-retriever/modules.json +3 -0
- dutch-article-retriever/pytorch_model.bin +3 -0
- dutch-article-retriever/sentence_bert_config.json +3 -0
- dutch-article-retriever/sentencepiece.bpe.model +3 -0
- dutch-article-retriever/special_tokens_map.json +3 -0
- dutch-article-retriever/tokenizer.json +3 -0
- dutch-article-retriever/tokenizer_config.json +3 -0
- retriever.py +65 -47
.gitattributes
CHANGED
@@ -33,3 +33,18 @@ adapted-retriever/sentence_bert_config.json filter=lfs diff=lfs merge=lfs -text
 adapted-retriever/special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
 adapted-retriever/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 adapted-retriever/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
+documentstore_german-election-idx_adapted.pkl filter=lfs diff=lfs merge=lfs -text
+dutch-article-idx_adapted.pkl filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever filter=lfs diff=lfs merge=lfs -text
+dutch-article-idx.pkl filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/1_Pooling filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/README.md filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/config.json filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/config_sentence_transformers.json filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/modules.json filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/sentence_bert_config.json filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+dutch-article-retriever/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -3,7 +3,8 @@
 Here's our first attempt at using data to create a table:
 """
 import streamlit as st
-from retriever import do_search
+from retriever import do_search, dutch_datset_name, german_datset_name
+
 
 def local_css(file_name):
     with open(file_name) as f:
@@ -16,7 +17,7 @@ def render_retrieved_content(content, score):
     if score is not None:
         score = round(score, 3)
         print_score = f'<b> Similarity Score: {score}</b>'
-    return f'<blockquote>{content} </blockquote> {print_score}'
+    return f'<blockquote> {content} </blockquote> {print_score}'
 
 local_css('style.css')
 st.header('🧐 Where my docs at?')
@@ -31,12 +32,12 @@ st.markdown('✨ Imagine you have a bunch of text documents and looking for one
 with st.form('search-input'):
     option = st.selectbox(
         'Choose a dataset',
-        (
+        (german_datset_name, dutch_datset_name))
     search = st.text_input('Enter your search query')
     button = st.form_submit_button('Search')
 
 if search:
-    result = do_search(search)
+    result = do_search(search, option)
 
 st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
 st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
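The hunks above stop before the part of app.py that displays the results, so the following is only a hypothetical sketch of how the tuple returned by do_search(search, option) could be rendered with the render_retrieved_content helper from the diff; it is illustrative and not part of this commit.

# Illustrative sketch, meant to sit inside the `if search:` block of app.py.
# Haystack Document objects expose .content and .score.
sparse, dense_base, dense_adapted = do_search(search, option)
for title, doc in [('TF-IDF', sparse),
                   ('Base dense retriever', dense_base),
                   ('Domain-adapted retriever', dense_adapted)]:
    st.markdown(f'#### {title}')
    st.markdown(render_retrieved_content(doc.content, doc.score),
                unsafe_allow_html=True)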
dutch-article-idx.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b1edcffb6ca9c5409af117770d97415a119bcb02fc5c3ac338f82dadacdb51
+size 24987947

dutch-article-idx_adapted.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed14bfd16fa49000673d7964bf90f3da854b3a17209554bc4ec6d1664f59858d
+size 25239050
dutch-article-retriever/1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "word_embedding_dimension": 768,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}
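For context, pooling_mode_mean_tokens=true with word_embedding_dimension=768 means the retriever builds each sentence embedding as the attention-masked mean of the token embeddings. A minimal sketch of that pooling step follows; it only illustrates what the flags select and is not the sentence-transformers implementation itself.

import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # pooling_mode_mean_tokens=true: average the token embeddings,
    # ignoring padding positions indicated by the attention mask.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts  # shape (batch_size, 768) for this model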
dutch-article-retriever/README.md
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:112c56ba0758ca51e45cda7f0d505af643c740abd0af7f740ec411d30708a96d
+size 3696

dutch-article-retriever/config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29703b29b31e2dabfcd73e52ba0856489249af29f2c8fc5209415fccadfac0d3
+size 821

dutch-article-retriever/config_sentence_transformers.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8c64b5cece00d8424b4896ea75b512b6008576088497609dfeb6bd63e6d36b8
+size 122

dutch-article-retriever/modules.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f4b264b80206c830bebbdcae377e137925650a433b689343a63bdc9b3145460
+size 229

dutch-article-retriever/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b906450207e003aaf2f08d775fedfb16b8438206899eb12a93f92059069ad8a
+size 1112244081

dutch-article-retriever/sentence_bert_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec8e29d6dcb61b611b7d3fdd2982c4524e6ad985959fa7194eacfb655a8d0d51
+size 53

dutch-article-retriever/sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051

dutch-article-retriever/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:378eb3bf733eb16e65792d7e3fda5b8a4631387ca04d2015199c4d4f22ae554d
+size 239

dutch-article-retriever/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46afe88da5fd71bdbab5cfab5e84c1adce59c246ea5f9341bbecef061891d0a7
+size 17082913

dutch-article-retriever/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c84cba673d65cd6fabcaf0340ae8e57b34306e01862132f4b476936917727dea
+size 483
retriever.py
CHANGED
@@ -1,11 +1,12 @@
 from haystack.document_stores import InMemoryDocumentStore
 
 from haystack.nodes.retriever import TfidfRetriever
 from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
 from haystack.nodes.retriever import EmbeddingRetriever
-from haystack.nodes import FARMReader
 import pickle
 from pprint import pprint
+dutch_datset_name = 'Partisan news 2019 (dutch)'
+german_datset_name = 'CDU election program 2021'
 
 class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
     """
@@ -22,54 +23,71 @@ class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
         self.indexes = pickle.load(f)
 
 
-[… removed lines not preserved in this view …]
+class SearchEngine():
+
+    def __init__(self, document_store_name_base, document_store_name_adpated,
+                 adapted_retriever_path):
+        self.document_store = ExportableInMemoryDocumentStore(similarity='cosine')
+        self.document_store.load_data(document_store_name_base)
+
+        self.document_store_adapted = ExportableInMemoryDocumentStore(similarity='cosine')
+        self.document_store_adapted.load_data(document_store_name_adpated)
+
+        self.retriever = TfidfRetriever(document_store=self.document_store)
+
+        self.base_dense_retriever = EmbeddingRetriever(
+            document_store=self.document_store,
+            embedding_model='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
+            model_format='sentence_transformers'
+        )
+
+        self.fine_tuned_retriever = EmbeddingRetriever(
+            document_store=self.document_store_adapted,
+            embedding_model=adapted_retriever_path,
+            model_format='sentence_transformers'
+        )
+
+    def sparse_retrieval(self, query):
+        """Sparse retrieval pipeline"""
+        scores = self.retriever._calc_scores(query)
+        p_retrieval = DocumentSearchPipeline(self.retriever)
+        documents = p_retrieval.run(query=query)
+        documents['documents'][0].score = list(scores[0].values())[0]
+        return documents
+
+    def dense_retrieval(self, query, retriever='base'):
+        if retriever == 'base':
+            p_retrieval = DocumentSearchPipeline(self.base_dense_retriever)
+            return p_retrieval.run(query=query)
+        if retriever == 'adapted':
+            p_retrieval = DocumentSearchPipeline(self.fine_tuned_retriever)
+            return p_retrieval.run(query=query)
+
+    def do_search(self, query):
+        sparse_result = self.sparse_retrieval(query)['documents'][0]
+        dense_base_result = self.dense_retrieval(query, 'base')['documents'][0]
+        dense_adapted_result = self.dense_retrieval(query, 'adapted')['documents'][0]
+        return sparse_result, dense_base_result, dense_adapted_result
+
+
+dutch_search_engine = SearchEngine('dutch-article-idx.pkl', 'dutch-article-idx_adapted.pkl',
+                                   'dutch-article-retriever')
+german_search_engine = SearchEngine('documentstore_german-election-idx.pkl',
+                                    'documentstore_german-election-idx_adapted.pkl',
+                                    'adapted-retriever')
+
+def do_search(query, dataset):
+    if dataset == german_datset_name:
+        return german_search_engine.do_search(query)
     else:
-        return
-
-        return p_retrieval.run(query=query)
-
-
-def do_search(query):
-    sparse_result = sparse_retrieval(query)['documents'][0]
-    dense_base_result =dense_retrieval(query, retriever='base')['documents'][0]
-    dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents'][0]
-    return sparse_result, dense_base_result, dense_adapted_result
+        return dutch_search_engine.do_search(query)
 
 if __name__ == '__main__':
-
-
+    search_engine = SearchEngine('dutch-article-idx.pkl', 'dutch-article-idx_adapted.pkl',
+                                 'dutch-article-retriever')
+    query = 'Kindergarten'
+
+    result = search_engine.do_search(query)
     pprint(result)
 
 
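With this change, app.py picks a dataset by its display name and the module-level do_search(query, dataset) dispatches to the matching SearchEngine. A minimal usage sketch based on the code added above (the queries are made up; note that importing retriever instantiates both SearchEngine objects, so the TF-IDF and embedding models are loaded at import time):

from retriever import do_search, dutch_datset_name, german_datset_name

# do_search returns (tfidf_result, base_dense_result, adapted_dense_result),
# each a haystack Document with .content and .score.
sparse, dense_base, dense_adapted = do_search('verkiezingen', dutch_datset_name)
print(sparse.score, sparse.content[:120])

sparse, dense_base, dense_adapted = do_search('Klimaschutz', german_datset_name)
print(dense_adapted.score, dense_adapted.content[:120])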