leomaurodesenv committed on
Commit
8416f29
·
1 Parent(s): 4ce2e5d

feat(app): Add question answering for basketball, update the requirements

Browse files
Files changed (3) hide show
  1. app.py +35 -11
  2. requirements.txt +2 -0
  3. utils.py +10 -3
app.py CHANGED
@@ -1,22 +1,46 @@
1
  import streamlit as st
2
  from datasets import load_dataset
3
- from haystack import Document
4
  from haystack.components.readers import ExtractiveReader
 
 
 
 
5
 
6
  # Load the dataset
7
- dataset = load_dataset("PedroCJardim/QASports", "basketball", split="validation")
 
 
 
 
 
8
 
9
- # Load the model
 
 
 
 
 
10
  reader = ExtractiveReader(model="laurafcamargos/distilbert-qasports-basket-small")
11
  reader.warm_up()
 
 
 
 
 
12
 
13
- # Running using the Reader
14
- docs = [
15
- Document(content="Paris is the capital of France."),
16
- Document(content="Berlin is the capital of Germany.")
17
- ]
18
 
19
- query = "What is the capital of France?"
20
- answer = reader.run(query="What is the capital of France?", documents=docs, top_k=1)
 
21
 
22
- st.json(answer)
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from datasets import load_dataset
3
+ from haystack import Pipeline
4
  from haystack.components.readers import ExtractiveReader
5
+ from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
6
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
7
+
8
+ from utils import get_unique_docs
9
 
10
  # Load the dataset
11
+ unique_docs = set()
12
+ dataset = load_dataset("PedroCJardim/QASports", "basketball")
13
+ docs_validation = get_unique_docs(dataset["validation"], unique_docs)
14
+ docs_train = get_unique_docs(dataset["train"], unique_docs)
15
+ docs_test = get_unique_docs(dataset["test"], unique_docs)
16
+ docs_all = docs_validation + docs_train + docs_test
17
 
18
+ # Create the Question Answering pipeline
19
+ # Create in memory database
20
+ document_store = InMemoryDocumentStore()
21
+ document_store.write_documents(documents=docs_all)
22
+ # Create the retriever and reader
23
+ retriever = InMemoryBM25Retriever(document_store=document_store)
24
  reader = ExtractiveReader(model="laurafcamargos/distilbert-qasports-basket-small")
25
  reader.warm_up()
26
+ # Create the pipeline
27
+ pipe = Pipeline()
28
+ pipe.add_component(instance=retriever, name="retriever")
29
+ pipe.add_component(instance=reader, name="reader")
30
+ pipe.connect("retriever.documents", "reader.documents")
31
 
32
+ # Streamlit interface
33
+ st.markdown("""This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources.""")
34
+ st.subheader('QASports: Basketball', divider='rainbow')
 
 
35
 
36
+ top_k = 3
37
+ user_query = None
38
+ user_query = st.text_input("Please, make a question about basketball:")
39
 
40
+ if user_query:
41
+ answer = pipe.run(data={
42
+ "retriever": {"query": user_query, "top_k": 10},
43
+ "reader": {"query": user_query, "top_k": top_k},
44
+ })
45
+ # Display only the top k answers
46
+ st.json(answer["reader"]["answers"][0:top_k])
requirements.txt CHANGED
@@ -7,3 +7,5 @@ datasets==2.18.0
7
  haystack-ai==2.0.1
8
  accelerate==0.29.2
9
  sentence-transformers==2.7.0
 
 
 
7
  haystack-ai==2.0.1
8
  accelerate==0.29.2
9
  sentence-transformers==2.7.0
10
+ # Extra
11
+ mmh3==4.1.0
utils.py CHANGED
@@ -3,7 +3,7 @@ import mmh3
3
  from haystack import Document
4
 
5
 
6
- def get_unique_docs(dataset):
7
  '''Get unique documents from dataset
8
 
9
  Args:
@@ -12,11 +12,18 @@ def get_unique_docs(dataset):
12
  Returns:
13
  docs: list of haystack.Document
14
  '''
15
- unique_docs = set()
16
  docs = list()
17
  for doc in dataset:
18
  if doc["context"] is not None and doc["context_id"] not in unique_docs:
19
  unique_docs.add(doc["context_id"])
20
- document = Document(content=doc["context"], meta={'title': doc["context_title"], 'context_id': doc["context_id"]})
 
 
 
 
 
 
 
 
21
  docs.append(document)
22
  return docs
 
3
  from haystack import Document
4
 
5
 
6
+ def get_unique_docs(dataset, unique_docs:set):
7
  '''Get unique documents from dataset
8
 
9
  Args:
 
12
  Returns:
13
  docs: list of haystack.Document
14
  '''
 
15
  docs = list()
16
  for doc in dataset:
17
  if doc["context"] is not None and doc["context_id"] not in unique_docs:
18
  unique_docs.add(doc["context_id"])
19
+ document = Document(
20
+ content=doc["context"],
21
+ meta={
22
+ 'title': doc["context_title"],
23
+ 'context_id': doc["context_id"],
24
+ 'url': doc["url"],
25
+ 'source': 'QASports', 'category': 'basketball'
26
+ }
27
+ )
28
  docs.append(document)
29
  return docs