kevin-pek committed on
Commit
91855c2
1 Parent(s): 108bb17

sbert gradio interface

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. README.md +8 -0
  3. main.py +13 -12
.gitignore CHANGED
@@ -1 +1,3 @@
1
  venv/
 
 
 
1
  venv/
2
+ __pycache__/
3
+
README.md CHANGED
@@ -1,5 +1,13 @@
1
  # Document Semantic Search
2
 
 
 
 
 
 
 
 
 
3
  ## Setup
4
 
5
  [Link to venv docs](https://docs.python.org/3/library/venv.html)
 
1
  # Document Semantic Search
2
 
3
+ ## Run
4
+
5
+ Run the app in reload mode with the command below. In this mode the app reloads automatically whenever the Python script is changed.
6
+
7
+ ```shell
8
+ $ gradio main.py
9
+ ```
10
+
11
  ## Setup
12
 
13
  [Link to venv docs](https://docs.python.org/3/library/venv.html)
main.py CHANGED
@@ -1,6 +1,6 @@
1
- from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever
2
  from haystack.document_stores import InMemoryDocumentStore
3
- from haystack.pipelines import DocumentSearchPipeline
4
  import gradio as gr
5
 
6
  preprocessor = PreProcessor(
@@ -12,9 +12,11 @@ preprocessor = PreProcessor(
12
  split_respect_sentence_boundary=True,
13
  split_overlap=3
14
  )
15
- document_store = InMemoryDocumentStore()
 
16
  retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")
17
- pipeline = DocumentSearchPipeline(retriever)
 
18
 
19
  def print_answers(results):
20
  fields = ["answer", "score"] # "context"
@@ -28,27 +30,26 @@ def print_answers(results):
28
  return filtered_answers
29
 
30
  def write_pdf(pdf_file):
31
- converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
32
  document = converter.convert(file_path=pdf_file.name, meta=None)[0]
33
  preprocessed_docs = preprocessor.process(document)
34
  document_store.write_documents(preprocessed_docs)
 
35
 
36
  def predict(question, pdf_file):
37
- print("Start processing pdf")
38
  write_pdf(pdf_file)
39
- print("Processing done.")
40
  result = pipeline.run(query=question, params={"Retriever": { "top_k": 2 }})
41
  answers = print_answers(result)
42
  return answers
43
 
44
- title = "Search"
45
  interface = gr.Interface(
46
  fn=predict,
47
- inputs=[gr.components.Textbox(lines = 3, label="Ask an open question!"),gr.components.File(file_count="single", type="file", label="Upload a pdf")],
 
 
 
48
  outputs="text",
49
- title=title,
50
- flagging_options=["top", "medium", "bad"],
51
- interpretation="default",
52
  theme="default" # "default", "huggingface", "dark-grass", "peach"
53
  )
54
 
 
1
+ from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever, TransformersReader
2
  from haystack.document_stores import InMemoryDocumentStore
3
+ from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
4
  import gradio as gr
5
 
6
  preprocessor = PreProcessor(
 
12
  split_respect_sentence_boundary=True,
13
  split_overlap=3
14
  )
15
+ document_store = InMemoryDocumentStore(embedding_dim=384)
16
+ reader = TransformersReader("sentence-transformers/all-MiniLM-L6-v2")
17
  retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")
18
+ pipeline = ExtractiveQAPipeline(reader, retriever)
19
+ converter = PDFToTextConverter(remove_numeric_tables=True)
20
 
21
  def print_answers(results):
22
  fields = ["answer", "score"] # "context"
 
30
  return filtered_answers
31
 
32
  def write_pdf(pdf_file):
 
33
  document = converter.convert(file_path=pdf_file.name, meta=None)[0]
34
  preprocessed_docs = preprocessor.process(document)
35
  document_store.write_documents(preprocessed_docs)
36
+ document_store.update_embeddings(retriever)
37
 
38
  def predict(question, pdf_file):
 
39
  write_pdf(pdf_file)
 
40
  result = pipeline.run(query=question, params={"Retriever": { "top_k": 2 }})
41
  answers = print_answers(result)
42
  return answers
43
 
 
44
  interface = gr.Interface(
45
  fn=predict,
46
+ inputs=[
47
+ gr.components.Textbox(lines = 1, label="Enter your search query here..."),
48
+ gr.components.File(file_count="single", type="file", label="Upload a file here.")
49
+ ],
50
  outputs="text",
51
+ title="Search",
52
+ interpretation=None,
 
53
  theme="default" # "default", "huggingface", "dark-grass", "peach"
54
  )
55