davanstrien HF staff committed on
Commit
f7d37d9
·
1 Parent(s): c035c1f

remove unused file

Browse files
Files changed (1) hide show
  1. ragatouille_search.py +0 -109
ragatouille_search.py DELETED
@@ -1,109 +0,0 @@
1
- from pathlib import Path
2
- from typing import Any, Dict, List
3
-
4
- import gradio as gr
5
- from huggingface_hub import snapshot_download
6
- from ragatouille import RAGPretrainedModel
7
- from toolz import unique
8
-
9
- # Top-level variables
10
- INDEX_PATH = Path(".ragatouille/colbert/indexes/my_index_with_ids_and_metadata/")
11
- REPO_ID = "davanstrien/search-index"
12
-
13
- INITIAL_QUERY = "hello world"
14
- DEFAULT_K = 10
15
-
16
-
17
- def initialize_index():
18
- INDEX_PATH.mkdir(parents=True, exist_ok=True)
19
- snapshot_download(REPO_ID, repo_type="dataset", local_dir=INDEX_PATH)
20
- rag = RAGPretrainedModel.from_index(INDEX_PATH)
21
- # Warm up index
22
- rag.search(INITIAL_QUERY)
23
- return rag
24
-
25
-
26
- def format_results_as_markdown(results: List[Dict[str, Any]]) -> str:
27
- markdown = ""
28
- for result in results:
29
- content = result["content"]
30
- score = result["score"]
31
- rank = result["rank"]
32
- document_id = result["document_id"]
33
- passage_id = result["passage_id"]
34
- link = f"https://huggingface.co/datasets/{document_id}"
35
-
36
- markdown += f"### Result {rank}\n"
37
- markdown += f"**Score:** {score}\n\n"
38
- markdown += f"**Document ID:** [{document_id}]({link})\n\n"
39
- markdown += f"**Passage ID:** {passage_id}\n\n"
40
-
41
- # Limit initial content display to 1000 characters
42
- preview = f"{content[:1000]}..." if len(content) > 1000 else content
43
- markdown += f"{preview}\n\n"
44
-
45
- # Add expandable section for full content if it's longer than 1000 characters
46
- if len(content) > 1000:
47
- markdown += "<details>\n"
48
- markdown += "<summary>Click to expand full content</summary>\n\n"
49
- markdown += f"{content}\n\n"
50
- markdown += "</details>\n\n"
51
-
52
- markdown += "---\n\n"
53
-
54
- return markdown
55
-
56
-
57
- def search_with_ragatouille(query, k=DEFAULT_K, make_unique=False):
58
- results = RAG.search(query, k=k)
59
- if make_unique:
60
- results = make_results_unique(results)
61
- return format_results_as_markdown(results)
62
-
63
-
64
- def make_results_unique(results: List[Dict[str, Any]]):
65
- unique_results = unique(results, lambda x: x["document_id"])
66
- return list(unique_results)
67
-
68
-
69
- def create_ragatouille_interface():
70
- with gr.Blocks() as ragatouille_demo:
71
- gr.Markdown("### RAGatouille Dataset Search")
72
- gr.Markdown(
73
- """This interface allows you to search inside dataset cards on the Hub using the [answerai-colbert-small-v1](https://huggingface.co/answerdotai/answerai-colbert-small-v1) ColBERT model via [RAGatouille](https://github.com/AnswerDotAI/RAGatouille). Please be aware that this is an early prototype and may not work as expected!
74
-
75
- ## Notes:
76
- **Not all datasets are indexed yet!**
77
- For a dataset to be indexed:
78
- - It must have a dataset card on the Hub. You can find documentation on how to write a good dataset card [here](https://huggingface.co/docs/hub/datasets-cards).
79
- - The dataset must have at least 1 like and 1 download
80
- - The card must be a minimum length (to weed out low quality cards)
81
- **At the moment the index is refreshed when I decide to do it, so it may not be up to date.** If there is sufficient interest I will implement a daily refresh (give this repo a like if you'd like this feature!)
82
- Feel free to open a discussion to give feedback or request features &#129303;
83
- """
84
- )
85
- with gr.Column():
86
- query = gr.Textbox(label="Search query", placeholder="medieval handwriting")
87
- with gr.Row():
88
- k = gr.Slider(1, 100, value=DEFAULT_K, step=1, label="Number of Results")
89
- make_unique = gr.Checkbox(False, label="Show each dataset only once?")
90
- search_button = gr.Button("Search")
91
- search_button.click(
92
- search_with_ragatouille,
93
- inputs=[query, k, make_unique],
94
- outputs=gr.Markdown(label="Results"),
95
- )
96
- return ragatouille_demo
97
-
98
-
99
- # Initialize RAG globally
100
- RAG = initialize_index()
101
-
102
-
103
- def main():
104
- demo = create_ragatouille_interface()
105
- demo.launch()
106
-
107
-
108
- if __name__ == "__main__":
109
- main()