davidberenstein1957 HF staff committed on
Commit
5dc52fb
·
verified ·
1 Parent(s): e6bfd16

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ import duckdb
5
+ from huggingface_hub import get_token
6
+
7
# Loaded once at import time so every search reuses the same encoder.
# Queries must be embedded with the same model that was used for indexing.
model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m-v1.5")


def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint-project/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    """Return the ``k`` dataset rows closest to ``query`` by cosine distance.

    The query is embedded with the module-level ``model`` and compared, via
    DuckDB's ``array_cosine_distance``, against ``embedding_column`` in the
    parquet files of the Hugging Face dataset ``dataset_name``.

    Args:
        query: Free-text search query.
        k: Maximum number of rows to return. Coerced with ``int()`` because
            it may arrive as a float (e.g. from a UI slider).
        dataset_name: ``owner/name`` of the dataset on the Hugging Face Hub.
        embedding_column: Name of the column holding the stored embeddings.

    Returns:
        The result of ``duckdb.sql(...).to_df()`` with the columns ``title``,
        ``author``, ``date``, ``local``, ``tags``, ``URL``, ``chunk`` and
        ``distance``, ordered by ascending distance.

    Raises:
        ValueError: If ``k`` is negative.
    """
    # LIMIT requires a non-negative integer; a raw float such as 5.5 would
    # otherwise be pasted verbatim into the SQL text.
    limit = int(k)
    if limit < 0:
        raise ValueError(f"k must be non-negative, got {k!r}")

    # Use same model as used for indexing
    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    # The column name and dataset path are spliced into the SQL text, so
    # escape them: double quotes for the identifier, single quotes for the
    # string literal. This keeps stray quote characters from altering the query.
    column_sql = '"' + embedding_column.replace('"', '""') + '"'
    dataset_sql = dataset_name.replace("'", "''")

    sql = f"""
        SELECT
            title,
            author,
            date,
            local,
            tags,
            URL,
            chunk,
            array_cosine_distance(
                {column_sql}::float[{embedding_dim}],
                {query_vector.tolist()}::float[{embedding_dim}]
            ) as distance
        FROM 'hf://datasets/{dataset_sql}/**/*.parquet'
        ORDER BY distance
        LIMIT {limit}
    """

    return duckdb.sql(sql).to_df()
38
+
39
# Minimal search UI: a query box and a result-count slider feed
# similarity_search, whose output is rendered as a table.
with gr.Blocks() as demo:
    query = gr.Textbox(label="Query")
    # step=1 keeps the slider on whole numbers, since k ends up as a SQL LIMIT.
    k = gr.Slider(1, 10, value=5, step=1, label="Number of results")
    btn = gr.Button("Search")
    # Headers mirror the columns selected by similarity_search.
    results = gr.Dataframe(
        headers=[
            "title",
            "author",
            "date",
            "local",
            "tags",
            "URL",
            "chunk",
            "distance",
        ]
    )
    btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])


demo.launch()