davidberenstein1957 HF staff committed on
Commit
e9fd1e4
·
verified ·
1 Parent(s): 90b7917

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -5
app.py CHANGED
@@ -10,36 +10,41 @@ model = SentenceTransformer(modules=[static_embedding])
10
  embedding_dimensions = model.get_sentence_embedding_dimension()
11
  dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
12
  embedding_column = "embeddings"
 
13
  table_name = "fineweb"
14
 
15
  duckdb.sql(query=f"""
16
  INSTALL vss;
17
  LOAD vss;
18
  CREATE TABLE {table_name} AS
19
- SELECT *, {embedding_column}::float[{embedding_dimensions}] as {embedding_column}_float
20
  FROM 'hf://datasets/{dataset_name}/**/*.parquet';
21
- CREATE INDEX my_hnsw_index ON {table_name} USING HNSW (embedding_float) WITH (metric = 'cosine');
22
  """)
23
 
24
  def similarity_search(query: str, k: int = 5):
25
  embedding = model.encode(query).tolist()
26
- return duckdb.sql(
27
  query=f"""
28
- SELECT chunk, url, array_cosine_distance({embedding_column}_float, {embedding}::FLOAT[{embedding_dimensions}]) as distance
29
  FROM {table_name}
30
  ORDER BY distance
31
  LIMIT {k};
32
  """
33
  ).to_df()
 
 
34
 
35
  with gr.Blocks() as demo:
36
  gr.Markdown("""# RAG - retrieve
 
 
37
 
38
  Part of [AI blueprint](https://github.com/huggingface/ai-blueprint) - a blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
39
  query = gr.Textbox(label="Query")
40
  k = gr.Slider(1, 50, value=5, label="Number of results")
41
  btn = gr.Button("Search")
42
- results = gr.Dataframe(headers=["url", "chunk", "distance"])
43
  btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])
44
 
45
 
 
10
  embedding_dimensions = model.get_sentence_embedding_dimension()
11
  dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
12
  embedding_column = "embeddings"
13
+ embedding_column_float = f"{embedding_column}_float"
14
  table_name = "fineweb"
15
 
16
  duckdb.sql(query=f"""
17
  INSTALL vss;
18
  LOAD vss;
19
  CREATE TABLE {table_name} AS
20
+ SELECT *, {embedding_column}::float[{embedding_dimensions}] as {embedding_column_float}
21
  FROM 'hf://datasets/{dataset_name}/**/*.parquet';
22
+ CREATE INDEX my_hnsw_index ON {table_name} USING HNSW ({embedding_column_float}) WITH (metric = 'cosine');
23
  """)
24
 
25
  def similarity_search(query: str, k: int = 5):
26
  embedding = model.encode(query).tolist()
27
+ df = duckdb.sql(
28
  query=f"""
29
+ SELECT *, array_cosine_distance({embedding_column_float}, {embedding}::FLOAT[{embedding_dimensions}]) as distance
30
  FROM {table_name}
31
  ORDER BY distance
32
  LIMIT {k};
33
  """
34
  ).to_df()
35
+ df = df.drop(columns=[embedding_column, embedding_column_float])
36
+ return df
37
 
38
  with gr.Blocks() as demo:
39
  gr.Markdown("""# RAG - retrieve
40
+
41
+ Executes vector search on top of [fineweb-bbc-news-embeddings](https://huggingface.co/datasets/ai-blueprint/fineweb-bbc-news-embeddings) using DuckDB.
42
 
43
  Part of [AI blueprint](https://github.com/huggingface/ai-blueprint) - a blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
44
  query = gr.Textbox(label="Query")
45
  k = gr.Slider(1, 50, value=5, label="Number of results")
46
  btn = gr.Button("Search")
47
+ results = gr.Dataframe(headers=["url", "chunk", "distance"], wrap=True)
48
  btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])
49
 
50