File size: 2,110 Bytes
05e7b4d
5dc52fb
 
05e7b4d
5dc52fb
 
8636daf
05e7b4d
 
90b7917
b82c93a
 
e9fd1e4
0636ee3
8636daf
0636ee3
8636daf
 
0636ee3
e9fd1e4
8636daf
e9fd1e4
0636ee3
8636daf
 
 
e9fd1e4
8636daf
e9fd1e4
0636ee3
8636daf
 
5dc52fb
8636daf
e9fd1e4
 
5dc52fb
 
581cf0b
e9fd1e4
 
e0e0b77
90b7917
5dc52fb
53ff92d
5dc52fb
e9fd1e4
5dc52fb
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import duckdb
import gradio as gr
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding
from huggingface_hub import get_token


# Query encoder: a static embedding built from the "minishlab/potion-base-8M"
# model2vec model, used as the only module of a SentenceTransformer.
static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
model = SentenceTransformer(modules=[static_embedding])
# Vector width reported by the model; reused below as the fixed array length
# in the SQL casts.
embedding_dimensions = model.get_sentence_embedding_dimension()
# Hugging Face dataset whose parquet files are loaded into DuckDB.
dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
# Source embedding column, and the derived column that holds the same values
# cast to a fixed-size float array.
embedding_column = "embeddings"
embedding_column_float = f"{embedding_column}_float"
# Name of the DuckDB table the dataset is materialised into.
table_name = "fineweb"

# One-time setup, executed when the module is loaded:
#   1. install and load DuckDB's `vss` extension;
#   2. create the table from every parquet file of the dataset (read through
#      the hf:// path), adding the fixed-size float copy of the embeddings;
#   3. build an HNSW index with the cosine metric on that float column.
# NOTE(review): the cast to float[N] is presumably required because the HNSW
# index only accepts fixed-size FLOAT arrays - confirm against the vss docs.
duckdb.sql(query=f"""
    INSTALL vss;
    LOAD vss;
    CREATE TABLE {table_name} AS 
    SELECT *, {embedding_column}::float[{embedding_dimensions}] as {embedding_column_float} 
    FROM 'hf://datasets/{dataset_name}/**/*.parquet';
    CREATE INDEX my_hnsw_index ON {table_name} USING HNSW ({embedding_column_float}) WITH (metric = 'cosine');
""")

def similarity_search(query: str, k: int = 5):
    """Return the rows of the table that are closest to ``query``.

    The query text is encoded with the module-level ``model`` and compared
    against the fixed-size float embedding column using
    ``array_cosine_distance``; rows are ordered by ascending distance.

    Args:
        query: Free text to search for.
        k: Maximum number of rows to return. Coerced to a non-negative
            integer before use, since UI widgets and API callers may pass
            floats (e.g. ``5.0``) or other values.

    Returns:
        A DataFrame with the table's columns plus ``distance``, with both
        embedding columns dropped.

    Raises:
        ValueError, TypeError: If ``k`` cannot be converted to an integer.
    """
    # `k` is interpolated into the SQL text below, so force it to a plain
    # integer first: this keeps LIMIT a valid integer literal and prevents
    # arbitrary text from reaching the query.
    limit = max(int(k), 0)
    embedding = model.encode(query).tolist()
    df = duckdb.sql(
        query=f"""
        SELECT *, array_cosine_distance({embedding_column_float}, {embedding}::FLOAT[{embedding_dimensions}]) as distance 
        FROM {table_name}
        ORDER BY distance 
        LIMIT {limit};
    """
    ).to_df()
    # The raw vectors are large and not useful to display; return only the
    # remaining columns.
    return df.drop(columns=[embedding_column, embedding_column_float])

# Gradio UI: a query box and a result-count slider feed `similarity_search`,
# whose DataFrame is shown in a table.
with gr.Blocks() as demo:
    gr.Markdown("""# RAG - retrieve

                Executes vector search on top of [fineweb-bbc-news-embeddings](https://huggingface.co/datasets/ai-blueprint/fineweb-bbc-news-embeddings) using DuckDB.
                
                Part of [AI blueprint](https://github.com/huggingface/ai-blueprint) - a blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
    query = gr.Textbox(label="Query")
    # step=1 keeps the slider on whole numbers, since the value is used as a
    # row count.
    k = gr.Slider(1, 50, value=5, step=1, label="Number of results")
    btn = gr.Button("Search")
    results = gr.Dataframe(headers=["url", "chunk", "distance"], wrap=True)
    btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])


demo.launch()