Spaces:
Running
Running
File size: 2,110 Bytes
05e7b4d 5dc52fb 05e7b4d 5dc52fb 8636daf 05e7b4d 90b7917 b82c93a e9fd1e4 0636ee3 8636daf 0636ee3 8636daf 0636ee3 e9fd1e4 8636daf e9fd1e4 0636ee3 8636daf e9fd1e4 8636daf e9fd1e4 0636ee3 8636daf 5dc52fb 8636daf e9fd1e4 5dc52fb 581cf0b e9fd1e4 e0e0b77 90b7917 5dc52fb 53ff92d 5dc52fb e9fd1e4 5dc52fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import duckdb
import gradio as gr
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding
from huggingface_hub import get_token
# --- Dataset / table configuration -------------------------------------
dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
table_name = "fineweb"
embedding_column = "embeddings"
# Name of the derived column holding the embeddings as a fixed-size float array.
embedding_column_float = f"{embedding_column}_float"

# --- Query-side embedding model ----------------------------------------
# A model2vec static embedding wrapped as a SentenceTransformer; the same
# object is used later to encode incoming search queries.
static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
model = SentenceTransformer(modules=[static_embedding])
embedding_dimensions = model.get_sentence_embedding_dimension()

# --- One-time DuckDB bootstrap ------------------------------------------
# 1. install and load the `vss` extension,
# 2. materialise the dataset's parquet files into a local table, adding a
#    copy of the embedding column cast to FLOAT[embedding_dimensions],
# 3. build an HNSW index (cosine metric) on that fixed-size column.
_bootstrap_sql = f"""
INSTALL vss;
LOAD vss;
CREATE TABLE {table_name} AS
SELECT *, {embedding_column}::float[{embedding_dimensions}] as {embedding_column_float}
FROM 'hf://datasets/{dataset_name}/**/*.parquet';
CREATE INDEX my_hnsw_index ON {table_name} USING HNSW ({embedding_column_float}) WITH (metric = 'cosine');
"""
duckdb.sql(query=_bootstrap_sql)
def similarity_search(query: str, k: int = 5):
    """Return the rows of the table that are closest to ``query``.

    The query text is encoded with the module-level ``model`` and compared
    against the stored fixed-size embedding column using
    ``array_cosine_distance``; rows are ordered by ascending distance.

    Args:
        query: Free-text search string to embed.
        k: Maximum number of rows to return. UI widgets may hand this over
            as a float (possibly fractional), so it is truncated to an
            integer and clamped at zero before being placed in ``LIMIT``.

    Returns:
        The result of ``to_df()`` on the DuckDB relation, with both
        embedding columns removed and an added ``distance`` column.
    """
    # `k` is interpolated into the SQL text, so make sure it is a plain
    # non-negative integer rather than something like 5.3 or -1.
    limit = max(int(k), 0)

    # The Python list repr of the vector is used as a DuckDB list literal
    # and cast to the same FLOAT[n] type as the stored column.
    embedding = model.encode(query).tolist()

    df = duckdb.sql(
        query=f"""
        SELECT *, array_cosine_distance({embedding_column_float}, {embedding}::FLOAT[{embedding_dimensions}]) as distance
        FROM {table_name}
        ORDER BY distance
        LIMIT {limit};
        """
    ).to_df()

    # The raw vectors are large and not useful to display.
    return df.drop(columns=[embedding_column, embedding_column_float])
# Gradio front end: a query box, a result-count slider and a results table,
# wired to `similarity_search`.
with gr.Blocks() as demo:
    # Continuation lines of the Markdown stay at column 0 so they are not
    # rendered as an indented code block.
    gr.Markdown("""# RAG - retrieve
Executes vector search on top of [fineweb-bbc-news-embeddings](https://huggingface.co/datasets/ai-blueprint/fineweb-bbc-news-embeddings) using DuckDB.
Part of [AI blueprint](https://github.com/huggingface/ai-blueprint) - a blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
    query = gr.Textbox(label="Query")
    # step=1 keeps the slider on whole numbers; the value is used as a row
    # count (SQL LIMIT) by the search function.
    k = gr.Slider(1, 50, value=5, step=1, label="Number of results")
    btn = gr.Button("Search")
    results = gr.Dataframe(headers=["url", "chunk", "distance"], wrap=True)
    btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])

demo.launch()