Spaces:
Sleeping
Sleeping
Added backend `app` module and core engine logic:
Browse files
- `vectorizer.py` for prompt vectorization
- `scorer.py` for cosine similarity calculations
- `engine.py` to manage the prompt search engine workflow
- `api.py` to expose a RESTful API for prompt search functionality
- included `requirements.txt` to define app module dependencies
- app/__init__.py +0 -0
- app/api.py +21 -0
- app/engine.py +41 -0
- app/requirements.txt +4 -0
- app/scorer.py +20 -0
- app/vectorizer.py +23 -0
app/__init__.py
ADDED
File without changes
|
app/api.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, Query, HTTPException
|
2 |
+
|
3 |
+
app = FastAPI()
|
4 |
+
|
5 |
+
|
6 |
+
@app.get("/search")
def search(query: str, n: int = Query(5, ge=1, le=10)):
    """
    Endpoint for querying the search engine.

    Args:
        query (str): The search query.
        n (int): Number of results to return (default: 5, allowed range: 1-10).

    Returns:
        dict: The original query under "query" and the engine's matches
            under "results".

    Raises:
        HTTPException: With status 500 when no search engine has been
            attached to ``app.state.search_engine``.
    """
    # Reading a never-assigned attribute from ``app.state`` raises
    # AttributeError instead of yielding None, so fetch it with a default to
    # keep the "not initialized" branch below reachable.
    search_engine = getattr(app.state, "search_engine", None)
    if not search_engine:
        raise HTTPException(status_code=500, detail="Search engine not initialized.")

    results = search_engine.most_similar(query, n)
    return {"query": query, "results": results}
|
app/engine.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
from typing import Sequence, List, Tuple
|
4 |
+
|
5 |
+
from app.vectorizer import Vectorizer
|
6 |
+
from app.scorer import cosine_similarity
|
7 |
+
|
8 |
+
|
9 |
+
class PromptSearchEngine:
    """Semantic search over a fixed corpus of prompts using sentence embeddings."""

    def __init__(self, prompts: Sequence[str], model_name: str = "all-MiniLM-L6-v2") -> None:
        """
        Initialize search engine by vectorizing prompt corpus.

        Vectorized prompt corpus should be used to find the top n most
        similar prompts w.r.t. user's input prompt.

        Args:
            prompts: The sequence of raw prompts from the dataset.
            model_name: Name of the SentenceTransformer model used to embed
                both the corpus and the queries.
        """
        # Snapshot the corpus so that later mutation of the caller's sequence
        # cannot desynchronize prompt indices from the rows of corpus_vectors.
        self.prompts = list(prompts)
        model = SentenceTransformer(model_name)
        self.vectorizer = Vectorizer(model)
        self.corpus_vectors = self.vectorizer.transform(self.prompts)

    def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
        """
        Return top n most similar prompts from corpus.

        The query is vectorized with the engine's Vectorizer and scored
        against every corpus vector with cosine_similarity.

        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts returned from the corpus.
                Values larger than the corpus size return the whole corpus.

        Returns:
            The list of (similarity score, prompt) pairs for the top n most
            similar prompts, best match first. Returned prompts are verbatim.
            An empty list is returned when n is not positive or the corpus
            is empty.
        """
        # Guard first: a slice of [-0:] would otherwise select the whole
        # corpus, and an empty corpus has nothing to score against.
        if n <= 0 or len(self.prompts) == 0:
            return []

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.corpus_vectors)

        # argsort is ascending: keep the last n indices, then reverse so the
        # best match comes first.
        top_indices = np.argsort(similarities)[-n:][::-1]

        # Convert similarities to Python float so the result is JSON-friendly.
        return [(float(similarities[i]), self.prompts[i]) for i in top_indices]
|
app/requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn
|
3 |
+
datasets
|
4 |
+
sentence-transformers
|
app/scorer.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
|
4 |
+
def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between prompt vectors.

    Args:
        query_vector: Vectorized prompt query of shape (1, D). A flat vector
            of shape (D,) is also accepted.
        corpus_vectors: Vectorized prompt corpus of shape (N, D). A single
            flat vector of shape (D,) is treated as a corpus of one.

    Returns:
        The vector of shape (N,) with values in range [-1, 1] where 1 is max
        similarity i.e., two vectors point in the same direction. Pairs that
        involve a zero-length vector have no defined direction and score 0.
        An empty corpus yields an empty vector.
    """
    query = np.atleast_2d(np.asarray(query_vector))
    corpus = np.atleast_2d(np.asarray(corpus_vectors))
    if corpus.size == 0:
        return np.zeros(0, dtype=float)

    query_norm = np.linalg.norm(query, axis=1)[0]
    corpus_norms = np.linalg.norm(corpus, axis=1)
    dot_products = np.dot(corpus, query.T).flatten()
    denominators = query_norm * corpus_norms

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 they were pre-filled with, so no NaN/inf can leak into ranking.
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(denominators),
        where=denominators > 0,
    )

    # Floating-point rounding can push values marginally outside [-1, 1].
    return np.clip(similarities, -1.0, 1.0)
|
app/vectorizer.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from typing import Sequence
|
3 |
+
|
4 |
+
|
5 |
+
class Vectorizer:
    """Thin wrapper that turns text prompts into embedding vectors via a model."""

    def __init__(self, model) -> None:
        """
        Initialize the vectorizer with a pre-trained embedding model.

        Args:
            model: The pre-trained embedding model to use for transforming
                prompts. It must expose an ``encode(prompts, show_progress_bar=...)``
                method.
        """
        self.model = model

    def transform(self, prompts: Sequence[str]) -> np.ndarray:
        """
        Transform texts into numerical vectors using the specified model.

        Args:
            prompts: The sequence of raw prompts. A single bare string is
                treated as a one-element sequence.

        Returns:
            Vectorized prompts as a numpy array with one row per prompt.
        """
        # A bare string is itself a Sequence[str]; wrap it so the result keeps
        # one row per prompt rather than whatever the model returns for a
        # single string (SentenceTransformer returns a flat 1-D vector).
        if isinstance(prompts, str):
            prompts = [prompts]
        else:
            prompts = list(prompts)

        # Using 'encode' method for SentenceTransformer model; may need updating for other models (e.g. 'embed')
        # The progress bar is only useful for batches; skip it for single
        # prompts so per-request query encoding does not spam the output.
        vectors = self.model.encode(prompts, show_progress_bar=len(prompts) > 1)

        # asarray avoids copying when the model already returns an ndarray.
        return np.asarray(vectors)
|