Spaces:

Jokica17
/

promptsearchengine

Sleeping

Jokica17 committed on Jan 5

Commit

cd20a25

1 Parent(s): fc456a9

Added backend `app` module and core engine logic:

- `vectorizer.py` for prompt vectorization
- `scorer.py` for cosine similarity calculations
- `engine.py` to manage the prompt search engine workflow
- `api.py` to expose a RESTful API for prompt search functionality
- included `requirements.txt` to define app module dependencies

Files changed (6) hide show

app/__init__.py +0 -0
app/api.py +21 -0
app/engine.py +41 -0
app/requirements.txt +4 -0
app/scorer.py +20 -0
app/vectorizer.py +23 -0

app/__init__.py ADDED Viewed

File without changes

app/api.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from fastapi import FastAPI, Query, HTTPException
+# Module-level FastAPI application; the /search route is registered on it by the decorator that follows.
+app = FastAPI()
+@app.get("/search")
+def search(query: str, n: int = Query(5, ge=1, le=10)):
+    """
+    Endpoint for querying the search engine.
+    Args:
+        query (str): The search query.
+        n (int): Number of results to return (default: 5).
+    Returns:
+        dict: Query results.
+    """
+    search_engine = app.state.search_engine
+    if not search_engine:
+        raise HTTPException(status_code=500, detail="Search engine not initialized.")
+    results = search_engine.most_similar(query, n)
+    return {"query": query, "results": results}

app/engine.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import numpy as np
+from sentence_transformers import SentenceTransformer
+from typing import Sequence, List, Tuple
+from app.vectorizer import Vectorizer
+from app.scorer import cosine_similarity
+class PromptSearchEngine:
+    def __init__(self, prompts: Sequence[str]) -> None:
+        """
+        Initialize search engine by vectorizing prompt corpus.
+        Vectorized prompt corpus should be used to find the top n most
+        similar prompts w.r.t. user’s input prompt.
+        Args:
+            prompts: The sequence of raw prompts from the dataset.
+        """
+        self.prompts = prompts
+        model = SentenceTransformer("all-MiniLM-L6-v2")
+        self.vectorizer = Vectorizer(model)
+        self.corpus_vectors = self.vectorizer.transform(prompts)
+    def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
+        """
+        Return top n most similar prompts from corpus.
+        Input query prompt should be vectorized with chosen Vectorizer.
+        After that, use the cosine_similarity function to get the top n most similar prompts from the corpus.
+        Args:
+             query: The raw query prompt input from the user.
+             n: The number of similar prompts returned from the corpus.
+        Returns:
+             The list of top n most similar prompts from the corpus along
+             with similarity scores. Note that returned prompts are verbatim.
+        """
+        query_vector = self.vectorizer.transform([query])
+        similarities = cosine_similarity(query_vector, self.corpus_vectors)
+        top_n_vectors_with_scores = np.argsort(similarities)[-n:][::-1]
+        # Convert similarities to Python float and return the top-n prompts
+        return [(float(similarities[i]), self.prompts[i]) for i in top_n_vectors_with_scores]

app/requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+fastapi
+uvicorn
+datasets
+sentence-transformers

app/scorer.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import numpy as np
+def cosine_similarity(
+     query_vector: np.ndarray,
+     corpus_vectors: np.ndarray
+) -> np.ndarray:
+    """
+    Calculate cosine similarity between prompt vectors.
+    Args:
+        query_vector: Vectorized prompt query of shape (1, D).
+        corpus_vectors: Vectorized prompt corpus of shape (N, D).
+    Returns:
+        The vector of shape (N,) with values in range [-1, 1] where 1 is max similarity i.e., two vectors are the same.
+    """
+    query_norm = np.linalg.norm(query_vector, axis=1)[0]
+    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
+    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
+    similarities = dot_products / (query_norm * corpus_norms)
+    return similarities

app/vectorizer.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import numpy as np
+from typing import Sequence
+class Vectorizer:
+    def __init__(self, model) -> None:
+        """
+        Initialize the vectorizer with a pre-trained embedding model.
+        Args:
+            model: The pre-trained embedding model to use for transforming prompts.
+        """
+        self.model = model
+    def transform(self, prompts: Sequence[str]) -> np.ndarray:
+        """
+        Transform texts into numerical vectors using the specified model.
+        Args:
+            prompts: The sequence of raw corpus prompts.
+        Returns:
+            Vectorized prompts as a numpy array.
+        """
+        # Using 'encode' method for SentenceTransformer model; may need updating for other models (e.g. 'embed')
+        return np.array(self.model.encode(prompts, show_progress_bar=True))