File size: 1,803 Bytes
cd20a25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Sequence, List, Tuple

from app.vectorizer import Vectorizer
from app.scorer import cosine_similarity


class PromptSearchEngine:
    """Similarity search over a fixed corpus of prompts.

    The corpus is vectorized once in ``__init__``; every query is vectorized
    with the same ``Vectorizer`` and ranked against the stored corpus vectors
    using ``cosine_similarity``.
    """

    def __init__(self, prompts: Sequence[str]) -> None:
        """
        Initialize the search engine by vectorizing the prompt corpus.

        The vectorized corpus is kept on the instance and later used to find
        the top n prompts most similar to a user's input prompt.

        Args:
            prompts: The sequence of raw prompts from the dataset.
        """
        # Keep the raw prompts so results can be returned verbatim.
        self.prompts = prompts
        model = SentenceTransformer("all-MiniLM-L6-v2")
        self.vectorizer = Vectorizer(model)
        # Computed once up front so queries only need to embed the query text.
        self.corpus_vectors = self.vectorizer.transform(prompts)

    def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
        """
        Return the top n most similar prompts from the corpus.

        The query is vectorized with the engine's ``Vectorizer`` and scored
        against the corpus vectors with ``cosine_similarity``.

        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts returned from the corpus. If the
                corpus holds fewer than ``n`` prompts, all of them are
                returned. A non-positive ``n`` yields an empty list.

        Returns:
            A list of ``(score, prompt)`` tuples ordered from most to least
            similar. Scores are plain Python floats and the prompts are
            returned verbatim.
        """
        # Guard first: ``[-n:]`` with n == 0 is ``[0:]`` (the whole array) and
        # a negative n would slice from the front, so both cases would return
        # far more than the caller asked for.
        if n <= 0:
            return []

        query_vector = self.vectorizer.transform([query])
        # NOTE(review): the indexing below assumes cosine_similarity returns a
        # 1-D array with one score per corpus entry -- confirm in app.scorer.
        similarities = cosine_similarity(query_vector, self.corpus_vectors)

        # argsort is ascending: take the last n indices, then reverse them so
        # the best match comes first.
        top_indices = np.argsort(similarities)[-n:][::-1]

        # Convert similarities to Python float and return the top-n prompts.
        return [(float(similarities[i]), self.prompts[i]) for i in top_indices]