File size: 2,890 Bytes
cd20a25
 
 
00ad06a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd20a25
00ad06a
 
cd20a25
 
 
 
 
 
 
 
00ad06a
 
 
 
 
 
 
 
 
 
cd20a25
00ad06a
 
 
 
 
 
 
 
 
 
 
cd20a25
00ad06a
 
 
 
 
cd20a25
00ad06a
 
 
 
cd20a25
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import numpy as np


# TODO: Move exception classes into separate file
class DimensionalityMismatchError(ValueError):
    """Signal that query and corpus vectors do not share the same dimensions."""


class ZeroVectorError(ValueError):
    """Signal that a zero vector was encountered."""


class EmptyInputError(ValueError):
    """Signal that the input arrays are empty."""


def cosine_similarity(
        query_vector: np.ndarray,
        corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and every corpus vector.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        The vector of shape (N,) with values in range [-1, 1] where 1 is max similarity i.e., two vectors point in
        the same direction. Results are clipped to [-1, 1] so floating-point rounding cannot push them outside
        the mathematically valid range.

    Raises:
        EmptyInputError: If query_vector or corpus_vectors contain no elements. This is checked first, so an empty
            array is always reported as empty regardless of its shape.
        DimensionalityMismatchError: If query_vector is not a 2-D array of shape (1, D), if corpus_vectors is not
            a 2-D array, or if their second dimensions differ.
        ZeroVectorError: If query_vector is a zero vector or any corpus vector is a zero vector.

    Note:
        - This implementation assumes the use of SentenceTransformer with the "all-MiniLM-L6-v2" model.
        - SentenceTransformer embeddings are unlikely to produce zero vectors, even for empty or irrelevant inputs.
        - However, checks for zero vectors are included to handle potential edge cases and ensure robustness
          for future modifications or alternative embedding models.
    """
    # Emptiness is validated before shape: an empty array (e.g. shape (0,) or (0, D)) would otherwise be
    # reported as a shape problem, or crash on shape[1], instead of raising the documented EmptyInputError.
    if query_vector.size == 0 or corpus_vectors.size == 0:
        raise EmptyInputError("query_vector and corpus_vectors must not be empty.")

    # Require explicit 2-D inputs so that shape[1] below is always defined (no bare IndexError for 1-D arrays)
    # and higher-rank arrays are rejected instead of silently producing wrong results.
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise DimensionalityMismatchError(f"query_vector must have shape (1, D), but got shape {query_vector.shape}.")
    if corpus_vectors.ndim != 2 or query_vector.shape[1] != corpus_vectors.shape[1]:
        raise DimensionalityMismatchError(
            f"query_vector shape {query_vector.shape} does not match corpus_vectors shape {corpus_vectors.shape}."
        )

    # Compute query norm and check for zero vector (division by zero would yield NaN/inf similarities)
    query_norm = np.linalg.norm(query_vector, axis=1)[0]
    if query_norm == 0:
        raise ZeroVectorError("query_vector must not be a zero vector.")

    # Check if any corpus vector is zero (alternative option for raising an error if any corpus vector is zero,
    # can be filtering out zero vectors and raising the error only if all corpus vectors are zero vectors)
    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
    if np.any(corpus_norms == 0):
        raise ZeroVectorError("corpus_vectors must not contain zero vectors.")

    # Compute cosine similarity: (N, D) @ (D, 1) -> (N, 1), flattened to (N,)
    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
    similarities = dot_products / (query_norm * corpus_norms)

    # Rounding can yield values such as 1.0000000000000002; clip to honour the documented [-1, 1] range.
    return np.clip(similarities, -1.0, 1.0)