Spaces:
Sleeping
Sleeping
File size: 2,890 Bytes
cd20a25 00ad06a cd20a25 00ad06a cd20a25 00ad06a cd20a25 00ad06a cd20a25 00ad06a cd20a25 00ad06a cd20a25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import numpy as np
# TODO: Move the exception classes into a separate file.
class DimensionalityMismatchError(ValueError):
    """Signal that the query and corpus vectors have incompatible dimensions."""
class ZeroVectorError(ValueError):
    """Signal that a vector with zero magnitude was encountered."""
class EmptyInputError(ValueError):
    """Signal that an input array contains no elements."""
def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and every corpus vector.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        The vector of shape (N,) with values in range [-1, 1] (up to floating-point
        rounding) where 1 is max similarity i.e., two vectors point in the same direction.

    Raises:
        EmptyInputError: If query_vector or corpus_vectors contain no elements.
        DimensionalityMismatchError: If either input is not 2-D, if query_vector does
            not have exactly one row, or if the feature dimension D of the two inputs differs.
        ZeroVectorError: If query_vector is a zero vector or any corpus vector is a zero vector.

    Note:
        - This implementation assumes the use of SentenceTransformer with the "all-MiniLM-L6-v2" model.
        - SentenceTransformer embeddings are unlikely to produce zero vectors, even for empty or irrelevant inputs.
        - However, checks for zero vectors are included to handle potential edge cases and ensure robustness
          for future modifications or alternative embedding models.
    """
    # Emptiness is checked first so that an empty input is reported as such,
    # instead of being masked by a shape error (or an IndexError on a 1-D array).
    if query_vector.size == 0 or corpus_vectors.size == 0:
        raise EmptyInputError("query_vector and corpus_vectors must not be empty.")

    # Both inputs must be 2-D before shape[1] can be inspected safely.
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise DimensionalityMismatchError(f"query_vector must have shape (1, D), but got shape {query_vector.shape}.")
    if corpus_vectors.ndim != 2:
        raise DimensionalityMismatchError(
            f"corpus_vectors must have shape (N, D), but got shape {corpus_vectors.shape}."
        )
    if query_vector.shape[1] != corpus_vectors.shape[1]:
        raise DimensionalityMismatchError(
            f"query_vector shape {query_vector.shape} does not match corpus_vectors shape {corpus_vectors.shape}."
        )

    # Compute query norm and check for zero vector
    query_norm = np.linalg.norm(query_vector, axis=1)[0]
    if query_norm == 0:
        raise ZeroVectorError("query_vector must not be a zero vector.")

    # Any zero corpus vector is rejected outright. An alternative policy would be to
    # filter zero vectors out and raise only if every corpus vector is a zero vector.
    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
    if np.any(corpus_norms == 0):
        raise ZeroVectorError("corpus_vectors must not contain zero vectors.")

    # (N, D) @ (D, 1) -> (N, 1); flatten to (N,) so it divides element-wise by the norms.
    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
    similarities = dot_products / (query_norm * corpus_norms)
    return similarities
|