Spaces:
Sleeping
Sleeping
import numpy as np | |
# TODO: Move the exception classes into a separate module.
class DimensionalityMismatchError(ValueError):
    """Signal that query and corpus vectors have incompatible dimensions.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.
    """
class ZeroVectorError(ValueError):
    """Signal that a vector with zero norm was encountered.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.
    """
class EmptyInputError(ValueError):
    """Signal that an input array contains no elements.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.
    """
def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and each corpus vector.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        The vector of shape (N,) with values in range [-1, 1] where 1 is max
        similarity, i.e., the two vectors point in the same direction.

    Raises:
        EmptyInputError: If query_vector or corpus_vectors contain no elements.
        DimensionalityMismatchError: If query_vector is not of shape (1, D),
            corpus_vectors is not two-dimensional, or their D dimensions
            do not match.
        ZeroVectorError: If query_vector is a zero vector or any corpus
            vector is a zero vector.

    Note:
        - This implementation assumes the use of SentenceTransformer with the
          "all-MiniLM-L6-v2" model.
        - SentenceTransformer embeddings are unlikely to produce zero vectors,
          even for empty or irrelevant inputs.
        - However, checks for zero vectors are included to handle potential
          edge cases and ensure robustness for future modifications or
          alternative embedding models.
    """
    # Reject empty inputs first: an empty array would otherwise be reported
    # as a shape problem (or crash while indexing ``shape``).
    if query_vector.size == 0 or corpus_vectors.size == 0:
        raise EmptyInputError("query_vector and corpus_vectors must not be empty.")

    # Check the rank before touching shape[1]; a 0-D/1-D array would raise a
    # bare IndexError there instead of the documented exception.
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise DimensionalityMismatchError(f"query_vector must have shape (1, D), but got shape {query_vector.shape}.")
    if corpus_vectors.ndim != 2:
        raise DimensionalityMismatchError(
            f"corpus_vectors must have shape (N, D), but got shape {corpus_vectors.shape}."
        )
    if query_vector.shape[1] != corpus_vectors.shape[1]:
        raise DimensionalityMismatchError(
            f"query_vector shape {query_vector.shape} does not match corpus_vectors shape {corpus_vectors.shape}."
        )

    # Compute query norm and check for zero vector
    query_norm = np.linalg.norm(query_vector, axis=1)[0]
    if query_norm == 0:
        raise ZeroVectorError("query_vector must not be a zero vector.")

    # Any zero corpus vector is an error. (An alternative policy would be to
    # filter zero vectors out and raise only if all corpus vectors are zero.)
    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
    if np.any(corpus_norms == 0):
        raise ZeroVectorError("corpus_vectors must not contain zero vectors.")

    # Compute cosine similarity: (N, D) @ (D, 1) -> (N, 1) -> (N,)
    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
    similarities = dot_products / (query_norm * corpus_norms)

    # Floating-point rounding can push results marginally outside [-1, 1];
    # clamp so the documented range holds (e.g. for a later arccos).
    return np.clip(similarities, -1.0, 1.0)