Spaces:
Running
Running
Added handling of edge cases in scorer.py
Browse files- app/scorer.py +48 -2
app/scorer.py
CHANGED
@@ -1,9 +1,25 @@
|
|
1 |
import numpy as np
|
2 |
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
def cosine_similarity(
|
5 |
-
|
6 |
-
|
7 |
) -> np.ndarray:
|
8 |
"""
|
9 |
Calculate cosine similarity between prompt vectors.
|
@@ -12,9 +28,39 @@ def cosine_similarity(
|
|
12 |
corpus_vectors: Vectorized prompt corpus of shape (N, D).
|
13 |
Returns:
|
14 |
The vector of shape (N,) with values in range [-1, 1] where 1 is max similarity i.e., two vectors are the same.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
query_norm = np.linalg.norm(query_vector, axis=1)[0]
|
|
|
|
|
|
|
|
|
|
|
17 |
corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
|
|
|
|
|
|
|
|
|
18 |
dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
|
19 |
similarities = dot_products / (query_norm * corpus_norms)
|
20 |
return similarities
|
|
|
1 |
import numpy as np
|
2 |
|
3 |
|
4 |
+
# TODO: Move exception classes into separate file
|
5 |
+
class DimensionalityMismatchError(ValueError):
    """Raised when the dimensions of query and corpus vectors don't match.

    Subclasses ``ValueError`` so callers that already handle ``ValueError``
    keep working without knowing about this more specific type.
    """
|
8 |
+
|
9 |
+
|
10 |
+
class ZeroVectorError(ValueError):
    """Raised when a zero vector is encountered.

    Subclasses ``ValueError`` so callers that already handle ``ValueError``
    keep working without knowing about this more specific type.
    """
|
13 |
+
|
14 |
+
|
15 |
+
class EmptyInputError(ValueError):
    """Raised when the input arrays are empty.

    Subclasses ``ValueError`` so callers that already handle ``ValueError``
    keep working without knowing about this more specific type.
    """
|
18 |
+
|
19 |
+
|
20 |
def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between prompt vectors.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).
    Returns:
        The vector of shape (N,) with values in range [-1, 1] where 1 is max similarity i.e., two vectors are the same.
    Raises:
        DimensionalityMismatchError: If query_vector is not of shape (1, D), corpus_vectors is not
            two-dimensional, or their trailing dimensions do not match.
        ZeroVectorError: If query_vector is a zero vector or any corpus vector is a zero vector.
        EmptyInputError: If query_vector or corpus_vectors are empty.

    Note:
        - This implementation assumes the use of SentenceTransformer with the "all-MiniLM-L6-v2" model.
        - SentenceTransformer embeddings are unlikely to produce zero vectors, even for empty or irrelevant inputs.
        - However, checks for zero vectors are included to handle potential edge cases and ensure robustness
          for future modifications or alternative embedding models.
    """
    # Emptiness is checked before any shape indexing: an empty array may be
    # 1-D (e.g. np.array([])), and it should be reported as empty rather than
    # as a shape mismatch or a bare IndexError.
    if query_vector.size == 0 or corpus_vectors.size == 0:
        raise EmptyInputError("query_vector and corpus_vectors must not be empty.")

    # Guard the rank explicitly so that shape[1] below can never raise IndexError.
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise DimensionalityMismatchError(f"query_vector must have shape (1, D), but got shape {query_vector.shape}.")
    if corpus_vectors.ndim != 2 or query_vector.shape[1] != corpus_vectors.shape[1]:
        raise DimensionalityMismatchError(
            f"query_vector shape {query_vector.shape} does not match corpus_vectors shape {corpus_vectors.shape}."
        )

    # Compute query norm and check for zero vector (division by zero otherwise).
    query_norm = np.linalg.norm(query_vector, axis=1)[0]
    if query_norm == 0:
        raise ZeroVectorError("query_vector must not be a zero vector.")

    # Any zero corpus vector is rejected outright. An alternative policy would be
    # to filter zero vectors out and raise only when all corpus vectors are zero.
    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
    if np.any(corpus_norms == 0):
        raise ZeroVectorError("corpus_vectors must not contain zero vectors.")

    # (N, D) @ (D, 1) -> (N, 1), flattened to (N,), then normalised row-wise.
    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
    similarities = dot_products / (query_norm * corpus_norms)
    return similarities
|