Jokica17 committed on
Commit
00ad06a
·
1 Parent(s): 67654d1

Added handling of edge cases in scorer.py

Browse files
Files changed (1) hide show
  1. app/scorer.py +48 -2
app/scorer.py CHANGED
@@ -1,9 +1,25 @@
1
  import numpy as np
2
 
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def cosine_similarity(
5
- query_vector: np.ndarray,
6
- corpus_vectors: np.ndarray
7
  ) -> np.ndarray:
8
  """
9
  Calculate cosine similarity between prompt vectors.
@@ -12,9 +28,39 @@ def cosine_similarity(
12
  corpus_vectors: Vectorized prompt corpus of shape (N, D).
13
  Returns:
14
  The vector of shape (N,) with values in range [-1, 1] where 1 is max similarity i.e., two vectors are the same.
 
 
 
 
 
 
 
 
 
 
15
  """
 
 
 
 
 
 
 
 
 
 
 
16
  query_norm = np.linalg.norm(query_vector, axis=1)[0]
 
 
 
 
 
17
  corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
 
 
 
 
18
  dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
19
  similarities = dot_products / (query_norm * corpus_norms)
20
  return similarities
 
1
  import numpy as np
2
 
3
 
4
+ # TODO: Move exception classes into separate file
5
class DimensionalityMismatchError(ValueError):
    """Raised when the dimensions of query and corpus vectors don't match."""


class ZeroVectorError(ValueError):
    """Raised when a zero vector is encountered."""


class EmptyInputError(ValueError):
    """Raised when the input arrays are empty."""


def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between prompt vectors.

    Args:
        query_vector: Vectorized query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).
    Returns:
        The vector of shape (N,) with values in range [-1, 1] where 1 is max similarity i.e., two vectors are the same.
    Raises:
        DimensionalityMismatchError: If query_vector is not of shape (1, D), corpus_vectors is not 2-D,
            or their feature dimensions D do not match.
        ZeroVectorError: If query_vector is a zero vector or any corpus vector is a zero vector.
        EmptyInputError: If query_vector or corpus_vectors are empty.

    Note:
        - This implementation assumes the use of SentenceTransformer with the "all-MiniLM-L6-v2" model.
        - SentenceTransformer embeddings are unlikely to produce zero vectors, even for empty or irrelevant inputs.
        - However, checks for zero vectors are included to handle potential edge cases and ensure robustness
          for future modifications or alternative embedding models.
    """
    # Accept array-likes (e.g. nested lists) as well as ndarrays; ndarrays pass through uncopied.
    query_vector = np.asarray(query_vector)
    corpus_vectors = np.asarray(corpus_vectors)

    # Emptiness is checked first: an empty array would otherwise be reported as a
    # shape mismatch (or crash on shape indexing) and EmptyInputError would never fire.
    if query_vector.size == 0 or corpus_vectors.size == 0:
        raise EmptyInputError("query_vector and corpus_vectors must not be empty.")

    # Guard the rank before indexing shape[1]; 1-D input must not surface as IndexError.
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise DimensionalityMismatchError(f"query_vector must have shape (1, D), but got shape {query_vector.shape}.")
    if corpus_vectors.ndim != 2:
        raise DimensionalityMismatchError(
            f"corpus_vectors must have shape (N, D), but got shape {corpus_vectors.shape}."
        )
    if query_vector.shape[1] != corpus_vectors.shape[1]:
        raise DimensionalityMismatchError(
            f"query_vector shape {query_vector.shape} does not match corpus_vectors shape {corpus_vectors.shape}."
        )

    # Compute query norm and check for zero vector (division by zero otherwise).
    query_norm = np.linalg.norm(query_vector, axis=1)[0]
    if query_norm == 0:
        raise ZeroVectorError("query_vector must not be a zero vector.")

    # Any zero corpus vector is rejected outright. An alternative policy would be to
    # filter zero vectors out and raise only if every corpus vector is zero.
    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
    if np.any(corpus_norms == 0):
        raise ZeroVectorError("corpus_vectors must not contain zero vectors.")

    # (N, D) . (D, 1) -> (N, 1), flattened to (N,), then normalised by both norms.
    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
    similarities = dot_products / (query_norm * corpus_norms)
    return similarities