omidf committed on
Commit 32e4ba0 · 1 Parent(s): 8348839

Update compute_score.py

Files changed (1)
  1. compute_score.py +11 -34
compute_score.py CHANGED
@@ -1,5 +1,4 @@
 """ Official evaluation script for v1.1 of the SQuAD dataset. """
-
 import argparse
 import json
 import re
@@ -26,37 +25,18 @@ def normalize_answer(s):
 
     return white_space_fix(remove_articles(remove_punc(lower(s))))
 
-def precision_score(prediction, ground_truth):
-    prediction_tokens = normalize_answer(prediction).split()
-    ground_truth_tokens = normalize_answer(ground_truth).split()
-    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
-    num_same = sum(common.values())
-    if num_same == 0:
-        return 0
-    precision = 1.0 * num_same / len(prediction_tokens)
-    return precision
 
-def recall_score(prediction, ground_truth):
+def f1_score(prediction, ground_truth):
     prediction_tokens = normalize_answer(prediction).split()
     ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
     num_same = sum(common.values())
     if num_same == 0:
         return 0
+    precision = 1.0 * num_same / len(prediction_tokens)
     recall = 1.0 * num_same / len(ground_truth_tokens)
-    return recall
-
-# def f1_score(prediction, ground_truth):
-#     prediction_tokens = normalize_answer(prediction).split()
-#     ground_truth_tokens = normalize_answer(ground_truth).split()
-#     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
-#     num_same = sum(common.values())
-#     if num_same == 0:
-#         return 0
-#     precision = 1.0 * num_same / len(prediction_tokens)
-#     recall = 1.0 * num_same / len(ground_truth_tokens)
-#     f1 = (2 * precision * recall) / (precision + recall)
-#     return f1
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
 
 
 def exact_match_score(prediction, ground_truth):
@@ -72,7 +52,7 @@ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
 
 
 def compute_score(dataset, predictions):
-    precision = recall = f1 = exact_match = total = 0
+    f1 = exact_match = total = 0
     for article in dataset:
         for paragraph in article["paragraphs"]:
             for qa in paragraph["qas"]:
@@ -84,18 +64,15 @@ def compute_score(dataset, predictions):
                 ground_truths = list(map(lambda x: x["text"], qa["answers"]))
                 prediction = predictions[qa["id"]]
                 exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
-                precision_qa = metric_max_over_ground_truths(precision_score, prediction, ground_truths)
-                recall_qa = metric_max_over_ground_truths(recall_score, prediction, ground_truths)
-                print(recall_qa, precision_qa)
-                f1 += (2 * precision_qa * recall_qa) / (precision_qa + recall_qa)
-                recall += recall_qa
-                precision += precision_qa
+                f1_temp = metric_max_over_ground_truths(f1_score, prediction, ground_truths)
+                print(f1_temp)
+                f1 += f1_temp
+
+
     exact_match = 100.0 * exact_match / total
     f1 = 100.0 * f1 / total
-    recall = 100.0 * recall / total
-    precision = 100.0 * precision / total
 
-    return {"exact_match": exact_match, "f1": f1, "precision": precision , "recall": recall}
+    return {"exact_match": exact_match, "f1": f1}
 
 
 if __name__ == "__main__":
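For context, a minimal sketch of how the updated compute_score might be called on SQuAD-v1.1-style data. The file name, question id, and answer text below are hypothetical placeholders, and the snippet assumes this script is importable as a module named compute_score; it also assumes the usual SQuAD v1.1 layout, where the list of articles sits under a top-level "data" key, matching the article/paragraph/qas loops shown in the diff.

# Minimal usage sketch; path, question id, and answer text are placeholders, not part of the commit.
import json

from compute_score import compute_score  # assumes the script can be imported as a module

# SQuAD v1.1 files usually keep the article list under a top-level "data" key.
with open("dev-v1.1.json") as f:  # hypothetical path to a SQuAD-style dataset file
    dataset = json.load(f)["data"]

# Predictions map a question id to a single answer string; in practice this
# should cover every qa["id"] in the dataset, since compute_score indexes it directly.
predictions = {"some-question-id": "some predicted answer"}

scores = compute_score(dataset, predictions)
print(scores)  # with this commit, the result only contains "exact_match" and "f1"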