Commit 404587d by Minseok Bae
Parent(s): 3b66490

Edited README and removed error-rate metric
Files changed:
- README.md (+1 -1)
- main_backend.py (+5 -1)
- src/backend/evaluate_model.py (+8 -5)
- src/backend/manage_requests.py (+2 -1)
- src/backend/model_operations.py (+19 -19)
- src/backend/util.py (+10 -14)
- src/display/about.py (+22 -4)
- src/display/utils.py (+0 -15)
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: H2EM Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
main_backend.py CHANGED
@@ -35,11 +35,13 @@ def run_auto_eval():
        hf_repo_results=envs.RESULTS_REPO,
        local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
    )
-
+    logging.info("Checked completed evals")
    eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
                                                      hf_repo=envs.QUEUE_REPO,
                                                      local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
+    logging.info("Got eval requests")
    eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
+    logging.info("Sorted eval requests")

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

@@ -57,6 +59,7 @@ def run_auto_eval():
        hf_repo=envs.QUEUE_REPO,
        local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
    )
+    logging.info("Set eval request to running, now running eval")

    run_eval_suite.run_evaluation(
        eval_request=eval_request,
@@ -66,6 +69,7 @@ def run_auto_eval():
        device=envs.DEVICE,
        no_cache=True,
    )
+    logging.info("Eval finished, now setting status to finished")


 if __name__ == "__main__":
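The new logging.info calls are only visible if the root logger is configured at INFO or below somewhere during startup; the diff does not show where this Space does that. A minimal standard-library sketch, with the level and format chosen here as assumptions:

import logging

# Hypothetical startup configuration; the Space may already do this elsewhere.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)

logging.info("Checked completed evals")   # now emitted to stderr by the default handler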
src/backend/evaluate_model.py CHANGED
@@ -75,20 +75,23 @@ class Evaluator:
 
             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
-            error_rate = self.summary_generator.error_rate
+            # error_rate = self.summary_generator.error_rate
 
             hallucination_scores = self.eval_model.evaluate_hallucination(
                 generated_summaries_df)
-            accuracy = self.eval_model.compute_accuracy()
+            factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             hallucination_rate = self.eval_model.hallucination_rate
 
             results = util.format_results(model_name=self.model, revision=self.revision,
-                                          precision=self.precision,
-                                          accuracy=accuracy, hallucination_rate=hallucination_rate,
-                                          answer_rate=answer_rate, avg_summary_len=avg_summary_len, error_rate=error_rate)
+                                          precision=self.precision,
+                                          factual_consistency_rate=factual_consistency_rate,
+                                          hallucination_rate=hallucination_rate,
+                                          answer_rate=answer_rate,
+                                          avg_summary_len=avg_summary_len)
 
             return results
         except FileNotFoundError:
+            # logging.error(f"File not found: {envs.SOURCE_PATH}")
             logging.error(f"File not found: {envs.SAMPLE_DATASET_PATH}")
             raise
         except Exception as e:
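The call order above matters: compute_factual_consistency_rate() reads the scores that evaluate_hallucination() stored on the evaluation model. A minimal usage sketch, assuming EvaluationModel takes the evaluator model path as its only constructor argument and that the module path mirrors the file layout:

import pandas as pd
from src.backend.model_operations import EvaluationModel  # assumed import path

# Toy input with the columns SummaryGenerator produces (see model_operations.py).
generated_summaries_df = pd.DataFrame({
    "source": ["The capital of France is Paris."],
    "summary": ["Paris is the capital of France."],
    "dataset": ["toy"],
})

eval_model = EvaluationModel("vectara/hallucination_evaluation_model")  # model path is an assumption
eval_model.evaluate_hallucination(generated_summaries_df)   # populates eval_model.scores
fcr = eval_model.compute_factual_consistency_rate()         # threshold defaults to 0.5
hallucination_rate = eval_model.hallucination_rate          # set to 100 - fcr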
src/backend/manage_requests.py CHANGED
@@ -9,9 +9,10 @@ from huggingface_hub import HfApi, snapshot_download
 @dataclass
 class EvalRequest:
     model: str
-    private: bool
+    # private: bool
     status: str
     json_filepath: str
+    private: bool = False
     weight_type: str = "Original"
     model_type: str = ""  # pretrained, finetuned, with RL
     precision: str = ""  # float16, bfloat16
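Making `private` optional also forces it below the required fields, because dataclass fields with defaults cannot precede fields without them. A standalone sketch (the class name here is illustrative, and the motivation of keeping old request files loadable is an assumption):

from dataclasses import dataclass

# Once a dataclass field has a default, every field after it must also have one.
# Keeping `private: bool = False` above the required `status`/`json_filepath`
# fields would raise a TypeError at class-definition time.
@dataclass
class Request:
    model: str
    status: str
    json_filepath: str
    private: bool = False   # defaulted fields come after required ones

r = Request(model="org/model", status="PENDING", json_filepath="eval-queue/req.json")
print(r.private)            # False; a request that never set the flag still loads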
src/backend/model_operations.py CHANGED
@@ -111,7 +111,7 @@ class SummaryGenerator:
                                          columns=["source", "summary", "dataset"])
         self._compute_avg_length()
         self._compute_answer_rate()
-        self._compute_error_rate(error_count)
+        # self._compute_error_rate(error_count)
 
         return self.summaries_df
 
@@ -140,13 +140,13 @@ class SummaryGenerator:
 
         self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
 
-    def _compute_error_rate(self, count):
-        """
-        Compute the error rate of summaries.
-        """
-        total_rows = len(self.summaries_df)
+    # def _compute_error_rate(self, count):
+    #     """
+    #     Compute the error rate of summaries.
+    #     """
+    #     total_rows = len(self.summaries_df)
 
-        self.error_rate = 0 if total_rows == 0 else count / total_rows
+    #     self.error_rate = 0 if total_rows == 0 else count / total_rows
 
 
 class EvaluationModel:
@@ -168,7 +168,7 @@ class EvaluationModel:
         """
         self.model = load_evaluation_model(model_path)
         self.scores = []
-        self.accuracy = None
+        self.factual_consistency_rate = None
         self.hallucination_rate = None
 
     def evaluate_hallucination(self, summaries_df):
@@ -192,15 +192,15 @@ class EvaluationModel:
             logging.error(f"Error evaluating hallucination: {e}")
             raise
 
-    def compute_accuracy(self, threshold=0.5):
+    def compute_factual_consistency_rate(self, threshold=0.5):
         """
-        Compute the accuracy of the evaluated summaries based on the previously calculated scores.
-        This method relies on the 'scores' attribute being populated, typically via the
-        'evaluate_hallucination' method.
+        Compute the factual consistency rate of the evaluated summaries based on
+        the previously calculated scores. This method relies on the 'scores'
+        attribute being populated, typically via the 'evaluate_hallucination' method.
 
         Returns:
-            float: Accuracy. Also updates the 'accuracy' and 'hallucination_rate'
-            attributes of the instance.
+            float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
+            and 'hallucination_rate' attributes of the instance.
 
         Raises:
             ValueError: If scores have not been calculated prior to calling this method.
@@ -210,15 +210,15 @@ class EvaluationModel:
             logging.error(error_msg)
             raise ValueError(error_msg)
 
-        # Use threshold of 0.5 to compute accuracy
+        # Use threshold of 0.5 to compute factual_consistency_rate
         num_above_threshold = sum(score >= threshold for score in self.scores)
         num_total = len(self.scores)
 
         if not num_total:
-            raise ValueError("No scores available to compute accuracy.")
+            raise ValueError("No scores available to compute factual consistency rate.")
 
-        self.accuracy = (num_above_threshold / num_total) * 100
-        self.hallucination_rate = 100 - self.accuracy
+        self.factual_consistency_rate = (num_above_threshold / num_total) * 100
+        self.hallucination_rate = 100 - self.factual_consistency_rate
 
-        return self.accuracy
+        return self.factual_consistency_rate
 
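As a quick sanity check of the renamed metric, the arithmetic in compute_factual_consistency_rate() can be reproduced standalone (the scores below are made up):

# Mirrors the logic in compute_factual_consistency_rate(); threshold defaults to 0.5.
scores = [0.91, 0.42, 0.77, 0.08, 0.65]   # hypothetical H2EM scores
threshold = 0.5

num_above_threshold = sum(score >= threshold for score in scores)   # 3
num_total = len(scores)                                             # 5

factual_consistency_rate = (num_above_threshold / num_total) * 100  # 60.0
hallucination_rate = 100 - factual_consistency_rate                 # 40.0
print(factual_consistency_rate, hallucination_rate)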
src/backend/util.py CHANGED
@@ -17,9 +17,9 @@ def generate_prompt(source_passage: str) -> str:
     """
 
 
-def format_results(model_name: str, revision: str, precision: str, accuracy: float,
-                   hallucination_rate: float, answer_rate: float,
-                   avg_summary_len: float, error_rate: float) -> dict:
+def format_results(model_name: str, revision: str, precision: str,
+                   factual_consistency_rate: float, hallucination_rate: float,
+                   answer_rate: float, avg_summary_len: float) -> dict:
     """
     Formats the evaluation results into a structured dictionary.
 
@@ -27,11 +27,10 @@ def format_results(model_name: str, revision: str, precision: str, accuracy: float,
         model_name (str): The name of the evaluated model.
        revision (str): The revision hash of the model.
         precision (str): The precision with which the evaluation was run.
-        accuracy (float): The accuracy
-        hallucination_rate (float): The hallucination rate
-        answer_rate (float): The answer rate
-        avg_summary_len (float): The average summary length
-        error_rate (float): The rate at which errors occurred during summary generation.
+        factual_consistency_rate (float): The factual consistency rate.
+        hallucination_rate (float): The hallucination rate.
+        answer_rate (float): The answer rate.
+        avg_summary_len (float): The average summary length.
 
     Returns:
         dict: A dictionary containing the structured evaluation results.
@@ -43,21 +42,18 @@ def format_results(model_name: str, revision: str, precision: str, accuracy: float,
             "model_sha": revision  # Hash of the model
         },
         "results": {
-            "accuracy": {
-                "accuracy": accuracy
-            },
             "hallucination_rate": {
                 "hallucination_rate": hallucination_rate
             },
+            "factual_consistency_rate": {
+                "factual_consistency_rate": factual_consistency_rate
+            },
             "answer_rate": {
                 "answer_rate": answer_rate
             },
             "average_summary_length": {
                 "average_summary_length": avg_summary_len
             },
-            "error_rate": {
-                "error_rate": error_rate
-            }
         }
     }
 
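For reference, calling the reworked format_results with illustrative values (the values and the import path are assumptions) yields the nested results block the display code reads:

from src.backend.util import format_results   # import path assumed from the repo layout

results = format_results(
    model_name="org/some-model", revision="abc123", precision="float16",
    factual_consistency_rate=60.0, hallucination_rate=40.0,
    answer_rate=0.98, avg_summary_len=64.2,
)
# Per the diff, results["results"] now contains:
# {
#   "hallucination_rate": {"hallucination_rate": 40.0},
#   "factual_consistency_rate": {"factual_consistency_rate": 60.0},
#   "answer_rate": {"answer_rate": 0.98},
#   "average_summary_length": {"average_summary_length": 64.2},
# }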
src/display/about.py CHANGED
@@ -10,26 +10,44 @@ class Task:
 
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    accuracy = Task("accuracy", "accuracy", "Accuracy")
     hallucination_rate = Task("hallucination_rate",
                               "hallucination_rate", "Hallucination Rate")
+    accuracy = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate")
     answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
     average_summary_length = Task("average_summary_length",
                                   "average_summary_length", "Average Summary Length")
-    error_rate = Task("error_rate", "error_rate", "Error Rate")
+    # error_rate = Task("error_rate", "error_rate", "Error Rate")
 
+
+
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation (H2EM) Model leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This
+This leaderboard evaluates how often an LLM introduces hallucinations when summarizing a document.
+
+
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
+Using Vectara's H2EM (Hughes Hallucination Evaluation Model), we evaluate how often an LLM introduces hallucinations when summarizing a document.
+
+The model card for H2EM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
+Given a document and a summary generated by an LLM, H2EM outputs a hallucination score between 0 and 1, where 0 means hallucination and 1 indicates no hallucination, or perfect factual consistency with the document.
+
+Our evaluation dataset is composed of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
+We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
+
+## Understand each metric
+### - Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
+### - Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
+### - Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
+### - Average Summary Length: The average number of words in the generated summaries
+
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
 
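The model card linked in the new text describes, at the time of this commit, loading H2EM as a sentence-transformers CrossEncoder to score a document/summary pair; a minimal sketch under that assumption, with made-up strings:

from sentence_transformers import CrossEncoder

# Loading method taken from the model card; treat as an assumption here.
model = CrossEncoder("vectara/hallucination_evaluation_model")

document = "The capital of France is Paris, a city on the Seine."
summary = "Paris, on the Seine, is the capital of France."

score = model.predict([[document, summary]])[0]
print(score)   # close to 1.0 means factually consistent, close to 0 means hallucinated

Note that, as implemented in model_operations.py, the Factual Consistency Rate is the percentage of scores at or above 0.5 and the Hallucination Rate is 100 minus that, so the "(1 - Hallucination Rate) * 100 (%)" line in the metric list treats the hallucination rate as a fraction rather than a percentage.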
src/display/utils.py CHANGED
@@ -30,21 +30,6 @@ auto_eval_column_dict.append(["model", ColumnContent,
                               ColumnContent("Model", "markdown", True, never_hidden=True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# # Accuracy
-# auto_eval_column_dict.append(["accuracy", ColumnContent,
-#                               ColumnContent("Accuracy", "number", True)])
-# # Hallucination Rate
-# auto_eval_column_dict.append(["hallucination_rate", ColumnContent,
-#                               ColumnContent("Hallucination Rate", "number", True)])
-# # Answer Rate
-# auto_eval_column_dict.append(["answer_rate", ColumnContent,
-#                               ColumnContent("Answer Rate", "number", True)])
-# # Average Summary Length
-# auto_eval_column_dict.append(["average_summary_length", ColumnContent,
-#                               ColumnContent("Average Summary Length", "number", True)])
-# # Error Rate
-# auto_eval_column_dict.append(["error_rate", ColumnContent,
-#                               ColumnContent("Error Rate", "number", True)])
 
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])