Commit 54e08b1
Parent(s): 8fe9b43

Update evaluation.py

Changed files: evaluation.py (+19, -23)

evaluation.py CHANGED
@@ -34,14 +34,14 @@ def parse_float(value):
     Params:
         value (string): value to be converted to float
     Returns:
-        The float representation of the given string, or
+        The float representation of the given string, or None if the string could
         not be converted to a float
     """
     try:
         float_value = float(value)
         return float_value
     except ValueError:
-        return
+        return None
 
 def extract_scores(predictions):
     """
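For reference, a minimal runnable sketch of the updated parse_float behaviour; the example inputs are illustrative and not taken from the repository.

# Sketch of the updated helper, condensed from the hunk above.
def parse_float(value):
    try:
        float_value = float(value)
        return float_value
    except ValueError:
        # the commit makes the fallback explicit instead of a bare `return`
        return None

print(parse_float("0.75"))  # 0.75
print(parse_float("n/a"))   # None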
@@ -54,17 +54,13 @@ def extract_scores(predictions):
     """
     scores = []
     # iterate through predictions and try to extract predicted score;
-    # if score could not be extracted, set it to
+    # if score could not be extracted, set it to None
     for pred in predictions:
         try:
-            score_string = pred.split('
+            score_string = pred.split(' ', 1)[0].strip()
             score = parse_float(score_string)
         except IndexError:
-
-            score_string = pred.split(' ', 1)[0].strip()
-            score = parse_float(score_string)
-        except IndexError:
-            score = -1
+            score = None
         scores.append(score)
 
     return scores
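A self-contained sketch of the simplified extraction loop, assuming each prediction string is expected to begin with its numeric score; the sample predictions below are hypothetical.

# parse_float is repeated here (condensed) so the sketch runs on its own.
def parse_float(value):
    try:
        return float(value)
    except ValueError:
        return None

def extract_scores(predictions):
    scores = []
    for pred in predictions:
        try:
            # take the first whitespace-separated token as the score candidate
            score_string = pred.split(' ', 1)[0].strip()
            score = parse_float(score_string)
        except IndexError:
            score = None
        scores.append(score)
    return scores

print(extract_scores(["0.8 The answer is mostly correct.", "no score produced"]))
# [0.8, None]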
@@ -92,40 +88,40 @@ def extract_feedback(predictions):
 
     return feedback
 
-def
+def compute_rmse(predictions, labels):
     """
-    Utility function to compute the mean squared error of the
+    Utility function to compute the root mean squared error of the
     score predictions in relation to the golden label scores
 
     Params:
         predictions (list): model score predictions
         labels (list): golden label scores
     Returns:
-        (float, int):
+        (float, int): rmse of valid samples and number of invalid samples
     """
     # get indexes of valid score predictions
-    # (i.e., where the score is
-    idx = np.where(np.array(predictions)
+    # (i.e., where the score is not None)
+    idx = np.where(np.array(predictions) != None)
 
     # get size of the golden labels list and of
     # the valid predictions array
     labels_size = np.array(labels).size
     valid_predictions_size = idx[0].size
 
-    # only compute
+    # only compute rmse if valid score predictions were generated,
     # otherwise set mse to 1
     if valid_predictions_size > 0:
-        # calculate
+        # calculate rmse from labels and predictions
         valid_predictions = np.array(predictions)[idx]
         score_labels = np.array(labels)[idx]
-
+        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)
 
         # cap mse at 1
-        if
+        if rmse > 1:
             return 1, labels_size - valid_predictions_size
 
-        # return computed
-        return
+        # return computed rmse and number of invalid samples
+        return rmse, labels_size - valid_predictions_size
     else:
         return 1, labels_size - valid_predictions_size
 
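A usage sketch of the new compute_rmse with hypothetical score lists; it assumes the numpy and sklearn.metrics.mean_squared_error imports already used by evaluation.py. squared=False asks scikit-learn for the root of the mean squared error; recent scikit-learn releases deprecate that argument in favour of a separate root_mean_squared_error function, so the pinned version matters.

# compute_rmse as introduced by this commit, with a hypothetical example call.
import numpy as np
from sklearn.metrics import mean_squared_error

def compute_rmse(predictions, labels):
    # indexes of valid score predictions (i.e., where the score is not None)
    idx = np.where(np.array(predictions) != None)

    labels_size = np.array(labels).size
    valid_predictions_size = idx[0].size

    if valid_predictions_size > 0:
        valid_predictions = np.array(predictions)[idx]
        score_labels = np.array(labels)[idx]
        # squared=False makes scikit-learn return the root of the MSE
        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)

        # cap the error at 1
        if rmse > 1:
            return 1, labels_size - valid_predictions_size

        return rmse, labels_size - valid_predictions_size
    else:
        return 1, labels_size - valid_predictions_size

# hypothetical inputs: the second prediction could not be parsed and is None
print(compute_rmse([0.5, None, 0.9], [0.4, 0.7, 1.0]))
# approximately (0.1, 1): RMSE over the two valid pairs, one invalid sample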
@@ -158,15 +154,15 @@ def compute_metrics(predictions, labels):
                              model_type='bert-base-multilingual-cased',
                              rescale_with_baseline=True)
 
-    # compute
-
+    # compute rmse of score predictions
+    rmse, _ = compute_rmse(predicted_scores, reference_scores)
 
     results = {
         'sacrebleu': sacrebleu_score,
         'rouge': rouge_score,
         'meteor': meteor_score,
         'bert_score': np.array(bert_score['f1']).mean().item(),
-        '
+        'rmse': rmse
     }
 
     return results
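For orientation, the dictionary returned by compute_metrics after this commit has roughly the following shape; the numeric values are placeholders, only the keys come from the code above.

# Placeholder illustration of the metrics dictionary returned by compute_metrics;
# the values are dummies, only the keys reflect the diff.
results = {
    'sacrebleu': 0.0,   # corpus-level SacreBLEU score
    'rouge': 0.0,       # ROUGE score
    'meteor': 0.0,      # METEOR score
    'bert_score': 0.0,  # mean BERTScore F1 over all samples
    'rmse': 0.0,        # newly added: RMSE of the extracted score predictions
}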