Commit 54e08b1
Parent(s): 8fe9b43

Update evaluation.py

Changed files: evaluation.py (+19, -23)

evaluation.py CHANGED
@@ -34,14 +34,14 @@ def parse_float(value):
     Params:
         value (string): value to be converted to float
     Returns:
-        The float representation of the given string, or
+        The float representation of the given string, or None if the string could
         not be converted to a float
     """
     try:
         float_value = float(value)
         return float_value
     except ValueError:
-        return
+        return None
 
 def extract_scores(predictions):
     """
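For reference, a minimal runnable sketch of the updated parse_float behaviour; the example inputs are illustrative and not taken from the repository.

# Sketch of the updated helper, condensed from the hunk above.
def parse_float(value):
    try:
        float_value = float(value)
        return float_value
    except ValueError:
        # the commit makes the fallback explicit instead of a bare `return`
        return None

print(parse_float("0.75"))  # 0.75
print(parse_float("n/a"))   # None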
@@ -54,17 +54,13 @@ def extract_scores(predictions):
     """
     scores = []
     # iterate through predictions and try to extract predicted score;
-    # if score could not be extracted, set it to
+    # if score could not be extracted, set it to None
     for pred in predictions:
         try:
-            score_string = pred.split('
+            score_string = pred.split(' ', 1)[0].strip()
             score = parse_float(score_string)
         except IndexError:
-
-            score_string = pred.split(' ', 1)[0].strip()
-            score = parse_float(score_string)
-        except IndexError:
-            score = -1
+            score = None
         scores.append(score)
 
     return scores
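A self-contained sketch of the simplified extraction loop, assuming each prediction string is expected to begin with its numeric score; the sample predictions below are hypothetical.

# parse_float is repeated here (condensed) so the sketch runs on its own.
def parse_float(value):
    try:
        return float(value)
    except ValueError:
        return None

def extract_scores(predictions):
    scores = []
    for pred in predictions:
        try:
            # take the first whitespace-separated token as the score candidate
            score_string = pred.split(' ', 1)[0].strip()
            score = parse_float(score_string)
        except IndexError:
            score = None
        scores.append(score)
    return scores

print(extract_scores(["0.8 The answer is mostly correct.", "no score produced"]))
# [0.8, None]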
@@ -92,40 +88,40 @@ def extract_feedback(predictions):
 
     return feedback
 
-def
+def compute_rmse(predictions, labels):
     """
-    Utility function to compute the mean squared error of the
+    Utility function to compute the root mean squared error of the
     score predictions in relation to the golden label scores
 
     Params:
         predictions (list): model score predictions
         labels (list): golden label scores
     Returns:
-        (float, int):
+        (float, int): rmse of valid samples and number of invalid samples
     """
     # get indexes of valid score predictions
-    # (i.e., where the score is
-    idx = np.where(np.array(predictions)
+    # (i.e., where the score is not None)
+    idx = np.where(np.array(predictions) != None)
 
     # get size of the golden labels list and of
     # the valid predictions array
     labels_size = np.array(labels).size
     valid_predictions_size = idx[0].size
 
-    # only compute
+    # only compute rmse if valid score predictions were generated,
     # otherwise set mse to 1
     if valid_predictions_size > 0:
-        # calculate
+        # calculate rmse from labels and predictions
         valid_predictions = np.array(predictions)[idx]
         score_labels = np.array(labels)[idx]
-
+        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)
 
         # cap mse at 1
-        if
+        if rmse > 1:
             return 1, labels_size - valid_predictions_size
 
-        # return computed
-        return
+        # return computed rmse and number of invalid samples
+        return rmse, labels_size - valid_predictions_size
     else:
         return 1, labels_size - valid_predictions_size
 
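A usage sketch of the new compute_rmse with hypothetical score lists; it assumes the numpy and sklearn.metrics.mean_squared_error imports already used by evaluation.py. squared=False asks scikit-learn for the root of the mean squared error; recent scikit-learn releases deprecate that argument in favour of a separate root_mean_squared_error function, so the pinned version matters.

# compute_rmse as introduced by this commit, with a hypothetical example call.
import numpy as np
from sklearn.metrics import mean_squared_error

def compute_rmse(predictions, labels):
    # indexes of valid score predictions (i.e., where the score is not None)
    idx = np.where(np.array(predictions) != None)

    labels_size = np.array(labels).size
    valid_predictions_size = idx[0].size

    if valid_predictions_size > 0:
        valid_predictions = np.array(predictions)[idx]
        score_labels = np.array(labels)[idx]
        # squared=False makes scikit-learn return the root of the MSE
        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)

        # cap the error at 1
        if rmse > 1:
            return 1, labels_size - valid_predictions_size

        return rmse, labels_size - valid_predictions_size
    else:
        return 1, labels_size - valid_predictions_size

# hypothetical inputs: the second prediction could not be parsed and is None
print(compute_rmse([0.5, None, 0.9], [0.4, 0.7, 1.0]))
# approximately (0.1, 1): RMSE over the two valid pairs, one invalid sample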
@@ -158,15 +154,15 @@ def compute_metrics(predictions, labels):
                              model_type='bert-base-multilingual-cased',
                              rescale_with_baseline=True)
 
-    # compute
-
+    # compute rmse of score predictions
+    rmse, _ = compute_rmse(predicted_scores, reference_scores)
 
     results = {
         'sacrebleu': sacrebleu_score,
         'rouge': rouge_score,
         'meteor': meteor_score,
         'bert_score': np.array(bert_score['f1']).mean().item(),
-        '
+        'rmse': rmse
     }
 
     return results
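For orientation, the dictionary returned by compute_metrics after this commit has roughly the following shape; the numeric values are placeholders, only the keys come from the code above.

# Placeholder illustration of the metrics dictionary returned by compute_metrics;
# the values are dummies, only the keys reflect the diff.
results = {
    'sacrebleu': 0.0,   # corpus-level SacreBLEU score
    'rouge': 0.0,       # ROUGE score
    'meteor': 0.0,      # METEOR score
    'bert_score': 0.0,  # mean BERTScore F1 over all samples
    'rmse': 0.0,        # newly added: RMSE of the extracted score predictions
}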