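"""Display configuration for the FilBench leaderboard.

Defines the registry of evaluated models, the benchmark tasks and their
categories, the leaderboard columns, and the EvalResult class that parses raw
evaluation results (including external submissions) into display rows.
"""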
import logging
import sys
from dataclasses import dataclass, make_dataclass
from enum import Enum
import numpy as np
from src.display.formatting import make_clickable_model, model_hyperlink
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO,
)
def fields(raw_class):
    """Return the values of all non-dunder class attributes of `raw_class`."""
    return [
        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
    ]
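# A minimal illustration of `fields` (hypothetical class, not part of the app):
#
#   class _Demo:
#       a = 1
#       b = 2
#
#   fields(_Demo)  # -> [1, 2]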
### The models (Systems-Under-Study, SUTs) we're evaluating. ###
class ModelType(Enum):
BASE = "🟥 Base"
SFT = "⭕ SFT"
PREFERENCE_ALIGNED = "♦️ Preference-aligned"
UNKNOWN = "❓ Unknown"
class Multilingual(Enum):
MONOLINGUAL = "🟠 Monolingual"
MULTILINGUAL = "🟢 Multilingual"
SEA = "🔵 SEA-Focused"
UNKNOWN = "❓ Unknown"
@dataclass
class ModelSUT:
# fmt: off
    param_size: float  # Number of parameters, in billions (-1 if unknown)
    model_type: str  # Model type: Base, SFT, or Preference-aligned
    multilingual: str  # Multilinguality: Monolingual, SEA-focused, or Multilingual
# fmt: on
model_registry = {
# fmt: off
"gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
"gpt-4o-mini": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
"aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"google/gemma-2-9b-it": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"google/gemma-2-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"google/gemma-3-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"google/gemma-3-12b-it": ModelSUT(param_size=12, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"sail/Sailor2-20B-Chat": ModelSUT(param_size=20, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
"sail/Sailor2-8B-Chat": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
"Qwen/Qwen2.5-72B-Instruct": ModelSUT(param_size=72, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen2.5-32B-Instruct": ModelSUT(param_size=32, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen2.5-14B-Instruct": ModelSUT(param_size=14, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen2.5-7B-Instruct": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-32B": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-14B": ModelSUT(param_size=14, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-8B": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-4B": ModelSUT(param_size=4, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"aisingapore/Llama-SEA-LION-v3.5-70B-R": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"aisingapore/Llama-SEA-LION-v3.5-8B-R": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"CohereLabs/c4ai-command-a-03-2025": ModelSUT(param_size=111, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"CohereLabs/c4ai-command-r7b-12-2024": ModelSUT(param_size=7, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"SeaLLMs/SeaLLMs-v3-1.5B-Chat": ModelSUT(param_size=1.5, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"SeaLLMs/SeaLLMs-v3-7B-Chat": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"mistralai/Ministral-8B-Instruct-2410": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"mistralai/Mixtral-8x7B-Instruct-v0.1": ModelSUT(param_size=47, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Tower-Babel/Babel-9B-Chat": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Tower-Babel/Babel-83B-Chat": ModelSUT(param_size=83, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Tower-Babel/Babel-83B": ModelSUT(param_size=83, model_type=ModelType.BASE.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-3.1-8B-Instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-3.1-70B-Instruct": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": ModelSUT(param_size=400, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-4-Scout-17B-16E-Instruct": ModelSUT(param_size=109, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"mistralai/Mixtral-8x22B-Instruct-v0.1": ModelSUT(param_size=141, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"CohereForAI/aya-expanse-32b": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"neulab/Pangea-7B": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"HuggingFaceTB/SmolLM-1.7B-Instruct": ModelSUT(param_size=1.7, model_type=ModelType.SFT.value, multilingual=Multilingual.MONOLINGUAL.value),
# fmt: on
}
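# Example lookup (values taken from the registry above):
#   model_registry["Qwen/Qwen3-8B"].param_size  # -> 8
# Models missing from the registry fall back to an all-unknown ModelSUT in
# EvalResult.to_dict below.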
### The Task and Tasks classes store information about each benchmark we're scoring. ###
class TaskCategory(Enum):
CULTURAL_KNOWLEDGE = "🌏 Cultural Knowledge"
CLASSICAL_NLP = "🏛️ Classical NLP"
READING_COMPREHENSION = "📖 Reading Comprehension"
    TRANSLATION = "🔢 Generation"  # displayed as "Generation"; submissions key this category as "GENERATION"
@dataclass
class Task:
benchmark: str # benchmark name in the results file
metric: str # metric to display
col_name: str # column name to display
language: str # language being evaluated
    category: TaskCategory  # the TaskCategory this benchmark belongs to
num_samples: int # canonical number of examples
class Tasks(Enum):
# fmt: off
balita_tgl_mcf = Task("balita_tgl_mcf", "acc_", "🏛️ BalitaNLP", "tgl", TaskCategory.CLASSICAL_NLP, 35_177)
belebele_ceb_mcf = Task("belebele_ceb_mcf", "acc_", "📖 Belebele (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 900)
belebele_fil_mcf = Task("belebele_fil_mcf", "acc_", "📖 Belebele (fil)", "fil", TaskCategory.READING_COMPREHENSION, 900)
cebuaner_ceb_mcf = Task("cebuaner_ceb_mcf", "acc_", "🏛️ CebuaNER", "ceb", TaskCategory.CLASSICAL_NLP, 1310)
dengue_filipino_fil = Task("dengue_filipino_fil:_average", "acc_norm", "🏛️ Dengue", "fil", TaskCategory.CLASSICAL_NLP, 4015)
firecs_fil_mcf = Task("firecs_fil_mcf", "acc_", "🏛️ FiReCS", "fil", TaskCategory.CLASSICAL_NLP, 7340)
global_mmlu_all_tgl = Task("global_mmlu_all_tgl_mcf:_average", "acc_", "🌏 Global-MMLU", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 14_042)
include_tgl_mcf = Task("include_tgl_mcf:_average", "acc_", "🌏 INCLUDE", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 500)
kalahi_tgl_mcf = Task("kalahi_tgl_mcf", "acc_", "🌏 KALAHI", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 150)
newsphnli_fil_mcf = Task("newsphnli_fil_mcf", "acc_", "📖 NewsPH NLI", "fil", TaskCategory.READING_COMPREHENSION, 90_000)
ntrex128_fil = Task("ntrex128_fil", "rougeL", "🔢 NTREX-128", "fil", TaskCategory.TRANSLATION, 1997)
readability_ceb_mcf = Task("readability_ceb_mcf", "acc_", "📖 Readability (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 350)
sib200_ceb_mcf = Task("sib200_ceb_mcf", "acc_", "🏛️ SIB-200 (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 99)
sib200_tgl_mcf = Task("sib200_tgl_mcf", "acc_", "🏛️ SIB-200 (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 99)
# stingraybench_corr_tgl_mcf = Task("stingraybench_correctness_tgl_mcf", "acc_", "StingrayBench (Correctness)", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
    stingraybench_sem_appropriateness_tgl_mcf = Task("stingraybench_semantic_appropriateness_tgl_mcf", "acc_", "🌏 StingrayBench", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
tatoeba_ceb = Task("tatoeba_ceb", "rougeL", "🔢 Tatoeba (ceb)", "ceb", TaskCategory.TRANSLATION, 377)
tatoeba_tgl = Task("tatoeba_tgl", "rougeL", "🔢 Tatoeba (tgl)", "tgl", TaskCategory.TRANSLATION, 2499)
tico19_tgl = Task("tico19_tgl", "rougeL", "🔢 TICO-19", "tgl", TaskCategory.TRANSLATION, 971)
tlunifiedner_tgl_mcf = Task("tlunifiedner_tgl_mcf", "acc_", "🏛️ TLUnified NER", "tgl", TaskCategory.CLASSICAL_NLP, 1579)
universalner_ceb_mcf = Task("universalner_ceb_mcf", "acc_", "🏛️ Universal NER (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 49)
universalner_tgl_mcf = Task("universalner_tgl_mcf", "acc_", "🏛️ Universal NER (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 56)
# fmt: on
### These classes define how the columns will be represented ###
@dataclass
class ColumnContent:
name: str
type: str
displayed_by_default: bool
hidden: bool = False
never_hidden: bool = False
aggregate: bool = False
meta: bool = False
auto_eval_cols = [
# fmt: off
["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True, meta=True)],
["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, meta=True)],
["precision", ColumnContent, ColumnContent("Precision", "str", False, meta=True)],
["param_size", ColumnContent, ColumnContent("# Parameters", "number", False, meta=True)],
["multilingual", ColumnContent, ColumnContent("Multilingual", "markdown", False, meta=True)],
["model_type", ColumnContent, ColumnContent("Model Type", "markdown", False, meta=True)],
["is_submission", ColumnContent, ColumnContent("Submission", "boolean", False, meta=True)],
["submission_date", ColumnContent, ColumnContent("Submission Date", "str", False, meta=True)],
# fmt: on
]
for task in Tasks:
auto_eval_cols.append(
[task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
)
for task_category in TaskCategory:
auto_eval_cols.append(
[
task_category.name,
ColumnContent,
ColumnContent(task_category.value, "number", True, aggregate=True),
]
)
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_cols, frozen=True)
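# AutoEvalColumn is a frozen dataclass whose class-level defaults are the
# ColumnContent instances above, so e.g. AutoEvalColumn.model.name == "Model",
# and fields(AutoEvalColumn) (defined at the top of this file) iterates over
# every column definition.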
### These classes define how a single model evaluation result will be represented ###
@dataclass
class ModelDetails:
name: str
display_name: str = ""
symbol: str = "" # emoji
class Precision(Enum):
float16 = ModelDetails("float16")
bfloat16 = ModelDetails("bfloat16")
Unknown = ModelDetails("?")
    @staticmethod
    def from_str(precision):
        """Map a dtype string to a Precision member."""
if precision in ["torch.float16", "float16"]:
return Precision.float16
if precision in ["torch.bfloat16", "bfloat16"]:
return Precision.bfloat16
return Precision.Unknown
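# For example, Precision.from_str("torch.bfloat16") is Precision.bfloat16;
# any unrecognized value maps to Precision.Unknown.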
@dataclass
class EvalResult:
"""Represent one full model evaluation."""
eval_name: str
full_model: str
org: str
model: str
results: dict
average: float
aggregate_results: dict
precision: Precision = Precision.Unknown
# Submission metadata
is_submission: bool = False
param_size: float = -1
model_type: str = ModelType.UNKNOWN.value
multilingual: str = Multilingual.UNKNOWN.value
submission_date: str = ""
model_url: str = "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard"
@classmethod
    def init_from_dict(cls, data: dict, is_submission: bool = False) -> "EvalResult":
        """Build an EvalResult from a raw results dictionary."""
# For model details, use user-provided metadata if it's a submission
config_key = "display_metadata" if is_submission else "config"
config = data.get(config_key)
precision = Precision.from_str(config.get("model_dtype"))
org_and_model = (
config.get("hf_id")
if is_submission
else config.get("model_name", config.get("model_args", None))
)
org_and_model = org_and_model.split("/", 1)
if len(org_and_model) == 1:
org = None
model = org_and_model[0]
result_key = f"{model}_{precision.value.name}"
else:
org = org_and_model[0]
model = org_and_model[1]
result_key = f"{org}_{model}_{precision.value.name}"
full_model = "/".join(org_and_model)
results = EvalResult.compute_scores_per_benchmark(data.get("results"))
aggregate_results = EvalResult.compute_aggregate_results(results)
filbench_score = np.mean(list(aggregate_results.values()))
# Format all results
if is_submission:
# Use pre-computed scores and check if they match our computed scores
category_scores = data.get("category_scores")
aggregate_results_precomputed = {
TaskCategory.CULTURAL_KNOWLEDGE.value: category_scores.get(
"CULTURAL_KNOWLEDGE"
),
TaskCategory.CLASSICAL_NLP.value: category_scores.get("CLASSICAL_NLP"),
TaskCategory.READING_COMPREHENSION.value: category_scores.get(
"READING_COMPREHENSION"
),
TaskCategory.TRANSLATION.value: category_scores.get("GENERATION"),
}
is_similar = EvalResult.compare_category_scores(
precomputed=aggregate_results_precomputed,
computed=aggregate_results,
)
if not is_similar:
logging.warning("Precomputed and computed category scores differ.")
logging.info("Will use computed scores for display.")
else:
logging.info("Precomputed and computed category scores are similar.")
aggregate_results = aggregate_results_precomputed
# Do the same comparison for FilBench score
filbench_score_precomputed = data.get("filbench_score")
is_filbench_score_similar = (
abs(filbench_score_precomputed - filbench_score) < 1e-2
)
if not is_filbench_score_similar:
logging.warning(
f"Precomputed filbench_score ({filbench_score_precomputed}) and"
f" official FilBench score ({filbench_score}) differ."
)
average = (
filbench_score_precomputed
if is_filbench_score_similar
else filbench_score
)
display_metadata = data.get("display_metadata")
return EvalResult(
eval_name=result_key,
full_model=full_model,
org=org,
model=model,
precision=precision,
results=results,
aggregate_results=aggregate_results,
average=average,
# Display Metadata
is_submission=True,
submission_date=display_metadata.get("submission_date", ""),
param_size=display_metadata.get("num_params", -1),
model_type=display_metadata.get("model_type", ModelType.UNKNOWN.value),
multilingual=display_metadata.get(
"multilinguality", Multilingual.UNKNOWN.value
),
model_url=display_metadata.get(
"url",
"https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard",
),
)
else:
            return cls(
eval_name=result_key,
full_model=full_model,
org=org,
model=model,
precision=precision,
results=results,
aggregate_results=aggregate_results,
is_submission=False,
average=filbench_score,
)
@classmethod
def compute_scores_per_benchmark(cls, results: dict) -> dict[str, float]:
scores_per_benchmark = {}
for task in Tasks:
task = task.value
if results.get(task.benchmark):
score = results.get(task.benchmark).get(task.metric)
if "acc_" in task.metric:
score = score * 100.0
if "rougeL" in task.metric:
score = score * 100.0
scores_per_benchmark[task.benchmark] = score
else:
scores_per_benchmark[task.benchmark] = None
return scores_per_benchmark
@classmethod
def compute_aggregate_results(cls, results: dict) -> dict[str, float]:
aggregate_results = {}
for task_category in TaskCategory:
tasks = [
task.value for task in Tasks if task.value.category == task_category
]
total_category = sum([task.num_samples for task in tasks])
weighted_total_category = 0
for task in tasks:
if results[task.benchmark]:
score = results[task.benchmark]
else:
score = 0
weighted_total_category += score * task.num_samples
aggregate_results[task_category.value] = (
weighted_total_category / total_category
)
return aggregate_results
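    # Worked example (illustrative numbers): the TRANSLATION category covers
    # NTREX-128 (1997), Tatoeba ceb (377), Tatoeba tgl (2499), and TICO-19
    # (971), i.e. 5844 samples in total. A model scoring 40.0 on NTREX-128 and
    # 50.0 on the other three gets (40*1997 + 50*(377 + 2499 + 971)) / 5844,
    # roughly 46.6, for that category.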
@classmethod
def compare_category_scores(
cls, precomputed: dict, computed: dict, threshold: float = 1e-2
) -> bool:
"""Compares precomputed and computed category scores."""
is_similar = True
for key, precomputed_value in precomputed.items():
computed_value = computed.get(key)
if precomputed_value is not None and computed_value is not None:
if abs(precomputed_value - computed_value) > threshold:
logging.warning(
f"Aggregate result for '{key}' differs"
f" (precomputed={precomputed_value}, computed={computed_value})"
)
is_similar = False
return is_similar
def to_dict(self):
"""Converts the EvalResult to a dict compatible with our dataframe display"""
if not self.is_submission:
model_details = model_registry.get(
self.full_model,
ModelSUT(
param_size=-1,
model_type=ModelType.UNKNOWN.value,
multilingual=Multilingual.UNKNOWN.value,
),
)
else:
model_details = ModelSUT(
param_size=self.param_size,
model_type=self.model_type,
multilingual=self.multilingual,
)
model_name_with_url = (
make_clickable_model(self.full_model)
if not self.is_submission
else f"📥 {model_hyperlink(self.model_url, self.full_model)}"
)
data_dict = {
"eval_name": self.eval_name, # not a column, just a save name
AutoEvalColumn.precision.name: self.precision.value.name,
AutoEvalColumn.model.name: model_name_with_url,
AutoEvalColumn.average.name: self.average,
AutoEvalColumn.param_size.name: model_details.param_size,
AutoEvalColumn.model_type.name: model_details.model_type,
AutoEvalColumn.multilingual.name: model_details.multilingual,
AutoEvalColumn.is_submission.name: self.is_submission,
AutoEvalColumn.submission_date.name: self.submission_date,
}
for task in Tasks:
data_dict[task.value.col_name] = self.results[task.value.benchmark]
for task_category in TaskCategory:
data_dict[task_category.value] = self.aggregate_results[task_category.value]
return data_dict
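if __name__ == "__main__":
    # Minimal smoke test (illustrative only; "org/demo-model" and the single
    # benchmark score below are hypothetical). Benchmarks absent from the
    # payload are stored as None and contribute 0 to the category aggregates.
    sample = {
        "config": {"model_name": "org/demo-model", "model_dtype": "bfloat16"},
        "results": {"kalahi_tgl_mcf": {"acc_": 0.5}},
    }
    result = EvalResult.init_from_dict(sample)
    print(result.eval_name, result.average)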