import logging
import sys
from dataclasses import dataclass, make_dataclass
from enum import Enum

import numpy as np

from src.display.formatting import make_clickable_model, model_hyperlink

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)


def fields(raw_class):
    return [
        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
    ]


### The Models (System-Under-Study, SUT) we're evaluating. ###


class ModelType(Enum):
    BASE = "🟥 Base"
    SFT = "⭕ SFT"
    PREFERENCE_ALIGNED = "♦️ Preference-aligned"
    UNKNOWN = "❓ Unknown"


class Multilingual(Enum):
    MONOLINGUAL = "🟠 Monolingual"
    MULTILINGUAL = "🟢 Multilingual"
    SEA = "🔵 SEA-Focused"
    UNKNOWN = "❓ Unknown"


@dataclass
class ModelSUT:
    # fmt: off
    param_size: float  # Number of parameters
    model_type: str  # Model type: SFT, Preference-aligned
    multilingual: str  # Multilingual: Monolingual, SEA-focused, Multilingual
    # fmt: on


model_registry = {
    # fmt: off
    "gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
    "gpt-4o-mini": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
    "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "google/gemma-2-9b-it": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "google/gemma-2-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "google/gemma-3-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "google/gemma-3-12b-it": ModelSUT(param_size=12, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "sail/Sailor2-20B-Chat": ModelSUT(param_size=20, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
    "sail/Sailor2-8B-Chat": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
    "Qwen/Qwen2.5-72B-Instruct": ModelSUT(param_size=72, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen2.5-32B-Instruct": ModelSUT(param_size=32, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen2.5-14B-Instruct": ModelSUT(param_size=14, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen2.5-7B-Instruct": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-32B": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-14B": ModelSUT(param_size=14, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-8B": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-4B": ModelSUT(param_size=4, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "aisingapore/Llama-SEA-LION-v3.5-70B-R": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "aisingapore/Llama-SEA-LION-v3.5-8B-R": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "CohereLabs/c4ai-command-a-03-2025": ModelSUT(param_size=111, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "CohereLabs/c4ai-command-r7b-12-2024": ModelSUT(param_size=7, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "SeaLLMs/SeaLLMs-v3-1.5B-Chat": ModelSUT(param_size=1.5, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "SeaLLMs/SeaLLMs-v3-7B-Chat": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "mistralai/Ministral-8B-Instruct-2410": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": ModelSUT(param_size=47, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Tower-Babel/Babel-9B-Chat": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Tower-Babel/Babel-83B-Chat": ModelSUT(param_size=83, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Tower-Babel/Babel-83B": ModelSUT(param_size=83, model_type=ModelType.BASE.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-3.1-8B-Instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-3.1-70B-Instruct": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": ModelSUT(param_size=400, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": ModelSUT(param_size=109, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "mistralai/Mixtral-8x22B-Instruct-v0.1": ModelSUT(param_size=141, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "CohereForAI/aya-expanse-32b": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "neulab/Pangea-7B": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "HuggingFaceTB/SmolLM-1.7B-Instruct": ModelSUT(param_size=1.7, model_type=ModelType.SFT.value, multilingual=Multilingual.MONOLINGUAL.value),
    # fmt: on
}
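# Note (illustrative aside, not load-bearing for the app): entries are keyed by the
# full Hugging Face model ID. Models that were evaluated but never registered here
# are still displayed; `EvalResult.to_dict` below falls back to an "Unknown" entry,
# e.g. model_registry.get("some-org/some-model", ModelSUT(param_size=-1,
# model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.UNKNOWN.value)),
# where "some-org/some-model" is a hypothetical, unregistered model ID.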


### The Task and Tasks classes store information about each benchmark we're scoring. ###


class TaskCategory(Enum):
    CULTURAL_KNOWLEDGE = "🌏 Cultural Knowledge"
    CLASSICAL_NLP = "🏛️ Classical NLP"
    READING_COMPREHENSION = "📖 Reading Comprehension"
    TRANSLATION = "🔒 Generation"


@dataclass
class Task:
    benchmark: str  # benchmark name in the results file
    metric: str  # metric to display
    col_name: str  # column name to display
    language: str  # language being evaluated
    category: TaskCategory  # choice between different task categories
    num_samples: int  # canonical number of examples


class Tasks(Enum):
    # fmt: off
    balita_tgl_mcf = Task("balita_tgl_mcf", "acc_", "🏛️ BalitaNLP", "tgl", TaskCategory.CLASSICAL_NLP, 35_177)
    belebele_ceb_mcf = Task("belebele_ceb_mcf", "acc_", "📖 Belebele (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 900)
    belebele_fil_mcf = Task("belebele_fil_mcf", "acc_", "📖 Belebele (fil)", "fil", TaskCategory.READING_COMPREHENSION, 900)
    cebuaner_ceb_mcf = Task("cebuaner_ceb_mcf", "acc_", "🏛️ CebuaNER", "ceb", TaskCategory.CLASSICAL_NLP, 1310)
    dengue_filipino_fil = Task("dengue_filipino_fil:_average", "acc_norm", "🏛️ Dengue", "fil", TaskCategory.CLASSICAL_NLP, 4015)
    firecs_fil_mcf = Task("firecs_fil_mcf", "acc_", "🏛️ FiReCS", "fil", TaskCategory.CLASSICAL_NLP, 7340)
    global_mmlu_all_tgl = Task("global_mmlu_all_tgl_mcf:_average", "acc_", "🌏 Global-MMLU", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 14_042)
    include_tgl_mcf = Task("include_tgl_mcf:_average", "acc_", "🌏 INCLUDE", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 500)
    kalahi_tgl_mcf = Task("kalahi_tgl_mcf", "acc_", "🌏 KALAHI", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 150)
    newsphnli_fil_mcf = Task("newsphnli_fil_mcf", "acc_", "📖 NewsPH NLI", "fil", TaskCategory.READING_COMPREHENSION, 90_000)
    ntrex128_fil = Task("ntrex128_fil", "rougeL", "🔒 NTREX-128", "fil", TaskCategory.TRANSLATION, 1997)
    readability_ceb_mcf = Task("readability_ceb_mcf", "acc_", "📖 Readability (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 350)
    sib200_ceb_mcf = Task("sib200_ceb_mcf", "acc_", "🏛️ SIB-200 (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 99)
    sib200_tgl_mcf = Task("sib200_tgl_mcf", "acc_", "🏛️ SIB-200 (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 99)
    # stingraybench_corr_tgl_mcf = Task("stingraybench_correctness_tgl_mcf", "acc_", "StingrayBench (Correctness)", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
    stingraybench_sem_appropriateness_tgl_mcf = Task("stingraybench_semantic_appropriateness_tgl_mcf", "acc_", "🌏 StingrayBench", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
    tatoeba_ceb = Task("tatoeba_ceb", "rougeL", "🔒 Tatoeba (ceb)", "ceb", TaskCategory.TRANSLATION, 377)
    tatoeba_tgl = Task("tatoeba_tgl", "rougeL", "🔒 Tatoeba (tgl)", "tgl", TaskCategory.TRANSLATION, 2499)
    tico19_tgl = Task("tico19_tgl", "rougeL", "🔒 TICO-19", "tgl", TaskCategory.TRANSLATION, 971)
    tlunifiedner_tgl_mcf = Task("tlunifiedner_tgl_mcf", "acc_", "🏛️ TLUnified NER", "tgl", TaskCategory.CLASSICAL_NLP, 1579)
    universalner_ceb_mcf = Task("universalner_ceb_mcf", "acc_", "🏛️ Universal NER (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 49)
    universalner_tgl_mcf = Task("universalner_tgl_mcf", "acc_", "🏛️ Universal NER (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 56)
    # fmt: on


### These classes define how the columns will be represented ###


@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    aggregate: bool = False
    meta: bool = False


auto_eval_cols = [
    # fmt: off
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True, meta=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, meta=True)],
    ["precision", ColumnContent, ColumnContent("Precision", "str", False, meta=True)],
    ["param_size", ColumnContent, ColumnContent("# Parameters", "number", False, meta=True)],
    ["multilingual", ColumnContent, ColumnContent("Multilingual", "markdown", False, meta=True)],
    ["model_type", ColumnContent, ColumnContent("Model Type", "markdown", False, meta=True)],
    ["is_submission", ColumnContent, ColumnContent("Submission", "boolean", False, meta=True)],
    ["submission_date", ColumnContent, ColumnContent("Submission Date", "str", False, meta=True)],
    # fmt: on
]

for task in Tasks:
    auto_eval_cols.append(
        [task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
    )

for task_category in TaskCategory:
    auto_eval_cols.append(
        [
            task_category.name,
            ColumnContent,
            ColumnContent(task_category.value, "number", True, aggregate=True),
        ]
    )

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_cols, frozen=True)
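
# Illustrative sketch (added for documentation; nothing below relies on it): since
# make_dataclass stores each ColumnContent default as a class attribute, the `fields`
# helper defined at the top of this module can recover per-column metadata from the
# generated AutoEvalColumn class.
def _default_display_columns() -> list[str]:
    """Return display names of columns shown by default (illustrative only)."""
    return [col.name for col in fields(AutoEvalColumn) if col.displayed_by_default]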


### These classes define how a single model evaluation result will be represented ###


@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class Precision(Enum):
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    def from_str(precision):
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown


@dataclass
class EvalResult:
    """Represent one full model evaluation."""

    eval_name: str
    full_model: str
    org: str
    model: str
    results: dict
    average: float
    aggregate_results: dict
    precision: Precision = Precision.Unknown
    # Submission metadata
    is_submission: bool = False
    param_size: float = -1
    model_type: str = ModelType.UNKNOWN.value
    multilingual: str = Multilingual.UNKNOWN.value
    submission_date: str = ""
    model_url: str = "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard"

    @classmethod
    def init_from_dict(cls, data: dict, is_submission: bool = False) -> "EvalResult":
        """Populate results from a dictionary"""
        # For model details, use user-provided metadata if it's a submission
        config_key = "display_metadata" if is_submission else "config"
        config = data.get(config_key)
        precision = Precision.from_str(config.get("model_dtype"))
        org_and_model = (
            config.get("hf_id")
            if is_submission
            else config.get("model_name", config.get("model_args", None))
        )
        org_and_model = org_and_model.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        results = EvalResult.compute_scores_per_benchmark(data.get("results"))
        aggregate_results = EvalResult.compute_aggregate_results(results)
        filbench_score = np.mean(list(aggregate_results.values()))

        # Format all results
        if is_submission:
            # Use pre-computed scores and check if they match our computed scores
            category_scores = data.get("category_scores")
            aggregate_results_precomputed = {
                TaskCategory.CULTURAL_KNOWLEDGE.value: category_scores.get(
                    "CULTURAL_KNOWLEDGE"
                ),
                TaskCategory.CLASSICAL_NLP.value: category_scores.get("CLASSICAL_NLP"),
                TaskCategory.READING_COMPREHENSION.value: category_scores.get(
                    "READING_COMPREHENSION"
                ),
                TaskCategory.TRANSLATION.value: category_scores.get("GENERATION"),
            }
            is_similar = EvalResult.compare_category_scores(
                precomputed=aggregate_results_precomputed,
                computed=aggregate_results,
            )
            if not is_similar:
                logging.warning("Precomputed and computed category scores differ.")
                logging.info("Will use computed scores for display.")
            else:
                logging.info("Precomputed and computed category scores are similar.")
                aggregate_results = aggregate_results_precomputed

            # Do the same comparison for FilBench score
            filbench_score_precomputed = data.get("filbench_score")
            is_filbench_score_similar = (
                abs(filbench_score_precomputed - filbench_score) < 1e-2
            )
            if not is_filbench_score_similar:
                logging.warning(
                    f"Precomputed filbench_score ({filbench_score_precomputed}) and"
                    f" official FilBench score ({filbench_score}) differ."
                )
            average = (
                filbench_score_precomputed
                if is_filbench_score_similar
                else filbench_score
            )

            display_metadata = data.get("display_metadata")
            return EvalResult(
                eval_name=result_key,
                full_model=full_model,
                org=org,
                model=model,
                precision=precision,
                results=results,
                aggregate_results=aggregate_results,
                average=average,
                # Display Metadata
                is_submission=True,
                submission_date=display_metadata.get("submission_date", ""),
                param_size=display_metadata.get("num_params", -1),
                model_type=display_metadata.get("model_type", ModelType.UNKNOWN.value),
                multilingual=display_metadata.get(
                    "multilinguality", Multilingual.UNKNOWN.value
                ),
                model_url=display_metadata.get(
                    "url",
                    "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard",
                ),
            )
        else:
            return cls(
                eval_name=result_key,
                full_model=full_model,
                org=org,
                model=model,
                precision=precision,
                results=results,
                aggregate_results=aggregate_results,
                is_submission=False,
                average=filbench_score,
            )

    @classmethod
    def compute_scores_per_benchmark(cls, results: dict) -> dict[str, float]:
        scores_per_benchmark = {}
        for task in Tasks:
            task = task.value
            if results.get(task.benchmark):
                score = results.get(task.benchmark).get(task.metric)
                if "acc_" in task.metric:
                    score = score * 100.0
                if "rougeL" in task.metric:
                    score = score * 100.0
                scores_per_benchmark[task.benchmark] = score
            else:
                scores_per_benchmark[task.benchmark] = None
        return scores_per_benchmark

    @classmethod
    def compute_aggregate_results(cls, results: dict) -> dict[str, float]:
        aggregate_results = {}
        for task_category in TaskCategory:
            tasks = [
                task.value for task in Tasks if task.value.category == task_category
            ]
            total_category = sum([task.num_samples for task in tasks])
            weighted_total_category = 0
            for task in tasks:
                if results[task.benchmark]:
                    score = results[task.benchmark]
                else:
                    score = 0
                weighted_total_category += score * task.num_samples
            aggregate_results[task_category.value] = (
                weighted_total_category / total_category
            )
        return aggregate_results

    @classmethod
    def compare_category_scores(
        cls, precomputed: dict, computed: dict, threshold: float = 1e-2
    ) -> bool:
        """Compares precomputed and computed category scores."""
        is_similar = True
        for key, precomputed_value in precomputed.items():
            computed_value = computed.get(key)
            if precomputed_value is not None and computed_value is not None:
                if abs(precomputed_value - computed_value) > threshold:
                    logging.warning(
                        f"Aggregate result for '{key}' differs"
                        f" (precomputed={precomputed_value}, computed={computed_value})"
                    )
                    is_similar = False
        return is_similar
    def to_dict(self):
        """Converts the EvalResult to a dict compatible with our dataframe display"""
        if not self.is_submission:
            model_details = model_registry.get(
                self.full_model,
                ModelSUT(
                    param_size=-1,
                    model_type=ModelType.UNKNOWN.value,
                    multilingual=Multilingual.UNKNOWN.value,
                ),
            )
        else:
            model_details = ModelSUT(
                param_size=self.param_size,
                model_type=self.model_type,
                multilingual=self.multilingual,
            )

        model_name_with_url = (
            make_clickable_model(self.full_model)
            if not self.is_submission
            else f"📥 {model_hyperlink(self.model_url, self.full_model)}"
        )

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model.name: model_name_with_url,
            AutoEvalColumn.average.name: self.average,
            AutoEvalColumn.param_size.name: model_details.param_size,
            AutoEvalColumn.model_type.name: model_details.model_type,
            AutoEvalColumn.multilingual.name: model_details.multilingual,
            AutoEvalColumn.is_submission.name: self.is_submission,
            AutoEvalColumn.submission_date.name: self.submission_date,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        for task_category in TaskCategory:
            data_dict[task_category.value] = self.aggregate_results[task_category.value]

        return data_dict
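
if __name__ == "__main__":
    # Illustrative sketch only (not part of the leaderboard pipeline): build an
    # EvalResult from a minimal, hypothetical results payload and inspect the row
    # it would contribute to the display dataframe. The model ID, dtype, and scores
    # below are made up for demonstration.
    demo_payload = {
        "config": {
            "model_dtype": "torch.bfloat16",
            "model_name": "demo-org/demo-model",  # hypothetical model ID
        },
        "results": {
            # Raw metric values on a 0-1 scale, as read by
            # compute_scores_per_benchmark (which rescales them to 0-100).
            "kalahi_tgl_mcf": {"acc_": 0.65},
            "tatoeba_tgl": {"rougeL": 0.40},
        },
    }
    demo_result = EvalResult.init_from_dict(demo_payload)
    logging.info(f"Demo FilBench score: {demo_result.average:.2f}")
    logging.info(f"Demo dataframe row: {demo_result.to_dict()}")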