import logging
import sys
from dataclasses import dataclass, make_dataclass
from enum import Enum

import numpy as np

from src.display.formatting import make_clickable_model, model_hyperlink

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)


def fields(raw_class):
    return [
        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
    ]
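

# Illustrative sketch (hypothetical class, not part of the pipeline): fields()
# returns the class-level attribute *values*, skipping dunder entries.
def _demo_fields() -> None:
    class Example:
        a = 1
        b = 2

    assert fields(Example) == [1, 2]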


### The Models (System-Under-Study, SUT) we're evaluating. ###
class ModelType(Enum):
    BASE = "🟥 Base"
    SFT = "⭕ SFT"
    PREFERENCE_ALIGNED = "♦️ Preference-aligned"
    UNKNOWN = "❓ Unknown"


class Multilingual(Enum):
    MONOLINGUAL = "🟠 Monolingual"
    MULTILINGUAL = "🟢 Multilingual"
    SEA = "🔵 SEA-Focused"
    UNKNOWN = "❓ Unknown"


@dataclass
class ModelSUT:
    # fmt: off
    param_size: float   # Number of parameters (in billions)
    model_type: str     # Model type: SFT, Preference-aligned
    multilingual: str   # Multilinguality: Monolingual, SEA-focused, Multilingual
    # fmt: on


model_registry = {
    # fmt: off
    "gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
    "gpt-4o-mini": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
    "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "google/gemma-2-9b-it": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "google/gemma-2-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "google/gemma-3-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "google/gemma-3-12b-it": ModelSUT(param_size=12, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "sail/Sailor2-20B-Chat": ModelSUT(param_size=20, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
    "sail/Sailor2-8B-Chat": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
    "Qwen/Qwen2.5-72B-Instruct": ModelSUT(param_size=72, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen2.5-32B-Instruct": ModelSUT(param_size=32, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen2.5-14B-Instruct": ModelSUT(param_size=14, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen2.5-7B-Instruct": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-32B": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-14B": ModelSUT(param_size=14, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-8B": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Qwen/Qwen3-4B": ModelSUT(param_size=4, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "aisingapore/Llama-SEA-LION-v3.5-70B-R": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "aisingapore/Llama-SEA-LION-v3.5-8B-R": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "CohereLabs/c4ai-command-a-03-2025": ModelSUT(param_size=111, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "CohereLabs/c4ai-command-r7b-12-2024": ModelSUT(param_size=7, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "SeaLLMs/SeaLLMs-v3-1.5B-Chat": ModelSUT(param_size=1.5, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "SeaLLMs/SeaLLMs-v3-7B-Chat": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
    "mistralai/Ministral-8B-Instruct-2410": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": ModelSUT(param_size=47, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Tower-Babel/Babel-9B-Chat": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Tower-Babel/Babel-83B-Chat": ModelSUT(param_size=83, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "Tower-Babel/Babel-83B": ModelSUT(param_size=83, model_type=ModelType.BASE.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-3.1-8B-Instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-3.1-70B-Instruct": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": ModelSUT(param_size=400, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": ModelSUT(param_size=109, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "mistralai/Mixtral-8x22B-Instruct-v0.1": ModelSUT(param_size=141, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "CohereForAI/aya-expanse-32b": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
    "neulab/Pangea-7B": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
    "HuggingFaceTB/SmolLM-1.7B-Instruct": ModelSUT(param_size=1.7, model_type=ModelType.SFT.value, multilingual=Multilingual.MONOLINGUAL.value),
    # fmt: on
}
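

# Illustrative sketch ("some-org/some-model" is a hypothetical id, not part of
# the registry): lookups fall back to an all-UNKNOWN ModelSUT, mirroring the
# behaviour of EvalResult.to_dict() below.
def _demo_registry_lookup() -> ModelSUT:
    return model_registry.get(
        "some-org/some-model",
        ModelSUT(
            param_size=-1,
            model_type=ModelType.UNKNOWN.value,
            multilingual=Multilingual.UNKNOWN.value,
        ),
    )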


### The Task and Tasks classes store information about each benchmark we're scoring. ###
class TaskCategory(Enum):
    CULTURAL_KNOWLEDGE = "🌏 Cultural Knowledge"
    CLASSICAL_NLP = "🏛️ Classical NLP"
    READING_COMPREHENSION = "📖 Reading Comprehension"
    TRANSLATION = "🔢 Generation"  # translation tasks, displayed under "Generation"


@dataclass
class Task:
    benchmark: str          # benchmark name in the results file
    metric: str             # metric to display
    col_name: str           # column name to display
    language: str           # language being evaluated
    category: TaskCategory  # choice between different task categories
    num_samples: int        # canonical number of examples


class Tasks(Enum):
    # fmt: off
    balita_tgl_mcf = Task("balita_tgl_mcf", "acc_", "🏛️ BalitaNLP", "tgl", TaskCategory.CLASSICAL_NLP, 35_177)
    belebele_ceb_mcf = Task("belebele_ceb_mcf", "acc_", "📖 Belebele (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 900)
    belebele_fil_mcf = Task("belebele_fil_mcf", "acc_", "📖 Belebele (fil)", "fil", TaskCategory.READING_COMPREHENSION, 900)
    cebuaner_ceb_mcf = Task("cebuaner_ceb_mcf", "acc_", "🏛️ CebuaNER", "ceb", TaskCategory.CLASSICAL_NLP, 1310)
    dengue_filipino_fil = Task("dengue_filipino_fil:_average", "acc_norm", "🏛️ Dengue", "fil", TaskCategory.CLASSICAL_NLP, 4015)
    firecs_fil_mcf = Task("firecs_fil_mcf", "acc_", "🏛️ FiReCS", "fil", TaskCategory.CLASSICAL_NLP, 7340)
    global_mmlu_all_tgl = Task("global_mmlu_all_tgl_mcf:_average", "acc_", "🌏 Global-MMLU", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 14_042)
    include_tgl_mcf = Task("include_tgl_mcf:_average", "acc_", "🌏 INCLUDE", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 500)
    kalahi_tgl_mcf = Task("kalahi_tgl_mcf", "acc_", "🌏 KALAHI", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 150)
    newsphnli_fil_mcf = Task("newsphnli_fil_mcf", "acc_", "📖 NewsPH NLI", "fil", TaskCategory.READING_COMPREHENSION, 90_000)
    ntrex128_fil = Task("ntrex128_fil", "rougeL", "🔢 NTREX-128", "fil", TaskCategory.TRANSLATION, 1997)
    readability_ceb_mcf = Task("readability_ceb_mcf", "acc_", "📖 Readability (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 350)
    sib200_ceb_mcf = Task("sib200_ceb_mcf", "acc_", "🏛️ SIB-200 (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 99)
    sib200_tgl_mcf = Task("sib200_tgl_mcf", "acc_", "🏛️ SIB-200 (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 99)
    # stingraybench_corr_tgl_mcf = Task("stingraybench_correctness_tgl_mcf", "acc_", "StingrayBench (Correctness)", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
    stingraybench_sem_appropriateness_tgl_mcf = Task("stingraybench_semantic_appropriateness_tgl_mcf", "acc_", "🌏 StingrayBench", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
    tatoeba_ceb = Task("tatoeba_ceb", "rougeL", "🔢 Tatoeba (ceb)", "ceb", TaskCategory.TRANSLATION, 377)
    tatoeba_tgl = Task("tatoeba_tgl", "rougeL", "🔢 Tatoeba (tgl)", "tgl", TaskCategory.TRANSLATION, 2499)
    tico19_tgl = Task("tico19_tgl", "rougeL", "🔢 TICO-19", "tgl", TaskCategory.TRANSLATION, 971)
    tlunifiedner_tgl_mcf = Task("tlunifiedner_tgl_mcf", "acc_", "🏛️ TLUnified NER", "tgl", TaskCategory.CLASSICAL_NLP, 1579)
    universalner_ceb_mcf = Task("universalner_ceb_mcf", "acc_", "🏛️ Universal NER (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 49)
    universalner_tgl_mcf = Task("universalner_tgl_mcf", "acc_", "🏛️ Universal NER (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 56)
    # fmt: on


### These classes define how the columns will be represented ###
@dataclass(frozen=True)  # frozen so instances can serve as defaults in make_dataclass below
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    aggregate: bool = False
    meta: bool = False


auto_eval_cols = [
    # fmt: off
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True, meta=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, meta=True)],
    ["precision", ColumnContent, ColumnContent("Precision", "str", False, meta=True)],
    ["param_size", ColumnContent, ColumnContent("# Parameters", "number", False, meta=True)],
    ["multilingual", ColumnContent, ColumnContent("Multilingual", "markdown", False, meta=True)],
    ["model_type", ColumnContent, ColumnContent("Model Type", "markdown", False, meta=True)],
    ["is_submission", ColumnContent, ColumnContent("Submission", "boolean", False, meta=True)],
    ["submission_date", ColumnContent, ColumnContent("Submission Date", "str", False, meta=True)],
    # fmt: on
]

for task in Tasks:
    auto_eval_cols.append(
        [task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
    )

for task_category in TaskCategory:
    auto_eval_cols.append(
        [
            task_category.name,
            ColumnContent,
            ColumnContent(task_category.value, "number", True, aggregate=True),
        ]
    )

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_cols, frozen=True)
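

# Illustrative sketch (not part of the pipeline): make_dataclass stores each
# ColumnContent as its field's default, so display metadata is read directly
# off the class, e.g. AutoEvalColumn.model.name == "Model".
def _demo_default_columns() -> list:
    # Collect the display names of the columns that are shown by default.
    return [col.name for col in fields(AutoEvalColumn) if col.displayed_by_default]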


### These classes define how a single model evaluation result will be represented ###
@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class Precision(Enum):
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown
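

# Illustrative sketch (hypothetical dtype strings, not part of the pipeline):
# from_str() normalizes both torch-style and plain dtype names, defaulting to
# Unknown for anything unrecognized.
def _demo_precision_parsing() -> None:
    assert Precision.from_str("torch.bfloat16") is Precision.bfloat16
    assert Precision.from_str("float16") is Precision.float16
    assert Precision.from_str("int8") is Precision.Unknown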


@dataclass
class EvalResult:
    """Represent one full model evaluation."""

    eval_name: str
    full_model: str
    org: str
    model: str
    results: dict
    average: float
    aggregate_results: dict
    precision: Precision = Precision.Unknown
    # Submission metadata
    is_submission: bool = False
    param_size: float = -1
    model_type: str = ModelType.UNKNOWN.value
    multilingual: str = Multilingual.UNKNOWN.value
    submission_date: str = ""
    model_url: str = "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard"

    @classmethod
    def init_from_dict(cls, data: dict, is_submission: bool = False) -> "EvalResult":
        """Populate results from a dictionary"""
        # For model details, use user-provided metadata if it's a submission
        config_key = "display_metadata" if is_submission else "config"
        config = data.get(config_key)
        precision = Precision.from_str(config.get("model_dtype"))
        org_and_model = (
            config.get("hf_id")
            if is_submission
            else config.get("model_name", config.get("model_args", None))
        )
        org_and_model = org_and_model.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        results = EvalResult.compute_scores_per_benchmark(data.get("results"))
        aggregate_results = EvalResult.compute_aggregate_results(results)
        filbench_score = np.mean(list(aggregate_results.values()))

        # Format all results
        if is_submission:
            # Use pre-computed scores and check if they match our computed scores
            category_scores = data.get("category_scores")
            aggregate_results_precomputed = {
                TaskCategory.CULTURAL_KNOWLEDGE.value: category_scores.get(
                    "CULTURAL_KNOWLEDGE"
                ),
                TaskCategory.CLASSICAL_NLP.value: category_scores.get("CLASSICAL_NLP"),
                TaskCategory.READING_COMPREHENSION.value: category_scores.get(
                    "READING_COMPREHENSION"
                ),
                TaskCategory.TRANSLATION.value: category_scores.get("GENERATION"),
            }
            is_similar = EvalResult.compare_category_scores(
                precomputed=aggregate_results_precomputed,
                computed=aggregate_results,
            )
            if not is_similar:
                logging.warning("Precomputed and computed category scores differ.")
                logging.info("Will use computed scores for display.")
            else:
                logging.info("Precomputed and computed category scores are similar.")
                aggregate_results = aggregate_results_precomputed

            # Do the same comparison for the FilBench score
            filbench_score_precomputed = data.get("filbench_score")
            is_filbench_score_similar = (
                abs(filbench_score_precomputed - filbench_score) < 1e-2
            )
            if not is_filbench_score_similar:
                logging.warning(
                    f"Precomputed filbench_score ({filbench_score_precomputed}) and"
                    f" official FilBench score ({filbench_score}) differ."
                )
            average = (
                filbench_score_precomputed
                if is_filbench_score_similar
                else filbench_score
            )

            display_metadata = data.get("display_metadata")
            return cls(
                eval_name=result_key,
                full_model=full_model,
                org=org,
                model=model,
                precision=precision,
                results=results,
                aggregate_results=aggregate_results,
                average=average,
                # Display Metadata
                is_submission=True,
                submission_date=display_metadata.get("submission_date", ""),
                param_size=display_metadata.get("num_params", -1),
                model_type=display_metadata.get("model_type", ModelType.UNKNOWN.value),
                multilingual=display_metadata.get(
                    "multilinguality", Multilingual.UNKNOWN.value
                ),
                model_url=display_metadata.get(
                    "url",
                    "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard",
                ),
            )
        else:
            return cls(
                eval_name=result_key,
                full_model=full_model,
                org=org,
                model=model,
                precision=precision,
                results=results,
                aggregate_results=aggregate_results,
                is_submission=False,
                average=filbench_score,
            )

    @classmethod
    def compute_scores_per_benchmark(cls, results: dict) -> dict[str, float]:
        scores_per_benchmark = {}
        for task in Tasks:
            task = task.value
            if results.get(task.benchmark):
                score = results.get(task.benchmark).get(task.metric)
                # Scores arrive as fractions; rescale accuracy and ROUGE-L to 0-100
                if "acc_" in task.metric:
                    score = score * 100.0
                if "rougeL" in task.metric:
                    score = score * 100.0
                scores_per_benchmark[task.benchmark] = score
            else:
                scores_per_benchmark[task.benchmark] = None
        return scores_per_benchmark
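
    # Illustrative sketch (hypothetical raw results, not part of the pipeline):
    # a fractional accuracy of 0.52 is rescaled to ~52.0 by the method above.
    @classmethod
    def _demo_score_scaling(cls) -> float:
        raw = {"kalahi_tgl_mcf": {"acc_": 0.52}}  # hypothetical raw results
        return cls.compute_scores_per_benchmark(raw)["kalahi_tgl_mcf"]  # ~52.0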

    @classmethod
    def compute_aggregate_results(cls, results: dict) -> dict[str, float]:
        aggregate_results = {}
        for task_category in TaskCategory:
            tasks = [
                task.value for task in Tasks if task.value.category == task_category
            ]
            total_category = sum(task.num_samples for task in tasks)
            weighted_total_category = 0
            for task in tasks:
                # Missing benchmarks contribute 0 to the sample-weighted average
                if results[task.benchmark]:
                    score = results[task.benchmark]
                else:
                    score = 0
                weighted_total_category += score * task.num_samples
            aggregate_results[task_category.value] = (
                weighted_total_category / total_category
            )
        return aggregate_results
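
    # Illustrative sketch (hypothetical numbers, not part of the pipeline):
    # the aggregation above is sample-weighted, so a 900-sample task at 80.0
    # and a 100-sample task at 60.0 average to
    # (80.0 * 900 + 60.0 * 100) / 1000 = 78.0, not the plain mean of 70.0.
    @staticmethod
    def _demo_weighted_average() -> float:
        scores_and_sizes = [(80.0, 900), (60.0, 100)]  # hypothetical
        total = sum(n for _, n in scores_and_sizes)
        return sum(s * n for s, n in scores_and_sizes) / total  # 78.0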

    @classmethod
    def compare_category_scores(
        cls, precomputed: dict, computed: dict, threshold: float = 1e-2
    ) -> bool:
        """Compares precomputed and computed category scores."""
        is_similar = True
        for key, precomputed_value in precomputed.items():
            computed_value = computed.get(key)
            if precomputed_value is not None and computed_value is not None:
                if abs(precomputed_value - computed_value) > threshold:
                    logging.warning(
                        f"Aggregate result for '{key}' differs"
                        f" (precomputed={precomputed_value}, computed={computed_value})"
                    )
                    is_similar = False
        return is_similar

    def to_dict(self):
        """Converts the EvalResult to a dict compatible with our dataframe display"""
        if not self.is_submission:
            model_details = model_registry.get(
                self.full_model,
                ModelSUT(
                    param_size=-1,
                    model_type=ModelType.UNKNOWN.value,
                    multilingual=Multilingual.UNKNOWN.value,
                ),
            )
        else:
            model_details = ModelSUT(
                param_size=self.param_size,
                model_type=self.model_type,
                multilingual=self.multilingual,
            )

        model_name_with_url = (
            make_clickable_model(self.full_model)
            if not self.is_submission
            else f"📥 {model_hyperlink(self.model_url, self.full_model)}"
        )

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model.name: model_name_with_url,
            AutoEvalColumn.average.name: self.average,
            AutoEvalColumn.param_size.name: model_details.param_size,
            AutoEvalColumn.model_type.name: model_details.model_type,
            AutoEvalColumn.multilingual.name: model_details.multilingual,
            AutoEvalColumn.is_submission.name: self.is_submission,
            AutoEvalColumn.submission_date.name: self.submission_date,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        for task_category in TaskCategory:
            data_dict[task_category.value] = self.aggregate_results[task_category.value]

        return data_dict
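

# Illustrative end-to-end sketch (hypothetical payload, not part of the
# pipeline): parse a non-submission results file with init_from_dict and
# render it for the leaderboard dataframe with to_dict.
def _demo_eval_result() -> dict:
    payload = {
        "config": {"model_name": "some-org/some-model", "model_dtype": "bfloat16"},
        "results": {"kalahi_tgl_mcf": {"acc_": 0.52}},  # hypothetical scores
    }
    result = EvalResult.init_from_dict(payload, is_submission=False)
    return result.to_dict()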