# Source code for maxent_grpo.core.evaluation

# Copyright 2025 Liv d'Aliberti
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""LightEval task registration and Slurm launch utilities.

This module provides helpers to:

- Define a compact string specification per benchmark task and register common
  tasks in a dictionary consumable by launchers.
- Compute the proper vLLM Slurm submission command and spawn evaluations as
  jobs using ``subprocess.run``.

It also exposes ``SUPPORTED_BENCHMARKS`` and convenience functions to list
registered tasks. vLLM launch on Slurm requires a specific environment
bootstrap (see ``VLLM_SLURM_PREFIX``) to source system profiles and set ``$HOME``.
"""

from __future__ import annotations

import base64
import os
import shlex
import subprocess
from typing import TYPE_CHECKING, Dict, List, Literal, TypeAlias

from .hub import get_gpu_count_for_vllm, get_param_count_from_repo_id


if TYPE_CHECKING:
    from trl import ModelConfig

    from ..config.grpo import GRPOConfig


# Launching vLLM from within a Slurm training job needs a clean environment:
# ``env -i`` starts bash without the inherited variables, and the shell snippet
# then re-sources the system profiles, restores $HOME and creates the log
# directory before handing over to ``sbatch``.
# - Reference code: https://github.com/huggingface/brrr/blob/c55ba3505686d690de24c7ace6487a5c1426c0fd/brrr/lighteval/one_job_runner.py#L105
# - Slack thread: https://huggingface.slack.com/archives/C043JTYE1MJ/p1726566494958269
user_home_directory: str = os.path.expanduser("~")
VLLM_SLURM_PREFIX: List[str] = [
    "env",
    "-i",
    "bash",
    "-c",
    # Shell statements joined with "; ". The last segment is deliberately left
    # open ("sbatch " with a trailing space) so that the sbatch arguments can
    # be appended to it.
    "; ".join(
        (
            "for f in /etc/profile.d/*.sh",
            "do source $f",
            "done",
            f"export HOME={user_home_directory}",
            "mkdir -p var/artifacts/logs",
            "sbatch ",
        )
    ),
]

# Type aliases for task configuration
TaskSpec: TypeAlias = str  # e.g. "lighteval|math_500|0|0"
TaskName: TypeAlias = str  # e.g. "math_500"
# Suite prefix that is placed first in every serialized task spec.
TaskSuite: TypeAlias = Literal["lighteval", "extended"]
# NOTE(review): BenchmarkKey is not referenced by the code visible here and its
# members do not match the task names registered below — confirm it is still needed.
BenchmarkKey: TypeAlias = Literal["bbh", "mt-bench", "TruthfulQA"]


def register_lighteval_task(
    configs: Dict[TaskName, TaskSpec],
    eval_suite: TaskSuite,
    task_name: TaskName,
    task_list: str,
    num_fewshot: int = 0,
) -> None:
    """Register a LightEval task configuration in ``configs``.

    - Core tasks table: https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/tasks_table.jsonl
    - Custom tasks should live under your project (e.g., ``tasks/`` or ``ops/``).

    :param configs: Mapping where the serialized task spec is stored; mutated
        in place with the new ``task_name`` entry.
    :type configs: dict[str, str]
    :param eval_suite: Suite prefix, e.g. ``"lighteval"`` or ``"extended"``.
        This is prepended to every task in ``task_list``.
    :type eval_suite: str
    :param task_name: Key under which the composed task specification is stored.
    :type task_name: str
    :param task_list: Comma-separated list of task identifiers without suite
        prefix. Whitespace around each identifier is ignored and empty entries
        (e.g. from a trailing comma) are skipped. Each remaining entry is
        expanded into ``{eval_suite}|{task}|{num_fewshot}|0``.
    :type task_list: str
    :param num_fewshot: Number of few-shot examples per task (defaults to zero).
    :type num_fewshot: int
    :returns: ``None``. ``configs`` is updated directly.
    :rtype: None
    :raises ValueError: If ``task_list`` contains no task identifier at all.
    """
    # Normalise the user-supplied list first: "a, b," must not produce specs
    # such as "suite| b|0|0" or "suite||0|0".
    tasks = [task.strip() for task in task_list.split(",")]
    tasks = [task for task in tasks if task]
    if not tasks:
        raise ValueError(
            f"task_list for {task_name!r} must contain at least one task identifier"
        )

    # Format task list in lighteval format
    configs[task_name] = ",".join(
        f"{eval_suite}|{task}|{num_fewshot}|0" for task in tasks
    )
# Registry mapping a benchmark name to its serialized LightEval task spec.
# The calls below populate it when the module is imported.
LIGHTEVAL_TASKS: Dict[TaskName, TaskSpec] = {}

# Tasks from the core "lighteval" suite.
register_lighteval_task(
    configs=LIGHTEVAL_TASKS,
    eval_suite="lighteval",
    task_name="math_500",
    task_list="math_500",
    num_fewshot=0,
)
register_lighteval_task(
    configs=LIGHTEVAL_TASKS,
    eval_suite="lighteval",
    task_name="aime24",
    task_list="aime24",
    num_fewshot=0,
)
register_lighteval_task(
    configs=LIGHTEVAL_TASKS,
    eval_suite="lighteval",
    task_name="aime25",
    task_list="aime25",
    num_fewshot=0,
)
register_lighteval_task(
    configs=LIGHTEVAL_TASKS,
    eval_suite="lighteval",
    task_name="gpqa",
    task_list="gpqa:diamond",
    num_fewshot=0,
)

# Tasks from the "extended" suite.
register_lighteval_task(
    configs=LIGHTEVAL_TASKS,
    eval_suite="extended",
    task_name="lcb",
    task_list="lcb:codegeneration",
    num_fewshot=0,
)
register_lighteval_task(
    configs=LIGHTEVAL_TASKS,
    eval_suite="extended",
    task_name="lcb_v4",
    task_list="lcb:codegeneration_v4",
    num_fewshot=0,
)
def get_lighteval_tasks() -> List[TaskName]:
    """Return the names of all registered LightEval tasks.

    :returns: A new list holding the keys of ``LIGHTEVAL_TASKS`` in
        registration order.
    :rtype: list[str]
    """
    # Unpacking a dict iterates its keys, so this copies the key set into a list.
    return [*LIGHTEVAL_TASKS]
# Task names registered at the time this statement runs (a list copy, so later
# registrations are not reflected here).
SUPPORTED_BENCHMARKS = get_lighteval_tasks()


def _build_slurm_gpu_flag(num_gpus: int) -> List[str]:
    """Return sbatch GPU flag(s) based on environment policy.

    The behaviour is controlled by the ``SLURM_GPU_FLAG_STYLE`` environment
    variable, compared case-insensitively:

    - ``"none"`` (default): do not add a GPU flag; rely on script headers.
    - ``"gpus"``: append ``--gpus={num_gpus}``.
    - ``"gres"``: append ``--gres=gpu:{num_gpus}``.

    Any other value is treated like ``"none"``.

    :param num_gpus: Requested number of GPUs for the evaluation job. This is
        used verbatim in the formatted flag.
    :type num_gpus: int
    :returns: List of flag strings suitable for ``sbatch`` (possibly empty).
    :rtype: list[str]
    """
    flag_templates = {
        "gpus": "--gpus={}",
        "gres": "--gres=gpu:{}",
    }
    requested_style = os.getenv("SLURM_GPU_FLAG_STYLE", "none").lower()
    template = flag_templates.get(requested_style)
    if template is None:
        return []
    return [template.format(num_gpus)]
def run_lighteval_job(
    benchmark: TaskName,
    training_args: "GRPOConfig",
    model_args: "ModelConfig",
) -> None:
    """Launch a LightEval job under Slurm with vLLM decoding.

    The job command is composed from ``VLLM_SLURM_PREFIX`` and a generated
    task list. For models with >=30B parameters the function enables tensor
    parallelism; otherwise it defaults to two GPUs to reduce cluster pressure.
    If ``system_prompt`` is provided it is base64-encoded to avoid quoting
    issues in the Slurm script.

    The Slurm script (``MAXENT_EVAL_SLURM_SCRIPT``, default
    ``ops/slurm/evaluate.slurm``) receives these positional arguments, in
    order: benchmark name, task list, model name, model revision,
    tensor-parallel flag, trust-remote-code flag and, optionally, the
    base64-encoded system prompt. Every argument is shell-quoted before it is
    spliced into the ``bash -c`` command string.

    :param benchmark: Registered benchmark key to execute.
    :type benchmark: str
    :param training_args: Training configuration providing Hub identifiers and
        the optional ``system_prompt`` for evaluation.
    :type training_args: GRPOConfig
    :param model_args: Model configuration; its ``trust_remote_code`` flag is
        forwarded to the Slurm script.
    :type model_args: ModelConfig
    :returns: ``None``. A subprocess is spawned for the Slurm submission.
    :rtype: None
    :raises KeyError: If ``benchmark`` is not present in ``LIGHTEVAL_TASKS``.
    :raises ValueError: If ``training_args.hub_model_id`` is unset or empty.
    :raises subprocess.CalledProcessError: If the submission command returns a
        non-zero exit status.
    """
    task_list = LIGHTEVAL_TASKS[benchmark]
    model_name = training_args.hub_model_id
    if not model_name:
        raise ValueError("hub_model_id must be set to run evaluation jobs")
    model_revision = training_args.hub_model_revision or "main"

    # Large models (>= 30B params) are sharded across GPUs to avoid OOM; the
    # GPU count computed here is only kept for that case.
    num_gpus = get_gpu_count_for_vllm(model_name, model_revision)
    if get_param_count_from_repo_id(model_name) >= 30_000_000_000:
        tensor_parallel = True
    else:
        num_gpus = 2  # Hack while cluster is full
        tensor_parallel = False

    eval_slurm_script = os.getenv(
        "MAXENT_EVAL_SLURM_SCRIPT", "ops/slurm/evaluate.slurm"
    )
    job_name = f"or1_{benchmark}_{model_name.split('/')[-1]}_{model_revision}"
    sbatch_args = [
        *_build_slurm_gpu_flag(num_gpus),
        f"--job-name={job_name}",
        eval_slurm_script,
        benchmark,
        task_list,
        model_name,
        model_revision,
        str(tensor_parallel),
        str(model_args.trust_remote_code),
    ]
    if training_args.system_prompt is not None:
        # Encode to base64 to avoid issues with special characters; the sbatch
        # script is expected to decode it.
        sbatch_args.append(
            base64.b64encode(training_args.system_prompt.encode()).decode()
        )

    # The last prefix element is a ``bash -c`` script ending in "sbatch ", so
    # the arguments pass through a shell. Quote each one so that "|" in the
    # task list, or spaces/metacharacters in a revision or path, reach sbatch
    # as a single literal argument instead of being interpreted by bash.
    cmd = VLLM_SLURM_PREFIX.copy()
    cmd[-1] += " " + shlex.join(sbatch_args)
    subprocess.run(cmd, check=True)
def run_benchmark_jobs(training_args: "GRPOConfig", model_args: "ModelConfig") -> None:
    """Launch one or more benchmarks as Slurm jobs.

    When the CLI requests ``benchmarks=["all"]`` the function expands this
    into every registered LightEval task. Each benchmark is delegated to
    ``run_lighteval_job`` with the provided arguments.

    All requested names are validated before the first job is submitted, so an
    unknown name never leaves a partially submitted batch behind.

    :param training_args: Training configuration whose ``benchmarks`` field
        enumerates the tasks to run (or the sentinel ``"all"``).
    :type training_args: GRPOConfig
    :param model_args: Model configuration forwarded to ``run_lighteval_job``.
    :type model_args: ModelConfig
    :returns: ``None``. Jobs are submitted sequentially.
    :rtype: None
    :raises ValueError: If an unknown benchmark name is supplied; raised
        before any job is launched.
    """
    registered = get_lighteval_tasks()
    benchmarks = training_args.benchmarks
    if len(benchmarks) == 1 and benchmarks[0] == "all":
        benchmarks = registered

    # Fail fast: check every name up front instead of discovering a typo only
    # after earlier benchmarks have already been queued on the cluster.
    for benchmark in benchmarks:
        if benchmark not in registered:
            raise ValueError(f"Unknown benchmark {benchmark}")

    # Evaluate on all supported benchmarks. Later we may want to include a `chat` option
    # that just evaluates on `ifeval` and `mt_bench` etc.
    for benchmark in benchmarks:
        print(f"Launching benchmark `{benchmark}`")
        run_lighteval_job(benchmark, training_args, model_args)