Index A | B | C | D | E | F | G | H | I | K | L | M | N | O | P | Q | R | S | T | U | V | W | Z A Accelerator (class in maxent_grpo.training.types.runtime) accelerator (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) accumulate_metrics() (in module maxent_grpo.training.metrics) accuracy_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) advantage (maxent_grpo.training.types.rewards.RewardComputation attribute) advantage_count (maxent_grpo.training.types.logging.RewardLoggingView attribute) advantage_entropy (maxent_grpo.rewards.maxent.WeightStats attribute) (maxent_grpo.training.weighting.types.WeightStats attribute) advantage_entropy_mean (maxent_grpo.rewards.maxent.WeightLoggingView attribute) (maxent_grpo.training.weighting.types.WeightLoggingView attribute) advantage_entropy_std (maxent_grpo.rewards.maxent.WeightLoggingView attribute) (maxent_grpo.training.weighting.types.WeightLoggingView attribute) advantage_mean (maxent_grpo.training.types.logging.RewardLoggingView attribute) advantage_samples (maxent_grpo.training.types.rewards.RewardComputation property) advantage_scale_max (maxent_grpo.training.types.logging.RewardLoggingView attribute) advantage_scale_mean (maxent_grpo.training.types.logging.RewardLoggingView attribute) advantage_scale_min (maxent_grpo.training.types.logging.RewardLoggingView attribute) advantage_std (maxent_grpo.training.types.logging.RewardLoggingView attribute) AdvantageStats (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.rewards) AggregatedGenerationState (class in maxent_grpo.training.generation) (class in maxent_grpo.training.generation.common) (class in maxent_grpo.training.generation.helpers) allow_empty_weight_fallback (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) allow_stale_reference_logprobs (maxent_grpo.training.types.runtime.ScoringSettings attribute) analytic_steps (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) AnalyticControllerObjective (class in maxent_grpo.training) (class in maxent_grpo.training.controller_objective) answers (maxent_grpo.training.types.GenerationBatch attribute) (maxent_grpo.training.types.rewards.GenerationBatch attribute) append_completion_group() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) append_eval_prompt_suffix() (in module maxent_grpo.training.runtime.prompts) append_prompt_suffix() (in module maxent_grpo.training.runtime.prompts) apply_chat_template() (maxent_grpo.training.runtime.ChatTokenizer method) (maxent_grpo.training.runtime.prompts.ChatTokenizer method) apply_gradients() (maxent_grpo.training.controller_optimizer.ControllerMetaManager method) apply_learning_rate() (in module maxent_grpo.training.optim) apply_meta_controller_update() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) apply_state() (maxent_grpo.training.weighting.ControllerMetaSettings method) (maxent_grpo.training.weighting.types.ControllerMetaSettings method) apply_to_weighting() (maxent_grpo.training.weighting.ControllerStateSnapshot method) (maxent_grpo.training.weighting.types.ControllerStateSnapshot method) as_dict() (maxent_grpo.training.rollout.context.GenerationContext method) (maxent_grpo.training.rollout.GenerationContext method) (maxent_grpo.training.rollout.generator.GenerationContext method) (maxent_grpo.training.rollout.helpers.GenerationContext method) (maxent_grpo.training.types.logging.LogStepArtifacts method) (maxent_grpo.training.types.LogStepArtifacts method) attempt (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) attention_mask (maxent_grpo.training.types.rewards.PromptCacheEntry attribute) avg_completion_tokens (maxent_grpo.training.types.logging.TokenUsageStats attribute) (maxent_grpo.training.types.logging.TrainingScalarStats property) (maxent_grpo.training.types.rewards.ReferenceLogprobs attribute) (maxent_grpo.training.types.TokenUsageStats attribute) (maxent_grpo.training.types.TrainingScalarStats property) B backfill_local (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) backfill_missing() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) backoff (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) backoff_multiplier (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) backprop_fn (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) base_optimizer (maxent_grpo.training.types.runtime.OptimizerHandles attribute) baseline (maxent_grpo.cli.hydra_cli.HydraRootConfig attribute) baseline_entry() (in module maxent_grpo.cli.hydra_cli) BaselineCommand (class in maxent_grpo.cli.hydra_cli) batch_size (maxent_grpo.training.types.runtime.EvaluationSettings attribute) batch_stats (maxent_grpo.training.pipeline.PreparedBatch attribute) BatchDiagnostics (class in maxent_grpo.training.types.rewards) batching (maxent_grpo.training.types.runtime.ScoringSettings attribute) BatchingSettings (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.runtime) behavior_logp_sum (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) behavior_logprobs_source (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.types.runtime.ScoringSettings attribute) benchmarks (maxent_grpo.config.grpo.GRPOConfig attribute) best_of (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) beta (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.rewards.maxent.WeightingConfigLike attribute) (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.WeightingConfigLike attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) beta_grad (maxent_grpo.training.controller_objective.ControllerGradients attribute) beta_grad_clip (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) beta_learning_rate (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) beta_tensor() (maxent_grpo.training.weighting.types.TorchControllerState method) binary_code_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) boxed_accuracy_reward_math() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) broadcast_controller_state() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) broadcast_object_list() (in module maxent_grpo.training.rollout.vllm_adapter) build_checkpoint_saver() (in module maxent_grpo.training.state) build_controller_objective() (in module maxent_grpo.training.controller_objective) build_custom_grpo_trainer() (in module maxent_grpo.training.trl_trainer) build_live_seed_paper_eval_command() (in module maxent_grpo.training.seed_paper_eval_callback) build_optimization_handles() (in module maxent_grpo.training.optim) build_score_batch() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_batching) build_sequence_scores() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_logprob) build_training_metrics_dict() (in module maxent_grpo.training.metrics) build_training_state() (in module maxent_grpo.training.state) build_uniform_weight_stats() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) build_weighting_settings() (in module maxent_grpo.training.weighting.logic) C cached_import() (in module maxent_grpo.utils.imports) callbacks (maxent_grpo.config.grpo.GRPOConfig attribute) chat_template (maxent_grpo.config.grpo.GRPOConfig attribute) ChatMessage (class in maxent_grpo.core.model) ChatTokenizer (class in maxent_grpo.training.runtime) (class in maxent_grpo.training.runtime.prompts) check_hub_revision_exists() (in module maxent_grpo.core.hub) check_stop_condition() (in module maxent_grpo.training.state) checkpoint_state_ref (maxent_grpo.training.types.logging.LoggingHandles attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) classify_vllm_startup_log() (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.ops) (in module maxent_grpo.training.runtime.ops.vllm_startup) cli() (in module maxent_grpo.grpo) clip_adv_baseline (maxent_grpo.training.types.runtime.ClipSettings attribute) clip_delta (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.types.runtime.ClipSettings attribute) clip_grad_norm_local() (in module maxent_grpo.training.optim) clip_loss (maxent_grpo.training.types.LossScalarBundle attribute) (maxent_grpo.training.types.rewards.LossScalarBundle attribute) clip_loss_scalar (maxent_grpo.training.types.rewards.LossOutputs property) clip_objective_coef (maxent_grpo.training.types.runtime.ClipSettings attribute) clip_range (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.types.runtime.ClipSettings attribute) clip_range_high (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.types.runtime.ClipSettings attribute) clip_ratio (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) clip_ratio_high_max (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) clip_ratio_high_mean (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) clip_ratio_low_mean (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) clip_ratio_low_min (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) clip_ratio_region_mean (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) clipped_ratio (maxent_grpo.training.types.rewards.LengthStats attribute) clipping (maxent_grpo.training.types.logging.LoggingConfigView attribute) (maxent_grpo.training.types.LoggingConfigView attribute) (maxent_grpo.training.types.runtime.ScoringSettings attribute) ClipSettings (class in maxent_grpo.training.types.runtime) coalesce_grouped_outputs() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin static method) collect_weight_entropy() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) ColocateVLLMClient (class in maxent_grpo.training.rollout.vllm_colocate) ColocateVLLMEngine (class in maxent_grpo.training.rollout.vllm_colocate) columns (maxent_grpo.config.dataset.DatasetConfig attribute) command (maxent_grpo.cli.hydra_cli.HydraRootConfig attribute) completion_attention_mask (maxent_grpo.training.types.rewards.ScoreBatch attribute) completion_ids (maxent_grpo.training.types.rewards.ScoreBatch attribute) completion_metadata (maxent_grpo.training.types.rewards.RewardComputation attribute) CompletionGenerator (class in maxent_grpo.training.rollout) (class in maxent_grpo.training.rollout.generator) (class in maxent_grpo.training.rollout.helpers) completions (maxent_grpo.training.generation.AggregatedGenerationState attribute) (maxent_grpo.training.generation.common.AggregatedGenerationState attribute) (maxent_grpo.training.generation.helpers.AggregatedGenerationState attribute) (maxent_grpo.training.types.PromptCompletionBatch attribute) (maxent_grpo.training.types.rewards.PromptCompletionBatch attribute) CompletionTensors (class in maxent_grpo.training.scoring) (class in maxent_grpo.training.scoring_batching) compute() (maxent_grpo.training.AnalyticControllerObjective method) (maxent_grpo.training.controller_objective.AnalyticControllerObjective method) (maxent_grpo.training.controller_objective.ControllerObjective method) (maxent_grpo.training.controller_objective.TruncatedBackpropControllerObjective method) (maxent_grpo.training.TruncatedBackpropControllerObjective method) compute_reward_statistics() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.rewards) compute_reward_totals() (in module maxent_grpo.rewards.maxent) compute_weight_stats() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) config (maxent_grpo.config.dataset.DatasetConfig attribute) (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) ConfigDict() (in module maxent_grpo.config.recipes) configure_accumulation_steps() (in module maxent_grpo.training.optim) content (maxent_grpo.core.model.ChatMessage attribute) controller (maxent_grpo.training.types.LoopSettings attribute) (maxent_grpo.training.types.runtime.LoopSettings attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext property) controller_meta (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) controller_meta_analytic_steps (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_beta_grad_clip (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_beta_lr (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_lr (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_manager (maxent_grpo.training.types.LoopSettings attribute) (maxent_grpo.training.types.runtime.LoopSettings attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext property) controller_meta_method (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_objective (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_optimizer (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_tau_lr (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_truncation_steps (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_update_interval (maxent_grpo.config.grpo.GRPOConfig attribute) controller_meta_use_hessian (maxent_grpo.config.grpo.GRPOConfig attribute) controller_objective (maxent_grpo.training.types.LoopSettings attribute) (maxent_grpo.training.types.runtime.LoopSettings attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext property) controller_overwrite_from_config (maxent_grpo.config.grpo.GRPOConfig attribute) controller_state (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) controller_state_dict() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) ControllerGradients (class in maxent_grpo.training.controller_objective) ControllerMetaContext (class in maxent_grpo.training.controller_objective) ControllerMetaManager (class in maxent_grpo.training.controller_optimizer) ControllerMetaSettings (class in maxent_grpo.training.weighting) (class in maxent_grpo.training.weighting.types) ControllerObjective (class in maxent_grpo.training.controller_objective) ControllerPaths (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.runtime) ControllerStateSnapshot (class in maxent_grpo.training.weighting) (class in maxent_grpo.training.weighting.types) copy_with() (maxent_grpo.training.generation.errors.ServiceErrorPayload method) CORE_ENGINE_STALL (maxent_grpo.training.runtime.ops.StartupStatus attribute) (maxent_grpo.training.runtime.ops.vllm_startup.StartupStatus attribute) cosine_max_len (maxent_grpo.config.grpo.GRPOScriptArguments attribute) cosine_max_value_correct (maxent_grpo.config.grpo.GRPOScriptArguments attribute) cosine_max_value_wrong (maxent_grpo.config.grpo.GRPOScriptArguments attribute) cosine_min_value_correct (maxent_grpo.config.grpo.GRPOScriptArguments attribute) cosine_min_value_wrong (maxent_grpo.config.grpo.GRPOScriptArguments attribute) ctx (maxent_grpo.training.generation.vllm_distributed.VLLMDistributedMixin attribute) (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin attribute) (maxent_grpo.training.generation.vllm_weight_sync.VLLMWeightSyncMixin attribute) (maxent_grpo.training.rollout.local.LocalGenerationMixin attribute) (maxent_grpo.training.rollout.vllm_adapter.VLLMGenerationMixin attribute) cur_logp_sum (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) current_lr (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) D dataloader_num_workers (maxent_grpo.config.grpo.GRPOConfig attribute) dataloader_persistent_workers (maxent_grpo.config.grpo.GRPOConfig attribute) dataloader_pin_memory (maxent_grpo.config.grpo.GRPOConfig attribute) dataloader_prefetch_factor (maxent_grpo.config.grpo.GRPOConfig attribute) dataset_config (maxent_grpo.config.dataset.ScriptArguments attribute) dataset_mixture (maxent_grpo.config.dataset.ScriptArguments attribute) dataset_name (maxent_grpo.config.dataset.ScriptArguments attribute) dataset_prompt_column (maxent_grpo.config.grpo.GRPOScriptArguments attribute) dataset_solution_column (maxent_grpo.config.grpo.GRPOScriptArguments attribute) DatasetConfig (class in maxent_grpo.config.dataset) DatasetMixtureConfig (class in maxent_grpo.config.dataset) datasets (maxent_grpo.config.dataset.DatasetMixtureConfig attribute) decode() (maxent_grpo.training.patches.vllm.TokenizerLike method) DeepspeedState (class in maxent_grpo.training.optim) denom (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightNormalizationSettings attribute) (maxent_grpo.training.weighting.WeightNormalizationSettings attribute) denom_tok_tensor (maxent_grpo.training.types.rewards.LossOutputs attribute) (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) describe() (maxent_grpo.training.rollout.generator.CompletionGenerator method) (maxent_grpo.training.rollout.local.LocalGenerationMixin method) detect_deepspeed_state() (in module maxent_grpo.training.optim) determine_retry_limit() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) device (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) diagnostics (maxent_grpo.training.types.logging.LogStepArtifacts attribute) (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) (maxent_grpo.training.types.LogStepArtifacts attribute) DictConfig (in module maxent_grpo.cli.hydra_cli) disable_distributed_sampler (maxent_grpo.config.grpo.GRPOConfig attribute) disable_grad() (maxent_grpo.training.weighting.types.TorchControllerState method) diversity_metrics (maxent_grpo.training.pipeline.PreparedBatch attribute) (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) dr_grpo_denominator_mode (maxent_grpo.config.grpo.GRPOConfig attribute) drop_empty_prompt_groups() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) drop_incomplete_prompt_groups() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) E enable_grad() (maxent_grpo.training.weighting.types.TorchControllerState method) enabled (maxent_grpo.training.types.runtime.EvaluationSettings attribute) (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) endpoint (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) ensure_hf_repo_ready() (in module maxent_grpo.core.hub) ensure_ready() (maxent_grpo.training.rollout.vllm_colocate.ColocateVLLMClient method) (maxent_grpo.training.rollout.vllm_colocate.ColocateVLLMEngine method) ensure_real_dependencies() (in module maxent_grpo.utils.deps_guard) ensure_weighting_logging() (in module maxent_grpo.training.telemetry.trl_logging) entropy (maxent_grpo.rewards.maxent.WeightLoggingView attribute) (maxent_grpo.training.weighting.types.WeightLoggingView attribute) entropy_bonus_coef (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) entropy_bonus_reward_std (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) entropy_bonus_scale (maxent_grpo.training.types.rewards.RewardComputation attribute) entropy_max (maxent_grpo.rewards.maxent.WeightLoggingView attribute) (maxent_grpo.training.weighting.types.WeightLoggingView attribute) entropy_min (maxent_grpo.rewards.maxent.WeightLoggingView attribute) (maxent_grpo.training.weighting.types.WeightLoggingView attribute) entropy_norm (maxent_grpo.rewards.maxent.WeightLoggingView attribute) (maxent_grpo.training.weighting.types.WeightLoggingView attribute) entropy_value() (maxent_grpo.training.controller_objective.ControllerMetaContext method) eos_token_id (maxent_grpo.training.runtime.ChatTokenizer property) (maxent_grpo.training.runtime.prompts.ChatTokenizer property) epoch_progress (maxent_grpo.training.types.logging.LogStepArtifacts attribute) (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.LogStepArtifacts attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) epoch_progress() (in module maxent_grpo.training.optim) epsilon (maxent_grpo.training.weighting.QDistributionSettings attribute) (maxent_grpo.training.weighting.types.QDistributionSettings attribute) ERROR (maxent_grpo.training.runtime.ops.StartupStatus attribute) (maxent_grpo.training.runtime.ops.vllm_startup.StartupStatus attribute) eval_before_train (maxent_grpo.config.grpo.GRPOConfig attribute) eval_dataset_config (maxent_grpo.config.grpo.GRPOScriptArguments attribute) eval_dataset_name (maxent_grpo.config.grpo.GRPOScriptArguments attribute) eval_dataset_prompt_column (maxent_grpo.config.grpo.GRPOScriptArguments attribute) eval_dataset_solution_column (maxent_grpo.config.grpo.GRPOScriptArguments attribute) eval_dataset_split (maxent_grpo.config.grpo.GRPOScriptArguments attribute) eval_greedy_only_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) eval_reward (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) eval_reward_funcs (maxent_grpo.config.grpo.GRPOScriptArguments attribute) eval_reward_weights (maxent_grpo.config.grpo.GRPOScriptArguments attribute) eval_strategy (maxent_grpo.config.grpo.GRPOConfig attribute) evaluation (maxent_grpo.training.types.LoopSettings attribute) (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.LoopSettings attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext property) EvaluationSettings (class in maxent_grpo.training.types.runtime) every_n_steps (maxent_grpo.training.types.runtime.EvaluationSettings attribute) exception_message (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) exception_type (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) expand_dedup_results() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin static method) extra (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) F final_model_save_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) finalize_reference_stats() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_reference) flat_weights (maxent_grpo.rewards.maxent.WeightStats attribute) (maxent_grpo.training.weighting.types.WeightStats attribute) flatten_prompt_completions() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.helpers) flatten_ref_metadata() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) flush() (maxent_grpo.training.rollout.vllm_colocate.ColocateVLLMClient method) (maxent_grpo.training.types.logging.MetricWriter method) (maxent_grpo.training.types.MetricWriter method) flush_metric_averages() (in module maxent_grpo.training.metrics) flush_metrics() (maxent_grpo.training.types.logging.LoggingHandles method) format_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) frac_zero_std (maxent_grpo.training.types.logging.RewardLoggingView attribute) frequency_penalty (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) from_dict() (maxent_grpo.training.weighting.ControllerStateSnapshot class method) (maxent_grpo.training.weighting.types.ControllerStateSnapshot class method) from_weighting() (maxent_grpo.training.weighting.ControllerStateSnapshot class method) (maxent_grpo.training.weighting.types.ControllerStateSnapshot class method) fsdp_cls (maxent_grpo.training.generation.vllm.VLLMGenerationHelper property) (maxent_grpo.training.generation.vllm_helper.VLLMGenerationHelper property) (maxent_grpo.training.rollout.helpers.VLLMGenerationHelper property) G gather_object_list() (in module maxent_grpo.training.rollout.vllm_adapter) gather_reference_logprobs() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_reference) gen_best_of (maxent_grpo.training.runtime.GenerationPenaltyPassthroughMixin property) (maxent_grpo.training.runtime.prompts.GenerationPenaltyConfig attribute) (maxent_grpo.training.runtime.prompts.GenerationPenaltyPassthroughMixin property) gen_frequency_penalty (maxent_grpo.training.runtime.GenerationPenaltyPassthroughMixin property) (maxent_grpo.training.runtime.prompts.GenerationPenaltyConfig attribute) (maxent_grpo.training.runtime.prompts.GenerationPenaltyPassthroughMixin property) gen_presence_penalty (maxent_grpo.training.runtime.GenerationPenaltyPassthroughMixin property) (maxent_grpo.training.runtime.prompts.GenerationPenaltyConfig attribute) (maxent_grpo.training.runtime.prompts.GenerationPenaltyPassthroughMixin property) gen_stop_sequences (maxent_grpo.training.runtime.GenerationPenaltyPassthroughMixin property) (maxent_grpo.training.runtime.prompts.GenerationPenaltyConfig attribute) (maxent_grpo.training.runtime.prompts.GenerationPenaltyPassthroughMixin property) gen_temperature (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig attribute) gen_top_k (maxent_grpo.training.runtime.GenerationPenaltyPassthroughMixin property) (maxent_grpo.training.runtime.prompts.GenerationPenaltyConfig attribute) (maxent_grpo.training.runtime.prompts.GenerationPenaltyPassthroughMixin property) gen_top_p (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig attribute) generate() (maxent_grpo.training.generation.vllm.VLLMGenerationHelper method) (maxent_grpo.training.generation.vllm_helper.VLLMGenerationHelper method) (maxent_grpo.training.rollout.generator.CompletionGenerator method) (maxent_grpo.training.rollout.helpers.VLLMGenerationHelper method) (maxent_grpo.training.rollout.vllm_adapter.VLLMGenerationMixin method) generate_collective() (maxent_grpo.training.generation.vllm.VLLMGenerationHelper method) (maxent_grpo.training.generation.vllm_helper.VLLMGenerationHelper method) (maxent_grpo.training.rollout.helpers.VLLMGenerationHelper method) generation (maxent_grpo.training.types.LoopSettings attribute) (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.LoopSettings attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext property) generation_stats (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.types.runtime.GenerationSettings attribute) GenerationBatch (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.rewards) GenerationContext (class in maxent_grpo.training.rollout) (class in maxent_grpo.training.rollout.context) (class in maxent_grpo.training.rollout.generator) (class in maxent_grpo.training.rollout.helpers) GenerationFn (class in maxent_grpo.training.types.runtime) GenerationLogprobEntry (in module maxent_grpo.training.patches.vllm) GenerationPenaltyConfig (class in maxent_grpo.training.runtime.prompts) GenerationPenaltyPassthroughMixin (class in maxent_grpo.training.runtime) (class in maxent_grpo.training.runtime.prompts) GenerationSamplingConfig (class in maxent_grpo.training.runtime) (class in maxent_grpo.training.runtime.config) (class in maxent_grpo.training.runtime.setup) GenerationServiceError GenerationSettings (class in maxent_grpo.training.types.runtime) generator (maxent_grpo.training.types.rewards.ValidationContext attribute) get_code_format_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) get_cosine_scaled_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) get_dataset() (in module maxent_grpo.core.data) get_gpu_count_for_vllm() (in module maxent_grpo.core.hub) get_kbit_device_map() (in module maxent_grpo.core.model) get_lighteval_tasks() (in module maxent_grpo.core.evaluation) get_missing_boxed_answer_penalty_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) get_model() (in module maxent_grpo.core.model) get_param_count_from_repo_id() (in module maxent_grpo.core.hub) get_process_log_level() (maxent_grpo.config.grpo.GRPOConfig method) get_quantization_config() (in module maxent_grpo.core.model) get_ref_model (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) get_repetition_penalty_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) get_reward_funcs() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) (in module maxent_grpo.training.baseline) get_tokenizer() (in module maxent_grpo.core.model) get_trl_prepare_deepspeed() (in module maxent_grpo.training.run_helpers) (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.deepspeed) (in module maxent_grpo.training.runtime.deps) (in module maxent_grpo.training.runtime.setup) global_step (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) (maxent_grpo.training.types.logging.MetricState attribute) grad_accum_steps (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) grad_norm_scalar (maxent_grpo.training.types.logging.LogStepArtifacts attribute) (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.LogStepArtifacts attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) greedy_eval_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) group_advantages() (in module maxent_grpo.rewards.maxent) grouped (maxent_grpo.training.types.AdvantageStats attribute) (maxent_grpo.training.types.QDistribution attribute) (maxent_grpo.training.types.rewards.AdvantageStats attribute) (maxent_grpo.training.types.rewards.QDistribution attribute) grouped_completion_info (maxent_grpo.training.types.GenerationBatch attribute) (maxent_grpo.training.types.rewards.GenerationBatch attribute) grouped_completions (maxent_grpo.training.pipeline.PreparedBatch attribute) (maxent_grpo.training.types.GenerationBatch attribute) (maxent_grpo.training.types.rewards.GenerationBatch attribute) grouped_ref_meta (maxent_grpo.training.types.GenerationBatch attribute) (maxent_grpo.training.types.rewards.GenerationBatch attribute) grpo_beta_controller_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) grpo_loss_type (maxent_grpo.config.grpo.GRPOConfig attribute) GRPOConfig (class in maxent_grpo.config.grpo) GRPOScriptArguments (class in maxent_grpo.config.grpo) guided_json (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) guided_regex (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) H handles (maxent_grpo.training.types.OptimizationSettings attribute) (maxent_grpo.training.types.runtime.OptimizationSettings attribute) has_updates() (maxent_grpo.training.controller_objective.ControllerGradients method) HEALTHY (maxent_grpo.training.runtime.ops.StartupStatus attribute) (maxent_grpo.training.runtime.ops.vllm_startup.StartupStatus attribute) horizon (maxent_grpo.training.weighting.KlControllerSettings attribute) (maxent_grpo.training.weighting.types.KlControllerSettings attribute) hub_model_revision (maxent_grpo.config.grpo.GRPOConfig attribute) hydra_entry() (in module maxent_grpo.cli.hydra_cli) hydra_main() (in module maxent_grpo.cli.hydra_cli) HydraRootConfig (class in maxent_grpo.cli.hydra_cli) I id (maxent_grpo.config.dataset.DatasetConfig attribute) ids (maxent_grpo.training.scoring.CompletionTensors attribute) (maxent_grpo.training.scoring_batching.CompletionTensors attribute) import_vllm_client_cls() (in module maxent_grpo.training.generation.vllm_utils) include_stop_str_in_output (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) init_from_checkpoint (maxent_grpo.config.grpo.GRPOConfig attribute) init_vllm_client_communicator() (in module maxent_grpo.training.generation.vllm_utils) init_wandb_training() (in module maxent_grpo.training.telemetry.wandb) input_ids (maxent_grpo.training.types.rewards.PromptCacheEntry attribute) iter_batch_slices() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_batching) iter_batch_slices_trl() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_batching) K kl_controller (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) kl_ctl_step_size (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) kl_horizon (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) kl_loss (maxent_grpo.training.types.LossScalarBundle attribute) (maxent_grpo.training.types.rewards.LossScalarBundle attribute) kl_loss_scalar (maxent_grpo.training.types.rewards.LossOutputs property) kl_metric() (maxent_grpo.training.controller_objective.ControllerMetaContext method) kl_per_token_by_len_bucket (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) kl_target (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) kl_token_count_by_len_bucket (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) kl_value (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) (maxent_grpo.training.types.rewards.BatchDiagnostics attribute) KlControllerSettings (class in maxent_grpo.training.weighting) (class in maxent_grpo.training.weighting.types) L last_beta_grad (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) last_tau_grad (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) last_vllm_synced_step (maxent_grpo.training.generation.vllm.VLLMGenerationHelper property) (maxent_grpo.training.generation.vllm_helper.VLLMGenerationHelper property) (maxent_grpo.training.rollout.helpers.VLLMGenerationHelper property) learning_rate (maxent_grpo.training.types.runtime.OptimizerHandles attribute) (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) len_norm_ref (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightNormalizationSettings attribute) (maxent_grpo.training.weighting.WeightNormalizationSettings attribute) len_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) length (maxent_grpo.training.types.rewards.PromptCacheEntry property) length_normalize_ref (maxent_grpo.training.runtime.config.MaxEntOptions attribute) (maxent_grpo.training.runtime.MaxEntOptions attribute) (maxent_grpo.training.runtime.setup.MaxEntOptions attribute) length_stats (maxent_grpo.training.pipeline.PreparedBatch property) (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) LengthStats (class in maxent_grpo.training.types.rewards) load_controller_state() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) load_controller_state_chain() (in module maxent_grpo.training.state) load_dataset_split() (in module maxent_grpo.core.data) load_datasets() (in module maxent_grpo.training.data) load_eval_reward_functions() (in module maxent_grpo.training.rewards) load_grpo_recipe() (in module maxent_grpo.config) (in module maxent_grpo.config.recipes) load_reward_functions() (in module maxent_grpo.training.rewards) load_trainer_state_metadata() (in module maxent_grpo.training.state) LocalGenerationMixin (class in maxent_grpo.training.rollout.local) log() (maxent_grpo.training.types.logging.MetricWriter method) (maxent_grpo.training.types.MetricWriter method) log_completions (maxent_grpo.config.grpo.GRPOConfig attribute) log_generation_service_error() (in module maxent_grpo.training.generation.errors) log_level (maxent_grpo.config.grpo.GRPOConfig attribute) log_like_grpo (maxent_grpo.config.grpo.GRPOConfig attribute) log_local_step() (in module maxent_grpo.training.metrics) log_metrics() (maxent_grpo.training.types.logging.LoggingHandles method) log_ratio_train (maxent_grpo.training.types.rewards.LossOutputs attribute) (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) log_run_header() (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.logging) log_training_metrics() (in module maxent_grpo.training.metrics) log_training_step() (in module maxent_grpo.training.metrics) logging (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) LoggingConfigView (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.logging) LoggingHandles (class in maxent_grpo.training.types.logging) logit_bias (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) logprob_chunk_size (maxent_grpo.training.types.BatchingSettings attribute) (maxent_grpo.training.types.runtime.BatchingSettings attribute) logprob_sum (maxent_grpo.training.patches.vllm.VLLMLogprobResult attribute) LogStepArtifacts (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.logging) LoopSettings (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.runtime) loss (maxent_grpo.training.types.rewards.LossOutputs attribute) loss_outputs (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) (maxent_grpo.training.types.logging.LogStepArtifacts attribute) (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) (maxent_grpo.training.types.LogStepArtifacts attribute) LossOutputs (class in maxent_grpo.training.types.rewards) LossScalarBundle (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.rewards) lr_scale (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) lr_scheduler (maxent_grpo.training.types.runtime.OptimizerHandles attribute) lr_scheduler_type (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) M main() (in module maxent_grpo) (in module maxent_grpo.grpo) (in module maxent_grpo.training.runtime.ops.vllm_startup) make_backprop_fn() (maxent_grpo.training.controller_optimizer.ControllerMetaManager method) mask (maxent_grpo.training.scoring.CompletionTensors attribute) (maxent_grpo.training.scoring_batching.CompletionTensors attribute) max_attempts (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) max_completion_len (maxent_grpo.config.grpo.GRPOScriptArguments attribute) (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig attribute) max_grad_norm (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) max_length (maxent_grpo.training.types.rewards.LengthStats attribute) max_prompt_len (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig attribute) (maxent_grpo.training.types.rewards.ScoreBatch attribute) max_retries (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) max_terminated (maxent_grpo.training.types.rewards.LengthStats attribute) maxent (maxent_grpo.cli.hydra_cli.HydraRootConfig attribute) maxent_allow_empty_weight_fallback (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_allow_stale_reference_logprobs (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha_disable_outside_trust_zone (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha_kl_gain (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha_kl_max_multiplier (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha_kl_min_multiplier (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha_kl_threshold (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha_lower_on_high_kl (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_alpha_raise_on_low_kl (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_beta_controller_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_clip_adv_baseline (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_clip_objective_coef (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_clip_range (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_grpo module maxent_grpo.cli module maxent_grpo.cli.config_validation module maxent_grpo.cli.hydra_cli module maxent_grpo.config module maxent_grpo.config.dataset module maxent_grpo.config.grpo module maxent_grpo.config.recipes module maxent_grpo.core module maxent_grpo.core.data module maxent_grpo.core.evaluation module maxent_grpo.core.hub module maxent_grpo.core.model module maxent_grpo.grpo module maxent_grpo.rewards module maxent_grpo.rewards.basic module maxent_grpo.rewards.maxent module maxent_grpo.training module maxent_grpo.training.baseline module maxent_grpo.training.cli module maxent_grpo.training.cli.trl module maxent_grpo.training.controller_objective module maxent_grpo.training.controller_optimizer module maxent_grpo.training.data module maxent_grpo.training.eval module maxent_grpo.training.generation module maxent_grpo.training.generation.common module maxent_grpo.training.generation.errors module maxent_grpo.training.generation.helpers module maxent_grpo.training.generation.vllm module maxent_grpo.training.generation.vllm_distributed module maxent_grpo.training.generation.vllm_helper module maxent_grpo.training.generation.vllm_requests module maxent_grpo.training.generation.vllm_state module maxent_grpo.training.generation.vllm_utils module maxent_grpo.training.generation.vllm_weight_sync module maxent_grpo.training.generation.vocab_guard module maxent_grpo.training.metrics module maxent_grpo.training.optim module maxent_grpo.training.patches module maxent_grpo.training.patches.vllm module maxent_grpo.training.pipeline module maxent_grpo.training.rewards module maxent_grpo.training.rollout module maxent_grpo.training.rollout.context module maxent_grpo.training.rollout.distributed module maxent_grpo.training.rollout.generator module maxent_grpo.training.rollout.helpers module maxent_grpo.training.rollout.local module maxent_grpo.training.rollout.vllm_adapter module maxent_grpo.training.rollout.vllm_colocate module maxent_grpo.training.run_helpers module maxent_grpo.training.runtime module maxent_grpo.training.runtime.config module maxent_grpo.training.runtime.deepspeed module maxent_grpo.training.runtime.deps module maxent_grpo.training.runtime.logging module maxent_grpo.training.runtime.ops module maxent_grpo.training.runtime.ops.vllm_startup module maxent_grpo.training.runtime.prompts module maxent_grpo.training.runtime.setup module maxent_grpo.training.scoring module maxent_grpo.training.scoring_batching module maxent_grpo.training.scoring_common module maxent_grpo.training.scoring_logprob module maxent_grpo.training.scoring_reference module maxent_grpo.training.seed_paper_eval_callback module maxent_grpo.training.state module maxent_grpo.training.telemetry module maxent_grpo.training.telemetry.trl_logging module maxent_grpo.training.telemetry.wandb module maxent_grpo.training.trainer_hooks module maxent_grpo.training.trl_trainer module maxent_grpo.training.types module maxent_grpo.training.types.logging module maxent_grpo.training.types.rewards module maxent_grpo.training.types.runtime module maxent_grpo.training.weighting module maxent_grpo.training.weighting.logic module maxent_grpo.training.weighting.types module maxent_grpo.training.zero_utils module maxent_grpo.utils module maxent_grpo.utils.deps_guard module maxent_grpo.utils.imports module maxent_length_normalize_policy (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_length_normalize_ref (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_listwise_skip_zero_variance_groups (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_logprob_chunk_size (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_objective_variant (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_policy_entropy (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_policy_entropy_mode (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_prompt_cache_size (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_q_epsilon (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_q_temperature (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_reference_ema_beta (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_reference_ema_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_reference_ema_update_interval (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_reference_ema_warmup_steps (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_reference_logprobs_source (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_score_slice_prefetch (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_score_tail_tokens (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_share_reference_model (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_target_weight_entropy (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_target_weight_entropy_final (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_target_weight_entropy_horizon (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_target_weight_entropy_start (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_tau (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_tau_lr (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_tau_max (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_tau_min (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_tau_warmup_steps (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_trl_reference_scoring (maxent_grpo.config.grpo.GRPOConfig attribute) maxent_use_clip_objective (maxent_grpo.config.grpo.GRPOConfig attribute) MaxentCommand (class in maxent_grpo.cli.hydra_cli) MaxEntOptions (class in maxent_grpo.training.runtime) (class in maxent_grpo.training.runtime.config) (class in maxent_grpo.training.runtime.setup) maximum_value (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) maybe_checkpoint() (in module maxent_grpo.training.state) maybe_clear_stale_controller_state() (in module maxent_grpo.training.state) maybe_create_deepspeed_plugin() (in module maxent_grpo.training.runtime.deps) maybe_load_accelerator_state() (in module maxent_grpo.training.state) maybe_sync_weights() (maxent_grpo.training.generation.vllm_weight_sync.VLLMWeightSyncMixin method) maybe_update_beta() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) maybe_update_tau() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) mean (maxent_grpo.training.types.logging.RewardComponentStats attribute) (maxent_grpo.training.types.RewardComponentStats attribute) (maxent_grpo.training.types.RewardMoments attribute) (maxent_grpo.training.types.rewards.RewardMoments attribute) mean_length (maxent_grpo.training.types.rewards.LengthStats attribute) mean_terminated (maxent_grpo.training.types.rewards.LengthStats attribute) merge_group_chunk() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin static method) merge_invalid_token_block_logit_bias() (in module maxent_grpo.training.generation.vocab_guard) merge_vllm_results() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) meta (maxent_grpo.training.weighting.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.ControllerStateSnapshot attribute) metadata (maxent_grpo.training.generation.AggregatedGenerationState attribute) (maxent_grpo.training.generation.common.AggregatedGenerationState attribute) (maxent_grpo.training.generation.helpers.AggregatedGenerationState attribute) (maxent_grpo.training.types.PromptCompletionBatch attribute) (maxent_grpo.training.types.rewards.PromptCompletionBatch attribute) method (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) metric_counts (maxent_grpo.training.types.logging.MetricState attribute) metric_sums (maxent_grpo.training.types.logging.MetricState attribute) metric_writer (maxent_grpo.training.types.logging.LoggingHandles attribute) MetricState (class in maxent_grpo.training.types.logging) MetricWriter (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.logging) min_length (maxent_grpo.training.types.rewards.LengthStats attribute) min_terminated (maxent_grpo.training.types.rewards.LengthStats attribute) minimum_value (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) missing_boxed_answer_penalty (maxent_grpo.config.grpo.GRPOConfig attribute) model (maxent_grpo.cli.hydra_cli.BaselineCommand attribute) (maxent_grpo.cli.hydra_cli.MaxentCommand attribute) (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) model_validator() (in module maxent_grpo.config.recipes) module maxent_grpo maxent_grpo.cli maxent_grpo.cli.config_validation maxent_grpo.cli.hydra_cli maxent_grpo.config maxent_grpo.config.dataset maxent_grpo.config.grpo maxent_grpo.config.recipes maxent_grpo.core maxent_grpo.core.data maxent_grpo.core.evaluation maxent_grpo.core.hub maxent_grpo.core.model maxent_grpo.grpo maxent_grpo.rewards maxent_grpo.rewards.basic maxent_grpo.rewards.maxent maxent_grpo.training maxent_grpo.training.baseline maxent_grpo.training.cli maxent_grpo.training.cli.trl maxent_grpo.training.controller_objective maxent_grpo.training.controller_optimizer maxent_grpo.training.data maxent_grpo.training.eval maxent_grpo.training.generation maxent_grpo.training.generation.common maxent_grpo.training.generation.errors maxent_grpo.training.generation.helpers maxent_grpo.training.generation.vllm maxent_grpo.training.generation.vllm_distributed maxent_grpo.training.generation.vllm_helper maxent_grpo.training.generation.vllm_requests maxent_grpo.training.generation.vllm_state maxent_grpo.training.generation.vllm_utils maxent_grpo.training.generation.vllm_weight_sync maxent_grpo.training.generation.vocab_guard maxent_grpo.training.metrics maxent_grpo.training.optim maxent_grpo.training.patches maxent_grpo.training.patches.vllm maxent_grpo.training.pipeline maxent_grpo.training.rewards maxent_grpo.training.rollout maxent_grpo.training.rollout.context maxent_grpo.training.rollout.distributed maxent_grpo.training.rollout.generator maxent_grpo.training.rollout.helpers maxent_grpo.training.rollout.local maxent_grpo.training.rollout.vllm_adapter maxent_grpo.training.rollout.vllm_colocate maxent_grpo.training.run_helpers maxent_grpo.training.runtime maxent_grpo.training.runtime.config maxent_grpo.training.runtime.deepspeed maxent_grpo.training.runtime.deps maxent_grpo.training.runtime.logging maxent_grpo.training.runtime.ops maxent_grpo.training.runtime.ops.vllm_startup maxent_grpo.training.runtime.prompts maxent_grpo.training.runtime.setup maxent_grpo.training.scoring maxent_grpo.training.scoring_batching maxent_grpo.training.scoring_common maxent_grpo.training.scoring_logprob maxent_grpo.training.scoring_reference maxent_grpo.training.seed_paper_eval_callback maxent_grpo.training.state maxent_grpo.training.telemetry maxent_grpo.training.telemetry.trl_logging maxent_grpo.training.telemetry.wandb maxent_grpo.training.trainer_hooks maxent_grpo.training.trl_trainer maxent_grpo.training.types maxent_grpo.training.types.logging maxent_grpo.training.types.rewards maxent_grpo.training.types.runtime maxent_grpo.training.weighting maxent_grpo.training.weighting.logic maxent_grpo.training.weighting.types maxent_grpo.training.zero_utils maxent_grpo.utils maxent_grpo.utils.deps_guard maxent_grpo.utils.imports moments (maxent_grpo.training.types.rewards.RewardComputation attribute) N name (maxent_grpo.training.AnalyticControllerObjective attribute) (maxent_grpo.training.controller_objective.AnalyticControllerObjective attribute) (maxent_grpo.training.controller_objective.ControllerObjective attribute) (maxent_grpo.training.controller_objective.TruncatedBackpropControllerObjective attribute) (maxent_grpo.training.TruncatedBackpropControllerObjective attribute) normalization (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) nullcontext() (in module maxent_grpo.training.generation.vllm) num_completion_tokens (maxent_grpo.training.pipeline.PreparedBatch property) (maxent_grpo.training.types.logging.TokenUsageStats attribute) (maxent_grpo.training.types.logging.TrainingScalarStats property) (maxent_grpo.training.types.TokenUsageStats attribute) (maxent_grpo.training.types.TrainingScalarStats property) num_completions_to_print (maxent_grpo.config.grpo.GRPOConfig attribute) num_epochs (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) num_generations (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) num_input_tokens (maxent_grpo.training.types.logging.TokenUsageStats attribute) (maxent_grpo.training.types.logging.TrainingScalarStats property) (maxent_grpo.training.types.TokenUsageStats attribute) (maxent_grpo.training.types.TrainingScalarStats property) num_input_tokens_seen (maxent_grpo.training.types.logging.MetricState attribute) O objective (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) old_token_logp (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) OmegaConf (in module maxent_grpo.cli.hydra_cli) on_evaluate() (maxent_grpo.training.seed_paper_eval_callback.SeedPaperEvalCallback method) on_step_end() (maxent_grpo.training.seed_paper_eval_callback.SeedPaperEvalCallback method) on_train_begin() (maxent_grpo.training.seed_paper_eval_callback.SeedPaperEvalCallback method) on_train_end() (maxent_grpo.training.seed_paper_eval_callback.SeedPaperEvalCallback method) optimization (maxent_grpo.training.types.LoopSettings attribute) (maxent_grpo.training.types.runtime.LoopSettings attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext property) OptimizationSchedule (class in maxent_grpo.training.types.runtime) OptimizationSettings (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.runtime) optimizer (maxent_grpo.training.types.runtime.OptimizerHandles attribute) (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) optimizer_step() (in module maxent_grpo.training.optim) OptimizerHandles (class in maxent_grpo.training.types.runtime) optional_import() (in module maxent_grpo.utils.imports) overwrite_existing (maxent_grpo.training.types.ControllerPaths attribute) (maxent_grpo.training.types.runtime.ControllerPaths attribute) overwrite_hub_revision (maxent_grpo.config.grpo.GRPOConfig attribute) P pad_token_id (maxent_grpo.training.types.rewards.ScoreBatch attribute) pairs (maxent_grpo.training.types.rewards.RewardComputation attribute) parameters() (maxent_grpo.training.weighting.types.TorchControllerState method) parse_grpo_args() (in module maxent_grpo) (in module maxent_grpo.cli) (in module maxent_grpo.training.cli) (in module maxent_grpo.training.cli.trl) payload_chars (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) payload_size_bytes (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) penalty (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.runtime.GenerationPenaltyPassthroughMixin attribute) (maxent_grpo.training.runtime.prompts.GenerationPenaltyPassthroughMixin attribute) (maxent_grpo.training.types.runtime.GenerationSettings attribute) pending_generation_indices() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) per_reward (maxent_grpo.training.types.logging.RewardLoggingView attribute) per_reward_quantiles (maxent_grpo.training.types.logging.RewardLoggingView attribute) per_reward_values (maxent_grpo.training.types.rewards.RewardComputation attribute) policy_entropy (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.runtime.ScoringSettings attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) policy_entropy_bonus_coef (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.types.runtime.ScoringSettings attribute) policy_entropy_mode (maxent_grpo.training.types.runtime.ScoringSettings attribute) policy_entropy_sum (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) policy_loss (maxent_grpo.training.types.LossScalarBundle attribute) (maxent_grpo.training.types.rewards.LossScalarBundle attribute) policy_loss_scalar (maxent_grpo.training.types.rewards.LossOutputs property) pooled_hidden (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) prepare_generation_batch() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.rewards) prepare_training_batch() (in module maxent_grpo.training.pipeline) prepare_vllm_targets() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) prepared_batch (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) PreparedBatch (class in maxent_grpo.training.pipeline) presence_penalty (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) PreTrainedModel (class in maxent_grpo.training.types.runtime) prompt_cache_get (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) prompt_cache_size (maxent_grpo.training.types.BatchingSettings attribute) (maxent_grpo.training.types.runtime.BatchingSettings attribute) prompt_char_limit (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) prompt_count (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) prompt_entries (maxent_grpo.training.types.rewards.ScoreBatch attribute) prompt_length_cache_get (maxent_grpo.training.types.BatchingSettings attribute) (maxent_grpo.training.types.runtime.BatchingSettings attribute) prompt_template (maxent_grpo.config.grpo.GRPOConfig attribute) PromptCacheEntry (class in maxent_grpo.training.types.rewards) PromptCompletionBatch (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.rewards) prompts (maxent_grpo.training.types.GenerationBatch attribute) (maxent_grpo.training.types.PromptCompletionBatch attribute) (maxent_grpo.training.types.rewards.GenerationBatch attribute) (maxent_grpo.training.types.rewards.PromptCompletionBatch attribute) pure_accuracy_math_correctness() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) pure_accuracy_reward_math() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) push_param_to_vllm() (maxent_grpo.training.generation.vllm_weight_sync.VLLMWeightSyncMixin method) push_to_hub_revision (maxent_grpo.config.grpo.GRPOConfig attribute) push_to_hub_revision() (in module maxent_grpo.core.hub) python_unit_test_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) Q q_distribution (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.types.rewards.RewardComputation attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) q_entropy_max (maxent_grpo.training.types.logging.RewardLoggingView attribute) q_entropy_mean (maxent_grpo.training.types.logging.RewardLoggingView attribute) q_entropy_min (maxent_grpo.training.types.logging.RewardLoggingView attribute) q_entropy_std (maxent_grpo.training.types.logging.RewardLoggingView attribute) q_epsilon (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.runtime.config.MaxEntOptions attribute) (maxent_grpo.training.runtime.MaxEntOptions attribute) (maxent_grpo.training.runtime.setup.MaxEntOptions attribute) (maxent_grpo.training.weighting.types.WeightingSettings property) q_grouped (maxent_grpo.training.types.rewards.RewardComputation property) q_temperature (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.runtime.config.MaxEntOptions attribute) (maxent_grpo.training.runtime.MaxEntOptions attribute) (maxent_grpo.training.runtime.setup.MaxEntOptions attribute) (maxent_grpo.training.weighting.types.WeightingSettings property) QDistribution (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.rewards) QDistributionSettings (class in maxent_grpo.training.weighting) (class in maxent_grpo.training.weighting.types) R raw_output (maxent_grpo.training.patches.vllm.VLLMLogprobResult attribute) reasoning_steps_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) recipe (maxent_grpo.cli.hydra_cli.BaselineCommand attribute) (maxent_grpo.cli.hydra_cli.MaxentCommand attribute) record_vllm_failure() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) ref_logp_mean (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.rewards.ReferenceLogprobs attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) ref_logp_sum (maxent_grpo.training.types.rewards.ReferenceLogprobs attribute) ref_logp_sum_raw (maxent_grpo.training.types.rewards.ReferenceLogprobs attribute) ref_logprob_meta (maxent_grpo.training.types.rewards.RewardComputation attribute) ref_stats (maxent_grpo.training.pipeline.PreparedBatch property) ref_tok_counts (maxent_grpo.training.types.rewards.ReferenceLogprobs attribute) ref_token_logp (maxent_grpo.training.types.rewards.ReferenceLogprobs attribute) ref_token_mask (maxent_grpo.training.types.rewards.ReferenceLogprobs attribute) reference_from_model() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_reference) reference_from_model_trl() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_reference) reference_from_vllm_meta() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_reference) reference_logprobs_source (maxent_grpo.training.types.runtime.ScoringSettings attribute) reference_model (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) reference_model_name_or_path (maxent_grpo.config.grpo.GRPOConfig attribute) reference_model_revision (maxent_grpo.config.grpo.GRPOConfig attribute) reference_stats_from_policy_logprobs() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_reference) ReferenceLogprobs (class in maxent_grpo.training.types.rewards) register_lighteval_task() (in module maxent_grpo.core.evaluation) repetition_max_penalty (maxent_grpo.config.grpo.GRPOScriptArguments attribute) repetition_n_grams (maxent_grpo.config.grpo.GRPOScriptArguments attribute) request_batch() (maxent_grpo.training.rollout.vllm_colocate.ColocateVLLMEngine method) request_id (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) request_id_prefix (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) request_logprobs (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) require_accelerator() (in module maxent_grpo.training.run_helpers) (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.deps) (in module maxent_grpo.training.runtime.setup) require_accumulation_context() (in module maxent_grpo.training.optim) require_dataloader() (in module maxent_grpo.training.run_helpers) (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.deps) (in module maxent_grpo.training.runtime.setup) require_deepspeed() (in module maxent_grpo.training.run_helpers) (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.deepspeed) (in module maxent_grpo.training.runtime.deps) (in module maxent_grpo.training.runtime.setup) require_dependency() (in module maxent_grpo.utils.imports) require_torch() (in module maxent_grpo.training.rollout.helpers) (in module maxent_grpo.training.run_helpers) (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.deps) (in module maxent_grpo.training.runtime.setup) require_transformer_base_classes() (in module maxent_grpo.training.run_helpers) (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.deps) (in module maxent_grpo.training.runtime.setup) reset_prefix_cache() (maxent_grpo.training.rollout.vllm_colocate.ColocateVLLMClient method) reset_vllm_cache() (maxent_grpo.training.generation.vllm_weight_sync.VLLMWeightSyncMixin method) resolve_allowed_token_ids() (in module maxent_grpo.training.generation.vocab_guard) resolve_blocked_token_ids() (in module maxent_grpo.training.generation.vocab_guard) resolve_dataloader_kwargs() (in module maxent_grpo.training.data) resolve_model_vocab_limit() (in module maxent_grpo.training.generation.vocab_guard) resolve_resume_checkpoint() (in module maxent_grpo.training.state) resolve_run_metadata() (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.logging) resolve_tokenizer_vocab_limit() (in module maxent_grpo.training.generation.vocab_guard) resume_checkpoint (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) resume_from (maxent_grpo.training.types.ControllerPaths attribute) (maxent_grpo.training.types.runtime.ControllerPaths attribute) resume_from_checkpoint (maxent_grpo.config.grpo.GRPOConfig attribute) resume_state (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) retry_incomplete_prompts() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) retry_sleep (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) reward (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) reward_comp (maxent_grpo.training.pipeline.PreparedBatch attribute) reward_funcs (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.rewards.basic.RewardConfig attribute) (maxent_grpo.training.types.runtime.RewardSpec attribute) reward_mean (maxent_grpo.training.types.logging.RewardLoggingView attribute) reward_moments() (in module maxent_grpo.rewards.maxent) reward_quantiles (maxent_grpo.training.types.logging.RewardLoggingView attribute) reward_stats (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) reward_std (maxent_grpo.training.types.logging.RewardLoggingView attribute) reward_weights (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.types.runtime.RewardSpec attribute) RewardComponentStats (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.logging) RewardComputation (class in maxent_grpo.training.types.rewards) RewardConfig (class in maxent_grpo.rewards.basic) RewardFunction (class in maxent_grpo.rewards) (class in maxent_grpo.rewards.basic) RewardLoggingView (class in maxent_grpo.training.types.logging) RewardMoments (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.rewards) RewardSpec (class in maxent_grpo.training.types.runtime) rich_log_completions (maxent_grpo.config.grpo.GRPOConfig attribute) rich_log_completions_key (maxent_grpo.config.grpo.GRPOConfig attribute) rich_log_completions_synchronize_ranks (maxent_grpo.config.grpo.GRPOConfig attribute) rich_log_completions_to_wandb (maxent_grpo.config.grpo.GRPOConfig attribute) role (maxent_grpo.core.model.ChatMessage attribute) rounds_cfg (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) rows (maxent_grpo.training.types.runtime.EvaluationSettings attribute) run_baseline_training() (in module maxent_grpo.training) (in module maxent_grpo.training.baseline) run_benchmark_jobs() (in module maxent_grpo.core.evaluation) run_lighteval_job() (in module maxent_grpo.core.evaluation) run_validation_step() (in module maxent_grpo.training.eval) run_vllm_rounds() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) runtime (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) RuntimeHandles (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.runtime) S safe_generate() (in module maxent_grpo.training.patches.vllm) (in module maxent_grpo.training.rollout.generator) (in module maxent_grpo.training.rollout.helpers) safe_request() (in module maxent_grpo.training.patches.vllm) samples (maxent_grpo.training.types.AdvantageStats attribute) (maxent_grpo.training.types.QDistribution attribute) (maxent_grpo.training.types.rewards.AdvantageStats attribute) (maxent_grpo.training.types.rewards.QDistribution attribute) save_checkpoint (maxent_grpo.training.types.logging.LoggingHandles attribute) save_controller_state() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) save_steps (maxent_grpo.training.types.logging.LoggingHandles attribute) save_strategy (maxent_grpo.training.types.logging.LoggingHandles attribute) scalars (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) (maxent_grpo.training.types.rewards.LossOutputs attribute) scale_rewards (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) scatter_object() (in module maxent_grpo.training.rollout.vllm_adapter) schedule (maxent_grpo.training.types.logging.LoggingConfigView attribute) (maxent_grpo.training.types.LoggingConfigView attribute) (maxent_grpo.training.types.OptimizationSettings attribute) (maxent_grpo.training.types.runtime.OptimizationSettings attribute) scheduled_learning_rate() (in module maxent_grpo.training.optim) score_model_outputs() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_logprob) score_slice (maxent_grpo.training.types.BatchingSettings attribute) (maxent_grpo.training.types.runtime.BatchingSettings attribute) score_tail_tokens (maxent_grpo.training.types.BatchingSettings attribute) (maxent_grpo.training.types.rewards.ScoreBatch attribute) (maxent_grpo.training.types.runtime.BatchingSettings attribute) ScoreBatch (class in maxent_grpo.training.types.rewards) scores (maxent_grpo.training.pipeline.PreparedBatch attribute) scoring (maxent_grpo.training.types.LoopSettings attribute) (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.LoopSettings attribute) (maxent_grpo.training.types.runtime.TrainingLoopContext property) ScoringSettings (class in maxent_grpo.training.types.runtime) script (maxent_grpo.cli.hydra_cli.BaselineCommand attribute) (maxent_grpo.cli.hydra_cli.MaxentCommand attribute) ScriptArguments (class in maxent_grpo.config.dataset) seed (maxent_grpo.config.dataset.DatasetMixtureConfig attribute) seed_advantage_scales (maxent_grpo.training.types.rewards.RewardComputation attribute) seed_alpha_effective (maxent_grpo.training.types.logging.RewardLoggingView attribute) (maxent_grpo.training.types.rewards.RewardComputation attribute) seed_eval (maxent_grpo.training.types.runtime.EvaluationSettings attribute) seed_generation_groups() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) seed_grpo_alpha (maxent_grpo.config.grpo.GRPOConfig attribute) seed_grpo_alpha_normalize_by_max_entropy (maxent_grpo.config.grpo.GRPOConfig attribute) seed_grpo_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) seed_grpo_length_normalize_logprobs (maxent_grpo.config.grpo.GRPOConfig attribute) seed_max_possible_entropy (maxent_grpo.training.types.logging.RewardLoggingView attribute) (maxent_grpo.training.types.rewards.RewardComputation attribute) seed_paper_answer_tag_accuracy_reward_math() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) seed_paper_boxed_accuracy_reward_math() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) seed_paper_eval_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_fail_on_error (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_max_test (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_pass_at_8_enabled (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_pass_at_8_samples (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_pass_at_8_temperature (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_pass_at_8_top_p (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_python (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_results_dir (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_tasks (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_template (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_timeout_s (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_vllm_batch_size (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_eval_workspace_dir (maxent_grpo.config.grpo.GRPOConfig attribute) seed_paper_reward_fast (maxent_grpo.config.grpo.GRPOConfig attribute) seed_semantic_entropies (maxent_grpo.training.types.rewards.RewardComputation attribute) SeedPaperEvalCallback (class in maxent_grpo.training.seed_paper_eval_callback) selective_log_softmax() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_logprob) semantic_entropy_max (maxent_grpo.training.types.logging.RewardLoggingView attribute) semantic_entropy_mean (maxent_grpo.training.types.logging.RewardLoggingView attribute) semantic_entropy_min (maxent_grpo.training.types.logging.RewardLoggingView attribute) semantic_entropy_std (maxent_grpo.training.types.logging.RewardLoggingView attribute) SequenceScores (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.rewards) service (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) ServiceErrorPayload (class in maxent_grpo.training.generation.errors) set_fallback_generate() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) set_request_batcher() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) set_request_executor() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) set_safe_generate() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) set_time_provider() (maxent_grpo.training.generation.vllm_requests.VLLMRequestMixin method) settings (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) should_run() (maxent_grpo.training.controller_optimizer.ControllerMetaManager method) should_trigger_v0_fallback() (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.ops) (in module maxent_grpo.training.runtime.ops.vllm_startup) slice_prefetch (maxent_grpo.training.types.BatchingSettings attribute) (maxent_grpo.training.types.runtime.BatchingSettings attribute) slice_size (maxent_grpo.training.types.rewards.ScoreBatch attribute) soft_punish_cache (maxent_grpo.config.grpo.GRPOScriptArguments attribute) span_kl_beta0 (maxent_grpo.config.grpo.GRPOScriptArguments attribute) span_kl_horizon (maxent_grpo.config.grpo.GRPOScriptArguments attribute) span_kl_target (maxent_grpo.config.grpo.GRPOScriptArguments attribute) split (maxent_grpo.config.dataset.DatasetConfig attribute) split_reference_logprobs() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) split_reference_token_counts() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) STARTING (maxent_grpo.training.runtime.ops.StartupStatus attribute) (maxent_grpo.training.runtime.ops.vllm_startup.StartupStatus attribute) StartupStatus (class in maxent_grpo.training.runtime.ops) (class in maxent_grpo.training.runtime.ops.vllm_startup) state_path (maxent_grpo.training.types.ControllerPaths attribute) (maxent_grpo.training.types.runtime.ControllerPaths attribute) STATE_VERSION (maxent_grpo.training.weighting.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.ControllerStateSnapshot attribute) status_code (maxent_grpo.training.generation.errors.ServiceErrorPayload attribute) std (maxent_grpo.training.types.logging.RewardComponentStats attribute) (maxent_grpo.training.types.RewardComponentStats attribute) (maxent_grpo.training.types.RewardMoments attribute) (maxent_grpo.training.types.rewards.RewardMoments attribute) step_logger() (maxent_grpo.training.types.logging.LoggingHandles method) step_size (maxent_grpo.training.weighting.KlControllerSettings attribute) (maxent_grpo.training.weighting.types.KlControllerSettings attribute) steps_per_epoch (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) stop_sequences (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) summarize_completion_lengths() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_batching) summarize_reward_stats() (in module maxent_grpo.training.metrics) summarize_weight_stats() (in module maxent_grpo.training.metrics) sync_client() (maxent_grpo.training.rollout.vllm_colocate.ColocateVLLMEngine method) sync_from_scalars() (maxent_grpo.training.weighting.types.TorchControllerState method) sync_fsdp_params() (maxent_grpo.training.generation.vllm_weight_sync.VLLMWeightSyncMixin method) sync_gradients_enabled() (in module maxent_grpo.training.optim) sync_trunc_state() (in module maxent_grpo.training.runtime.prompts) sync_weights (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) system_prompt (maxent_grpo.config.grpo.GRPOConfig attribute) T tag_count_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) target (maxent_grpo.training.weighting.KlControllerSettings attribute) (maxent_grpo.training.weighting.types.KlControllerSettings attribute) target_entropy (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) target_entropy_final (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) target_entropy_horizon (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) target_entropy_start (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) tau (maxent_grpo.rewards.maxent.WeightingConfigLike attribute) (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.runtime.config.MaxEntOptions attribute) (maxent_grpo.training.runtime.MaxEntOptions attribute) (maxent_grpo.training.runtime.setup.MaxEntOptions attribute) (maxent_grpo.training.weighting.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.WeightingConfigLike attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) tau_entropy_ema (maxent_grpo.training.weighting.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.ControllerStateSnapshot attribute) tau_grad (maxent_grpo.training.controller_objective.ControllerGradients attribute) tau_learning_rate (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) tau_log (maxent_grpo.training.weighting.ControllerStateSnapshot attribute) (maxent_grpo.training.weighting.types.ControllerStateSnapshot attribute) tau_lr (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) tau_max (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) tau_min (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) tau_schedule (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) tau_target_entropy (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) tau_tensor() (maxent_grpo.training.weighting.types.TorchControllerState method) tau_warmup_steps (maxent_grpo.rewards.maxent.WeightingSettings property) (maxent_grpo.training.weighting.types.WeightingSettings property) TauSchedule (class in maxent_grpo.training.weighting) (class in maxent_grpo.training.weighting.types) temperature (maxent_grpo.training.weighting.QDistributionSettings attribute) (maxent_grpo.training.weighting.types.QDistributionSettings attribute) test_split_size (maxent_grpo.config.dataset.DatasetMixtureConfig attribute) timeout (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) to_dict() (maxent_grpo.training.generation.errors.GenerationServiceError method) (maxent_grpo.training.generation.errors.ServiceErrorPayload method) (maxent_grpo.training.weighting.ControllerStateSnapshot method) (maxent_grpo.training.weighting.types.ControllerStateSnapshot method) to_json() (maxent_grpo.training.generation.errors.GenerationServiceError method) (maxent_grpo.training.generation.errors.ServiceErrorPayload method) to_state() (maxent_grpo.training.weighting.ControllerMetaSettings method) (maxent_grpo.training.weighting.types.ControllerMetaSettings method) to_trl_payload() (maxent_grpo.training.patches.vllm.VLLMLogprobResult method) token_count (maxent_grpo.training.patches.vllm.VLLMLogprobResult attribute) token_counts_from_score_batch() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_batching) token_logp (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) token_logprobs (maxent_grpo.training.patches.vllm.VLLMLogprobResult attribute) token_mask (maxent_grpo.training.types.rewards.SequenceScores attribute) (maxent_grpo.training.types.SequenceScores attribute) tokenizer (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.types.rewards.ValidationContext attribute) (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) TokenizerLike (class in maxent_grpo.training.patches.vllm) tokens (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) TokenUsageStats (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.logging) top_k (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) torch_compile (maxent_grpo.config.grpo.GRPOConfig attribute) TorchControllerState (class in maxent_grpo.training.weighting.types) total_input_tokens (maxent_grpo.training.pipeline.PreparedBatch attribute) total_loss (maxent_grpo.training.types.LossScalarBundle attribute) (maxent_grpo.training.types.rewards.LossScalarBundle attribute) total_loss_scalar (maxent_grpo.training.types.rewards.LossOutputs property) total_sequences (maxent_grpo.training.types.rewards.ScoreBatch attribute) total_training_steps (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) total_utils (maxent_grpo.training.types.rewards.RewardComputation attribute) train_grpo_objective (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.rewards.maxent.WeightingSettings attribute) (maxent_grpo.training.weighting.types.WeightingSettings attribute) train_loader (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) train_reward_mean (maxent_grpo.training.types.rewards.RewardComputation property) train_reward_std (maxent_grpo.training.types.rewards.RewardComputation property) train_sampler (maxent_grpo.training.types.runtime.RuntimeHandles attribute) (maxent_grpo.training.types.RuntimeHandles attribute) training (maxent_grpo.cli.hydra_cli.BaselineCommand attribute) (maxent_grpo.cli.hydra_cli.MaxentCommand attribute) training_args (maxent_grpo.training.types.runtime.TrainingLoopContext attribute) TrainingLoopContext (class in maxent_grpo.training.types.runtime) TrainingMetricsPayload (class in maxent_grpo.training.types.logging) TrainingScalarStats (class in maxent_grpo.training.types) (class in maxent_grpo.training.types.logging) trl_reference_scoring (maxent_grpo.training.types.runtime.ScoringSettings attribute) truncate_after_first_boxed_answer() (in module maxent_grpo.rewards.basic) truncate_completions_at_first_boxed_answer (maxent_grpo.config.grpo.GRPOConfig attribute) truncate_prompt() (in module maxent_grpo.training.run_helpers) (in module maxent_grpo.training.runtime) (in module maxent_grpo.training.runtime.prompts) truncate_to_expected_counts() (in module maxent_grpo.training.generation) (in module maxent_grpo.training.generation.common) (in module maxent_grpo.training.generation.helpers) TruncatedBackpropControllerObjective (class in maxent_grpo.training) (class in maxent_grpo.training.controller_objective) truncation_steps (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) U update_interval (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) update_named_param() (maxent_grpo.training.rollout.vllm_colocate.ColocateVLLMClient method) url (maxent_grpo.training.runtime.config.VLLMClientConfig attribute) use_clip_objective (maxent_grpo.training.types.runtime.ClipSettings attribute) use_deepspeed (maxent_grpo.training.optim.DeepspeedState attribute) use_hessian (maxent_grpo.training.weighting.ControllerMetaSettings attribute) (maxent_grpo.training.weighting.types.ControllerMetaSettings attribute) use_vllm (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig attribute) uses_pure_accuracy_math_reward() (in module maxent_grpo.rewards) (in module maxent_grpo.rewards.basic) V validate_training_config() (in module maxent_grpo.cli.config_validation) ValidationContext (class in maxent_grpo.training.types.rewards) vllm (maxent_grpo.training.rollout.context.GenerationContext attribute) (maxent_grpo.training.rollout.generator.GenerationContext attribute) (maxent_grpo.training.rollout.helpers.GenerationContext attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig attribute) vllm_backfill_local (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_backfill_with_model (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_backoff (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_backoff_multiplier (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_best_of (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_client (maxent_grpo.training.generation.vllm.VLLMGenerationHelper property) (maxent_grpo.training.generation.vllm_helper.VLLMGenerationHelper property) (maxent_grpo.training.rollout.helpers.VLLMGenerationHelper property) vllm_client_tag_fail_fast (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_force_logprobs (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_frequency_penalty (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_guided_json (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_guided_regex (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_include_stop_str_in_output (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_latency_ms (maxent_grpo.training.types.logging.TrainingScalarStats attribute) (maxent_grpo.training.types.TrainingScalarStats attribute) vllm_logit_bias (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_logprob_fail_after (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_logprob_fallback (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_max_completion_rounds (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_max_retries (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_meta_has_logprobs() (in module maxent_grpo.training.scoring) (in module maxent_grpo.training.scoring_reference) vllm_mode (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.GenerationSamplingConfig attribute) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig attribute) vllm_presence_penalty (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_request_id_prefix (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_request_logprobs (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_retry_sleep (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_return_logprobs (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_rounds_cfg (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_stop_sequences (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_sync_interval_steps (maxent_grpo.config.grpo.GRPOConfig attribute) vllm_sync_ready (maxent_grpo.training.generation.vllm.VLLMGenerationHelper property) (maxent_grpo.training.generation.vllm_helper.VLLMGenerationHelper property) (maxent_grpo.training.rollout.helpers.VLLMGenerationHelper property) vllm_sync_weights (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_timeout (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_top_k (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) vllm_url (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.runtime.config.GenerationSamplingConfig property) (maxent_grpo.training.runtime.GenerationSamplingConfig property) (maxent_grpo.training.runtime.setup.GenerationSamplingConfig property) VLLMClientConfig (class in maxent_grpo.training.runtime.config) VLLMDistributedMixin (class in maxent_grpo.training.generation.vllm_distributed) VLLMGenerationHelper (class in maxent_grpo.training.generation.vllm) (class in maxent_grpo.training.generation.vllm_helper) (class in maxent_grpo.training.rollout.helpers) VLLMGenerationMixin (class in maxent_grpo.training.rollout.vllm_adapter) VLLMLogprobResult (class in maxent_grpo.training.patches.vllm) VLLMRequestMixin (class in maxent_grpo.training.generation.vllm_requests) VLLMWeightSyncMixin (class in maxent_grpo.training.generation.vllm_weight_sync) W wandb_entity (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.telemetry.wandb.WandbConfig attribute) wandb_log_unique_prompts (maxent_grpo.config.grpo.GRPOConfig attribute) wandb_project (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.telemetry.wandb.WandbConfig attribute) wandb_run (maxent_grpo.training.types.logging.LoggingHandles attribute) wandb_run_group (maxent_grpo.config.grpo.GRPOConfig attribute) (maxent_grpo.training.telemetry.wandb.WandbConfig attribute) WandbConfig (class in maxent_grpo.training.telemetry.wandb) warmup_steps (maxent_grpo.training.types.runtime.OptimizationSchedule attribute) (maxent_grpo.training.weighting.TauSchedule attribute) (maxent_grpo.training.weighting.types.TauSchedule attribute) weight (maxent_grpo.config.dataset.DatasetConfig attribute) weight_entropy (maxent_grpo.rewards.maxent.WeightStats attribute) (maxent_grpo.training.weighting.types.WeightStats attribute) weight_entropy_max (maxent_grpo.rewards.maxent.WeightStats attribute) (maxent_grpo.training.weighting.types.WeightStats attribute) weight_entropy_min (maxent_grpo.rewards.maxent.WeightStats attribute) (maxent_grpo.training.weighting.types.WeightStats attribute) weight_matrix_from_q() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) weight_stats (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) (maxent_grpo.training.pipeline.PreparedBatch property) (maxent_grpo.training.types.logging.TrainingMetricsPayload attribute) weight_vector_from_q() (in module maxent_grpo.rewards.maxent) (in module maxent_grpo.training.weighting) (in module maxent_grpo.training.weighting.logic) weighted_kl_loss (maxent_grpo.training.types.LossScalarBundle attribute) (maxent_grpo.training.types.rewards.LossScalarBundle attribute) weighted_kl_loss_scalar (maxent_grpo.training.types.rewards.LossOutputs property) weighting (maxent_grpo.training.controller_objective.ControllerMetaContext attribute) (maxent_grpo.training.types.logging.LoggingConfigView attribute) (maxent_grpo.training.types.LoggingConfigView attribute) (maxent_grpo.training.types.runtime.ScoringSettings attribute) WeightingConfigLike (class in maxent_grpo.rewards.maxent) (class in maxent_grpo.training.weighting.types) WeightingSettings (class in maxent_grpo.rewards.maxent) (class in maxent_grpo.training.weighting.types) WeightLoggingView (class in maxent_grpo.rewards.maxent) (class in maxent_grpo.training.weighting.types) WeightNormalizationSettings (class in maxent_grpo.training.weighting) (class in maxent_grpo.training.weighting.types) weights_grouped (maxent_grpo.rewards.maxent.WeightStats attribute) (maxent_grpo.training.weighting.types.WeightStats attribute) WeightStats (class in maxent_grpo.rewards.maxent) (class in maxent_grpo.training.weighting.types) wrap_trl_trainer() (in module maxent_grpo.training.trl_trainer) Z zero3_gather_factory() (in module maxent_grpo.training.generation.vllm_utils) zero_grad() (maxent_grpo.training.weighting.types.TorchControllerState method) zero_stage (maxent_grpo.training.optim.DeepspeedState attribute) zero_truncated_completion_rewards (maxent_grpo.config.grpo.GRPOConfig attribute)