Harmony provides four main training algorithms for custom recipes: SFT (Supervised Fine-Tuning), PPO (Proximal Policy Optimization), GRPO (Group Relative Policy Optimization), and DPO (Direct Preference Optimization).
SFT
SFT is the foundational training method: it teaches a model to follow instructions from demonstration data.
Basic SFT Training
from adaptive_harmony.common.sft import SFT
from adaptive_harmony.metric_logger import WandbLogger
from adaptive_harmony.core.dataset import load_from_hf, convert_sample_dict
from adaptive_harmony.runtime import recipe_main, RecipeContext


@recipe_main
async def sft_training(ctx: RecipeContext):
    client = ctx.client
    assert ctx.model_to_train, "Model must be set for training"

    # Load dataset
    convert_sample_fn = convert_sample_dict("messages", "role", "content")
    dataset = load_from_hf("HuggingFaceH4/ultrachat_200k", "train_sft", convert_sample_fn)

    # Spawn model
    model = await client.model(ctx.model_to_train).tp(ctx.world_size).spawn_train("main", 4096)

    # Setup logging
    logger = WandbLogger("harmony-sft-training", "sft-training", "adaptive-ml")

    # Run SFT training
    sft = SFT(
        dataset,
        model,
        logger=logger,
        lr=1e-5,
        samples_per_batch=512,
        max_grad_norm=1.0,
    )
    await sft.run()
SFT Parameters
data_set: List of StringThread objects containing training examples
model: Training model instance
logger: Logger for tracking training metrics
lr: Learning rate
samples_per_batch: Batch size
max_grad_norm: Gradient clipping norm
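If you are not loading demonstrations from the Hugging Face Hub, you can also build the dataset by hand. The sketch below is illustrative only: it assumes StringThread takes a list of (role, content) tuples, as in the PPO and DPO examples later on this page, and that it can be imported from the package root.

from adaptive_harmony import StringThread  # assumed import path; adjust for your installation

# Each SFT example is a full conversation ending with the assistant turn to imitate
dataset = [
    StringThread([
        ("user", "Summarize the water cycle in one sentence."),
        ("assistant", "Water evaporates, condenses into clouds, and falls back as precipitation."),
    ]),
    StringThread([
        ("user", "Translate 'good morning' into French."),
        ("assistant", "Bonjour."),
    ]),
]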
PPO
PPO is a reinforcement learning algorithm that uses a reward function to improve model behavior.
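For orientation, the clip_range and kl_beta parameters listed below map onto the standard clipped PPO objective with a KL penalty toward the reference (starting) policy; Harmony's exact reward shaping may differ, but the general shape is:

$$
L(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big] - \beta\,\mathrm{KL}\big[\pi_\theta \,\|\, \pi_{\mathrm{ref}}\big],
\qquad
r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}
$$

Here epsilon corresponds to clip_range, beta to kl_beta, and the advantage estimate uses the value model.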
Basic PPO Training
from adaptive_harmony.common.ppo import PPO
from adaptive_harmony.metric_logger import WandbLogger
from adaptive_harmony.scoring.binary_judge_scorer import BinaryJudgeScorer
from adaptive_harmony.runtime import recipe_main, RecipeContext
from adaptive_harmony import StringThread  # assumed import path; adjust for your installation


@recipe_main
async def ppo_training(ctx: RecipeContext):
    client = ctx.client
    assert ctx.model_to_train, "Model must be set for training"

    # Prepare dataset (prompts only for PPO)
    dataset = [
        StringThread([("user", "What is machine learning?")]),
        StringThread([("user", "Explain quantum computing")]),
        # ... more prompts
    ]

    # Spawn models
    policy_model = await client.model(ctx.model_to_train).tp(ctx.world_size).spawn_train("policy", 4096)
    value_model = await client.model(ctx.model_to_train).into_scoring_model().tp(ctx.world_size).spawn_train("value", 4096)
    judge_model = await client.model("gpt-4o").tp(1).spawn_inference("judge")

    # Create scoring function
    criteria = "Evaluate whether the answer is harmful, offensive, or uses inappropriate language, including hate speech, violence, or explicit content."
    scorer = BinaryJudgeScorer(model=judge_model, criteria=criteria)

    # Setup logging
    logger = WandbLogger("harmony-ppo-training", "ppo-training", "adaptive-ml")

    # Run PPO training
    await PPO(
        dataset,
        policy_model,
        value_model,
        scoring_fn=scorer.score_without_metadata,
        logger=logger,
        lr_policy=0.75e-6,
        lr_value=1e-6,
        num_samples_per_batch=128,
        num_samples_per_mini_batch=128,
        max_grad_norm=1.0,
        clip_range=0.1,
        kl_beta=0.1,
        mini_epochs_per_batch=1,
        max_num_ppo_steps=100,
    ).run()
PPO Parameters
data_set: List of StringThread prompts
model: Policy model for training
value_model: Value model for advantage estimation
scoring_fn: Function that returns reward scores
lr_policy: Policy learning rate
lr_value: Value learning rate
kl_beta: KL divergence penalty coefficient
clip_range: PPO clipping range
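You are not limited to LLM judges: any function that maps a sampled completion to a reward can be passed as scoring_fn. The sketch below assumes the (StringThread) -> float signature used in the GRPO example further down; how to extract the generated text from a StringThread is version-specific, so str(response) is used here purely as a placeholder.

def length_reward(response: StringThread) -> float:
    # Toy reward: prefer answers close to a target length.
    # NOTE: str(response) is a stand-in for the appropriate StringThread accessor.
    text = str(response)
    target, tolerance = 400, 400  # characters
    return max(0.0, 1.0 - abs(len(text) - target) / tolerance)

# ...then pass scoring_fn=length_reward to PPO instead of scorer.score_without_metadata.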
GRPO
GRPO is similar to PPO but generates multiple completions per prompt and scores them relative to one another, which removes the need for a separate value model.
Basic GRPO Training
from adaptive_harmony.common.grpo import GRPO
from adaptive_harmony.metric_logger import WandbLogger
from adaptive_harmony.runtime import recipe_main, RecipeContext
from adaptive_harmony import StringThread  # assumed import path; adjust for your installation


@recipe_main
async def grpo_training(ctx: RecipeContext):
    client = ctx.client
    assert ctx.model_to_train, "Model must be set for training"

    # Prepare dataset
    dataset = [
        StringThread([("user", "Solve this math problem: 2x + 5 = 13")]),
        StringThread([("user", "What is the capital of France?")]),
        # ... more prompts
    ]

    # Spawn model
    policy_model = await client.model(ctx.model_to_train).tp(ctx.world_size).spawn_train("policy", 4096)

    # Create scoring function
    def scoring_fn(response: StringThread) -> float:
        # Implement your scoring logic
        # Return a score between 0 and 1
        return 0.8  # Example score

    # Setup logging
    logger = WandbLogger("harmony-grpo-training", "grpo-training", "adaptive-ml")

    # Run GRPO training
    await GRPO(
        dataset,
        policy_model,
        scoring_fn=scoring_fn,
        logger=logger,
        lr=7.5e-7,
        num_samples_per_batch=128,
        num_samples_per_mini_batch=128,
        max_grad_norm=1.0,
        clip_range=0.1,
        kl_beta=0.1,
        mini_epochs_per_batch=1,
        max_num_grpo_steps=100,
        completions_per_sample=8,
    ).run()
GRPO Parameters
data_set: List of StringThread prompts
model: Training model
scoring_fn: Function that returns reward scores
completions_per_sample: Number of completions generated per prompt
lr: Learning rate
kl_beta: KL divergence penalty coefficient
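To make the relative ranking concrete: conceptually, GRPO scores all completions_per_sample completions generated for a prompt and normalizes each score against the group's mean and standard deviation, so a completion is rewarded for beating its siblings rather than for its absolute score. A minimal sketch of that idea (illustrative only, not Harmony's internals):

from statistics import mean, pstdev

def group_relative_advantages(group_scores: list[float]) -> list[float]:
    # Normalize raw scores within one prompt's group of completions
    mu = mean(group_scores)
    sigma = pstdev(group_scores)
    if sigma == 0:
        return [0.0 for _ in group_scores]  # identical scores carry no ranking signal
    return [(s - mu) / sigma for s in group_scores]

# e.g. 8 completions for one prompt, scored by scoring_fn
print(group_relative_advantages([0.2, 0.9, 0.5, 0.5, 0.1, 0.8, 0.7, 0.3]))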
DPO
DPO trains models using preference data (preferred vs non-preferred responses) without explicit reward modeling.
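For reference, the standard DPO loss for a pair in which y_w is preferred over y_l given prompt x is shown below; the beta parameter listed later on this page scales how strongly the policy is pushed away from the frozen reference model:

$$
\mathcal{L}_{\mathrm{DPO}}(\theta) = -\log \sigma\!\Big(\beta\Big[\log\frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} - \log\frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\Big]\Big)
$$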
Basic DPO Training
from adaptive_harmony.common.dpo import DPO
from adaptive_harmony.metric_logger import WandbLogger
from adaptive_harmony.runtime import recipe_main, RecipeContext
from adaptive_harmony import StringThread  # assumed import path; adjust for your installation


@recipe_main
async def dpo_training(ctx: RecipeContext):
    client = ctx.client
    assert ctx.model_to_train, "Model must be set for training"

    # Prepare preference dataset: (preferred_response, non_preferred_response)
    dataset = [
        (
            StringThread([
                ("user", "What is AI?"),
                ("assistant", "AI is artificial intelligence that enables machines to perform tasks requiring human intelligence."),
            ]),
            StringThread([
                ("user", "What is AI?"),
                ("assistant", "AI is a computer thing."),
            ]),
        ),
        # ... more preference pairs
    ]

    # Spawn model
    model = await client.model(ctx.model_to_train).tp(1).spawn_train("main", 4096)

    # Setup logging
    logger = WandbLogger("harmony-dpo-training", "dpo-training", "adaptive-ml")

    # Run DPO training
    await DPO(
        dataset,
        model,
        logger=logger,
        lr=1e-4,
        samples_per_batch=32,
        max_grad_norm=1.0,
        beta=0.1,
    ).run()
DPO Parameters
data_set: List of tuples containing (preferred_response, non_preferred_response)
model: Training model
logger: Logger for tracking metrics
lr: Learning rate
samples_per_batch: Batch size
beta: DPO beta coefficient; higher values keep the trained policy closer to the reference model
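In practice the preference pairs usually come from a dataset rather than being written inline. The sketch below converts rows with prompt/chosen/rejected fields into the (preferred, non_preferred) tuples DPO expects; the field names and the StringThread import path are assumptions, not part of the Harmony API.

from adaptive_harmony import StringThread  # assumed import path; adjust for your installation

def to_preference_pair(row: dict) -> tuple[StringThread, StringThread]:
    # Turn one {"prompt", "chosen", "rejected"} row into a (preferred, non_preferred) pair
    preferred = StringThread([("user", row["prompt"]), ("assistant", row["chosen"])])
    non_preferred = StringThread([("user", row["prompt"]), ("assistant", row["rejected"])])
    return preferred, non_preferred

rows = [
    {"prompt": "What is AI?", "chosen": "AI is the field of building systems that perform tasks requiring intelligence.", "rejected": "AI is a computer thing."},
    # ... more rows
]
dataset = [to_preference_pair(r) for r in rows]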