---
# Training run configuration. `${...}` values are interpolations that refer to
# other keys in this file (presumably resolved by an OmegaConf-style loader —
# confirm against the consuming trainer).
#
# NOTE(review): nesting was reconstructed from the stanza boundaries of a
# flattened copy of this file; verify section membership against the trainer's
# config schema.

run_name: AMD-OLMo-1B-SFT-1st-phase
seed: 6198
dry_run: false

# Experiment-tracking metadata; the run is named after `run_name`.
wandb:
  name: ${run_name}
  project: AMD-OLMo
  group: SFT

# Model architecture and initialization.
model:
  d_model: 2048
  n_heads: 16
  n_layers: 16
  mlp_ratio: 8
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: sequential
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  vocab_size: 50280
  # Larger than vocab_size (50304 = 393 * 128); presumably padded for
  # alignment — confirm.
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: mitchell

compile:
  fullgraph: false

optimizer:
  name: adamw
  learning_rate: 2.0e-5
  weight_decay: 0
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: linear_with_warmup
  t_warmup: 200
  alpha_f: 0.001

tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right

# Checkpoint output. Two save cadences are configured: `save_interval` and
# `save_interval_unsharded`, each with its own keep-count.
save_folder: ./outputs/${run_name}/
save_overwrite: true

save_interval: 1000
# NOTE(review): -1 presumably means "keep every checkpoint" — confirm.
save_num_checkpoints_to_keep: -1

save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1

# NOTE(review): this value looks like a placeholder rather than a real path;
# replace it with the actual checkpoint location before launching.
load_path: path_to_unsharded_pretrain_checkpoint
reset_trainer_state: true

max_duration: 3ep
global_train_batch_size: 128
device_train_microbatch_size: 8

precision: amp_bf16

fsdp:
  wrapping_strategy: null
  precision: mixed

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

# Evaluation cadence follows `save_interval`; the per-device eval batch size
# follows `device_train_microbatch_size`.
eval_interval: ${save_interval}
# NOTE(review): -1 presumably means "evaluate on all batches" — confirm.
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
  - label: piqa
    type: downstream

  - label: hellaswag
    type: downstream

  - label: winogrande
    type: downstream

  - label: openbook_qa
    type: downstream

  - label: sciq
    type: downstream

  - label: arc_easy
    type: downstream

  - label: copa
    type: downstream

  - label: rte
    type: downstream

  - label: commitment_bank
    type: downstream

  - label: mrpc
    type: downstream

  - label: sst2
    type: downstream

# Training data: token ids plus a parallel label-mask file.
data:
  pad_direction: right
  # NOTE(review): with num_workers 0, torch.utils.data.DataLoader rejects a
  # non-null prefetch_factor and persistent_workers=true if these values are
  # passed through unchanged — confirm the trainer overrides them in that case.
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 1
  persistent_workers: true
  timeout: 0
  generate_attention_mask: true
  paths:
    - ./datasets/tulu/input_ids.npy
  label_mask_paths:
    - ./datasets/tulu/label_mask.npy