amd
/

AMD-OLMo

Text Generation

Model card Files Files and versions

AMD-OLMo / AMD-OLMo-1B-SFT-1st-phase.yaml

Prakamya Mishra

Upload README and Scripts

27651a9 over 1 year ago

history blame contribute delete

2.73 kB

	run_name: AMD-OLMo-1B-SFT-1st-phase
	seed: 6198
	dry_run: false

	wandb:
	name: ${run_name}
	project: AMD-OLMo
	group: SFT

	model:
	d_model: 2048
	n_heads: 16
	n_layers: 16
	mlp_ratio: 8
	weight_tying: true
	alibi: false
	rope: true
	flash_attention: false
	attention_dropout: 0.0
	attention_layer_norm: false
	multi_query_attention: false
	include_bias: false
	block_type: sequential
	layer_norm_type: default
	layer_norm_with_affine: false
	bias_for_layer_norm: false
	attention_layer_norm_with_affine: false
	activation_type: swiglu
	residual_dropout: 0.0
	embedding_dropout: 0.0
	max_sequence_length: 2048
	vocab_size: 50280
	embedding_size: 50304
	eos_token_id: 50279
	pad_token_id: 1
	init_device: meta
	init_fn: mitchell

	compile:
	fullgraph: false

	optimizer:
	name: adamw
	learning_rate: 2.0e-5
	weight_decay: 0
	betas:
	- 0.9
	- 0.95
	metrics_log_interval: 10

	scheduler:
	name: linear_with_warmup
	t_warmup: 200
	alpha_f: 0.001

	tokenizer:
	identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
	truncate_direction: right

	save_folder: ./outputs/${run_name}/
	save_overwrite: true
	# Sharded checkpoints (best for restarts)
	save_interval: 1000
	save_num_checkpoints_to_keep: -1
	# Unsharded checkpoints (for final storage)
	save_interval_unsharded: 10000
	save_num_unsharded_checkpoints_to_keep: -1

	load_path: path_to_unsharded_pretrain_checkpoint
	reset_trainer_state: true

	max_duration: 3ep # train 3 epochs
	global_train_batch_size: 128
	device_train_microbatch_size: 8

	precision: amp_bf16

	fsdp:
	wrapping_strategy: null
	precision: mixed

	max_grad_norm: 1.0
	max_grad_norm_ratio: null

	speed_monitor:
	window_size: 20

	eval_interval: ${save_interval}
	eval_subset_num_batches: -1
	device_eval_batch_size: ${device_train_microbatch_size}
	evaluators:
	- label: piqa
	type: downstream

	- label: hellaswag
	type: downstream

	- label: winogrande
	type: downstream

	- label: openbook_qa
	type: downstream

	# - label: boolq # requires implemention of the pmi_dc matrix
	# type: downstream

	- label: sciq
	type: downstream

	- label: arc_easy
	type: downstream

	# - label: arc_challenge # requires implemention of the pmi_dc matrix
	# type: downstream

	- label: copa
	type: downstream

	- label: rte
	type: downstream

	- label: commitment_bank
	type: downstream

	- label: mrpc
	type: downstream

	- label: sst2
	type: downstream

	data:
	pad_direction: right
	num_workers: 0
	drop_last: true
	pin_memory: true
	prefetch_factor: 1
	persistent_workers: true
	timeout: 0
	generate_attention_mask: true
	paths:
	- ./datasets/tulu/input_ids.npy
	label_mask_paths:
	- ./datasets/tulu/label_mask.npy