Upload routing tuning test outputs 2026-04-07
Browse files- routing_tuning_test_07_04/N_2.5/.hydra/config.yaml +53 -0
- routing_tuning_test_07_04/N_2.5/.hydra/hydra.yaml +162 -0
- routing_tuning_test_07_04/N_2.5/.hydra/overrides.yaml +2 -0
- routing_tuning_test_07_04/N_2.5/model_best.pt +3 -0
- routing_tuning_test_07_04/N_2.5/model_final.pt +3 -0
- routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_0.pt +3 -0
- routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_781.pt +3 -0
- routing_tuning_test_07_04/N_2.5/train.log +386 -0
- routing_tuning_test_07_04/N_4.0/.hydra/config.yaml +53 -0
- routing_tuning_test_07_04/N_4.0/.hydra/hydra.yaml +162 -0
- routing_tuning_test_07_04/N_4.0/.hydra/overrides.yaml +2 -0
- routing_tuning_test_07_04/N_4.0/model_best.pt +3 -0
- routing_tuning_test_07_04/N_4.0/model_final.pt +3 -0
- routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_0.pt +3 -0
- routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_781.pt +3 -0
- routing_tuning_test_07_04/N_4.0/train.log +386 -0
- routing_tuning_test_07_04/N_6.0/.hydra/config.yaml +53 -0
- routing_tuning_test_07_04/N_6.0/.hydra/hydra.yaml +162 -0
- routing_tuning_test_07_04/N_6.0/.hydra/overrides.yaml +2 -0
- routing_tuning_test_07_04/N_6.0/model_best.pt +3 -0
- routing_tuning_test_07_04/N_6.0/model_final.pt +3 -0
- routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_0.pt +3 -0
- routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_781.pt +3 -0
- routing_tuning_test_07_04/N_6.0/train.log +386 -0
- routing_tuning_test_07_04/N_8.0/.hydra/config.yaml +53 -0
- routing_tuning_test_07_04/N_8.0/.hydra/hydra.yaml +162 -0
- routing_tuning_test_07_04/N_8.0/.hydra/overrides.yaml +2 -0
- routing_tuning_test_07_04/N_8.0/model_best.pt +3 -0
- routing_tuning_test_07_04/N_8.0/model_final.pt +3 -0
- routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_0.pt +3 -0
- routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_781.pt +3 -0
- routing_tuning_test_07_04/N_8.0/train.log +386 -0
routing_tuning_test_07_04/N_2.5/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
|
| 3 |
+
checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 4 |
+
training:
|
| 5 |
+
epochs: 1
|
| 6 |
+
max_steps: null
|
| 7 |
+
batch_size: 8
|
| 8 |
+
eval_batch_size: 24
|
| 9 |
+
gradient_accumulation_steps: 4
|
| 10 |
+
lr: 0.0001
|
| 11 |
+
weight_decay: 0.1
|
| 12 |
+
betas:
|
| 13 |
+
- 0.9
|
| 14 |
+
- 0.95
|
| 15 |
+
eps: 1.0e-08
|
| 16 |
+
lr_scheduler: wsd
|
| 17 |
+
warmup_ratio: 0.1
|
| 18 |
+
decay_ratio: 0.2
|
| 19 |
+
warmup_steps: 100
|
| 20 |
+
min_lr_ratio: 0.1
|
| 21 |
+
lr_multiplier:
|
| 22 |
+
- 2.0
|
| 23 |
+
- 1.5
|
| 24 |
+
- 1.0
|
| 25 |
+
load_balancing_weight: 0.05
|
| 26 |
+
load_balancing_N: 2.5
|
| 27 |
+
max_grad_norm: 1.0
|
| 28 |
+
use_amp: true
|
| 29 |
+
resume: false
|
| 30 |
+
resume_checkpoint: null
|
| 31 |
+
warmup_model: true
|
| 32 |
+
data:
|
| 33 |
+
path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
|
| 34 |
+
max_context_len: 4096
|
| 35 |
+
max_target_len: 256
|
| 36 |
+
num_workers: 0
|
| 37 |
+
pin_memory: true
|
| 38 |
+
max_train_samples: 50000
|
| 39 |
+
max_val_samples: null
|
| 40 |
+
logging:
|
| 41 |
+
log_interval: 10
|
| 42 |
+
save_interval: 1000
|
| 43 |
+
eval_interval: 250
|
| 44 |
+
save_every_epoch: false
|
| 45 |
+
model_only_checkpoints: true
|
| 46 |
+
tracking:
|
| 47 |
+
enabled: true
|
| 48 |
+
project: routing-evolution
|
| 49 |
+
run_name: routing_N2.5
|
| 50 |
+
paths:
|
| 51 |
+
output_dir: outputs/N_${training.load_balancing_N}
|
| 52 |
+
seed: 42
|
| 53 |
+
device: cuda
|
routing_tuning_test_07_04/N_2.5/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${paths.output_dir}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- training.load_balancing_N=2.5
|
| 116 |
+
- tracking.run_name=routing_N2.5
|
| 117 |
+
job:
|
| 118 |
+
name: train
|
| 119 |
+
chdir: false
|
| 120 |
+
override_dirname: tracking.run_name=routing_N2.5,training.load_balancing_N=2.5
|
| 121 |
+
id: ???
|
| 122 |
+
num: ???
|
| 123 |
+
config_name: config
|
| 124 |
+
env_set: {}
|
| 125 |
+
env_copy: []
|
| 126 |
+
config:
|
| 127 |
+
override_dirname:
|
| 128 |
+
kv_sep: '='
|
| 129 |
+
item_sep: ','
|
| 130 |
+
exclude_keys: []
|
| 131 |
+
runtime:
|
| 132 |
+
version: 1.3.2
|
| 133 |
+
version_base: '1.3'
|
| 134 |
+
cwd: /workspace/byte-llms-code/routing_evolution_exp
|
| 135 |
+
config_sources:
|
| 136 |
+
- path: hydra.conf
|
| 137 |
+
schema: pkg
|
| 138 |
+
provider: hydra
|
| 139 |
+
- path: /workspace/byte-llms-code/routing_evolution_exp/configs
|
| 140 |
+
schema: file
|
| 141 |
+
provider: main
|
| 142 |
+
- path: ''
|
| 143 |
+
schema: structured
|
| 144 |
+
provider: schema
|
| 145 |
+
output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_2.5
|
| 146 |
+
choices:
|
| 147 |
+
paths: default
|
| 148 |
+
tracking: default
|
| 149 |
+
logging: default
|
| 150 |
+
data: default
|
| 151 |
+
training: default
|
| 152 |
+
model: hnet_xl_code
|
| 153 |
+
hydra/env: default
|
| 154 |
+
hydra/callbacks: null
|
| 155 |
+
hydra/job_logging: default
|
| 156 |
+
hydra/hydra_logging: default
|
| 157 |
+
hydra/hydra_help: default
|
| 158 |
+
hydra/help: default
|
| 159 |
+
hydra/sweeper: basic
|
| 160 |
+
hydra/launcher: basic
|
| 161 |
+
hydra/output: default
|
| 162 |
+
verbose: false
|
routing_tuning_test_07_04/N_2.5/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- training.load_balancing_N=2.5
|
| 2 |
+
- tracking.run_name=routing_N2.5
|
routing_tuning_test_07_04/N_2.5/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56ffd4eec1c3270b45b6fe7338584cac9f6a57aee6737fc373ffd7b3e5731461
|
| 3 |
+
size 3315165139
|
routing_tuning_test_07_04/N_2.5/model_final.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba7c46532c2086aa0a0a61a36a91b28c01addcff63728d9a90892e702af44611
|
| 3 |
+
size 3315165484
|
routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
|
| 3 |
+
size 13633736
|
routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_781.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c02d24288839c9545a03982a22732394ed5d2a755a9f14de1eb8df2038f79d8f
|
| 3 |
+
size 13633752
|
routing_tuning_test_07_04/N_2.5/train.log
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-04-07 17:39:50] CUDA_VISIBLE_DEVICES: 0,1
|
| 2 |
+
[2026-04-07 17:39:50] Number of processes: 2
|
| 3 |
+
[2026-04-07 17:39:50] Mixed precision: bf16
|
| 4 |
+
[2026-04-07 17:39:50] ============================================================
|
| 5 |
+
[2026-04-07 17:39:50] Routing Evolution Experiment | N=2.5
|
| 6 |
+
[2026-04-07 17:39:50] ============================================================
|
| 7 |
+
[2026-04-07 17:39:50] Config:
|
| 8 |
+
model:
|
| 9 |
+
config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
|
| 10 |
+
checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 11 |
+
training:
|
| 12 |
+
epochs: 1
|
| 13 |
+
max_steps: null
|
| 14 |
+
batch_size: 8
|
| 15 |
+
eval_batch_size: 24
|
| 16 |
+
gradient_accumulation_steps: 4
|
| 17 |
+
lr: 0.0001
|
| 18 |
+
weight_decay: 0.1
|
| 19 |
+
betas:
|
| 20 |
+
- 0.9
|
| 21 |
+
- 0.95
|
| 22 |
+
eps: 1.0e-08
|
| 23 |
+
lr_scheduler: wsd
|
| 24 |
+
warmup_ratio: 0.1
|
| 25 |
+
decay_ratio: 0.2
|
| 26 |
+
warmup_steps: 100
|
| 27 |
+
min_lr_ratio: 0.1
|
| 28 |
+
lr_multiplier:
|
| 29 |
+
- 2.0
|
| 30 |
+
- 1.5
|
| 31 |
+
- 1.0
|
| 32 |
+
load_balancing_weight: 0.05
|
| 33 |
+
load_balancing_N: 2.5
|
| 34 |
+
max_grad_norm: 1.0
|
| 35 |
+
use_amp: true
|
| 36 |
+
resume: false
|
| 37 |
+
resume_checkpoint: null
|
| 38 |
+
warmup_model: true
|
| 39 |
+
data:
|
| 40 |
+
path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
|
| 41 |
+
max_context_len: 4096
|
| 42 |
+
max_target_len: 256
|
| 43 |
+
num_workers: 0
|
| 44 |
+
pin_memory: true
|
| 45 |
+
max_train_samples: 50000
|
| 46 |
+
max_val_samples: null
|
| 47 |
+
logging:
|
| 48 |
+
log_interval: 10
|
| 49 |
+
save_interval: 1000
|
| 50 |
+
eval_interval: 250
|
| 51 |
+
save_every_epoch: false
|
| 52 |
+
model_only_checkpoints: true
|
| 53 |
+
tracking:
|
| 54 |
+
enabled: true
|
| 55 |
+
project: routing-evolution
|
| 56 |
+
run_name: routing_N2.5
|
| 57 |
+
paths:
|
| 58 |
+
output_dir: outputs/N_2.5
|
| 59 |
+
seed: 42
|
| 60 |
+
device: cuda
|
| 61 |
+
|
| 62 |
+
[2026-04-07 17:39:51] Loading model...
|
| 63 |
+
[2026-04-07 17:39:57] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 64 |
+
[2026-04-07 17:39:57] Applied LR multipliers: [2.0, 1.5, 1.0]
|
| 65 |
+
[2026-04-07 17:39:57] Warming up model...
|
| 66 |
+
[2026-04-07 17:40:43] Total params: 1,654,090,112
|
| 67 |
+
[2026-04-07 17:40:43] Trainable params: 1,654,090,112
|
| 68 |
+
[2026-04-07 17:40:43] Creating dataloaders...
|
| 69 |
+
[2026-04-07 17:40:43] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
|
| 70 |
+
[2026-04-07 17:40:43] Max steps: 781, Steps per epoch: 3125
|
| 71 |
+
[2026-04-07 17:40:45] Starting training...
|
| 72 |
+
[2026-04-07 17:40:45]
|
| 73 |
+
============================================================
|
| 74 |
+
[2026-04-07 17:40:45] EPOCH 1/1 (step 0)
|
| 75 |
+
[2026-04-07 17:40:45] ============================================================
|
| 76 |
+
[2026-04-07 17:41:18] Epoch 1 | Step 10 | Loss: 0.6962 | LM: 0.6496 | LB: 1.0154 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
|
| 77 |
+
[2026-04-07 17:41:28] Epoch 1 | Step 20 | Loss: 0.5961 | LM: 0.5509 | LB: 1.0146 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.459 | LR: 5.62e-05
|
| 78 |
+
[2026-04-07 17:41:37] Epoch 1 | Step 30 | Loss: 0.5376 | LM: 0.4788 | LB: 1.0147 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.362 | HR1: 0.480/SR1: 0.458 | LR: 7.92e-05
|
| 79 |
+
[2026-04-07 17:41:47] Epoch 1 | Step 40 | Loss: 0.5016 | LM: 0.4479 | LB: 1.0154 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.478/SR1: 0.457 | LR: 1.00e-04
|
| 80 |
+
[2026-04-07 17:41:56] Epoch 1 | Step 50 | Loss: 0.4791 | LM: 0.4096 | LB: 1.0145 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.475/SR1: 0.455 | LR: 1.00e-04
|
| 81 |
+
[2026-04-07 17:42:06] Epoch 1 | Step 60 | Loss: 0.4529 | LM: 0.3812 | LB: 1.0148 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.477/SR1: 0.457 | LR: 1.00e-04
|
| 82 |
+
[2026-04-07 17:42:15] Epoch 1 | Step 70 | Loss: 0.4386 | LM: 0.3774 | LB: 1.0148 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.358 | HR1: 0.477/SR1: 0.456 | LR: 1.00e-04
|
| 83 |
+
[2026-04-07 17:42:24] Epoch 1 | Step 80 | Loss: 0.4239 | LM: 0.3666 | LB: 1.0146 | CL0: 2.8 | CL1: 2.1 | HR0: 0.360/SR0: 0.359 | HR1: 0.476/SR1: 0.456 | LR: 1.00e-04
|
| 84 |
+
[2026-04-07 17:42:33] Epoch 1 | Step 90 | Loss: 0.4154 | LM: 0.3536 | LB: 1.0142 | CL0: 2.8 | CL1: 2.1 | HR0: 0.361/SR0: 0.359 | HR1: 0.475/SR1: 0.454 | LR: 1.00e-04
|
| 85 |
+
[2026-04-07 17:42:42] Epoch 1 | Step 100 | Loss: 0.4104 | LM: 0.3510 | LB: 1.0146 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.475/SR1: 0.454 | LR: 1.00e-04
|
| 86 |
+
[2026-04-07 17:42:51] Epoch 1 | Step 110 | Loss: 0.4061 | LM: 0.3457 | LB: 1.0144 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.474/SR1: 0.454 | LR: 1.00e-04
|
| 87 |
+
[2026-04-07 17:43:00] Epoch 1 | Step 120 | Loss: 0.4037 | LM: 0.3472 | LB: 1.0143 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.474/SR1: 0.453 | LR: 1.00e-04
|
| 88 |
+
[2026-04-07 17:43:10] Epoch 1 | Step 130 | Loss: 0.3998 | LM: 0.3442 | LB: 1.0143 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.474/SR1: 0.453 | LR: 1.00e-04
|
| 89 |
+
[2026-04-07 17:43:19] Epoch 1 | Step 140 | Loss: 0.3955 | LM: 0.3397 | LB: 1.0140 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.472/SR1: 0.452 | LR: 1.00e-04
|
| 90 |
+
[2026-04-07 17:43:28] Epoch 1 | Step 150 | Loss: 0.3932 | LM: 0.3373 | LB: 1.0138 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.472/SR1: 0.451 | LR: 1.00e-04
|
| 91 |
+
[2026-04-07 17:43:37] Epoch 1 | Step 160 | Loss: 0.3889 | LM: 0.3319 | LB: 1.0137 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.355 | HR1: 0.471/SR1: 0.450 | LR: 1.00e-04
|
| 92 |
+
[2026-04-07 17:43:46] Epoch 1 | Step 170 | Loss: 0.3841 | LM: 0.3272 | LB: 1.0134 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.470/SR1: 0.450 | LR: 1.00e-04
|
| 93 |
+
[2026-04-07 17:43:55] Epoch 1 | Step 180 | Loss: 0.3806 | LM: 0.3231 | LB: 1.0133 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.470/SR1: 0.449 | LR: 1.00e-04
|
| 94 |
+
[2026-04-07 17:44:05] Epoch 1 | Step 190 | Loss: 0.3789 | LM: 0.3227 | LB: 1.0130 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.469/SR1: 0.448 | LR: 1.00e-04
|
| 95 |
+
[2026-04-07 17:44:14] Epoch 1 | Step 200 | Loss: 0.3763 | LM: 0.3198 | LB: 1.0130 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.469/SR1: 0.448 | LR: 1.00e-04
|
| 96 |
+
[2026-04-07 17:44:23] Epoch 1 | Step 210 | Loss: 0.3761 | LM: 0.3201 | LB: 1.0129 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.355 | HR1: 0.468/SR1: 0.447 | LR: 1.00e-04
|
| 97 |
+
[2026-04-07 17:44:33] Epoch 1 | Step 220 | Loss: 0.3752 | LM: 0.3180 | LB: 1.0128 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.468/SR1: 0.447 | LR: 1.00e-04
|
| 98 |
+
[2026-04-07 17:44:42] Epoch 1 | Step 230 | Loss: 0.3739 | LM: 0.3172 | LB: 1.0126 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.468/SR1: 0.446 | LR: 1.00e-04
|
| 99 |
+
[2026-04-07 17:44:51] Epoch 1 | Step 240 | Loss: 0.3718 | LM: 0.3183 | LB: 1.0126 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.467/SR1: 0.446 | LR: 1.00e-04
|
| 100 |
+
[2026-04-07 17:45:00] Epoch 1 | Step 250 | Loss: 0.3695 | LM: 0.3165 | LB: 1.0124 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.467/SR1: 0.445 | LR: 1.00e-04
|
| 101 |
+
[2026-04-07 17:45:01] Validation | Batch 10/732 | Loss: 0.3270 | LM: 0.2764
|
| 102 |
+
[2026-04-07 17:45:02] Validation | Batch 20/732 | Loss: 0.3483 | LM: 0.2979
|
| 103 |
+
[2026-04-07 17:45:04] Validation | Batch 30/732 | Loss: 0.3395 | LM: 0.2890
|
| 104 |
+
[2026-04-07 17:45:05] Validation | Batch 40/732 | Loss: 0.3440 | LM: 0.2936
|
| 105 |
+
[2026-04-07 17:45:06] Validation | Batch 50/732 | Loss: 0.3441 | LM: 0.2937
|
| 106 |
+
[2026-04-07 17:45:07] Validation | Batch 60/732 | Loss: 0.3466 | LM: 0.2962
|
| 107 |
+
[2026-04-07 17:45:08] Validation | Batch 70/732 | Loss: 0.3503 | LM: 0.2999
|
| 108 |
+
[2026-04-07 17:45:10] Validation | Batch 80/732 | Loss: 0.3487 | LM: 0.2983
|
| 109 |
+
[2026-04-07 17:45:11] Validation | Batch 90/732 | Loss: 0.3482 | LM: 0.2978
|
| 110 |
+
[2026-04-07 17:45:12] Validation | Batch 100/732 | Loss: 0.3493 | LM: 0.2989
|
| 111 |
+
[2026-04-07 17:45:13] Validation | Batch 110/732 | Loss: 0.3461 | LM: 0.2957
|
| 112 |
+
[2026-04-07 17:45:14] Validation | Batch 120/732 | Loss: 0.3494 | LM: 0.2990
|
| 113 |
+
[2026-04-07 17:45:16] Validation | Batch 130/732 | Loss: 0.3508 | LM: 0.3004
|
| 114 |
+
[2026-04-07 17:45:17] Validation | Batch 140/732 | Loss: 0.3502 | LM: 0.2998
|
| 115 |
+
[2026-04-07 17:45:18] Validation | Batch 150/732 | Loss: 0.3495 | LM: 0.2991
|
| 116 |
+
[2026-04-07 17:45:19] Validation | Batch 160/732 | Loss: 0.3485 | LM: 0.2981
|
| 117 |
+
[2026-04-07 17:45:20] Validation | Batch 170/732 | Loss: 0.3490 | LM: 0.2986
|
| 118 |
+
[2026-04-07 17:45:21] Validation | Batch 180/732 | Loss: 0.3501 | LM: 0.2997
|
| 119 |
+
[2026-04-07 17:45:22] Validation | Batch 190/732 | Loss: 0.3495 | LM: 0.2991
|
| 120 |
+
[2026-04-07 17:45:23] Validation | Batch 200/732 | Loss: 0.3496 | LM: 0.2992
|
| 121 |
+
[2026-04-07 17:45:24] Validation | Batch 210/732 | Loss: 0.3488 | LM: 0.2984
|
| 122 |
+
[2026-04-07 17:45:26] Validation | Batch 220/732 | Loss: 0.3482 | LM: 0.2978
|
| 123 |
+
[2026-04-07 17:45:27] Validation | Batch 230/732 | Loss: 0.3486 | LM: 0.2982
|
| 124 |
+
[2026-04-07 17:45:28] Validation | Batch 240/732 | Loss: 0.3483 | LM: 0.2979
|
| 125 |
+
[2026-04-07 17:45:30] Validation | Batch 250/732 | Loss: 0.3484 | LM: 0.2980
|
| 126 |
+
[2026-04-07 17:45:31] Validation | Batch 260/732 | Loss: 0.3474 | LM: 0.2970
|
| 127 |
+
[2026-04-07 17:45:32] Validation | Batch 270/732 | Loss: 0.3471 | LM: 0.2967
|
| 128 |
+
[2026-04-07 17:45:33] Validation | Batch 280/732 | Loss: 0.3460 | LM: 0.2956
|
| 129 |
+
[2026-04-07 17:45:34] Validation | Batch 290/732 | Loss: 0.3458 | LM: 0.2954
|
| 130 |
+
[2026-04-07 17:45:35] Validation | Batch 300/732 | Loss: 0.3457 | LM: 0.2953
|
| 131 |
+
[2026-04-07 17:45:36] Validation | Batch 310/732 | Loss: 0.3456 | LM: 0.2952
|
| 132 |
+
[2026-04-07 17:45:38] Validation | Batch 320/732 | Loss: 0.3447 | LM: 0.2943
|
| 133 |
+
[2026-04-07 17:45:39] Validation | Batch 330/732 | Loss: 0.3435 | LM: 0.2931
|
| 134 |
+
[2026-04-07 17:45:40] Validation | Batch 340/732 | Loss: 0.3429 | LM: 0.2925
|
| 135 |
+
[2026-04-07 17:45:41] Validation | Batch 350/732 | Loss: 0.3432 | LM: 0.2928
|
| 136 |
+
[2026-04-07 17:45:42] Validation | Batch 360/732 | Loss: 0.3441 | LM: 0.2937
|
| 137 |
+
[2026-04-07 17:45:43] Validation | Batch 370/732 | Loss: 0.3431 | LM: 0.2927
|
| 138 |
+
[2026-04-07 17:45:44] Validation | Batch 380/732 | Loss: 0.3424 | LM: 0.2920
|
| 139 |
+
[2026-04-07 17:45:45] Validation | Batch 390/732 | Loss: 0.3420 | LM: 0.2916
|
| 140 |
+
[2026-04-07 17:45:46] Validation | Batch 400/732 | Loss: 0.3418 | LM: 0.2914
|
| 141 |
+
[2026-04-07 17:45:47] Validation | Batch 410/732 | Loss: 0.3411 | LM: 0.2907
|
| 142 |
+
[2026-04-07 17:45:48] Validation | Batch 420/732 | Loss: 0.3413 | LM: 0.2909
|
| 143 |
+
[2026-04-07 17:45:50] Validation | Batch 430/732 | Loss: 0.3412 | LM: 0.2908
|
| 144 |
+
[2026-04-07 17:45:51] Validation | Batch 440/732 | Loss: 0.3407 | LM: 0.2903
|
| 145 |
+
[2026-04-07 17:45:52] Validation | Batch 450/732 | Loss: 0.3405 | LM: 0.2901
|
| 146 |
+
[2026-04-07 17:45:53] Validation | Batch 460/732 | Loss: 0.3409 | LM: 0.2904
|
| 147 |
+
[2026-04-07 17:45:54] Validation | Batch 470/732 | Loss: 0.3406 | LM: 0.2902
|
| 148 |
+
[2026-04-07 17:45:55] Validation | Batch 480/732 | Loss: 0.3408 | LM: 0.2904
|
| 149 |
+
[2026-04-07 17:45:57] Validation | Batch 490/732 | Loss: 0.3419 | LM: 0.2915
|
| 150 |
+
[2026-04-07 17:45:58] Validation | Batch 500/732 | Loss: 0.3429 | LM: 0.2925
|
| 151 |
+
[2026-04-07 17:45:59] Validation | Batch 510/732 | Loss: 0.3426 | LM: 0.2921
|
| 152 |
+
[2026-04-07 17:46:00] Validation | Batch 520/732 | Loss: 0.3423 | LM: 0.2919
|
| 153 |
+
[2026-04-07 17:46:01] Validation | Batch 530/732 | Loss: 0.3417 | LM: 0.2913
|
| 154 |
+
[2026-04-07 17:46:02] Validation | Batch 540/732 | Loss: 0.3419 | LM: 0.2915
|
| 155 |
+
[2026-04-07 17:46:03] Validation | Batch 550/732 | Loss: 0.3418 | LM: 0.2914
|
| 156 |
+
[2026-04-07 17:46:04] Validation | Batch 560/732 | Loss: 0.3414 | LM: 0.2909
|
| 157 |
+
[2026-04-07 17:46:06] Validation | Batch 570/732 | Loss: 0.3415 | LM: 0.2911
|
| 158 |
+
[2026-04-07 17:46:07] Validation | Batch 580/732 | Loss: 0.3412 | LM: 0.2908
|
| 159 |
+
[2026-04-07 17:46:08] Validation | Batch 590/732 | Loss: 0.3412 | LM: 0.2908
|
| 160 |
+
[2026-04-07 17:46:10] Validation | Batch 600/732 | Loss: 0.3411 | LM: 0.2907
|
| 161 |
+
[2026-04-07 17:46:11] Validation | Batch 610/732 | Loss: 0.3417 | LM: 0.2913
|
| 162 |
+
[2026-04-07 17:46:12] Validation | Batch 620/732 | Loss: 0.3421 | LM: 0.2916
|
| 163 |
+
[2026-04-07 17:46:13] Validation | Batch 630/732 | Loss: 0.3419 | LM: 0.2915
|
| 164 |
+
[2026-04-07 17:46:14] Validation | Batch 640/732 | Loss: 0.3416 | LM: 0.2912
|
| 165 |
+
[2026-04-07 17:46:16] Validation | Batch 650/732 | Loss: 0.3414 | LM: 0.2910
|
| 166 |
+
[2026-04-07 17:46:17] Validation | Batch 660/732 | Loss: 0.3419 | LM: 0.2915
|
| 167 |
+
[2026-04-07 17:46:18] Validation | Batch 670/732 | Loss: 0.3425 | LM: 0.2921
|
| 168 |
+
[2026-04-07 17:46:19] Validation | Batch 680/732 | Loss: 0.3424 | LM: 0.2920
|
| 169 |
+
[2026-04-07 17:46:20] Validation | Batch 690/732 | Loss: 0.3426 | LM: 0.2922
|
| 170 |
+
[2026-04-07 17:46:21] Validation | Batch 700/732 | Loss: 0.3431 | LM: 0.2927
|
| 171 |
+
[2026-04-07 17:46:22] Validation | Batch 710/732 | Loss: 0.3434 | LM: 0.2930
|
| 172 |
+
[2026-04-07 17:46:23] Validation | Batch 720/732 | Loss: 0.3444 | LM: 0.2940
|
| 173 |
+
[2026-04-07 17:46:25] Validation | Batch 730/732 | Loss: 0.3441 | LM: 0.2937
|
| 174 |
+
[2026-04-07 17:46:25] Validation | Batch 732/732 | Loss: 0.3439 | LM: 0.2935
|
| 175 |
+
[2026-04-07 17:46:25] Validation | Loss: 0.3439 | LM: 0.2935 | PPL: 1.34 | Time: 84.70s
|
| 176 |
+
[2026-04-07 17:46:27] New best model saved! Val loss: 0.3439
|
| 177 |
+
[2026-04-07 17:46:37] Epoch 1 | Step 260 | Loss: 0.3676 | LM: 0.3155 | LB: 1.0124 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.467/SR1: 0.445 | LR: 1.00e-04
|
| 178 |
+
[2026-04-07 17:46:46] Epoch 1 | Step 270 | Loss: 0.3676 | LM: 0.3137 | LB: 1.0123 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.466/SR1: 0.445 | LR: 1.00e-04
|
| 179 |
+
[2026-04-07 17:46:55] Epoch 1 | Step 280 | Loss: 0.3674 | LM: 0.3138 | LB: 1.0122 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.356 | HR1: 0.466/SR1: 0.444 | LR: 1.00e-04
|
| 180 |
+
[2026-04-07 17:47:04] Epoch 1 | Step 290 | Loss: 0.3657 | LM: 0.3118 | LB: 1.0121 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.465/SR1: 0.444 | LR: 1.00e-04
|
| 181 |
+
[2026-04-07 17:47:13] Epoch 1 | Step 300 | Loss: 0.3650 | LM: 0.3113 | LB: 1.0119 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.465/SR1: 0.443 | LR: 1.00e-04
|
| 182 |
+
[2026-04-07 17:47:23] Epoch 1 | Step 310 | Loss: 0.3641 | LM: 0.3103 | LB: 1.0117 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.464/SR1: 0.443 | LR: 1.00e-04
|
| 183 |
+
[2026-04-07 17:47:32] Epoch 1 | Step 320 | Loss: 0.3635 | LM: 0.3098 | LB: 1.0116 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.464/SR1: 0.443 | LR: 9.80e-05
|
| 184 |
+
[2026-04-07 17:47:41] Epoch 1 | Step 330 | Loss: 0.3622 | LM: 0.3088 | LB: 1.0115 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.463/SR1: 0.442 | LR: 8.93e-05
|
| 185 |
+
[2026-04-07 17:47:50] Epoch 1 | Step 340 | Loss: 0.3613 | LM: 0.3080 | LB: 1.0114 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.463/SR1: 0.442 | LR: 7.51e-05
|
| 186 |
+
[2026-04-07 17:47:59] Epoch 1 | Step 350 | Loss: 0.3609 | LM: 0.3085 | LB: 1.0113 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 5.77e-05
|
| 187 |
+
[2026-04-07 17:48:09] Epoch 1 | Step 360 | Loss: 0.3602 | LM: 0.3100 | LB: 1.0113 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 3.99e-05
|
| 188 |
+
[2026-04-07 17:48:18] Epoch 1 | Step 370 | Loss: 0.3597 | LM: 0.3096 | LB: 1.0112 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 2.45e-05
|
| 189 |
+
[2026-04-07 17:48:27] Epoch 1 | Step 380 | Loss: 0.3587 | LM: 0.3077 | LB: 1.0112 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 1.40e-05
|
| 190 |
+
[2026-04-07 17:48:36] Epoch 1 | Step 390 | Loss: 0.3589 | LM: 0.3073 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
|
| 191 |
+
[2026-04-07 17:48:45] Epoch 1 | Step 400 | Loss: 0.3588 | LM: 0.3082 | LB: 1.0111 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
|
| 192 |
+
[2026-04-07 17:48:54] Epoch 1 | Step 410 | Loss: 0.3587 | LM: 0.3081 | LB: 1.0111 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
|
| 193 |
+
[2026-04-07 17:49:03] Epoch 1 | Step 420 | Loss: 0.3588 | LM: 0.3074 | LB: 1.0111 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
|
| 194 |
+
[2026-04-07 17:49:12] Epoch 1 | Step 430 | Loss: 0.3582 | LM: 0.3058 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
|
| 195 |
+
[2026-04-07 17:49:21] Epoch 1 | Step 440 | Loss: 0.3574 | LM: 0.3049 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
|
| 196 |
+
[2026-04-07 17:49:31] Epoch 1 | Step 450 | Loss: 0.3568 | LM: 0.3036 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.461/SR1: 0.440 | LR: 1.00e-05
|
| 197 |
+
[2026-04-07 17:49:40] Epoch 1 | Step 460 | Loss: 0.3569 | LM: 0.3027 | LB: 1.0109 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.461/SR1: 0.439 | LR: 1.00e-05
|
| 198 |
+
[2026-04-07 17:49:49] Epoch 1 | Step 470 | Loss: 0.3567 | LM: 0.3035 | LB: 1.0108 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.461/SR1: 0.439 | LR: 1.00e-05
|
| 199 |
+
[2026-04-07 17:49:58] Epoch 1 | Step 480 | Loss: 0.3563 | LM: 0.3028 | LB: 1.0108 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.461/SR1: 0.439 | LR: 1.00e-05
|
| 200 |
+
[2026-04-07 17:50:07] Epoch 1 | Step 490 | Loss: 0.3556 | LM: 0.3017 | LB: 1.0107 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-05
|
| 201 |
+
[2026-04-07 17:50:16] Epoch 1 | Step 500 | Loss: 0.3558 | LM: 0.3020 | LB: 1.0107 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-05
|
| 202 |
+
[2026-04-07 17:50:17] Validation | Batch 10/732 | Loss: 0.3237 | LM: 0.2732
|
| 203 |
+
[2026-04-07 17:50:19] Validation | Batch 20/732 | Loss: 0.3455 | LM: 0.2951
|
| 204 |
+
[2026-04-07 17:50:20] Validation | Batch 30/732 | Loss: 0.3370 | LM: 0.2866
|
| 205 |
+
[2026-04-07 17:50:21] Validation | Batch 40/732 | Loss: 0.3413 | LM: 0.2909
|
| 206 |
+
[2026-04-07 17:50:22] Validation | Batch 50/732 | Loss: 0.3412 | LM: 0.2908
|
| 207 |
+
[2026-04-07 17:50:23] Validation | Batch 60/732 | Loss: 0.3432 | LM: 0.2928
|
| 208 |
+
[2026-04-07 17:50:25] Validation | Batch 70/732 | Loss: 0.3468 | LM: 0.2964
|
| 209 |
+
[2026-04-07 17:50:26] Validation | Batch 80/732 | Loss: 0.3452 | LM: 0.2948
|
| 210 |
+
[2026-04-07 17:50:27] Validation | Batch 90/732 | Loss: 0.3449 | LM: 0.2945
|
| 211 |
+
[2026-04-07 17:50:28] Validation | Batch 100/732 | Loss: 0.3460 | LM: 0.2956
|
| 212 |
+
[2026-04-07 17:50:30] Validation | Batch 110/732 | Loss: 0.3428 | LM: 0.2924
|
| 213 |
+
[2026-04-07 17:50:31] Validation | Batch 120/732 | Loss: 0.3460 | LM: 0.2956
|
| 214 |
+
[2026-04-07 17:50:32] Validation | Batch 130/732 | Loss: 0.3473 | LM: 0.2969
|
| 215 |
+
[2026-04-07 17:50:33] Validation | Batch 140/732 | Loss: 0.3468 | LM: 0.2964
|
| 216 |
+
[2026-04-07 17:50:34] Validation | Batch 150/732 | Loss: 0.3461 | LM: 0.2957
|
| 217 |
+
[2026-04-07 17:50:35] Validation | Batch 160/732 | Loss: 0.3452 | LM: 0.2949
|
| 218 |
+
[2026-04-07 17:50:36] Validation | Batch 170/732 | Loss: 0.3458 | LM: 0.2954
|
| 219 |
+
[2026-04-07 17:50:38] Validation | Batch 180/732 | Loss: 0.3470 | LM: 0.2966
|
| 220 |
+
[2026-04-07 17:50:39] Validation | Batch 190/732 | Loss: 0.3464 | LM: 0.2960
|
| 221 |
+
[2026-04-07 17:50:40] Validation | Batch 200/732 | Loss: 0.3464 | LM: 0.2961
|
| 222 |
+
[2026-04-07 17:50:41] Validation | Batch 210/732 | Loss: 0.3457 | LM: 0.2953
|
| 223 |
+
[2026-04-07 17:50:42] Validation | Batch 220/732 | Loss: 0.3452 | LM: 0.2948
|
| 224 |
+
[2026-04-07 17:50:43] Validation | Batch 230/732 | Loss: 0.3456 | LM: 0.2952
|
| 225 |
+
[2026-04-07 17:50:45] Validation | Batch 240/732 | Loss: 0.3453 | LM: 0.2949
|
| 226 |
+
[2026-04-07 17:50:46] Validation | Batch 250/732 | Loss: 0.3452 | LM: 0.2949
|
| 227 |
+
[2026-04-07 17:50:47] Validation | Batch 260/732 | Loss: 0.3442 | LM: 0.2938
|
| 228 |
+
[2026-04-07 17:50:48] Validation | Batch 270/732 | Loss: 0.3440 | LM: 0.2937
|
| 229 |
+
[2026-04-07 17:50:49] Validation | Batch 280/732 | Loss: 0.3430 | LM: 0.2926
|
| 230 |
+
[2026-04-07 17:50:50] Validation | Batch 290/732 | Loss: 0.3427 | LM: 0.2923
|
| 231 |
+
[2026-04-07 17:50:51] Validation | Batch 300/732 | Loss: 0.3426 | LM: 0.2923
|
| 232 |
+
[2026-04-07 17:50:53] Validation | Batch 310/732 | Loss: 0.3425 | LM: 0.2921
|
| 233 |
+
[2026-04-07 17:50:54] Validation | Batch 320/732 | Loss: 0.3416 | LM: 0.2912
|
| 234 |
+
[2026-04-07 17:50:55] Validation | Batch 330/732 | Loss: 0.3405 | LM: 0.2901
|
| 235 |
+
[2026-04-07 17:50:56] Validation | Batch 340/732 | Loss: 0.3399 | LM: 0.2895
|
| 236 |
+
[2026-04-07 17:50:57] Validation | Batch 350/732 | Loss: 0.3403 | LM: 0.2899
|
| 237 |
+
[2026-04-07 17:50:58] Validation | Batch 360/732 | Loss: 0.3411 | LM: 0.2907
|
| 238 |
+
[2026-04-07 17:50:59] Validation | Batch 370/732 | Loss: 0.3401 | LM: 0.2898
|
| 239 |
+
[2026-04-07 17:51:00] Validation | Batch 380/732 | Loss: 0.3395 | LM: 0.2891
|
| 240 |
+
[2026-04-07 17:51:01] Validation | Batch 390/732 | Loss: 0.3391 | LM: 0.2887
|
| 241 |
+
[2026-04-07 17:51:02] Validation | Batch 400/732 | Loss: 0.3389 | LM: 0.2885
|
| 242 |
+
[2026-04-07 17:51:03] Validation | Batch 410/732 | Loss: 0.3382 | LM: 0.2878
|
| 243 |
+
[2026-04-07 17:51:05] Validation | Batch 420/732 | Loss: 0.3384 | LM: 0.2880
|
| 244 |
+
[2026-04-07 17:51:06] Validation | Batch 430/732 | Loss: 0.3383 | LM: 0.2879
|
| 245 |
+
[2026-04-07 17:51:07] Validation | Batch 440/732 | Loss: 0.3378 | LM: 0.2875
|
| 246 |
+
[2026-04-07 17:51:08] Validation | Batch 450/732 | Loss: 0.3376 | LM: 0.2873
|
| 247 |
+
[2026-04-07 17:51:10] Validation | Batch 460/732 | Loss: 0.3380 | LM: 0.2876
|
| 248 |
+
[2026-04-07 17:51:11] Validation | Batch 470/732 | Loss: 0.3378 | LM: 0.2874
|
| 249 |
+
[2026-04-07 17:51:12] Validation | Batch 480/732 | Loss: 0.3379 | LM: 0.2875
|
| 250 |
+
[2026-04-07 17:51:13] Validation | Batch 490/732 | Loss: 0.3389 | LM: 0.2886
|
| 251 |
+
[2026-04-07 17:51:14] Validation | Batch 500/732 | Loss: 0.3400 | LM: 0.2896
|
| 252 |
+
[2026-04-07 17:51:15] Validation | Batch 510/732 | Loss: 0.3397 | LM: 0.2893
|
| 253 |
+
[2026-04-07 17:51:16] Validation | Batch 520/732 | Loss: 0.3395 | LM: 0.2891
|
| 254 |
+
[2026-04-07 17:51:17] Validation | Batch 530/732 | Loss: 0.3389 | LM: 0.2885
|
| 255 |
+
[2026-04-07 17:51:19] Validation | Batch 540/732 | Loss: 0.3390 | LM: 0.2887
|
| 256 |
+
[2026-04-07 17:51:20] Validation | Batch 550/732 | Loss: 0.3390 | LM: 0.2886
|
| 257 |
+
[2026-04-07 17:51:21] Validation | Batch 560/732 | Loss: 0.3385 | LM: 0.2881
|
| 258 |
+
[2026-04-07 17:51:22] Validation | Batch 570/732 | Loss: 0.3386 | LM: 0.2882
|
| 259 |
+
[2026-04-07 17:51:23] Validation | Batch 580/732 | Loss: 0.3383 | LM: 0.2879
|
| 260 |
+
[2026-04-07 17:51:25] Validation | Batch 590/732 | Loss: 0.3383 | LM: 0.2879
|
| 261 |
+
[2026-04-07 17:51:26] Validation | Batch 600/732 | Loss: 0.3382 | LM: 0.2878
|
| 262 |
+
[2026-04-07 17:51:27] Validation | Batch 610/732 | Loss: 0.3388 | LM: 0.2884
|
| 263 |
+
[2026-04-07 17:51:28] Validation | Batch 620/732 | Loss: 0.3391 | LM: 0.2887
|
| 264 |
+
[2026-04-07 17:51:29] Validation | Batch 630/732 | Loss: 0.3389 | LM: 0.2885
|
| 265 |
+
[2026-04-07 17:51:31] Validation | Batch 640/732 | Loss: 0.3386 | LM: 0.2883
|
| 266 |
+
[2026-04-07 17:51:32] Validation | Batch 650/732 | Loss: 0.3385 | LM: 0.2881
|
| 267 |
+
[2026-04-07 17:51:33] Validation | Batch 660/732 | Loss: 0.3390 | LM: 0.2886
|
| 268 |
+
[2026-04-07 17:51:34] Validation | Batch 670/732 | Loss: 0.3396 | LM: 0.2892
|
| 269 |
+
[2026-04-07 17:51:35] Validation | Batch 680/732 | Loss: 0.3395 | LM: 0.2891
|
| 270 |
+
[2026-04-07 17:51:36] Validation | Batch 690/732 | Loss: 0.3397 | LM: 0.2893
|
| 271 |
+
[2026-04-07 17:51:37] Validation | Batch 700/732 | Loss: 0.3402 | LM: 0.2898
|
| 272 |
+
[2026-04-07 17:51:39] Validation | Batch 710/732 | Loss: 0.3405 | LM: 0.2901
|
| 273 |
+
[2026-04-07 17:51:40] Validation | Batch 720/732 | Loss: 0.3415 | LM: 0.2911
|
| 274 |
+
[2026-04-07 17:51:41] Validation | Batch 730/732 | Loss: 0.3412 | LM: 0.2908
|
| 275 |
+
[2026-04-07 17:51:41] Validation | Batch 732/732 | Loss: 0.3410 | LM: 0.2906
|
| 276 |
+
[2026-04-07 17:51:41] Validation | Loss: 0.3410 | LM: 0.2906 | PPL: 1.34 | Time: 84.74s
|
| 277 |
+
[2026-04-07 17:51:44] New best model saved! Val loss: 0.3410
|
| 278 |
+
[2026-04-07 17:51:54] Epoch 1 | Step 510 | Loss: 0.3557 | LM: 0.3003 | LB: 1.0107 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-05
|
| 279 |
+
[2026-04-07 17:52:03] Epoch 1 | Step 520 | Loss: 0.3554 | LM: 0.2993 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
|
| 280 |
+
[2026-04-07 17:52:13] Epoch 1 | Step 530 | Loss: 0.3550 | LM: 0.2984 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
|
| 281 |
+
[2026-04-07 17:52:22] Epoch 1 | Step 540 | Loss: 0.3547 | LM: 0.2982 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
|
| 282 |
+
[2026-04-07 17:52:31] Epoch 1 | Step 550 | Loss: 0.3544 | LM: 0.2987 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
|
| 283 |
+
[2026-04-07 17:52:40] Epoch 1 | Step 560 | Loss: 0.3546 | LM: 0.2995 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
|
| 284 |
+
[2026-04-07 17:52:49] Epoch 1 | Step 570 | Loss: 0.3547 | LM: 0.3000 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
|
| 285 |
+
[2026-04-07 17:52:58] Epoch 1 | Step 580 | Loss: 0.3543 | LM: 0.2998 | LB: 1.0105 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.459/SR1: 0.438 | LR: 1.00e-05
|
| 286 |
+
[2026-04-07 17:53:07] Epoch 1 | Step 590 | Loss: 0.3548 | LM: 0.3011 | LB: 1.0105 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
|
| 287 |
+
[2026-04-07 17:53:17] Epoch 1 | Step 600 | Loss: 0.3544 | LM: 0.3003 | LB: 1.0105 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
|
| 288 |
+
[2026-04-07 17:53:26] Epoch 1 | Step 610 | Loss: 0.3539 | LM: 0.3000 | LB: 1.0104 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
|
| 289 |
+
[2026-04-07 17:53:35] Epoch 1 | Step 620 | Loss: 0.3537 | LM: 0.3001 | LB: 1.0104 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
|
| 290 |
+
[2026-04-07 17:53:44] Epoch 1 | Step 630 | Loss: 0.3532 | LM: 0.2999 | LB: 1.0103 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
|
| 291 |
+
[2026-04-07 17:53:54] Epoch 1 | Step 640 | Loss: 0.3529 | LM: 0.2999 | LB: 1.0103 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
|
| 292 |
+
[2026-04-07 17:54:03] Epoch 1 | Step 650 | Loss: 0.3531 | LM: 0.2994 | LB: 1.0103 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 293 |
+
[2026-04-07 17:54:12] Epoch 1 | Step 660 | Loss: 0.3526 | LM: 0.2991 | LB: 1.0102 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 294 |
+
[2026-04-07 17:54:21] Epoch 1 | Step 670 | Loss: 0.3533 | LM: 0.2996 | LB: 1.0102 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 295 |
+
[2026-04-07 17:54:31] Epoch 1 | Step 680 | Loss: 0.3530 | LM: 0.2999 | LB: 1.0102 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 296 |
+
[2026-04-07 17:54:40] Epoch 1 | Step 690 | Loss: 0.3531 | LM: 0.3004 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 297 |
+
[2026-04-07 17:54:49] Epoch 1 | Step 700 | Loss: 0.3532 | LM: 0.3005 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 298 |
+
[2026-04-07 17:54:58] Epoch 1 | Step 710 | Loss: 0.3530 | LM: 0.3000 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 299 |
+
[2026-04-07 17:55:07] Epoch 1 | Step 720 | Loss: 0.3527 | LM: 0.2999 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 300 |
+
[2026-04-07 17:55:16] Epoch 1 | Step 730 | Loss: 0.3523 | LM: 0.3005 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 301 |
+
[2026-04-07 17:55:26] Epoch 1 | Step 740 | Loss: 0.3521 | LM: 0.3007 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 302 |
+
[2026-04-07 17:55:35] Epoch 1 | Step 750 | Loss: 0.3519 | LM: 0.3003 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 303 |
+
[2026-04-07 17:55:36] Validation | Batch 10/732 | Loss: 0.3236 | LM: 0.2731
|
| 304 |
+
[2026-04-07 17:55:37] Validation | Batch 20/732 | Loss: 0.3450 | LM: 0.2946
|
| 305 |
+
[2026-04-07 17:55:38] Validation | Batch 30/732 | Loss: 0.3364 | LM: 0.2860
|
| 306 |
+
[2026-04-07 17:55:40] Validation | Batch 40/732 | Loss: 0.3408 | LM: 0.2904
|
| 307 |
+
[2026-04-07 17:55:41] Validation | Batch 50/732 | Loss: 0.3406 | LM: 0.2902
|
| 308 |
+
[2026-04-07 17:55:42] Validation | Batch 60/732 | Loss: 0.3426 | LM: 0.2923
|
| 309 |
+
[2026-04-07 17:55:43] Validation | Batch 70/732 | Loss: 0.3462 | LM: 0.2958
|
| 310 |
+
[2026-04-07 17:55:44] Validation | Batch 80/732 | Loss: 0.3447 | LM: 0.2943
|
| 311 |
+
[2026-04-07 17:55:46] Validation | Batch 90/732 | Loss: 0.3444 | LM: 0.2940
|
| 312 |
+
[2026-04-07 17:55:47] Validation | Batch 100/732 | Loss: 0.3455 | LM: 0.2951
|
| 313 |
+
[2026-04-07 17:55:48] Validation | Batch 110/732 | Loss: 0.3423 | LM: 0.2919
|
| 314 |
+
[2026-04-07 17:55:49] Validation | Batch 120/732 | Loss: 0.3455 | LM: 0.2951
|
| 315 |
+
[2026-04-07 17:55:50] Validation | Batch 130/732 | Loss: 0.3468 | LM: 0.2964
|
| 316 |
+
[2026-04-07 17:55:51] Validation | Batch 140/732 | Loss: 0.3463 | LM: 0.2959
|
| 317 |
+
[2026-04-07 17:55:52] Validation | Batch 150/732 | Loss: 0.3456 | LM: 0.2952
|
| 318 |
+
[2026-04-07 17:55:53] Validation | Batch 160/732 | Loss: 0.3447 | LM: 0.2944
|
| 319 |
+
[2026-04-07 17:55:54] Validation | Batch 170/732 | Loss: 0.3453 | LM: 0.2949
|
| 320 |
+
[2026-04-07 17:55:56] Validation | Batch 180/732 | Loss: 0.3465 | LM: 0.2961
|
| 321 |
+
[2026-04-07 17:55:57] Validation | Batch 190/732 | Loss: 0.3459 | LM: 0.2955
|
| 322 |
+
[2026-04-07 17:55:58] Validation | Batch 200/732 | Loss: 0.3459 | LM: 0.2956
|
| 323 |
+
[2026-04-07 17:55:59] Validation | Batch 210/732 | Loss: 0.3452 | LM: 0.2948
|
| 324 |
+
[2026-04-07 17:56:00] Validation | Batch 220/732 | Loss: 0.3447 | LM: 0.2943
|
| 325 |
+
[2026-04-07 17:56:02] Validation | Batch 230/732 | Loss: 0.3451 | LM: 0.2947
|
| 326 |
+
[2026-04-07 17:56:03] Validation | Batch 240/732 | Loss: 0.3448 | LM: 0.2944
|
| 327 |
+
[2026-04-07 17:56:04] Validation | Batch 250/732 | Loss: 0.3448 | LM: 0.2944
|
| 328 |
+
[2026-04-07 17:56:05] Validation | Batch 260/732 | Loss: 0.3438 | LM: 0.2934
|
| 329 |
+
[2026-04-07 17:56:06] Validation | Batch 270/732 | Loss: 0.3436 | LM: 0.2932
|
| 330 |
+
[2026-04-07 17:56:07] Validation | Batch 280/732 | Loss: 0.3425 | LM: 0.2921
|
| 331 |
+
[2026-04-07 17:56:09] Validation | Batch 290/732 | Loss: 0.3422 | LM: 0.2918
|
| 332 |
+
[2026-04-07 17:56:10] Validation | Batch 300/732 | Loss: 0.3422 | LM: 0.2918
|
| 333 |
+
[2026-04-07 17:56:11] Validation | Batch 310/732 | Loss: 0.3420 | LM: 0.2916
|
| 334 |
+
[2026-04-07 17:56:12] Validation | Batch 320/732 | Loss: 0.3411 | LM: 0.2908
|
| 335 |
+
[2026-04-07 17:56:13] Validation | Batch 330/732 | Loss: 0.3400 | LM: 0.2896
|
| 336 |
+
[2026-04-07 17:56:14] Validation | Batch 340/732 | Loss: 0.3394 | LM: 0.2891
|
| 337 |
+
[2026-04-07 17:56:15] Validation | Batch 350/732 | Loss: 0.3398 | LM: 0.2894
|
| 338 |
+
[2026-04-07 17:56:17] Validation | Batch 360/732 | Loss: 0.3406 | LM: 0.2902
|
| 339 |
+
[2026-04-07 17:56:17] Validation | Batch 370/732 | Loss: 0.3397 | LM: 0.2893
|
| 340 |
+
[2026-04-07 17:56:19] Validation | Batch 380/732 | Loss: 0.3390 | LM: 0.2886
|
| 341 |
+
[2026-04-07 17:56:19] Validation | Batch 390/732 | Loss: 0.3386 | LM: 0.2882
|
| 342 |
+
[2026-04-07 17:56:20] Validation | Batch 400/732 | Loss: 0.3384 | LM: 0.2881
|
| 343 |
+
[2026-04-07 17:56:22] Validation | Batch 410/732 | Loss: 0.3377 | LM: 0.2873
|
| 344 |
+
[2026-04-07 17:56:23] Validation | Batch 420/732 | Loss: 0.3379 | LM: 0.2876
|
| 345 |
+
[2026-04-07 17:56:24] Validation | Batch 430/732 | Loss: 0.3379 | LM: 0.2875
|
| 346 |
+
[2026-04-07 17:56:25] Validation | Batch 440/732 | Loss: 0.3374 | LM: 0.2870
|
| 347 |
+
[2026-04-07 17:56:26] Validation | Batch 450/732 | Loss: 0.3372 | LM: 0.2868
|
| 348 |
+
[2026-04-07 17:56:28] Validation | Batch 460/732 | Loss: 0.3376 | LM: 0.2872
|
| 349 |
+
[2026-04-07 17:56:29] Validation | Batch 470/732 | Loss: 0.3373 | LM: 0.2869
|
| 350 |
+
[2026-04-07 17:56:30] Validation | Batch 480/732 | Loss: 0.3375 | LM: 0.2871
|
| 351 |
+
[2026-04-07 17:56:31] Validation | Batch 490/732 | Loss: 0.3385 | LM: 0.2881
|
| 352 |
+
[2026-04-07 17:56:32] Validation | Batch 500/732 | Loss: 0.3396 | LM: 0.2892
|
| 353 |
+
[2026-04-07 17:56:33] Validation | Batch 510/732 | Loss: 0.3392 | LM: 0.2888
|
| 354 |
+
[2026-04-07 17:56:35] Validation | Batch 520/732 | Loss: 0.3390 | LM: 0.2886
|
| 355 |
+
[2026-04-07 17:56:36] Validation | Batch 530/732 | Loss: 0.3384 | LM: 0.2880
|
| 356 |
+
[2026-04-07 17:56:37] Validation | Batch 540/732 | Loss: 0.3386 | LM: 0.2882
|
| 357 |
+
[2026-04-07 17:56:38] Validation | Batch 550/732 | Loss: 0.3385 | LM: 0.2881
|
| 358 |
+
[2026-04-07 17:56:39] Validation | Batch 560/732 | Loss: 0.3380 | LM: 0.2877
|
| 359 |
+
[2026-04-07 17:56:40] Validation | Batch 570/732 | Loss: 0.3381 | LM: 0.2877
|
| 360 |
+
[2026-04-07 17:56:41] Validation | Batch 580/732 | Loss: 0.3378 | LM: 0.2874
|
| 361 |
+
[2026-04-07 17:56:43] Validation | Batch 590/732 | Loss: 0.3379 | LM: 0.2875
|
| 362 |
+
[2026-04-07 17:56:44] Validation | Batch 600/732 | Loss: 0.3378 | LM: 0.2874
|
| 363 |
+
[2026-04-07 17:56:45] Validation | Batch 610/732 | Loss: 0.3383 | LM: 0.2879
|
| 364 |
+
[2026-04-07 17:56:46] Validation | Batch 620/732 | Loss: 0.3387 | LM: 0.2883
|
| 365 |
+
[2026-04-07 17:56:48] Validation | Batch 630/732 | Loss: 0.3385 | LM: 0.2881
|
| 366 |
+
[2026-04-07 17:56:49] Validation | Batch 640/732 | Loss: 0.3382 | LM: 0.2878
|
| 367 |
+
[2026-04-07 17:56:50] Validation | Batch 650/732 | Loss: 0.3380 | LM: 0.2876
|
| 368 |
+
[2026-04-07 17:56:51] Validation | Batch 660/732 | Loss: 0.3385 | LM: 0.2881
|
| 369 |
+
[2026-04-07 17:56:52] Validation | Batch 670/732 | Loss: 0.3391 | LM: 0.2887
|
| 370 |
+
[2026-04-07 17:56:53] Validation | Batch 680/732 | Loss: 0.3391 | LM: 0.2887
|
| 371 |
+
[2026-04-07 17:56:54] Validation | Batch 690/732 | Loss: 0.3393 | LM: 0.2889
|
| 372 |
+
[2026-04-07 17:56:56] Validation | Batch 700/732 | Loss: 0.3397 | LM: 0.2893
|
| 373 |
+
[2026-04-07 17:56:57] Validation | Batch 710/732 | Loss: 0.3401 | LM: 0.2897
|
| 374 |
+
[2026-04-07 17:56:58] Validation | Batch 720/732 | Loss: 0.3411 | LM: 0.2907
|
| 375 |
+
[2026-04-07 17:56:59] Validation | Batch 730/732 | Loss: 0.3407 | LM: 0.2904
|
| 376 |
+
[2026-04-07 17:56:59] Validation | Batch 732/732 | Loss: 0.3406 | LM: 0.2902
|
| 377 |
+
[2026-04-07 17:56:59] Validation | Loss: 0.3406 | LM: 0.2902 | PPL: 1.34 | Time: 84.69s
|
| 378 |
+
[2026-04-07 17:57:02] New best model saved! Val loss: 0.3406
|
| 379 |
+
[2026-04-07 17:57:12] Epoch 1 | Step 760 | Loss: 0.3517 | LM: 0.2994 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 380 |
+
[2026-04-07 17:57:22] Epoch 1 | Step 770 | Loss: 0.3514 | LM: 0.2994 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 381 |
+
[2026-04-07 17:57:31] Epoch 1 | Step 780 | Loss: 0.3517 | LM: 0.3008 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
|
| 382 |
+
[2026-04-07 17:57:32] Reached max_steps=781, stopping training.
|
| 383 |
+
[2026-04-07 17:57:32] Epoch 1 completed in 1006.96s | Loss: 0.3516 | CL0: 2.8 | CL1: 2.2
|
| 384 |
+
[2026-04-07 17:57:32]
|
| 385 |
+
Training completed!
|
| 386 |
+
[2026-04-07 17:57:34] Final model: outputs/N_2.5/model_final.pt
|
routing_tuning_test_07_04/N_4.0/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
|
| 3 |
+
checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 4 |
+
training:
|
| 5 |
+
epochs: 1
|
| 6 |
+
max_steps: null
|
| 7 |
+
batch_size: 8
|
| 8 |
+
eval_batch_size: 24
|
| 9 |
+
gradient_accumulation_steps: 4
|
| 10 |
+
lr: 0.0001
|
| 11 |
+
weight_decay: 0.1
|
| 12 |
+
betas:
|
| 13 |
+
- 0.9
|
| 14 |
+
- 0.95
|
| 15 |
+
eps: 1.0e-08
|
| 16 |
+
lr_scheduler: wsd
|
| 17 |
+
warmup_ratio: 0.1
|
| 18 |
+
decay_ratio: 0.2
|
| 19 |
+
warmup_steps: 100
|
| 20 |
+
min_lr_ratio: 0.1
|
| 21 |
+
lr_multiplier:
|
| 22 |
+
- 2.0
|
| 23 |
+
- 1.5
|
| 24 |
+
- 1.0
|
| 25 |
+
load_balancing_weight: 0.05
|
| 26 |
+
load_balancing_N: 4.0
|
| 27 |
+
max_grad_norm: 1.0
|
| 28 |
+
use_amp: true
|
| 29 |
+
resume: false
|
| 30 |
+
resume_checkpoint: null
|
| 31 |
+
warmup_model: true
|
| 32 |
+
data:
|
| 33 |
+
path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
|
| 34 |
+
max_context_len: 4096
|
| 35 |
+
max_target_len: 256
|
| 36 |
+
num_workers: 0
|
| 37 |
+
pin_memory: true
|
| 38 |
+
max_train_samples: 50000
|
| 39 |
+
max_val_samples: null
|
| 40 |
+
logging:
|
| 41 |
+
log_interval: 10
|
| 42 |
+
save_interval: 1000
|
| 43 |
+
eval_interval: 250
|
| 44 |
+
save_every_epoch: false
|
| 45 |
+
model_only_checkpoints: true
|
| 46 |
+
tracking:
|
| 47 |
+
enabled: true
|
| 48 |
+
project: routing-evolution
|
| 49 |
+
run_name: routing_N4.0
|
| 50 |
+
paths:
|
| 51 |
+
output_dir: outputs/N_${training.load_balancing_N}
|
| 52 |
+
seed: 42
|
| 53 |
+
device: cuda
|
routing_tuning_test_07_04/N_4.0/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${paths.output_dir}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- training.load_balancing_N=4.0
|
| 116 |
+
- tracking.run_name=routing_N4.0
|
| 117 |
+
job:
|
| 118 |
+
name: train
|
| 119 |
+
chdir: false
|
| 120 |
+
override_dirname: tracking.run_name=routing_N4.0,training.load_balancing_N=4.0
|
| 121 |
+
id: ???
|
| 122 |
+
num: ???
|
| 123 |
+
config_name: config
|
| 124 |
+
env_set: {}
|
| 125 |
+
env_copy: []
|
| 126 |
+
config:
|
| 127 |
+
override_dirname:
|
| 128 |
+
kv_sep: '='
|
| 129 |
+
item_sep: ','
|
| 130 |
+
exclude_keys: []
|
| 131 |
+
runtime:
|
| 132 |
+
version: 1.3.2
|
| 133 |
+
version_base: '1.3'
|
| 134 |
+
cwd: /workspace/byte-llms-code/routing_evolution_exp
|
| 135 |
+
config_sources:
|
| 136 |
+
- path: hydra.conf
|
| 137 |
+
schema: pkg
|
| 138 |
+
provider: hydra
|
| 139 |
+
- path: /workspace/byte-llms-code/routing_evolution_exp/configs
|
| 140 |
+
schema: file
|
| 141 |
+
provider: main
|
| 142 |
+
- path: ''
|
| 143 |
+
schema: structured
|
| 144 |
+
provider: schema
|
| 145 |
+
output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_4.0
|
| 146 |
+
choices:
|
| 147 |
+
paths: default
|
| 148 |
+
tracking: default
|
| 149 |
+
logging: default
|
| 150 |
+
data: default
|
| 151 |
+
training: default
|
| 152 |
+
model: hnet_xl_code
|
| 153 |
+
hydra/env: default
|
| 154 |
+
hydra/callbacks: null
|
| 155 |
+
hydra/job_logging: default
|
| 156 |
+
hydra/hydra_logging: default
|
| 157 |
+
hydra/hydra_help: default
|
| 158 |
+
hydra/help: default
|
| 159 |
+
hydra/sweeper: basic
|
| 160 |
+
hydra/launcher: basic
|
| 161 |
+
hydra/output: default
|
| 162 |
+
verbose: false
|
routing_tuning_test_07_04/N_4.0/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- training.load_balancing_N=4.0
|
| 2 |
+
- tracking.run_name=routing_N4.0
|
routing_tuning_test_07_04/N_4.0/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d4e628f340f7ab262f369d2e6c937b10d82f77aabfd27ee70c40d8fd11b6e6a
|
| 3 |
+
size 3315165139
|
routing_tuning_test_07_04/N_4.0/model_final.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30da6a4faa77f36ff078f9eaa61546c90682ec9cc8c595fe235028d1c0794b38
|
| 3 |
+
size 3315165484
|
routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
|
| 3 |
+
size 13633736
|
routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_781.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06258594cce8b74a246104ac374f19d8258d68fcd8c776838309bf3024519e51
|
| 3 |
+
size 13633752
|
routing_tuning_test_07_04/N_4.0/train.log
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-04-07 17:57:52] CUDA_VISIBLE_DEVICES: 0,1
|
| 2 |
+
[2026-04-07 17:57:52] Number of processes: 2
|
| 3 |
+
[2026-04-07 17:57:52] Mixed precision: bf16
|
| 4 |
+
[2026-04-07 17:57:52] ============================================================
|
| 5 |
+
[2026-04-07 17:57:52] Routing Evolution Experiment | N=4.0
|
| 6 |
+
[2026-04-07 17:57:52] ============================================================
|
| 7 |
+
[2026-04-07 17:57:52] Config:
|
| 8 |
+
model:
|
| 9 |
+
config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
|
| 10 |
+
checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 11 |
+
training:
|
| 12 |
+
epochs: 1
|
| 13 |
+
max_steps: null
|
| 14 |
+
batch_size: 8
|
| 15 |
+
eval_batch_size: 24
|
| 16 |
+
gradient_accumulation_steps: 4
|
| 17 |
+
lr: 0.0001
|
| 18 |
+
weight_decay: 0.1
|
| 19 |
+
betas:
|
| 20 |
+
- 0.9
|
| 21 |
+
- 0.95
|
| 22 |
+
eps: 1.0e-08
|
| 23 |
+
lr_scheduler: wsd
|
| 24 |
+
warmup_ratio: 0.1
|
| 25 |
+
decay_ratio: 0.2
|
| 26 |
+
warmup_steps: 100
|
| 27 |
+
min_lr_ratio: 0.1
|
| 28 |
+
lr_multiplier:
|
| 29 |
+
- 2.0
|
| 30 |
+
- 1.5
|
| 31 |
+
- 1.0
|
| 32 |
+
load_balancing_weight: 0.05
|
| 33 |
+
load_balancing_N: 4.0
|
| 34 |
+
max_grad_norm: 1.0
|
| 35 |
+
use_amp: true
|
| 36 |
+
resume: false
|
| 37 |
+
resume_checkpoint: null
|
| 38 |
+
warmup_model: true
|
| 39 |
+
data:
|
| 40 |
+
path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
|
| 41 |
+
max_context_len: 4096
|
| 42 |
+
max_target_len: 256
|
| 43 |
+
num_workers: 0
|
| 44 |
+
pin_memory: true
|
| 45 |
+
max_train_samples: 50000
|
| 46 |
+
max_val_samples: null
|
| 47 |
+
logging:
|
| 48 |
+
log_interval: 10
|
| 49 |
+
save_interval: 1000
|
| 50 |
+
eval_interval: 250
|
| 51 |
+
save_every_epoch: false
|
| 52 |
+
model_only_checkpoints: true
|
| 53 |
+
tracking:
|
| 54 |
+
enabled: true
|
| 55 |
+
project: routing-evolution
|
| 56 |
+
run_name: routing_N4.0
|
| 57 |
+
paths:
|
| 58 |
+
output_dir: outputs/N_4.0
|
| 59 |
+
seed: 42
|
| 60 |
+
device: cuda
|
| 61 |
+
|
| 62 |
+
[2026-04-07 17:57:53] Loading model...
|
| 63 |
+
[2026-04-07 17:57:59] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 64 |
+
[2026-04-07 17:57:59] Applied LR multipliers: [2.0, 1.5, 1.0]
|
| 65 |
+
[2026-04-07 17:57:59] Warming up model...
|
| 66 |
+
[2026-04-07 17:58:45] Total params: 1,654,090,112
|
| 67 |
+
[2026-04-07 17:58:45] Trainable params: 1,654,090,112
|
| 68 |
+
[2026-04-07 17:58:45] Creating dataloaders...
|
| 69 |
+
[2026-04-07 17:58:45] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
|
| 70 |
+
[2026-04-07 17:58:45] Max steps: 781, Steps per epoch: 3125
|
| 71 |
+
[2026-04-07 17:58:47] Starting training...
|
| 72 |
+
[2026-04-07 17:58:47]
|
| 73 |
+
============================================================
|
| 74 |
+
[2026-04-07 17:58:47] EPOCH 1/1 (step 0)
|
| 75 |
+
[2026-04-07 17:58:47] ============================================================
|
| 76 |
+
[2026-04-07 17:59:19] Epoch 1 | Step 10 | Loss: 0.7035 | LM: 0.6494 | LB: 1.1662 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
|
| 77 |
+
[2026-04-07 17:59:28] Epoch 1 | Step 20 | Loss: 0.6032 | LM: 0.5509 | LB: 1.1644 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.459 | LR: 5.62e-05
|
| 78 |
+
[2026-04-07 17:59:38] Epoch 1 | Step 30 | Loss: 0.5447 | LM: 0.4787 | LB: 1.1629 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.362 | HR1: 0.479/SR1: 0.457 | LR: 7.92e-05
|
| 79 |
+
[2026-04-07 17:59:47] Epoch 1 | Step 40 | Loss: 0.5087 | LM: 0.4478 | LB: 1.1577 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.476/SR1: 0.456 | LR: 1.00e-04
|
| 80 |
+
[2026-04-07 17:59:56] Epoch 1 | Step 50 | Loss: 0.4861 | LM: 0.4097 | LB: 1.1533 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.472/SR1: 0.452 | LR: 1.00e-04
|
| 81 |
+
[2026-04-07 18:00:05] Epoch 1 | Step 60 | Loss: 0.4599 | LM: 0.3813 | LB: 1.1530 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.472/SR1: 0.452 | LR: 1.00e-04
|
| 82 |
+
[2026-04-07 18:00:14] Epoch 1 | Step 70 | Loss: 0.4455 | LM: 0.3774 | LB: 1.1525 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.358 | HR1: 0.471/SR1: 0.451 | LR: 1.00e-04
|
| 83 |
+
[2026-04-07 18:00:23] Epoch 1 | Step 80 | Loss: 0.4307 | LM: 0.3666 | LB: 1.1508 | CL0: 2.8 | CL1: 2.1 | HR0: 0.360/SR0: 0.358 | HR1: 0.469/SR1: 0.449 | LR: 1.00e-04
|
| 84 |
+
[2026-04-07 18:00:32] Epoch 1 | Step 90 | Loss: 0.4221 | LM: 0.3536 | LB: 1.1481 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.359 | HR1: 0.466/SR1: 0.446 | LR: 1.00e-04
|
| 85 |
+
[2026-04-07 18:00:41] Epoch 1 | Step 100 | Loss: 0.4171 | LM: 0.3510 | LB: 1.1457 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.466/SR1: 0.444 | LR: 1.00e-04
|
| 86 |
+
[2026-04-07 18:00:50] Epoch 1 | Step 110 | Loss: 0.4127 | LM: 0.3458 | LB: 1.1432 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.463/SR1: 0.442 | LR: 1.00e-04
|
| 87 |
+
[2026-04-07 18:00:59] Epoch 1 | Step 120 | Loss: 0.4102 | LM: 0.3473 | LB: 1.1407 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.461/SR1: 0.440 | LR: 1.00e-04
|
| 88 |
+
[2026-04-07 18:01:08] Epoch 1 | Step 130 | Loss: 0.4062 | LM: 0.3443 | LB: 1.1390 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-04
|
| 89 |
+
[2026-04-07 18:01:17] Epoch 1 | Step 140 | Loss: 0.4019 | LM: 0.3399 | LB: 1.1363 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-04
|
| 90 |
+
[2026-04-07 18:01:26] Epoch 1 | Step 150 | Loss: 0.3995 | LM: 0.3375 | LB: 1.1345 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.456/SR1: 0.435 | LR: 1.00e-04
|
| 91 |
+
[2026-04-07 18:01:35] Epoch 1 | Step 160 | Loss: 0.3952 | LM: 0.3321 | LB: 1.1326 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.354 | HR1: 0.455/SR1: 0.433 | LR: 1.00e-04
|
| 92 |
+
[2026-04-07 18:01:44] Epoch 1 | Step 170 | Loss: 0.3904 | LM: 0.3274 | LB: 1.1309 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.453/SR1: 0.431 | LR: 1.00e-04
|
| 93 |
+
[2026-04-07 18:01:53] Epoch 1 | Step 180 | Loss: 0.3868 | LM: 0.3233 | LB: 1.1297 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.452/SR1: 0.430 | LR: 1.00e-04
|
| 94 |
+
[2026-04-07 18:02:03] Epoch 1 | Step 190 | Loss: 0.3850 | LM: 0.3229 | LB: 1.1283 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.355 | HR1: 0.450/SR1: 0.428 | LR: 1.00e-04
|
| 95 |
+
[2026-04-07 18:02:13] Epoch 1 | Step 200 | Loss: 0.3824 | LM: 0.3201 | LB: 1.1267 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.449/SR1: 0.426 | LR: 1.00e-04
|
| 96 |
+
[2026-04-07 18:02:23] Epoch 1 | Step 210 | Loss: 0.3822 | LM: 0.3205 | LB: 1.1251 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.354 | HR1: 0.447/SR1: 0.425 | LR: 1.00e-04
|
| 97 |
+
[2026-04-07 18:02:33] Epoch 1 | Step 220 | Loss: 0.3813 | LM: 0.3185 | LB: 1.1242 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.354 | HR1: 0.447/SR1: 0.424 | LR: 1.00e-04
|
| 98 |
+
[2026-04-07 18:02:42] Epoch 1 | Step 230 | Loss: 0.3799 | LM: 0.3177 | LB: 1.1229 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.355 | HR1: 0.445/SR1: 0.422 | LR: 1.00e-04
|
| 99 |
+
[2026-04-07 18:02:51] Epoch 1 | Step 240 | Loss: 0.3778 | LM: 0.3188 | LB: 1.1218 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.444/SR1: 0.421 | LR: 1.00e-04
|
| 100 |
+
[2026-04-07 18:03:00] Epoch 1 | Step 250 | Loss: 0.3754 | LM: 0.3170 | LB: 1.1205 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.443/SR1: 0.420 | LR: 1.00e-04
|
| 101 |
+
[2026-04-07 18:03:01] Validation | Batch 10/732 | Loss: 0.3314 | LM: 0.2769
|
| 102 |
+
[2026-04-07 18:03:03] Validation | Batch 20/732 | Loss: 0.3530 | LM: 0.2984
|
| 103 |
+
[2026-04-07 18:03:04] Validation | Batch 30/732 | Loss: 0.3439 | LM: 0.2894
|
| 104 |
+
[2026-04-07 18:03:05] Validation | Batch 40/732 | Loss: 0.3490 | LM: 0.2945
|
| 105 |
+
[2026-04-07 18:03:06] Validation | Batch 50/732 | Loss: 0.3493 | LM: 0.2948
|
| 106 |
+
[2026-04-07 18:03:07] Validation | Batch 60/732 | Loss: 0.3518 | LM: 0.2973
|
| 107 |
+
[2026-04-07 18:03:08] Validation | Batch 70/732 | Loss: 0.3553 | LM: 0.3008
|
| 108 |
+
[2026-04-07 18:03:10] Validation | Batch 80/732 | Loss: 0.3535 | LM: 0.2991
|
| 109 |
+
[2026-04-07 18:03:11] Validation | Batch 90/732 | Loss: 0.3530 | LM: 0.2986
|
| 110 |
+
[2026-04-07 18:03:12] Validation | Batch 100/732 | Loss: 0.3540 | LM: 0.2995
|
| 111 |
+
[2026-04-07 18:03:13] Validation | Batch 110/732 | Loss: 0.3508 | LM: 0.2963
|
| 112 |
+
[2026-04-07 18:03:14] Validation | Batch 120/732 | Loss: 0.3540 | LM: 0.2996
|
| 113 |
+
[2026-04-07 18:03:15] Validation | Batch 130/732 | Loss: 0.3554 | LM: 0.3010
|
| 114 |
+
[2026-04-07 18:03:17] Validation | Batch 140/732 | Loss: 0.3548 | LM: 0.3004
|
| 115 |
+
[2026-04-07 18:03:18] Validation | Batch 150/732 | Loss: 0.3541 | LM: 0.2997
|
| 116 |
+
[2026-04-07 18:03:19] Validation | Batch 160/732 | Loss: 0.3532 | LM: 0.2988
|
| 117 |
+
[2026-04-07 18:03:20] Validation | Batch 170/732 | Loss: 0.3536 | LM: 0.2992
|
| 118 |
+
[2026-04-07 18:03:21] Validation | Batch 180/732 | Loss: 0.3549 | LM: 0.3004
|
| 119 |
+
[2026-04-07 18:03:22] Validation | Batch 190/732 | Loss: 0.3542 | LM: 0.2998
|
| 120 |
+
[2026-04-07 18:03:23] Validation | Batch 200/732 | Loss: 0.3543 | LM: 0.2998
|
| 121 |
+
[2026-04-07 18:03:24] Validation | Batch 210/732 | Loss: 0.3535 | LM: 0.2991
|
| 122 |
+
[2026-04-07 18:03:25] Validation | Batch 220/732 | Loss: 0.3530 | LM: 0.2986
|
| 123 |
+
[2026-04-07 18:03:27] Validation | Batch 230/732 | Loss: 0.3534 | LM: 0.2990
|
| 124 |
+
[2026-04-07 18:03:28] Validation | Batch 240/732 | Loss: 0.3531 | LM: 0.2987
|
| 125 |
+
[2026-04-07 18:03:29] Validation | Batch 250/732 | Loss: 0.3532 | LM: 0.2987
|
| 126 |
+
[2026-04-07 18:03:30] Validation | Batch 260/732 | Loss: 0.3522 | LM: 0.2977
|
| 127 |
+
[2026-04-07 18:03:31] Validation | Batch 270/732 | Loss: 0.3519 | LM: 0.2975
|
| 128 |
+
[2026-04-07 18:03:32] Validation | Batch 280/732 | Loss: 0.3508 | LM: 0.2963
|
| 129 |
+
[2026-04-07 18:03:33] Validation | Batch 290/732 | Loss: 0.3506 | LM: 0.2962
|
| 130 |
+
[2026-04-07 18:03:34] Validation | Batch 300/732 | Loss: 0.3505 | LM: 0.2960
|
| 131 |
+
[2026-04-07 18:03:36] Validation | Batch 310/732 | Loss: 0.3504 | LM: 0.2960
|
| 132 |
+
[2026-04-07 18:03:37] Validation | Batch 320/732 | Loss: 0.3495 | LM: 0.2950
|
| 133 |
+
[2026-04-07 18:03:38] Validation | Batch 330/732 | Loss: 0.3484 | LM: 0.2939
|
| 134 |
+
[2026-04-07 18:03:39] Validation | Batch 340/732 | Loss: 0.3478 | LM: 0.2933
|
| 135 |
+
[2026-04-07 18:03:40] Validation | Batch 350/732 | Loss: 0.3481 | LM: 0.2936
|
| 136 |
+
[2026-04-07 18:03:41] Validation | Batch 360/732 | Loss: 0.3489 | LM: 0.2944
|
| 137 |
+
[2026-04-07 18:03:42] Validation | Batch 370/732 | Loss: 0.3479 | LM: 0.2935
|
| 138 |
+
[2026-04-07 18:03:43] Validation | Batch 380/732 | Loss: 0.3473 | LM: 0.2928
|
| 139 |
+
[2026-04-07 18:03:44] Validation | Batch 390/732 | Loss: 0.3469 | LM: 0.2924
|
| 140 |
+
[2026-04-07 18:03:45] Validation | Batch 400/732 | Loss: 0.3468 | LM: 0.2923
|
| 141 |
+
[2026-04-07 18:03:46] Validation | Batch 410/732 | Loss: 0.3460 | LM: 0.2915
|
| 142 |
+
[2026-04-07 18:03:47] Validation | Batch 420/732 | Loss: 0.3462 | LM: 0.2917
|
| 143 |
+
[2026-04-07 18:03:48] Validation | Batch 430/732 | Loss: 0.3461 | LM: 0.2916
|
| 144 |
+
[2026-04-07 18:03:50] Validation | Batch 440/732 | Loss: 0.3456 | LM: 0.2911
|
| 145 |
+
[2026-04-07 18:03:51] Validation | Batch 450/732 | Loss: 0.3454 | LM: 0.2909
|
| 146 |
+
[2026-04-07 18:03:52] Validation | Batch 460/732 | Loss: 0.3458 | LM: 0.2913
|
| 147 |
+
[2026-04-07 18:03:53] Validation | Batch 470/732 | Loss: 0.3456 | LM: 0.2911
|
| 148 |
+
[2026-04-07 18:03:54] Validation | Batch 480/732 | Loss: 0.3457 | LM: 0.2913
|
| 149 |
+
[2026-04-07 18:03:55] Validation | Batch 490/732 | Loss: 0.3468 | LM: 0.2923
|
| 150 |
+
[2026-04-07 18:03:56] Validation | Batch 500/732 | Loss: 0.3479 | LM: 0.2934
|
| 151 |
+
[2026-04-07 18:03:57] Validation | Batch 510/732 | Loss: 0.3475 | LM: 0.2931
|
| 152 |
+
[2026-04-07 18:03:58] Validation | Batch 520/732 | Loss: 0.3473 | LM: 0.2928
|
| 153 |
+
[2026-04-07 18:04:00] Validation | Batch 530/732 | Loss: 0.3467 | LM: 0.2922
|
| 154 |
+
[2026-04-07 18:04:01] Validation | Batch 540/732 | Loss: 0.3468 | LM: 0.2924
|
| 155 |
+
[2026-04-07 18:04:02] Validation | Batch 550/732 | Loss: 0.3468 | LM: 0.2923
|
| 156 |
+
[2026-04-07 18:04:03] Validation | Batch 560/732 | Loss: 0.3463 | LM: 0.2919
|
| 157 |
+
[2026-04-07 18:04:04] Validation | Batch 570/732 | Loss: 0.3464 | LM: 0.2919
|
| 158 |
+
[2026-04-07 18:04:05] Validation | Batch 580/732 | Loss: 0.3461 | LM: 0.2917
|
| 159 |
+
[2026-04-07 18:04:07] Validation | Batch 590/732 | Loss: 0.3461 | LM: 0.2916
|
| 160 |
+
[2026-04-07 18:04:08] Validation | Batch 600/732 | Loss: 0.3461 | LM: 0.2916
|
| 161 |
+
[2026-04-07 18:04:09] Validation | Batch 610/732 | Loss: 0.3466 | LM: 0.2922
|
| 162 |
+
[2026-04-07 18:04:10] Validation | Batch 620/732 | Loss: 0.3470 | LM: 0.2925
|
| 163 |
+
[2026-04-07 18:04:11] Validation | Batch 630/732 | Loss: 0.3468 | LM: 0.2923
|
| 164 |
+
[2026-04-07 18:04:12] Validation | Batch 640/732 | Loss: 0.3466 | LM: 0.2921
|
| 165 |
+
[2026-04-07 18:04:14] Validation | Batch 650/732 | Loss: 0.3464 | LM: 0.2919
|
| 166 |
+
[2026-04-07 18:04:15] Validation | Batch 660/732 | Loss: 0.3469 | LM: 0.2924
|
| 167 |
+
[2026-04-07 18:04:16] Validation | Batch 670/732 | Loss: 0.3475 | LM: 0.2930
|
| 168 |
+
[2026-04-07 18:04:17] Validation | Batch 680/732 | Loss: 0.3474 | LM: 0.2930
|
| 169 |
+
[2026-04-07 18:04:18] Validation | Batch 690/732 | Loss: 0.3476 | LM: 0.2932
|
| 170 |
+
[2026-04-07 18:04:19] Validation | Batch 700/732 | Loss: 0.3481 | LM: 0.2936
|
| 171 |
+
[2026-04-07 18:04:20] Validation | Batch 710/732 | Loss: 0.3485 | LM: 0.2940
|
| 172 |
+
[2026-04-07 18:04:21] Validation | Batch 720/732 | Loss: 0.3495 | LM: 0.2950
|
| 173 |
+
[2026-04-07 18:04:22] Validation | Batch 730/732 | Loss: 0.3492 | LM: 0.2947
|
| 174 |
+
[2026-04-07 18:04:23] Validation | Batch 732/732 | Loss: 0.3490 | LM: 0.2945
|
| 175 |
+
[2026-04-07 18:04:23] Validation | Loss: 0.3490 | LM: 0.2945 | PPL: 1.34 | Time: 82.14s
|
| 176 |
+
[2026-04-07 18:04:25] New best model saved! Val loss: 0.3490
|
| 177 |
+
[2026-04-07 18:04:35] Epoch 1 | Step 260 | Loss: 0.3735 | LM: 0.3160 | LB: 1.1194 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.442/SR1: 0.419 | LR: 1.00e-04
|
| 178 |
+
[2026-04-07 18:04:44] Epoch 1 | Step 270 | Loss: 0.3735 | LM: 0.3143 | LB: 1.1184 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.441/SR1: 0.418 | LR: 1.00e-04
|
| 179 |
+
[2026-04-07 18:04:53] Epoch 1 | Step 280 | Loss: 0.3733 | LM: 0.3145 | LB: 1.1175 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.440/SR1: 0.417 | LR: 1.00e-04
|
| 180 |
+
[2026-04-07 18:05:02] Epoch 1 | Step 290 | Loss: 0.3716 | LM: 0.3125 | LB: 1.1164 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.439/SR1: 0.415 | LR: 1.00e-04
|
| 181 |
+
[2026-04-07 18:05:11] Epoch 1 | Step 300 | Loss: 0.3708 | LM: 0.3120 | LB: 1.1156 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.438/SR1: 0.414 | LR: 1.00e-04
|
| 182 |
+
[2026-04-07 18:05:21] Epoch 1 | Step 310 | Loss: 0.3699 | LM: 0.3110 | LB: 1.1149 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.355 | HR1: 0.437/SR1: 0.413 | LR: 1.00e-04
|
| 183 |
+
[2026-04-07 18:05:30] Epoch 1 | Step 320 | Loss: 0.3692 | LM: 0.3105 | LB: 1.1140 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.436/SR1: 0.412 | LR: 9.80e-05
|
| 184 |
+
[2026-04-07 18:05:39] Epoch 1 | Step 330 | Loss: 0.3680 | LM: 0.3096 | LB: 1.1130 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.435/SR1: 0.411 | LR: 8.93e-05
|
| 185 |
+
[2026-04-07 18:05:48] Epoch 1 | Step 340 | Loss: 0.3671 | LM: 0.3088 | LB: 1.1121 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.434/SR1: 0.410 | LR: 7.51e-05
|
| 186 |
+
[2026-04-07 18:05:57] Epoch 1 | Step 350 | Loss: 0.3666 | LM: 0.3093 | LB: 1.1113 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.433/SR1: 0.409 | LR: 5.77e-05
|
| 187 |
+
[2026-04-07 18:06:06] Epoch 1 | Step 360 | Loss: 0.3659 | LM: 0.3108 | LB: 1.1107 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.433/SR1: 0.408 | LR: 3.99e-05
|
| 188 |
+
[2026-04-07 18:06:15] Epoch 1 | Step 370 | Loss: 0.3655 | LM: 0.3104 | LB: 1.1102 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.432/SR1: 0.408 | LR: 2.45e-05
|
| 189 |
+
[2026-04-07 18:06:24] Epoch 1 | Step 380 | Loss: 0.3644 | LM: 0.3085 | LB: 1.1095 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.432/SR1: 0.407 | LR: 1.40e-05
|
| 190 |
+
[2026-04-07 18:06:33] Epoch 1 | Step 390 | Loss: 0.3646 | LM: 0.3082 | LB: 1.1088 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.431/SR1: 0.406 | LR: 1.00e-05
|
| 191 |
+
[2026-04-07 18:06:42] Epoch 1 | Step 400 | Loss: 0.3644 | LM: 0.3090 | LB: 1.1083 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.430/SR1: 0.406 | LR: 1.00e-05
|
| 192 |
+
[2026-04-07 18:06:51] Epoch 1 | Step 410 | Loss: 0.3643 | LM: 0.3088 | LB: 1.1077 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.430/SR1: 0.405 | LR: 1.00e-05
|
| 193 |
+
[2026-04-07 18:07:00] Epoch 1 | Step 420 | Loss: 0.3644 | LM: 0.3082 | LB: 1.1072 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.429/SR1: 0.405 | LR: 1.00e-05
|
| 194 |
+
[2026-04-07 18:07:09] Epoch 1 | Step 430 | Loss: 0.3638 | LM: 0.3066 | LB: 1.1067 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.429/SR1: 0.404 | LR: 1.00e-05
|
| 195 |
+
[2026-04-07 18:07:18] Epoch 1 | Step 440 | Loss: 0.3630 | LM: 0.3057 | LB: 1.1064 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.429/SR1: 0.404 | LR: 1.00e-05
|
| 196 |
+
[2026-04-07 18:07:28] Epoch 1 | Step 450 | Loss: 0.3624 | LM: 0.3044 | LB: 1.1058 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.428/SR1: 0.403 | LR: 1.00e-05
|
| 197 |
+
[2026-04-07 18:07:37] Epoch 1 | Step 460 | Loss: 0.3625 | LM: 0.3035 | LB: 1.1053 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.428/SR1: 0.402 | LR: 1.00e-05
|
| 198 |
+
[2026-04-07 18:07:45] Epoch 1 | Step 470 | Loss: 0.3623 | LM: 0.3043 | LB: 1.1050 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.427/SR1: 0.402 | LR: 1.00e-05
|
| 199 |
+
[2026-04-07 18:07:54] Epoch 1 | Step 480 | Loss: 0.3619 | LM: 0.3036 | LB: 1.1045 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.427/SR1: 0.401 | LR: 1.00e-05
|
| 200 |
+
[2026-04-07 18:08:04] Epoch 1 | Step 490 | Loss: 0.3612 | LM: 0.3024 | LB: 1.1041 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.426/SR1: 0.401 | LR: 1.00e-05
|
| 201 |
+
[2026-04-07 18:08:13] Epoch 1 | Step 500 | Loss: 0.3614 | LM: 0.3029 | LB: 1.1037 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.426/SR1: 0.400 | LR: 1.00e-05
|
| 202 |
+
[2026-04-07 18:08:13] Validation | Batch 10/732 | Loss: 0.3283 | LM: 0.2742
|
| 203 |
+
[2026-04-07 18:08:15] Validation | Batch 20/732 | Loss: 0.3500 | LM: 0.2958
|
| 204 |
+
[2026-04-07 18:08:16] Validation | Batch 30/732 | Loss: 0.3412 | LM: 0.2872
|
| 205 |
+
[2026-04-07 18:08:17] Validation | Batch 40/732 | Loss: 0.3462 | LM: 0.2921
|
| 206 |
+
[2026-04-07 18:08:18] Validation | Batch 50/732 | Loss: 0.3460 | LM: 0.2920
|
| 207 |
+
[2026-04-07 18:08:19] Validation | Batch 60/732 | Loss: 0.3480 | LM: 0.2940
|
| 208 |
+
[2026-04-07 18:08:20] Validation | Batch 70/732 | Loss: 0.3514 | LM: 0.2973
|
| 209 |
+
[2026-04-07 18:08:22] Validation | Batch 80/732 | Loss: 0.3497 | LM: 0.2957
|
| 210 |
+
[2026-04-07 18:08:23] Validation | Batch 90/732 | Loss: 0.3495 | LM: 0.2955
|
| 211 |
+
[2026-04-07 18:08:24] Validation | Batch 100/732 | Loss: 0.3505 | LM: 0.2965
|
| 212 |
+
[2026-04-07 18:08:25] Validation | Batch 110/732 | Loss: 0.3472 | LM: 0.2932
|
| 213 |
+
[2026-04-07 18:08:26] Validation | Batch 120/732 | Loss: 0.3504 | LM: 0.2965
|
| 214 |
+
[2026-04-07 18:08:27] Validation | Batch 130/732 | Loss: 0.3517 | LM: 0.2977
|
| 215 |
+
[2026-04-07 18:08:29] Validation | Batch 140/732 | Loss: 0.3511 | LM: 0.2971
|
| 216 |
+
[2026-04-07 18:08:30] Validation | Batch 150/732 | Loss: 0.3505 | LM: 0.2965
|
| 217 |
+
[2026-04-07 18:08:31] Validation | Batch 160/732 | Loss: 0.3497 | LM: 0.2957
|
| 218 |
+
[2026-04-07 18:08:32] Validation | Batch 170/732 | Loss: 0.3503 | LM: 0.2963
|
| 219 |
+
[2026-04-07 18:08:33] Validation | Batch 180/732 | Loss: 0.3516 | LM: 0.2976
|
| 220 |
+
[2026-04-07 18:08:34] Validation | Batch 190/732 | Loss: 0.3509 | LM: 0.2970
|
| 221 |
+
[2026-04-07 18:08:35] Validation | Batch 200/732 | Loss: 0.3510 | LM: 0.2970
|
| 222 |
+
[2026-04-07 18:08:36] Validation | Batch 210/732 | Loss: 0.3503 | LM: 0.2964
|
| 223 |
+
[2026-04-07 18:08:37] Validation | Batch 220/732 | Loss: 0.3499 | LM: 0.2959
|
| 224 |
+
[2026-04-07 18:08:38] Validation | Batch 230/732 | Loss: 0.3503 | LM: 0.2963
|
| 225 |
+
[2026-04-07 18:08:40] Validation | Batch 240/732 | Loss: 0.3501 | LM: 0.2961
|
| 226 |
+
[2026-04-07 18:08:41] Validation | Batch 250/732 | Loss: 0.3500 | LM: 0.2960
|
| 227 |
+
[2026-04-07 18:08:42] Validation | Batch 260/732 | Loss: 0.3490 | LM: 0.2950
|
| 228 |
+
[2026-04-07 18:08:43] Validation | Batch 270/732 | Loss: 0.3488 | LM: 0.2948
|
| 229 |
+
[2026-04-07 18:08:44] Validation | Batch 280/732 | Loss: 0.3478 | LM: 0.2938
|
| 230 |
+
[2026-04-07 18:08:45] Validation | Batch 290/732 | Loss: 0.3476 | LM: 0.2935
|
| 231 |
+
[2026-04-07 18:08:46] Validation | Batch 300/732 | Loss: 0.3475 | LM: 0.2935
|
| 232 |
+
[2026-04-07 18:08:47] Validation | Batch 310/732 | Loss: 0.3474 | LM: 0.2933
|
| 233 |
+
[2026-04-07 18:08:49] Validation | Batch 320/732 | Loss: 0.3465 | LM: 0.2924
|
| 234 |
+
[2026-04-07 18:08:50] Validation | Batch 330/732 | Loss: 0.3454 | LM: 0.2913
|
| 235 |
+
[2026-04-07 18:08:51] Validation | Batch 340/732 | Loss: 0.3448 | LM: 0.2907
|
| 236 |
+
[2026-04-07 18:08:52] Validation | Batch 350/732 | Loss: 0.3451 | LM: 0.2911
|
| 237 |
+
[2026-04-07 18:08:53] Validation | Batch 360/732 | Loss: 0.3459 | LM: 0.2919
|
| 238 |
+
[2026-04-07 18:08:54] Validation | Batch 370/732 | Loss: 0.3450 | LM: 0.2910
|
| 239 |
+
[2026-04-07 18:08:55] Validation | Batch 380/732 | Loss: 0.3443 | LM: 0.2903
|
| 240 |
+
[2026-04-07 18:08:56] Validation | Batch 390/732 | Loss: 0.3440 | LM: 0.2899
|
| 241 |
+
[2026-04-07 18:08:57] Validation | Batch 400/732 | Loss: 0.3438 | LM: 0.2897
|
| 242 |
+
[2026-04-07 18:08:58] Validation | Batch 410/732 | Loss: 0.3431 | LM: 0.2890
|
| 243 |
+
[2026-04-07 18:08:59] Validation | Batch 420/732 | Loss: 0.3433 | LM: 0.2892
|
| 244 |
+
[2026-04-07 18:09:00] Validation | Batch 430/732 | Loss: 0.3432 | LM: 0.2892
|
| 245 |
+
[2026-04-07 18:09:01] Validation | Batch 440/732 | Loss: 0.3427 | LM: 0.2887
|
| 246 |
+
[2026-04-07 18:09:02] Validation | Batch 450/732 | Loss: 0.3426 | LM: 0.2885
|
| 247 |
+
[2026-04-07 18:09:04] Validation | Batch 460/732 | Loss: 0.3429 | LM: 0.2889
|
| 248 |
+
[2026-04-07 18:09:05] Validation | Batch 470/732 | Loss: 0.3427 | LM: 0.2886
|
| 249 |
+
[2026-04-07 18:09:06] Validation | Batch 480/732 | Loss: 0.3428 | LM: 0.2888
|
| 250 |
+
[2026-04-07 18:09:07] Validation | Batch 490/732 | Loss: 0.3439 | LM: 0.2898
|
| 251 |
+
[2026-04-07 18:09:08] Validation | Batch 500/732 | Loss: 0.3449 | LM: 0.2909
|
| 252 |
+
[2026-04-07 18:09:09] Validation | Batch 510/732 | Loss: 0.3446 | LM: 0.2905
|
| 253 |
+
[2026-04-07 18:09:10] Validation | Batch 520/732 | Loss: 0.3444 | LM: 0.2904
|
| 254 |
+
[2026-04-07 18:09:11] Validation | Batch 530/732 | Loss: 0.3438 | LM: 0.2898
|
| 255 |
+
[2026-04-07 18:09:12] Validation | Batch 540/732 | Loss: 0.3440 | LM: 0.2899
|
| 256 |
+
[2026-04-07 18:09:13] Validation | Batch 550/732 | Loss: 0.3439 | LM: 0.2899
|
| 257 |
+
[2026-04-07 18:09:14] Validation | Batch 560/732 | Loss: 0.3434 | LM: 0.2894
|
| 258 |
+
[2026-04-07 18:09:16] Validation | Batch 570/732 | Loss: 0.3435 | LM: 0.2894
|
| 259 |
+
[2026-04-07 18:09:17] Validation | Batch 580/732 | Loss: 0.3432 | LM: 0.2891
|
| 260 |
+
[2026-04-07 18:09:18] Validation | Batch 590/732 | Loss: 0.3432 | LM: 0.2891
|
| 261 |
+
[2026-04-07 18:09:19] Validation | Batch 600/732 | Loss: 0.3431 | LM: 0.2891
|
| 262 |
+
[2026-04-07 18:09:21] Validation | Batch 610/732 | Loss: 0.3437 | LM: 0.2896
|
| 263 |
+
[2026-04-07 18:09:22] Validation | Batch 620/732 | Loss: 0.3440 | LM: 0.2900
|
| 264 |
+
[2026-04-07 18:09:23] Validation | Batch 630/732 | Loss: 0.3438 | LM: 0.2898
|
| 265 |
+
[2026-04-07 18:09:24] Validation | Batch 640/732 | Loss: 0.3435 | LM: 0.2895
|
| 266 |
+
[2026-04-07 18:09:25] Validation | Batch 650/732 | Loss: 0.3434 | LM: 0.2894
|
| 267 |
+
[2026-04-07 18:09:26] Validation | Batch 660/732 | Loss: 0.3439 | LM: 0.2899
|
| 268 |
+
[2026-04-07 18:09:27] Validation | Batch 670/732 | Loss: 0.3445 | LM: 0.2905
|
| 269 |
+
[2026-04-07 18:09:28] Validation | Batch 680/732 | Loss: 0.3445 | LM: 0.2904
|
| 270 |
+
[2026-04-07 18:09:29] Validation | Batch 690/732 | Loss: 0.3447 | LM: 0.2906
|
| 271 |
+
[2026-04-07 18:09:31] Validation | Batch 700/732 | Loss: 0.3452 | LM: 0.2911
|
| 272 |
+
[2026-04-07 18:09:32] Validation | Batch 710/732 | Loss: 0.3455 | LM: 0.2915
|
| 273 |
+
[2026-04-07 18:09:33] Validation | Batch 720/732 | Loss: 0.3465 | LM: 0.2925
|
| 274 |
+
[2026-04-07 18:09:34] Validation | Batch 730/732 | Loss: 0.3462 | LM: 0.2921
|
| 275 |
+
[2026-04-07 18:09:34] Validation | Batch 732/732 | Loss: 0.3460 | LM: 0.2920
|
| 276 |
+
[2026-04-07 18:09:34] Validation | Loss: 0.3460 | LM: 0.2920 | PPL: 1.34 | Time: 81.84s
|
| 277 |
+
[2026-04-07 18:09:37] New best model saved! Val loss: 0.3460
|
| 278 |
+
[2026-04-07 18:09:46] Epoch 1 | Step 510 | Loss: 0.3613 | LM: 0.3012 | LB: 1.1036 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.426/SR1: 0.400 | LR: 1.00e-05
|
| 279 |
+
[2026-04-07 18:09:56] Epoch 1 | Step 520 | Loss: 0.3609 | LM: 0.3002 | LB: 1.1030 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.425/SR1: 0.399 | LR: 1.00e-05
|
| 280 |
+
[2026-04-07 18:10:04] Epoch 1 | Step 530 | Loss: 0.3606 | LM: 0.2993 | LB: 1.1027 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.425/SR1: 0.399 | LR: 1.00e-05
|
| 281 |
+
[2026-04-07 18:10:13] Epoch 1 | Step 540 | Loss: 0.3603 | LM: 0.2991 | LB: 1.1022 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.425/SR1: 0.399 | LR: 1.00e-05
|
| 282 |
+
[2026-04-07 18:10:22] Epoch 1 | Step 550 | Loss: 0.3600 | LM: 0.2996 | LB: 1.1017 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-05
|
| 283 |
+
[2026-04-07 18:10:31] Epoch 1 | Step 560 | Loss: 0.3602 | LM: 0.3004 | LB: 1.1015 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-05
|
| 284 |
+
[2026-04-07 18:10:40] Epoch 1 | Step 570 | Loss: 0.3603 | LM: 0.3008 | LB: 1.1013 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-05
|
| 285 |
+
[2026-04-07 18:10:49] Epoch 1 | Step 580 | Loss: 0.3598 | LM: 0.3006 | LB: 1.1011 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.397 | LR: 1.00e-05
|
| 286 |
+
[2026-04-07 18:10:58] Epoch 1 | Step 590 | Loss: 0.3603 | LM: 0.3019 | LB: 1.1007 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.397 | LR: 1.00e-05
|
| 287 |
+
[2026-04-07 18:11:07] Epoch 1 | Step 600 | Loss: 0.3599 | LM: 0.3012 | LB: 1.1005 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.397 | LR: 1.00e-05
|
| 288 |
+
[2026-04-07 18:11:17] Epoch 1 | Step 610 | Loss: 0.3595 | LM: 0.3009 | LB: 1.1003 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.396 | LR: 1.00e-05
|
| 289 |
+
[2026-04-07 18:11:26] Epoch 1 | Step 620 | Loss: 0.3592 | LM: 0.3011 | LB: 1.1000 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.396 | LR: 1.00e-05
|
| 290 |
+
[2026-04-07 18:11:35] Epoch 1 | Step 630 | Loss: 0.3588 | LM: 0.3008 | LB: 1.0997 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.422/SR1: 0.396 | LR: 1.00e-05
|
| 291 |
+
[2026-04-07 18:11:44] Epoch 1 | Step 640 | Loss: 0.3584 | LM: 0.3009 | LB: 1.0995 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.422/SR1: 0.395 | LR: 1.00e-05
|
| 292 |
+
[2026-04-07 18:11:53] Epoch 1 | Step 650 | Loss: 0.3586 | LM: 0.3004 | LB: 1.0992 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.422/SR1: 0.395 | LR: 1.00e-05
|
| 293 |
+
[2026-04-07 18:12:02] Epoch 1 | Step 660 | Loss: 0.3582 | LM: 0.3001 | LB: 1.0991 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.421/SR1: 0.395 | LR: 1.00e-05
|
| 294 |
+
[2026-04-07 18:12:11] Epoch 1 | Step 670 | Loss: 0.3588 | LM: 0.3005 | LB: 1.0989 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
|
| 295 |
+
[2026-04-07 18:12:20] Epoch 1 | Step 680 | Loss: 0.3585 | LM: 0.3008 | LB: 1.0988 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
|
| 296 |
+
[2026-04-07 18:12:29] Epoch 1 | Step 690 | Loss: 0.3586 | LM: 0.3014 | LB: 1.0988 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
|
| 297 |
+
[2026-04-07 18:12:38] Epoch 1 | Step 700 | Loss: 0.3587 | LM: 0.3014 | LB: 1.0986 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
|
| 298 |
+
[2026-04-07 18:12:47] Epoch 1 | Step 710 | Loss: 0.3585 | LM: 0.3010 | LB: 1.0983 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
|
| 299 |
+
[2026-04-07 18:12:56] Epoch 1 | Step 720 | Loss: 0.3581 | LM: 0.3008 | LB: 1.0982 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
|
| 300 |
+
[2026-04-07 18:13:05] Epoch 1 | Step 730 | Loss: 0.3578 | LM: 0.3015 | LB: 1.0981 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
|
| 301 |
+
[2026-04-07 18:13:14] Epoch 1 | Step 740 | Loss: 0.3576 | LM: 0.3017 | LB: 1.0980 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
|
| 302 |
+
[2026-04-07 18:13:23] Epoch 1 | Step 750 | Loss: 0.3574 | LM: 0.3013 | LB: 1.0978 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
|
| 303 |
+
[2026-04-07 18:13:24] Validation | Batch 10/732 | Loss: 0.3281 | LM: 0.2740
|
| 304 |
+
[2026-04-07 18:13:25] Validation | Batch 20/732 | Loss: 0.3495 | LM: 0.2954
|
| 305 |
+
[2026-04-07 18:13:26] Validation | Batch 30/732 | Loss: 0.3407 | LM: 0.2866
|
| 306 |
+
[2026-04-07 18:13:27] Validation | Batch 40/732 | Loss: 0.3457 | LM: 0.2917
|
| 307 |
+
[2026-04-07 18:13:29] Validation | Batch 50/732 | Loss: 0.3455 | LM: 0.2915
|
| 308 |
+
[2026-04-07 18:13:30] Validation | Batch 60/732 | Loss: 0.3475 | LM: 0.2935
|
| 309 |
+
[2026-04-07 18:13:31] Validation | Batch 70/732 | Loss: 0.3510 | LM: 0.2969
|
| 310 |
+
[2026-04-07 18:13:32] Validation | Batch 80/732 | Loss: 0.3492 | LM: 0.2953
|
| 311 |
+
[2026-04-07 18:13:33] Validation | Batch 90/732 | Loss: 0.3491 | LM: 0.2951
|
| 312 |
+
[2026-04-07 18:13:34] Validation | Batch 100/732 | Loss: 0.3501 | LM: 0.2961
|
| 313 |
+
[2026-04-07 18:13:35] Validation | Batch 110/732 | Loss: 0.3468 | LM: 0.2928
|
| 314 |
+
[2026-04-07 18:13:37] Validation | Batch 120/732 | Loss: 0.3500 | LM: 0.2961
|
| 315 |
+
[2026-04-07 18:13:38] Validation | Batch 130/732 | Loss: 0.3513 | LM: 0.2973
|
| 316 |
+
[2026-04-07 18:13:39] Validation | Batch 140/732 | Loss: 0.3507 | LM: 0.2967
|
| 317 |
+
[2026-04-07 18:13:40] Validation | Batch 150/732 | Loss: 0.3500 | LM: 0.2961
|
| 318 |
+
[2026-04-07 18:13:41] Validation | Batch 160/732 | Loss: 0.3493 | LM: 0.2953
|
| 319 |
+
[2026-04-07 18:13:42] Validation | Batch 170/732 | Loss: 0.3498 | LM: 0.2959
|
| 320 |
+
[2026-04-07 18:13:43] Validation | Batch 180/732 | Loss: 0.3512 | LM: 0.2972
|
| 321 |
+
[2026-04-07 18:13:44] Validation | Batch 190/732 | Loss: 0.3505 | LM: 0.2965
|
| 322 |
+
[2026-04-07 18:13:45] Validation | Batch 200/732 | Loss: 0.3506 | LM: 0.2966
|
| 323 |
+
[2026-04-07 18:13:46] Validation | Batch 210/732 | Loss: 0.3499 | LM: 0.2959
|
| 324 |
+
[2026-04-07 18:13:47] Validation | Batch 220/732 | Loss: 0.3494 | LM: 0.2955
|
| 325 |
+
[2026-04-07 18:13:49] Validation | Batch 230/732 | Loss: 0.3499 | LM: 0.2959
|
| 326 |
+
[2026-04-07 18:13:50] Validation | Batch 240/732 | Loss: 0.3496 | LM: 0.2956
|
| 327 |
+
[2026-04-07 18:13:51] Validation | Batch 250/732 | Loss: 0.3496 | LM: 0.2956
|
| 328 |
+
[2026-04-07 18:13:52] Validation | Batch 260/732 | Loss: 0.3486 | LM: 0.2946
|
| 329 |
+
[2026-04-07 18:13:53] Validation | Batch 270/732 | Loss: 0.3484 | LM: 0.2944
|
| 330 |
+
[2026-04-07 18:13:54] Validation | Batch 280/732 | Loss: 0.3473 | LM: 0.2933
|
| 331 |
+
[2026-04-07 18:13:55] Validation | Batch 290/732 | Loss: 0.3471 | LM: 0.2931
|
| 332 |
+
[2026-04-07 18:13:56] Validation | Batch 300/732 | Loss: 0.3471 | LM: 0.2930
|
| 333 |
+
[2026-04-07 18:13:58] Validation | Batch 310/732 | Loss: 0.3469 | LM: 0.2929
|
| 334 |
+
[2026-04-07 18:13:59] Validation | Batch 320/732 | Loss: 0.3460 | LM: 0.2920
|
| 335 |
+
[2026-04-07 18:14:00] Validation | Batch 330/732 | Loss: 0.3449 | LM: 0.2909
|
| 336 |
+
[2026-04-07 18:14:01] Validation | Batch 340/732 | Loss: 0.3443 | LM: 0.2903
|
| 337 |
+
[2026-04-07 18:14:02] Validation | Batch 350/732 | Loss: 0.3446 | LM: 0.2906
|
| 338 |
+
[2026-04-07 18:14:03] Validation | Batch 360/732 | Loss: 0.3455 | LM: 0.2914
|
| 339 |
+
[2026-04-07 18:14:04] Validation | Batch 370/732 | Loss: 0.3445 | LM: 0.2905
|
| 340 |
+
[2026-04-07 18:14:05] Validation | Batch 380/732 | Loss: 0.3438 | LM: 0.2898
|
| 341 |
+
[2026-04-07 18:14:06] Validation | Batch 390/732 | Loss: 0.3435 | LM: 0.2895
|
| 342 |
+
[2026-04-07 18:14:07] Validation | Batch 400/732 | Loss: 0.3433 | LM: 0.2893
|
| 343 |
+
[2026-04-07 18:14:08] Validation | Batch 410/732 | Loss: 0.3426 | LM: 0.2886
|
| 344 |
+
[2026-04-07 18:14:09] Validation | Batch 420/732 | Loss: 0.3428 | LM: 0.2888
|
| 345 |
+
[2026-04-07 18:14:10] Validation | Batch 430/732 | Loss: 0.3427 | LM: 0.2887
|
| 346 |
+
[2026-04-07 18:14:12] Validation | Batch 440/732 | Loss: 0.3422 | LM: 0.2882
|
| 347 |
+
[2026-04-07 18:14:13] Validation | Batch 450/732 | Loss: 0.3421 | LM: 0.2881
|
| 348 |
+
[2026-04-07 18:14:14] Validation | Batch 460/732 | Loss: 0.3425 | LM: 0.2884
|
| 349 |
+
[2026-04-07 18:14:15] Validation | Batch 470/732 | Loss: 0.3422 | LM: 0.2882
|
| 350 |
+
[2026-04-07 18:14:16] Validation | Batch 480/732 | Loss: 0.3423 | LM: 0.2883
|
| 351 |
+
[2026-04-07 18:14:17] Validation | Batch 490/732 | Loss: 0.3434 | LM: 0.2894
|
| 352 |
+
[2026-04-07 18:14:18] Validation | Batch 500/732 | Loss: 0.3445 | LM: 0.2904
|
| 353 |
+
[2026-04-07 18:14:19] Validation | Batch 510/732 | Loss: 0.3441 | LM: 0.2901
|
| 354 |
+
[2026-04-07 18:14:20] Validation | Batch 520/732 | Loss: 0.3439 | LM: 0.2899
|
| 355 |
+
[2026-04-07 18:14:21] Validation | Batch 530/732 | Loss: 0.3433 | LM: 0.2893
|
| 356 |
+
[2026-04-07 18:14:23] Validation | Batch 540/732 | Loss: 0.3435 | LM: 0.2895
|
| 357 |
+
[2026-04-07 18:14:24] Validation | Batch 550/732 | Loss: 0.3435 | LM: 0.2894
|
| 358 |
+
[2026-04-07 18:14:25] Validation | Batch 560/732 | Loss: 0.3429 | LM: 0.2889
|
| 359 |
+
[2026-04-07 18:14:26] Validation | Batch 570/732 | Loss: 0.3430 | LM: 0.2890
|
| 360 |
+
[2026-04-07 18:14:27] Validation | Batch 580/732 | Loss: 0.3427 | LM: 0.2887
|
| 361 |
+
[2026-04-07 18:14:28] Validation | Batch 590/732 | Loss: 0.3427 | LM: 0.2887
|
| 362 |
+
[2026-04-07 18:14:30] Validation | Batch 600/732 | Loss: 0.3426 | LM: 0.2886
|
| 363 |
+
[2026-04-07 18:14:31] Validation | Batch 610/732 | Loss: 0.3432 | LM: 0.2892
|
| 364 |
+
[2026-04-07 18:14:32] Validation | Batch 620/732 | Loss: 0.3435 | LM: 0.2895
|
| 365 |
+
[2026-04-07 18:14:33] Validation | Batch 630/732 | Loss: 0.3433 | LM: 0.2893
|
| 366 |
+
[2026-04-07 18:14:34] Validation | Batch 640/732 | Loss: 0.3431 | LM: 0.2891
|
| 367 |
+
[2026-04-07 18:14:35] Validation | Batch 650/732 | Loss: 0.3429 | LM: 0.2889
|
| 368 |
+
[2026-04-07 18:14:36] Validation | Batch 660/732 | Loss: 0.3434 | LM: 0.2894
|
| 369 |
+
[2026-04-07 18:14:37] Validation | Batch 670/732 | Loss: 0.3440 | LM: 0.2900
|
| 370 |
+
[2026-04-07 18:14:38] Validation | Batch 680/732 | Loss: 0.3440 | LM: 0.2900
|
| 371 |
+
[2026-04-07 18:14:39] Validation | Batch 690/732 | Loss: 0.3442 | LM: 0.2902
|
| 372 |
+
[2026-04-07 18:14:41] Validation | Batch 700/732 | Loss: 0.3447 | LM: 0.2907
|
| 373 |
+
[2026-04-07 18:14:42] Validation | Batch 710/732 | Loss: 0.3451 | LM: 0.2910
|
| 374 |
+
[2026-04-07 18:14:43] Validation | Batch 720/732 | Loss: 0.3460 | LM: 0.2920
|
| 375 |
+
[2026-04-07 18:14:44] Validation | Batch 730/732 | Loss: 0.3457 | LM: 0.2917
|
| 376 |
+
[2026-04-07 18:14:44] Validation | Batch 732/732 | Loss: 0.3455 | LM: 0.2915
|
| 377 |
+
[2026-04-07 18:14:44] Validation | Loss: 0.3455 | LM: 0.2915 | PPL: 1.34 | Time: 81.42s
|
| 378 |
+
[2026-04-07 18:14:51] New best model saved! Val loss: 0.3455
|
| 379 |
+
[2026-04-07 18:15:00] Epoch 1 | Step 760 | Loss: 0.3571 | LM: 0.3005 | LB: 1.0976 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.392 | LR: 1.00e-05
|
| 380 |
+
[2026-04-07 18:15:09] Epoch 1 | Step 770 | Loss: 0.3569 | LM: 0.3004 | LB: 1.0976 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.420/SR1: 0.392 | LR: 1.00e-05
|
| 381 |
+
[2026-04-07 18:15:18] Epoch 1 | Step 780 | Loss: 0.3572 | LM: 0.3019 | LB: 1.0973 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.420/SR1: 0.392 | LR: 1.00e-05
|
| 382 |
+
[2026-04-07 18:15:19] Reached max_steps=781, stopping training.
|
| 383 |
+
[2026-04-07 18:15:19] Epoch 1 completed in 992.60s | Loss: 0.3570 | CL0: 2.8 | CL1: 2.4
|
| 384 |
+
[2026-04-07 18:15:19]
|
| 385 |
+
Training completed!
|
| 386 |
+
[2026-04-07 18:15:22] Final model: outputs/N_4.0/model_final.pt
|
routing_tuning_test_07_04/N_6.0/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
|
| 3 |
+
checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 4 |
+
training:
|
| 5 |
+
epochs: 1
|
| 6 |
+
max_steps: null
|
| 7 |
+
batch_size: 8
|
| 8 |
+
eval_batch_size: 24
|
| 9 |
+
gradient_accumulation_steps: 4
|
| 10 |
+
lr: 0.0001
|
| 11 |
+
weight_decay: 0.1
|
| 12 |
+
betas:
|
| 13 |
+
- 0.9
|
| 14 |
+
- 0.95
|
| 15 |
+
eps: 1.0e-08
|
| 16 |
+
lr_scheduler: wsd
|
| 17 |
+
warmup_ratio: 0.1
|
| 18 |
+
decay_ratio: 0.2
|
| 19 |
+
warmup_steps: 100
|
| 20 |
+
min_lr_ratio: 0.1
|
| 21 |
+
lr_multiplier:
|
| 22 |
+
- 2.0
|
| 23 |
+
- 1.5
|
| 24 |
+
- 1.0
|
| 25 |
+
load_balancing_weight: 0.05
|
| 26 |
+
load_balancing_N: 6.0
|
| 27 |
+
max_grad_norm: 1.0
|
| 28 |
+
use_amp: true
|
| 29 |
+
resume: false
|
| 30 |
+
resume_checkpoint: null
|
| 31 |
+
warmup_model: true
|
| 32 |
+
data:
|
| 33 |
+
path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
|
| 34 |
+
max_context_len: 4096
|
| 35 |
+
max_target_len: 256
|
| 36 |
+
num_workers: 0
|
| 37 |
+
pin_memory: true
|
| 38 |
+
max_train_samples: 50000
|
| 39 |
+
max_val_samples: null
|
| 40 |
+
logging:
|
| 41 |
+
log_interval: 10
|
| 42 |
+
save_interval: 1000
|
| 43 |
+
eval_interval: 250
|
| 44 |
+
save_every_epoch: false
|
| 45 |
+
model_only_checkpoints: true
|
| 46 |
+
tracking:
|
| 47 |
+
enabled: true
|
| 48 |
+
project: routing-evolution
|
| 49 |
+
run_name: routing_N6.0
|
| 50 |
+
paths:
|
| 51 |
+
output_dir: outputs/N_${training.load_balancing_N}
|
| 52 |
+
seed: 42
|
| 53 |
+
device: cuda
|
routing_tuning_test_07_04/N_6.0/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${paths.output_dir}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- training.load_balancing_N=6.0
|
| 116 |
+
- tracking.run_name=routing_N6.0
|
| 117 |
+
job:
|
| 118 |
+
name: train
|
| 119 |
+
chdir: false
|
| 120 |
+
override_dirname: tracking.run_name=routing_N6.0,training.load_balancing_N=6.0
|
| 121 |
+
id: ???
|
| 122 |
+
num: ???
|
| 123 |
+
config_name: config
|
| 124 |
+
env_set: {}
|
| 125 |
+
env_copy: []
|
| 126 |
+
config:
|
| 127 |
+
override_dirname:
|
| 128 |
+
kv_sep: '='
|
| 129 |
+
item_sep: ','
|
| 130 |
+
exclude_keys: []
|
| 131 |
+
runtime:
|
| 132 |
+
version: 1.3.2
|
| 133 |
+
version_base: '1.3'
|
| 134 |
+
cwd: /workspace/byte-llms-code/routing_evolution_exp
|
| 135 |
+
config_sources:
|
| 136 |
+
- path: hydra.conf
|
| 137 |
+
schema: pkg
|
| 138 |
+
provider: hydra
|
| 139 |
+
- path: /workspace/byte-llms-code/routing_evolution_exp/configs
|
| 140 |
+
schema: file
|
| 141 |
+
provider: main
|
| 142 |
+
- path: ''
|
| 143 |
+
schema: structured
|
| 144 |
+
provider: schema
|
| 145 |
+
output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_6.0
|
| 146 |
+
choices:
|
| 147 |
+
paths: default
|
| 148 |
+
tracking: default
|
| 149 |
+
logging: default
|
| 150 |
+
data: default
|
| 151 |
+
training: default
|
| 152 |
+
model: hnet_xl_code
|
| 153 |
+
hydra/env: default
|
| 154 |
+
hydra/callbacks: null
|
| 155 |
+
hydra/job_logging: default
|
| 156 |
+
hydra/hydra_logging: default
|
| 157 |
+
hydra/hydra_help: default
|
| 158 |
+
hydra/help: default
|
| 159 |
+
hydra/sweeper: basic
|
| 160 |
+
hydra/launcher: basic
|
| 161 |
+
hydra/output: default
|
| 162 |
+
verbose: false
|
routing_tuning_test_07_04/N_6.0/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- training.load_balancing_N=6.0
|
| 2 |
+
- tracking.run_name=routing_N6.0
|
routing_tuning_test_07_04/N_6.0/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:064b89f7fd706b1f705261db40d657000945a625a7099ed3a27ff2992de07de4
|
| 3 |
+
size 3315165139
|
routing_tuning_test_07_04/N_6.0/model_final.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4e1dfd3b4f217116471843f4e33c225ad9497993eaa6c199237d26f00a77eeb
|
| 3 |
+
size 3315165484
|
routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
|
| 3 |
+
size 13633736
|
routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_781.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fef4e624ccaafb1e4e287dfcc9afdf773ba6a0833c95fafe489ad0cbe32ee7ce
|
| 3 |
+
size 13633752
|
routing_tuning_test_07_04/N_6.0/train.log
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-04-07 18:15:40] CUDA_VISIBLE_DEVICES: 0,1
|
| 2 |
+
[2026-04-07 18:15:40] Number of processes: 2
|
| 3 |
+
[2026-04-07 18:15:40] Mixed precision: bf16
|
| 4 |
+
[2026-04-07 18:15:40] ============================================================
|
| 5 |
+
[2026-04-07 18:15:40] Routing Evolution Experiment | N=6.0
|
| 6 |
+
[2026-04-07 18:15:40] ============================================================
|
| 7 |
+
[2026-04-07 18:15:40] Config:
|
| 8 |
+
model:
|
| 9 |
+
config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
|
| 10 |
+
checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 11 |
+
training:
|
| 12 |
+
epochs: 1
|
| 13 |
+
max_steps: null
|
| 14 |
+
batch_size: 8
|
| 15 |
+
eval_batch_size: 24
|
| 16 |
+
gradient_accumulation_steps: 4
|
| 17 |
+
lr: 0.0001
|
| 18 |
+
weight_decay: 0.1
|
| 19 |
+
betas:
|
| 20 |
+
- 0.9
|
| 21 |
+
- 0.95
|
| 22 |
+
eps: 1.0e-08
|
| 23 |
+
lr_scheduler: wsd
|
| 24 |
+
warmup_ratio: 0.1
|
| 25 |
+
decay_ratio: 0.2
|
| 26 |
+
warmup_steps: 100
|
| 27 |
+
min_lr_ratio: 0.1
|
| 28 |
+
lr_multiplier:
|
| 29 |
+
- 2.0
|
| 30 |
+
- 1.5
|
| 31 |
+
- 1.0
|
| 32 |
+
load_balancing_weight: 0.05
|
| 33 |
+
load_balancing_N: 6.0
|
| 34 |
+
max_grad_norm: 1.0
|
| 35 |
+
use_amp: true
|
| 36 |
+
resume: false
|
| 37 |
+
resume_checkpoint: null
|
| 38 |
+
warmup_model: true
|
| 39 |
+
data:
|
| 40 |
+
path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
|
| 41 |
+
max_context_len: 4096
|
| 42 |
+
max_target_len: 256
|
| 43 |
+
num_workers: 0
|
| 44 |
+
pin_memory: true
|
| 45 |
+
max_train_samples: 50000
|
| 46 |
+
max_val_samples: null
|
| 47 |
+
logging:
|
| 48 |
+
log_interval: 10
|
| 49 |
+
save_interval: 1000
|
| 50 |
+
eval_interval: 250
|
| 51 |
+
save_every_epoch: false
|
| 52 |
+
model_only_checkpoints: true
|
| 53 |
+
tracking:
|
| 54 |
+
enabled: true
|
| 55 |
+
project: routing-evolution
|
| 56 |
+
run_name: routing_N6.0
|
| 57 |
+
paths:
|
| 58 |
+
output_dir: outputs/N_6.0
|
| 59 |
+
seed: 42
|
| 60 |
+
device: cuda
|
| 61 |
+
|
| 62 |
+
[2026-04-07 18:15:40] Loading model...
|
| 63 |
+
[2026-04-07 18:15:46] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 64 |
+
[2026-04-07 18:15:46] Applied LR multipliers: [2.0, 1.5, 1.0]
|
| 65 |
+
[2026-04-07 18:15:46] Warming up model...
|
| 66 |
+
[2026-04-07 18:16:31] Total params: 1,654,090,112
|
| 67 |
+
[2026-04-07 18:16:31] Trainable params: 1,654,090,112
|
| 68 |
+
[2026-04-07 18:16:31] Creating dataloaders...
|
| 69 |
+
[2026-04-07 18:16:31] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
|
| 70 |
+
[2026-04-07 18:16:31] Max steps: 781, Steps per epoch: 3125
|
| 71 |
+
[2026-04-07 18:16:33] Starting training...
|
| 72 |
+
[2026-04-07 18:16:33]
|
| 73 |
+
============================================================
|
| 74 |
+
[2026-04-07 18:16:33] EPOCH 1/1 (step 0)
|
| 75 |
+
[2026-04-07 18:16:33] ============================================================
|
| 76 |
+
[2026-04-07 18:17:07] Epoch 1 | Step 10 | Loss: 0.7189 | LM: 0.6496 | LB: 1.4740 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
|
| 77 |
+
[2026-04-07 18:17:16] Epoch 1 | Step 20 | Loss: 0.6183 | LM: 0.5507 | LB: 1.4702 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.458 | LR: 5.62e-05
|
| 78 |
+
[2026-04-07 18:17:26] Epoch 1 | Step 30 | Loss: 0.5597 | LM: 0.4786 | LB: 1.4658 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.478/SR1: 0.456 | LR: 7.92e-05
|
| 79 |
+
[2026-04-07 18:17:35] Epoch 1 | Step 40 | Loss: 0.5234 | LM: 0.4475 | LB: 1.4524 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.474/SR1: 0.454 | LR: 1.00e-04
|
| 80 |
+
[2026-04-07 18:17:44] Epoch 1 | Step 50 | Loss: 0.5007 | LM: 0.4094 | LB: 1.4414 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.469/SR1: 0.449 | LR: 1.00e-04
|
| 81 |
+
[2026-04-07 18:17:53] Epoch 1 | Step 60 | Loss: 0.4742 | LM: 0.3811 | LB: 1.4383 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.468/SR1: 0.448 | LR: 1.00e-04
|
| 82 |
+
[2026-04-07 18:18:02] Epoch 1 | Step 70 | Loss: 0.4596 | LM: 0.3773 | LB: 1.4349 | CL0: 2.8 | CL1: 2.2 | HR0: 0.359/SR0: 0.358 | HR1: 0.465/SR1: 0.445 | LR: 1.00e-04
|
| 83 |
+
[2026-04-07 18:18:11] Epoch 1 | Step 80 | Loss: 0.4446 | LM: 0.3666 | LB: 1.4292 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.358 | HR1: 0.462/SR1: 0.441 | LR: 1.00e-04
|
| 84 |
+
[2026-04-07 18:18:20] Epoch 1 | Step 90 | Loss: 0.4359 | LM: 0.3536 | LB: 1.4221 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.358 | HR1: 0.458/SR1: 0.437 | LR: 1.00e-04
|
| 85 |
+
[2026-04-07 18:18:29] Epoch 1 | Step 100 | Loss: 0.4306 | LM: 0.3510 | LB: 1.4150 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.356 | HR1: 0.456/SR1: 0.435 | LR: 1.00e-04
|
| 86 |
+
[2026-04-07 18:18:39] Epoch 1 | Step 110 | Loss: 0.4261 | LM: 0.3459 | LB: 1.4085 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.356 | HR1: 0.453/SR1: 0.431 | LR: 1.00e-04
|
| 87 |
+
[2026-04-07 18:18:48] Epoch 1 | Step 120 | Loss: 0.4234 | LM: 0.3474 | LB: 1.4018 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.355 | HR1: 0.451/SR1: 0.428 | LR: 1.00e-04
|
| 88 |
+
[2026-04-07 18:18:57] Epoch 1 | Step 130 | Loss: 0.4192 | LM: 0.3443 | LB: 1.3968 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.449/SR1: 0.426 | LR: 1.00e-04
|
| 89 |
+
[2026-04-07 18:19:06] Epoch 1 | Step 140 | Loss: 0.4148 | LM: 0.3400 | LB: 1.3902 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.445/SR1: 0.423 | LR: 1.00e-04
|
| 90 |
+
[2026-04-07 18:19:15] Epoch 1 | Step 150 | Loss: 0.4123 | LM: 0.3377 | LB: 1.3854 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.443/SR1: 0.420 | LR: 1.00e-04
|
| 91 |
+
[2026-04-07 18:19:24] Epoch 1 | Step 160 | Loss: 0.4078 | LM: 0.3323 | LB: 1.3805 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.441/SR1: 0.418 | LR: 1.00e-04
|
| 92 |
+
[2026-04-07 18:19:33] Epoch 1 | Step 170 | Loss: 0.4029 | LM: 0.3276 | LB: 1.3761 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.439/SR1: 0.415 | LR: 1.00e-04
|
| 93 |
+
[2026-04-07 18:19:42] Epoch 1 | Step 180 | Loss: 0.3992 | LM: 0.3236 | LB: 1.3724 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.353 | HR1: 0.437/SR1: 0.413 | LR: 1.00e-04
|
| 94 |
+
[2026-04-07 18:19:51] Epoch 1 | Step 190 | Loss: 0.3974 | LM: 0.3234 | LB: 1.3686 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.434/SR1: 0.410 | LR: 1.00e-04
|
| 95 |
+
[2026-04-07 18:20:00] Epoch 1 | Step 200 | Loss: 0.3948 | LM: 0.3207 | LB: 1.3641 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.432/SR1: 0.408 | LR: 1.00e-04
|
| 96 |
+
[2026-04-07 18:20:09] Epoch 1 | Step 210 | Loss: 0.3945 | LM: 0.3213 | LB: 1.3598 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.430/SR1: 0.406 | LR: 1.00e-04
|
| 97 |
+
[2026-04-07 18:20:18] Epoch 1 | Step 220 | Loss: 0.3935 | LM: 0.3194 | LB: 1.3568 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.429/SR1: 0.404 | LR: 1.00e-04
|
| 98 |
+
[2026-04-07 18:20:27] Epoch 1 | Step 230 | Loss: 0.3920 | LM: 0.3186 | LB: 1.3533 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.353 | HR1: 0.427/SR1: 0.402 | LR: 1.00e-04
|
| 99 |
+
[2026-04-07 18:20:36] Epoch 1 | Step 240 | Loss: 0.3898 | LM: 0.3196 | LB: 1.3500 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.425/SR1: 0.400 | LR: 1.00e-04
|
| 100 |
+
[2026-04-07 18:20:45] Epoch 1 | Step 250 | Loss: 0.3874 | LM: 0.3177 | LB: 1.3465 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-04
|
| 101 |
+
[2026-04-07 18:20:46] Validation | Batch 10/732 | Loss: 0.3396 | LM: 0.2766
|
| 102 |
+
[2026-04-07 18:20:47] Validation | Batch 20/732 | Loss: 0.3626 | LM: 0.2992
|
| 103 |
+
[2026-04-07 18:20:48] Validation | Batch 30/732 | Loss: 0.3535 | LM: 0.2904
|
| 104 |
+
[2026-04-07 18:20:49] Validation | Batch 40/732 | Loss: 0.3589 | LM: 0.2958
|
| 105 |
+
[2026-04-07 18:20:51] Validation | Batch 50/732 | Loss: 0.3593 | LM: 0.2962
|
| 106 |
+
[2026-04-07 18:20:52] Validation | Batch 60/732 | Loss: 0.3618 | LM: 0.2987
|
| 107 |
+
[2026-04-07 18:20:53] Validation | Batch 70/732 | Loss: 0.3653 | LM: 0.3022
|
| 108 |
+
[2026-04-07 18:20:54] Validation | Batch 80/732 | Loss: 0.3634 | LM: 0.3004
|
| 109 |
+
[2026-04-07 18:20:55] Validation | Batch 90/732 | Loss: 0.3630 | LM: 0.3000
|
| 110 |
+
[2026-04-07 18:20:56] Validation | Batch 100/732 | Loss: 0.3639 | LM: 0.3009
|
| 111 |
+
[2026-04-07 18:20:57] Validation | Batch 110/732 | Loss: 0.3607 | LM: 0.2976
|
| 112 |
+
[2026-04-07 18:20:58] Validation | Batch 120/732 | Loss: 0.3640 | LM: 0.3010
|
| 113 |
+
[2026-04-07 18:20:59] Validation | Batch 130/732 | Loss: 0.3653 | LM: 0.3024
|
| 114 |
+
[2026-04-07 18:21:00] Validation | Batch 140/732 | Loss: 0.3647 | LM: 0.3017
|
| 115 |
+
[2026-04-07 18:21:02] Validation | Batch 150/732 | Loss: 0.3641 | LM: 0.3010
|
| 116 |
+
[2026-04-07 18:21:03] Validation | Batch 160/732 | Loss: 0.3631 | LM: 0.3001
|
| 117 |
+
[2026-04-07 18:21:03] Validation | Batch 170/732 | Loss: 0.3635 | LM: 0.3005
|
| 118 |
+
[2026-04-07 18:21:05] Validation | Batch 180/732 | Loss: 0.3649 | LM: 0.3019
|
| 119 |
+
[2026-04-07 18:21:06] Validation | Batch 190/732 | Loss: 0.3642 | LM: 0.3012
|
| 120 |
+
[2026-04-07 18:21:07] Validation | Batch 200/732 | Loss: 0.3643 | LM: 0.3013
|
| 121 |
+
[2026-04-07 18:21:08] Validation | Batch 210/732 | Loss: 0.3635 | LM: 0.3005
|
| 122 |
+
[2026-04-07 18:21:09] Validation | Batch 220/732 | Loss: 0.3630 | LM: 0.2999
|
| 123 |
+
[2026-04-07 18:21:10] Validation | Batch 230/732 | Loss: 0.3634 | LM: 0.3004
|
| 124 |
+
[2026-04-07 18:21:11] Validation | Batch 240/732 | Loss: 0.3631 | LM: 0.3001
|
| 125 |
+
[2026-04-07 18:21:13] Validation | Batch 250/732 | Loss: 0.3632 | LM: 0.3001
|
| 126 |
+
[2026-04-07 18:21:14] Validation | Batch 260/732 | Loss: 0.3622 | LM: 0.2991
|
| 127 |
+
[2026-04-07 18:21:15] Validation | Batch 270/732 | Loss: 0.3619 | LM: 0.2988
|
| 128 |
+
[2026-04-07 18:21:16] Validation | Batch 280/732 | Loss: 0.3608 | LM: 0.2977
|
| 129 |
+
[2026-04-07 18:21:17] Validation | Batch 290/732 | Loss: 0.3607 | LM: 0.2976
|
| 130 |
+
[2026-04-07 18:21:18] Validation | Batch 300/732 | Loss: 0.3606 | LM: 0.2975
|
| 131 |
+
[2026-04-07 18:21:19] Validation | Batch 310/732 | Loss: 0.3605 | LM: 0.2974
|
| 132 |
+
[2026-04-07 18:21:20] Validation | Batch 320/732 | Loss: 0.3595 | LM: 0.2964
|
| 133 |
+
[2026-04-07 18:21:21] Validation | Batch 330/732 | Loss: 0.3584 | LM: 0.2953
|
| 134 |
+
[2026-04-07 18:21:22] Validation | Batch 340/732 | Loss: 0.3578 | LM: 0.2947
|
| 135 |
+
[2026-04-07 18:21:23] Validation | Batch 350/732 | Loss: 0.3581 | LM: 0.2950
|
| 136 |
+
[2026-04-07 18:21:24] Validation | Batch 360/732 | Loss: 0.3589 | LM: 0.2958
|
| 137 |
+
[2026-04-07 18:21:25] Validation | Batch 370/732 | Loss: 0.3580 | LM: 0.2948
|
| 138 |
+
[2026-04-07 18:21:26] Validation | Batch 380/732 | Loss: 0.3573 | LM: 0.2942
|
| 139 |
+
[2026-04-07 18:21:27] Validation | Batch 390/732 | Loss: 0.3569 | LM: 0.2938
|
| 140 |
+
[2026-04-07 18:21:28] Validation | Batch 400/732 | Loss: 0.3567 | LM: 0.2936
|
| 141 |
+
[2026-04-07 18:21:29] Validation | Batch 410/732 | Loss: 0.3560 | LM: 0.2929
|
| 142 |
+
[2026-04-07 18:21:30] Validation | Batch 420/732 | Loss: 0.3562 | LM: 0.2931
|
| 143 |
+
[2026-04-07 18:21:31] Validation | Batch 430/732 | Loss: 0.3561 | LM: 0.2930
|
| 144 |
+
[2026-04-07 18:21:32] Validation | Batch 440/732 | Loss: 0.3556 | LM: 0.2924
|
| 145 |
+
[2026-04-07 18:21:33] Validation | Batch 450/732 | Loss: 0.3554 | LM: 0.2923
|
| 146 |
+
[2026-04-07 18:21:35] Validation | Batch 460/732 | Loss: 0.3557 | LM: 0.2926
|
| 147 |
+
[2026-04-07 18:21:36] Validation | Batch 470/732 | Loss: 0.3555 | LM: 0.2924
|
| 148 |
+
[2026-04-07 18:21:37] Validation | Batch 480/732 | Loss: 0.3557 | LM: 0.2926
|
| 149 |
+
[2026-04-07 18:21:38] Validation | Batch 490/732 | Loss: 0.3568 | LM: 0.2937
|
| 150 |
+
[2026-04-07 18:21:39] Validation | Batch 500/732 | Loss: 0.3579 | LM: 0.2947
|
| 151 |
+
[2026-04-07 18:21:40] Validation | Batch 510/732 | Loss: 0.3575 | LM: 0.2944
|
| 152 |
+
[2026-04-07 18:21:41] Validation | Batch 520/732 | Loss: 0.3573 | LM: 0.2942
|
| 153 |
+
[2026-04-07 18:21:42] Validation | Batch 530/732 | Loss: 0.3567 | LM: 0.2936
|
| 154 |
+
[2026-04-07 18:21:43] Validation | Batch 540/732 | Loss: 0.3569 | LM: 0.2937
|
| 155 |
+
[2026-04-07 18:21:44] Validation | Batch 550/732 | Loss: 0.3568 | LM: 0.2937
|
| 156 |
+
[2026-04-07 18:21:45] Validation | Batch 560/732 | Loss: 0.3564 | LM: 0.2933
|
| 157 |
+
[2026-04-07 18:21:46] Validation | Batch 570/732 | Loss: 0.3565 | LM: 0.2934
|
| 158 |
+
[2026-04-07 18:21:48] Validation | Batch 580/732 | Loss: 0.3562 | LM: 0.2931
|
| 159 |
+
[2026-04-07 18:21:49] Validation | Batch 590/732 | Loss: 0.3562 | LM: 0.2930
|
| 160 |
+
[2026-04-07 18:21:50] Validation | Batch 600/732 | Loss: 0.3561 | LM: 0.2930
|
| 161 |
+
[2026-04-07 18:21:51] Validation | Batch 610/732 | Loss: 0.3567 | LM: 0.2936
|
| 162 |
+
[2026-04-07 18:21:52] Validation | Batch 620/732 | Loss: 0.3571 | LM: 0.2940
|
| 163 |
+
[2026-04-07 18:21:53] Validation | Batch 630/732 | Loss: 0.3569 | LM: 0.2937
|
| 164 |
+
[2026-04-07 18:21:55] Validation | Batch 640/732 | Loss: 0.3566 | LM: 0.2935
|
| 165 |
+
[2026-04-07 18:21:56] Validation | Batch 650/732 | Loss: 0.3565 | LM: 0.2933
|
| 166 |
+
[2026-04-07 18:21:57] Validation | Batch 660/732 | Loss: 0.3569 | LM: 0.2938
|
| 167 |
+
[2026-04-07 18:21:58] Validation | Batch 670/732 | Loss: 0.3576 | LM: 0.2944
|
| 168 |
+
[2026-04-07 18:21:59] Validation | Batch 680/732 | Loss: 0.3575 | LM: 0.2944
|
| 169 |
+
[2026-04-07 18:22:00] Validation | Batch 690/732 | Loss: 0.3577 | LM: 0.2946
|
| 170 |
+
[2026-04-07 18:22:01] Validation | Batch 700/732 | Loss: 0.3582 | LM: 0.2951
|
| 171 |
+
[2026-04-07 18:22:02] Validation | Batch 710/732 | Loss: 0.3586 | LM: 0.2955
|
| 172 |
+
[2026-04-07 18:22:03] Validation | Batch 720/732 | Loss: 0.3596 | LM: 0.2964
|
| 173 |
+
[2026-04-07 18:22:04] Validation | Batch 730/732 | Loss: 0.3592 | LM: 0.2961
|
| 174 |
+
[2026-04-07 18:22:04] Validation | Batch 732/732 | Loss: 0.3591 | LM: 0.2959
|
| 175 |
+
[2026-04-07 18:22:04] Validation | Loss: 0.3591 | LM: 0.2959 | PPL: 1.35 | Time: 79.46s
|
| 176 |
+
[2026-04-07 18:22:07] New best model saved! Val loss: 0.3591
|
| 177 |
+
[2026-04-07 18:22:16] Epoch 1 | Step 260 | Loss: 0.3854 | LM: 0.3169 | LB: 1.3433 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.422/SR1: 0.396 | LR: 1.00e-04
|
| 178 |
+
[2026-04-07 18:22:25] Epoch 1 | Step 270 | Loss: 0.3854 | LM: 0.3153 | LB: 1.3405 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.420/SR1: 0.394 | LR: 1.00e-04
|
| 179 |
+
[2026-04-07 18:22:34] Epoch 1 | Step 280 | Loss: 0.3852 | LM: 0.3155 | LB: 1.3376 | CL0: 2.8 | CL1: 2.4 | HR0: 0.354/SR0: 0.352 | HR1: 0.419/SR1: 0.393 | LR: 1.00e-04
|
| 180 |
+
[2026-04-07 18:22:43] Epoch 1 | Step 290 | Loss: 0.3834 | LM: 0.3135 | LB: 1.3348 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.417/SR1: 0.391 | LR: 1.00e-04
|
| 181 |
+
[2026-04-07 18:22:52] Epoch 1 | Step 300 | Loss: 0.3827 | LM: 0.3130 | LB: 1.3326 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.416/SR1: 0.389 | LR: 1.00e-04
|
| 182 |
+
[2026-04-07 18:23:01] Epoch 1 | Step 310 | Loss: 0.3817 | LM: 0.3121 | LB: 1.3305 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.415/SR1: 0.387 | LR: 1.00e-04
|
| 183 |
+
[2026-04-07 18:23:10] Epoch 1 | Step 320 | Loss: 0.3810 | LM: 0.3115 | LB: 1.3280 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.413/SR1: 0.386 | LR: 9.80e-05
|
| 184 |
+
[2026-04-07 18:23:19] Epoch 1 | Step 330 | Loss: 0.3796 | LM: 0.3106 | LB: 1.3255 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.412/SR1: 0.384 | LR: 8.93e-05
|
| 185 |
+
[2026-04-07 18:23:28] Epoch 1 | Step 340 | Loss: 0.3788 | LM: 0.3099 | LB: 1.3231 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.410/SR1: 0.383 | LR: 7.51e-05
|
| 186 |
+
[2026-04-07 18:23:37] Epoch 1 | Step 350 | Loss: 0.3782 | LM: 0.3105 | LB: 1.3210 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.409/SR1: 0.381 | LR: 5.77e-05
|
| 187 |
+
[2026-04-07 18:23:46] Epoch 1 | Step 360 | Loss: 0.3774 | LM: 0.3120 | LB: 1.3191 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.408/SR1: 0.380 | LR: 3.99e-05
|
| 188 |
+
[2026-04-07 18:23:55] Epoch 1 | Step 370 | Loss: 0.3769 | LM: 0.3116 | LB: 1.3176 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.407/SR1: 0.379 | LR: 2.45e-05
|
| 189 |
+
[2026-04-07 18:24:04] Epoch 1 | Step 380 | Loss: 0.3758 | LM: 0.3097 | LB: 1.3158 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.407/SR1: 0.378 | LR: 1.40e-05
|
| 190 |
+
[2026-04-07 18:24:13] Epoch 1 | Step 390 | Loss: 0.3759 | LM: 0.3095 | LB: 1.3138 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.405/SR1: 0.377 | LR: 1.00e-05
|
| 191 |
+
[2026-04-07 18:24:22] Epoch 1 | Step 400 | Loss: 0.3757 | LM: 0.3102 | LB: 1.3121 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.405/SR1: 0.376 | LR: 1.00e-05
|
| 192 |
+
[2026-04-07 18:24:31] Epoch 1 | Step 410 | Loss: 0.3756 | LM: 0.3101 | LB: 1.3104 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.404/SR1: 0.375 | LR: 1.00e-05
|
| 193 |
+
[2026-04-07 18:24:40] Epoch 1 | Step 420 | Loss: 0.3756 | LM: 0.3095 | LB: 1.3089 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.403/SR1: 0.374 | LR: 1.00e-05
|
| 194 |
+
[2026-04-07 18:24:49] Epoch 1 | Step 430 | Loss: 0.3750 | LM: 0.3078 | LB: 1.3075 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.402/SR1: 0.373 | LR: 1.00e-05
|
| 195 |
+
[2026-04-07 18:24:58] Epoch 1 | Step 440 | Loss: 0.3742 | LM: 0.3069 | LB: 1.3064 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.351 | HR1: 0.402/SR1: 0.373 | LR: 1.00e-05
|
| 196 |
+
[2026-04-07 18:25:07] Epoch 1 | Step 450 | Loss: 0.3735 | LM: 0.3056 | LB: 1.3051 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.401/SR1: 0.372 | LR: 1.00e-05
|
| 197 |
+
[2026-04-07 18:25:16] Epoch 1 | Step 460 | Loss: 0.3736 | LM: 0.3047 | LB: 1.3038 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.401/SR1: 0.371 | LR: 1.00e-05
|
| 198 |
+
[2026-04-07 18:25:25] Epoch 1 | Step 470 | Loss: 0.3734 | LM: 0.3055 | LB: 1.3029 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.400/SR1: 0.370 | LR: 1.00e-05
|
| 199 |
+
[2026-04-07 18:25:34] Epoch 1 | Step 480 | Loss: 0.3730 | LM: 0.3048 | LB: 1.3016 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.350 | HR1: 0.399/SR1: 0.369 | LR: 1.00e-05
|
| 200 |
+
[2026-04-07 18:25:43] Epoch 1 | Step 490 | Loss: 0.3722 | LM: 0.3037 | LB: 1.3006 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.399/SR1: 0.368 | LR: 1.00e-05
|
| 201 |
+
[2026-04-07 18:25:52] Epoch 1 | Step 500 | Loss: 0.3724 | LM: 0.3041 | LB: 1.2995 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.350 | HR1: 0.398/SR1: 0.368 | LR: 1.00e-05
|
| 202 |
+
[2026-04-07 18:25:53] Validation | Batch 10/732 | Loss: 0.3368 | LM: 0.2749
|
| 203 |
+
[2026-04-07 18:25:54] Validation | Batch 20/732 | Loss: 0.3593 | LM: 0.2970
|
| 204 |
+
[2026-04-07 18:25:56] Validation | Batch 30/732 | Loss: 0.3508 | LM: 0.2887
|
| 205 |
+
[2026-04-07 18:25:57] Validation | Batch 40/732 | Loss: 0.3562 | LM: 0.2941
|
| 206 |
+
[2026-04-07 18:25:58] Validation | Batch 50/732 | Loss: 0.3562 | LM: 0.2941
|
| 207 |
+
[2026-04-07 18:25:59] Validation | Batch 60/732 | Loss: 0.3582 | LM: 0.2961
|
| 208 |
+
[2026-04-07 18:26:00] Validation | Batch 70/732 | Loss: 0.3616 | LM: 0.2995
|
| 209 |
+
[2026-04-07 18:26:01] Validation | Batch 80/732 | Loss: 0.3598 | LM: 0.2979
|
| 210 |
+
[2026-04-07 18:26:02] Validation | Batch 90/732 | Loss: 0.3596 | LM: 0.2977
|
| 211 |
+
[2026-04-07 18:26:03] Validation | Batch 100/732 | Loss: 0.3606 | LM: 0.2986
|
| 212 |
+
[2026-04-07 18:26:05] Validation | Batch 110/732 | Loss: 0.3573 | LM: 0.2953
|
| 213 |
+
[2026-04-07 18:26:06] Validation | Batch 120/732 | Loss: 0.3604 | LM: 0.2984
|
| 214 |
+
[2026-04-07 18:26:07] Validation | Batch 130/732 | Loss: 0.3616 | LM: 0.2996
|
| 215 |
+
[2026-04-07 18:26:08] Validation | Batch 140/732 | Loss: 0.3610 | LM: 0.2990
|
| 216 |
+
[2026-04-07 18:26:09] Validation | Batch 150/732 | Loss: 0.3604 | LM: 0.2984
|
| 217 |
+
[2026-04-07 18:26:10] Validation | Batch 160/732 | Loss: 0.3595 | LM: 0.2975
|
| 218 |
+
[2026-04-07 18:26:11] Validation | Batch 170/732 | Loss: 0.3600 | LM: 0.2980
|
| 219 |
+
[2026-04-07 18:26:12] Validation | Batch 180/732 | Loss: 0.3614 | LM: 0.2995
|
| 220 |
+
[2026-04-07 18:26:13] Validation | Batch 190/732 | Loss: 0.3607 | LM: 0.2987
|
| 221 |
+
[2026-04-07 18:26:14] Validation | Batch 200/732 | Loss: 0.3608 | LM: 0.2989
|
| 222 |
+
[2026-04-07 18:26:15] Validation | Batch 210/732 | Loss: 0.3601 | LM: 0.2981
|
| 223 |
+
[2026-04-07 18:26:16] Validation | Batch 220/732 | Loss: 0.3596 | LM: 0.2976
|
| 224 |
+
[2026-04-07 18:26:17] Validation | Batch 230/732 | Loss: 0.3600 | LM: 0.2980
|
| 225 |
+
[2026-04-07 18:26:19] Validation | Batch 240/732 | Loss: 0.3597 | LM: 0.2978
|
| 226 |
+
[2026-04-07 18:26:20] Validation | Batch 250/732 | Loss: 0.3597 | LM: 0.2977
|
| 227 |
+
[2026-04-07 18:26:21] Validation | Batch 260/732 | Loss: 0.3587 | LM: 0.2967
|
| 228 |
+
[2026-04-07 18:26:22] Validation | Batch 270/732 | Loss: 0.3585 | LM: 0.2965
|
| 229 |
+
[2026-04-07 18:26:23] Validation | Batch 280/732 | Loss: 0.3576 | LM: 0.2955
|
| 230 |
+
[2026-04-07 18:26:24] Validation | Batch 290/732 | Loss: 0.3573 | LM: 0.2953
|
| 231 |
+
[2026-04-07 18:26:25] Validation | Batch 300/732 | Loss: 0.3573 | LM: 0.2953
|
| 232 |
+
[2026-04-07 18:26:26] Validation | Batch 310/732 | Loss: 0.3572 | LM: 0.2951
|
| 233 |
+
[2026-04-07 18:26:27] Validation | Batch 320/732 | Loss: 0.3562 | LM: 0.2942
|
| 234 |
+
[2026-04-07 18:26:28] Validation | Batch 330/732 | Loss: 0.3551 | LM: 0.2931
|
| 235 |
+
[2026-04-07 18:26:29] Validation | Batch 340/732 | Loss: 0.3545 | LM: 0.2925
|
| 236 |
+
[2026-04-07 18:26:30] Validation | Batch 350/732 | Loss: 0.3549 | LM: 0.2929
|
| 237 |
+
[2026-04-07 18:26:31] Validation | Batch 360/732 | Loss: 0.3557 | LM: 0.2937
|
| 238 |
+
[2026-04-07 18:26:32] Validation | Batch 370/732 | Loss: 0.3548 | LM: 0.2927
|
| 239 |
+
[2026-04-07 18:26:33] Validation | Batch 380/732 | Loss: 0.3541 | LM: 0.2921
|
| 240 |
+
[2026-04-07 18:26:34] Validation | Batch 390/732 | Loss: 0.3538 | LM: 0.2917
|
| 241 |
+
[2026-04-07 18:26:35] Validation | Batch 400/732 | Loss: 0.3536 | LM: 0.2916
|
| 242 |
+
[2026-04-07 18:26:36] Validation | Batch 410/732 | Loss: 0.3529 | LM: 0.2908
|
| 243 |
+
[2026-04-07 18:26:37] Validation | Batch 420/732 | Loss: 0.3531 | LM: 0.2911
|
| 244 |
+
[2026-04-07 18:26:38] Validation | Batch 430/732 | Loss: 0.3531 | LM: 0.2910
|
| 245 |
+
[2026-04-07 18:26:40] Validation | Batch 440/732 | Loss: 0.3525 | LM: 0.2905
|
| 246 |
+
[2026-04-07 18:26:41] Validation | Batch 450/732 | Loss: 0.3524 | LM: 0.2903
|
| 247 |
+
[2026-04-07 18:26:42] Validation | Batch 460/732 | Loss: 0.3527 | LM: 0.2907
|
| 248 |
+
[2026-04-07 18:26:43] Validation | Batch 470/732 | Loss: 0.3525 | LM: 0.2904
|
| 249 |
+
[2026-04-07 18:26:44] Validation | Batch 480/732 | Loss: 0.3526 | LM: 0.2906
|
| 250 |
+
[2026-04-07 18:26:45] Validation | Batch 490/732 | Loss: 0.3537 | LM: 0.2916
|
| 251 |
+
[2026-04-07 18:26:46] Validation | Batch 500/732 | Loss: 0.3547 | LM: 0.2927
|
| 252 |
+
[2026-04-07 18:26:47] Validation | Batch 510/732 | Loss: 0.3544 | LM: 0.2924
|
| 253 |
+
[2026-04-07 18:26:48] Validation | Batch 520/732 | Loss: 0.3542 | LM: 0.2921
|
| 254 |
+
[2026-04-07 18:26:49] Validation | Batch 530/732 | Loss: 0.3537 | LM: 0.2916
|
| 255 |
+
[2026-04-07 18:26:50] Validation | Batch 540/732 | Loss: 0.3538 | LM: 0.2918
|
| 256 |
+
[2026-04-07 18:26:51] Validation | Batch 550/732 | Loss: 0.3538 | LM: 0.2917
|
| 257 |
+
[2026-04-07 18:26:52] Validation | Batch 560/732 | Loss: 0.3533 | LM: 0.2912
|
| 258 |
+
[2026-04-07 18:26:53] Validation | Batch 570/732 | Loss: 0.3533 | LM: 0.2913
|
| 259 |
+
[2026-04-07 18:26:55] Validation | Batch 580/732 | Loss: 0.3530 | LM: 0.2910
|
| 260 |
+
[2026-04-07 18:26:56] Validation | Batch 590/732 | Loss: 0.3530 | LM: 0.2910
|
| 261 |
+
[2026-04-07 18:26:57] Validation | Batch 600/732 | Loss: 0.3530 | LM: 0.2909
|
| 262 |
+
[2026-04-07 18:26:58] Validation | Batch 610/732 | Loss: 0.3535 | LM: 0.2914
|
| 263 |
+
[2026-04-07 18:26:59] Validation | Batch 620/732 | Loss: 0.3539 | LM: 0.2918
|
| 264 |
+
[2026-04-07 18:27:00] Validation | Batch 630/732 | Loss: 0.3537 | LM: 0.2916
|
| 265 |
+
[2026-04-07 18:27:02] Validation | Batch 640/732 | Loss: 0.3534 | LM: 0.2913
|
| 266 |
+
[2026-04-07 18:27:03] Validation | Batch 650/732 | Loss: 0.3532 | LM: 0.2912
|
| 267 |
+
[2026-04-07 18:27:04] Validation | Batch 660/732 | Loss: 0.3537 | LM: 0.2917
|
| 268 |
+
[2026-04-07 18:27:05] Validation | Batch 670/732 | Loss: 0.3544 | LM: 0.2923
|
| 269 |
+
[2026-04-07 18:27:06] Validation | Batch 680/732 | Loss: 0.3544 | LM: 0.2923
|
| 270 |
+
[2026-04-07 18:27:07] Validation | Batch 690/732 | Loss: 0.3545 | LM: 0.2924
|
| 271 |
+
[2026-04-07 18:27:08] Validation | Batch 700/732 | Loss: 0.3550 | LM: 0.2929
|
| 272 |
+
[2026-04-07 18:27:09] Validation | Batch 710/732 | Loss: 0.3554 | LM: 0.2933
|
| 273 |
+
[2026-04-07 18:27:10] Validation | Batch 720/732 | Loss: 0.3563 | LM: 0.2943
|
| 274 |
+
[2026-04-07 18:27:11] Validation | Batch 730/732 | Loss: 0.3560 | LM: 0.2939
|
| 275 |
+
[2026-04-07 18:27:11] Validation | Batch 732/732 | Loss: 0.3558 | LM: 0.2938
|
| 276 |
+
[2026-04-07 18:27:11] Validation | Loss: 0.3558 | LM: 0.2938 | PPL: 1.34 | Time: 78.98s
|
| 277 |
+
[2026-04-07 18:27:14] New best model saved! Val loss: 0.3558
|
| 278 |
+
[2026-04-07 18:27:23] Epoch 1 | Step 510 | Loss: 0.3723 | LM: 0.3024 | LB: 1.2988 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.350 | HR1: 0.398/SR1: 0.367 | LR: 1.00e-05
|
| 279 |
+
[2026-04-07 18:27:33] Epoch 1 | Step 520 | Loss: 0.3719 | LM: 0.3015 | LB: 1.2975 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.397/SR1: 0.366 | LR: 1.00e-05
|
| 280 |
+
[2026-04-07 18:27:42] Epoch 1 | Step 530 | Loss: 0.3716 | LM: 0.3006 | LB: 1.2966 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.397/SR1: 0.366 | LR: 1.00e-05
|
| 281 |
+
[2026-04-07 18:27:51] Epoch 1 | Step 540 | Loss: 0.3713 | LM: 0.3004 | LB: 1.2953 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.396/SR1: 0.365 | LR: 1.00e-05
|
| 282 |
+
[2026-04-07 18:28:00] Epoch 1 | Step 550 | Loss: 0.3709 | LM: 0.3009 | LB: 1.2940 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.396/SR1: 0.365 | LR: 1.00e-05
|
| 283 |
+
[2026-04-07 18:28:09] Epoch 1 | Step 560 | Loss: 0.3711 | LM: 0.3016 | LB: 1.2935 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.364 | LR: 1.00e-05
|
| 284 |
+
[2026-04-07 18:28:18] Epoch 1 | Step 570 | Loss: 0.3713 | LM: 0.3021 | LB: 1.2928 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.364 | LR: 1.00e-05
|
| 285 |
+
[2026-04-07 18:28:27] Epoch 1 | Step 580 | Loss: 0.3708 | LM: 0.3019 | LB: 1.2920 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.363 | LR: 1.00e-05
|
| 286 |
+
[2026-04-07 18:28:36] Epoch 1 | Step 590 | Loss: 0.3713 | LM: 0.3033 | LB: 1.2910 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.394/SR1: 0.363 | LR: 1.00e-05
|
| 287 |
+
[2026-04-07 18:28:45] Epoch 1 | Step 600 | Loss: 0.3708 | LM: 0.3026 | LB: 1.2905 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.394/SR1: 0.362 | LR: 1.00e-05
|
| 288 |
+
[2026-04-07 18:28:54] Epoch 1 | Step 610 | Loss: 0.3704 | LM: 0.3023 | LB: 1.2900 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.393/SR1: 0.362 | LR: 1.00e-05
|
| 289 |
+
[2026-04-07 18:29:03] Epoch 1 | Step 620 | Loss: 0.3701 | LM: 0.3024 | LB: 1.2893 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.393/SR1: 0.361 | LR: 1.00e-05
|
| 290 |
+
[2026-04-07 18:29:12] Epoch 1 | Step 630 | Loss: 0.3697 | LM: 0.3022 | LB: 1.2884 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.393/SR1: 0.361 | LR: 1.00e-05
|
| 291 |
+
[2026-04-07 18:29:21] Epoch 1 | Step 640 | Loss: 0.3693 | LM: 0.3022 | LB: 1.2879 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.392/SR1: 0.360 | LR: 1.00e-05
|
| 292 |
+
[2026-04-07 18:29:30] Epoch 1 | Step 650 | Loss: 0.3695 | LM: 0.3018 | LB: 1.2872 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.392/SR1: 0.360 | LR: 1.00e-05
|
| 293 |
+
[2026-04-07 18:29:39] Epoch 1 | Step 660 | Loss: 0.3690 | LM: 0.3015 | LB: 1.2867 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.391/SR1: 0.359 | LR: 1.00e-05
|
| 294 |
+
[2026-04-07 18:29:48] Epoch 1 | Step 670 | Loss: 0.3697 | LM: 0.3019 | LB: 1.2864 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.391/SR1: 0.359 | LR: 1.00e-05
|
| 295 |
+
[2026-04-07 18:29:57] Epoch 1 | Step 680 | Loss: 0.3693 | LM: 0.3022 | LB: 1.2860 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.391/SR1: 0.359 | LR: 1.00e-05
|
| 296 |
+
[2026-04-07 18:30:06] Epoch 1 | Step 690 | Loss: 0.3694 | LM: 0.3027 | LB: 1.2858 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.391/SR1: 0.358 | LR: 1.00e-05
|
| 297 |
+
[2026-04-07 18:30:15] Epoch 1 | Step 700 | Loss: 0.3696 | LM: 0.3028 | LB: 1.2853 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.358 | LR: 1.00e-05
|
| 298 |
+
[2026-04-07 18:30:24] Epoch 1 | Step 710 | Loss: 0.3693 | LM: 0.3024 | LB: 1.2846 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.358 | LR: 1.00e-05
|
| 299 |
+
[2026-04-07 18:30:33] Epoch 1 | Step 720 | Loss: 0.3689 | LM: 0.3023 | LB: 1.2842 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.357 | LR: 1.00e-05
|
| 300 |
+
[2026-04-07 18:30:42] Epoch 1 | Step 730 | Loss: 0.3686 | LM: 0.3030 | LB: 1.2839 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.357 | LR: 1.00e-05
|
| 301 |
+
[2026-04-07 18:30:51] Epoch 1 | Step 740 | Loss: 0.3684 | LM: 0.3032 | LB: 1.2836 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.389/SR1: 0.357 | LR: 1.00e-05
|
| 302 |
+
[2026-04-07 18:31:00] Epoch 1 | Step 750 | Loss: 0.3682 | LM: 0.3028 | LB: 1.2829 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.389/SR1: 0.357 | LR: 1.00e-05
|
| 303 |
+
[2026-04-07 18:31:01] Validation | Batch 10/732 | Loss: 0.3367 | LM: 0.2749
|
| 304 |
+
[2026-04-07 18:31:02] Validation | Batch 20/732 | Loss: 0.3589 | LM: 0.2966
|
| 305 |
+
[2026-04-07 18:31:04] Validation | Batch 30/732 | Loss: 0.3502 | LM: 0.2882
|
| 306 |
+
[2026-04-07 18:31:05] Validation | Batch 40/732 | Loss: 0.3557 | LM: 0.2936
|
| 307 |
+
[2026-04-07 18:31:06] Validation | Batch 50/732 | Loss: 0.3557 | LM: 0.2936
|
| 308 |
+
[2026-04-07 18:31:07] Validation | Batch 60/732 | Loss: 0.3577 | LM: 0.2957
|
| 309 |
+
[2026-04-07 18:31:08] Validation | Batch 70/732 | Loss: 0.3612 | LM: 0.2991
|
| 310 |
+
[2026-04-07 18:31:09] Validation | Batch 80/732 | Loss: 0.3594 | LM: 0.2975
|
| 311 |
+
[2026-04-07 18:31:10] Validation | Batch 90/732 | Loss: 0.3592 | LM: 0.2973
|
| 312 |
+
[2026-04-07 18:31:12] Validation | Batch 100/732 | Loss: 0.3602 | LM: 0.2983
|
| 313 |
+
[2026-04-07 18:31:13] Validation | Batch 110/732 | Loss: 0.3569 | LM: 0.2950
|
| 314 |
+
[2026-04-07 18:31:14] Validation | Batch 120/732 | Loss: 0.3600 | LM: 0.2981
|
| 315 |
+
[2026-04-07 18:31:15] Validation | Batch 130/732 | Loss: 0.3612 | LM: 0.2993
|
| 316 |
+
[2026-04-07 18:31:16] Validation | Batch 140/732 | Loss: 0.3606 | LM: 0.2987
|
| 317 |
+
[2026-04-07 18:31:17] Validation | Batch 150/732 | Loss: 0.3600 | LM: 0.2980
|
| 318 |
+
[2026-04-07 18:31:18] Validation | Batch 160/732 | Loss: 0.3591 | LM: 0.2972
|
| 319 |
+
[2026-04-07 18:31:19] Validation | Batch 170/732 | Loss: 0.3596 | LM: 0.2977
|
| 320 |
+
[2026-04-07 18:31:20] Validation | Batch 180/732 | Loss: 0.3610 | LM: 0.2991
|
| 321 |
+
[2026-04-07 18:31:21] Validation | Batch 190/732 | Loss: 0.3602 | LM: 0.2983
|
| 322 |
+
[2026-04-07 18:31:22] Validation | Batch 200/732 | Loss: 0.3604 | LM: 0.2985
|
| 323 |
+
[2026-04-07 18:31:23] Validation | Batch 210/732 | Loss: 0.3597 | LM: 0.2977
|
| 324 |
+
[2026-04-07 18:31:24] Validation | Batch 220/732 | Loss: 0.3591 | LM: 0.2972
|
| 325 |
+
[2026-04-07 18:31:25] Validation | Batch 230/732 | Loss: 0.3596 | LM: 0.2976
|
| 326 |
+
[2026-04-07 18:31:27] Validation | Batch 240/732 | Loss: 0.3593 | LM: 0.2973
|
| 327 |
+
[2026-04-07 18:31:28] Validation | Batch 250/732 | Loss: 0.3593 | LM: 0.2973
|
| 328 |
+
[2026-04-07 18:31:29] Validation | Batch 260/732 | Loss: 0.3583 | LM: 0.2963
|
| 329 |
+
[2026-04-07 18:31:30] Validation | Batch 270/732 | Loss: 0.3581 | LM: 0.2961
|
| 330 |
+
[2026-04-07 18:31:31] Validation | Batch 280/732 | Loss: 0.3571 | LM: 0.2951
|
| 331 |
+
[2026-04-07 18:31:32] Validation | Batch 290/732 | Loss: 0.3569 | LM: 0.2949
|
| 332 |
+
[2026-04-07 18:31:33] Validation | Batch 300/732 | Loss: 0.3568 | LM: 0.2948
|
| 333 |
+
[2026-04-07 18:31:34] Validation | Batch 310/732 | Loss: 0.3567 | LM: 0.2946
|
| 334 |
+
[2026-04-07 18:31:35] Validation | Batch 320/732 | Loss: 0.3558 | LM: 0.2937
|
| 335 |
+
[2026-04-07 18:31:36] Validation | Batch 330/732 | Loss: 0.3547 | LM: 0.2926
|
| 336 |
+
[2026-04-07 18:31:37] Validation | Batch 340/732 | Loss: 0.3541 | LM: 0.2920
|
| 337 |
+
[2026-04-07 18:31:38] Validation | Batch 350/732 | Loss: 0.3544 | LM: 0.2924
|
| 338 |
+
[2026-04-07 18:31:39] Validation | Batch 360/732 | Loss: 0.3552 | LM: 0.2932
|
| 339 |
+
[2026-04-07 18:31:40] Validation | Batch 370/732 | Loss: 0.3543 | LM: 0.2923
|
| 340 |
+
[2026-04-07 18:31:41] Validation | Batch 380/732 | Loss: 0.3536 | LM: 0.2916
|
| 341 |
+
[2026-04-07 18:31:42] Validation | Batch 390/732 | Loss: 0.3533 | LM: 0.2913
|
| 342 |
+
[2026-04-07 18:31:43] Validation | Batch 400/732 | Loss: 0.3531 | LM: 0.2911
|
| 343 |
+
[2026-04-07 18:31:44] Validation | Batch 410/732 | Loss: 0.3524 | LM: 0.2904
|
| 344 |
+
[2026-04-07 18:31:45] Validation | Batch 420/732 | Loss: 0.3526 | LM: 0.2906
|
| 345 |
+
[2026-04-07 18:31:46] Validation | Batch 430/732 | Loss: 0.3526 | LM: 0.2905
|
| 346 |
+
[2026-04-07 18:31:47] Validation | Batch 440/732 | Loss: 0.3521 | LM: 0.2900
|
| 347 |
+
[2026-04-07 18:31:48] Validation | Batch 450/732 | Loss: 0.3519 | LM: 0.2898
|
| 348 |
+
[2026-04-07 18:31:50] Validation | Batch 460/732 | Loss: 0.3523 | LM: 0.2902
|
| 349 |
+
[2026-04-07 18:31:51] Validation | Batch 470/732 | Loss: 0.3520 | LM: 0.2900
|
| 350 |
+
[2026-04-07 18:31:52] Validation | Batch 480/732 | Loss: 0.3521 | LM: 0.2901
|
| 351 |
+
[2026-04-07 18:31:53] Validation | Batch 490/732 | Loss: 0.3532 | LM: 0.2912
|
| 352 |
+
[2026-04-07 18:31:54] Validation | Batch 500/732 | Loss: 0.3543 | LM: 0.2922
|
| 353 |
+
[2026-04-07 18:31:55] Validation | Batch 510/732 | Loss: 0.3539 | LM: 0.2919
|
| 354 |
+
[2026-04-07 18:31:56] Validation | Batch 520/732 | Loss: 0.3537 | LM: 0.2917
|
| 355 |
+
[2026-04-07 18:31:57] Validation | Batch 530/732 | Loss: 0.3532 | LM: 0.2911
|
| 356 |
+
[2026-04-07 18:31:58] Validation | Batch 540/732 | Loss: 0.3534 | LM: 0.2913
|
| 357 |
+
[2026-04-07 18:31:59] Validation | Batch 550/732 | Loss: 0.3533 | LM: 0.2912
|
| 358 |
+
[2026-04-07 18:32:00] Validation | Batch 560/732 | Loss: 0.3528 | LM: 0.2908
|
| 359 |
+
[2026-04-07 18:32:02] Validation | Batch 570/732 | Loss: 0.3529 | LM: 0.2908
|
| 360 |
+
[2026-04-07 18:32:03] Validation | Batch 580/732 | Loss: 0.3526 | LM: 0.2905
|
| 361 |
+
[2026-04-07 18:32:04] Validation | Batch 590/732 | Loss: 0.3526 | LM: 0.2905
|
| 362 |
+
[2026-04-07 18:32:05] Validation | Batch 600/732 | Loss: 0.3525 | LM: 0.2904
|
| 363 |
+
[2026-04-07 18:32:06] Validation | Batch 610/732 | Loss: 0.3530 | LM: 0.2910
|
| 364 |
+
[2026-04-07 18:32:07] Validation | Batch 620/732 | Loss: 0.3534 | LM: 0.2914
|
| 365 |
+
[2026-04-07 18:32:08] Validation | Batch 630/732 | Loss: 0.3532 | LM: 0.2911
|
| 366 |
+
[2026-04-07 18:32:10] Validation | Batch 640/732 | Loss: 0.3529 | LM: 0.2909
|
| 367 |
+
[2026-04-07 18:32:11] Validation | Batch 650/732 | Loss: 0.3528 | LM: 0.2907
|
| 368 |
+
[2026-04-07 18:32:12] Validation | Batch 660/732 | Loss: 0.3533 | LM: 0.2912
|
| 369 |
+
[2026-04-07 18:32:13] Validation | Batch 670/732 | Loss: 0.3539 | LM: 0.2919
|
| 370 |
+
[2026-04-07 18:32:14] Validation | Batch 680/732 | Loss: 0.3539 | LM: 0.2919
|
| 371 |
+
[2026-04-07 18:32:15] Validation | Batch 690/732 | Loss: 0.3540 | LM: 0.2920
|
| 372 |
+
[2026-04-07 18:32:16] Validation | Batch 700/732 | Loss: 0.3545 | LM: 0.2925
|
| 373 |
+
[2026-04-07 18:32:17] Validation | Batch 710/732 | Loss: 0.3549 | LM: 0.2929
|
| 374 |
+
[2026-04-07 18:32:18] Validation | Batch 720/732 | Loss: 0.3559 | LM: 0.2938
|
| 375 |
+
[2026-04-07 18:32:19] Validation | Batch 730/732 | Loss: 0.3555 | LM: 0.2935
|
| 376 |
+
[2026-04-07 18:32:19] Validation | Batch 732/732 | Loss: 0.3553 | LM: 0.2933
|
| 377 |
+
[2026-04-07 18:32:19] Validation | Loss: 0.3553 | LM: 0.2933 | PPL: 1.34 | Time: 78.83s
|
| 378 |
+
[2026-04-07 18:32:26] New best model saved! Val loss: 0.3553
|
| 379 |
+
[2026-04-07 18:32:35] Epoch 1 | Step 760 | Loss: 0.3679 | LM: 0.3019 | LB: 1.2825 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.389/SR1: 0.356 | LR: 1.00e-05
|
| 380 |
+
[2026-04-07 18:32:44] Epoch 1 | Step 770 | Loss: 0.3676 | LM: 0.3019 | LB: 1.2821 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.389/SR1: 0.356 | LR: 1.00e-05
|
| 381 |
+
[2026-04-07 18:32:53] Epoch 1 | Step 780 | Loss: 0.3679 | LM: 0.3034 | LB: 1.2814 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.389/SR1: 0.356 | LR: 1.00e-05
|
| 382 |
+
[2026-04-07 18:32:54] Reached max_steps=781, stopping training.
|
| 383 |
+
[2026-04-07 18:32:54] Epoch 1 completed in 981.07s | Loss: 0.3678 | CL0: 2.8 | CL1: 2.6
|
| 384 |
+
[2026-04-07 18:32:54]
|
| 385 |
+
Training completed!
|
| 386 |
+
[2026-04-07 18:32:57] Final model: outputs/N_6.0/model_final.pt
|
routing_tuning_test_07_04/N_8.0/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
|
| 3 |
+
checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 4 |
+
training:
|
| 5 |
+
epochs: 1
|
| 6 |
+
max_steps: null
|
| 7 |
+
batch_size: 8
|
| 8 |
+
eval_batch_size: 24
|
| 9 |
+
gradient_accumulation_steps: 4
|
| 10 |
+
lr: 0.0001
|
| 11 |
+
weight_decay: 0.1
|
| 12 |
+
betas:
|
| 13 |
+
- 0.9
|
| 14 |
+
- 0.95
|
| 15 |
+
eps: 1.0e-08
|
| 16 |
+
lr_scheduler: wsd
|
| 17 |
+
warmup_ratio: 0.1
|
| 18 |
+
decay_ratio: 0.2
|
| 19 |
+
warmup_steps: 100
|
| 20 |
+
min_lr_ratio: 0.1
|
| 21 |
+
lr_multiplier:
|
| 22 |
+
- 2.0
|
| 23 |
+
- 1.5
|
| 24 |
+
- 1.0
|
| 25 |
+
load_balancing_weight: 0.05
|
| 26 |
+
load_balancing_N: 8.0
|
| 27 |
+
max_grad_norm: 1.0
|
| 28 |
+
use_amp: true
|
| 29 |
+
resume: false
|
| 30 |
+
resume_checkpoint: null
|
| 31 |
+
warmup_model: true
|
| 32 |
+
data:
|
| 33 |
+
path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
|
| 34 |
+
max_context_len: 4096
|
| 35 |
+
max_target_len: 256
|
| 36 |
+
num_workers: 0
|
| 37 |
+
pin_memory: true
|
| 38 |
+
max_train_samples: 50000
|
| 39 |
+
max_val_samples: null
|
| 40 |
+
logging:
|
| 41 |
+
log_interval: 10
|
| 42 |
+
save_interval: 1000
|
| 43 |
+
eval_interval: 250
|
| 44 |
+
save_every_epoch: false
|
| 45 |
+
model_only_checkpoints: true
|
| 46 |
+
tracking:
|
| 47 |
+
enabled: true
|
| 48 |
+
project: routing-evolution
|
| 49 |
+
run_name: routing_N8.0
|
| 50 |
+
paths:
|
| 51 |
+
output_dir: outputs/N_${training.load_balancing_N}
|
| 52 |
+
seed: 42
|
| 53 |
+
device: cuda
|
routing_tuning_test_07_04/N_8.0/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${paths.output_dir}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- training.load_balancing_N=8.0
|
| 116 |
+
- tracking.run_name=routing_N8.0
|
| 117 |
+
job:
|
| 118 |
+
name: train
|
| 119 |
+
chdir: false
|
| 120 |
+
override_dirname: tracking.run_name=routing_N8.0,training.load_balancing_N=8.0
|
| 121 |
+
id: ???
|
| 122 |
+
num: ???
|
| 123 |
+
config_name: config
|
| 124 |
+
env_set: {}
|
| 125 |
+
env_copy: []
|
| 126 |
+
config:
|
| 127 |
+
override_dirname:
|
| 128 |
+
kv_sep: '='
|
| 129 |
+
item_sep: ','
|
| 130 |
+
exclude_keys: []
|
| 131 |
+
runtime:
|
| 132 |
+
version: 1.3.2
|
| 133 |
+
version_base: '1.3'
|
| 134 |
+
cwd: /workspace/byte-llms-code/routing_evolution_exp
|
| 135 |
+
config_sources:
|
| 136 |
+
- path: hydra.conf
|
| 137 |
+
schema: pkg
|
| 138 |
+
provider: hydra
|
| 139 |
+
- path: /workspace/byte-llms-code/routing_evolution_exp/configs
|
| 140 |
+
schema: file
|
| 141 |
+
provider: main
|
| 142 |
+
- path: ''
|
| 143 |
+
schema: structured
|
| 144 |
+
provider: schema
|
| 145 |
+
output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_8.0
|
| 146 |
+
choices:
|
| 147 |
+
paths: default
|
| 148 |
+
tracking: default
|
| 149 |
+
logging: default
|
| 150 |
+
data: default
|
| 151 |
+
training: default
|
| 152 |
+
model: hnet_xl_code
|
| 153 |
+
hydra/env: default
|
| 154 |
+
hydra/callbacks: null
|
| 155 |
+
hydra/job_logging: default
|
| 156 |
+
hydra/hydra_logging: default
|
| 157 |
+
hydra/hydra_help: default
|
| 158 |
+
hydra/help: default
|
| 159 |
+
hydra/sweeper: basic
|
| 160 |
+
hydra/launcher: basic
|
| 161 |
+
hydra/output: default
|
| 162 |
+
verbose: false
|
routing_tuning_test_07_04/N_8.0/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- training.load_balancing_N=8.0
|
| 2 |
+
- tracking.run_name=routing_N8.0
|
routing_tuning_test_07_04/N_8.0/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a904a1eb0767062c496d1c785af6f85234754836e77f7b8432defda88c98e79
|
| 3 |
+
size 3315165139
|
routing_tuning_test_07_04/N_8.0/model_final.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b37dfdfd5ba634b8cff33353e06b70fc2fcb0bb4997fd44473c4178515bcbf8
|
| 3 |
+
size 3315165484
|
routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
|
| 3 |
+
size 13633736
|
routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_781.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2793d8348710284e5ac059f1dee64775b65061d5315030d4f0b21f9840fe72b
|
| 3 |
+
size 13633752
|
routing_tuning_test_07_04/N_8.0/train.log
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-04-07 18:33:15] CUDA_VISIBLE_DEVICES: 0,1
|
| 2 |
+
[2026-04-07 18:33:15] Number of processes: 2
|
| 3 |
+
[2026-04-07 18:33:15] Mixed precision: bf16
|
| 4 |
+
[2026-04-07 18:33:15] ============================================================
|
| 5 |
+
[2026-04-07 18:33:15] Routing Evolution Experiment | N=8.0
|
| 6 |
+
[2026-04-07 18:33:15] ============================================================
|
| 7 |
+
[2026-04-07 18:33:15] Config:
|
| 8 |
+
model:
|
| 9 |
+
config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
|
| 10 |
+
checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 11 |
+
training:
|
| 12 |
+
epochs: 1
|
| 13 |
+
max_steps: null
|
| 14 |
+
batch_size: 8
|
| 15 |
+
eval_batch_size: 24
|
| 16 |
+
gradient_accumulation_steps: 4
|
| 17 |
+
lr: 0.0001
|
| 18 |
+
weight_decay: 0.1
|
| 19 |
+
betas:
|
| 20 |
+
- 0.9
|
| 21 |
+
- 0.95
|
| 22 |
+
eps: 1.0e-08
|
| 23 |
+
lr_scheduler: wsd
|
| 24 |
+
warmup_ratio: 0.1
|
| 25 |
+
decay_ratio: 0.2
|
| 26 |
+
warmup_steps: 100
|
| 27 |
+
min_lr_ratio: 0.1
|
| 28 |
+
lr_multiplier:
|
| 29 |
+
- 2.0
|
| 30 |
+
- 1.5
|
| 31 |
+
- 1.0
|
| 32 |
+
load_balancing_weight: 0.05
|
| 33 |
+
load_balancing_N: 8.0
|
| 34 |
+
max_grad_norm: 1.0
|
| 35 |
+
use_amp: true
|
| 36 |
+
resume: false
|
| 37 |
+
resume_checkpoint: null
|
| 38 |
+
warmup_model: true
|
| 39 |
+
data:
|
| 40 |
+
path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
|
| 41 |
+
max_context_len: 4096
|
| 42 |
+
max_target_len: 256
|
| 43 |
+
num_workers: 0
|
| 44 |
+
pin_memory: true
|
| 45 |
+
max_train_samples: 50000
|
| 46 |
+
max_val_samples: null
|
| 47 |
+
logging:
|
| 48 |
+
log_interval: 10
|
| 49 |
+
save_interval: 1000
|
| 50 |
+
eval_interval: 250
|
| 51 |
+
save_every_epoch: false
|
| 52 |
+
model_only_checkpoints: true
|
| 53 |
+
tracking:
|
| 54 |
+
enabled: true
|
| 55 |
+
project: routing-evolution
|
| 56 |
+
run_name: routing_N8.0
|
| 57 |
+
paths:
|
| 58 |
+
output_dir: outputs/N_8.0
|
| 59 |
+
seed: 42
|
| 60 |
+
device: cuda
|
| 61 |
+
|
| 62 |
+
[2026-04-07 18:33:15] Loading model...
|
| 63 |
+
[2026-04-07 18:33:21] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 64 |
+
[2026-04-07 18:33:21] Applied LR multipliers: [2.0, 1.5, 1.0]
|
| 65 |
+
[2026-04-07 18:33:21] Warming up model...
|
| 66 |
+
[2026-04-07 18:34:07] Total params: 1,654,090,112
|
| 67 |
+
[2026-04-07 18:34:07] Trainable params: 1,654,090,112
|
| 68 |
+
[2026-04-07 18:34:07] Creating dataloaders...
|
| 69 |
+
[2026-04-07 18:34:07] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
|
| 70 |
+
[2026-04-07 18:34:07] Max steps: 781, Steps per epoch: 3125
|
| 71 |
+
[2026-04-07 18:34:09] Starting training...
|
| 72 |
+
[2026-04-07 18:34:09]
|
| 73 |
+
============================================================
|
| 74 |
+
[2026-04-07 18:34:09] EPOCH 1/1 (step 0)
|
| 75 |
+
[2026-04-07 18:34:09] ============================================================
|
| 76 |
+
[2026-04-07 18:34:43] Epoch 1 | Step 10 | Loss: 0.7353 | LM: 0.6493 | LB: 1.8079 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
|
| 77 |
+
[2026-04-07 18:34:52] Epoch 1 | Step 20 | Loss: 0.6347 | LM: 0.5508 | LB: 1.8018 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.458 | LR: 5.62e-05
|
| 78 |
+
[2026-04-07 18:35:02] Epoch 1 | Step 30 | Loss: 0.5759 | LM: 0.4785 | LB: 1.7936 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.477/SR1: 0.456 | LR: 7.92e-05
|
| 79 |
+
[2026-04-07 18:35:12] Epoch 1 | Step 40 | Loss: 0.5394 | LM: 0.4475 | LB: 1.7712 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.357 | HR1: 0.473/SR1: 0.452 | LR: 1.00e-04
|
| 80 |
+
[2026-04-07 18:35:21] Epoch 1 | Step 50 | Loss: 0.5164 | LM: 0.4096 | LB: 1.7527 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.466/SR1: 0.446 | LR: 1.00e-04
|
| 81 |
+
[2026-04-07 18:35:30] Epoch 1 | Step 60 | Loss: 0.4896 | LM: 0.3812 | LB: 1.7451 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.464/SR1: 0.444 | LR: 1.00e-04
|
| 82 |
+
[2026-04-07 18:35:39] Epoch 1 | Step 70 | Loss: 0.4748 | LM: 0.3774 | LB: 1.7374 | CL0: 2.8 | CL1: 2.2 | HR0: 0.359/SR0: 0.357 | HR1: 0.461/SR1: 0.440 | LR: 1.00e-04
|
| 83 |
+
[2026-04-07 18:35:48] Epoch 1 | Step 80 | Loss: 0.4595 | LM: 0.3666 | LB: 1.7265 | CL0: 2.8 | CL1: 2.2 | HR0: 0.359/SR0: 0.358 | HR1: 0.457/SR1: 0.436 | LR: 1.00e-04
|
| 84 |
+
[2026-04-07 18:35:57] Epoch 1 | Step 90 | Loss: 0.4507 | LM: 0.3538 | LB: 1.7140 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.358 | HR1: 0.452/SR1: 0.431 | LR: 1.00e-04
|
| 85 |
+
[2026-04-07 18:36:06] Epoch 1 | Step 100 | Loss: 0.4452 | LM: 0.3512 | LB: 1.7013 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.450/SR1: 0.427 | LR: 1.00e-04
|
| 86 |
+
[2026-04-07 18:36:15] Epoch 1 | Step 110 | Loss: 0.4404 | LM: 0.3461 | LB: 1.6898 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.355 | HR1: 0.446/SR1: 0.423 | LR: 1.00e-04
|
| 87 |
+
[2026-04-07 18:36:24] Epoch 1 | Step 120 | Loss: 0.4376 | LM: 0.3477 | LB: 1.6780 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.443/SR1: 0.419 | LR: 1.00e-04
|
| 88 |
+
[2026-04-07 18:36:34] Epoch 1 | Step 130 | Loss: 0.4332 | LM: 0.3445 | LB: 1.6688 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.440/SR1: 0.416 | LR: 1.00e-04
|
| 89 |
+
[2026-04-07 18:36:43] Epoch 1 | Step 140 | Loss: 0.4286 | LM: 0.3403 | LB: 1.6577 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.436/SR1: 0.412 | LR: 1.00e-04
|
| 90 |
+
[2026-04-07 18:36:52] Epoch 1 | Step 150 | Loss: 0.4259 | LM: 0.3379 | LB: 1.6492 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.433/SR1: 0.409 | LR: 1.00e-04
|
| 91 |
+
[2026-04-07 18:37:02] Epoch 1 | Step 160 | Loss: 0.4213 | LM: 0.3326 | LB: 1.6406 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.352 | HR1: 0.431/SR1: 0.406 | LR: 1.00e-04
|
| 92 |
+
[2026-04-07 18:37:11] Epoch 1 | Step 170 | Loss: 0.4162 | LM: 0.3280 | LB: 1.6327 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.428/SR1: 0.403 | LR: 1.00e-04
|
| 93 |
+
[2026-04-07 18:37:20] Epoch 1 | Step 180 | Loss: 0.4124 | LM: 0.3240 | LB: 1.6258 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.425/SR1: 0.400 | LR: 1.00e-04
|
| 94 |
+
[2026-04-07 18:37:29] Epoch 1 | Step 190 | Loss: 0.4105 | LM: 0.3239 | LB: 1.6189 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.353 | HR1: 0.423/SR1: 0.396 | LR: 1.00e-04
|
| 95 |
+
[2026-04-07 18:37:38] Epoch 1 | Step 200 | Loss: 0.4078 | LM: 0.3214 | LB: 1.6109 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-04
|
| 96 |
+
[2026-04-07 18:37:47] Epoch 1 | Step 210 | Loss: 0.4074 | LM: 0.3221 | LB: 1.6032 | CL0: 2.8 | CL1: 2.4 | HR0: 0.354/SR0: 0.351 | HR1: 0.417/SR1: 0.391 | LR: 1.00e-04
|
| 97 |
+
[2026-04-07 18:37:56] Epoch 1 | Step 220 | Loss: 0.4063 | LM: 0.3201 | LB: 1.5976 | CL0: 2.8 | CL1: 2.4 | HR0: 0.354/SR0: 0.351 | HR1: 0.415/SR1: 0.388 | LR: 1.00e-04
|
| 98 |
+
[2026-04-07 18:38:05] Epoch 1 | Step 230 | Loss: 0.4048 | LM: 0.3194 | LB: 1.5915 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.351 | HR1: 0.413/SR1: 0.385 | LR: 1.00e-04
|
| 99 |
+
[2026-04-07 18:38:14] Epoch 1 | Step 240 | Loss: 0.4025 | LM: 0.3205 | LB: 1.5857 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.351 | HR1: 0.411/SR1: 0.383 | LR: 1.00e-04
|
| 100 |
+
[2026-04-07 18:38:23] Epoch 1 | Step 250 | Loss: 0.4000 | LM: 0.3185 | LB: 1.5797 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.351 | HR1: 0.409/SR1: 0.380 | LR: 1.00e-04
|
| 101 |
+
[2026-04-07 18:38:24] Validation | Batch 10/732 | Loss: 0.3503 | LM: 0.2788
|
| 102 |
+
[2026-04-07 18:38:25] Validation | Batch 20/732 | Loss: 0.3730 | LM: 0.3007
|
| 103 |
+
[2026-04-07 18:38:27] Validation | Batch 30/732 | Loss: 0.3641 | LM: 0.2922
|
| 104 |
+
[2026-04-07 18:38:28] Validation | Batch 40/732 | Loss: 0.3696 | LM: 0.2978
|
| 105 |
+
[2026-04-07 18:38:29] Validation | Batch 50/732 | Loss: 0.3702 | LM: 0.2982
|
| 106 |
+
[2026-04-07 18:38:30] Validation | Batch 60/732 | Loss: 0.3728 | LM: 0.3010
|
| 107 |
+
[2026-04-07 18:38:31] Validation | Batch 70/732 | Loss: 0.3764 | LM: 0.3044
|
| 108 |
+
[2026-04-07 18:38:32] Validation | Batch 80/732 | Loss: 0.3746 | LM: 0.3029
|
| 109 |
+
[2026-04-07 18:38:33] Validation | Batch 90/732 | Loss: 0.3741 | LM: 0.3024
|
| 110 |
+
[2026-04-07 18:38:35] Validation | Batch 100/732 | Loss: 0.3751 | LM: 0.3033
|
| 111 |
+
[2026-04-07 18:38:36] Validation | Batch 110/732 | Loss: 0.3718 | LM: 0.3000
|
| 112 |
+
[2026-04-07 18:38:37] Validation | Batch 120/732 | Loss: 0.3750 | LM: 0.3032
|
| 113 |
+
[2026-04-07 18:38:38] Validation | Batch 130/732 | Loss: 0.3762 | LM: 0.3045
|
| 114 |
+
[2026-04-07 18:38:39] Validation | Batch 140/732 | Loss: 0.3757 | LM: 0.3039
|
| 115 |
+
[2026-04-07 18:38:40] Validation | Batch 150/732 | Loss: 0.3751 | LM: 0.3033
|
| 116 |
+
[2026-04-07 18:38:41] Validation | Batch 160/732 | Loss: 0.3740 | LM: 0.3023
|
| 117 |
+
[2026-04-07 18:38:42] Validation | Batch 170/732 | Loss: 0.3744 | LM: 0.3026
|
| 118 |
+
[2026-04-07 18:38:43] Validation | Batch 180/732 | Loss: 0.3757 | LM: 0.3040
|
| 119 |
+
[2026-04-07 18:38:44] Validation | Batch 190/732 | Loss: 0.3748 | LM: 0.3031
|
| 120 |
+
[2026-04-07 18:38:45] Validation | Batch 200/732 | Loss: 0.3749 | LM: 0.3032
|
| 121 |
+
[2026-04-07 18:38:46] Validation | Batch 210/732 | Loss: 0.3741 | LM: 0.3024
|
| 122 |
+
[2026-04-07 18:38:47] Validation | Batch 220/732 | Loss: 0.3736 | LM: 0.3018
|
| 123 |
+
[2026-04-07 18:38:48] Validation | Batch 230/732 | Loss: 0.3740 | LM: 0.3022
|
| 124 |
+
[2026-04-07 18:38:49] Validation | Batch 240/732 | Loss: 0.3736 | LM: 0.3019
|
| 125 |
+
[2026-04-07 18:38:51] Validation | Batch 250/732 | Loss: 0.3737 | LM: 0.3019
|
| 126 |
+
[2026-04-07 18:38:52] Validation | Batch 260/732 | Loss: 0.3727 | LM: 0.3009
|
| 127 |
+
[2026-04-07 18:38:53] Validation | Batch 270/732 | Loss: 0.3725 | LM: 0.3006
|
| 128 |
+
[2026-04-07 18:38:54] Validation | Batch 280/732 | Loss: 0.3714 | LM: 0.2996
|
| 129 |
+
[2026-04-07 18:38:55] Validation | Batch 290/732 | Loss: 0.3713 | LM: 0.2994
|
| 130 |
+
[2026-04-07 18:38:56] Validation | Batch 300/732 | Loss: 0.3712 | LM: 0.2993
|
| 131 |
+
[2026-04-07 18:38:57] Validation | Batch 310/732 | Loss: 0.3711 | LM: 0.2992
|
| 132 |
+
[2026-04-07 18:38:58] Validation | Batch 320/732 | Loss: 0.3701 | LM: 0.2982
|
| 133 |
+
[2026-04-07 18:38:59] Validation | Batch 330/732 | Loss: 0.3690 | LM: 0.2971
|
| 134 |
+
[2026-04-07 18:39:00] Validation | Batch 340/732 | Loss: 0.3684 | LM: 0.2965
|
| 135 |
+
[2026-04-07 18:39:01] Validation | Batch 350/732 | Loss: 0.3687 | LM: 0.2968
|
| 136 |
+
[2026-04-07 18:39:02] Validation | Batch 360/732 | Loss: 0.3696 | LM: 0.2977
|
| 137 |
+
[2026-04-07 18:39:03] Validation | Batch 370/732 | Loss: 0.3686 | LM: 0.2967
|
| 138 |
+
[2026-04-07 18:39:04] Validation | Batch 380/732 | Loss: 0.3679 | LM: 0.2960
|
| 139 |
+
[2026-04-07 18:39:05] Validation | Batch 390/732 | Loss: 0.3675 | LM: 0.2956
|
| 140 |
+
[2026-04-07 18:39:06] Validation | Batch 400/732 | Loss: 0.3674 | LM: 0.2955
|
| 141 |
+
[2026-04-07 18:39:07] Validation | Batch 410/732 | Loss: 0.3666 | LM: 0.2947
|
| 142 |
+
[2026-04-07 18:39:08] Validation | Batch 420/732 | Loss: 0.3668 | LM: 0.2949
|
| 143 |
+
[2026-04-07 18:39:09] Validation | Batch 430/732 | Loss: 0.3667 | LM: 0.2948
|
| 144 |
+
[2026-04-07 18:39:10] Validation | Batch 440/732 | Loss: 0.3662 | LM: 0.2943
|
| 145 |
+
[2026-04-07 18:39:11] Validation | Batch 450/732 | Loss: 0.3660 | LM: 0.2941
|
| 146 |
+
[2026-04-07 18:39:13] Validation | Batch 460/732 | Loss: 0.3664 | LM: 0.2945
|
| 147 |
+
[2026-04-07 18:39:14] Validation | Batch 470/732 | Loss: 0.3662 | LM: 0.2943
|
| 148 |
+
[2026-04-07 18:39:15] Validation | Batch 480/732 | Loss: 0.3663 | LM: 0.2944
|
| 149 |
+
[2026-04-07 18:39:16] Validation | Batch 490/732 | Loss: 0.3674 | LM: 0.2955
|
| 150 |
+
[2026-04-07 18:39:17] Validation | Batch 500/732 | Loss: 0.3685 | LM: 0.2966
|
| 151 |
+
[2026-04-07 18:39:18] Validation | Batch 510/732 | Loss: 0.3681 | LM: 0.2962
|
| 152 |
+
[2026-04-07 18:39:19] Validation | Batch 520/732 | Loss: 0.3679 | LM: 0.2960
|
| 153 |
+
[2026-04-07 18:39:20] Validation | Batch 530/732 | Loss: 0.3673 | LM: 0.2954
|
| 154 |
+
[2026-04-07 18:39:21] Validation | Batch 540/732 | Loss: 0.3675 | LM: 0.2956
|
| 155 |
+
[2026-04-07 18:39:22] Validation | Batch 550/732 | Loss: 0.3674 | LM: 0.2955
|
| 156 |
+
[2026-04-07 18:39:23] Validation | Batch 560/732 | Loss: 0.3670 | LM: 0.2951
|
| 157 |
+
[2026-04-07 18:39:24] Validation | Batch 570/732 | Loss: 0.3670 | LM: 0.2951
|
| 158 |
+
[2026-04-07 18:39:25] Validation | Batch 580/732 | Loss: 0.3668 | LM: 0.2949
|
| 159 |
+
[2026-04-07 18:39:27] Validation | Batch 590/732 | Loss: 0.3667 | LM: 0.2948
|
| 160 |
+
[2026-04-07 18:39:28] Validation | Batch 600/732 | Loss: 0.3667 | LM: 0.2948
|
| 161 |
+
[2026-04-07 18:39:29] Validation | Batch 610/732 | Loss: 0.3673 | LM: 0.2954
|
| 162 |
+
[2026-04-07 18:39:30] Validation | Batch 620/732 | Loss: 0.3677 | LM: 0.2958
|
| 163 |
+
[2026-04-07 18:39:31] Validation | Batch 630/732 | Loss: 0.3675 | LM: 0.2956
|
| 164 |
+
[2026-04-07 18:39:32] Validation | Batch 640/732 | Loss: 0.3673 | LM: 0.2954
|
| 165 |
+
[2026-04-07 18:39:33] Validation | Batch 650/732 | Loss: 0.3671 | LM: 0.2952
|
| 166 |
+
[2026-04-07 18:39:34] Validation | Batch 660/732 | Loss: 0.3675 | LM: 0.2957
|
| 167 |
+
[2026-04-07 18:39:35] Validation | Batch 670/732 | Loss: 0.3682 | LM: 0.2963
|
| 168 |
+
[2026-04-07 18:39:36] Validation | Batch 680/732 | Loss: 0.3681 | LM: 0.2963
|
| 169 |
+
[2026-04-07 18:39:37] Validation | Batch 690/732 | Loss: 0.3683 | LM: 0.2964
|
| 170 |
+
[2026-04-07 18:39:39] Validation | Batch 700/732 | Loss: 0.3688 | LM: 0.2969
|
| 171 |
+
[2026-04-07 18:39:40] Validation | Batch 710/732 | Loss: 0.3692 | LM: 0.2973
|
| 172 |
+
[2026-04-07 18:39:41] Validation | Batch 720/732 | Loss: 0.3701 | LM: 0.2983
|
| 173 |
+
[2026-04-07 18:39:42] Validation | Batch 730/732 | Loss: 0.3698 | LM: 0.2979
|
| 174 |
+
[2026-04-07 18:39:42] Validation | Batch 732/732 | Loss: 0.3696 | LM: 0.2977
|
| 175 |
+
[2026-04-07 18:39:42] Validation | Loss: 0.3696 | LM: 0.2977 | PPL: 1.35 | Time: 78.62s
|
| 176 |
+
[2026-04-07 18:39:45] New best model saved! Val loss: 0.3696
|
| 177 |
+
[2026-04-07 18:39:54] Epoch 1 | Step 260 | Loss: 0.3979 | LM: 0.3177 | LB: 1.5742 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.407/SR1: 0.378 | LR: 1.00e-04
|
| 178 |
+
[2026-04-07 18:40:03] Epoch 1 | Step 270 | Loss: 0.3979 | LM: 0.3162 | LB: 1.5694 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.405/SR1: 0.376 | LR: 1.00e-04
|
| 179 |
+
[2026-04-07 18:40:12] Epoch 1 | Step 280 | Loss: 0.3976 | LM: 0.3163 | LB: 1.5644 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.404/SR1: 0.374 | LR: 1.00e-04
|
| 180 |
+
[2026-04-07 18:40:21] Epoch 1 | Step 290 | Loss: 0.3958 | LM: 0.3144 | LB: 1.5597 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.402/SR1: 0.372 | LR: 1.00e-04
|
| 181 |
+
[2026-04-07 18:40:30] Epoch 1 | Step 300 | Loss: 0.3950 | LM: 0.3140 | LB: 1.5559 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.400/SR1: 0.370 | LR: 1.00e-04
|
| 182 |
+
[2026-04-07 18:40:40] Epoch 1 | Step 310 | Loss: 0.3940 | LM: 0.3131 | LB: 1.5524 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.398/SR1: 0.368 | LR: 1.00e-04
|
| 183 |
+
[2026-04-07 18:40:49] Epoch 1 | Step 320 | Loss: 0.3932 | LM: 0.3125 | LB: 1.5481 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.397/SR1: 0.366 | LR: 9.80e-05
|
| 184 |
+
[2026-04-07 18:40:58] Epoch 1 | Step 330 | Loss: 0.3918 | LM: 0.3116 | LB: 1.5438 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.364 | LR: 8.93e-05
|
| 185 |
+
[2026-04-07 18:41:07] Epoch 1 | Step 340 | Loss: 0.3909 | LM: 0.3109 | LB: 1.5397 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.394/SR1: 0.362 | LR: 7.51e-05
|
| 186 |
+
[2026-04-07 18:41:16] Epoch 1 | Step 350 | Loss: 0.3903 | LM: 0.3115 | LB: 1.5360 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.392/SR1: 0.360 | LR: 5.77e-05
|
| 187 |
+
[2026-04-07 18:41:25] Epoch 1 | Step 360 | Loss: 0.3895 | LM: 0.3131 | LB: 1.5325 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.391/SR1: 0.359 | LR: 3.99e-05
|
| 188 |
+
[2026-04-07 18:41:34] Epoch 1 | Step 370 | Loss: 0.3890 | LM: 0.3127 | LB: 1.5297 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.390/SR1: 0.357 | LR: 2.45e-05
|
| 189 |
+
[2026-04-07 18:41:43] Epoch 1 | Step 380 | Loss: 0.3879 | LM: 0.3108 | LB: 1.5263 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.389/SR1: 0.356 | LR: 1.40e-05
|
| 190 |
+
[2026-04-07 18:41:52] Epoch 1 | Step 390 | Loss: 0.3880 | LM: 0.3106 | LB: 1.5230 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.387/SR1: 0.354 | LR: 1.00e-05
|
| 191 |
+
[2026-04-07 18:42:01] Epoch 1 | Step 400 | Loss: 0.3877 | LM: 0.3114 | LB: 1.5197 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.386/SR1: 0.353 | LR: 1.00e-05
|
| 192 |
+
[2026-04-07 18:42:10] Epoch 1 | Step 410 | Loss: 0.3875 | LM: 0.3113 | LB: 1.5166 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.349 | HR1: 0.385/SR1: 0.352 | LR: 1.00e-05
|
| 193 |
+
[2026-04-07 18:42:20] Epoch 1 | Step 420 | Loss: 0.3875 | LM: 0.3107 | LB: 1.5138 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.348 | HR1: 0.384/SR1: 0.351 | LR: 1.00e-05
|
| 194 |
+
[2026-04-07 18:42:29] Epoch 1 | Step 430 | Loss: 0.3868 | LM: 0.3091 | LB: 1.5112 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.348 | HR1: 0.383/SR1: 0.350 | LR: 1.00e-05
|
| 195 |
+
[2026-04-07 18:42:37] Epoch 1 | Step 440 | Loss: 0.3861 | LM: 0.3082 | LB: 1.5090 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.348 | HR1: 0.383/SR1: 0.349 | LR: 1.00e-05
|
| 196 |
+
[2026-04-07 18:42:47] Epoch 1 | Step 450 | Loss: 0.3853 | LM: 0.3070 | LB: 1.5068 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.382/SR1: 0.347 | LR: 1.00e-05
|
| 197 |
+
[2026-04-07 18:42:56] Epoch 1 | Step 460 | Loss: 0.3853 | LM: 0.3061 | LB: 1.5044 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.381/SR1: 0.346 | LR: 1.00e-05
|
| 198 |
+
[2026-04-07 18:43:05] Epoch 1 | Step 470 | Loss: 0.3851 | LM: 0.3069 | LB: 1.5026 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.380/SR1: 0.345 | LR: 1.00e-05
|
| 199 |
+
[2026-04-07 18:43:14] Epoch 1 | Step 480 | Loss: 0.3847 | LM: 0.3062 | LB: 1.5003 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.379/SR1: 0.344 | LR: 1.00e-05
|
| 200 |
+
[2026-04-07 18:43:23] Epoch 1 | Step 490 | Loss: 0.3839 | LM: 0.3051 | LB: 1.4984 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.378/SR1: 0.343 | LR: 1.00e-05
|
| 201 |
+
[2026-04-07 18:43:32] Epoch 1 | Step 500 | Loss: 0.3841 | LM: 0.3056 | LB: 1.4964 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.378/SR1: 0.342 | LR: 1.00e-05
|
| 202 |
+
[2026-04-07 18:43:33] Validation | Batch 10/732 | Loss: 0.3485 | LM: 0.2792
|
| 203 |
+
[2026-04-07 18:43:34] Validation | Batch 20/732 | Loss: 0.3705 | LM: 0.3004
|
| 204 |
+
[2026-04-07 18:43:35] Validation | Batch 30/732 | Loss: 0.3618 | LM: 0.2921
|
| 205 |
+
[2026-04-07 18:43:36] Validation | Batch 40/732 | Loss: 0.3675 | LM: 0.2979
|
| 206 |
+
[2026-04-07 18:43:37] Validation | Batch 50/732 | Loss: 0.3673 | LM: 0.2977
|
| 207 |
+
[2026-04-07 18:43:38] Validation | Batch 60/732 | Loss: 0.3694 | LM: 0.2998
|
| 208 |
+
[2026-04-07 18:43:39] Validation | Batch 70/732 | Loss: 0.3729 | LM: 0.3032
|
| 209 |
+
[2026-04-07 18:43:41] Validation | Batch 80/732 | Loss: 0.3713 | LM: 0.3018
|
| 210 |
+
[2026-04-07 18:43:42] Validation | Batch 90/732 | Loss: 0.3710 | LM: 0.3015
|
| 211 |
+
[2026-04-07 18:43:43] Validation | Batch 100/732 | Loss: 0.3718 | LM: 0.3023
|
| 212 |
+
[2026-04-07 18:43:44] Validation | Batch 110/732 | Loss: 0.3685 | LM: 0.2989
|
| 213 |
+
[2026-04-07 18:43:45] Validation | Batch 120/732 | Loss: 0.3715 | LM: 0.3020
|
| 214 |
+
[2026-04-07 18:43:46] Validation | Batch 130/732 | Loss: 0.3728 | LM: 0.3033
|
| 215 |
+
[2026-04-07 18:43:47] Validation | Batch 140/732 | Loss: 0.3722 | LM: 0.3027
|
| 216 |
+
[2026-04-07 18:43:48] Validation | Batch 150/732 | Loss: 0.3717 | LM: 0.3021
|
| 217 |
+
[2026-04-07 18:43:49] Validation | Batch 160/732 | Loss: 0.3707 | LM: 0.3011
|
| 218 |
+
[2026-04-07 18:43:50] Validation | Batch 170/732 | Loss: 0.3711 | LM: 0.3016
|
| 219 |
+
[2026-04-07 18:43:51] Validation | Batch 180/732 | Loss: 0.3726 | LM: 0.3030
|
| 220 |
+
[2026-04-07 18:43:52] Validation | Batch 190/732 | Loss: 0.3717 | LM: 0.3021
|
| 221 |
+
[2026-04-07 18:43:53] Validation | Batch 200/732 | Loss: 0.3718 | LM: 0.3022
|
| 222 |
+
[2026-04-07 18:43:54] Validation | Batch 210/732 | Loss: 0.3711 | LM: 0.3016
|
| 223 |
+
[2026-04-07 18:43:55] Validation | Batch 220/732 | Loss: 0.3706 | LM: 0.3010
|
| 224 |
+
[2026-04-07 18:43:56] Validation | Batch 230/732 | Loss: 0.3710 | LM: 0.3015
|
| 225 |
+
[2026-04-07 18:43:58] Validation | Batch 240/732 | Loss: 0.3707 | LM: 0.3012
|
| 226 |
+
[2026-04-07 18:43:59] Validation | Batch 250/732 | Loss: 0.3707 | LM: 0.3011
|
| 227 |
+
[2026-04-07 18:44:00] Validation | Batch 260/732 | Loss: 0.3697 | LM: 0.3001
|
| 228 |
+
[2026-04-07 18:44:01] Validation | Batch 270/732 | Loss: 0.3695 | LM: 0.2999
|
| 229 |
+
[2026-04-07 18:44:02] Validation | Batch 280/732 | Loss: 0.3685 | LM: 0.2989
|
| 230 |
+
[2026-04-07 18:44:03] Validation | Batch 290/732 | Loss: 0.3683 | LM: 0.2987
|
| 231 |
+
[2026-04-07 18:44:04] Validation | Batch 300/732 | Loss: 0.3683 | LM: 0.2986
|
| 232 |
+
[2026-04-07 18:44:05] Validation | Batch 310/732 | Loss: 0.3681 | LM: 0.2984
|
| 233 |
+
[2026-04-07 18:44:06] Validation | Batch 320/732 | Loss: 0.3672 | LM: 0.2975
|
| 234 |
+
[2026-04-07 18:44:07] Validation | Batch 330/732 | Loss: 0.3660 | LM: 0.2964
|
| 235 |
+
[2026-04-07 18:44:08] Validation | Batch 340/732 | Loss: 0.3655 | LM: 0.2958
|
| 236 |
+
[2026-04-07 18:44:09] Validation | Batch 350/732 | Loss: 0.3658 | LM: 0.2961
|
| 237 |
+
[2026-04-07 18:44:10] Validation | Batch 360/732 | Loss: 0.3667 | LM: 0.2970
|
| 238 |
+
[2026-04-07 18:44:11] Validation | Batch 370/732 | Loss: 0.3657 | LM: 0.2960
|
| 239 |
+
[2026-04-07 18:44:12] Validation | Batch 380/732 | Loss: 0.3651 | LM: 0.2954
|
| 240 |
+
[2026-04-07 18:44:13] Validation | Batch 390/732 | Loss: 0.3647 | LM: 0.2950
|
| 241 |
+
[2026-04-07 18:44:14] Validation | Batch 400/732 | Loss: 0.3645 | LM: 0.2949
|
| 242 |
+
[2026-04-07 18:44:15] Validation | Batch 410/732 | Loss: 0.3638 | LM: 0.2941
|
| 243 |
+
[2026-04-07 18:44:16] Validation | Batch 420/732 | Loss: 0.3640 | LM: 0.2944
|
| 244 |
+
[2026-04-07 18:44:17] Validation | Batch 430/732 | Loss: 0.3639 | LM: 0.2943
|
| 245 |
+
[2026-04-07 18:44:18] Validation | Batch 440/732 | Loss: 0.3635 | LM: 0.2938
|
| 246 |
+
[2026-04-07 18:44:19] Validation | Batch 450/732 | Loss: 0.3633 | LM: 0.2936
|
| 247 |
+
[2026-04-07 18:44:21] Validation | Batch 460/732 | Loss: 0.3637 | LM: 0.2940
|
| 248 |
+
[2026-04-07 18:44:22] Validation | Batch 470/732 | Loss: 0.3634 | LM: 0.2938
|
| 249 |
+
[2026-04-07 18:44:23] Validation | Batch 480/732 | Loss: 0.3635 | LM: 0.2938
|
| 250 |
+
[2026-04-07 18:44:24] Validation | Batch 490/732 | Loss: 0.3646 | LM: 0.2949
|
| 251 |
+
[2026-04-07 18:44:25] Validation | Batch 500/732 | Loss: 0.3656 | LM: 0.2960
|
| 252 |
+
[2026-04-07 18:44:26] Validation | Batch 510/732 | Loss: 0.3653 | LM: 0.2956
|
| 253 |
+
[2026-04-07 18:44:27] Validation | Batch 520/732 | Loss: 0.3651 | LM: 0.2955
|
| 254 |
+
[2026-04-07 18:44:28] Validation | Batch 530/732 | Loss: 0.3646 | LM: 0.2949
|
| 255 |
+
[2026-04-07 18:44:29] Validation | Batch 540/732 | Loss: 0.3647 | LM: 0.2951
|
| 256 |
+
[2026-04-07 18:44:30] Validation | Batch 550/732 | Loss: 0.3646 | LM: 0.2950
|
| 257 |
+
[2026-04-07 18:44:31] Validation | Batch 560/732 | Loss: 0.3642 | LM: 0.2945
|
| 258 |
+
[2026-04-07 18:44:32] Validation | Batch 570/732 | Loss: 0.3642 | LM: 0.2945
|
| 259 |
+
[2026-04-07 18:44:33] Validation | Batch 580/732 | Loss: 0.3639 | LM: 0.2943
|
| 260 |
+
[2026-04-07 18:44:35] Validation | Batch 590/732 | Loss: 0.3639 | LM: 0.2943
|
| 261 |
+
[2026-04-07 18:44:36] Validation | Batch 600/732 | Loss: 0.3638 | LM: 0.2942
|
| 262 |
+
[2026-04-07 18:44:37] Validation | Batch 610/732 | Loss: 0.3644 | LM: 0.2947
|
| 263 |
+
[2026-04-07 18:44:38] Validation | Batch 620/732 | Loss: 0.3648 | LM: 0.2951
|
| 264 |
+
[2026-04-07 18:44:39] Validation | Batch 630/732 | Loss: 0.3646 | LM: 0.2949
|
| 265 |
+
[2026-04-07 18:44:40] Validation | Batch 640/732 | Loss: 0.3643 | LM: 0.2946
|
| 266 |
+
[2026-04-07 18:44:41] Validation | Batch 650/732 | Loss: 0.3641 | LM: 0.2945
|
| 267 |
+
[2026-04-07 18:44:42] Validation | Batch 660/732 | Loss: 0.3646 | LM: 0.2950
|
| 268 |
+
[2026-04-07 18:44:43] Validation | Batch 670/732 | Loss: 0.3652 | LM: 0.2956
|
| 269 |
+
[2026-04-07 18:44:44] Validation | Batch 680/732 | Loss: 0.3652 | LM: 0.2956
|
| 270 |
+
[2026-04-07 18:44:45] Validation | Batch 690/732 | Loss: 0.3654 | LM: 0.2957
|
| 271 |
+
[2026-04-07 18:44:46] Validation | Batch 700/732 | Loss: 0.3658 | LM: 0.2962
|
| 272 |
+
[2026-04-07 18:44:47] Validation | Batch 710/732 | Loss: 0.3662 | LM: 0.2966
|
| 273 |
+
[2026-04-07 18:44:48] Validation | Batch 720/732 | Loss: 0.3672 | LM: 0.2975
|
| 274 |
+
[2026-04-07 18:44:49] Validation | Batch 730/732 | Loss: 0.3668 | LM: 0.2972
|
| 275 |
+
[2026-04-07 18:44:50] Validation | Batch 732/732 | Loss: 0.3667 | LM: 0.2970
|
| 276 |
+
[2026-04-07 18:44:50] Validation | Loss: 0.3667 | LM: 0.2970 | PPL: 1.35 | Time: 77.66s
|
| 277 |
+
[2026-04-07 18:44:56] New best model saved! Val loss: 0.3667
|
| 278 |
+
[2026-04-07 18:45:05] Epoch 1 | Step 510 | Loss: 0.3840 | LM: 0.3039 | LB: 1.4950 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.377/SR1: 0.342 | LR: 1.00e-05
|
| 279 |
+
[2026-04-07 18:45:14] Epoch 1 | Step 520 | Loss: 0.3836 | LM: 0.3030 | LB: 1.4927 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.376/SR1: 0.341 | LR: 1.00e-05
|
| 280 |
+
[2026-04-07 18:45:23] Epoch 1 | Step 530 | Loss: 0.3833 | LM: 0.3022 | LB: 1.4910 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.376/SR1: 0.340 | LR: 1.00e-05
|
| 281 |
+
[2026-04-07 18:45:32] Epoch 1 | Step 540 | Loss: 0.3829 | LM: 0.3020 | LB: 1.4888 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.375/SR1: 0.339 | LR: 1.00e-05
|
| 282 |
+
[2026-04-07 18:45:41] Epoch 1 | Step 550 | Loss: 0.3825 | LM: 0.3024 | LB: 1.4866 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.374/SR1: 0.338 | LR: 1.00e-05
|
| 283 |
+
[2026-04-07 18:45:50] Epoch 1 | Step 560 | Loss: 0.3827 | LM: 0.3032 | LB: 1.4855 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.374/SR1: 0.338 | LR: 1.00e-05
|
| 284 |
+
[2026-04-07 18:46:00] Epoch 1 | Step 570 | Loss: 0.3829 | LM: 0.3037 | LB: 1.4842 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.374/SR1: 0.337 | LR: 1.00e-05
|
| 285 |
+
[2026-04-07 18:46:09] Epoch 1 | Step 580 | Loss: 0.3824 | LM: 0.3037 | LB: 1.4828 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.373/SR1: 0.336 | LR: 1.00e-05
|
| 286 |
+
[2026-04-07 18:46:18] Epoch 1 | Step 590 | Loss: 0.3829 | LM: 0.3051 | LB: 1.4811 | CL0: 2.9 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.372/SR1: 0.336 | LR: 1.00e-05
|
| 287 |
+
[2026-04-07 18:46:27] Epoch 1 | Step 600 | Loss: 0.3825 | LM: 0.3044 | LB: 1.4800 | CL0: 2.9 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.372/SR1: 0.335 | LR: 1.00e-05
|
| 288 |
+
[2026-04-07 18:46:36] Epoch 1 | Step 610 | Loss: 0.3820 | LM: 0.3041 | LB: 1.4791 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.371/SR1: 0.335 | LR: 1.00e-05
|
| 289 |
+
[2026-04-07 18:46:45] Epoch 1 | Step 620 | Loss: 0.3817 | LM: 0.3042 | LB: 1.4778 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.371/SR1: 0.334 | LR: 1.00e-05
|
| 290 |
+
[2026-04-07 18:46:54] Epoch 1 | Step 630 | Loss: 0.3813 | LM: 0.3040 | LB: 1.4763 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.370/SR1: 0.333 | LR: 1.00e-05
|
| 291 |
+
[2026-04-07 18:47:03] Epoch 1 | Step 640 | Loss: 0.3809 | LM: 0.3041 | LB: 1.4753 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.370/SR1: 0.333 | LR: 1.00e-05
|
| 292 |
+
[2026-04-07 18:47:12] Epoch 1 | Step 650 | Loss: 0.3811 | LM: 0.3037 | LB: 1.4741 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.369/SR1: 0.332 | LR: 1.00e-05
|
| 293 |
+
[2026-04-07 18:47:21] Epoch 1 | Step 660 | Loss: 0.3806 | LM: 0.3034 | LB: 1.4732 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.369/SR1: 0.332 | LR: 1.00e-05
|
| 294 |
+
[2026-04-07 18:47:31] Epoch 1 | Step 670 | Loss: 0.3813 | LM: 0.3039 | LB: 1.4725 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.369/SR1: 0.331 | LR: 1.00e-05
|
| 295 |
+
[2026-04-07 18:47:40] Epoch 1 | Step 680 | Loss: 0.3810 | LM: 0.3042 | LB: 1.4717 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.368/SR1: 0.331 | LR: 1.00e-05
|
| 296 |
+
[2026-04-07 18:47:49] Epoch 1 | Step 690 | Loss: 0.3810 | LM: 0.3047 | LB: 1.4712 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.368/SR1: 0.330 | LR: 1.00e-05
|
| 297 |
+
[2026-04-07 18:47:58] Epoch 1 | Step 700 | Loss: 0.3811 | LM: 0.3048 | LB: 1.4703 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.368/SR1: 0.330 | LR: 1.00e-05
|
| 298 |
+
[2026-04-07 18:48:07] Epoch 1 | Step 710 | Loss: 0.3808 | LM: 0.3044 | LB: 1.4692 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.367/SR1: 0.329 | LR: 1.00e-05
|
| 299 |
+
[2026-04-07 18:48:16] Epoch 1 | Step 720 | Loss: 0.3805 | LM: 0.3043 | LB: 1.4684 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.367/SR1: 0.329 | LR: 1.00e-05
|
| 300 |
+
[2026-04-07 18:48:25] Epoch 1 | Step 730 | Loss: 0.3801 | LM: 0.3050 | LB: 1.4677 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.329 | LR: 1.00e-05
|
| 301 |
+
[2026-04-07 18:48:34] Epoch 1 | Step 740 | Loss: 0.3799 | LM: 0.3052 | LB: 1.4670 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.328 | LR: 1.00e-05
|
| 302 |
+
[2026-04-07 18:48:43] Epoch 1 | Step 750 | Loss: 0.3796 | LM: 0.3048 | LB: 1.4658 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.328 | LR: 1.00e-05
|
| 303 |
+
[2026-04-07 18:48:44] Validation | Batch 10/732 | Loss: 0.3485 | LM: 0.2793
|
| 304 |
+
[2026-04-07 18:48:45] Validation | Batch 20/732 | Loss: 0.3702 | LM: 0.3003
|
| 305 |
+
[2026-04-07 18:48:46] Validation | Batch 30/732 | Loss: 0.3614 | LM: 0.2918
|
| 306 |
+
[2026-04-07 18:48:47] Validation | Batch 40/732 | Loss: 0.3671 | LM: 0.2975
|
| 307 |
+
[2026-04-07 18:48:48] Validation | Batch 50/732 | Loss: 0.3669 | LM: 0.2973
|
| 308 |
+
[2026-04-07 18:48:49] Validation | Batch 60/732 | Loss: 0.3689 | LM: 0.2994
|
| 309 |
+
[2026-04-07 18:48:51] Validation | Batch 70/732 | Loss: 0.3724 | LM: 0.3028
|
| 310 |
+
[2026-04-07 18:48:52] Validation | Batch 80/732 | Loss: 0.3708 | LM: 0.3014
|
| 311 |
+
[2026-04-07 18:48:53] Validation | Batch 90/732 | Loss: 0.3705 | LM: 0.3011
|
| 312 |
+
[2026-04-07 18:48:54] Validation | Batch 100/732 | Loss: 0.3714 | LM: 0.3019
|
| 313 |
+
[2026-04-07 18:48:55] Validation | Batch 110/732 | Loss: 0.3681 | LM: 0.2986
|
| 314 |
+
[2026-04-07 18:48:56] Validation | Batch 120/732 | Loss: 0.3711 | LM: 0.3016
|
| 315 |
+
[2026-04-07 18:48:57] Validation | Batch 130/732 | Loss: 0.3723 | LM: 0.3029
|
| 316 |
+
[2026-04-07 18:48:58] Validation | Batch 140/732 | Loss: 0.3718 | LM: 0.3024
|
| 317 |
+
[2026-04-07 18:48:59] Validation | Batch 150/732 | Loss: 0.3713 | LM: 0.3017
|
| 318 |
+
[2026-04-07 18:49:00] Validation | Batch 160/732 | Loss: 0.3702 | LM: 0.3007
|
| 319 |
+
[2026-04-07 18:49:01] Validation | Batch 170/732 | Loss: 0.3706 | LM: 0.3012
|
| 320 |
+
[2026-04-07 18:49:02] Validation | Batch 180/732 | Loss: 0.3721 | LM: 0.3026
|
| 321 |
+
[2026-04-07 18:49:03] Validation | Batch 190/732 | Loss: 0.3712 | LM: 0.3017
|
| 322 |
+
[2026-04-07 18:49:04] Validation | Batch 200/732 | Loss: 0.3713 | LM: 0.3018
|
| 323 |
+
[2026-04-07 18:49:05] Validation | Batch 210/732 | Loss: 0.3706 | LM: 0.3011
|
| 324 |
+
[2026-04-07 18:49:06] Validation | Batch 220/732 | Loss: 0.3701 | LM: 0.3006
|
| 325 |
+
[2026-04-07 18:49:08] Validation | Batch 230/732 | Loss: 0.3705 | LM: 0.3010
|
| 326 |
+
[2026-04-07 18:49:09] Validation | Batch 240/732 | Loss: 0.3702 | LM: 0.3007
|
| 327 |
+
[2026-04-07 18:49:10] Validation | Batch 250/732 | Loss: 0.3702 | LM: 0.3007
|
| 328 |
+
[2026-04-07 18:49:11] Validation | Batch 260/732 | Loss: 0.3692 | LM: 0.2997
|
| 329 |
+
[2026-04-07 18:49:12] Validation | Batch 270/732 | Loss: 0.3690 | LM: 0.2995
|
| 330 |
+
[2026-04-07 18:49:13] Validation | Batch 280/732 | Loss: 0.3680 | LM: 0.2985
|
| 331 |
+
[2026-04-07 18:49:14] Validation | Batch 290/732 | Loss: 0.3678 | LM: 0.2983
|
| 332 |
+
[2026-04-07 18:49:15] Validation | Batch 300/732 | Loss: 0.3678 | LM: 0.2982
|
| 333 |
+
[2026-04-07 18:49:16] Validation | Batch 310/732 | Loss: 0.3676 | LM: 0.2980
|
| 334 |
+
[2026-04-07 18:49:17] Validation | Batch 320/732 | Loss: 0.3667 | LM: 0.2971
|
| 335 |
+
[2026-04-07 18:49:18] Validation | Batch 330/732 | Loss: 0.3656 | LM: 0.2960
|
| 336 |
+
[2026-04-07 18:49:19] Validation | Batch 340/732 | Loss: 0.3650 | LM: 0.2954
|
| 337 |
+
[2026-04-07 18:49:20] Validation | Batch 350/732 | Loss: 0.3653 | LM: 0.2957
|
| 338 |
+
[2026-04-07 18:49:21] Validation | Batch 360/732 | Loss: 0.3662 | LM: 0.2966
|
| 339 |
+
[2026-04-07 18:49:22] Validation | Batch 370/732 | Loss: 0.3652 | LM: 0.2956
|
| 340 |
+
[2026-04-07 18:49:23] Validation | Batch 380/732 | Loss: 0.3646 | LM: 0.2950
|
| 341 |
+
[2026-04-07 18:49:24] Validation | Batch 390/732 | Loss: 0.3642 | LM: 0.2946
|
| 342 |
+
[2026-04-07 18:49:25] Validation | Batch 400/732 | Loss: 0.3641 | LM: 0.2945
|
| 343 |
+
[2026-04-07 18:49:26] Validation | Batch 410/732 | Loss: 0.3633 | LM: 0.2937
|
| 344 |
+
[2026-04-07 18:49:27] Validation | Batch 420/732 | Loss: 0.3635 | LM: 0.2940
|
| 345 |
+
[2026-04-07 18:49:28] Validation | Batch 430/732 | Loss: 0.3635 | LM: 0.2939
|
| 346 |
+
[2026-04-07 18:49:29] Validation | Batch 440/732 | Loss: 0.3630 | LM: 0.2934
|
| 347 |
+
[2026-04-07 18:49:30] Validation | Batch 450/732 | Loss: 0.3628 | LM: 0.2932
|
| 348 |
+
[2026-04-07 18:49:31] Validation | Batch 460/732 | Loss: 0.3632 | LM: 0.2936
|
| 349 |
+
[2026-04-07 18:49:33] Validation | Batch 470/732 | Loss: 0.3630 | LM: 0.2934
|
| 350 |
+
[2026-04-07 18:49:33] Validation | Batch 480/732 | Loss: 0.3630 | LM: 0.2935
|
| 351 |
+
[2026-04-07 18:49:35] Validation | Batch 490/732 | Loss: 0.3641 | LM: 0.2945
|
| 352 |
+
[2026-04-07 18:49:36] Validation | Batch 500/732 | Loss: 0.3652 | LM: 0.2956
|
| 353 |
+
[2026-04-07 18:49:37] Validation | Batch 510/732 | Loss: 0.3648 | LM: 0.2953
|
| 354 |
+
[2026-04-07 18:49:38] Validation | Batch 520/732 | Loss: 0.3647 | LM: 0.2951
|
| 355 |
+
[2026-04-07 18:49:39] Validation | Batch 530/732 | Loss: 0.3641 | LM: 0.2945
|
| 356 |
+
[2026-04-07 18:49:40] Validation | Batch 540/732 | Loss: 0.3643 | LM: 0.2947
|
| 357 |
+
[2026-04-07 18:49:41] Validation | Batch 550/732 | Loss: 0.3642 | LM: 0.2946
|
| 358 |
+
[2026-04-07 18:49:42] Validation | Batch 560/732 | Loss: 0.3637 | LM: 0.2941
|
| 359 |
+
[2026-04-07 18:49:43] Validation | Batch 570/732 | Loss: 0.3638 | LM: 0.2941
|
| 360 |
+
[2026-04-07 18:49:44] Validation | Batch 580/732 | Loss: 0.3635 | LM: 0.2939
|
| 361 |
+
[2026-04-07 18:49:45] Validation | Batch 590/732 | Loss: 0.3635 | LM: 0.2939
|
| 362 |
+
[2026-04-07 18:49:46] Validation | Batch 600/732 | Loss: 0.3634 | LM: 0.2938
|
| 363 |
+
[2026-04-07 18:49:47] Validation | Batch 610/732 | Loss: 0.3639 | LM: 0.2943
|
| 364 |
+
[2026-04-07 18:49:49] Validation | Batch 620/732 | Loss: 0.3643 | LM: 0.2947
|
| 365 |
+
[2026-04-07 18:49:50] Validation | Batch 630/732 | Loss: 0.3641 | LM: 0.2945
|
| 366 |
+
[2026-04-07 18:49:51] Validation | Batch 640/732 | Loss: 0.3638 | LM: 0.2942
|
| 367 |
+
[2026-04-07 18:49:52] Validation | Batch 650/732 | Loss: 0.3637 | LM: 0.2941
|
| 368 |
+
[2026-04-07 18:49:53] Validation | Batch 660/732 | Loss: 0.3642 | LM: 0.2946
|
| 369 |
+
[2026-04-07 18:49:54] Validation | Batch 670/732 | Loss: 0.3648 | LM: 0.2952
|
| 370 |
+
[2026-04-07 18:49:55] Validation | Batch 680/732 | Loss: 0.3648 | LM: 0.2952
|
| 371 |
+
[2026-04-07 18:49:56] Validation | Batch 690/732 | Loss: 0.3649 | LM: 0.2953
|
| 372 |
+
[2026-04-07 18:49:57] Validation | Batch 700/732 | Loss: 0.3654 | LM: 0.2958
|
| 373 |
+
[2026-04-07 18:49:58] Validation | Batch 710/732 | Loss: 0.3658 | LM: 0.2962
|
| 374 |
+
[2026-04-07 18:49:59] Validation | Batch 720/732 | Loss: 0.3667 | LM: 0.2971
|
| 375 |
+
[2026-04-07 18:50:00] Validation | Batch 730/732 | Loss: 0.3664 | LM: 0.2968
|
| 376 |
+
[2026-04-07 18:50:00] Validation | Batch 732/732 | Loss: 0.3662 | LM: 0.2966
|
| 377 |
+
[2026-04-07 18:50:00] Validation | Loss: 0.3662 | LM: 0.2966 | PPL: 1.35 | Time: 77.29s
|
| 378 |
+
[2026-04-07 18:50:07] New best model saved! Val loss: 0.3662
|
| 379 |
+
[2026-04-07 18:50:16] Epoch 1 | Step 760 | Loss: 0.3793 | LM: 0.3040 | LB: 1.4650 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.328 | LR: 1.00e-05
|
| 380 |
+
[2026-04-07 18:50:25] Epoch 1 | Step 770 | Loss: 0.3791 | LM: 0.3039 | LB: 1.4642 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.346 | HR1: 0.365/SR1: 0.327 | LR: 1.00e-05
|
| 381 |
+
[2026-04-07 18:50:34] Epoch 1 | Step 780 | Loss: 0.3793 | LM: 0.3054 | LB: 1.4630 | CL0: 2.9 | CL1: 2.8 | HR0: 0.352/SR0: 0.346 | HR1: 0.365/SR1: 0.327 | LR: 1.00e-05
|
| 382 |
+
[2026-04-07 18:50:35] Reached max_steps=781, stopping training.
|
| 383 |
+
[2026-04-07 18:50:35] Epoch 1 completed in 986.44s | Loss: 0.3792 | CL0: 2.9 | CL1: 2.8
|
| 384 |
+
[2026-04-07 18:50:35]
|
| 385 |
+
Training completed!
|
| 386 |
+
[2026-04-07 18:50:38] Final model: outputs/N_8.0/model_final.pt
|