captioner-sft / trainer_state.json
Evan-Lin's picture
Upload folder using huggingface_hub
b0f59a7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8676036786395974,
"eval_steps": 1000.0,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00017352073572791948,
"grad_norm": 8.0,
"learning_rate": 6.920415224913495e-08,
"loss": 1.9091681241989136,
"step": 1,
"token_acc": 0.5288686692981869
},
{
"epoch": 0.0008676036786395974,
"grad_norm": 9.1875,
"learning_rate": 3.460207612456748e-07,
"loss": 1.9248077869415283,
"step": 5,
"token_acc": 0.5311277064784593
},
{
"epoch": 0.0017352073572791948,
"grad_norm": 8.0625,
"learning_rate": 6.920415224913496e-07,
"loss": 1.9248884201049805,
"step": 10,
"token_acc": 0.5290013381031705
},
{
"epoch": 0.0026028110359187923,
"grad_norm": 8.1875,
"learning_rate": 1.0380622837370243e-06,
"loss": 1.9355482101440429,
"step": 15,
"token_acc": 0.5267119264551204
},
{
"epoch": 0.0034704147145583897,
"grad_norm": 7.6875,
"learning_rate": 1.3840830449826992e-06,
"loss": 1.9109724044799805,
"step": 20,
"token_acc": 0.5344387591531053
},
{
"epoch": 0.004338018393197987,
"grad_norm": 7.5625,
"learning_rate": 1.7301038062283736e-06,
"loss": 1.9011001586914062,
"step": 25,
"token_acc": 0.5359598310957124
},
{
"epoch": 0.0052056220718375845,
"grad_norm": 7.09375,
"learning_rate": 2.0761245674740485e-06,
"loss": 1.8969675064086915,
"step": 30,
"token_acc": 0.535208283678928
},
{
"epoch": 0.006073225750477182,
"grad_norm": 6.375,
"learning_rate": 2.4221453287197232e-06,
"loss": 1.8716100692749023,
"step": 35,
"token_acc": 0.5402535722571752
},
{
"epoch": 0.006940829429116779,
"grad_norm": 5.71875,
"learning_rate": 2.7681660899653983e-06,
"loss": 1.845738410949707,
"step": 40,
"token_acc": 0.5421880925293895
},
{
"epoch": 0.007808433107756377,
"grad_norm": 5.34375,
"learning_rate": 3.114186851211073e-06,
"loss": 1.8337472915649413,
"step": 45,
"token_acc": 0.5421731912386869
},
{
"epoch": 0.008676036786395974,
"grad_norm": 4.875,
"learning_rate": 3.4602076124567473e-06,
"loss": 1.782250213623047,
"step": 50,
"token_acc": 0.5486540746507149
},
{
"epoch": 0.009543640465035572,
"grad_norm": 4.25,
"learning_rate": 3.8062283737024224e-06,
"loss": 1.7817264556884767,
"step": 55,
"token_acc": 0.553443922569435
},
{
"epoch": 0.010411244143675169,
"grad_norm": 4.0,
"learning_rate": 4.152249134948097e-06,
"loss": 1.7241941452026368,
"step": 60,
"token_acc": 0.562488997223915
},
{
"epoch": 0.011278847822314766,
"grad_norm": 3.46875,
"learning_rate": 4.498269896193772e-06,
"loss": 1.664463996887207,
"step": 65,
"token_acc": 0.5773684070111784
},
{
"epoch": 0.012146451500954364,
"grad_norm": 3.125,
"learning_rate": 4.8442906574394464e-06,
"loss": 1.6484092712402343,
"step": 70,
"token_acc": 0.5734682375674297
},
{
"epoch": 0.013014055179593961,
"grad_norm": 2.8125,
"learning_rate": 5.190311418685121e-06,
"loss": 1.6332122802734375,
"step": 75,
"token_acc": 0.5776690475867771
},
{
"epoch": 0.013881658858233559,
"grad_norm": 2.59375,
"learning_rate": 5.536332179930797e-06,
"loss": 1.6145336151123046,
"step": 80,
"token_acc": 0.5818942893134579
},
{
"epoch": 0.014749262536873156,
"grad_norm": 2.515625,
"learning_rate": 5.882352941176471e-06,
"loss": 1.5732461929321289,
"step": 85,
"token_acc": 0.5861122807970769
},
{
"epoch": 0.015616866215512754,
"grad_norm": 2.28125,
"learning_rate": 6.228373702422146e-06,
"loss": 1.5478083610534668,
"step": 90,
"token_acc": 0.5937751867175326
},
{
"epoch": 0.01648446989415235,
"grad_norm": 2.34375,
"learning_rate": 6.57439446366782e-06,
"loss": 1.5324504852294922,
"step": 95,
"token_acc": 0.5946228348671784
},
{
"epoch": 0.01735207357279195,
"grad_norm": 2.3125,
"learning_rate": 6.9204152249134946e-06,
"loss": 1.5145973205566405,
"step": 100,
"token_acc": 0.5947611081643478
},
{
"epoch": 0.018219677251431546,
"grad_norm": 2.25,
"learning_rate": 7.2664359861591705e-06,
"loss": 1.4743114471435548,
"step": 105,
"token_acc": 0.6041969950833631
},
{
"epoch": 0.019087280930071143,
"grad_norm": 2.34375,
"learning_rate": 7.612456747404845e-06,
"loss": 1.4888650894165039,
"step": 110,
"token_acc": 0.6016704100708781
},
{
"epoch": 0.01995488460871074,
"grad_norm": 2.3125,
"learning_rate": 7.958477508650519e-06,
"loss": 1.450081729888916,
"step": 115,
"token_acc": 0.6071094352086315
},
{
"epoch": 0.020822488287350338,
"grad_norm": 2.4375,
"learning_rate": 8.304498269896194e-06,
"loss": 1.453689956665039,
"step": 120,
"token_acc": 0.6067426531172755
},
{
"epoch": 0.021690091965989935,
"grad_norm": 2.25,
"learning_rate": 8.65051903114187e-06,
"loss": 1.4380900382995605,
"step": 125,
"token_acc": 0.6129145486928147
},
{
"epoch": 0.022557695644629533,
"grad_norm": 2.296875,
"learning_rate": 8.996539792387544e-06,
"loss": 1.4033380508422852,
"step": 130,
"token_acc": 0.6155833692194423
},
{
"epoch": 0.02342529932326913,
"grad_norm": 2.125,
"learning_rate": 9.34256055363322e-06,
"loss": 1.3930879592895509,
"step": 135,
"token_acc": 0.6180772126035058
},
{
"epoch": 0.024292903001908728,
"grad_norm": 1.96875,
"learning_rate": 9.688581314878893e-06,
"loss": 1.3432926177978515,
"step": 140,
"token_acc": 0.6306223488778284
},
{
"epoch": 0.025160506680548325,
"grad_norm": 2.234375,
"learning_rate": 1.0034602076124568e-05,
"loss": 1.39078369140625,
"step": 145,
"token_acc": 0.6173330823630644
},
{
"epoch": 0.026028110359187923,
"grad_norm": 2.28125,
"learning_rate": 1.0380622837370241e-05,
"loss": 1.3656013488769532,
"step": 150,
"token_acc": 0.6226175175695261
},
{
"epoch": 0.02689571403782752,
"grad_norm": 2.296875,
"learning_rate": 1.0726643598615918e-05,
"loss": 1.3753274917602538,
"step": 155,
"token_acc": 0.6213831896726154
},
{
"epoch": 0.027763317716467117,
"grad_norm": 2.046875,
"learning_rate": 1.1072664359861593e-05,
"loss": 1.3534158706665038,
"step": 160,
"token_acc": 0.6270216023570886
},
{
"epoch": 0.028630921395106715,
"grad_norm": 2.203125,
"learning_rate": 1.1418685121107267e-05,
"loss": 1.3482056617736817,
"step": 165,
"token_acc": 0.6246836055823732
},
{
"epoch": 0.029498525073746312,
"grad_norm": 2.265625,
"learning_rate": 1.1764705882352942e-05,
"loss": 1.3297002792358399,
"step": 170,
"token_acc": 0.6299395312649295
},
{
"epoch": 0.03036612875238591,
"grad_norm": 2.171875,
"learning_rate": 1.2110726643598615e-05,
"loss": 1.3250617980957031,
"step": 175,
"token_acc": 0.6287991301119186
},
{
"epoch": 0.031233732431025507,
"grad_norm": 2.140625,
"learning_rate": 1.2456747404844292e-05,
"loss": 1.32503080368042,
"step": 180,
"token_acc": 0.6283811790503951
},
{
"epoch": 0.032101336109665105,
"grad_norm": 1.984375,
"learning_rate": 1.2802768166089967e-05,
"loss": 1.3253366470336914,
"step": 185,
"token_acc": 0.6305334557323541
},
{
"epoch": 0.0329689397883047,
"grad_norm": 2.171875,
"learning_rate": 1.314878892733564e-05,
"loss": 1.3164624214172362,
"step": 190,
"token_acc": 0.6327720036837259
},
{
"epoch": 0.0338365434669443,
"grad_norm": 2.203125,
"learning_rate": 1.3494809688581316e-05,
"loss": 1.2994074821472168,
"step": 195,
"token_acc": 0.6317387471841726
},
{
"epoch": 0.0347041471455839,
"grad_norm": 2.203125,
"learning_rate": 1.3840830449826989e-05,
"loss": 1.3055123329162597,
"step": 200,
"token_acc": 0.6311878838826895
},
{
"epoch": 0.035571750824223494,
"grad_norm": 2.25,
"learning_rate": 1.4186851211072666e-05,
"loss": 1.2885337829589845,
"step": 205,
"token_acc": 0.6344415604742465
},
{
"epoch": 0.03643935450286309,
"grad_norm": 2.40625,
"learning_rate": 1.4532871972318341e-05,
"loss": 1.2472162246704102,
"step": 210,
"token_acc": 0.6459379308550766
},
{
"epoch": 0.03730695818150269,
"grad_norm": 2.21875,
"learning_rate": 1.4878892733564014e-05,
"loss": 1.2773432731628418,
"step": 215,
"token_acc": 0.6338010832102413
},
{
"epoch": 0.038174561860142286,
"grad_norm": 2.15625,
"learning_rate": 1.522491349480969e-05,
"loss": 1.272820281982422,
"step": 220,
"token_acc": 0.6386346675046153
},
{
"epoch": 0.039042165538781884,
"grad_norm": 2.171875,
"learning_rate": 1.5570934256055366e-05,
"loss": 1.2626455307006836,
"step": 225,
"token_acc": 0.6376424039839007
},
{
"epoch": 0.03990976921742148,
"grad_norm": 2.140625,
"learning_rate": 1.5916955017301038e-05,
"loss": 1.2593675613403321,
"step": 230,
"token_acc": 0.639645674135032
},
{
"epoch": 0.04077737289606108,
"grad_norm": 2.140625,
"learning_rate": 1.6262975778546713e-05,
"loss": 1.255127239227295,
"step": 235,
"token_acc": 0.6431021760799492
},
{
"epoch": 0.041644976574700676,
"grad_norm": 2.125,
"learning_rate": 1.6608996539792388e-05,
"loss": 1.2209264755249023,
"step": 240,
"token_acc": 0.6458485917837405
},
{
"epoch": 0.042512580253340274,
"grad_norm": 2.078125,
"learning_rate": 1.6955017301038063e-05,
"loss": 1.2057106018066406,
"step": 245,
"token_acc": 0.653398644744445
},
{
"epoch": 0.04338018393197987,
"grad_norm": 2.09375,
"learning_rate": 1.730103806228374e-05,
"loss": 1.254837989807129,
"step": 250,
"token_acc": 0.6412084543831126
},
{
"epoch": 0.04424778761061947,
"grad_norm": 2.09375,
"learning_rate": 1.7647058823529414e-05,
"loss": 1.2420223236083985,
"step": 255,
"token_acc": 0.6410300956861068
},
{
"epoch": 0.045115391289259066,
"grad_norm": 2.140625,
"learning_rate": 1.799307958477509e-05,
"loss": 1.2112503051757812,
"step": 260,
"token_acc": 0.6482978496344912
},
{
"epoch": 0.04598299496789866,
"grad_norm": 2.1875,
"learning_rate": 1.833910034602076e-05,
"loss": 1.2253754615783692,
"step": 265,
"token_acc": 0.6429915081156616
},
{
"epoch": 0.04685059864653826,
"grad_norm": 2.03125,
"learning_rate": 1.868512110726644e-05,
"loss": 1.2027314186096192,
"step": 270,
"token_acc": 0.6511740530352609
},
{
"epoch": 0.04771820232517786,
"grad_norm": 2.203125,
"learning_rate": 1.9031141868512114e-05,
"loss": 1.219920539855957,
"step": 275,
"token_acc": 0.6463804044677383
},
{
"epoch": 0.048585806003817456,
"grad_norm": 2.125,
"learning_rate": 1.9377162629757786e-05,
"loss": 1.1916674613952636,
"step": 280,
"token_acc": 0.6508475041894454
},
{
"epoch": 0.04945340968245705,
"grad_norm": 2.046875,
"learning_rate": 1.972318339100346e-05,
"loss": 1.2134785652160645,
"step": 285,
"token_acc": 0.6464441609025586
},
{
"epoch": 0.05032101336109665,
"grad_norm": 2.171875,
"learning_rate": 1.9999998353126843e-05,
"loss": 1.1799225807189941,
"step": 290,
"token_acc": 0.6547402065668172
},
{
"epoch": 0.05118861703973625,
"grad_norm": 2.140625,
"learning_rate": 1.99999407126232e-05,
"loss": 1.2018964767456055,
"step": 295,
"token_acc": 0.6471151511881963
},
{
"epoch": 0.052056220718375845,
"grad_norm": 2.203125,
"learning_rate": 1.9999800729003996e-05,
"loss": 1.1920422554016112,
"step": 300,
"token_acc": 0.6487302769689172
},
{
"epoch": 0.05292382439701544,
"grad_norm": 2.140625,
"learning_rate": 1.9999578403421912e-05,
"loss": 1.1693296432495117,
"step": 305,
"token_acc": 0.6556802430962789
},
{
"epoch": 0.05379142807565504,
"grad_norm": 2.15625,
"learning_rate": 1.9999273737707648e-05,
"loss": 1.1848974227905273,
"step": 310,
"token_acc": 0.6521950494915804
},
{
"epoch": 0.05465903175429464,
"grad_norm": 2.21875,
"learning_rate": 1.9998886734369936e-05,
"loss": 1.1778865814208985,
"step": 315,
"token_acc": 0.6545266496586913
},
{
"epoch": 0.055526635432934235,
"grad_norm": 2.0625,
"learning_rate": 1.9998417396595508e-05,
"loss": 1.1698062896728516,
"step": 320,
"token_acc": 0.6535174771198589
},
{
"epoch": 0.05639423911157383,
"grad_norm": 2.140625,
"learning_rate": 1.9997865728249043e-05,
"loss": 1.158426284790039,
"step": 325,
"token_acc": 0.6557887720539224
},
{
"epoch": 0.05726184279021343,
"grad_norm": 2.109375,
"learning_rate": 1.999723173387319e-05,
"loss": 1.1911964416503906,
"step": 330,
"token_acc": 0.6506533185984529
},
{
"epoch": 0.05812944646885303,
"grad_norm": 2.140625,
"learning_rate": 1.9996515418688493e-05,
"loss": 1.1536369323730469,
"step": 335,
"token_acc": 0.6576180488329568
},
{
"epoch": 0.058997050147492625,
"grad_norm": 2.078125,
"learning_rate": 1.999571678859333e-05,
"loss": 1.1710229873657227,
"step": 340,
"token_acc": 0.6535538729067681
},
{
"epoch": 0.05986465382613222,
"grad_norm": 2.15625,
"learning_rate": 1.9994835850163926e-05,
"loss": 1.1672002792358398,
"step": 345,
"token_acc": 0.6529600985558826
},
{
"epoch": 0.06073225750477182,
"grad_norm": 2.03125,
"learning_rate": 1.9993872610654236e-05,
"loss": 1.1647834777832031,
"step": 350,
"token_acc": 0.655277021628137
},
{
"epoch": 0.06159986118341142,
"grad_norm": 2.171875,
"learning_rate": 1.9992827077995925e-05,
"loss": 1.1796775817871095,
"step": 355,
"token_acc": 0.653560930884772
},
{
"epoch": 0.062467464862051014,
"grad_norm": 2.0625,
"learning_rate": 1.9991699260798284e-05,
"loss": 1.1580224990844727,
"step": 360,
"token_acc": 0.6587823913419223
},
{
"epoch": 0.06333506854069061,
"grad_norm": 2.1875,
"learning_rate": 1.999048916834817e-05,
"loss": 1.1684626579284667,
"step": 365,
"token_acc": 0.6539958690816651
},
{
"epoch": 0.06420267221933021,
"grad_norm": 2.25,
"learning_rate": 1.9989196810609918e-05,
"loss": 1.1673255920410157,
"step": 370,
"token_acc": 0.6526922661481962
},
{
"epoch": 0.0650702758979698,
"grad_norm": 2.09375,
"learning_rate": 1.9987822198225265e-05,
"loss": 1.1529643058776855,
"step": 375,
"token_acc": 0.6575958831216069
},
{
"epoch": 0.0659378795766094,
"grad_norm": 1.9921875,
"learning_rate": 1.9986365342513266e-05,
"loss": 1.1544547080993652,
"step": 380,
"token_acc": 0.6566502399001392
},
{
"epoch": 0.066805483255249,
"grad_norm": 2.125,
"learning_rate": 1.99848262554702e-05,
"loss": 1.169089126586914,
"step": 385,
"token_acc": 0.65285508142651
},
{
"epoch": 0.0676730869338886,
"grad_norm": 2.09375,
"learning_rate": 1.9983204949769454e-05,
"loss": 1.1696990966796874,
"step": 390,
"token_acc": 0.6517770916361757
},
{
"epoch": 0.0685406906125282,
"grad_norm": 1.9609375,
"learning_rate": 1.998150143876146e-05,
"loss": 1.1284924507141114,
"step": 395,
"token_acc": 0.6655096202085395
},
{
"epoch": 0.0694082942911678,
"grad_norm": 1.9765625,
"learning_rate": 1.9979715736473527e-05,
"loss": 1.1364903450012207,
"step": 400,
"token_acc": 0.6588324591008281
},
{
"epoch": 0.07027589796980739,
"grad_norm": 2.109375,
"learning_rate": 1.9977847857609775e-05,
"loss": 1.1590328216552734,
"step": 405,
"token_acc": 0.6557208069503755
},
{
"epoch": 0.07114350164844699,
"grad_norm": 2.09375,
"learning_rate": 1.9975897817550995e-05,
"loss": 1.1176044464111328,
"step": 410,
"token_acc": 0.665058862001308
},
{
"epoch": 0.07201110532708659,
"grad_norm": 2.140625,
"learning_rate": 1.9973865632354516e-05,
"loss": 1.1324227333068848,
"step": 415,
"token_acc": 0.6612732538710037
},
{
"epoch": 0.07287870900572618,
"grad_norm": 1.9921875,
"learning_rate": 1.9971751318754087e-05,
"loss": 1.126877784729004,
"step": 420,
"token_acc": 0.6609341202232488
},
{
"epoch": 0.07374631268436578,
"grad_norm": 2.171875,
"learning_rate": 1.9969554894159723e-05,
"loss": 1.14964017868042,
"step": 425,
"token_acc": 0.6562090443570724
},
{
"epoch": 0.07461391636300538,
"grad_norm": 2.03125,
"learning_rate": 1.996727637665758e-05,
"loss": 1.1191633224487305,
"step": 430,
"token_acc": 0.6615450810570731
},
{
"epoch": 0.07548152004164498,
"grad_norm": 2.015625,
"learning_rate": 1.9964915785009793e-05,
"loss": 1.1374661445617675,
"step": 435,
"token_acc": 0.660144832126399
},
{
"epoch": 0.07634912372028457,
"grad_norm": 2.078125,
"learning_rate": 1.996247313865432e-05,
"loss": 1.1567827224731446,
"step": 440,
"token_acc": 0.6550488426417499
},
{
"epoch": 0.07721672739892417,
"grad_norm": 2.078125,
"learning_rate": 1.9959948457704793e-05,
"loss": 1.1355746269226075,
"step": 445,
"token_acc": 0.6552852877530336
},
{
"epoch": 0.07808433107756377,
"grad_norm": 2.078125,
"learning_rate": 1.9957341762950346e-05,
"loss": 1.1385893821716309,
"step": 450,
"token_acc": 0.6600633591211301
},
{
"epoch": 0.07895193475620337,
"grad_norm": 2.03125,
"learning_rate": 1.9954653075855445e-05,
"loss": 1.1308669090270995,
"step": 455,
"token_acc": 0.6592446678440429
},
{
"epoch": 0.07981953843484296,
"grad_norm": 1.9140625,
"learning_rate": 1.9951882418559703e-05,
"loss": 1.1351963043212892,
"step": 460,
"token_acc": 0.6637463123076334
},
{
"epoch": 0.08068714211348256,
"grad_norm": 2.015625,
"learning_rate": 1.994902981387771e-05,
"loss": 1.1261632919311524,
"step": 465,
"token_acc": 0.6615251252132753
},
{
"epoch": 0.08155474579212216,
"grad_norm": 2.03125,
"learning_rate": 1.994609528529885e-05,
"loss": 1.1392766952514648,
"step": 470,
"token_acc": 0.6591416589510375
},
{
"epoch": 0.08242234947076175,
"grad_norm": 1.9609375,
"learning_rate": 1.994307885698708e-05,
"loss": 1.1046557426452637,
"step": 475,
"token_acc": 0.6653103722274739
},
{
"epoch": 0.08328995314940135,
"grad_norm": 2.078125,
"learning_rate": 1.9939980553780763e-05,
"loss": 1.1288423538208008,
"step": 480,
"token_acc": 0.6587156961405608
},
{
"epoch": 0.08415755682804095,
"grad_norm": 2.09375,
"learning_rate": 1.993680040119244e-05,
"loss": 1.1338699340820313,
"step": 485,
"token_acc": 0.6593820953616607
},
{
"epoch": 0.08502516050668055,
"grad_norm": 1.9453125,
"learning_rate": 1.9933538425408636e-05,
"loss": 1.1204511642456054,
"step": 490,
"token_acc": 0.6627913185082357
},
{
"epoch": 0.08589276418532014,
"grad_norm": 1.921875,
"learning_rate": 1.9930194653289635e-05,
"loss": 1.1193718910217285,
"step": 495,
"token_acc": 0.6627504181868472
},
{
"epoch": 0.08676036786395974,
"grad_norm": 1.9921875,
"learning_rate": 1.9926769112369263e-05,
"loss": 1.1357709884643554,
"step": 500,
"token_acc": 0.6581859131322919
},
{
"epoch": 0.08762797154259934,
"grad_norm": 2.078125,
"learning_rate": 1.9923261830854655e-05,
"loss": 1.1109633445739746,
"step": 505,
"token_acc": 0.6641862471522357
},
{
"epoch": 0.08849557522123894,
"grad_norm": 1.953125,
"learning_rate": 1.991967283762603e-05,
"loss": 1.0808055877685547,
"step": 510,
"token_acc": 0.6749422782577114
},
{
"epoch": 0.08936317889987853,
"grad_norm": 2.03125,
"learning_rate": 1.9916002162236458e-05,
"loss": 1.119293212890625,
"step": 515,
"token_acc": 0.6636471832848453
},
{
"epoch": 0.09023078257851813,
"grad_norm": 2.109375,
"learning_rate": 1.99122498349116e-05,
"loss": 1.1222724914550781,
"step": 520,
"token_acc": 0.6633997145381635
},
{
"epoch": 0.09109838625715773,
"grad_norm": 1.9609375,
"learning_rate": 1.990841588654947e-05,
"loss": 1.133096694946289,
"step": 525,
"token_acc": 0.660575962862001
},
{
"epoch": 0.09196598993579733,
"grad_norm": 2.03125,
"learning_rate": 1.990450034872018e-05,
"loss": 1.1255317687988282,
"step": 530,
"token_acc": 0.659338407094156
},
{
"epoch": 0.09283359361443692,
"grad_norm": 2.0,
"learning_rate": 1.990050325366568e-05,
"loss": 1.1009018898010254,
"step": 535,
"token_acc": 0.6676672499663481
},
{
"epoch": 0.09370119729307652,
"grad_norm": 2.0,
"learning_rate": 1.9896424634299495e-05,
"loss": 1.1132999420166017,
"step": 540,
"token_acc": 0.662574878385441
},
{
"epoch": 0.09456880097171612,
"grad_norm": 2.078125,
"learning_rate": 1.9892264524206442e-05,
"loss": 1.0917093276977539,
"step": 545,
"token_acc": 0.670899655371247
},
{
"epoch": 0.09543640465035572,
"grad_norm": 1.953125,
"learning_rate": 1.9888022957642365e-05,
"loss": 1.0798656463623046,
"step": 550,
"token_acc": 0.6730398457583547
},
{
"epoch": 0.09630400832899531,
"grad_norm": 1.9609375,
"learning_rate": 1.988369996953386e-05,
"loss": 1.1211360931396483,
"step": 555,
"token_acc": 0.660755798237552
},
{
"epoch": 0.09717161200763491,
"grad_norm": 1.9375,
"learning_rate": 1.987929559547796e-05,
"loss": 1.0922590255737306,
"step": 560,
"token_acc": 0.6691658981863865
},
{
"epoch": 0.09803921568627451,
"grad_norm": 2.015625,
"learning_rate": 1.9874809871741877e-05,
"loss": 1.109041404724121,
"step": 565,
"token_acc": 0.6648318872017354
},
{
"epoch": 0.0989068193649141,
"grad_norm": 2.078125,
"learning_rate": 1.9870242835262665e-05,
"loss": 1.1087127685546876,
"step": 570,
"token_acc": 0.6643103084814841
},
{
"epoch": 0.0997744230435537,
"grad_norm": 2.015625,
"learning_rate": 1.986559452364696e-05,
"loss": 1.1010761260986328,
"step": 575,
"token_acc": 0.6648252984798432
},
{
"epoch": 0.1006420267221933,
"grad_norm": 1.96875,
"learning_rate": 1.986086497517063e-05,
"loss": 1.107012939453125,
"step": 580,
"token_acc": 0.6654558712325808
},
{
"epoch": 0.1015096304008329,
"grad_norm": 2.109375,
"learning_rate": 1.985605422877848e-05,
"loss": 1.0979772567749024,
"step": 585,
"token_acc": 0.6680623147820886
},
{
"epoch": 0.1023772340794725,
"grad_norm": 2.234375,
"learning_rate": 1.9851162324083933e-05,
"loss": 1.0830554008483886,
"step": 590,
"token_acc": 0.6709642543415095
},
{
"epoch": 0.10324483775811209,
"grad_norm": 1.984375,
"learning_rate": 1.984618930136869e-05,
"loss": 1.0940834999084472,
"step": 595,
"token_acc": 0.6654719073768496
},
{
"epoch": 0.10411244143675169,
"grad_norm": 2.015625,
"learning_rate": 1.9841135201582418e-05,
"loss": 1.087096881866455,
"step": 600,
"token_acc": 0.6688288825090731
},
{
"epoch": 0.10498004511539129,
"grad_norm": 2.046875,
"learning_rate": 1.9836000066342396e-05,
"loss": 1.0840859413146973,
"step": 605,
"token_acc": 0.6716854817537411
},
{
"epoch": 0.10584764879403089,
"grad_norm": 2.140625,
"learning_rate": 1.9830783937933172e-05,
"loss": 1.1092602729797363,
"step": 610,
"token_acc": 0.6614420479795627
},
{
"epoch": 0.10671525247267048,
"grad_norm": 2.140625,
"learning_rate": 1.982548685930623e-05,
"loss": 1.067424201965332,
"step": 615,
"token_acc": 0.673647896123652
},
{
"epoch": 0.10758285615131008,
"grad_norm": 2.046875,
"learning_rate": 1.9820108874079626e-05,
"loss": 1.072523593902588,
"step": 620,
"token_acc": 0.670838285674334
},
{
"epoch": 0.10845045982994968,
"grad_norm": 2.265625,
"learning_rate": 1.9814650026537632e-05,
"loss": 1.1132768630981444,
"step": 625,
"token_acc": 0.6627720656963546
},
{
"epoch": 0.10931806350858927,
"grad_norm": 1.875,
"learning_rate": 1.9809110361630356e-05,
"loss": 1.081822395324707,
"step": 630,
"token_acc": 0.6701974000962927
},
{
"epoch": 0.11018566718722887,
"grad_norm": 2.03125,
"learning_rate": 1.9803489924973403e-05,
"loss": 1.0843083381652832,
"step": 635,
"token_acc": 0.670640893606908
},
{
"epoch": 0.11105327086586847,
"grad_norm": 1.875,
"learning_rate": 1.9797788762847474e-05,
"loss": 1.1068120002746582,
"step": 640,
"token_acc": 0.6664943545095905
},
{
"epoch": 0.11192087454450807,
"grad_norm": 2.15625,
"learning_rate": 1.9792006922197983e-05,
"loss": 1.090738296508789,
"step": 645,
"token_acc": 0.6678904842496042
},
{
"epoch": 0.11278847822314766,
"grad_norm": 2.21875,
"learning_rate": 1.97861444506347e-05,
"loss": 1.0894213676452638,
"step": 650,
"token_acc": 0.6677103350040985
},
{
"epoch": 0.11365608190178726,
"grad_norm": 1.9375,
"learning_rate": 1.9780201396431328e-05,
"loss": 1.1013753890991211,
"step": 655,
"token_acc": 0.6645008860011813
},
{
"epoch": 0.11452368558042686,
"grad_norm": 2.046875,
"learning_rate": 1.9774177808525113e-05,
"loss": 1.0939213752746582,
"step": 660,
"token_acc": 0.6666116111982823
},
{
"epoch": 0.11539128925906646,
"grad_norm": 2.046875,
"learning_rate": 1.9768073736516446e-05,
"loss": 1.0730672836303712,
"step": 665,
"token_acc": 0.674365815777946
},
{
"epoch": 0.11625889293770605,
"grad_norm": 1.9609375,
"learning_rate": 1.9761889230668462e-05,
"loss": 1.0676060676574708,
"step": 670,
"token_acc": 0.6705334815226451
},
{
"epoch": 0.11712649661634565,
"grad_norm": 2.078125,
"learning_rate": 1.975562434190661e-05,
"loss": 1.0712880134582519,
"step": 675,
"token_acc": 0.6705970273187744
},
{
"epoch": 0.11799410029498525,
"grad_norm": 2.15625,
"learning_rate": 1.9749279121818235e-05,
"loss": 1.1015710830688477,
"step": 680,
"token_acc": 0.6644264612144223
},
{
"epoch": 0.11886170397362485,
"grad_norm": 2.15625,
"learning_rate": 1.9742853622652176e-05,
"loss": 1.0666415214538574,
"step": 685,
"token_acc": 0.6735260146303254
},
{
"epoch": 0.11972930765226444,
"grad_norm": 2.015625,
"learning_rate": 1.9736347897318303e-05,
"loss": 1.1168096542358399,
"step": 690,
"token_acc": 0.6619707286530484
},
{
"epoch": 0.12059691133090404,
"grad_norm": 1.9140625,
"learning_rate": 1.9729761999387102e-05,
"loss": 1.061478042602539,
"step": 695,
"token_acc": 0.674633270806062
},
{
"epoch": 0.12146451500954364,
"grad_norm": 2.015625,
"learning_rate": 1.9723095983089235e-05,
"loss": 1.0845521926879882,
"step": 700,
"token_acc": 0.666814367237328
},
{
"epoch": 0.12233211868818324,
"grad_norm": 2.0625,
"learning_rate": 1.9716349903315075e-05,
"loss": 1.0705391883850097,
"step": 705,
"token_acc": 0.6704897791192207
},
{
"epoch": 0.12319972236682283,
"grad_norm": 2.0,
"learning_rate": 1.970952381561428e-05,
"loss": 1.0789600372314454,
"step": 710,
"token_acc": 0.6659224188949265
},
{
"epoch": 0.12406732604546243,
"grad_norm": 2.03125,
"learning_rate": 1.9702617776195314e-05,
"loss": 1.0921841621398927,
"step": 715,
"token_acc": 0.6679036012597562
},
{
"epoch": 0.12493492972410203,
"grad_norm": 2.0,
"learning_rate": 1.9695631841924993e-05,
"loss": 1.084920597076416,
"step": 720,
"token_acc": 0.6674108653000473
},
{
"epoch": 0.12580253340274164,
"grad_norm": 2.109375,
"learning_rate": 1.9688566070328018e-05,
"loss": 1.0615843772888183,
"step": 725,
"token_acc": 0.6733901515151515
},
{
"epoch": 0.12667013708138122,
"grad_norm": 2.046875,
"learning_rate": 1.9681420519586502e-05,
"loss": 1.0624969482421875,
"step": 730,
"token_acc": 0.6717009575388738
},
{
"epoch": 0.12753774076002083,
"grad_norm": 1.953125,
"learning_rate": 1.9674195248539482e-05,
"loss": 1.0610927581787108,
"step": 735,
"token_acc": 0.6758764832793959
},
{
"epoch": 0.12840534443866042,
"grad_norm": 1.984375,
"learning_rate": 1.9666890316682443e-05,
"loss": 1.0778383255004882,
"step": 740,
"token_acc": 0.6720665616068805
},
{
"epoch": 0.12927294811730003,
"grad_norm": 1.9453125,
"learning_rate": 1.9659505784166827e-05,
"loss": 1.078394317626953,
"step": 745,
"token_acc": 0.6708379109836813
},
{
"epoch": 0.1301405517959396,
"grad_norm": 2.125,
"learning_rate": 1.965204171179954e-05,
"loss": 1.088584041595459,
"step": 750,
"token_acc": 0.667237308961385
},
{
"epoch": 0.13100815547457922,
"grad_norm": 2.15625,
"learning_rate": 1.9644498161042436e-05,
"loss": 1.0937715530395509,
"step": 755,
"token_acc": 0.6666981577704298
},
{
"epoch": 0.1318757591532188,
"grad_norm": 1.84375,
"learning_rate": 1.9636875194011836e-05,
"loss": 1.0754453659057617,
"step": 760,
"token_acc": 0.6709822832582032
},
{
"epoch": 0.13274336283185842,
"grad_norm": 2.15625,
"learning_rate": 1.9629172873477995e-05,
"loss": 1.070410919189453,
"step": 765,
"token_acc": 0.6699807311459616
},
{
"epoch": 0.133610966510498,
"grad_norm": 2.015625,
"learning_rate": 1.9621391262864597e-05,
"loss": 1.0697467803955079,
"step": 770,
"token_acc": 0.672360857509975
},
{
"epoch": 0.13447857018913761,
"grad_norm": 2.0625,
"learning_rate": 1.961353042624823e-05,
"loss": 1.090577983856201,
"step": 775,
"token_acc": 0.6647531413321472
},
{
"epoch": 0.1353461738677772,
"grad_norm": 1.953125,
"learning_rate": 1.9605590428357853e-05,
"loss": 1.0771003723144532,
"step": 780,
"token_acc": 0.6692675159235669
},
{
"epoch": 0.1362137775464168,
"grad_norm": 1.9765625,
"learning_rate": 1.959757133457427e-05,
"loss": 1.0793813705444335,
"step": 785,
"token_acc": 0.6685078374160277
},
{
"epoch": 0.1370813812250564,
"grad_norm": 2.09375,
"learning_rate": 1.958947321092959e-05,
"loss": 1.0954531669616698,
"step": 790,
"token_acc": 0.666005196025954
},
{
"epoch": 0.137948984903696,
"grad_norm": 2.078125,
"learning_rate": 1.9581296124106682e-05,
"loss": 1.049675750732422,
"step": 795,
"token_acc": 0.6793725574174206
},
{
"epoch": 0.1388165885823356,
"grad_norm": 1.9921875,
"learning_rate": 1.9573040141438625e-05,
"loss": 1.0865850448608398,
"step": 800,
"token_acc": 0.6686926806866836
},
{
"epoch": 0.1396841922609752,
"grad_norm": 1.96875,
"learning_rate": 1.9564705330908155e-05,
"loss": 1.0714460372924806,
"step": 805,
"token_acc": 0.6687210017329085
},
{
"epoch": 0.14055179593961478,
"grad_norm": 2.109375,
"learning_rate": 1.9556291761147106e-05,
"loss": 1.0626968383789062,
"step": 810,
"token_acc": 0.6734635695958513
},
{
"epoch": 0.1414193996182544,
"grad_norm": 2.03125,
"learning_rate": 1.9547799501435848e-05,
"loss": 1.078728485107422,
"step": 815,
"token_acc": 0.6699388135142325
},
{
"epoch": 0.14228700329689398,
"grad_norm": 2.03125,
"learning_rate": 1.9539228621702696e-05,
"loss": 1.0764430999755858,
"step": 820,
"token_acc": 0.6692216671049172
},
{
"epoch": 0.1431546069755336,
"grad_norm": 2.03125,
"learning_rate": 1.9530579192523374e-05,
"loss": 1.0595266342163085,
"step": 825,
"token_acc": 0.6733650861607621
},
{
"epoch": 0.14402221065417317,
"grad_norm": 1.9609375,
"learning_rate": 1.9521851285120393e-05,
"loss": 1.0454116821289063,
"step": 830,
"token_acc": 0.6766406455817306
},
{
"epoch": 0.14488981433281278,
"grad_norm": 2.015625,
"learning_rate": 1.9513044971362494e-05,
"loss": 1.0634162902832032,
"step": 835,
"token_acc": 0.6721870895229326
},
{
"epoch": 0.14575741801145237,
"grad_norm": 2.015625,
"learning_rate": 1.9504160323764032e-05,
"loss": 1.0595422744750977,
"step": 840,
"token_acc": 0.6775388978821892
},
{
"epoch": 0.14662502169009198,
"grad_norm": 2.078125,
"learning_rate": 1.9495197415484397e-05,
"loss": 1.082723903656006,
"step": 845,
"token_acc": 0.668543901058705
},
{
"epoch": 0.14749262536873156,
"grad_norm": 2.0625,
"learning_rate": 1.9486156320327406e-05,
"loss": 1.0727534294128418,
"step": 850,
"token_acc": 0.6706723270354948
},
{
"epoch": 0.14836022904737117,
"grad_norm": 2.09375,
"learning_rate": 1.9477037112740703e-05,
"loss": 1.0933048248291015,
"step": 855,
"token_acc": 0.6638589138214922
},
{
"epoch": 0.14922783272601076,
"grad_norm": 1.8984375,
"learning_rate": 1.9467839867815118e-05,
"loss": 1.0769481658935547,
"step": 860,
"token_acc": 0.6706557839960199
},
{
"epoch": 0.15009543640465037,
"grad_norm": 1.953125,
"learning_rate": 1.9458564661284085e-05,
"loss": 1.062359619140625,
"step": 865,
"token_acc": 0.6725027997050067
},
{
"epoch": 0.15096304008328995,
"grad_norm": 2.015625,
"learning_rate": 1.9449211569523002e-05,
"loss": 1.0662097930908203,
"step": 870,
"token_acc": 0.6707562050881049
},
{
"epoch": 0.15183064376192956,
"grad_norm": 2.0625,
"learning_rate": 1.9439780669548586e-05,
"loss": 1.0621366500854492,
"step": 875,
"token_acc": 0.673269502864129
},
{
"epoch": 0.15269824744056915,
"grad_norm": 1.8984375,
"learning_rate": 1.9430272039018277e-05,
"loss": 1.0658045768737794,
"step": 880,
"token_acc": 0.6731692212416783
},
{
"epoch": 0.15356585111920876,
"grad_norm": 1.9609375,
"learning_rate": 1.942068575622956e-05,
"loss": 1.0896780967712403,
"step": 885,
"token_acc": 0.6671697313899149
},
{
"epoch": 0.15443345479784834,
"grad_norm": 1.953125,
"learning_rate": 1.9411021900119343e-05,
"loss": 1.0421188354492188,
"step": 890,
"token_acc": 0.6814261145654187
},
{
"epoch": 0.15530105847648795,
"grad_norm": 1.8671875,
"learning_rate": 1.94012805502633e-05,
"loss": 1.0770461082458496,
"step": 895,
"token_acc": 0.670378502031211
},
{
"epoch": 0.15616866215512754,
"grad_norm": 1.9765625,
"learning_rate": 1.9391461786875216e-05,
"loss": 1.0411422729492188,
"step": 900,
"token_acc": 0.6799084886073606
},
{
"epoch": 0.15703626583376715,
"grad_norm": 2.03125,
"learning_rate": 1.9381565690806328e-05,
"loss": 1.0435258865356445,
"step": 905,
"token_acc": 0.6792285176667363
},
{
"epoch": 0.15790386951240673,
"grad_norm": 1.90625,
"learning_rate": 1.9371592343544655e-05,
"loss": 1.0748100280761719,
"step": 910,
"token_acc": 0.6707071531575654
},
{
"epoch": 0.15877147319104634,
"grad_norm": 1.8828125,
"learning_rate": 1.9361541827214338e-05,
"loss": 1.0855265617370606,
"step": 915,
"token_acc": 0.667949364401157
},
{
"epoch": 0.15963907686968593,
"grad_norm": 1.9375,
"learning_rate": 1.9351414224574944e-05,
"loss": 1.0524426460266114,
"step": 920,
"token_acc": 0.6748159542907373
},
{
"epoch": 0.16050668054832554,
"grad_norm": 1.9921875,
"learning_rate": 1.9341209619020804e-05,
"loss": 1.0575942993164062,
"step": 925,
"token_acc": 0.6725628566510876
},
{
"epoch": 0.16137428422696512,
"grad_norm": 1.953125,
"learning_rate": 1.9330928094580324e-05,
"loss": 1.058868408203125,
"step": 930,
"token_acc": 0.6738103592539609
},
{
"epoch": 0.16224188790560473,
"grad_norm": 2.0625,
"learning_rate": 1.9320569735915273e-05,
"loss": 1.0528675079345704,
"step": 935,
"token_acc": 0.6737685311378745
},
{
"epoch": 0.16310949158424431,
"grad_norm": 2.171875,
"learning_rate": 1.9310134628320116e-05,
"loss": 1.0708015441894532,
"step": 940,
"token_acc": 0.6706150717308855
},
{
"epoch": 0.16397709526288393,
"grad_norm": 2.03125,
"learning_rate": 1.929962285772128e-05,
"loss": 1.0595834732055665,
"step": 945,
"token_acc": 0.6712688842219362
},
{
"epoch": 0.1648446989415235,
"grad_norm": 1.953125,
"learning_rate": 1.9289034510676483e-05,
"loss": 1.0492593765258789,
"step": 950,
"token_acc": 0.6747535596933187
},
{
"epoch": 0.16571230262016312,
"grad_norm": 1.9140625,
"learning_rate": 1.9278369674373985e-05,
"loss": 1.0697070121765138,
"step": 955,
"token_acc": 0.6718501687702754
},
{
"epoch": 0.1665799062988027,
"grad_norm": 1.9609375,
"learning_rate": 1.9267628436631893e-05,
"loss": 1.0314347267150878,
"step": 960,
"token_acc": 0.6785393180717892
},
{
"epoch": 0.16744750997744232,
"grad_norm": 2.140625,
"learning_rate": 1.9256810885897434e-05,
"loss": 1.0667208671569823,
"step": 965,
"token_acc": 0.6720282411646826
},
{
"epoch": 0.1683151136560819,
"grad_norm": 1.9140625,
"learning_rate": 1.9245917111246205e-05,
"loss": 1.0356231689453126,
"step": 970,
"token_acc": 0.6796255346195098
},
{
"epoch": 0.1691827173347215,
"grad_norm": 1.9921875,
"learning_rate": 1.9234947202381487e-05,
"loss": 1.0789193153381347,
"step": 975,
"token_acc": 0.6698977346968708
},
{
"epoch": 0.1700503210133611,
"grad_norm": 1.9453125,
"learning_rate": 1.922390124963345e-05,
"loss": 1.0505391120910645,
"step": 980,
"token_acc": 0.6759316831814153
},
{
"epoch": 0.1709179246920007,
"grad_norm": 1.953125,
"learning_rate": 1.9212779343958466e-05,
"loss": 1.0695667266845703,
"step": 985,
"token_acc": 0.6721886545823162
},
{
"epoch": 0.1717855283706403,
"grad_norm": 2.0,
"learning_rate": 1.92015815769383e-05,
"loss": 1.0540275573730469,
"step": 990,
"token_acc": 0.6755136400344937
},
{
"epoch": 0.1726531320492799,
"grad_norm": 1.9921875,
"learning_rate": 1.919030804077941e-05,
"loss": 1.0307014465332032,
"step": 995,
"token_acc": 0.6810972040253115
},
{
"epoch": 0.17352073572791948,
"grad_norm": 1.8828125,
"learning_rate": 1.9178958828312146e-05,
"loss": 1.067826271057129,
"step": 1000,
"token_acc": 0.6724737299518053
},
{
"epoch": 0.1743883394065591,
"grad_norm": 1.953125,
"learning_rate": 1.9167534032990024e-05,
"loss": 1.0573354721069337,
"step": 1005,
"token_acc": 0.6738269981618387
},
{
"epoch": 0.17525594308519868,
"grad_norm": 2.046875,
"learning_rate": 1.9156033748888918e-05,
"loss": 1.0550942420959473,
"step": 1010,
"token_acc": 0.6726485901683684
},
{
"epoch": 0.1761235467638383,
"grad_norm": 1.953125,
"learning_rate": 1.9144458070706317e-05,
"loss": 1.0487598419189452,
"step": 1015,
"token_acc": 0.6768756795940558
},
{
"epoch": 0.17699115044247787,
"grad_norm": 1.984375,
"learning_rate": 1.9132807093760523e-05,
"loss": 1.0621299743652344,
"step": 1020,
"token_acc": 0.6733473561667395
},
{
"epoch": 0.17785875412111748,
"grad_norm": 2.140625,
"learning_rate": 1.912108091398988e-05,
"loss": 1.052401065826416,
"step": 1025,
"token_acc": 0.6748749154834347
},
{
"epoch": 0.17872635779975707,
"grad_norm": 1.921875,
"learning_rate": 1.9109279627951978e-05,
"loss": 1.0468477249145507,
"step": 1030,
"token_acc": 0.6755857259832536
},
{
"epoch": 0.17959396147839668,
"grad_norm": 2.046875,
"learning_rate": 1.9097403332822863e-05,
"loss": 1.0689468383789062,
"step": 1035,
"token_acc": 0.670591049218667
},
{
"epoch": 0.18046156515703626,
"grad_norm": 1.921875,
"learning_rate": 1.908545212639622e-05,
"loss": 1.0497617721557617,
"step": 1040,
"token_acc": 0.6754224207406193
},
{
"epoch": 0.18132916883567587,
"grad_norm": 1.953125,
"learning_rate": 1.90734261070826e-05,
"loss": 1.0642064094543457,
"step": 1045,
"token_acc": 0.6719027275714755
},
{
"epoch": 0.18219677251431546,
"grad_norm": 1.828125,
"learning_rate": 1.906132537390857e-05,
"loss": 1.0482969284057617,
"step": 1050,
"token_acc": 0.6774891482197671
},
{
"epoch": 0.18306437619295507,
"grad_norm": 2.0625,
"learning_rate": 1.9049150026515937e-05,
"loss": 1.0419374465942384,
"step": 1055,
"token_acc": 0.6783982416374751
},
{
"epoch": 0.18393197987159465,
"grad_norm": 1.9296875,
"learning_rate": 1.9036900165160895e-05,
"loss": 1.047512149810791,
"step": 1060,
"token_acc": 0.673420406340701
},
{
"epoch": 0.18479958355023426,
"grad_norm": 2.015625,
"learning_rate": 1.9024575890713216e-05,
"loss": 1.0479446411132813,
"step": 1065,
"token_acc": 0.6774471529854157
},
{
"epoch": 0.18566718722887385,
"grad_norm": 2.109375,
"learning_rate": 1.9012177304655418e-05,
"loss": 1.0644286155700684,
"step": 1070,
"token_acc": 0.6712419897903769
},
{
"epoch": 0.18653479090751346,
"grad_norm": 2.03125,
"learning_rate": 1.8999704509081927e-05,
"loss": 1.0513483047485352,
"step": 1075,
"token_acc": 0.6753527477190749
},
{
"epoch": 0.18740239458615304,
"grad_norm": 1.890625,
"learning_rate": 1.8987157606698234e-05,
"loss": 1.025481605529785,
"step": 1080,
"token_acc": 0.6835672249886826
},
{
"epoch": 0.18826999826479265,
"grad_norm": 1.9453125,
"learning_rate": 1.8974536700820062e-05,
"loss": 1.0314741134643555,
"step": 1085,
"token_acc": 0.6798985689043553
},
{
"epoch": 0.18913760194343224,
"grad_norm": 2.046875,
"learning_rate": 1.896184189537249e-05,
"loss": 1.0473779678344726,
"step": 1090,
"token_acc": 0.6763754045307443
},
{
"epoch": 0.19000520562207185,
"grad_norm": 2.0625,
"learning_rate": 1.8949073294889127e-05,
"loss": 1.0450904846191407,
"step": 1095,
"token_acc": 0.6737394957983194
},
{
"epoch": 0.19087280930071143,
"grad_norm": 2.03125,
"learning_rate": 1.8936231004511224e-05,
"loss": 1.0552305221557616,
"step": 1100,
"token_acc": 0.6746180059360228
},
{
"epoch": 0.19174041297935104,
"grad_norm": 2.15625,
"learning_rate": 1.8923315129986838e-05,
"loss": 1.0332719802856445,
"step": 1105,
"token_acc": 0.6815246996363837
},
{
"epoch": 0.19260801665799063,
"grad_norm": 2.03125,
"learning_rate": 1.8910325777669923e-05,
"loss": 1.0561046600341797,
"step": 1110,
"token_acc": 0.6739904907684597
},
{
"epoch": 0.19347562033663024,
"grad_norm": 1.8828125,
"learning_rate": 1.8897263054519498e-05,
"loss": 1.0276466369628907,
"step": 1115,
"token_acc": 0.6814717548158276
},
{
"epoch": 0.19434322401526982,
"grad_norm": 2.03125,
"learning_rate": 1.8884127068098726e-05,
"loss": 1.0520359992980957,
"step": 1120,
"token_acc": 0.675560674842469
},
{
"epoch": 0.19521082769390943,
"grad_norm": 1.9921875,
"learning_rate": 1.8870917926574056e-05,
"loss": 1.0623506546020507,
"step": 1125,
"token_acc": 0.6724925733011843
},
{
"epoch": 0.19607843137254902,
"grad_norm": 1.921875,
"learning_rate": 1.8857635738714316e-05,
"loss": 1.050804901123047,
"step": 1130,
"token_acc": 0.6761054927622447
},
{
"epoch": 0.19694603505118863,
"grad_norm": 2.0,
"learning_rate": 1.884428061388983e-05,
"loss": 1.0528631210327148,
"step": 1135,
"token_acc": 0.6742988058872535
},
{
"epoch": 0.1978136387298282,
"grad_norm": 1.8671875,
"learning_rate": 1.8830852662071507e-05,
"loss": 1.0435836791992188,
"step": 1140,
"token_acc": 0.6766125320533894
},
{
"epoch": 0.19868124240846782,
"grad_norm": 1.90625,
"learning_rate": 1.8817351993829947e-05,
"loss": 1.058847713470459,
"step": 1145,
"token_acc": 0.6732232009828266
},
{
"epoch": 0.1995488460871074,
"grad_norm": 1.765625,
"learning_rate": 1.8803778720334512e-05,
"loss": 1.0335227966308593,
"step": 1150,
"token_acc": 0.6779969283000565
},
{
"epoch": 0.20041644976574702,
"grad_norm": 2.078125,
"learning_rate": 1.8790132953352427e-05,
"loss": 1.04959077835083,
"step": 1155,
"token_acc": 0.6770108354485658
},
{
"epoch": 0.2012840534443866,
"grad_norm": 1.84375,
"learning_rate": 1.8776414805247857e-05,
"loss": 1.0455670356750488,
"step": 1160,
"token_acc": 0.6780289627154183
},
{
"epoch": 0.2021516571230262,
"grad_norm": 1.9609375,
"learning_rate": 1.8762624388980976e-05,
"loss": 1.033797264099121,
"step": 1165,
"token_acc": 0.6787956767884714
},
{
"epoch": 0.2030192608016658,
"grad_norm": 1.859375,
"learning_rate": 1.8748761818107046e-05,
"loss": 1.0679737091064454,
"step": 1170,
"token_acc": 0.6713174689300571
},
{
"epoch": 0.2038868644803054,
"grad_norm": 1.890625,
"learning_rate": 1.8734827206775463e-05,
"loss": 1.0490418434143067,
"step": 1175,
"token_acc": 0.6751949483539963
},
{
"epoch": 0.204754468158945,
"grad_norm": 1.9296875,
"learning_rate": 1.8720820669728846e-05,
"loss": 1.0127446174621582,
"step": 1180,
"token_acc": 0.6857832294389704
},
{
"epoch": 0.2056220718375846,
"grad_norm": 1.96875,
"learning_rate": 1.8706742322302064e-05,
"loss": 1.0334016799926757,
"step": 1185,
"token_acc": 0.6802701904224747
},
{
"epoch": 0.20648967551622419,
"grad_norm": 1.90625,
"learning_rate": 1.8692592280421305e-05,
"loss": 1.043479824066162,
"step": 1190,
"token_acc": 0.6771668797706226
},
{
"epoch": 0.2073572791948638,
"grad_norm": 1.953125,
"learning_rate": 1.8678370660603115e-05,
"loss": 1.0523313522338866,
"step": 1195,
"token_acc": 0.6751031599887095
},
{
"epoch": 0.20822488287350338,
"grad_norm": 1.9765625,
"learning_rate": 1.8664077579953434e-05,
"loss": 1.05529727935791,
"step": 1200,
"token_acc": 0.6741786043282646
},
{
"epoch": 0.209092486552143,
"grad_norm": 2.140625,
"learning_rate": 1.864971315616664e-05,
"loss": 1.043968391418457,
"step": 1205,
"token_acc": 0.6766692503598716
},
{
"epoch": 0.20996009023078258,
"grad_norm": 2.015625,
"learning_rate": 1.8635277507524573e-05,
"loss": 1.0732519149780273,
"step": 1210,
"token_acc": 0.6701793283338767
},
{
"epoch": 0.2108276939094222,
"grad_norm": 1.8984375,
"learning_rate": 1.8620770752895567e-05,
"loss": 1.0491312980651855,
"step": 1215,
"token_acc": 0.6746863348120731
},
{
"epoch": 0.21169529758806177,
"grad_norm": 2.015625,
"learning_rate": 1.860619301173347e-05,
"loss": 1.0385177612304688,
"step": 1220,
"token_acc": 0.6771983724985469
},
{
"epoch": 0.21256290126670138,
"grad_norm": 2.09375,
"learning_rate": 1.8591544404076654e-05,
"loss": 1.0225757598876952,
"step": 1225,
"token_acc": 0.6851136908248575
},
{
"epoch": 0.21343050494534097,
"grad_norm": 1.890625,
"learning_rate": 1.8576825050547033e-05,
"loss": 1.0491232872009277,
"step": 1230,
"token_acc": 0.6755004153380315
},
{
"epoch": 0.21429810862398058,
"grad_norm": 1.90625,
"learning_rate": 1.856203507234907e-05,
"loss": 1.0523208618164062,
"step": 1235,
"token_acc": 0.675254080094805
},
{
"epoch": 0.21516571230262016,
"grad_norm": 2.0,
"learning_rate": 1.8547174591268774e-05,
"loss": 1.0285789489746093,
"step": 1240,
"token_acc": 0.6805194115460195
},
{
"epoch": 0.21603331598125977,
"grad_norm": 1.796875,
"learning_rate": 1.8532243729672707e-05,
"loss": 1.0230236053466797,
"step": 1245,
"token_acc": 0.6832664590042764
},
{
"epoch": 0.21690091965989935,
"grad_norm": 2.03125,
"learning_rate": 1.8517242610506953e-05,
"loss": 1.0365596771240235,
"step": 1250,
"token_acc": 0.678233046932105
},
{
"epoch": 0.21776852333853897,
"grad_norm": 1.9296875,
"learning_rate": 1.8502171357296144e-05,
"loss": 1.0360082626342773,
"step": 1255,
"token_acc": 0.6784880946067773
},
{
"epoch": 0.21863612701717855,
"grad_norm": 1.8984375,
"learning_rate": 1.8487030094142403e-05,
"loss": 1.044863796234131,
"step": 1260,
"token_acc": 0.6762245320026152
},
{
"epoch": 0.21950373069581816,
"grad_norm": 1.859375,
"learning_rate": 1.8471818945724355e-05,
"loss": 1.0216045379638672,
"step": 1265,
"token_acc": 0.6839718075188765
},
{
"epoch": 0.22037133437445774,
"grad_norm": 2.015625,
"learning_rate": 1.845653803729607e-05,
"loss": 1.0163522720336915,
"step": 1270,
"token_acc": 0.6835048168294121
},
{
"epoch": 0.22123893805309736,
"grad_norm": 1.8359375,
"learning_rate": 1.8441187494686055e-05,
"loss": 1.0463291168212892,
"step": 1275,
"token_acc": 0.6759099019331642
},
{
"epoch": 0.22210654173173694,
"grad_norm": 1.8046875,
"learning_rate": 1.8425767444296213e-05,
"loss": 1.0286881446838378,
"step": 1280,
"token_acc": 0.6834346103038309
},
{
"epoch": 0.22297414541037655,
"grad_norm": 1.9375,
"learning_rate": 1.8410278013100803e-05,
"loss": 1.0348123550415038,
"step": 1285,
"token_acc": 0.679287010183677
},
{
"epoch": 0.22384174908901613,
"grad_norm": 2.0625,
"learning_rate": 1.839471932864537e-05,
"loss": 1.0408474922180175,
"step": 1290,
"token_acc": 0.6770663593126929
},
{
"epoch": 0.22470935276765575,
"grad_norm": 2.0625,
"learning_rate": 1.8379091519045737e-05,
"loss": 1.0488122940063476,
"step": 1295,
"token_acc": 0.6739063026626222
},
{
"epoch": 0.22557695644629533,
"grad_norm": 1.921875,
"learning_rate": 1.8363394712986915e-05,
"loss": 1.0353066444396972,
"step": 1300,
"token_acc": 0.6792478688704328
},
{
"epoch": 0.22644456012493494,
"grad_norm": 2.0,
"learning_rate": 1.834762903972207e-05,
"loss": 1.0343815803527832,
"step": 1305,
"token_acc": 0.6786524515782157
},
{
"epoch": 0.22731216380357452,
"grad_norm": 1.9140625,
"learning_rate": 1.8331794629071427e-05,
"loss": 1.0241337776184083,
"step": 1310,
"token_acc": 0.6810138309840513
},
{
"epoch": 0.22817976748221414,
"grad_norm": 1.9921875,
"learning_rate": 1.831589161142124e-05,
"loss": 1.0487545013427735,
"step": 1315,
"token_acc": 0.6746494771055173
},
{
"epoch": 0.22904737116085372,
"grad_norm": 1.8984375,
"learning_rate": 1.8299920117722677e-05,
"loss": 1.0491311073303222,
"step": 1320,
"token_acc": 0.6740286726172584
},
{
"epoch": 0.22991497483949333,
"grad_norm": 1.7734375,
"learning_rate": 1.828388027949078e-05,
"loss": 1.0435140609741211,
"step": 1325,
"token_acc": 0.6763682837492424
},
{
"epoch": 0.2307825785181329,
"grad_norm": 2.015625,
"learning_rate": 1.8267772228803357e-05,
"loss": 1.023078155517578,
"step": 1330,
"token_acc": 0.6799355293097844
},
{
"epoch": 0.23165018219677252,
"grad_norm": 1.9296875,
"learning_rate": 1.82515960982999e-05,
"loss": 1.015854835510254,
"step": 1335,
"token_acc": 0.6842098118535009
},
{
"epoch": 0.2325177858754121,
"grad_norm": 1.9296875,
"learning_rate": 1.8235352021180496e-05,
"loss": 1.0593996047973633,
"step": 1340,
"token_acc": 0.6741832751181426
},
{
"epoch": 0.23338538955405172,
"grad_norm": 2.0,
"learning_rate": 1.821904013120473e-05,
"loss": 1.0396366119384766,
"step": 1345,
"token_acc": 0.6776407492466381
},
{
"epoch": 0.2342529932326913,
"grad_norm": 1.890625,
"learning_rate": 1.8202660562690592e-05,
"loss": 1.0485494613647461,
"step": 1350,
"token_acc": 0.6759969479137384
},
{
"epoch": 0.23512059691133091,
"grad_norm": 1.9765625,
"learning_rate": 1.8186213450513336e-05,
"loss": 1.026517391204834,
"step": 1355,
"token_acc": 0.6813391968138068
},
{
"epoch": 0.2359882005899705,
"grad_norm": 1.890625,
"learning_rate": 1.816969893010442e-05,
"loss": 1.041010570526123,
"step": 1360,
"token_acc": 0.6755975379040209
},
{
"epoch": 0.2368558042686101,
"grad_norm": 1.9765625,
"learning_rate": 1.815311713745036e-05,
"loss": 1.0168442726135254,
"step": 1365,
"token_acc": 0.6804629906694595
},
{
"epoch": 0.2377234079472497,
"grad_norm": 1.953125,
"learning_rate": 1.81364682090916e-05,
"loss": 1.025059700012207,
"step": 1370,
"token_acc": 0.680214399694494
},
{
"epoch": 0.2385910116258893,
"grad_norm": 1.890625,
"learning_rate": 1.811975228212143e-05,
"loss": 1.02586030960083,
"step": 1375,
"token_acc": 0.679387984579139
},
{
"epoch": 0.2394586153045289,
"grad_norm": 1.96875,
"learning_rate": 1.810296949418481e-05,
"loss": 1.0357915878295898,
"step": 1380,
"token_acc": 0.6767545616531072
},
{
"epoch": 0.2403262189831685,
"grad_norm": 1.8828125,
"learning_rate": 1.8086119983477265e-05,
"loss": 1.031496810913086,
"step": 1385,
"token_acc": 0.676317743132888
},
{
"epoch": 0.24119382266180808,
"grad_norm": 1.8671875,
"learning_rate": 1.8069203888743734e-05,
"loss": 1.0320685386657715,
"step": 1390,
"token_acc": 0.6808824724396653
},
{
"epoch": 0.2420614263404477,
"grad_norm": 1.9765625,
"learning_rate": 1.8052221349277445e-05,
"loss": 1.044478416442871,
"step": 1395,
"token_acc": 0.6767207412842042
},
{
"epoch": 0.24292903001908728,
"grad_norm": 1.875,
"learning_rate": 1.803517250491874e-05,
"loss": 1.037778091430664,
"step": 1400,
"token_acc": 0.6757977163281176
},
{
"epoch": 0.2437966336977269,
"grad_norm": 1.9140625,
"learning_rate": 1.801805749605395e-05,
"loss": 1.0458430290222167,
"step": 1405,
"token_acc": 0.6760411743080721
},
{
"epoch": 0.24466423737636647,
"grad_norm": 1.8984375,
"learning_rate": 1.800087646361423e-05,
"loss": 1.020294761657715,
"step": 1410,
"token_acc": 0.6817285303383098
},
{
"epoch": 0.24553184105500608,
"grad_norm": 1.875,
"learning_rate": 1.798362954907439e-05,
"loss": 1.0418660163879394,
"step": 1415,
"token_acc": 0.6780114226375908
},
{
"epoch": 0.24639944473364567,
"grad_norm": 2.015625,
"learning_rate": 1.796631689445174e-05,
"loss": 1.0439669609069824,
"step": 1420,
"token_acc": 0.6750978011601241
},
{
"epoch": 0.24726704841228528,
"grad_norm": 1.90625,
"learning_rate": 1.7948938642304915e-05,
"loss": 1.0315986633300782,
"step": 1425,
"token_acc": 0.6803868088271758
},
{
"epoch": 0.24813465209092486,
"grad_norm": 1.90625,
"learning_rate": 1.793149493573271e-05,
"loss": 1.0325140953063965,
"step": 1430,
"token_acc": 0.6792667142140159
},
{
"epoch": 0.24900225576956447,
"grad_norm": 1.921875,
"learning_rate": 1.791398591837289e-05,
"loss": 1.0254653930664062,
"step": 1435,
"token_acc": 0.6815645499333134
},
{
"epoch": 0.24986985944820406,
"grad_norm": 1.9453125,
"learning_rate": 1.7896411734401008e-05,
"loss": 1.042679786682129,
"step": 1440,
"token_acc": 0.6756525459991441
},
{
"epoch": 0.25073746312684364,
"grad_norm": 1.953125,
"learning_rate": 1.7878772528529232e-05,
"loss": 1.0409419059753418,
"step": 1445,
"token_acc": 0.6742666575920506
},
{
"epoch": 0.2516050668054833,
"grad_norm": 1.8828125,
"learning_rate": 1.7861068446005144e-05,
"loss": 1.0194078445434571,
"step": 1450,
"token_acc": 0.68190224912376
},
{
"epoch": 0.25247267048412286,
"grad_norm": 2.0,
"learning_rate": 1.7843299632610537e-05,
"loss": 1.031000518798828,
"step": 1455,
"token_acc": 0.6809780158582832
},
{
"epoch": 0.25334027416276245,
"grad_norm": 1.9296875,
"learning_rate": 1.782546623466022e-05,
"loss": 1.0219725608825683,
"step": 1460,
"token_acc": 0.6826487625065825
},
{
"epoch": 0.25420787784140203,
"grad_norm": 2.03125,
"learning_rate": 1.7807568399000824e-05,
"loss": 1.0241089820861817,
"step": 1465,
"token_acc": 0.6821418475993054
},
{
"epoch": 0.25507548152004167,
"grad_norm": 1.921875,
"learning_rate": 1.7789606273009574e-05,
"loss": 1.010830020904541,
"step": 1470,
"token_acc": 0.6835254004334725
},
{
"epoch": 0.25594308519868125,
"grad_norm": 1.921875,
"learning_rate": 1.7771580004593093e-05,
"loss": 1.045233917236328,
"step": 1475,
"token_acc": 0.6747018970189702
},
{
"epoch": 0.25681068887732084,
"grad_norm": 1.96875,
"learning_rate": 1.7753489742186164e-05,
"loss": 1.011804962158203,
"step": 1480,
"token_acc": 0.6846772177711121
},
{
"epoch": 0.2576782925559604,
"grad_norm": 1.921875,
"learning_rate": 1.773533563475053e-05,
"loss": 1.0484785079956054,
"step": 1485,
"token_acc": 0.6750959795243682
},
{
"epoch": 0.25854589623460006,
"grad_norm": 2.0,
"learning_rate": 1.771711783177366e-05,
"loss": 1.0313974380493165,
"step": 1490,
"token_acc": 0.6784674492495447
},
{
"epoch": 0.25941349991323964,
"grad_norm": 1.9921875,
"learning_rate": 1.76988364832675e-05,
"loss": 1.0448792457580567,
"step": 1495,
"token_acc": 0.6759750041845674
},
{
"epoch": 0.2602811035918792,
"grad_norm": 1.9375,
"learning_rate": 1.768049173976727e-05,
"loss": 1.030049991607666,
"step": 1500,
"token_acc": 0.6813090211643735
},
{
"epoch": 0.2611487072705188,
"grad_norm": 1.84375,
"learning_rate": 1.7662083752330193e-05,
"loss": 1.0194572448730468,
"step": 1505,
"token_acc": 0.6832258674993579
},
{
"epoch": 0.26201631094915845,
"grad_norm": 1.9375,
"learning_rate": 1.7643612672534275e-05,
"loss": 1.0071066856384276,
"step": 1510,
"token_acc": 0.684999272515641
},
{
"epoch": 0.26288391462779803,
"grad_norm": 1.9296875,
"learning_rate": 1.7625078652477036e-05,
"loss": 1.0143555641174316,
"step": 1515,
"token_acc": 0.683634143031619
},
{
"epoch": 0.2637515183064376,
"grad_norm": 1.9296875,
"learning_rate": 1.760648184477429e-05,
"loss": 1.0410999298095702,
"step": 1520,
"token_acc": 0.6760032102728732
},
{
"epoch": 0.2646191219850772,
"grad_norm": 1.9921875,
"learning_rate": 1.7587822402558837e-05,
"loss": 1.0309484481811524,
"step": 1525,
"token_acc": 0.6798862358621602
},
{
"epoch": 0.26548672566371684,
"grad_norm": 1.8984375,
"learning_rate": 1.756910047947926e-05,
"loss": 1.045750045776367,
"step": 1530,
"token_acc": 0.6779168647335341
},
{
"epoch": 0.2663543293423564,
"grad_norm": 2.078125,
"learning_rate": 1.755031622969862e-05,
"loss": 1.0056123733520508,
"step": 1535,
"token_acc": 0.6844262847741953
},
{
"epoch": 0.267221933020996,
"grad_norm": 1.875,
"learning_rate": 1.7531469807893196e-05,
"loss": 1.0222766876220704,
"step": 1540,
"token_acc": 0.6819466963244851
},
{
"epoch": 0.2680895366996356,
"grad_norm": 1.9375,
"learning_rate": 1.751256136925122e-05,
"loss": 1.0223438262939453,
"step": 1545,
"token_acc": 0.6796833846239153
},
{
"epoch": 0.26895714037827523,
"grad_norm": 1.9765625,
"learning_rate": 1.749359106947158e-05,
"loss": 1.0395459175109862,
"step": 1550,
"token_acc": 0.6780158536915294
},
{
"epoch": 0.2698247440569148,
"grad_norm": 2.09375,
"learning_rate": 1.7474559064762575e-05,
"loss": 1.0296743392944336,
"step": 1555,
"token_acc": 0.6756519151698767
},
{
"epoch": 0.2706923477355544,
"grad_norm": 1.9453125,
"learning_rate": 1.745546551184058e-05,
"loss": 1.016903781890869,
"step": 1560,
"token_acc": 0.6835704451583295
},
{
"epoch": 0.271559951414194,
"grad_norm": 2.046875,
"learning_rate": 1.74363105679288e-05,
"loss": 1.020066261291504,
"step": 1565,
"token_acc": 0.679187746898607
},
{
"epoch": 0.2724275550928336,
"grad_norm": 2.0,
"learning_rate": 1.7417094390755936e-05,
"loss": 1.0340109825134278,
"step": 1570,
"token_acc": 0.677778992239589
},
{
"epoch": 0.2732951587714732,
"grad_norm": 1.8203125,
"learning_rate": 1.739781713855492e-05,
"loss": 1.0160035133361816,
"step": 1575,
"token_acc": 0.681804898783274
},
{
"epoch": 0.2741627624501128,
"grad_norm": 1.9765625,
"learning_rate": 1.7378478970061596e-05,
"loss": 1.024774169921875,
"step": 1580,
"token_acc": 0.680820860552937
},
{
"epoch": 0.27503036612875237,
"grad_norm": 1.8671875,
"learning_rate": 1.735908004451341e-05,
"loss": 1.0384547233581543,
"step": 1585,
"token_acc": 0.6769406692778844
},
{
"epoch": 0.275897969807392,
"grad_norm": 1.8671875,
"learning_rate": 1.7339620521648107e-05,
"loss": 1.027394962310791,
"step": 1590,
"token_acc": 0.6820617131309908
},
{
"epoch": 0.2767655734860316,
"grad_norm": 1.90625,
"learning_rate": 1.7320100561702408e-05,
"loss": 1.0266061782836915,
"step": 1595,
"token_acc": 0.6778099499868386
},
{
"epoch": 0.2776331771646712,
"grad_norm": 2.046875,
"learning_rate": 1.73005203254107e-05,
"loss": 1.0057987213134765,
"step": 1600,
"token_acc": 0.6856035977459904
},
{
"epoch": 0.27850078084331076,
"grad_norm": 1.921875,
"learning_rate": 1.728087997400371e-05,
"loss": 1.0396166801452638,
"step": 1605,
"token_acc": 0.6768935264496704
},
{
"epoch": 0.2793683845219504,
"grad_norm": 1.765625,
"learning_rate": 1.726117966920716e-05,
"loss": 1.0311265945434571,
"step": 1610,
"token_acc": 0.6808383077444412
},
{
"epoch": 0.28023598820059,
"grad_norm": 1.8671875,
"learning_rate": 1.7241419573240463e-05,
"loss": 1.0097067832946778,
"step": 1615,
"token_acc": 0.683870040253019
},
{
"epoch": 0.28110359187922956,
"grad_norm": 1.96875,
"learning_rate": 1.7221599848815374e-05,
"loss": 1.0008836746215821,
"step": 1620,
"token_acc": 0.6870527000650618
},
{
"epoch": 0.28197119555786915,
"grad_norm": 1.8828125,
"learning_rate": 1.7201720659134642e-05,
"loss": 1.0405941009521484,
"step": 1625,
"token_acc": 0.6768849218838519
},
{
"epoch": 0.2828387992365088,
"grad_norm": 1.796875,
"learning_rate": 1.7181782167890678e-05,
"loss": 1.0066216468811036,
"step": 1630,
"token_acc": 0.6848891318550914
},
{
"epoch": 0.28370640291514837,
"grad_norm": 1.9375,
"learning_rate": 1.716178453926421e-05,
"loss": 1.046470832824707,
"step": 1635,
"token_acc": 0.6714027873902482
},
{
"epoch": 0.28457400659378795,
"grad_norm": 1.9140625,
"learning_rate": 1.7141727937922912e-05,
"loss": 1.0199688911437987,
"step": 1640,
"token_acc": 0.6823405115629932
},
{
"epoch": 0.28544161027242754,
"grad_norm": 1.8828125,
"learning_rate": 1.712161252902007e-05,
"loss": 1.044092559814453,
"step": 1645,
"token_acc": 0.6758528428093645
},
{
"epoch": 0.2863092139510672,
"grad_norm": 1.859375,
"learning_rate": 1.7101438478193212e-05,
"loss": 1.0233346939086914,
"step": 1650,
"token_acc": 0.6805489760838082
},
{
"epoch": 0.28717681762970676,
"grad_norm": 1.9375,
"learning_rate": 1.708120595156274e-05,
"loss": 1.0456744194030763,
"step": 1655,
"token_acc": 0.6750241212956581
},
{
"epoch": 0.28804442130834634,
"grad_norm": 2.03125,
"learning_rate": 1.706091511573057e-05,
"loss": 1.0319430351257324,
"step": 1660,
"token_acc": 0.6777862117640792
},
{
"epoch": 0.2889120249869859,
"grad_norm": 1.9296875,
"learning_rate": 1.704056613777876e-05,
"loss": 1.0204211235046388,
"step": 1665,
"token_acc": 0.6796524738028916
},
{
"epoch": 0.28977962866562557,
"grad_norm": 1.9375,
"learning_rate": 1.7020159185268123e-05,
"loss": 1.0458597183227538,
"step": 1670,
"token_acc": 0.6737207077953132
},
{
"epoch": 0.29064723234426515,
"grad_norm": 2.03125,
"learning_rate": 1.6999694426236862e-05,
"loss": 1.0375800132751465,
"step": 1675,
"token_acc": 0.6758920495200551
},
{
"epoch": 0.29151483602290473,
"grad_norm": 1.953125,
"learning_rate": 1.697917202919918e-05,
"loss": 1.0144439697265626,
"step": 1680,
"token_acc": 0.679975894834207
},
{
"epoch": 0.2923824397015443,
"grad_norm": 1.9609375,
"learning_rate": 1.6958592163143884e-05,
"loss": 1.0309619903564453,
"step": 1685,
"token_acc": 0.678642271573428
},
{
"epoch": 0.29325004338018396,
"grad_norm": 1.953125,
"learning_rate": 1.6937954997533016e-05,
"loss": 1.0367056846618652,
"step": 1690,
"token_acc": 0.6776543556428868
},
{
"epoch": 0.29411764705882354,
"grad_norm": 1.8203125,
"learning_rate": 1.691726070230043e-05,
"loss": 1.0386839866638184,
"step": 1695,
"token_acc": 0.6778316736701301
},
{
"epoch": 0.2949852507374631,
"grad_norm": 2.03125,
"learning_rate": 1.689650944785041e-05,
"loss": 1.0176087379455567,
"step": 1700,
"token_acc": 0.6798760737924237
},
{
"epoch": 0.2958528544161027,
"grad_norm": 1.796875,
"learning_rate": 1.6875701405056262e-05,
"loss": 1.006351852416992,
"step": 1705,
"token_acc": 0.6863717464315701
},
{
"epoch": 0.29672045809474235,
"grad_norm": 1.7890625,
"learning_rate": 1.685483674525891e-05,
"loss": 1.0238887786865234,
"step": 1710,
"token_acc": 0.6787703215736074
},
{
"epoch": 0.29758806177338193,
"grad_norm": 2.046875,
"learning_rate": 1.6833915640265485e-05,
"loss": 1.0253664016723634,
"step": 1715,
"token_acc": 0.6786322245940176
},
{
"epoch": 0.2984556654520215,
"grad_norm": 1.890625,
"learning_rate": 1.6812938262347907e-05,
"loss": 1.0375401496887207,
"step": 1720,
"token_acc": 0.677038246903498
},
{
"epoch": 0.2993232691306611,
"grad_norm": 1.8828125,
"learning_rate": 1.6791904784241458e-05,
"loss": 1.0252004623413087,
"step": 1725,
"token_acc": 0.6804137056166104
},
{
"epoch": 0.30019087280930074,
"grad_norm": 1.921875,
"learning_rate": 1.6770815379143385e-05,
"loss": 1.010302734375,
"step": 1730,
"token_acc": 0.6837099330986861
},
{
"epoch": 0.3010584764879403,
"grad_norm": 1.9609375,
"learning_rate": 1.674967022071144e-05,
"loss": 1.0301790237426758,
"step": 1735,
"token_acc": 0.6772626037659445
},
{
"epoch": 0.3019260801665799,
"grad_norm": 1.8515625,
"learning_rate": 1.6728469483062486e-05,
"loss": 0.9938658714294434,
"step": 1740,
"token_acc": 0.6898220909033759
},
{
"epoch": 0.3027936838452195,
"grad_norm": 1.9453125,
"learning_rate": 1.6707213340771028e-05,
"loss": 1.0314199447631835,
"step": 1745,
"token_acc": 0.6770383134840673
},
{
"epoch": 0.3036612875238591,
"grad_norm": 1.8046875,
"learning_rate": 1.6685901968867813e-05,
"loss": 1.0129788398742676,
"step": 1750,
"token_acc": 0.6820457843611499
},
{
"epoch": 0.3045288912024987,
"grad_norm": 1.9296875,
"learning_rate": 1.6664535542838352e-05,
"loss": 1.002908420562744,
"step": 1755,
"token_acc": 0.6864051119594943
},
{
"epoch": 0.3053964948811383,
"grad_norm": 1.8671875,
"learning_rate": 1.6643114238621495e-05,
"loss": 1.034525489807129,
"step": 1760,
"token_acc": 0.6801218196814923
},
{
"epoch": 0.3062640985597779,
"grad_norm": 2.03125,
"learning_rate": 1.6621638232607984e-05,
"loss": 1.025135612487793,
"step": 1765,
"token_acc": 0.6795281498360474
},
{
"epoch": 0.3071317022384175,
"grad_norm": 1.984375,
"learning_rate": 1.6600107701638993e-05,
"loss": 1.035383129119873,
"step": 1770,
"token_acc": 0.6749825634422447
},
{
"epoch": 0.3079993059170571,
"grad_norm": 1.9296875,
"learning_rate": 1.6578522823004666e-05,
"loss": 0.9947221755981446,
"step": 1775,
"token_acc": 0.6872965042273526
},
{
"epoch": 0.3088669095956967,
"grad_norm": 1.8671875,
"learning_rate": 1.6556883774442675e-05,
"loss": 1.0022805213928223,
"step": 1780,
"token_acc": 0.6862549392253107
},
{
"epoch": 0.30973451327433627,
"grad_norm": 1.9609375,
"learning_rate": 1.653519073413675e-05,
"loss": 1.0279296875,
"step": 1785,
"token_acc": 0.6769972826086956
},
{
"epoch": 0.3106021169529759,
"grad_norm": 1.9296875,
"learning_rate": 1.65134438807152e-05,
"loss": 1.0212496757507323,
"step": 1790,
"token_acc": 0.679449427274692
},
{
"epoch": 0.3114697206316155,
"grad_norm": 1.7265625,
"learning_rate": 1.649164339324945e-05,
"loss": 1.006572437286377,
"step": 1795,
"token_acc": 0.6861015265579256
},
{
"epoch": 0.31233732431025507,
"grad_norm": 1.859375,
"learning_rate": 1.646978945125257e-05,
"loss": 1.0250924110412598,
"step": 1800,
"token_acc": 0.6786020029623188
},
{
"epoch": 0.31320492798889465,
"grad_norm": 2.0,
"learning_rate": 1.6447882234677796e-05,
"loss": 1.0435279846191405,
"step": 1805,
"token_acc": 0.6758889509765172
},
{
"epoch": 0.3140725316675343,
"grad_norm": 1.890625,
"learning_rate": 1.6425921923917042e-05,
"loss": 1.0279791831970215,
"step": 1810,
"token_acc": 0.6805610242902337
},
{
"epoch": 0.3149401353461739,
"grad_norm": 1.90625,
"learning_rate": 1.6403908699799423e-05,
"loss": 1.02548828125,
"step": 1815,
"token_acc": 0.6779067427037907
},
{
"epoch": 0.31580773902481346,
"grad_norm": 2.015625,
"learning_rate": 1.6381842743589765e-05,
"loss": 1.0200424194335938,
"step": 1820,
"token_acc": 0.6822118412765064
},
{
"epoch": 0.31667534270345304,
"grad_norm": 1.8203125,
"learning_rate": 1.635972423698709e-05,
"loss": 1.0166802406311035,
"step": 1825,
"token_acc": 0.6827727138286145
},
{
"epoch": 0.3175429463820927,
"grad_norm": 1.921875,
"learning_rate": 1.6337553362123165e-05,
"loss": 1.0155767440795898,
"step": 1830,
"token_acc": 0.6837214270455031
},
{
"epoch": 0.31841055006073227,
"grad_norm": 1.8984375,
"learning_rate": 1.6315330301560956e-05,
"loss": 1.0089836120605469,
"step": 1835,
"token_acc": 0.6831295389068122
},
{
"epoch": 0.31927815373937185,
"grad_norm": 1.9375,
"learning_rate": 1.6293055238293155e-05,
"loss": 1.0108304977416993,
"step": 1840,
"token_acc": 0.6825737553161517
},
{
"epoch": 0.32014575741801143,
"grad_norm": 1.9453125,
"learning_rate": 1.6270728355740658e-05,
"loss": 1.0052438735961915,
"step": 1845,
"token_acc": 0.6869656992084433
},
{
"epoch": 0.3210133610966511,
"grad_norm": 1.984375,
"learning_rate": 1.6248349837751064e-05,
"loss": 1.0119807243347168,
"step": 1850,
"token_acc": 0.6814690154990364
},
{
"epoch": 0.32188096477529066,
"grad_norm": 1.9453125,
"learning_rate": 1.6225919868597154e-05,
"loss": 1.0213706970214844,
"step": 1855,
"token_acc": 0.6794819414937081
},
{
"epoch": 0.32274856845393024,
"grad_norm": 1.96875,
"learning_rate": 1.620343863297538e-05,
"loss": 0.9990407943725585,
"step": 1860,
"token_acc": 0.6861129568106312
},
{
"epoch": 0.3236161721325698,
"grad_norm": 2.0,
"learning_rate": 1.6180906316004336e-05,
"loss": 1.0262950897216796,
"step": 1865,
"token_acc": 0.6775860676697801
},
{
"epoch": 0.32448377581120946,
"grad_norm": 2.078125,
"learning_rate": 1.615832310322324e-05,
"loss": 1.036133098602295,
"step": 1870,
"token_acc": 0.6766465480728505
},
{
"epoch": 0.32535137948984905,
"grad_norm": 1.8203125,
"learning_rate": 1.6135689180590404e-05,
"loss": 1.020677089691162,
"step": 1875,
"token_acc": 0.6793250062916407
},
{
"epoch": 0.32621898316848863,
"grad_norm": 1.8515625,
"learning_rate": 1.6113004734481704e-05,
"loss": 1.0076414108276368,
"step": 1880,
"token_acc": 0.6839123609309945
},
{
"epoch": 0.3270865868471282,
"grad_norm": 1.9375,
"learning_rate": 1.609026995168904e-05,
"loss": 1.0311081886291504,
"step": 1885,
"token_acc": 0.6782416456600568
},
{
"epoch": 0.32795419052576785,
"grad_norm": 1.953125,
"learning_rate": 1.6067485019418814e-05,
"loss": 1.0099788665771485,
"step": 1890,
"token_acc": 0.6829244908301488
},
{
"epoch": 0.32882179420440744,
"grad_norm": 1.890625,
"learning_rate": 1.6044650125290365e-05,
"loss": 1.0263484001159668,
"step": 1895,
"token_acc": 0.6801680694975478
},
{
"epoch": 0.329689397883047,
"grad_norm": 1.8671875,
"learning_rate": 1.6021765457334444e-05,
"loss": 1.0163857460021972,
"step": 1900,
"token_acc": 0.6806223824561879
},
{
"epoch": 0.3305570015616866,
"grad_norm": 1.8125,
"learning_rate": 1.5998831203991648e-05,
"loss": 1.0088854789733888,
"step": 1905,
"token_acc": 0.6855646039732352
},
{
"epoch": 0.33142460524032624,
"grad_norm": 1.8046875,
"learning_rate": 1.5975847554110888e-05,
"loss": 0.9952527999877929,
"step": 1910,
"token_acc": 0.6883125788578638
},
{
"epoch": 0.3322922089189658,
"grad_norm": 1.8359375,
"learning_rate": 1.595281469694782e-05,
"loss": 1.0266911506652832,
"step": 1915,
"token_acc": 0.6797566371681416
},
{
"epoch": 0.3331598125976054,
"grad_norm": 1.90625,
"learning_rate": 1.592973282216329e-05,
"loss": 1.0018574714660644,
"step": 1920,
"token_acc": 0.6860606854970837
},
{
"epoch": 0.334027416276245,
"grad_norm": 1.9453125,
"learning_rate": 1.590660211982177e-05,
"loss": 1.0108092308044434,
"step": 1925,
"token_acc": 0.6822820656674948
},
{
"epoch": 0.33489501995488463,
"grad_norm": 2.015625,
"learning_rate": 1.5883422780389806e-05,
"loss": 1.0258635520935058,
"step": 1930,
"token_acc": 0.6778978538515823
},
{
"epoch": 0.3357626236335242,
"grad_norm": 1.9453125,
"learning_rate": 1.5860194994734427e-05,
"loss": 1.021854782104492,
"step": 1935,
"token_acc": 0.6808329178366179
},
{
"epoch": 0.3366302273121638,
"grad_norm": 1.9453125,
"learning_rate": 1.5836918954121588e-05,
"loss": 1.0331063270568848,
"step": 1940,
"token_acc": 0.6784242872199181
},
{
"epoch": 0.3374978309908034,
"grad_norm": 1.8828125,
"learning_rate": 1.58135948502146e-05,
"loss": 1.0210276603698731,
"step": 1945,
"token_acc": 0.6840329583182118
},
{
"epoch": 0.338365434669443,
"grad_norm": 1.8046875,
"learning_rate": 1.579022287507254e-05,
"loss": 1.0260606765747071,
"step": 1950,
"token_acc": 0.6783528979227396
},
{
"epoch": 0.3392330383480826,
"grad_norm": 1.9765625,
"learning_rate": 1.5766803221148676e-05,
"loss": 0.9952493667602539,
"step": 1955,
"token_acc": 0.6859501834760369
},
{
"epoch": 0.3401006420267222,
"grad_norm": 1.9609375,
"learning_rate": 1.574333608128887e-05,
"loss": 1.0229947090148925,
"step": 1960,
"token_acc": 0.6816778645360451
},
{
"epoch": 0.34096824570536177,
"grad_norm": 1.8984375,
"learning_rate": 1.5719821648730014e-05,
"loss": 1.0026690483093261,
"step": 1965,
"token_acc": 0.6833949856144678
},
{
"epoch": 0.3418358493840014,
"grad_norm": 1.7578125,
"learning_rate": 1.5696260117098424e-05,
"loss": 0.9994998931884765,
"step": 1970,
"token_acc": 0.6882951486903434
},
{
"epoch": 0.342703453062641,
"grad_norm": 1.953125,
"learning_rate": 1.5672651680408237e-05,
"loss": 1.0034085273742677,
"step": 1975,
"token_acc": 0.6842928918540483
},
{
"epoch": 0.3435710567412806,
"grad_norm": 1.8828125,
"learning_rate": 1.5648996533059824e-05,
"loss": 1.0039892196655273,
"step": 1980,
"token_acc": 0.6863158175442339
},
{
"epoch": 0.34443866041992016,
"grad_norm": 1.8125,
"learning_rate": 1.5625294869838203e-05,
"loss": 1.0203709602355957,
"step": 1985,
"token_acc": 0.6797555567287894
},
{
"epoch": 0.3453062640985598,
"grad_norm": 1.9765625,
"learning_rate": 1.5601546885911406e-05,
"loss": 1.021955966949463,
"step": 1990,
"token_acc": 0.6785332666062516
},
{
"epoch": 0.3461738677771994,
"grad_norm": 1.96875,
"learning_rate": 1.5577752776828892e-05,
"loss": 1.0178564071655274,
"step": 1995,
"token_acc": 0.6787890301656874
},
{
"epoch": 0.34704147145583897,
"grad_norm": 2.03125,
"learning_rate": 1.555391273851993e-05,
"loss": 0.9952051162719726,
"step": 2000,
"token_acc": 0.6859772527441359
},
{
"epoch": 0.34790907513447855,
"grad_norm": 2.046875,
"learning_rate": 1.553002696729198e-05,
"loss": 1.0093853950500489,
"step": 2005,
"token_acc": 0.6833726738760498
},
{
"epoch": 0.3487766788131182,
"grad_norm": 2.015625,
"learning_rate": 1.55060956598291e-05,
"loss": 1.0151101112365724,
"step": 2010,
"token_acc": 0.6831491047292776
},
{
"epoch": 0.3496442824917578,
"grad_norm": 1.9609375,
"learning_rate": 1.5482119013190296e-05,
"loss": 1.0173629760742187,
"step": 2015,
"token_acc": 0.6829174613265523
},
{
"epoch": 0.35051188617039736,
"grad_norm": 1.9296875,
"learning_rate": 1.5458097224807916e-05,
"loss": 1.019275188446045,
"step": 2020,
"token_acc": 0.6805337208534249
},
{
"epoch": 0.35137948984903694,
"grad_norm": 1.8671875,
"learning_rate": 1.5434030492486023e-05,
"loss": 1.0199106216430665,
"step": 2025,
"token_acc": 0.6799649276633055
},
{
"epoch": 0.3522470935276766,
"grad_norm": 1.84375,
"learning_rate": 1.5409919014398762e-05,
"loss": 1.0161332130432128,
"step": 2030,
"token_acc": 0.682195193046612
},
{
"epoch": 0.35311469720631616,
"grad_norm": 1.84375,
"learning_rate": 1.5385762989088738e-05,
"loss": 1.027943992614746,
"step": 2035,
"token_acc": 0.676602066311027
},
{
"epoch": 0.35398230088495575,
"grad_norm": 1.7890625,
"learning_rate": 1.5361562615465366e-05,
"loss": 1.0008016586303712,
"step": 2040,
"token_acc": 0.6849210596735349
},
{
"epoch": 0.35484990456359533,
"grad_norm": 1.859375,
"learning_rate": 1.5337318092803243e-05,
"loss": 1.0304694175720215,
"step": 2045,
"token_acc": 0.6774736297159127
},
{
"epoch": 0.35571750824223497,
"grad_norm": 1.8828125,
"learning_rate": 1.5313029620740506e-05,
"loss": 1.0220866203308105,
"step": 2050,
"token_acc": 0.6807486487213273
},
{
"epoch": 0.35658511192087455,
"grad_norm": 1.9453125,
"learning_rate": 1.5288697399277182e-05,
"loss": 1.019200611114502,
"step": 2055,
"token_acc": 0.6806197591915717
},
{
"epoch": 0.35745271559951414,
"grad_norm": 1.796875,
"learning_rate": 1.526432162877356e-05,
"loss": 1.013671875,
"step": 2060,
"token_acc": 0.6828907213817285
},
{
"epoch": 0.3583203192781537,
"grad_norm": 1.921875,
"learning_rate": 1.5239902509948514e-05,
"loss": 1.0091094017028808,
"step": 2065,
"token_acc": 0.6834054718392647
},
{
"epoch": 0.35918792295679336,
"grad_norm": 1.7578125,
"learning_rate": 1.521544024387787e-05,
"loss": 1.0055926322937012,
"step": 2070,
"token_acc": 0.6828146538012936
},
{
"epoch": 0.36005552663543294,
"grad_norm": 1.7578125,
"learning_rate": 1.5190935031992742e-05,
"loss": 1.0013408660888672,
"step": 2075,
"token_acc": 0.6865663839408236
},
{
"epoch": 0.3609231303140725,
"grad_norm": 1.96875,
"learning_rate": 1.5166387076077876e-05,
"loss": 1.014689826965332,
"step": 2080,
"token_acc": 0.6808145941313308
},
{
"epoch": 0.3617907339927121,
"grad_norm": 1.875,
"learning_rate": 1.5141796578269986e-05,
"loss": 1.0103944778442382,
"step": 2085,
"token_acc": 0.6806936577861687
},
{
"epoch": 0.36265833767135175,
"grad_norm": 1.90625,
"learning_rate": 1.5117163741056092e-05,
"loss": 1.0004392623901368,
"step": 2090,
"token_acc": 0.6851040904004753
},
{
"epoch": 0.36352594134999133,
"grad_norm": 1.8671875,
"learning_rate": 1.5092488767271858e-05,
"loss": 1.004606342315674,
"step": 2095,
"token_acc": 0.682853725269135
},
{
"epoch": 0.3643935450286309,
"grad_norm": 1.9375,
"learning_rate": 1.5067771860099905e-05,
"loss": 0.9848871231079102,
"step": 2100,
"token_acc": 0.6914043831501331
},
{
"epoch": 0.3652611487072705,
"grad_norm": 1.9140625,
"learning_rate": 1.5043013223068155e-05,
"loss": 1.0125656127929688,
"step": 2105,
"token_acc": 0.6832955602426212
},
{
"epoch": 0.36612875238591014,
"grad_norm": 1.8828125,
"learning_rate": 1.501821306004815e-05,
"loss": 1.0106427192687988,
"step": 2110,
"token_acc": 0.6838881419006099
},
{
"epoch": 0.3669963560645497,
"grad_norm": 1.8203125,
"learning_rate": 1.4993371575253368e-05,
"loss": 1.0103277206420898,
"step": 2115,
"token_acc": 0.6830820506764292
},
{
"epoch": 0.3678639597431893,
"grad_norm": 1.90625,
"learning_rate": 1.496848897323755e-05,
"loss": 0.9989145278930665,
"step": 2120,
"token_acc": 0.6848548395882129
},
{
"epoch": 0.3687315634218289,
"grad_norm": 1.9453125,
"learning_rate": 1.4943565458892999e-05,
"loss": 1.0156753540039063,
"step": 2125,
"token_acc": 0.683586704457614
},
{
"epoch": 0.36959916710046853,
"grad_norm": 1.828125,
"learning_rate": 1.4918601237448925e-05,
"loss": 1.0110110282897948,
"step": 2130,
"token_acc": 0.6824813659671195
},
{
"epoch": 0.3704667707791081,
"grad_norm": 1.90625,
"learning_rate": 1.4893596514469718e-05,
"loss": 1.0106982231140136,
"step": 2135,
"token_acc": 0.6820436574981416
},
{
"epoch": 0.3713343744577477,
"grad_norm": 1.859375,
"learning_rate": 1.4868551495853278e-05,
"loss": 1.0084819793701172,
"step": 2140,
"token_acc": 0.6837009642055211
},
{
"epoch": 0.3722019781363873,
"grad_norm": 1.9296875,
"learning_rate": 1.4843466387829317e-05,
"loss": 1.0337956428527832,
"step": 2145,
"token_acc": 0.6756529177470663
},
{
"epoch": 0.3730695818150269,
"grad_norm": 1.7890625,
"learning_rate": 1.4818341396957651e-05,
"loss": 1.010234260559082,
"step": 2150,
"token_acc": 0.6839618937946557
},
{
"epoch": 0.3739371854936665,
"grad_norm": 3.484375,
"learning_rate": 1.4793176730126512e-05,
"loss": 0.9982177734375,
"step": 2155,
"token_acc": 0.6882951820647545
},
{
"epoch": 0.3748047891723061,
"grad_norm": 1.8046875,
"learning_rate": 1.4767972594550832e-05,
"loss": 1.0000919342041015,
"step": 2160,
"token_acc": 0.685829937736179
},
{
"epoch": 0.37567239285094567,
"grad_norm": 1.796875,
"learning_rate": 1.4742729197770551e-05,
"loss": 1.0299704551696778,
"step": 2165,
"token_acc": 0.6772884904796179
},
{
"epoch": 0.3765399965295853,
"grad_norm": 1.9765625,
"learning_rate": 1.4717446747648894e-05,
"loss": 1.016530704498291,
"step": 2170,
"token_acc": 0.6815982696795492
},
{
"epoch": 0.3774076002082249,
"grad_norm": 2.03125,
"learning_rate": 1.4692125452370664e-05,
"loss": 1.0197928428649903,
"step": 2175,
"token_acc": 0.6793494519840083
},
{
"epoch": 0.3782752038868645,
"grad_norm": 1.8671875,
"learning_rate": 1.4666765520440534e-05,
"loss": 1.0177095413208008,
"step": 2180,
"token_acc": 0.6810032017075773
},
{
"epoch": 0.37914280756550406,
"grad_norm": 1.9609375,
"learning_rate": 1.464136716068132e-05,
"loss": 1.0126147270202637,
"step": 2185,
"token_acc": 0.683709293410274
},
{
"epoch": 0.3800104112441437,
"grad_norm": 1.84375,
"learning_rate": 1.461593058223227e-05,
"loss": 1.021070957183838,
"step": 2190,
"token_acc": 0.679652122955623
},
{
"epoch": 0.3808780149227833,
"grad_norm": 1.8671875,
"learning_rate": 1.4590455994547337e-05,
"loss": 1.001976203918457,
"step": 2195,
"token_acc": 0.6833014477415503
},
{
"epoch": 0.38174561860142286,
"grad_norm": 1.8671875,
"learning_rate": 1.456494360739346e-05,
"loss": 0.9893196105957032,
"step": 2200,
"token_acc": 0.6892311085988446
},
{
"epoch": 0.38261322228006245,
"grad_norm": 2.03125,
"learning_rate": 1.4539393630848829e-05,
"loss": 0.9814781188964844,
"step": 2205,
"token_acc": 0.6899440949405221
},
{
"epoch": 0.3834808259587021,
"grad_norm": 1.9921875,
"learning_rate": 1.451380627530115e-05,
"loss": 1.011701488494873,
"step": 2210,
"token_acc": 0.6809758515295867
},
{
"epoch": 0.38434842963734167,
"grad_norm": 1.8828125,
"learning_rate": 1.4488181751445939e-05,
"loss": 1.0211992263793945,
"step": 2215,
"token_acc": 0.6797978865156532
},
{
"epoch": 0.38521603331598125,
"grad_norm": 2.0,
"learning_rate": 1.4462520270284756e-05,
"loss": 0.9845295906066894,
"step": 2220,
"token_acc": 0.6868465406909026
},
{
"epoch": 0.38608363699462084,
"grad_norm": 1.859375,
"learning_rate": 1.4436822043123485e-05,
"loss": 1.0249157905578614,
"step": 2225,
"token_acc": 0.6786562283760498
},
{
"epoch": 0.3869512406732605,
"grad_norm": 1.765625,
"learning_rate": 1.441108728157059e-05,
"loss": 1.0030797004699707,
"step": 2230,
"token_acc": 0.684765917234319
},
{
"epoch": 0.38781884435190006,
"grad_norm": 1.96875,
"learning_rate": 1.4385316197535373e-05,
"loss": 1.0158026695251465,
"step": 2235,
"token_acc": 0.6832608666746447
},
{
"epoch": 0.38868644803053964,
"grad_norm": 1.84375,
"learning_rate": 1.4359509003226221e-05,
"loss": 1.0172318458557128,
"step": 2240,
"token_acc": 0.6808322441812877
},
{
"epoch": 0.3895540517091792,
"grad_norm": 1.9453125,
"learning_rate": 1.4333665911148881e-05,
"loss": 0.9851541519165039,
"step": 2245,
"token_acc": 0.6889603544215962
},
{
"epoch": 0.39042165538781887,
"grad_norm": 1.953125,
"learning_rate": 1.4307787134104682e-05,
"loss": 1.014187717437744,
"step": 2250,
"token_acc": 0.683114625160409
},
{
"epoch": 0.39128925906645845,
"grad_norm": 1.8203125,
"learning_rate": 1.42818728851888e-05,
"loss": 1.0081872940063477,
"step": 2255,
"token_acc": 0.6823870250820193
},
{
"epoch": 0.39215686274509803,
"grad_norm": 1.859375,
"learning_rate": 1.4255923377788497e-05,
"loss": 1.0085988998413087,
"step": 2260,
"token_acc": 0.6840598070654684
},
{
"epoch": 0.3930244664237376,
"grad_norm": 1.859375,
"learning_rate": 1.4229938825581373e-05,
"loss": 1.0013799667358398,
"step": 2265,
"token_acc": 0.6847899527045825
},
{
"epoch": 0.39389207010237726,
"grad_norm": 1.890625,
"learning_rate": 1.4203919442533597e-05,
"loss": 1.018793773651123,
"step": 2270,
"token_acc": 0.681686886192952
},
{
"epoch": 0.39475967378101684,
"grad_norm": 1.9921875,
"learning_rate": 1.4177865442898137e-05,
"loss": 1.0064517974853515,
"step": 2275,
"token_acc": 0.6819670370966876
},
{
"epoch": 0.3956272774596564,
"grad_norm": 1.9609375,
"learning_rate": 1.4151777041213021e-05,
"loss": 0.9828666687011719,
"step": 2280,
"token_acc": 0.6887780548628429
},
{
"epoch": 0.396494881138296,
"grad_norm": 1.96875,
"learning_rate": 1.4125654452299553e-05,
"loss": 1.0092188835144043,
"step": 2285,
"token_acc": 0.6844631486295059
},
{
"epoch": 0.39736248481693565,
"grad_norm": 1.8828125,
"learning_rate": 1.4099497891260538e-05,
"loss": 0.9924700736999512,
"step": 2290,
"token_acc": 0.6873599312908464
},
{
"epoch": 0.39823008849557523,
"grad_norm": 2.0,
"learning_rate": 1.4073307573478528e-05,
"loss": 1.0148592948913575,
"step": 2295,
"token_acc": 0.6811773236297232
},
{
"epoch": 0.3990976921742148,
"grad_norm": 1.9765625,
"learning_rate": 1.4047083714614038e-05,
"loss": 1.0003128051757812,
"step": 2300,
"token_acc": 0.6852241329539362
},
{
"epoch": 0.3999652958528544,
"grad_norm": 1.984375,
"learning_rate": 1.4020826530603775e-05,
"loss": 0.9960025787353516,
"step": 2305,
"token_acc": 0.6852598031645303
},
{
"epoch": 0.40083289953149404,
"grad_norm": 2.0,
"learning_rate": 1.399453623765885e-05,
"loss": 1.0148781776428222,
"step": 2310,
"token_acc": 0.684109947643979
},
{
"epoch": 0.4017005032101336,
"grad_norm": 1.90625,
"learning_rate": 1.3968213052263014e-05,
"loss": 1.012251091003418,
"step": 2315,
"token_acc": 0.6833315462148831
},
{
"epoch": 0.4025681068887732,
"grad_norm": 1.921875,
"learning_rate": 1.3941857191170857e-05,
"loss": 0.9941699028015136,
"step": 2320,
"token_acc": 0.6864559695983815
},
{
"epoch": 0.4034357105674128,
"grad_norm": 1.96875,
"learning_rate": 1.3915468871406044e-05,
"loss": 1.0085437774658204,
"step": 2325,
"token_acc": 0.6833488248572567
},
{
"epoch": 0.4043033142460524,
"grad_norm": 1.9296875,
"learning_rate": 1.38890483102595e-05,
"loss": 1.0144371032714843,
"step": 2330,
"token_acc": 0.68039780521262
},
{
"epoch": 0.405170917924692,
"grad_norm": 1.75,
"learning_rate": 1.3862595725287653e-05,
"loss": 0.9994147300720215,
"step": 2335,
"token_acc": 0.687611521794545
},
{
"epoch": 0.4060385216033316,
"grad_norm": 1.9140625,
"learning_rate": 1.3836111334310622e-05,
"loss": 0.9963122367858886,
"step": 2340,
"token_acc": 0.685745011351416
},
{
"epoch": 0.4069061252819712,
"grad_norm": 1.859375,
"learning_rate": 1.3809595355410424e-05,
"loss": 1.0122366905212403,
"step": 2345,
"token_acc": 0.683117204922772
},
{
"epoch": 0.4077737289606108,
"grad_norm": 1.890625,
"learning_rate": 1.3783048006929185e-05,
"loss": 1.0144343376159668,
"step": 2350,
"token_acc": 0.6814033279539999
},
{
"epoch": 0.4086413326392504,
"grad_norm": 1.8984375,
"learning_rate": 1.375646950746734e-05,
"loss": 1.0156232833862304,
"step": 2355,
"token_acc": 0.6830444078275435
},
{
"epoch": 0.40950893631789,
"grad_norm": 1.8984375,
"learning_rate": 1.3729860075881827e-05,
"loss": 1.034743595123291,
"step": 2360,
"token_acc": 0.6783182628209359
},
{
"epoch": 0.41037653999652957,
"grad_norm": 1.9375,
"learning_rate": 1.3703219931284304e-05,
"loss": 0.9984539031982422,
"step": 2365,
"token_acc": 0.6839781943890441
},
{
"epoch": 0.4112441436751692,
"grad_norm": 1.90625,
"learning_rate": 1.3676549293039316e-05,
"loss": 1.0032421112060548,
"step": 2370,
"token_acc": 0.6834760671844918
},
{
"epoch": 0.4121117473538088,
"grad_norm": 1.7890625,
"learning_rate": 1.3649848380762513e-05,
"loss": 0.9850346565246582,
"step": 2375,
"token_acc": 0.6905733974775712
},
{
"epoch": 0.41297935103244837,
"grad_norm": 1.859375,
"learning_rate": 1.3623117414318827e-05,
"loss": 1.0028590202331542,
"step": 2380,
"token_acc": 0.6836534850029511
},
{
"epoch": 0.41384695471108796,
"grad_norm": 1.859375,
"learning_rate": 1.3596356613820669e-05,
"loss": 1.013303279876709,
"step": 2385,
"token_acc": 0.6805802728792536
},
{
"epoch": 0.4147145583897276,
"grad_norm": 1.8828125,
"learning_rate": 1.3569566199626114e-05,
"loss": 1.0094331741333007,
"step": 2390,
"token_acc": 0.685405305236406
},
{
"epoch": 0.4155821620683672,
"grad_norm": 1.9140625,
"learning_rate": 1.3542746392337087e-05,
"loss": 1.005965805053711,
"step": 2395,
"token_acc": 0.682434716756596
},
{
"epoch": 0.41644976574700676,
"grad_norm": 1.75,
"learning_rate": 1.3515897412797547e-05,
"loss": 0.9940034866333007,
"step": 2400,
"token_acc": 0.6875033593120129
},
{
"epoch": 0.41731736942564635,
"grad_norm": 1.90625,
"learning_rate": 1.348901948209167e-05,
"loss": 0.9850317955017089,
"step": 2405,
"token_acc": 0.686592845447229
},
{
"epoch": 0.418184973104286,
"grad_norm": 1.953125,
"learning_rate": 1.3462112821542016e-05,
"loss": 1.0118427276611328,
"step": 2410,
"token_acc": 0.6816512666869937
},
{
"epoch": 0.41905257678292557,
"grad_norm": 1.875,
"learning_rate": 1.3435177652707735e-05,
"loss": 1.0028743743896484,
"step": 2415,
"token_acc": 0.6860446549751178
},
{
"epoch": 0.41992018046156515,
"grad_norm": 1.8828125,
"learning_rate": 1.3408214197382705e-05,
"loss": 0.9918471336364746,
"step": 2420,
"token_acc": 0.6874362288279708
},
{
"epoch": 0.42078778414020473,
"grad_norm": 1.84375,
"learning_rate": 1.3381222677593737e-05,
"loss": 1.0141358375549316,
"step": 2425,
"token_acc": 0.6807328527018983
},
{
"epoch": 0.4216553878188444,
"grad_norm": 1.84375,
"learning_rate": 1.3354203315598733e-05,
"loss": 1.0219820976257323,
"step": 2430,
"token_acc": 0.6813468119008437
},
{
"epoch": 0.42252299149748396,
"grad_norm": 1.96875,
"learning_rate": 1.3327156333884856e-05,
"loss": 1.0195876121520997,
"step": 2435,
"token_acc": 0.6805910377684181
},
{
"epoch": 0.42339059517612354,
"grad_norm": 1.8671875,
"learning_rate": 1.33000819551667e-05,
"loss": 1.0096649169921874,
"step": 2440,
"token_acc": 0.6815645521723036
},
{
"epoch": 0.4242581988547631,
"grad_norm": 1.9453125,
"learning_rate": 1.3272980402384459e-05,
"loss": 1.0119336128234864,
"step": 2445,
"token_acc": 0.68414329128903
},
{
"epoch": 0.42512580253340276,
"grad_norm": 1.90625,
"learning_rate": 1.3245851898702083e-05,
"loss": 1.004085636138916,
"step": 2450,
"token_acc": 0.6859707219637835
},
{
"epoch": 0.42599340621204235,
"grad_norm": 1.90625,
"learning_rate": 1.3218696667505444e-05,
"loss": 1.006967830657959,
"step": 2455,
"token_acc": 0.6833605995039316
},
{
"epoch": 0.42686100989068193,
"grad_norm": 1.984375,
"learning_rate": 1.319151493240051e-05,
"loss": 1.0127968788146973,
"step": 2460,
"token_acc": 0.6829119501118417
},
{
"epoch": 0.4277286135693215,
"grad_norm": 1.9453125,
"learning_rate": 1.3164306917211475e-05,
"loss": 1.0015942573547363,
"step": 2465,
"token_acc": 0.683394712251965
},
{
"epoch": 0.42859621724796115,
"grad_norm": 1.8359375,
"learning_rate": 1.313707284597895e-05,
"loss": 0.9921387672424317,
"step": 2470,
"token_acc": 0.6884706008353861
},
{
"epoch": 0.42946382092660074,
"grad_norm": 1.796875,
"learning_rate": 1.3109812942958087e-05,
"loss": 0.9937407493591308,
"step": 2475,
"token_acc": 0.6858287322723828
},
{
"epoch": 0.4303314246052403,
"grad_norm": 1.8125,
"learning_rate": 1.308252743261675e-05,
"loss": 1.0000595092773437,
"step": 2480,
"token_acc": 0.6844413945289899
},
{
"epoch": 0.4311990282838799,
"grad_norm": 1.890625,
"learning_rate": 1.3055216539633668e-05,
"loss": 0.9946840286254883,
"step": 2485,
"token_acc": 0.6857801388537539
},
{
"epoch": 0.43206663196251954,
"grad_norm": 1.7421875,
"learning_rate": 1.302788048889657e-05,
"loss": 0.9850924491882325,
"step": 2490,
"token_acc": 0.6912732362675458
},
{
"epoch": 0.4329342356411591,
"grad_norm": 1.8828125,
"learning_rate": 1.3000519505500354e-05,
"loss": 1.013066577911377,
"step": 2495,
"token_acc": 0.6841163491550963
},
{
"epoch": 0.4338018393197987,
"grad_norm": 1.90625,
"learning_rate": 1.297313381474522e-05,
"loss": 1.0059243202209474,
"step": 2500,
"token_acc": 0.682485376889968
},
{
"epoch": 0.4346694429984383,
"grad_norm": 1.8984375,
"learning_rate": 1.2945723642134808e-05,
"loss": 0.9933188438415528,
"step": 2505,
"token_acc": 0.6839858247063091
},
{
"epoch": 0.43553704667707793,
"grad_norm": 1.8359375,
"learning_rate": 1.2918289213374362e-05,
"loss": 1.0141347885131835,
"step": 2510,
"token_acc": 0.6845435525845792
},
{
"epoch": 0.4364046503557175,
"grad_norm": 1.8125,
"learning_rate": 1.2890830754368855e-05,
"loss": 1.0011796951293945,
"step": 2515,
"token_acc": 0.6849304174950298
},
{
"epoch": 0.4372722540343571,
"grad_norm": 1.859375,
"learning_rate": 1.2863348491221129e-05,
"loss": 1.004225254058838,
"step": 2520,
"token_acc": 0.6839080459770115
},
{
"epoch": 0.4381398577129967,
"grad_norm": 1.8828125,
"learning_rate": 1.2835842650230046e-05,
"loss": 1.005355167388916,
"step": 2525,
"token_acc": 0.6849887538762527
},
{
"epoch": 0.4390074613916363,
"grad_norm": 1.796875,
"learning_rate": 1.2808313457888614e-05,
"loss": 1.0048332214355469,
"step": 2530,
"token_acc": 0.6836279848033905
},
{
"epoch": 0.4398750650702759,
"grad_norm": 1.8515625,
"learning_rate": 1.2780761140882123e-05,
"loss": 1.0195894241333008,
"step": 2535,
"token_acc": 0.6795473179123936
},
{
"epoch": 0.4407426687489155,
"grad_norm": 1.9140625,
"learning_rate": 1.2753185926086282e-05,
"loss": 1.0192377090454101,
"step": 2540,
"token_acc": 0.6803299012123347
},
{
"epoch": 0.4416102724275551,
"grad_norm": 1.9921875,
"learning_rate": 1.2725588040565344e-05,
"loss": 1.005928134918213,
"step": 2545,
"token_acc": 0.6801644427607302
},
{
"epoch": 0.4424778761061947,
"grad_norm": 1.8984375,
"learning_rate": 1.2697967711570243e-05,
"loss": 1.003110980987549,
"step": 2550,
"token_acc": 0.6823795540443708
},
{
"epoch": 0.4433454797848343,
"grad_norm": 1.8203125,
"learning_rate": 1.2670325166536726e-05,
"loss": 1.000045108795166,
"step": 2555,
"token_acc": 0.6848798995377668
},
{
"epoch": 0.4442130834634739,
"grad_norm": 1.7890625,
"learning_rate": 1.2642660633083467e-05,
"loss": 0.9951872825622559,
"step": 2560,
"token_acc": 0.6862447171184515
},
{
"epoch": 0.44508068714211346,
"grad_norm": 1.859375,
"learning_rate": 1.2614974339010208e-05,
"loss": 1.0055727005004882,
"step": 2565,
"token_acc": 0.6850169715125947
},
{
"epoch": 0.4459482908207531,
"grad_norm": 1.8671875,
"learning_rate": 1.2587266512295868e-05,
"loss": 1.0195012092590332,
"step": 2570,
"token_acc": 0.6816625277741472
},
{
"epoch": 0.4468158944993927,
"grad_norm": 1.921875,
"learning_rate": 1.2559537381096681e-05,
"loss": 0.9964936256408692,
"step": 2575,
"token_acc": 0.6885902240435685
},
{
"epoch": 0.44768349817803227,
"grad_norm": 1.90625,
"learning_rate": 1.2531787173744298e-05,
"loss": 0.9999607086181641,
"step": 2580,
"token_acc": 0.6850149960102359
},
{
"epoch": 0.44855110185667185,
"grad_norm": 1.9609375,
"learning_rate": 1.2504016118743936e-05,
"loss": 1.0000761032104493,
"step": 2585,
"token_acc": 0.6829418781621488
},
{
"epoch": 0.4494187055353115,
"grad_norm": 1.9140625,
"learning_rate": 1.2476224444772467e-05,
"loss": 1.0015432357788085,
"step": 2590,
"token_acc": 0.6852074323242031
},
{
"epoch": 0.4502863092139511,
"grad_norm": 1.8984375,
"learning_rate": 1.244841238067655e-05,
"loss": 1.0161554336547851,
"step": 2595,
"token_acc": 0.68090608621095
},
{
"epoch": 0.45115391289259066,
"grad_norm": 1.953125,
"learning_rate": 1.242058015547074e-05,
"loss": 1.0064961433410644,
"step": 2600,
"token_acc": 0.6828215164844034
},
{
"epoch": 0.45202151657123024,
"grad_norm": 1.8359375,
"learning_rate": 1.2392727998335617e-05,
"loss": 1.0068798065185547,
"step": 2605,
"token_acc": 0.6836080829566604
},
{
"epoch": 0.4528891202498699,
"grad_norm": 1.875,
"learning_rate": 1.2364856138615873e-05,
"loss": 0.9954544067382812,
"step": 2610,
"token_acc": 0.6852013951546656
},
{
"epoch": 0.45375672392850946,
"grad_norm": 1.8671875,
"learning_rate": 1.2336964805818445e-05,
"loss": 1.012361431121826,
"step": 2615,
"token_acc": 0.6833901146441704
},
{
"epoch": 0.45462432760714905,
"grad_norm": 1.9375,
"learning_rate": 1.2309054229610625e-05,
"loss": 1.0001043319702148,
"step": 2620,
"token_acc": 0.6823560827524173
},
{
"epoch": 0.45549193128578863,
"grad_norm": 1.9453125,
"learning_rate": 1.2281124639818152e-05,
"loss": 1.0002737998962403,
"step": 2625,
"token_acc": 0.6835434270674609
},
{
"epoch": 0.45635953496442827,
"grad_norm": 1.8828125,
"learning_rate": 1.2253176266423332e-05,
"loss": 0.9922337532043457,
"step": 2630,
"token_acc": 0.6857845693124239
},
{
"epoch": 0.45722713864306785,
"grad_norm": 1.78125,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.9958258628845215,
"step": 2635,
"token_acc": 0.6889344316136575
},
{
"epoch": 0.45809474232170744,
"grad_norm": 1.8125,
"learning_rate": 1.2197224089527347e-05,
"loss": 1.000858688354492,
"step": 2640,
"token_acc": 0.6852724153892232
},
{
"epoch": 0.458962346000347,
"grad_norm": 1.8984375,
"learning_rate": 1.2169220746756567e-05,
"loss": 1.01625337600708,
"step": 2645,
"token_acc": 0.6788307748873977
},
{
"epoch": 0.45982994967898666,
"grad_norm": 1.8359375,
"learning_rate": 1.2141199541840428e-05,
"loss": 1.0196890830993652,
"step": 2650,
"token_acc": 0.6794828350233297
},
{
"epoch": 0.46069755335762624,
"grad_norm": 1.84375,
"learning_rate": 1.2113160705515626e-05,
"loss": 1.0036340713500977,
"step": 2655,
"token_acc": 0.6851736637091539
},
{
"epoch": 0.4615651570362658,
"grad_norm": 1.9609375,
"learning_rate": 1.2085104468664041e-05,
"loss": 1.0029501914978027,
"step": 2660,
"token_acc": 0.6866774142396532
},
{
"epoch": 0.4624327607149054,
"grad_norm": 1.8203125,
"learning_rate": 1.2057031062310845e-05,
"loss": 1.0131060600280761,
"step": 2665,
"token_acc": 0.6806618788309262
},
{
"epoch": 0.46330036439354505,
"grad_norm": 1.8671875,
"learning_rate": 1.2028940717622576e-05,
"loss": 1.0073641777038573,
"step": 2670,
"token_acc": 0.6839914676655741
},
{
"epoch": 0.46416796807218463,
"grad_norm": 1.7734375,
"learning_rate": 1.2000833665905255e-05,
"loss": 1.0106398582458496,
"step": 2675,
"token_acc": 0.6813406569965871
},
{
"epoch": 0.4650355717508242,
"grad_norm": 1.8671875,
"learning_rate": 1.1972710138602482e-05,
"loss": 1.0050904273986816,
"step": 2680,
"token_acc": 0.6856562992838491
},
{
"epoch": 0.4659031754294638,
"grad_norm": 1.90625,
"learning_rate": 1.194457036729351e-05,
"loss": 1.015509033203125,
"step": 2685,
"token_acc": 0.6802187834233648
},
{
"epoch": 0.46677077910810344,
"grad_norm": 1.90625,
"learning_rate": 1.1916414583691361e-05,
"loss": 1.0169716835021974,
"step": 2690,
"token_acc": 0.6822935779816514
},
{
"epoch": 0.467638382786743,
"grad_norm": 1.890625,
"learning_rate": 1.18882430196409e-05,
"loss": 1.00482234954834,
"step": 2695,
"token_acc": 0.6824643916517463
},
{
"epoch": 0.4685059864653826,
"grad_norm": 1.8046875,
"learning_rate": 1.1860055907116937e-05,
"loss": 1.013214111328125,
"step": 2700,
"token_acc": 0.6813338959360743
},
{
"epoch": 0.4693735901440222,
"grad_norm": 2.015625,
"learning_rate": 1.1831853478222318e-05,
"loss": 1.0059806823730468,
"step": 2705,
"token_acc": 0.683576909519807
},
{
"epoch": 0.47024119382266183,
"grad_norm": 1.9296875,
"learning_rate": 1.1803635965186002e-05,
"loss": 0.9913622856140136,
"step": 2710,
"token_acc": 0.6861254522541567
},
{
"epoch": 0.4711087975013014,
"grad_norm": 1.734375,
"learning_rate": 1.1775403600361167e-05,
"loss": 1.0054823875427246,
"step": 2715,
"token_acc": 0.684439954609615
},
{
"epoch": 0.471976401179941,
"grad_norm": 1.8671875,
"learning_rate": 1.1747156616223272e-05,
"loss": 1.0157322883605957,
"step": 2720,
"token_acc": 0.680252464832309
},
{
"epoch": 0.4728440048585806,
"grad_norm": 1.890625,
"learning_rate": 1.1718895245368167e-05,
"loss": 1.0170634269714356,
"step": 2725,
"token_acc": 0.6813107028863409
},
{
"epoch": 0.4737116085372202,
"grad_norm": 1.984375,
"learning_rate": 1.1690619720510165e-05,
"loss": 0.9852043151855469,
"step": 2730,
"token_acc": 0.6872285921724955
},
{
"epoch": 0.4745792122158598,
"grad_norm": 1.7734375,
"learning_rate": 1.1662330274480128e-05,
"loss": 1.0059645652770997,
"step": 2735,
"token_acc": 0.6836207236712307
},
{
"epoch": 0.4754468158944994,
"grad_norm": 1.859375,
"learning_rate": 1.1634027140223544e-05,
"loss": 0.9841846466064453,
"step": 2740,
"token_acc": 0.6890565215615255
},
{
"epoch": 0.47631441957313897,
"grad_norm": 1.859375,
"learning_rate": 1.1605710550798626e-05,
"loss": 0.995844554901123,
"step": 2745,
"token_acc": 0.6851391782871187
},
{
"epoch": 0.4771820232517786,
"grad_norm": 1.7421875,
"learning_rate": 1.1577380739374376e-05,
"loss": 0.9913998603820801,
"step": 2750,
"token_acc": 0.6908336288532514
},
{
"epoch": 0.4780496269304182,
"grad_norm": 1.8984375,
"learning_rate": 1.1549037939228667e-05,
"loss": 0.9965376853942871,
"step": 2755,
"token_acc": 0.6868992547759661
},
{
"epoch": 0.4789172306090578,
"grad_norm": 1.8828125,
"learning_rate": 1.1520682383746334e-05,
"loss": 1.0012220382690429,
"step": 2760,
"token_acc": 0.6884490453429107
},
{
"epoch": 0.47978483428769736,
"grad_norm": 1.8984375,
"learning_rate": 1.1492314306417233e-05,
"loss": 0.9879722595214844,
"step": 2765,
"token_acc": 0.6881789911554461
},
{
"epoch": 0.480652437966337,
"grad_norm": 1.953125,
"learning_rate": 1.1463933940834342e-05,
"loss": 1.0053581237792968,
"step": 2770,
"token_acc": 0.6855757229040982
},
{
"epoch": 0.4815200416449766,
"grad_norm": 1.96875,
"learning_rate": 1.1435541520691815e-05,
"loss": 0.9921921730041504,
"step": 2775,
"token_acc": 0.6862049831504012
},
{
"epoch": 0.48238764532361617,
"grad_norm": 1.9453125,
"learning_rate": 1.1407137279783074e-05,
"loss": 1.0183118820190429,
"step": 2780,
"token_acc": 0.6817061841095448
},
{
"epoch": 0.48325524900225575,
"grad_norm": 1.9375,
"learning_rate": 1.1378721451998874e-05,
"loss": 0.9925461769104004,
"step": 2785,
"token_acc": 0.688545962485034
},
{
"epoch": 0.4841228526808954,
"grad_norm": 1.9609375,
"learning_rate": 1.1350294271325379e-05,
"loss": 1.0159781455993653,
"step": 2790,
"token_acc": 0.681974551332532
},
{
"epoch": 0.48499045635953497,
"grad_norm": 1.9140625,
"learning_rate": 1.1321855971842243e-05,
"loss": 1.0086934089660644,
"step": 2795,
"token_acc": 0.6831038631199038
},
{
"epoch": 0.48585806003817456,
"grad_norm": 1.875,
"learning_rate": 1.129340678772067e-05,
"loss": 1.0199012756347656,
"step": 2800,
"token_acc": 0.6786575483258884
},
{
"epoch": 0.48672566371681414,
"grad_norm": 1.953125,
"learning_rate": 1.1264946953221496e-05,
"loss": 1.0137310028076172,
"step": 2805,
"token_acc": 0.6824437730782141
},
{
"epoch": 0.4875932673954538,
"grad_norm": 1.8515625,
"learning_rate": 1.123647670269325e-05,
"loss": 1.0020729064941407,
"step": 2810,
"token_acc": 0.6836910759886811
},
{
"epoch": 0.48846087107409336,
"grad_norm": 1.90625,
"learning_rate": 1.1207996270570242e-05,
"loss": 0.9875768661499024,
"step": 2815,
"token_acc": 0.6884513431530621
},
{
"epoch": 0.48932847475273294,
"grad_norm": 1.9140625,
"learning_rate": 1.117950589137061e-05,
"loss": 1.0017758369445802,
"step": 2820,
"token_acc": 0.6836398649214367
},
{
"epoch": 0.49019607843137253,
"grad_norm": 1.875,
"learning_rate": 1.1151005799694401e-05,
"loss": 1.0143745422363282,
"step": 2825,
"token_acc": 0.6843956569062094
},
{
"epoch": 0.49106368211001217,
"grad_norm": 1.8984375,
"learning_rate": 1.1122496230221644e-05,
"loss": 1.0051603317260742,
"step": 2830,
"token_acc": 0.6826833612462451
},
{
"epoch": 0.49193128578865175,
"grad_norm": 1.8984375,
"learning_rate": 1.1093977417710408e-05,
"loss": 0.9880369186401368,
"step": 2835,
"token_acc": 0.6881879959200434
},
{
"epoch": 0.49279888946729133,
"grad_norm": 1.8515625,
"learning_rate": 1.1065449596994876e-05,
"loss": 0.9956092834472656,
"step": 2840,
"token_acc": 0.6856416772554003
},
{
"epoch": 0.4936664931459309,
"grad_norm": 1.8828125,
"learning_rate": 1.1036913002983392e-05,
"loss": 1.0082509994506836,
"step": 2845,
"token_acc": 0.6834558638400725
},
{
"epoch": 0.49453409682457056,
"grad_norm": 1.8125,
"learning_rate": 1.1008367870656568e-05,
"loss": 0.9957260131835938,
"step": 2850,
"token_acc": 0.6855102932343144
},
{
"epoch": 0.49540170050321014,
"grad_norm": 1.8515625,
"learning_rate": 1.0979814435065308e-05,
"loss": 0.9961285591125488,
"step": 2855,
"token_acc": 0.6854094100735335
},
{
"epoch": 0.4962693041818497,
"grad_norm": 1.9296875,
"learning_rate": 1.0951252931328887e-05,
"loss": 0.9894907951354981,
"step": 2860,
"token_acc": 0.69068332911443
},
{
"epoch": 0.4971369078604893,
"grad_norm": 1.9765625,
"learning_rate": 1.092268359463302e-05,
"loss": 1.0121468544006347,
"step": 2865,
"token_acc": 0.6819141923071749
},
{
"epoch": 0.49800451153912895,
"grad_norm": 1.8359375,
"learning_rate": 1.0894106660227926e-05,
"loss": 1.017982578277588,
"step": 2870,
"token_acc": 0.6814496708942045
},
{
"epoch": 0.49887211521776853,
"grad_norm": 1.8515625,
"learning_rate": 1.0865522363426376e-05,
"loss": 1.0043160438537597,
"step": 2875,
"token_acc": 0.6829500019793358
},
{
"epoch": 0.4997397188964081,
"grad_norm": 1.875,
"learning_rate": 1.0836930939601768e-05,
"loss": 1.008955478668213,
"step": 2880,
"token_acc": 0.6822375933533712
},
{
"epoch": 0.5006073225750477,
"grad_norm": 1.9140625,
"learning_rate": 1.0808332624186197e-05,
"loss": 1.0033825874328612,
"step": 2885,
"token_acc": 0.684949342881556
},
{
"epoch": 0.5014749262536873,
"grad_norm": 1.9140625,
"learning_rate": 1.0779727652668496e-05,
"loss": 1.001988697052002,
"step": 2890,
"token_acc": 0.6846615607534672
},
{
"epoch": 0.5023425299323269,
"grad_norm": 1.8515625,
"learning_rate": 1.0751116260592312e-05,
"loss": 0.9898590087890625,
"step": 2895,
"token_acc": 0.6871776024781131
},
{
"epoch": 0.5032101336109666,
"grad_norm": 1.9296875,
"learning_rate": 1.072249868355415e-05,
"loss": 0.9838379859924317,
"step": 2900,
"token_acc": 0.6897600586613799
},
{
"epoch": 0.5040777372896061,
"grad_norm": 1.84375,
"learning_rate": 1.0693875157201459e-05,
"loss": 0.9647768020629883,
"step": 2905,
"token_acc": 0.6952837795361677
},
{
"epoch": 0.5049453409682457,
"grad_norm": 1.859375,
"learning_rate": 1.0665245917230666e-05,
"loss": 1.0030086517333985,
"step": 2910,
"token_acc": 0.6839139614674057
},
{
"epoch": 0.5058129446468853,
"grad_norm": 1.796875,
"learning_rate": 1.0636611199385251e-05,
"loss": 1.0003300666809083,
"step": 2915,
"token_acc": 0.6843827426478509
},
{
"epoch": 0.5066805483255249,
"grad_norm": 1.796875,
"learning_rate": 1.0607971239453805e-05,
"loss": 0.9978496551513671,
"step": 2920,
"token_acc": 0.6854442053489087
},
{
"epoch": 0.5075481520041645,
"grad_norm": 1.96875,
"learning_rate": 1.0579326273268074e-05,
"loss": 0.9909579277038574,
"step": 2925,
"token_acc": 0.686803062770415
},
{
"epoch": 0.5084157556828041,
"grad_norm": 1.8984375,
"learning_rate": 1.0550676536701034e-05,
"loss": 0.9943648338317871,
"step": 2930,
"token_acc": 0.6851877207875784
},
{
"epoch": 0.5092833593614436,
"grad_norm": 1.859375,
"learning_rate": 1.052202226566494e-05,
"loss": 0.9951316833496093,
"step": 2935,
"token_acc": 0.6850922617852889
},
{
"epoch": 0.5101509630400833,
"grad_norm": 1.828125,
"learning_rate": 1.0493363696109388e-05,
"loss": 0.9918990135192871,
"step": 2940,
"token_acc": 0.6878556595377437
},
{
"epoch": 0.5110185667187229,
"grad_norm": 1.875,
"learning_rate": 1.0464701064019364e-05,
"loss": 1.0089019775390624,
"step": 2945,
"token_acc": 0.682502467917078
},
{
"epoch": 0.5118861703973625,
"grad_norm": 1.84375,
"learning_rate": 1.0436034605413312e-05,
"loss": 0.982180118560791,
"step": 2950,
"token_acc": 0.6869962643166984
},
{
"epoch": 0.5127537740760021,
"grad_norm": 1.8984375,
"learning_rate": 1.0407364556341183e-05,
"loss": 1.0064614295959473,
"step": 2955,
"token_acc": 0.6841499638737991
},
{
"epoch": 0.5136213777546417,
"grad_norm": 1.8203125,
"learning_rate": 1.0378691152882496e-05,
"loss": 1.0329419136047364,
"step": 2960,
"token_acc": 0.6754290852352366
},
{
"epoch": 0.5144889814332813,
"grad_norm": 1.8828125,
"learning_rate": 1.0350014631144382e-05,
"loss": 1.0033533096313476,
"step": 2965,
"token_acc": 0.6818894869228896
},
{
"epoch": 0.5153565851119208,
"grad_norm": 1.9140625,
"learning_rate": 1.0321335227259661e-05,
"loss": 0.9842534065246582,
"step": 2970,
"token_acc": 0.6909560794180386
},
{
"epoch": 0.5162241887905604,
"grad_norm": 1.921875,
"learning_rate": 1.0292653177384878e-05,
"loss": 1.0118374824523926,
"step": 2975,
"token_acc": 0.682726188540142
},
{
"epoch": 0.5170917924692001,
"grad_norm": 1.765625,
"learning_rate": 1.0263968717698365e-05,
"loss": 1.0183884620666503,
"step": 2980,
"token_acc": 0.6805970149253732
},
{
"epoch": 0.5179593961478397,
"grad_norm": 1.8203125,
"learning_rate": 1.0235282084398301e-05,
"loss": 0.9902758598327637,
"step": 2985,
"token_acc": 0.6868815227383335
},
{
"epoch": 0.5188269998264793,
"grad_norm": 1.8515625,
"learning_rate": 1.0206593513700767e-05,
"loss": 1.0007359504699707,
"step": 2990,
"token_acc": 0.683507329474766
},
{
"epoch": 0.5196946035051189,
"grad_norm": 1.875,
"learning_rate": 1.0177903241837789e-05,
"loss": 0.9968069076538086,
"step": 2995,
"token_acc": 0.6845976760975876
},
{
"epoch": 0.5205622071837585,
"grad_norm": 1.921875,
"learning_rate": 1.0149211505055407e-05,
"loss": 0.9842087745666503,
"step": 3000,
"token_acc": 0.6890838871678698
},
{
"epoch": 0.521429810862398,
"grad_norm": 1.8359375,
"learning_rate": 1.012051853961172e-05,
"loss": 0.9979012489318848,
"step": 3005,
"token_acc": 0.6855041583613994
},
{
"epoch": 0.5222974145410376,
"grad_norm": 1.8671875,
"learning_rate": 1.0091824581774947e-05,
"loss": 1.0025611877441407,
"step": 3010,
"token_acc": 0.6843610112039744
},
{
"epoch": 0.5231650182196772,
"grad_norm": 1.828125,
"learning_rate": 1.0063129867821475e-05,
"loss": 1.0025950431823731,
"step": 3015,
"token_acc": 0.6844177684199013
},
{
"epoch": 0.5240326218983169,
"grad_norm": 1.859375,
"learning_rate": 1.0034434634033919e-05,
"loss": 0.9844324111938476,
"step": 3020,
"token_acc": 0.6901680615091516
},
{
"epoch": 0.5249002255769565,
"grad_norm": 1.875,
"learning_rate": 1.0005739116699178e-05,
"loss": 1.0121084213256837,
"step": 3025,
"token_acc": 0.6821625441696113
},
{
"epoch": 0.5257678292555961,
"grad_norm": 1.8359375,
"learning_rate": 9.977043552106484e-06,
"loss": 0.9731731414794922,
"step": 3030,
"token_acc": 0.69295219319862
},
{
"epoch": 0.5266354329342356,
"grad_norm": 1.84375,
"learning_rate": 9.94834817654545e-06,
"loss": 0.9912844657897949,
"step": 3035,
"token_acc": 0.6854252683732452
},
{
"epoch": 0.5275030366128752,
"grad_norm": 1.9375,
"learning_rate": 9.919653226304148e-06,
"loss": 0.989024543762207,
"step": 3040,
"token_acc": 0.6876149180822745
},
{
"epoch": 0.5283706402915148,
"grad_norm": 1.7890625,
"learning_rate": 9.890958937667135e-06,
"loss": 1.012401008605957,
"step": 3045,
"token_acc": 0.6832023046685692
},
{
"epoch": 0.5292382439701544,
"grad_norm": 1.8671875,
"learning_rate": 9.862265546913526e-06,
"loss": 1.0105487823486328,
"step": 3050,
"token_acc": 0.6831975602049648
},
{
"epoch": 0.530105847648794,
"grad_norm": 1.9921875,
"learning_rate": 9.83357329031504e-06,
"loss": 0.9997787475585938,
"step": 3055,
"token_acc": 0.6836862959420685
},
{
"epoch": 0.5309734513274337,
"grad_norm": 1.890625,
"learning_rate": 9.804882404134057e-06,
"loss": 0.9793942451477051,
"step": 3060,
"token_acc": 0.6905264857446551
},
{
"epoch": 0.5318410550060733,
"grad_norm": 1.8828125,
"learning_rate": 9.776193124621673e-06,
"loss": 1.0060349464416505,
"step": 3065,
"token_acc": 0.6837666900913563
},
{
"epoch": 0.5327086586847128,
"grad_norm": 1.8046875,
"learning_rate": 9.747505688015757e-06,
"loss": 0.9506141662597656,
"step": 3070,
"token_acc": 0.696441489065717
},
{
"epoch": 0.5335762623633524,
"grad_norm": 1.8203125,
"learning_rate": 9.718820330538999e-06,
"loss": 1.0000137329101562,
"step": 3075,
"token_acc": 0.6840303318042609
},
{
"epoch": 0.534443866041992,
"grad_norm": 1.90625,
"learning_rate": 9.690137288396967e-06,
"loss": 0.9879467010498046,
"step": 3080,
"token_acc": 0.6884837459463735
},
{
"epoch": 0.5353114697206316,
"grad_norm": 1.828125,
"learning_rate": 9.66145679777617e-06,
"loss": 1.0037842750549317,
"step": 3085,
"token_acc": 0.683606172775142
},
{
"epoch": 0.5361790733992712,
"grad_norm": 1.90625,
"learning_rate": 9.632779094842104e-06,
"loss": 0.9850837707519531,
"step": 3090,
"token_acc": 0.6906727747296649
},
{
"epoch": 0.5370466770779108,
"grad_norm": 1.9765625,
"learning_rate": 9.604104415737309e-06,
"loss": 1.0082507133483887,
"step": 3095,
"token_acc": 0.6840075020949999
},
{
"epoch": 0.5379142807565505,
"grad_norm": 1.8828125,
"learning_rate": 9.575432996579424e-06,
"loss": 0.9955901145935059,
"step": 3100,
"token_acc": 0.6857669735637754
},
{
"epoch": 0.53878188443519,
"grad_norm": 1.765625,
"learning_rate": 9.546765073459245e-06,
"loss": 0.9778296470642089,
"step": 3105,
"token_acc": 0.6909004764286278
},
{
"epoch": 0.5396494881138296,
"grad_norm": 1.8515625,
"learning_rate": 9.51810088243879e-06,
"loss": 1.0080193519592284,
"step": 3110,
"token_acc": 0.6846056403760251
},
{
"epoch": 0.5405170917924692,
"grad_norm": 1.984375,
"learning_rate": 9.489440659549333e-06,
"loss": 1.0060848236083983,
"step": 3115,
"token_acc": 0.6859093319194062
},
{
"epoch": 0.5413846954711088,
"grad_norm": 1.8046875,
"learning_rate": 9.46078464078948e-06,
"loss": 0.9916322708129883,
"step": 3120,
"token_acc": 0.6852762549715146
},
{
"epoch": 0.5422522991497484,
"grad_norm": 1.8359375,
"learning_rate": 9.432133062123215e-06,
"loss": 0.9954086303710937,
"step": 3125,
"token_acc": 0.6874728114000975
},
{
"epoch": 0.543119902828388,
"grad_norm": 1.9609375,
"learning_rate": 9.40348615947796e-06,
"loss": 1.0074991226196288,
"step": 3130,
"token_acc": 0.681418392340236
},
{
"epoch": 0.5439875065070275,
"grad_norm": 1.8359375,
"learning_rate": 9.374844168742637e-06,
"loss": 1.0012994766235352,
"step": 3135,
"token_acc": 0.6855507942467278
},
{
"epoch": 0.5448551101856672,
"grad_norm": 1.8515625,
"learning_rate": 9.34620732576572e-06,
"loss": 1.0055302619934081,
"step": 3140,
"token_acc": 0.6849006828057107
},
{
"epoch": 0.5457227138643068,
"grad_norm": 1.9765625,
"learning_rate": 9.317575866353293e-06,
"loss": 0.9842160224914551,
"step": 3145,
"token_acc": 0.6883597598729373
},
{
"epoch": 0.5465903175429464,
"grad_norm": 1.8828125,
"learning_rate": 9.28895002626711e-06,
"loss": 1.0006650924682616,
"step": 3150,
"token_acc": 0.6856869530964238
},
{
"epoch": 0.547457921221586,
"grad_norm": 1.8359375,
"learning_rate": 9.260330041222656e-06,
"loss": 1.0168807983398438,
"step": 3155,
"token_acc": 0.6813513261486406
},
{
"epoch": 0.5483255249002256,
"grad_norm": 1.9453125,
"learning_rate": 9.231716146887203e-06,
"loss": 0.9734827041625976,
"step": 3160,
"token_acc": 0.693440864594789
},
{
"epoch": 0.5491931285788652,
"grad_norm": 1.90625,
"learning_rate": 9.203108578877866e-06,
"loss": 0.9954551696777344,
"step": 3165,
"token_acc": 0.685033919424062
},
{
"epoch": 0.5500607322575047,
"grad_norm": 1.859375,
"learning_rate": 9.174507572759672e-06,
"loss": 1.005191707611084,
"step": 3170,
"token_acc": 0.6829043026216833
},
{
"epoch": 0.5509283359361443,
"grad_norm": 1.859375,
"learning_rate": 9.145913364043604e-06,
"loss": 0.9932435035705567,
"step": 3175,
"token_acc": 0.6873352300905745
},
{
"epoch": 0.551795939614784,
"grad_norm": 1.90625,
"learning_rate": 9.117326188184696e-06,
"loss": 0.9784406661987305,
"step": 3180,
"token_acc": 0.691351665477983
},
{
"epoch": 0.5526635432934236,
"grad_norm": 1.8203125,
"learning_rate": 9.088746280580046e-06,
"loss": 1.0030339241027832,
"step": 3185,
"token_acc": 0.6849746393518213
},
{
"epoch": 0.5535311469720632,
"grad_norm": 1.9765625,
"learning_rate": 9.060173876566916e-06,
"loss": 1.0087509155273438,
"step": 3190,
"token_acc": 0.682428123685603
},
{
"epoch": 0.5543987506507028,
"grad_norm": 1.8125,
"learning_rate": 9.031609211420775e-06,
"loss": 1.0267830848693849,
"step": 3195,
"token_acc": 0.679297126313532
},
{
"epoch": 0.5552663543293423,
"grad_norm": 1.84375,
"learning_rate": 9.003052520353372e-06,
"loss": 1.0051657676696777,
"step": 3200,
"token_acc": 0.6841281932693093
},
{
"epoch": 0.5561339580079819,
"grad_norm": 1.8046875,
"learning_rate": 8.974504038510793e-06,
"loss": 1.005373477935791,
"step": 3205,
"token_acc": 0.6836050245944957
},
{
"epoch": 0.5570015616866215,
"grad_norm": 1.8515625,
"learning_rate": 8.945964000971525e-06,
"loss": 0.9805338859558106,
"step": 3210,
"token_acc": 0.6906489566678965
},
{
"epoch": 0.5578691653652611,
"grad_norm": 1.8359375,
"learning_rate": 8.917432642744519e-06,
"loss": 1.0035972595214844,
"step": 3215,
"token_acc": 0.6831262001280136
},
{
"epoch": 0.5587367690439008,
"grad_norm": 1.8515625,
"learning_rate": 8.888910198767265e-06,
"loss": 0.9910804748535156,
"step": 3220,
"token_acc": 0.6850102007945882
},
{
"epoch": 0.5596043727225404,
"grad_norm": 1.796875,
"learning_rate": 8.860396903903844e-06,
"loss": 0.9914836883544922,
"step": 3225,
"token_acc": 0.6887413708576804
},
{
"epoch": 0.56047197640118,
"grad_norm": 1.8203125,
"learning_rate": 8.831892992943e-06,
"loss": 1.0112311363220214,
"step": 3230,
"token_acc": 0.6815932803989763
},
{
"epoch": 0.5613395800798195,
"grad_norm": 1.9609375,
"learning_rate": 8.803398700596208e-06,
"loss": 1.007247543334961,
"step": 3235,
"token_acc": 0.683875897072066
},
{
"epoch": 0.5622071837584591,
"grad_norm": 1.90625,
"learning_rate": 8.774914261495738e-06,
"loss": 1.0004298210144043,
"step": 3240,
"token_acc": 0.6841970344985766
},
{
"epoch": 0.5630747874370987,
"grad_norm": 1.8359375,
"learning_rate": 8.746439910192735e-06,
"loss": 0.9888349533081054,
"step": 3245,
"token_acc": 0.6879607213774719
},
{
"epoch": 0.5639423911157383,
"grad_norm": 1.890625,
"learning_rate": 8.717975881155261e-06,
"loss": 1.0053036689758301,
"step": 3250,
"token_acc": 0.6843277773304346
},
{
"epoch": 0.5648099947943779,
"grad_norm": 1.8125,
"learning_rate": 8.689522408766395e-06,
"loss": 1.006988525390625,
"step": 3255,
"token_acc": 0.6830830648001983
},
{
"epoch": 0.5656775984730176,
"grad_norm": 1.90625,
"learning_rate": 8.661079727322276e-06,
"loss": 1.0136844635009765,
"step": 3260,
"token_acc": 0.6810806425442155
},
{
"epoch": 0.5665452021516572,
"grad_norm": 1.8671875,
"learning_rate": 8.632648071030198e-06,
"loss": 1.0038190841674806,
"step": 3265,
"token_acc": 0.6849343777015168
},
{
"epoch": 0.5674128058302967,
"grad_norm": 1.9296875,
"learning_rate": 8.604227674006661e-06,
"loss": 0.9864459991455078,
"step": 3270,
"token_acc": 0.6862783616540615
},
{
"epoch": 0.5682804095089363,
"grad_norm": 1.8515625,
"learning_rate": 8.57581877027546e-06,
"loss": 0.9800386428833008,
"step": 3275,
"token_acc": 0.6911001694197374
},
{
"epoch": 0.5691480131875759,
"grad_norm": 1.8203125,
"learning_rate": 8.547421593765744e-06,
"loss": 0.9790647506713868,
"step": 3280,
"token_acc": 0.6886432619731929
},
{
"epoch": 0.5700156168662155,
"grad_norm": 1.9609375,
"learning_rate": 8.519036378310098e-06,
"loss": 0.9918664932250977,
"step": 3285,
"token_acc": 0.6854125633826426
},
{
"epoch": 0.5708832205448551,
"grad_norm": 1.8984375,
"learning_rate": 8.490663357642615e-06,
"loss": 0.9926240921020508,
"step": 3290,
"token_acc": 0.687613955720063
},
{
"epoch": 0.5717508242234947,
"grad_norm": 1.875,
"learning_rate": 8.462302765396975e-06,
"loss": 0.9821521759033203,
"step": 3295,
"token_acc": 0.6893415493905228
},
{
"epoch": 0.5726184279021344,
"grad_norm": 1.828125,
"learning_rate": 8.433954835104513e-06,
"loss": 1.0029169082641602,
"step": 3300,
"token_acc": 0.6836323546782512
},
{
"epoch": 0.5734860315807739,
"grad_norm": 1.921875,
"learning_rate": 8.4056198001923e-06,
"loss": 0.9930968284606934,
"step": 3305,
"token_acc": 0.6868556180002426
},
{
"epoch": 0.5743536352594135,
"grad_norm": 1.921875,
"learning_rate": 8.377297893981224e-06,
"loss": 0.9897697448730469,
"step": 3310,
"token_acc": 0.6850893984441819
},
{
"epoch": 0.5752212389380531,
"grad_norm": 1.8359375,
"learning_rate": 8.348989349684077e-06,
"loss": 1.0004033088684081,
"step": 3315,
"token_acc": 0.6834115743155585
},
{
"epoch": 0.5760888426166927,
"grad_norm": 1.953125,
"learning_rate": 8.320694400403608e-06,
"loss": 1.0031415939331054,
"step": 3320,
"token_acc": 0.6850924472948079
},
{
"epoch": 0.5769564462953323,
"grad_norm": 1.9140625,
"learning_rate": 8.292413279130625e-06,
"loss": 0.9991157531738282,
"step": 3325,
"token_acc": 0.6833071420830172
},
{
"epoch": 0.5778240499739719,
"grad_norm": 1.828125,
"learning_rate": 8.264146218742074e-06,
"loss": 1.0167976379394532,
"step": 3330,
"token_acc": 0.6805396906454517
},
{
"epoch": 0.5786916536526114,
"grad_norm": 2.046875,
"learning_rate": 8.235893451999118e-06,
"loss": 1.0147868156433106,
"step": 3335,
"token_acc": 0.6792890262751159
},
{
"epoch": 0.5795592573312511,
"grad_norm": 1.8515625,
"learning_rate": 8.207655211545218e-06,
"loss": 1.0142845153808593,
"step": 3340,
"token_acc": 0.6800431959683763
},
{
"epoch": 0.5804268610098907,
"grad_norm": 1.8125,
"learning_rate": 8.179431729904223e-06,
"loss": 1.012403964996338,
"step": 3345,
"token_acc": 0.6800115019148074
},
{
"epoch": 0.5812944646885303,
"grad_norm": 1.9609375,
"learning_rate": 8.151223239478453e-06,
"loss": 0.9996941566467286,
"step": 3350,
"token_acc": 0.6826174967983586
},
{
"epoch": 0.5821620683671699,
"grad_norm": 1.90625,
"learning_rate": 8.123029972546782e-06,
"loss": 1.0093581199645996,
"step": 3355,
"token_acc": 0.6819460251429169
},
{
"epoch": 0.5830296720458095,
"grad_norm": 1.8671875,
"learning_rate": 8.09485216126273e-06,
"loss": 1.0049400329589844,
"step": 3360,
"token_acc": 0.6827514040478966
},
{
"epoch": 0.583897275724449,
"grad_norm": 1.8828125,
"learning_rate": 8.066690037652552e-06,
"loss": 0.9991744995117188,
"step": 3365,
"token_acc": 0.6828673913638729
},
{
"epoch": 0.5847648794030886,
"grad_norm": 1.921875,
"learning_rate": 8.03854383361332e-06,
"loss": 0.9949298858642578,
"step": 3370,
"token_acc": 0.6877322396851174
},
{
"epoch": 0.5856324830817282,
"grad_norm": 1.8515625,
"learning_rate": 8.010413780911022e-06,
"loss": 1.0077406883239746,
"step": 3375,
"token_acc": 0.6826122846664953
},
{
"epoch": 0.5865000867603679,
"grad_norm": 1.9140625,
"learning_rate": 7.982300111178648e-06,
"loss": 1.0013755798339843,
"step": 3380,
"token_acc": 0.6858611685344359
},
{
"epoch": 0.5873676904390075,
"grad_norm": 1.921875,
"learning_rate": 7.954203055914289e-06,
"loss": 0.9829542160034179,
"step": 3385,
"token_acc": 0.6888979370249728
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.8671875,
"learning_rate": 7.926122846479224e-06,
"loss": 0.993384838104248,
"step": 3390,
"token_acc": 0.6845540146288179
},
{
"epoch": 0.5891028977962867,
"grad_norm": 1.828125,
"learning_rate": 7.898059714096016e-06,
"loss": 0.956721305847168,
"step": 3395,
"token_acc": 0.698894211628116
},
{
"epoch": 0.5899705014749262,
"grad_norm": 1.84375,
"learning_rate": 7.870013889846608e-06,
"loss": 0.9920453071594239,
"step": 3400,
"token_acc": 0.686382917252157
},
{
"epoch": 0.5908381051535658,
"grad_norm": 1.84375,
"learning_rate": 7.841985604670427e-06,
"loss": 0.9913934707641602,
"step": 3405,
"token_acc": 0.6889823114142937
},
{
"epoch": 0.5917057088322054,
"grad_norm": 2.0,
"learning_rate": 7.81397508936247e-06,
"loss": 0.9880316734313965,
"step": 3410,
"token_acc": 0.6867218573075777
},
{
"epoch": 0.592573312510845,
"grad_norm": 1.9609375,
"learning_rate": 7.78598257457142e-06,
"loss": 0.9705442428588867,
"step": 3415,
"token_acc": 0.6910044977511245
},
{
"epoch": 0.5934409161894847,
"grad_norm": 1.8046875,
"learning_rate": 7.758008290797727e-06,
"loss": 0.9677356719970703,
"step": 3420,
"token_acc": 0.6959344774631571
},
{
"epoch": 0.5943085198681243,
"grad_norm": 1.8828125,
"learning_rate": 7.730052468391726e-06,
"loss": 0.9935931205749512,
"step": 3425,
"token_acc": 0.6881914107130855
},
{
"epoch": 0.5951761235467639,
"grad_norm": 1.921875,
"learning_rate": 7.702115337551733e-06,
"loss": 1.028738307952881,
"step": 3430,
"token_acc": 0.6780585491818397
},
{
"epoch": 0.5960437272254034,
"grad_norm": 1.921875,
"learning_rate": 7.674197128322151e-06,
"loss": 1.0033409118652343,
"step": 3435,
"token_acc": 0.6833868116036933
},
{
"epoch": 0.596911330904043,
"grad_norm": 1.8046875,
"learning_rate": 7.646298070591578e-06,
"loss": 1.0005316734313965,
"step": 3440,
"token_acc": 0.6868071389260162
},
{
"epoch": 0.5977789345826826,
"grad_norm": 1.7265625,
"learning_rate": 7.618418394090907e-06,
"loss": 0.9753083229064942,
"step": 3445,
"token_acc": 0.6916532970218273
},
{
"epoch": 0.5986465382613222,
"grad_norm": 1.8984375,
"learning_rate": 7.59055832839144e-06,
"loss": 0.9871037483215332,
"step": 3450,
"token_acc": 0.688853524302102
},
{
"epoch": 0.5995141419399618,
"grad_norm": 1.890625,
"learning_rate": 7.562718102903002e-06,
"loss": 0.9996206283569335,
"step": 3455,
"token_acc": 0.6837434616393171
},
{
"epoch": 0.6003817456186015,
"grad_norm": 1.875,
"learning_rate": 7.534897946872042e-06,
"loss": 1.0057412147521974,
"step": 3460,
"token_acc": 0.6834609861177597
},
{
"epoch": 0.601249349297241,
"grad_norm": 1.890625,
"learning_rate": 7.507098089379749e-06,
"loss": 0.990781593322754,
"step": 3465,
"token_acc": 0.6880597411570862
},
{
"epoch": 0.6021169529758806,
"grad_norm": 1.8125,
"learning_rate": 7.479318759340171e-06,
"loss": 0.9857464790344238,
"step": 3470,
"token_acc": 0.689186540346292
},
{
"epoch": 0.6029845566545202,
"grad_norm": 1.828125,
"learning_rate": 7.451560185498318e-06,
"loss": 0.9758604049682618,
"step": 3475,
"token_acc": 0.6898560948081264
},
{
"epoch": 0.6038521603331598,
"grad_norm": 1.8671875,
"learning_rate": 7.423822596428291e-06,
"loss": 0.9707001686096192,
"step": 3480,
"token_acc": 0.6923664838627496
},
{
"epoch": 0.6047197640117994,
"grad_norm": 1.796875,
"learning_rate": 7.396106220531398e-06,
"loss": 1.0107527732849122,
"step": 3485,
"token_acc": 0.6832125667742106
},
{
"epoch": 0.605587367690439,
"grad_norm": 1.9296875,
"learning_rate": 7.368411286034265e-06,
"loss": 1.011655330657959,
"step": 3490,
"token_acc": 0.6814160469354903
},
{
"epoch": 0.6064549713690786,
"grad_norm": 1.8515625,
"learning_rate": 7.340738020986961e-06,
"loss": 1.0010527610778808,
"step": 3495,
"token_acc": 0.6855910839856707
},
{
"epoch": 0.6073225750477182,
"grad_norm": 1.8359375,
"learning_rate": 7.313086653261126e-06,
"loss": 1.0003108024597167,
"step": 3500,
"token_acc": 0.6845823427706937
},
{
"epoch": 0.6081901787263578,
"grad_norm": 1.90625,
"learning_rate": 7.285457410548084e-06,
"loss": 1.0062461853027345,
"step": 3505,
"token_acc": 0.6841428111933098
},
{
"epoch": 0.6090577824049974,
"grad_norm": 1.8515625,
"learning_rate": 7.2578505203569775e-06,
"loss": 1.0086194038391114,
"step": 3510,
"token_acc": 0.6810051221539865
},
{
"epoch": 0.609925386083637,
"grad_norm": 1.8984375,
"learning_rate": 7.230266210012886e-06,
"loss": 0.9880249977111817,
"step": 3515,
"token_acc": 0.6883796750337954
},
{
"epoch": 0.6107929897622766,
"grad_norm": 1.828125,
"learning_rate": 7.20270470665497e-06,
"loss": 0.9953752517700195,
"step": 3520,
"token_acc": 0.6858969161328684
},
{
"epoch": 0.6116605934409162,
"grad_norm": 1.8359375,
"learning_rate": 7.1751662372345745e-06,
"loss": 1.0096102714538575,
"step": 3525,
"token_acc": 0.6835760199396563
},
{
"epoch": 0.6125281971195558,
"grad_norm": 1.9296875,
"learning_rate": 7.1476510285133824e-06,
"loss": 1.01358003616333,
"step": 3530,
"token_acc": 0.6808007033204843
},
{
"epoch": 0.6133958007981953,
"grad_norm": 1.8671875,
"learning_rate": 7.1201593070615385e-06,
"loss": 0.9893976211547851,
"step": 3535,
"token_acc": 0.6872642713325582
},
{
"epoch": 0.614263404476835,
"grad_norm": 1.859375,
"learning_rate": 7.0926912992557825e-06,
"loss": 1.0022952079772949,
"step": 3540,
"token_acc": 0.6836349718409973
},
{
"epoch": 0.6151310081554746,
"grad_norm": 1.875,
"learning_rate": 7.065247231277592e-06,
"loss": 0.9951557159423828,
"step": 3545,
"token_acc": 0.6851784290675207
},
{
"epoch": 0.6159986118341142,
"grad_norm": 1.828125,
"learning_rate": 7.037827329111313e-06,
"loss": 1.0185998916625976,
"step": 3550,
"token_acc": 0.6794846010484963
},
{
"epoch": 0.6168662155127538,
"grad_norm": 1.8671875,
"learning_rate": 7.010431818542298e-06,
"loss": 1.0109454154968263,
"step": 3555,
"token_acc": 0.682662396471839
},
{
"epoch": 0.6177338191913934,
"grad_norm": 1.78125,
"learning_rate": 6.983060925155056e-06,
"loss": 0.990286922454834,
"step": 3560,
"token_acc": 0.6862885957035297
},
{
"epoch": 0.618601422870033,
"grad_norm": 1.84375,
"learning_rate": 6.955714874331388e-06,
"loss": 0.9858268737792969,
"step": 3565,
"token_acc": 0.6888843703402467
},
{
"epoch": 0.6194690265486725,
"grad_norm": 1.8203125,
"learning_rate": 6.928393891248529e-06,
"loss": 1.0141701698303223,
"step": 3570,
"token_acc": 0.6834784012484361
},
{
"epoch": 0.6203366302273121,
"grad_norm": 1.8125,
"learning_rate": 6.901098200877301e-06,
"loss": 0.967597770690918,
"step": 3575,
"token_acc": 0.6944241377018675
},
{
"epoch": 0.6212042339059518,
"grad_norm": 2.015625,
"learning_rate": 6.873828027980256e-06,
"loss": 0.9855113983154297,
"step": 3580,
"token_acc": 0.6862524757342923
},
{
"epoch": 0.6220718375845914,
"grad_norm": 1.9140625,
"learning_rate": 6.846583597109817e-06,
"loss": 0.9973045349121094,
"step": 3585,
"token_acc": 0.684243293722762
},
{
"epoch": 0.622939441263231,
"grad_norm": 1.9140625,
"learning_rate": 6.819365132606459e-06,
"loss": 0.9799047470092773,
"step": 3590,
"token_acc": 0.6892774554748672
},
{
"epoch": 0.6238070449418706,
"grad_norm": 1.75,
"learning_rate": 6.7921728585968215e-06,
"loss": 1.0055973052978515,
"step": 3595,
"token_acc": 0.6808834120188846
},
{
"epoch": 0.6246746486205101,
"grad_norm": 1.7265625,
"learning_rate": 6.765006998991889e-06,
"loss": 0.9758973121643066,
"step": 3600,
"token_acc": 0.6907637655417407
},
{
"epoch": 0.6255422522991497,
"grad_norm": 1.890625,
"learning_rate": 6.737867777485136e-06,
"loss": 1.0146740913391112,
"step": 3605,
"token_acc": 0.6811284150100989
},
{
"epoch": 0.6264098559777893,
"grad_norm": 1.90625,
"learning_rate": 6.710755417550698e-06,
"loss": 0.9987593650817871,
"step": 3610,
"token_acc": 0.6847243880941081
},
{
"epoch": 0.6272774596564289,
"grad_norm": 1.953125,
"learning_rate": 6.683670142441514e-06,
"loss": 0.9979434013366699,
"step": 3615,
"token_acc": 0.6853054139387396
},
{
"epoch": 0.6281450633350686,
"grad_norm": 1.8203125,
"learning_rate": 6.6566121751875e-06,
"loss": 0.9827108383178711,
"step": 3620,
"token_acc": 0.6896419200084816
},
{
"epoch": 0.6290126670137082,
"grad_norm": 1.875,
"learning_rate": 6.6295817385937104e-06,
"loss": 0.9979496002197266,
"step": 3625,
"token_acc": 0.6842961073185775
},
{
"epoch": 0.6298802706923478,
"grad_norm": 1.921875,
"learning_rate": 6.602579055238501e-06,
"loss": 0.9886339187622071,
"step": 3630,
"token_acc": 0.6858520767782801
},
{
"epoch": 0.6307478743709873,
"grad_norm": 1.9375,
"learning_rate": 6.575604347471696e-06,
"loss": 1.0002639770507813,
"step": 3635,
"token_acc": 0.6843852893576651
},
{
"epoch": 0.6316154780496269,
"grad_norm": 1.8984375,
"learning_rate": 6.548657837412764e-06,
"loss": 0.9971570014953614,
"step": 3640,
"token_acc": 0.6848995111352526
},
{
"epoch": 0.6324830817282665,
"grad_norm": 1.90625,
"learning_rate": 6.5217397469489765e-06,
"loss": 0.9921416282653809,
"step": 3645,
"token_acc": 0.6852397462075014
},
{
"epoch": 0.6333506854069061,
"grad_norm": 1.7421875,
"learning_rate": 6.494850297733591e-06,
"loss": 1.0081979751586914,
"step": 3650,
"token_acc": 0.6827699225310147
},
{
"epoch": 0.6342182890855457,
"grad_norm": 1.953125,
"learning_rate": 6.467989711184021e-06,
"loss": 0.9944825172424316,
"step": 3655,
"token_acc": 0.6852892695976437
},
{
"epoch": 0.6350858927641854,
"grad_norm": 1.875,
"learning_rate": 6.4411582084800215e-06,
"loss": 0.9934005737304688,
"step": 3660,
"token_acc": 0.6851075268817204
},
{
"epoch": 0.635953496442825,
"grad_norm": 1.90625,
"learning_rate": 6.414356010561853e-06,
"loss": 0.9901107788085938,
"step": 3665,
"token_acc": 0.6862788024738656
},
{
"epoch": 0.6368211001214645,
"grad_norm": 1.8359375,
"learning_rate": 6.387583338128471e-06,
"loss": 1.0017055511474608,
"step": 3670,
"token_acc": 0.6829901814126799
},
{
"epoch": 0.6376887038001041,
"grad_norm": 1.84375,
"learning_rate": 6.3608404116357096e-06,
"loss": 1.0016436576843262,
"step": 3675,
"token_acc": 0.6823521311023893
},
{
"epoch": 0.6385563074787437,
"grad_norm": 1.8515625,
"learning_rate": 6.334127451294461e-06,
"loss": 0.995360279083252,
"step": 3680,
"token_acc": 0.6857756640635555
},
{
"epoch": 0.6394239111573833,
"grad_norm": 1.8984375,
"learning_rate": 6.307444677068869e-06,
"loss": 1.0071782112121581,
"step": 3685,
"token_acc": 0.6841917710589074
},
{
"epoch": 0.6402915148360229,
"grad_norm": 1.7890625,
"learning_rate": 6.280792308674512e-06,
"loss": 0.9938779830932617,
"step": 3690,
"token_acc": 0.6878277558523004
},
{
"epoch": 0.6411591185146625,
"grad_norm": 1.9140625,
"learning_rate": 6.254170565576596e-06,
"loss": 0.9867862701416016,
"step": 3695,
"token_acc": 0.6865127083902706
},
{
"epoch": 0.6420267221933021,
"grad_norm": 1.828125,
"learning_rate": 6.227579666988149e-06,
"loss": 0.9970829010009765,
"step": 3700,
"token_acc": 0.6850032654838358
},
{
"epoch": 0.6428943258719417,
"grad_norm": 1.84375,
"learning_rate": 6.201019831868209e-06,
"loss": 0.9995267868041993,
"step": 3705,
"token_acc": 0.6846874095894374
},
{
"epoch": 0.6437619295505813,
"grad_norm": 1.84375,
"learning_rate": 6.174491278920034e-06,
"loss": 0.9917936325073242,
"step": 3710,
"token_acc": 0.6880288247439375
},
{
"epoch": 0.6446295332292209,
"grad_norm": 1.8046875,
"learning_rate": 6.147994226589287e-06,
"loss": 0.9787176132202149,
"step": 3715,
"token_acc": 0.6913672458526614
},
{
"epoch": 0.6454971369078605,
"grad_norm": 1.8359375,
"learning_rate": 6.121528893062246e-06,
"loss": 1.0017691612243653,
"step": 3720,
"token_acc": 0.6832628692610052
},
{
"epoch": 0.6463647405865001,
"grad_norm": 1.859375,
"learning_rate": 6.095095496264001e-06,
"loss": 0.997169303894043,
"step": 3725,
"token_acc": 0.6856908315278095
},
{
"epoch": 0.6472323442651396,
"grad_norm": 1.828125,
"learning_rate": 6.068694253856675e-06,
"loss": 0.9935990333557129,
"step": 3730,
"token_acc": 0.6857996759957544
},
{
"epoch": 0.6480999479437792,
"grad_norm": 1.9296875,
"learning_rate": 6.04232538323761e-06,
"loss": 0.995047664642334,
"step": 3735,
"token_acc": 0.6855156587473002
},
{
"epoch": 0.6489675516224189,
"grad_norm": 1.8671875,
"learning_rate": 6.015989101537586e-06,
"loss": 0.9964488983154297,
"step": 3740,
"token_acc": 0.6852130600180629
},
{
"epoch": 0.6498351553010585,
"grad_norm": 1.8046875,
"learning_rate": 5.989685625619039e-06,
"loss": 1.001780128479004,
"step": 3745,
"token_acc": 0.6852903955410754
},
{
"epoch": 0.6507027589796981,
"grad_norm": 1.8359375,
"learning_rate": 5.963415172074272e-06,
"loss": 0.9760993003845215,
"step": 3750,
"token_acc": 0.6886531679352932
},
{
"epoch": 0.6515703626583377,
"grad_norm": 1.84375,
"learning_rate": 5.937177957223661e-06,
"loss": 0.9900795936584472,
"step": 3755,
"token_acc": 0.6872612410739596
},
{
"epoch": 0.6524379663369773,
"grad_norm": 1.84375,
"learning_rate": 5.910974197113892e-06,
"loss": 1.001762866973877,
"step": 3760,
"token_acc": 0.6836190449665084
},
{
"epoch": 0.6533055700156168,
"grad_norm": 1.765625,
"learning_rate": 5.884804107516169e-06,
"loss": 0.9720080375671387,
"step": 3765,
"token_acc": 0.6952360976377127
},
{
"epoch": 0.6541731736942564,
"grad_norm": 1.890625,
"learning_rate": 5.858667903924439e-06,
"loss": 0.984315013885498,
"step": 3770,
"token_acc": 0.6863298561396332
},
{
"epoch": 0.655040777372896,
"grad_norm": 1.796875,
"learning_rate": 5.8325658015536205e-06,
"loss": 1.002072525024414,
"step": 3775,
"token_acc": 0.6841665768774916
},
{
"epoch": 0.6559083810515357,
"grad_norm": 1.875,
"learning_rate": 5.8064980153378335e-06,
"loss": 0.9898612976074219,
"step": 3780,
"token_acc": 0.686063766347234
},
{
"epoch": 0.6567759847301753,
"grad_norm": 1.84375,
"learning_rate": 5.780464759928623e-06,
"loss": 1.0027915000915528,
"step": 3785,
"token_acc": 0.6840225269854513
},
{
"epoch": 0.6576435884088149,
"grad_norm": 1.8984375,
"learning_rate": 5.7544662496931935e-06,
"loss": 0.9923629760742188,
"step": 3790,
"token_acc": 0.6860443020793746
},
{
"epoch": 0.6585111920874545,
"grad_norm": 1.8828125,
"learning_rate": 5.7285026987126526e-06,
"loss": 1.0032987594604492,
"step": 3795,
"token_acc": 0.683890081813487
},
{
"epoch": 0.659378795766094,
"grad_norm": 1.8203125,
"learning_rate": 5.7025743207802345e-06,
"loss": 1.0057662963867187,
"step": 3800,
"token_acc": 0.6835548723113827
},
{
"epoch": 0.6602463994447336,
"grad_norm": 1.8125,
"learning_rate": 5.676681329399543e-06,
"loss": 0.9910049438476562,
"step": 3805,
"token_acc": 0.6876422267858134
},
{
"epoch": 0.6611140031233732,
"grad_norm": 1.8359375,
"learning_rate": 5.650823937782803e-06,
"loss": 1.0060483932495117,
"step": 3810,
"token_acc": 0.6838536439827497
},
{
"epoch": 0.6619816068020128,
"grad_norm": 1.7890625,
"learning_rate": 5.625002358849096e-06,
"loss": 0.9882902145385742,
"step": 3815,
"token_acc": 0.6876298080917173
},
{
"epoch": 0.6628492104806525,
"grad_norm": 1.7734375,
"learning_rate": 5.599216805222609e-06,
"loss": 0.9882322311401367,
"step": 3820,
"token_acc": 0.6853805976085054
},
{
"epoch": 0.6637168141592921,
"grad_norm": 1.8203125,
"learning_rate": 5.573467489230879e-06,
"loss": 1.0068046569824218,
"step": 3825,
"token_acc": 0.6833182949170152
},
{
"epoch": 0.6645844178379317,
"grad_norm": 1.796875,
"learning_rate": 5.547754622903059e-06,
"loss": 0.995240306854248,
"step": 3830,
"token_acc": 0.6859673775279661
},
{
"epoch": 0.6654520215165712,
"grad_norm": 1.84375,
"learning_rate": 5.522078417968151e-06,
"loss": 0.9991961479187011,
"step": 3835,
"token_acc": 0.6834129511677283
},
{
"epoch": 0.6663196251952108,
"grad_norm": 1.7421875,
"learning_rate": 5.496439085853282e-06,
"loss": 0.9904547691345215,
"step": 3840,
"token_acc": 0.6883501895504571
},
{
"epoch": 0.6671872288738504,
"grad_norm": 1.8515625,
"learning_rate": 5.470836837681955e-06,
"loss": 0.9769336700439453,
"step": 3845,
"token_acc": 0.6896110755886686
},
{
"epoch": 0.66805483255249,
"grad_norm": 1.8984375,
"learning_rate": 5.445271884272303e-06,
"loss": 1.0078944206237792,
"step": 3850,
"token_acc": 0.6813604508440128
},
{
"epoch": 0.6689224362311296,
"grad_norm": 1.8515625,
"learning_rate": 5.4197444361353675e-06,
"loss": 1.0107319831848145,
"step": 3855,
"token_acc": 0.681410079867805
},
{
"epoch": 0.6697900399097693,
"grad_norm": 1.828125,
"learning_rate": 5.394254703473354e-06,
"loss": 0.964967918395996,
"step": 3860,
"token_acc": 0.6943124165554072
},
{
"epoch": 0.6706576435884088,
"grad_norm": 1.890625,
"learning_rate": 5.368802896177911e-06,
"loss": 0.9789441108703614,
"step": 3865,
"token_acc": 0.6884867885627476
},
{
"epoch": 0.6715252472670484,
"grad_norm": 1.8828125,
"learning_rate": 5.343389223828392e-06,
"loss": 0.9796417236328125,
"step": 3870,
"token_acc": 0.6890180582340962
},
{
"epoch": 0.672392850945688,
"grad_norm": 1.8828125,
"learning_rate": 5.318013895690131e-06,
"loss": 0.9787491798400879,
"step": 3875,
"token_acc": 0.6898387987482578
},
{
"epoch": 0.6732604546243276,
"grad_norm": 1.9375,
"learning_rate": 5.292677120712726e-06,
"loss": 0.9852935791015625,
"step": 3880,
"token_acc": 0.6870102408889257
},
{
"epoch": 0.6741280583029672,
"grad_norm": 1.9921875,
"learning_rate": 5.267379107528311e-06,
"loss": 0.9924633026123046,
"step": 3885,
"token_acc": 0.6847909474491753
},
{
"epoch": 0.6749956619816068,
"grad_norm": 1.859375,
"learning_rate": 5.242120064449845e-06,
"loss": 0.9971447944641113,
"step": 3890,
"token_acc": 0.6849295083489171
},
{
"epoch": 0.6758632656602463,
"grad_norm": 1.7890625,
"learning_rate": 5.216900199469391e-06,
"loss": 0.9826061248779296,
"step": 3895,
"token_acc": 0.6904354672313623
},
{
"epoch": 0.676730869338886,
"grad_norm": 1.859375,
"learning_rate": 5.191719720256407e-06,
"loss": 0.9958490371704102,
"step": 3900,
"token_acc": 0.6858113156286083
},
{
"epoch": 0.6775984730175256,
"grad_norm": 1.796875,
"learning_rate": 5.166578834156031e-06,
"loss": 0.9950273513793946,
"step": 3905,
"token_acc": 0.6859179612865821
},
{
"epoch": 0.6784660766961652,
"grad_norm": 1.859375,
"learning_rate": 5.14147774818738e-06,
"loss": 0.990997314453125,
"step": 3910,
"token_acc": 0.6872399539201735
},
{
"epoch": 0.6793336803748048,
"grad_norm": 1.921875,
"learning_rate": 5.1164166690418435e-06,
"loss": 0.9976764678955078,
"step": 3915,
"token_acc": 0.6839109763660167
},
{
"epoch": 0.6802012840534444,
"grad_norm": 1.8125,
"learning_rate": 5.091395803081376e-06,
"loss": 0.980461311340332,
"step": 3920,
"token_acc": 0.6900414130464994
},
{
"epoch": 0.681068887732084,
"grad_norm": 1.8125,
"learning_rate": 5.066415356336807e-06,
"loss": 1.005615234375,
"step": 3925,
"token_acc": 0.681454565176126
},
{
"epoch": 0.6819364914107235,
"grad_norm": 2.015625,
"learning_rate": 5.041475534506131e-06,
"loss": 0.993968391418457,
"step": 3930,
"token_acc": 0.6851136910077625
},
{
"epoch": 0.6828040950893631,
"grad_norm": 1.9453125,
"learning_rate": 5.01657654295284e-06,
"loss": 1.0097810745239257,
"step": 3935,
"token_acc": 0.6827556629888105
},
{
"epoch": 0.6836716987680028,
"grad_norm": 1.6953125,
"learning_rate": 4.991718586704196e-06,
"loss": 0.9924948692321778,
"step": 3940,
"token_acc": 0.6904830287206266
},
{
"epoch": 0.6845393024466424,
"grad_norm": 1.9296875,
"learning_rate": 4.9669018704495696e-06,
"loss": 0.9993215560913086,
"step": 3945,
"token_acc": 0.6851340222617751
},
{
"epoch": 0.685406906125282,
"grad_norm": 1.796875,
"learning_rate": 4.9421265985387475e-06,
"loss": 0.9833191871643067,
"step": 3950,
"token_acc": 0.6883911507101707
},
{
"epoch": 0.6862745098039216,
"grad_norm": 1.9296875,
"learning_rate": 4.9173929749802465e-06,
"loss": 1.0078816413879395,
"step": 3955,
"token_acc": 0.6850704225352112
},
{
"epoch": 0.6871421134825612,
"grad_norm": 1.875,
"learning_rate": 4.892701203439635e-06,
"loss": 1.0204105377197266,
"step": 3960,
"token_acc": 0.6796032157676348
},
{
"epoch": 0.6880097171612007,
"grad_norm": 1.9453125,
"learning_rate": 4.868051487237858e-06,
"loss": 0.973170280456543,
"step": 3965,
"token_acc": 0.6899492217684071
},
{
"epoch": 0.6888773208398403,
"grad_norm": 1.7421875,
"learning_rate": 4.843444029349564e-06,
"loss": 0.9647638320922851,
"step": 3970,
"token_acc": 0.6957806900520547
},
{
"epoch": 0.6897449245184799,
"grad_norm": 1.890625,
"learning_rate": 4.8188790324014274e-06,
"loss": 0.9891746520996094,
"step": 3975,
"token_acc": 0.685054815133276
},
{
"epoch": 0.6906125281971196,
"grad_norm": 1.8828125,
"learning_rate": 4.794356698670488e-06,
"loss": 0.9468636512756348,
"step": 3980,
"token_acc": 0.6995367131713369
},
{
"epoch": 0.6914801318757592,
"grad_norm": 1.8359375,
"learning_rate": 4.769877230082476e-06,
"loss": 0.9977554321289063,
"step": 3985,
"token_acc": 0.6852840924340428
},
{
"epoch": 0.6923477355543988,
"grad_norm": 1.9140625,
"learning_rate": 4.74544082821016e-06,
"loss": 1.004736328125,
"step": 3990,
"token_acc": 0.6820171598669235
},
{
"epoch": 0.6932153392330384,
"grad_norm": 1.8828125,
"learning_rate": 4.721047694271676e-06,
"loss": 1.0017391204833985,
"step": 3995,
"token_acc": 0.684533952315144
},
{
"epoch": 0.6940829429116779,
"grad_norm": 1.8828125,
"learning_rate": 4.69669802912888e-06,
"loss": 0.9762969970703125,
"step": 4000,
"token_acc": 0.6934091245841135
},
{
"epoch": 0.6949505465903175,
"grad_norm": 1.7890625,
"learning_rate": 4.672392033285695e-06,
"loss": 1.0025498390197753,
"step": 4005,
"token_acc": 0.6851401316784188
},
{
"epoch": 0.6958181502689571,
"grad_norm": 1.875,
"learning_rate": 4.648129906886445e-06,
"loss": 1.0146098136901855,
"step": 4010,
"token_acc": 0.6792288989232372
},
{
"epoch": 0.6966857539475967,
"grad_norm": 1.78125,
"learning_rate": 4.623911849714226e-06,
"loss": 1.0010202407836915,
"step": 4015,
"token_acc": 0.6856064118699079
},
{
"epoch": 0.6975533576262364,
"grad_norm": 1.8671875,
"learning_rate": 4.599738061189244e-06,
"loss": 1.0105598449707032,
"step": 4020,
"token_acc": 0.682280948032655
},
{
"epoch": 0.698420961304876,
"grad_norm": 1.890625,
"learning_rate": 4.575608740367189e-06,
"loss": 0.9960094451904297,
"step": 4025,
"token_acc": 0.6854422794662214
},
{
"epoch": 0.6992885649835155,
"grad_norm": 1.875,
"learning_rate": 4.551524085937582e-06,
"loss": 0.9695888519287109,
"step": 4030,
"token_acc": 0.6933236382866208
},
{
"epoch": 0.7001561686621551,
"grad_norm": 1.796875,
"learning_rate": 4.527484296222149e-06,
"loss": 0.9828217506408692,
"step": 4035,
"token_acc": 0.6878763576059919
},
{
"epoch": 0.7010237723407947,
"grad_norm": 1.8984375,
"learning_rate": 4.503489569173179e-06,
"loss": 0.9933969497680664,
"step": 4040,
"token_acc": 0.6868701758147513
},
{
"epoch": 0.7018913760194343,
"grad_norm": 1.8203125,
"learning_rate": 4.479540102371904e-06,
"loss": 1.0078033447265624,
"step": 4045,
"token_acc": 0.6846505259554748
},
{
"epoch": 0.7027589796980739,
"grad_norm": 1.7578125,
"learning_rate": 4.455636093026865e-06,
"loss": 0.9774109840393066,
"step": 4050,
"token_acc": 0.69020612269789
},
{
"epoch": 0.7036265833767135,
"grad_norm": 1.8125,
"learning_rate": 4.431777737972287e-06,
"loss": 0.9925678253173829,
"step": 4055,
"token_acc": 0.6882775426446069
},
{
"epoch": 0.7044941870553532,
"grad_norm": 1.796875,
"learning_rate": 4.4079652336664645e-06,
"loss": 0.9903898239135742,
"step": 4060,
"token_acc": 0.6858611892801725
},
{
"epoch": 0.7053617907339927,
"grad_norm": 1.7734375,
"learning_rate": 4.384198776190137e-06,
"loss": 0.9989794731140137,
"step": 4065,
"token_acc": 0.6841588232951453
},
{
"epoch": 0.7062293944126323,
"grad_norm": 1.859375,
"learning_rate": 4.360478561244885e-06,
"loss": 0.982159423828125,
"step": 4070,
"token_acc": 0.6892994694174365
},
{
"epoch": 0.7070969980912719,
"grad_norm": 1.84375,
"learning_rate": 4.336804784151505e-06,
"loss": 0.9847228050231933,
"step": 4075,
"token_acc": 0.6867390010281942
},
{
"epoch": 0.7079646017699115,
"grad_norm": 1.8359375,
"learning_rate": 4.313177639848408e-06,
"loss": 1.0108787536621093,
"step": 4080,
"token_acc": 0.6838802388894016
},
{
"epoch": 0.7088322054485511,
"grad_norm": 1.90625,
"learning_rate": 4.2895973228900154e-06,
"loss": 0.9985545158386231,
"step": 4085,
"token_acc": 0.6844401828768361
},
{
"epoch": 0.7096998091271907,
"grad_norm": 1.859375,
"learning_rate": 4.2660640274451545e-06,
"loss": 0.9826979637145996,
"step": 4090,
"token_acc": 0.6863773965691221
},
{
"epoch": 0.7105674128058302,
"grad_norm": 1.84375,
"learning_rate": 4.242577947295462e-06,
"loss": 0.9989730834960937,
"step": 4095,
"token_acc": 0.6846098407914565
},
{
"epoch": 0.7114350164844699,
"grad_norm": 1.828125,
"learning_rate": 4.219139275833783e-06,
"loss": 1.000558090209961,
"step": 4100,
"token_acc": 0.6841488044823767
},
{
"epoch": 0.7123026201631095,
"grad_norm": 1.78125,
"learning_rate": 4.1957482060625865e-06,
"loss": 0.9966065406799316,
"step": 4105,
"token_acc": 0.6879207664422579
},
{
"epoch": 0.7131702238417491,
"grad_norm": 1.8203125,
"learning_rate": 4.172404930592372e-06,
"loss": 0.9852560997009278,
"step": 4110,
"token_acc": 0.6896971139227118
},
{
"epoch": 0.7140378275203887,
"grad_norm": 1.765625,
"learning_rate": 4.149109641640079e-06,
"loss": 1.001215362548828,
"step": 4115,
"token_acc": 0.6853088591189812
},
{
"epoch": 0.7149054311990283,
"grad_norm": 1.78125,
"learning_rate": 4.1258625310275145e-06,
"loss": 1.0101828575134277,
"step": 4120,
"token_acc": 0.681881495767706
},
{
"epoch": 0.7157730348776679,
"grad_norm": 1.8828125,
"learning_rate": 4.102663790179764e-06,
"loss": 0.9940977096557617,
"step": 4125,
"token_acc": 0.6862203534229258
},
{
"epoch": 0.7166406385563074,
"grad_norm": 1.8984375,
"learning_rate": 4.079513610123619e-06,
"loss": 0.9920468330383301,
"step": 4130,
"token_acc": 0.6872316721917288
},
{
"epoch": 0.717508242234947,
"grad_norm": 1.7734375,
"learning_rate": 4.056412181486003e-06,
"loss": 0.9854813575744629,
"step": 4135,
"token_acc": 0.6900252525252525
},
{
"epoch": 0.7183758459135867,
"grad_norm": 1.953125,
"learning_rate": 4.033359694492411e-06,
"loss": 0.9985934257507324,
"step": 4140,
"token_acc": 0.6840057676088909
},
{
"epoch": 0.7192434495922263,
"grad_norm": 1.8046875,
"learning_rate": 4.010356338965323e-06,
"loss": 0.9948851585388183,
"step": 4145,
"token_acc": 0.6854739461477084
},
{
"epoch": 0.7201110532708659,
"grad_norm": 1.96875,
"learning_rate": 3.98740230432266e-06,
"loss": 0.9762655258178711,
"step": 4150,
"token_acc": 0.6923338872694581
},
{
"epoch": 0.7209786569495055,
"grad_norm": 1.9140625,
"learning_rate": 3.9644977795762175e-06,
"loss": 0.988780403137207,
"step": 4155,
"token_acc": 0.6866318047733977
},
{
"epoch": 0.721846260628145,
"grad_norm": 1.953125,
"learning_rate": 3.941642953330102e-06,
"loss": 0.9889546394348144,
"step": 4160,
"token_acc": 0.6865713642503377
},
{
"epoch": 0.7227138643067846,
"grad_norm": 1.859375,
"learning_rate": 3.9188380137791934e-06,
"loss": 0.9839936256408691,
"step": 4165,
"token_acc": 0.6882079424724933
},
{
"epoch": 0.7235814679854242,
"grad_norm": 1.8984375,
"learning_rate": 3.896083148707579e-06,
"loss": 0.9844943046569824,
"step": 4170,
"token_acc": 0.6876788477073265
},
{
"epoch": 0.7244490716640638,
"grad_norm": 1.8515625,
"learning_rate": 3.87337854548702e-06,
"loss": 0.9963854789733887,
"step": 4175,
"token_acc": 0.6848597774936757
},
{
"epoch": 0.7253166753427035,
"grad_norm": 1.8515625,
"learning_rate": 3.8507243910754015e-06,
"loss": 1.0020368576049805,
"step": 4180,
"token_acc": 0.6843103494659816
},
{
"epoch": 0.7261842790213431,
"grad_norm": 1.8671875,
"learning_rate": 3.828120872015193e-06,
"loss": 1.0066667556762696,
"step": 4185,
"token_acc": 0.6836107139967091
},
{
"epoch": 0.7270518826999827,
"grad_norm": 1.9140625,
"learning_rate": 3.8055681744319173e-06,
"loss": 1.0011329650878906,
"step": 4190,
"token_acc": 0.6850806824639539
},
{
"epoch": 0.7279194863786222,
"grad_norm": 1.8828125,
"learning_rate": 3.783066484032615e-06,
"loss": 1.0011292457580567,
"step": 4195,
"token_acc": 0.6820659087561429
},
{
"epoch": 0.7287870900572618,
"grad_norm": 1.8203125,
"learning_rate": 3.7606159861043123e-06,
"loss": 1.0115188598632812,
"step": 4200,
"token_acc": 0.6830158518715865
},
{
"epoch": 0.7296546937359014,
"grad_norm": 1.8125,
"learning_rate": 3.738216865512496e-06,
"loss": 0.9878059387207031,
"step": 4205,
"token_acc": 0.6888757571280839
},
{
"epoch": 0.730522297414541,
"grad_norm": 1.8046875,
"learning_rate": 3.7158693066996066e-06,
"loss": 0.9820815086364746,
"step": 4210,
"token_acc": 0.6891283735961433
},
{
"epoch": 0.7313899010931806,
"grad_norm": 1.8515625,
"learning_rate": 3.69357349368349e-06,
"loss": 1.001258373260498,
"step": 4215,
"token_acc": 0.6836823676196354
},
{
"epoch": 0.7322575047718203,
"grad_norm": 1.796875,
"learning_rate": 3.6713296100559084e-06,
"loss": 1.0037827491760254,
"step": 4220,
"token_acc": 0.6847890011370023
},
{
"epoch": 0.7331251084504599,
"grad_norm": 1.9296875,
"learning_rate": 3.649137838981014e-06,
"loss": 0.9784846305847168,
"step": 4225,
"token_acc": 0.6925998220664197
},
{
"epoch": 0.7339927121290994,
"grad_norm": 1.9453125,
"learning_rate": 3.6269983631938476e-06,
"loss": 0.98970947265625,
"step": 4230,
"token_acc": 0.6865225040519761
},
{
"epoch": 0.734860315807739,
"grad_norm": 1.828125,
"learning_rate": 3.604911364998832e-06,
"loss": 1.0065629005432128,
"step": 4235,
"token_acc": 0.6821462879099767
},
{
"epoch": 0.7357279194863786,
"grad_norm": 1.8203125,
"learning_rate": 3.582877026268269e-06,
"loss": 1.0006741523742675,
"step": 4240,
"token_acc": 0.6835158700622043
},
{
"epoch": 0.7365955231650182,
"grad_norm": 1.875,
"learning_rate": 3.560895528440844e-06,
"loss": 0.9968295097351074,
"step": 4245,
"token_acc": 0.6860261131570137
},
{
"epoch": 0.7374631268436578,
"grad_norm": 1.8984375,
"learning_rate": 3.5389670525201335e-06,
"loss": 0.994806957244873,
"step": 4250,
"token_acc": 0.6844711335861778
},
{
"epoch": 0.7383307305222974,
"grad_norm": 1.9296875,
"learning_rate": 3.5170917790731084e-06,
"loss": 0.9853558540344238,
"step": 4255,
"token_acc": 0.690081677065686
},
{
"epoch": 0.7391983342009371,
"grad_norm": 1.796875,
"learning_rate": 3.4952698882286564e-06,
"loss": 1.002675437927246,
"step": 4260,
"token_acc": 0.6839060402684564
},
{
"epoch": 0.7400659378795766,
"grad_norm": 1.890625,
"learning_rate": 3.473501559676088e-06,
"loss": 1.006124496459961,
"step": 4265,
"token_acc": 0.6798628939749822
},
{
"epoch": 0.7409335415582162,
"grad_norm": 1.875,
"learning_rate": 3.4517869726636667e-06,
"loss": 0.9663874626159668,
"step": 4270,
"token_acc": 0.6933329657757991
},
{
"epoch": 0.7418011452368558,
"grad_norm": 1.90625,
"learning_rate": 3.4301263059971234e-06,
"loss": 0.9783464431762695,
"step": 4275,
"token_acc": 0.690311533509431
},
{
"epoch": 0.7426687489154954,
"grad_norm": 1.90625,
"learning_rate": 3.408519738038202e-06,
"loss": 0.9907986640930175,
"step": 4280,
"token_acc": 0.6858886450905102
},
{
"epoch": 0.743536352594135,
"grad_norm": 1.8515625,
"learning_rate": 3.3869674467031633e-06,
"loss": 0.9949624061584472,
"step": 4285,
"token_acc": 0.6854110544056531
},
{
"epoch": 0.7444039562727746,
"grad_norm": 1.859375,
"learning_rate": 3.3654696094613424e-06,
"loss": 1.0062894821166992,
"step": 4290,
"token_acc": 0.68370965995235
},
{
"epoch": 0.7452715599514141,
"grad_norm": 1.890625,
"learning_rate": 3.3440264033336787e-06,
"loss": 0.9806596755981445,
"step": 4295,
"token_acc": 0.6898993765722411
},
{
"epoch": 0.7461391636300538,
"grad_norm": 1.859375,
"learning_rate": 3.3226380048912586e-06,
"loss": 0.9737249374389648,
"step": 4300,
"token_acc": 0.6907929820819113
},
{
"epoch": 0.7470067673086934,
"grad_norm": 1.765625,
"learning_rate": 3.3013045902538634e-06,
"loss": 0.975331974029541,
"step": 4305,
"token_acc": 0.6905592319015476
},
{
"epoch": 0.747874370987333,
"grad_norm": 1.9609375,
"learning_rate": 3.2800263350885165e-06,
"loss": 0.9860298156738281,
"step": 4310,
"token_acc": 0.6891362690327527
},
{
"epoch": 0.7487419746659726,
"grad_norm": 1.859375,
"learning_rate": 3.2588034146080404e-06,
"loss": 0.9883022308349609,
"step": 4315,
"token_acc": 0.6857588710224575
},
{
"epoch": 0.7496095783446122,
"grad_norm": 1.828125,
"learning_rate": 3.2376360035696085e-06,
"loss": 1.0138681411743165,
"step": 4320,
"token_acc": 0.6818151051185206
},
{
"epoch": 0.7504771820232518,
"grad_norm": 1.8203125,
"learning_rate": 3.216524276273313e-06,
"loss": 1.0137529373168945,
"step": 4325,
"token_acc": 0.6836542657647829
},
{
"epoch": 0.7513447857018913,
"grad_norm": 1.859375,
"learning_rate": 3.1954684065607232e-06,
"loss": 0.9806119918823242,
"step": 4330,
"token_acc": 0.6892710892710893
},
{
"epoch": 0.7522123893805309,
"grad_norm": 1.8828125,
"learning_rate": 3.174468567813461e-06,
"loss": 1.0116167068481445,
"step": 4335,
"token_acc": 0.6821417273014869
},
{
"epoch": 0.7530799930591706,
"grad_norm": 1.921875,
"learning_rate": 3.1535249329517603e-06,
"loss": 1.0085960388183595,
"step": 4340,
"token_acc": 0.6825007871428767
},
{
"epoch": 0.7539475967378102,
"grad_norm": 1.8984375,
"learning_rate": 3.1326376744330667e-06,
"loss": 0.9790970802307128,
"step": 4345,
"token_acc": 0.689821249191562
},
{
"epoch": 0.7548152004164498,
"grad_norm": 1.8203125,
"learning_rate": 3.1118069642505886e-06,
"loss": 0.9997638702392578,
"step": 4350,
"token_acc": 0.6841588385994876
},
{
"epoch": 0.7556828040950894,
"grad_norm": 1.8125,
"learning_rate": 3.0910329739319033e-06,
"loss": 0.9993162155151367,
"step": 4355,
"token_acc": 0.6841426321221009
},
{
"epoch": 0.756550407773729,
"grad_norm": 1.9140625,
"learning_rate": 3.0703158745375316e-06,
"loss": 0.9740482330322265,
"step": 4360,
"token_acc": 0.6929492242406393
},
{
"epoch": 0.7574180114523685,
"grad_norm": 1.9140625,
"learning_rate": 3.0496558366595364e-06,
"loss": 0.9911387443542481,
"step": 4365,
"token_acc": 0.6877242474870587
},
{
"epoch": 0.7582856151310081,
"grad_norm": 1.8359375,
"learning_rate": 3.029053030420115e-06,
"loss": 1.001497173309326,
"step": 4370,
"token_acc": 0.6854735659622271
},
{
"epoch": 0.7591532188096477,
"grad_norm": 1.8828125,
"learning_rate": 3.0085076254701983e-06,
"loss": 0.9972357749938965,
"step": 4375,
"token_acc": 0.6851984268859492
},
{
"epoch": 0.7600208224882874,
"grad_norm": 1.8125,
"learning_rate": 2.988019790988056e-06,
"loss": 0.990943431854248,
"step": 4380,
"token_acc": 0.6880961127665075
},
{
"epoch": 0.760888426166927,
"grad_norm": 1.8359375,
"learning_rate": 2.9675896956778984e-06,
"loss": 0.9964810371398926,
"step": 4385,
"token_acc": 0.6846022286951177
},
{
"epoch": 0.7617560298455666,
"grad_norm": 1.8671875,
"learning_rate": 2.947217507768495e-06,
"loss": 0.9866546630859375,
"step": 4390,
"token_acc": 0.6860718843921116
},
{
"epoch": 0.7626236335242061,
"grad_norm": 1.796875,
"learning_rate": 2.926903395011781e-06,
"loss": 0.9983717918395996,
"step": 4395,
"token_acc": 0.6877645635960492
},
{
"epoch": 0.7634912372028457,
"grad_norm": 1.875,
"learning_rate": 2.9066475246814828e-06,
"loss": 1.0109498977661133,
"step": 4400,
"token_acc": 0.6828291696597227
},
{
"epoch": 0.7643588408814853,
"grad_norm": 1.8046875,
"learning_rate": 2.886450063571735e-06,
"loss": 0.9692567825317383,
"step": 4405,
"token_acc": 0.6919382320189973
},
{
"epoch": 0.7652264445601249,
"grad_norm": 1.890625,
"learning_rate": 2.86631117799571e-06,
"loss": 0.9896286964416504,
"step": 4410,
"token_acc": 0.6858808026192449
},
{
"epoch": 0.7660940482387645,
"grad_norm": 1.8515625,
"learning_rate": 2.8462310337842523e-06,
"loss": 0.9920248031616211,
"step": 4415,
"token_acc": 0.6869760785115518
},
{
"epoch": 0.7669616519174042,
"grad_norm": 1.921875,
"learning_rate": 2.8262097962845058e-06,
"loss": 1.0015531539916993,
"step": 4420,
"token_acc": 0.6852368826004658
},
{
"epoch": 0.7678292555960438,
"grad_norm": 1.8671875,
"learning_rate": 2.806247630358554e-06,
"loss": 1.0034663200378418,
"step": 4425,
"token_acc": 0.684781729446976
},
{
"epoch": 0.7686968592746833,
"grad_norm": 1.8828125,
"learning_rate": 2.7863447003820642e-06,
"loss": 0.9939127922058105,
"step": 4430,
"token_acc": 0.6864493951901668
},
{
"epoch": 0.7695644629533229,
"grad_norm": 1.84375,
"learning_rate": 2.7665011702429357e-06,
"loss": 0.9952418327331543,
"step": 4435,
"token_acc": 0.6857203881939902
},
{
"epoch": 0.7704320666319625,
"grad_norm": 1.890625,
"learning_rate": 2.746717203339946e-06,
"loss": 0.9777667045593261,
"step": 4440,
"token_acc": 0.6909912776054509
},
{
"epoch": 0.7712996703106021,
"grad_norm": 1.8671875,
"learning_rate": 2.7269929625814085e-06,
"loss": 1.0063211441040039,
"step": 4445,
"token_acc": 0.6833169581450187
},
{
"epoch": 0.7721672739892417,
"grad_norm": 1.859375,
"learning_rate": 2.7073286103838293e-06,
"loss": 1.0100595474243164,
"step": 4450,
"token_acc": 0.6820285638719915
},
{
"epoch": 0.7730348776678813,
"grad_norm": 1.9140625,
"learning_rate": 2.6877243086705716e-06,
"loss": 0.9833673477172852,
"step": 4455,
"token_acc": 0.6882294325611871
},
{
"epoch": 0.773902481346521,
"grad_norm": 1.859375,
"learning_rate": 2.6681802188705196e-06,
"loss": 0.9901654243469238,
"step": 4460,
"token_acc": 0.6868834587465766
},
{
"epoch": 0.7747700850251605,
"grad_norm": 1.921875,
"learning_rate": 2.6486965019167544e-06,
"loss": 0.9956707000732422,
"step": 4465,
"token_acc": 0.6848234865946864
},
{
"epoch": 0.7756376887038001,
"grad_norm": 1.8515625,
"learning_rate": 2.629273318245219e-06,
"loss": 0.9965853691101074,
"step": 4470,
"token_acc": 0.6839174626010737
},
{
"epoch": 0.7765052923824397,
"grad_norm": 1.890625,
"learning_rate": 2.6099108277934105e-06,
"loss": 0.9861255645751953,
"step": 4475,
"token_acc": 0.6883442417490601
},
{
"epoch": 0.7773728960610793,
"grad_norm": 1.9453125,
"learning_rate": 2.590609189999049e-06,
"loss": 1.0013001441955567,
"step": 4480,
"token_acc": 0.6833337796941535
},
{
"epoch": 0.7782404997397189,
"grad_norm": 1.9453125,
"learning_rate": 2.5713685637987818e-06,
"loss": 0.9976703643798828,
"step": 4485,
"token_acc": 0.6877284595300261
},
{
"epoch": 0.7791081034183585,
"grad_norm": 1.953125,
"learning_rate": 2.5521891076268555e-06,
"loss": 0.9790729522705078,
"step": 4490,
"token_acc": 0.6894264797255315
},
{
"epoch": 0.779975707096998,
"grad_norm": 1.765625,
"learning_rate": 2.5330709794138254e-06,
"loss": 0.9921565055847168,
"step": 4495,
"token_acc": 0.684914119045047
},
{
"epoch": 0.7808433107756377,
"grad_norm": 1.84375,
"learning_rate": 2.5140143365852476e-06,
"loss": 0.9999216079711915,
"step": 4500,
"token_acc": 0.6869082423624708
},
{
"epoch": 0.7817109144542773,
"grad_norm": 1.828125,
"learning_rate": 2.4950193360603868e-06,
"loss": 0.9970880508422851,
"step": 4505,
"token_acc": 0.6865171230142891
},
{
"epoch": 0.7825785181329169,
"grad_norm": 1.8828125,
"learning_rate": 2.4760861342509235e-06,
"loss": 0.9840543746948243,
"step": 4510,
"token_acc": 0.6897681822438032
},
{
"epoch": 0.7834461218115565,
"grad_norm": 1.8203125,
"learning_rate": 2.4572148870596636e-06,
"loss": 1.0138338088989258,
"step": 4515,
"token_acc": 0.6816099820996104
},
{
"epoch": 0.7843137254901961,
"grad_norm": 1.84375,
"learning_rate": 2.438405749879258e-06,
"loss": 1.0032525062561035,
"step": 4520,
"token_acc": 0.6844471121782046
},
{
"epoch": 0.7851813291688357,
"grad_norm": 1.8359375,
"learning_rate": 2.4196588775909204e-06,
"loss": 1.0106260299682617,
"step": 4525,
"token_acc": 0.6811335272101303
},
{
"epoch": 0.7860489328474752,
"grad_norm": 1.84375,
"learning_rate": 2.4009744245631515e-06,
"loss": 0.9920726776123047,
"step": 4530,
"token_acc": 0.6867518931683088
},
{
"epoch": 0.7869165365261148,
"grad_norm": 1.890625,
"learning_rate": 2.3823525446504735e-06,
"loss": 0.985197639465332,
"step": 4535,
"token_acc": 0.6879341219882787
},
{
"epoch": 0.7877841402047545,
"grad_norm": 1.9453125,
"learning_rate": 2.363793391192155e-06,
"loss": 0.9904392242431641,
"step": 4540,
"token_acc": 0.6878958715534904
},
{
"epoch": 0.7886517438833941,
"grad_norm": 1.8984375,
"learning_rate": 2.345297117010954e-06,
"loss": 0.9871200561523438,
"step": 4545,
"token_acc": 0.687506753106429
},
{
"epoch": 0.7895193475620337,
"grad_norm": 1.8359375,
"learning_rate": 2.3268638744118555e-06,
"loss": 0.9928851127624512,
"step": 4550,
"token_acc": 0.6844927026075655
},
{
"epoch": 0.7903869512406733,
"grad_norm": 1.8046875,
"learning_rate": 2.308493815180827e-06,
"loss": 0.974305534362793,
"step": 4555,
"token_acc": 0.6914228654424733
},
{
"epoch": 0.7912545549193128,
"grad_norm": 1.8359375,
"learning_rate": 2.2901870905835533e-06,
"loss": 1.0047635078430175,
"step": 4560,
"token_acc": 0.6855282218262464
},
{
"epoch": 0.7921221585979524,
"grad_norm": 1.8828125,
"learning_rate": 2.2719438513642023e-06,
"loss": 1.0162674903869628,
"step": 4565,
"token_acc": 0.6807637282560736
},
{
"epoch": 0.792989762276592,
"grad_norm": 1.859375,
"learning_rate": 2.25376424774418e-06,
"loss": 1.004638671875,
"step": 4570,
"token_acc": 0.6815402254920696
},
{
"epoch": 0.7938573659552316,
"grad_norm": 1.8515625,
"learning_rate": 2.2356484294208945e-06,
"loss": 0.9928275108337402,
"step": 4575,
"token_acc": 0.6833461637156787
},
{
"epoch": 0.7947249696338713,
"grad_norm": 1.8359375,
"learning_rate": 2.2175965455665225e-06,
"loss": 0.9946788787841797,
"step": 4580,
"token_acc": 0.6848246423935879
},
{
"epoch": 0.7955925733125109,
"grad_norm": 1.8515625,
"learning_rate": 2.1996087448267813e-06,
"loss": 0.9975082397460937,
"step": 4585,
"token_acc": 0.6866429591314421
},
{
"epoch": 0.7964601769911505,
"grad_norm": 1.8828125,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.9974835395812989,
"step": 4590,
"token_acc": 0.6866237463087606
},
{
"epoch": 0.79732778066979,
"grad_norm": 1.890625,
"learning_rate": 2.163825984634419e-06,
"loss": 1.0059508323669433,
"step": 4595,
"token_acc": 0.681298324742268
},
{
"epoch": 0.7981953843484296,
"grad_norm": 1.859375,
"learning_rate": 2.146031319829942e-06,
"loss": 0.9988635063171387,
"step": 4600,
"token_acc": 0.6838742790766779
},
{
"epoch": 0.7990629880270692,
"grad_norm": 1.765625,
"learning_rate": 2.1283013274339535e-06,
"loss": 0.9845050811767578,
"step": 4605,
"token_acc": 0.6879293681268448
},
{
"epoch": 0.7999305917057088,
"grad_norm": 1.921875,
"learning_rate": 2.110636153441602e-06,
"loss": 0.9597654342651367,
"step": 4610,
"token_acc": 0.696718661601875
},
{
"epoch": 0.8007981953843484,
"grad_norm": 1.875,
"learning_rate": 2.0930359433142934e-06,
"loss": 1.0043936729431153,
"step": 4615,
"token_acc": 0.6844164224450837
},
{
"epoch": 0.8016657990629881,
"grad_norm": 1.8359375,
"learning_rate": 2.0755008419785037e-06,
"loss": 1.0047181129455567,
"step": 4620,
"token_acc": 0.6838641217477737
},
{
"epoch": 0.8025334027416277,
"grad_norm": 1.90625,
"learning_rate": 2.058030993824577e-06,
"loss": 0.9849211692810058,
"step": 4625,
"token_acc": 0.6874467883833129
},
{
"epoch": 0.8034010064202672,
"grad_norm": 1.953125,
"learning_rate": 2.040626542705536e-06,
"loss": 0.9540246963500977,
"step": 4630,
"token_acc": 0.6967229009113826
},
{
"epoch": 0.8042686100989068,
"grad_norm": 1.8046875,
"learning_rate": 2.023287631935904e-06,
"loss": 1.00880708694458,
"step": 4635,
"token_acc": 0.6815457263858633
},
{
"epoch": 0.8051362137775464,
"grad_norm": 1.6796875,
"learning_rate": 2.0060144042905227e-06,
"loss": 0.9796277999877929,
"step": 4640,
"token_acc": 0.690756012376665
},
{
"epoch": 0.806003817456186,
"grad_norm": 1.8984375,
"learning_rate": 1.9888070020033713e-06,
"loss": 0.9896170616149902,
"step": 4645,
"token_acc": 0.6856324413853025
},
{
"epoch": 0.8068714211348256,
"grad_norm": 1.828125,
"learning_rate": 1.971665566766401e-06,
"loss": 1.0029498100280763,
"step": 4650,
"token_acc": 0.6831084917137528
},
{
"epoch": 0.8077390248134652,
"grad_norm": 1.9921875,
"learning_rate": 1.954590239728369e-06,
"loss": 0.9786740303039551,
"step": 4655,
"token_acc": 0.6886030063097072
},
{
"epoch": 0.8086066284921049,
"grad_norm": 1.9296875,
"learning_rate": 1.9375811614936703e-06,
"loss": 1.0019638061523437,
"step": 4660,
"token_acc": 0.6835257720567924
},
{
"epoch": 0.8094742321707444,
"grad_norm": 1.8984375,
"learning_rate": 1.9206384721211847e-06,
"loss": 0.9825675010681152,
"step": 4665,
"token_acc": 0.6896243896500885
},
{
"epoch": 0.810341835849384,
"grad_norm": 1.875,
"learning_rate": 1.9037623111231229e-06,
"loss": 1.002269172668457,
"step": 4670,
"token_acc": 0.6822305407169595
},
{
"epoch": 0.8112094395280236,
"grad_norm": 1.921875,
"learning_rate": 1.8869528174638752e-06,
"loss": 0.9967728614807129,
"step": 4675,
"token_acc": 0.683384136015715
},
{
"epoch": 0.8120770432066632,
"grad_norm": 1.875,
"learning_rate": 1.8702101295588714e-06,
"loss": 0.9936102867126465,
"step": 4680,
"token_acc": 0.6876304142688162
},
{
"epoch": 0.8129446468853028,
"grad_norm": 1.8359375,
"learning_rate": 1.8535343852734333e-06,
"loss": 0.9896058082580567,
"step": 4685,
"token_acc": 0.6851116625310174
},
{
"epoch": 0.8138122505639424,
"grad_norm": 1.890625,
"learning_rate": 1.8369257219216563e-06,
"loss": 0.999512004852295,
"step": 4690,
"token_acc": 0.6847072393860351
},
{
"epoch": 0.8146798542425819,
"grad_norm": 1.90625,
"learning_rate": 1.8203842762652546e-06,
"loss": 0.9870369911193848,
"step": 4695,
"token_acc": 0.6873048561748488
},
{
"epoch": 0.8155474579212216,
"grad_norm": 1.8828125,
"learning_rate": 1.8039101845124552e-06,
"loss": 1.0059050559997558,
"step": 4700,
"token_acc": 0.6838551420357133
},
{
"epoch": 0.8164150615998612,
"grad_norm": 1.8515625,
"learning_rate": 1.7875035823168641e-06,
"loss": 1.0021234512329102,
"step": 4705,
"token_acc": 0.6836149967576725
},
{
"epoch": 0.8172826652785008,
"grad_norm": 1.8671875,
"learning_rate": 1.7711646047763586e-06,
"loss": 1.0115555763244628,
"step": 4710,
"token_acc": 0.6826202404154295
},
{
"epoch": 0.8181502689571404,
"grad_norm": 1.8828125,
"learning_rate": 1.7548933864319661e-06,
"loss": 0.9789422035217286,
"step": 4715,
"token_acc": 0.6882753373099447
},
{
"epoch": 0.81901787263578,
"grad_norm": 1.8046875,
"learning_rate": 1.7386900612667635e-06,
"loss": 0.992159366607666,
"step": 4720,
"token_acc": 0.6854386416259326
},
{
"epoch": 0.8198854763144195,
"grad_norm": 1.875,
"learning_rate": 1.722554762704769e-06,
"loss": 0.9974750518798828,
"step": 4725,
"token_acc": 0.6872423661616824
},
{
"epoch": 0.8207530799930591,
"grad_norm": 1.921875,
"learning_rate": 1.706487623609846e-06,
"loss": 1.0016369819641113,
"step": 4730,
"token_acc": 0.6850419346958717
},
{
"epoch": 0.8216206836716987,
"grad_norm": 1.7890625,
"learning_rate": 1.6904887762846068e-06,
"loss": 0.9975146293640137,
"step": 4735,
"token_acc": 0.6852138558677419
},
{
"epoch": 0.8224882873503384,
"grad_norm": 1.796875,
"learning_rate": 1.6745583524693275e-06,
"loss": 0.9930521965026855,
"step": 4740,
"token_acc": 0.6858060739712228
},
{
"epoch": 0.823355891028978,
"grad_norm": 1.828125,
"learning_rate": 1.658696483340858e-06,
"loss": 0.990367317199707,
"step": 4745,
"token_acc": 0.690088659520974
},
{
"epoch": 0.8242234947076176,
"grad_norm": 1.890625,
"learning_rate": 1.6429032995115446e-06,
"loss": 0.9985919952392578,
"step": 4750,
"token_acc": 0.6834251915580051
},
{
"epoch": 0.8250910983862572,
"grad_norm": 1.859375,
"learning_rate": 1.6271789310281515e-06,
"loss": 0.9976622581481933,
"step": 4755,
"token_acc": 0.6856444289207125
},
{
"epoch": 0.8259587020648967,
"grad_norm": 1.8984375,
"learning_rate": 1.6115235073708024e-06,
"loss": 0.9920053482055664,
"step": 4760,
"token_acc": 0.6863879817112998
},
{
"epoch": 0.8268263057435363,
"grad_norm": 1.8984375,
"learning_rate": 1.5959371574518934e-06,
"loss": 1.0084431648254395,
"step": 4765,
"token_acc": 0.6817347253306887
},
{
"epoch": 0.8276939094221759,
"grad_norm": 1.84375,
"learning_rate": 1.580420009615048e-06,
"loss": 0.9829930305480957,
"step": 4770,
"token_acc": 0.690848938000397
},
{
"epoch": 0.8285615131008155,
"grad_norm": 1.8359375,
"learning_rate": 1.564972191634051e-06,
"loss": 0.9950210571289062,
"step": 4775,
"token_acc": 0.6865175616096914
},
{
"epoch": 0.8294291167794552,
"grad_norm": 1.859375,
"learning_rate": 1.5495938307118052e-06,
"loss": 0.9901968002319336,
"step": 4780,
"token_acc": 0.686112085480694
},
{
"epoch": 0.8302967204580948,
"grad_norm": 1.8203125,
"learning_rate": 1.5342850534792753e-06,
"loss": 0.9941259384155273,
"step": 4785,
"token_acc": 0.6846874957604699
},
{
"epoch": 0.8311643241367344,
"grad_norm": 1.8203125,
"learning_rate": 1.5190459859944506e-06,
"loss": 0.9872735977172852,
"step": 4790,
"token_acc": 0.6862535758770308
},
{
"epoch": 0.8320319278153739,
"grad_norm": 1.921875,
"learning_rate": 1.5038767537413035e-06,
"loss": 0.9889012336730957,
"step": 4795,
"token_acc": 0.6895972043893206
},
{
"epoch": 0.8328995314940135,
"grad_norm": 1.7890625,
"learning_rate": 1.4887774816287604e-06,
"loss": 0.9911365509033203,
"step": 4800,
"token_acc": 0.6869375511317153
},
{
"epoch": 0.8337671351726531,
"grad_norm": 1.8515625,
"learning_rate": 1.4737482939896675e-06,
"loss": 1.0037782669067383,
"step": 4805,
"token_acc": 0.6832780223501523
},
{
"epoch": 0.8346347388512927,
"grad_norm": 1.796875,
"learning_rate": 1.4587893145797738e-06,
"loss": 0.9940081596374511,
"step": 4810,
"token_acc": 0.6838530744774403
},
{
"epoch": 0.8355023425299323,
"grad_norm": 1.8359375,
"learning_rate": 1.4439006665767042e-06,
"loss": 0.9779527664184571,
"step": 4815,
"token_acc": 0.6904385373836274
},
{
"epoch": 0.836369946208572,
"grad_norm": 1.9609375,
"learning_rate": 1.4290824725789542e-06,
"loss": 0.9910070419311523,
"step": 4820,
"token_acc": 0.6858737818147412
},
{
"epoch": 0.8372375498872116,
"grad_norm": 1.859375,
"learning_rate": 1.4143348546048706e-06,
"loss": 1.0061234474182128,
"step": 4825,
"token_acc": 0.6816261604255763
},
{
"epoch": 0.8381051535658511,
"grad_norm": 1.90625,
"learning_rate": 1.3996579340916583e-06,
"loss": 1.0056955337524414,
"step": 4830,
"token_acc": 0.6805094883366865
},
{
"epoch": 0.8389727572444907,
"grad_norm": 1.875,
"learning_rate": 1.3850518318943685e-06,
"loss": 0.9870254516601562,
"step": 4835,
"token_acc": 0.6884185128317614
},
{
"epoch": 0.8398403609231303,
"grad_norm": 1.78125,
"learning_rate": 1.3705166682849103e-06,
"loss": 0.9996889114379883,
"step": 4840,
"token_acc": 0.6827413811061348
},
{
"epoch": 0.8407079646017699,
"grad_norm": 1.796875,
"learning_rate": 1.3560525629510567e-06,
"loss": 1.0011041641235352,
"step": 4845,
"token_acc": 0.6841137322872151
},
{
"epoch": 0.8415755682804095,
"grad_norm": 1.8671875,
"learning_rate": 1.341659634995467e-06,
"loss": 0.9982816696166992,
"step": 4850,
"token_acc": 0.6866214715232295
},
{
"epoch": 0.842443171959049,
"grad_norm": 1.8671875,
"learning_rate": 1.327338002934695e-06,
"loss": 1.0017461776733398,
"step": 4855,
"token_acc": 0.6838751233417388
},
{
"epoch": 0.8433107756376887,
"grad_norm": 1.9453125,
"learning_rate": 1.3130877846982204e-06,
"loss": 0.9614505767822266,
"step": 4860,
"token_acc": 0.6950054819491962
},
{
"epoch": 0.8441783793163283,
"grad_norm": 1.8984375,
"learning_rate": 1.2989090976274765e-06,
"loss": 1.0008953094482422,
"step": 4865,
"token_acc": 0.6828094757789712
},
{
"epoch": 0.8450459829949679,
"grad_norm": 1.7890625,
"learning_rate": 1.28480205847488e-06,
"loss": 0.987119197845459,
"step": 4870,
"token_acc": 0.6882399580718422
},
{
"epoch": 0.8459135866736075,
"grad_norm": 1.8125,
"learning_rate": 1.2707667834028782e-06,
"loss": 1.0048983573913575,
"step": 4875,
"token_acc": 0.6831559340074508
},
{
"epoch": 0.8467811903522471,
"grad_norm": 1.8984375,
"learning_rate": 1.256803387982981e-06,
"loss": 0.9872228622436523,
"step": 4880,
"token_acc": 0.6871233979735624
},
{
"epoch": 0.8476487940308867,
"grad_norm": 1.921875,
"learning_rate": 1.2429119871948203e-06,
"loss": 0.9801000595092774,
"step": 4885,
"token_acc": 0.690729556130764
},
{
"epoch": 0.8485163977095262,
"grad_norm": 1.8203125,
"learning_rate": 1.2290926954251937e-06,
"loss": 0.9848250389099121,
"step": 4890,
"token_acc": 0.6876951737632853
},
{
"epoch": 0.8493840013881658,
"grad_norm": 1.921875,
"learning_rate": 1.2153456264671337e-06,
"loss": 0.986370849609375,
"step": 4895,
"token_acc": 0.6873767258382643
},
{
"epoch": 0.8502516050668055,
"grad_norm": 1.8828125,
"learning_rate": 1.2016708935189591e-06,
"loss": 0.9943758010864258,
"step": 4900,
"token_acc": 0.6850112466771181
},
{
"epoch": 0.8511192087454451,
"grad_norm": 1.828125,
"learning_rate": 1.1880686091833482e-06,
"loss": 1.000884437561035,
"step": 4905,
"token_acc": 0.683370710159701
},
{
"epoch": 0.8519868124240847,
"grad_norm": 1.8359375,
"learning_rate": 1.174538885466412e-06,
"loss": 0.9865160942077636,
"step": 4910,
"token_acc": 0.6890235069467308
},
{
"epoch": 0.8528544161027243,
"grad_norm": 1.828125,
"learning_rate": 1.1610818337767716e-06,
"loss": 0.9991436004638672,
"step": 4915,
"token_acc": 0.6858561584726297
},
{
"epoch": 0.8537220197813639,
"grad_norm": 1.765625,
"learning_rate": 1.147697564924639e-06,
"loss": 0.9727308273315429,
"step": 4920,
"token_acc": 0.6912715446298077
},
{
"epoch": 0.8545896234600034,
"grad_norm": 1.8515625,
"learning_rate": 1.1343861891209106e-06,
"loss": 1.0181291580200196,
"step": 4925,
"token_acc": 0.6820682501542784
},
{
"epoch": 0.855457227138643,
"grad_norm": 1.8359375,
"learning_rate": 1.121147815976248e-06,
"loss": 1.0026049613952637,
"step": 4930,
"token_acc": 0.683598010267869
},
{
"epoch": 0.8563248308172826,
"grad_norm": 1.7734375,
"learning_rate": 1.1079825545001887e-06,
"loss": 0.9865102767944336,
"step": 4935,
"token_acc": 0.6898536402969927
},
{
"epoch": 0.8571924344959223,
"grad_norm": 1.9140625,
"learning_rate": 1.0948905131002407e-06,
"loss": 1.0127381324768066,
"step": 4940,
"token_acc": 0.6825190010857763
},
{
"epoch": 0.8580600381745619,
"grad_norm": 1.84375,
"learning_rate": 1.081871799580989e-06,
"loss": 0.9863951683044434,
"step": 4945,
"token_acc": 0.6883122286792139
},
{
"epoch": 0.8589276418532015,
"grad_norm": 1.8828125,
"learning_rate": 1.0689265211432132e-06,
"loss": 0.9868002891540527,
"step": 4950,
"token_acc": 0.6885490091767522
},
{
"epoch": 0.8597952455318411,
"grad_norm": 1.75,
"learning_rate": 1.0560547843830016e-06,
"loss": 0.9947976112365723,
"step": 4955,
"token_acc": 0.6865387356336733
},
{
"epoch": 0.8606628492104806,
"grad_norm": 1.859375,
"learning_rate": 1.0432566952908696e-06,
"loss": 1.0024614334106445,
"step": 4960,
"token_acc": 0.683781453319746
},
{
"epoch": 0.8615304528891202,
"grad_norm": 1.8359375,
"learning_rate": 1.030532359250901e-06,
"loss": 0.9844224929809571,
"step": 4965,
"token_acc": 0.6905729592779288
},
{
"epoch": 0.8623980565677598,
"grad_norm": 1.921875,
"learning_rate": 1.0178818810398616e-06,
"loss": 1.004835605621338,
"step": 4970,
"token_acc": 0.6851532852387675
},
{
"epoch": 0.8632656602463994,
"grad_norm": 1.890625,
"learning_rate": 1.0053053648263477e-06,
"loss": 0.9801043510437012,
"step": 4975,
"token_acc": 0.6884155757432423
},
{
"epoch": 0.8641332639250391,
"grad_norm": 1.96875,
"learning_rate": 9.92802914169927e-07,
"loss": 0.9909211158752441,
"step": 4980,
"token_acc": 0.6887015132838522
},
{
"epoch": 0.8650008676036787,
"grad_norm": 1.9296875,
"learning_rate": 9.803746320202812e-07,
"loss": 1.0029238700866698,
"step": 4985,
"token_acc": 0.6833788400406263
},
{
"epoch": 0.8658684712823183,
"grad_norm": 1.8046875,
"learning_rate": 9.680206207163666e-07,
"loss": 0.990473747253418,
"step": 4990,
"token_acc": 0.6859917435513812
},
{
"epoch": 0.8667360749609578,
"grad_norm": 1.8671875,
"learning_rate": 9.557409819855645e-07,
"loss": 0.9845627784729004,
"step": 4995,
"token_acc": 0.6892265928567627
},
{
"epoch": 0.8676036786395974,
"grad_norm": 1.7890625,
"learning_rate": 9.435358169428444e-07,
"loss": 0.9981782913208008,
"step": 5000,
"token_acc": 0.6857057648919893
}
],
"logging_steps": 5,
"max_steps": 5763,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.988905759887589e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}