dear-8b-reranker-listwise-lora-v1 / trainer_state.json
abdoelsayed's picture
Upload folder using huggingface_hub
31fe04b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.8057726999398676,
"eval_steps": 500,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004008819402685909,
"grad_norm": 0.7079163193702698,
"learning_rate": 1.2016021361815755e-06,
"loss": 0.4167,
"step": 10
},
{
"epoch": 0.008017638805371818,
"grad_norm": 0.7350050210952759,
"learning_rate": 2.5367156208277703e-06,
"loss": 0.4259,
"step": 20
},
{
"epoch": 0.012026458208057728,
"grad_norm": 0.8004947900772095,
"learning_rate": 3.871829105473966e-06,
"loss": 0.4058,
"step": 30
},
{
"epoch": 0.016035277610743637,
"grad_norm": 0.7146145701408386,
"learning_rate": 5.206942590120161e-06,
"loss": 0.3923,
"step": 40
},
{
"epoch": 0.020044097013429546,
"grad_norm": 0.7212072610855103,
"learning_rate": 6.542056074766355e-06,
"loss": 0.3844,
"step": 50
},
{
"epoch": 0.024052916416115455,
"grad_norm": 0.6941127181053162,
"learning_rate": 7.87716955941255e-06,
"loss": 0.3679,
"step": 60
},
{
"epoch": 0.028061735818801364,
"grad_norm": 0.5748901963233948,
"learning_rate": 9.212283044058744e-06,
"loss": 0.3518,
"step": 70
},
{
"epoch": 0.032070555221487274,
"grad_norm": 0.5205843448638916,
"learning_rate": 1.054739652870494e-05,
"loss": 0.3631,
"step": 80
},
{
"epoch": 0.03607937462417318,
"grad_norm": 0.5671967267990112,
"learning_rate": 1.1882510013351136e-05,
"loss": 0.3526,
"step": 90
},
{
"epoch": 0.04008819402685909,
"grad_norm": 0.6912459135055542,
"learning_rate": 1.321762349799733e-05,
"loss": 0.3186,
"step": 100
},
{
"epoch": 0.044097013429545,
"grad_norm": 0.6880580186843872,
"learning_rate": 1.4552736982643526e-05,
"loss": 0.3127,
"step": 110
},
{
"epoch": 0.04810583283223091,
"grad_norm": 0.7009742259979248,
"learning_rate": 1.588785046728972e-05,
"loss": 0.3092,
"step": 120
},
{
"epoch": 0.05211465223491682,
"grad_norm": 0.7727891802787781,
"learning_rate": 1.7222963951935918e-05,
"loss": 0.3138,
"step": 130
},
{
"epoch": 0.05612347163760273,
"grad_norm": 0.6901352405548096,
"learning_rate": 1.855807743658211e-05,
"loss": 0.3014,
"step": 140
},
{
"epoch": 0.06013229104028864,
"grad_norm": 0.8163438439369202,
"learning_rate": 1.9893190921228304e-05,
"loss": 0.312,
"step": 150
},
{
"epoch": 0.06414111044297455,
"grad_norm": 0.8670288324356079,
"learning_rate": 2.12283044058745e-05,
"loss": 0.2875,
"step": 160
},
{
"epoch": 0.06814992984566046,
"grad_norm": 1.0734412670135498,
"learning_rate": 2.2563417890520698e-05,
"loss": 0.2919,
"step": 170
},
{
"epoch": 0.07215874924834637,
"grad_norm": 0.704152524471283,
"learning_rate": 2.389853137516689e-05,
"loss": 0.3094,
"step": 180
},
{
"epoch": 0.07616756865103227,
"grad_norm": 0.7793599963188171,
"learning_rate": 2.5233644859813084e-05,
"loss": 0.2992,
"step": 190
},
{
"epoch": 0.08017638805371818,
"grad_norm": 0.8480731248855591,
"learning_rate": 2.656875834445928e-05,
"loss": 0.2957,
"step": 200
},
{
"epoch": 0.0841852074564041,
"grad_norm": 0.8737421631813049,
"learning_rate": 2.7903871829105478e-05,
"loss": 0.2794,
"step": 210
},
{
"epoch": 0.08819402685909,
"grad_norm": 0.8049966096878052,
"learning_rate": 2.923898531375167e-05,
"loss": 0.2767,
"step": 220
},
{
"epoch": 0.09220284626177591,
"grad_norm": 0.8555333614349365,
"learning_rate": 3.0574098798397864e-05,
"loss": 0.2899,
"step": 230
},
{
"epoch": 0.09621166566446182,
"grad_norm": 0.8982564806938171,
"learning_rate": 3.190921228304406e-05,
"loss": 0.2661,
"step": 240
},
{
"epoch": 0.10022048506714773,
"grad_norm": 0.7863436937332153,
"learning_rate": 3.324432576769025e-05,
"loss": 0.284,
"step": 250
},
{
"epoch": 0.10422930446983364,
"grad_norm": 0.861031711101532,
"learning_rate": 3.457943925233645e-05,
"loss": 0.266,
"step": 260
},
{
"epoch": 0.10823812387251955,
"grad_norm": 0.7962524890899658,
"learning_rate": 3.5914552736982644e-05,
"loss": 0.2589,
"step": 270
},
{
"epoch": 0.11224694327520546,
"grad_norm": 0.7888882756233215,
"learning_rate": 3.7249666221628844e-05,
"loss": 0.2692,
"step": 280
},
{
"epoch": 0.11625576267789137,
"grad_norm": 1.0324134826660156,
"learning_rate": 3.858477970627504e-05,
"loss": 0.2785,
"step": 290
},
{
"epoch": 0.12026458208057728,
"grad_norm": 0.8519011735916138,
"learning_rate": 3.991989319092123e-05,
"loss": 0.2837,
"step": 300
},
{
"epoch": 0.12427340148326319,
"grad_norm": 0.9894006848335266,
"learning_rate": 4.1255006675567424e-05,
"loss": 0.2729,
"step": 310
},
{
"epoch": 0.1282822208859491,
"grad_norm": 0.8675716519355774,
"learning_rate": 4.259012016021362e-05,
"loss": 0.2637,
"step": 320
},
{
"epoch": 0.132291040288635,
"grad_norm": 0.8296481370925903,
"learning_rate": 4.392523364485982e-05,
"loss": 0.2597,
"step": 330
},
{
"epoch": 0.1362998596913209,
"grad_norm": 0.7083739042282104,
"learning_rate": 4.526034712950601e-05,
"loss": 0.2731,
"step": 340
},
{
"epoch": 0.14030867909400682,
"grad_norm": 0.8215944766998291,
"learning_rate": 4.6595460614152204e-05,
"loss": 0.2655,
"step": 350
},
{
"epoch": 0.14431749849669273,
"grad_norm": 0.7739771008491516,
"learning_rate": 4.79305740987984e-05,
"loss": 0.2661,
"step": 360
},
{
"epoch": 0.14832631789937864,
"grad_norm": 0.6515551805496216,
"learning_rate": 4.92656875834446e-05,
"loss": 0.272,
"step": 370
},
{
"epoch": 0.15233513730206455,
"grad_norm": 0.7235403060913086,
"learning_rate": 5.0600801068090784e-05,
"loss": 0.295,
"step": 380
},
{
"epoch": 0.15634395670475046,
"grad_norm": 0.7624292373657227,
"learning_rate": 5.1935914552736984e-05,
"loss": 0.2622,
"step": 390
},
{
"epoch": 0.16035277610743637,
"grad_norm": 0.7019667029380798,
"learning_rate": 5.327102803738318e-05,
"loss": 0.2562,
"step": 400
},
{
"epoch": 0.16436159551012228,
"grad_norm": 0.7686800360679626,
"learning_rate": 5.460614152202938e-05,
"loss": 0.2726,
"step": 410
},
{
"epoch": 0.1683704149128082,
"grad_norm": 0.6799090504646301,
"learning_rate": 5.594125500667558e-05,
"loss": 0.266,
"step": 420
},
{
"epoch": 0.1723792343154941,
"grad_norm": 0.6165328025817871,
"learning_rate": 5.7276368491321764e-05,
"loss": 0.2706,
"step": 430
},
{
"epoch": 0.17638805371818,
"grad_norm": 2.7531023025512695,
"learning_rate": 5.8611481975967965e-05,
"loss": 0.2645,
"step": 440
},
{
"epoch": 0.1803968731208659,
"grad_norm": 0.7134599685668945,
"learning_rate": 5.994659546061415e-05,
"loss": 0.2823,
"step": 450
},
{
"epoch": 0.18440569252355182,
"grad_norm": 0.8196555376052856,
"learning_rate": 6.128170894526035e-05,
"loss": 0.2568,
"step": 460
},
{
"epoch": 0.18841451192623773,
"grad_norm": 0.7205436825752258,
"learning_rate": 6.261682242990654e-05,
"loss": 0.2638,
"step": 470
},
{
"epoch": 0.19242333132892364,
"grad_norm": 0.6776229739189148,
"learning_rate": 6.395193591455274e-05,
"loss": 0.2565,
"step": 480
},
{
"epoch": 0.19643215073160955,
"grad_norm": 0.5640079379081726,
"learning_rate": 6.528704939919892e-05,
"loss": 0.2652,
"step": 490
},
{
"epoch": 0.20044097013429546,
"grad_norm": 0.6904841661453247,
"learning_rate": 6.662216288384512e-05,
"loss": 0.2761,
"step": 500
},
{
"epoch": 0.20444978953698137,
"grad_norm": 0.602130651473999,
"learning_rate": 6.795727636849132e-05,
"loss": 0.2628,
"step": 510
},
{
"epoch": 0.20845860893966728,
"grad_norm": 0.5913508534431458,
"learning_rate": 6.929238985313752e-05,
"loss": 0.2655,
"step": 520
},
{
"epoch": 0.2124674283423532,
"grad_norm": 0.6655270457267761,
"learning_rate": 7.062750333778372e-05,
"loss": 0.2453,
"step": 530
},
{
"epoch": 0.2164762477450391,
"grad_norm": 0.7957231998443604,
"learning_rate": 7.196261682242991e-05,
"loss": 0.2572,
"step": 540
},
{
"epoch": 0.220485067147725,
"grad_norm": 0.6448558568954468,
"learning_rate": 7.329773030707611e-05,
"loss": 0.2674,
"step": 550
},
{
"epoch": 0.22449388655041091,
"grad_norm": 0.6882309317588806,
"learning_rate": 7.46328437917223e-05,
"loss": 0.2634,
"step": 560
},
{
"epoch": 0.22850270595309682,
"grad_norm": 0.5508169531822205,
"learning_rate": 7.59679572763685e-05,
"loss": 0.2821,
"step": 570
},
{
"epoch": 0.23251152535578273,
"grad_norm": 0.567504346370697,
"learning_rate": 7.73030707610147e-05,
"loss": 0.2623,
"step": 580
},
{
"epoch": 0.23652034475846864,
"grad_norm": 0.5799248218536377,
"learning_rate": 7.863818424566088e-05,
"loss": 0.2574,
"step": 590
},
{
"epoch": 0.24052916416115455,
"grad_norm": 0.6579704880714417,
"learning_rate": 7.997329773030708e-05,
"loss": 0.2639,
"step": 600
},
{
"epoch": 0.24453798356384046,
"grad_norm": 0.6886210441589355,
"learning_rate": 8.130841121495327e-05,
"loss": 0.2604,
"step": 610
},
{
"epoch": 0.24854680296652637,
"grad_norm": 0.702531635761261,
"learning_rate": 8.264352469959947e-05,
"loss": 0.2549,
"step": 620
},
{
"epoch": 0.25255562236921225,
"grad_norm": 0.605786144733429,
"learning_rate": 8.397863818424566e-05,
"loss": 0.2515,
"step": 630
},
{
"epoch": 0.2565644417718982,
"grad_norm": 0.7157173752784729,
"learning_rate": 8.531375166889186e-05,
"loss": 0.2565,
"step": 640
},
{
"epoch": 0.26057326117458407,
"grad_norm": 0.552195131778717,
"learning_rate": 8.664886515353804e-05,
"loss": 0.2675,
"step": 650
},
{
"epoch": 0.26458208057727,
"grad_norm": 0.7387903928756714,
"learning_rate": 8.798397863818424e-05,
"loss": 0.2593,
"step": 660
},
{
"epoch": 0.2685908999799559,
"grad_norm": 0.5697731971740723,
"learning_rate": 8.931909212283044e-05,
"loss": 0.2525,
"step": 670
},
{
"epoch": 0.2725997193826418,
"grad_norm": 0.5313478708267212,
"learning_rate": 9.065420560747664e-05,
"loss": 0.2503,
"step": 680
},
{
"epoch": 0.2766085387853277,
"grad_norm": 0.5595772862434387,
"learning_rate": 9.198931909212284e-05,
"loss": 0.2455,
"step": 690
},
{
"epoch": 0.28061735818801364,
"grad_norm": 0.6393229365348816,
"learning_rate": 9.332443257676903e-05,
"loss": 0.2541,
"step": 700
},
{
"epoch": 0.2846261775906995,
"grad_norm": 0.6859897375106812,
"learning_rate": 9.465954606141523e-05,
"loss": 0.2552,
"step": 710
},
{
"epoch": 0.28863499699338546,
"grad_norm": 0.5158377289772034,
"learning_rate": 9.599465954606142e-05,
"loss": 0.2658,
"step": 720
},
{
"epoch": 0.29264381639607134,
"grad_norm": 0.5928187966346741,
"learning_rate": 9.732977303070762e-05,
"loss": 0.2515,
"step": 730
},
{
"epoch": 0.2966526357987573,
"grad_norm": 0.5400727391242981,
"learning_rate": 9.86648865153538e-05,
"loss": 0.255,
"step": 740
},
{
"epoch": 0.30066145520144316,
"grad_norm": 0.6557461023330688,
"learning_rate": 0.0001,
"loss": 0.266,
"step": 750
},
{
"epoch": 0.3046702746041291,
"grad_norm": 0.5242008566856384,
"learning_rate": 9.999945572080073e-05,
"loss": 0.2581,
"step": 760
},
{
"epoch": 0.308679094006815,
"grad_norm": 0.6318579912185669,
"learning_rate": 9.999782289505249e-05,
"loss": 0.2487,
"step": 770
},
{
"epoch": 0.3126879134095009,
"grad_norm": 0.5616021156311035,
"learning_rate": 9.999510155830382e-05,
"loss": 0.2477,
"step": 780
},
{
"epoch": 0.3166967328121868,
"grad_norm": 0.6462226510047913,
"learning_rate": 9.999129176980139e-05,
"loss": 0.2534,
"step": 790
},
{
"epoch": 0.32070555221487274,
"grad_norm": 0.6198021769523621,
"learning_rate": 9.998639361248875e-05,
"loss": 0.2526,
"step": 800
},
{
"epoch": 0.3247143716175586,
"grad_norm": 0.7665322422981262,
"learning_rate": 9.99804071930045e-05,
"loss": 0.243,
"step": 810
},
{
"epoch": 0.32872319102024455,
"grad_norm": 0.4766557514667511,
"learning_rate": 9.997333264168e-05,
"loss": 0.2535,
"step": 820
},
{
"epoch": 0.33273201042293044,
"grad_norm": 0.5588070154190063,
"learning_rate": 9.996517011253648e-05,
"loss": 0.2597,
"step": 830
},
{
"epoch": 0.3367408298256164,
"grad_norm": 0.5454280376434326,
"learning_rate": 9.995591978328171e-05,
"loss": 0.244,
"step": 840
},
{
"epoch": 0.34074964922830225,
"grad_norm": 0.49733200669288635,
"learning_rate": 9.994558185530623e-05,
"loss": 0.2537,
"step": 850
},
{
"epoch": 0.3447584686309882,
"grad_norm": 0.5581687688827515,
"learning_rate": 9.993415655367875e-05,
"loss": 0.2404,
"step": 860
},
{
"epoch": 0.34876728803367407,
"grad_norm": 0.6027563810348511,
"learning_rate": 9.992164412714143e-05,
"loss": 0.2482,
"step": 870
},
{
"epoch": 0.35277610743636,
"grad_norm": 0.5277743935585022,
"learning_rate": 9.990804484810444e-05,
"loss": 0.2495,
"step": 880
},
{
"epoch": 0.3567849268390459,
"grad_norm": 0.4644189774990082,
"learning_rate": 9.989335901263996e-05,
"loss": 0.2484,
"step": 890
},
{
"epoch": 0.3607937462417318,
"grad_norm": 0.5761633515357971,
"learning_rate": 9.987758694047575e-05,
"loss": 0.2449,
"step": 900
},
{
"epoch": 0.3648025656444177,
"grad_norm": 0.5238028168678284,
"learning_rate": 9.986072897498829e-05,
"loss": 0.2492,
"step": 910
},
{
"epoch": 0.36881138504710365,
"grad_norm": 0.7435324788093567,
"learning_rate": 9.984278548319515e-05,
"loss": 0.2329,
"step": 920
},
{
"epoch": 0.3728202044497895,
"grad_norm": 0.5836918950080872,
"learning_rate": 9.982375685574712e-05,
"loss": 0.2366,
"step": 930
},
{
"epoch": 0.37682902385247546,
"grad_norm": 0.5967078804969788,
"learning_rate": 9.980364350691962e-05,
"loss": 0.2323,
"step": 940
},
{
"epoch": 0.38083784325516135,
"grad_norm": 0.5183435082435608,
"learning_rate": 9.978244587460376e-05,
"loss": 0.2496,
"step": 950
},
{
"epoch": 0.3848466626578473,
"grad_norm": 0.5637623071670532,
"learning_rate": 9.976016442029675e-05,
"loss": 0.2469,
"step": 960
},
{
"epoch": 0.38885548206053316,
"grad_norm": 0.5092435479164124,
"learning_rate": 9.973679962909189e-05,
"loss": 0.2423,
"step": 970
},
{
"epoch": 0.3928643014632191,
"grad_norm": 0.589830219745636,
"learning_rate": 9.971235200966795e-05,
"loss": 0.2327,
"step": 980
},
{
"epoch": 0.396873120865905,
"grad_norm": 0.47097843885421753,
"learning_rate": 9.968682209427817e-05,
"loss": 0.2597,
"step": 990
},
{
"epoch": 0.4008819402685909,
"grad_norm": 0.5486937761306763,
"learning_rate": 9.966021043873864e-05,
"loss": 0.2471,
"step": 1000
},
{
"epoch": 0.4048907596712768,
"grad_norm": 0.7114580273628235,
"learning_rate": 9.963251762241616e-05,
"loss": 0.2438,
"step": 1010
},
{
"epoch": 0.40889957907396274,
"grad_norm": 0.6333823800086975,
"learning_rate": 9.96037442482157e-05,
"loss": 0.2264,
"step": 1020
},
{
"epoch": 0.4129083984766486,
"grad_norm": 0.6404210329055786,
"learning_rate": 9.95738909425672e-05,
"loss": 0.2388,
"step": 1030
},
{
"epoch": 0.41691721787933456,
"grad_norm": 0.4633651673793793,
"learning_rate": 9.954295835541203e-05,
"loss": 0.2438,
"step": 1040
},
{
"epoch": 0.42092603728202044,
"grad_norm": 0.5816488265991211,
"learning_rate": 9.951094716018871e-05,
"loss": 0.2397,
"step": 1050
},
{
"epoch": 0.4249348566847064,
"grad_norm": 0.4773317277431488,
"learning_rate": 9.947785805381836e-05,
"loss": 0.2549,
"step": 1060
},
{
"epoch": 0.42894367608739226,
"grad_norm": 0.6512565016746521,
"learning_rate": 9.944369175668948e-05,
"loss": 0.2341,
"step": 1070
},
{
"epoch": 0.4329524954900782,
"grad_norm": 0.5367722511291504,
"learning_rate": 9.940844901264225e-05,
"loss": 0.2331,
"step": 1080
},
{
"epoch": 0.4369613148927641,
"grad_norm": 0.46036213636398315,
"learning_rate": 9.937213058895237e-05,
"loss": 0.2506,
"step": 1090
},
{
"epoch": 0.44097013429545,
"grad_norm": 0.5731292366981506,
"learning_rate": 9.933473727631435e-05,
"loss": 0.2458,
"step": 1100
},
{
"epoch": 0.4449789536981359,
"grad_norm": 0.4861133396625519,
"learning_rate": 9.929626988882428e-05,
"loss": 0.2385,
"step": 1110
},
{
"epoch": 0.44898777310082183,
"grad_norm": 0.5132921934127808,
"learning_rate": 9.925672926396212e-05,
"loss": 0.239,
"step": 1120
},
{
"epoch": 0.4529965925035077,
"grad_norm": 0.5186300873756409,
"learning_rate": 9.921611626257344e-05,
"loss": 0.2342,
"step": 1130
},
{
"epoch": 0.45700541190619365,
"grad_norm": 0.5068562626838684,
"learning_rate": 9.917443176885073e-05,
"loss": 0.2377,
"step": 1140
},
{
"epoch": 0.46101423130887953,
"grad_norm": 0.5708321928977966,
"learning_rate": 9.913167669031409e-05,
"loss": 0.2245,
"step": 1150
},
{
"epoch": 0.46502305071156547,
"grad_norm": 0.5989340543746948,
"learning_rate": 9.908785195779153e-05,
"loss": 0.2235,
"step": 1160
},
{
"epoch": 0.46903187011425135,
"grad_norm": 0.5365070700645447,
"learning_rate": 9.904295852539867e-05,
"loss": 0.2434,
"step": 1170
},
{
"epoch": 0.4730406895169373,
"grad_norm": 0.625773549079895,
"learning_rate": 9.899699737051793e-05,
"loss": 0.23,
"step": 1180
},
{
"epoch": 0.47704950891962317,
"grad_norm": 0.5225462317466736,
"learning_rate": 9.894996949377738e-05,
"loss": 0.2219,
"step": 1190
},
{
"epoch": 0.4810583283223091,
"grad_norm": 0.6289816498756409,
"learning_rate": 9.890187591902879e-05,
"loss": 0.2428,
"step": 1200
},
{
"epoch": 0.485067147724995,
"grad_norm": 0.5729430317878723,
"learning_rate": 9.885271769332547e-05,
"loss": 0.2267,
"step": 1210
},
{
"epoch": 0.4890759671276809,
"grad_norm": 0.4762110412120819,
"learning_rate": 9.880249588689941e-05,
"loss": 0.2306,
"step": 1220
},
{
"epoch": 0.4930847865303668,
"grad_norm": 0.5188407301902771,
"learning_rate": 9.875121159313797e-05,
"loss": 0.2389,
"step": 1230
},
{
"epoch": 0.49709360593305274,
"grad_norm": 0.630754828453064,
"learning_rate": 9.869886592856016e-05,
"loss": 0.2262,
"step": 1240
},
{
"epoch": 0.5011024253357387,
"grad_norm": 0.5617574453353882,
"learning_rate": 9.864546003279222e-05,
"loss": 0.2362,
"step": 1250
},
{
"epoch": 0.5051112447384245,
"grad_norm": 0.6316712498664856,
"learning_rate": 9.859099506854285e-05,
"loss": 0.2265,
"step": 1260
},
{
"epoch": 0.5091200641411104,
"grad_norm": 0.4762302339076996,
"learning_rate": 9.8535472221578e-05,
"loss": 0.2286,
"step": 1270
},
{
"epoch": 0.5131288835437964,
"grad_norm": 0.6005476117134094,
"learning_rate": 9.847889270069483e-05,
"loss": 0.2217,
"step": 1280
},
{
"epoch": 0.5171377029464823,
"grad_norm": 0.5312756299972534,
"learning_rate": 9.842125773769563e-05,
"loss": 0.2285,
"step": 1290
},
{
"epoch": 0.5211465223491681,
"grad_norm": 0.7248687744140625,
"learning_rate": 9.836256858736086e-05,
"loss": 0.2354,
"step": 1300
},
{
"epoch": 0.5251553417518541,
"grad_norm": 0.5596895813941956,
"learning_rate": 9.830282652742186e-05,
"loss": 0.2286,
"step": 1310
},
{
"epoch": 0.52916416115454,
"grad_norm": 0.6484787464141846,
"learning_rate": 9.824203285853305e-05,
"loss": 0.2325,
"step": 1320
},
{
"epoch": 0.533172980557226,
"grad_norm": 0.5286840200424194,
"learning_rate": 9.81801889042436e-05,
"loss": 0.2213,
"step": 1330
},
{
"epoch": 0.5371817999599118,
"grad_norm": 0.5632983446121216,
"learning_rate": 9.811729601096865e-05,
"loss": 0.2262,
"step": 1340
},
{
"epoch": 0.5411906193625977,
"grad_norm": 0.6314755082130432,
"learning_rate": 9.805335554795993e-05,
"loss": 0.226,
"step": 1350
},
{
"epoch": 0.5451994387652837,
"grad_norm": 0.5536089539527893,
"learning_rate": 9.798836890727601e-05,
"loss": 0.2363,
"step": 1360
},
{
"epoch": 0.5492082581679696,
"grad_norm": 0.5642661452293396,
"learning_rate": 9.792233750375193e-05,
"loss": 0.2367,
"step": 1370
},
{
"epoch": 0.5532170775706554,
"grad_norm": 0.4720064103603363,
"learning_rate": 9.785526277496851e-05,
"loss": 0.2278,
"step": 1380
},
{
"epoch": 0.5572258969733414,
"grad_norm": 0.568137526512146,
"learning_rate": 9.778714618122091e-05,
"loss": 0.2135,
"step": 1390
},
{
"epoch": 0.5612347163760273,
"grad_norm": 0.5233467221260071,
"learning_rate": 9.771798920548693e-05,
"loss": 0.2243,
"step": 1400
},
{
"epoch": 0.5652435357787132,
"grad_norm": 0.5088178515434265,
"learning_rate": 9.764779335339473e-05,
"loss": 0.2438,
"step": 1410
},
{
"epoch": 0.569252355181399,
"grad_norm": 0.6083818078041077,
"learning_rate": 9.757656015318998e-05,
"loss": 0.2223,
"step": 1420
},
{
"epoch": 0.573261174584085,
"grad_norm": 0.5877081155776978,
"learning_rate": 9.750429115570264e-05,
"loss": 0.2298,
"step": 1430
},
{
"epoch": 0.5772699939867709,
"grad_norm": 0.6110019683837891,
"learning_rate": 9.743098793431321e-05,
"loss": 0.2323,
"step": 1440
},
{
"epoch": 0.5812788133894567,
"grad_norm": 0.5051080584526062,
"learning_rate": 9.735665208491842e-05,
"loss": 0.2436,
"step": 1450
},
{
"epoch": 0.5852876327921427,
"grad_norm": 0.5243321657180786,
"learning_rate": 9.728128522589655e-05,
"loss": 0.2338,
"step": 1460
},
{
"epoch": 0.5892964521948286,
"grad_norm": 0.6249774694442749,
"learning_rate": 9.720488899807214e-05,
"loss": 0.226,
"step": 1470
},
{
"epoch": 0.5933052715975146,
"grad_norm": 0.5004896521568298,
"learning_rate": 9.71274650646803e-05,
"loss": 0.2144,
"step": 1480
},
{
"epoch": 0.5973140910002004,
"grad_norm": 0.6254176497459412,
"learning_rate": 9.704901511133048e-05,
"loss": 0.219,
"step": 1490
},
{
"epoch": 0.6013229104028863,
"grad_norm": 0.5976850390434265,
"learning_rate": 9.696954084596979e-05,
"loss": 0.2323,
"step": 1500
},
{
"epoch": 0.6053317298055723,
"grad_norm": 0.588320791721344,
"learning_rate": 9.688904399884583e-05,
"loss": 0.2049,
"step": 1510
},
{
"epoch": 0.6093405492082582,
"grad_norm": 0.655425488948822,
"learning_rate": 9.680752632246896e-05,
"loss": 0.224,
"step": 1520
},
{
"epoch": 0.613349368610944,
"grad_norm": 0.6558622121810913,
"learning_rate": 9.672498959157422e-05,
"loss": 0.2201,
"step": 1530
},
{
"epoch": 0.61735818801363,
"grad_norm": 0.6564059853553772,
"learning_rate": 9.664143560308263e-05,
"loss": 0.2075,
"step": 1540
},
{
"epoch": 0.6213670074163159,
"grad_norm": 0.573246419429779,
"learning_rate": 9.655686617606212e-05,
"loss": 0.2091,
"step": 1550
},
{
"epoch": 0.6253758268190018,
"grad_norm": 0.6015535593032837,
"learning_rate": 9.647128315168788e-05,
"loss": 0.2221,
"step": 1560
},
{
"epoch": 0.6293846462216877,
"grad_norm": 0.6874203085899353,
"learning_rate": 9.638468839320232e-05,
"loss": 0.213,
"step": 1570
},
{
"epoch": 0.6333934656243736,
"grad_norm": 0.5189663171768188,
"learning_rate": 9.629708378587445e-05,
"loss": 0.2161,
"step": 1580
},
{
"epoch": 0.6374022850270595,
"grad_norm": 0.571725070476532,
"learning_rate": 9.62084712369589e-05,
"loss": 0.2236,
"step": 1590
},
{
"epoch": 0.6414111044297455,
"grad_norm": 0.6262040734291077,
"learning_rate": 9.61188526756544e-05,
"loss": 0.2346,
"step": 1600
},
{
"epoch": 0.6454199238324313,
"grad_norm": 0.6156971454620361,
"learning_rate": 9.602823005306164e-05,
"loss": 0.2089,
"step": 1610
},
{
"epoch": 0.6494287432351172,
"grad_norm": 0.5515331625938416,
"learning_rate": 9.5936605342141e-05,
"loss": 0.2225,
"step": 1620
},
{
"epoch": 0.6534375626378032,
"grad_norm": 0.6686428785324097,
"learning_rate": 9.584398053766941e-05,
"loss": 0.2189,
"step": 1630
},
{
"epoch": 0.6574463820404891,
"grad_norm": 0.5298424959182739,
"learning_rate": 9.575035765619708e-05,
"loss": 0.2297,
"step": 1640
},
{
"epoch": 0.6614552014431749,
"grad_norm": 0.6391364932060242,
"learning_rate": 9.565573873600349e-05,
"loss": 0.2441,
"step": 1650
},
{
"epoch": 0.6654640208458609,
"grad_norm": 0.6574255228042603,
"learning_rate": 9.556012583705303e-05,
"loss": 0.2329,
"step": 1660
},
{
"epoch": 0.6694728402485468,
"grad_norm": 0.5856221914291382,
"learning_rate": 9.546352104095019e-05,
"loss": 0.2001,
"step": 1670
},
{
"epoch": 0.6734816596512327,
"grad_norm": 0.6181838512420654,
"learning_rate": 9.536592645089421e-05,
"loss": 0.2255,
"step": 1680
},
{
"epoch": 0.6774904790539186,
"grad_norm": 0.635492742061615,
"learning_rate": 9.52673441916333e-05,
"loss": 0.1973,
"step": 1690
},
{
"epoch": 0.6814992984566045,
"grad_norm": 0.6166744232177734,
"learning_rate": 9.51677764094184e-05,
"loss": 0.2248,
"step": 1700
},
{
"epoch": 0.6855081178592904,
"grad_norm": 0.6294150352478027,
"learning_rate": 9.506722527195639e-05,
"loss": 0.2123,
"step": 1710
},
{
"epoch": 0.6895169372619764,
"grad_norm": 0.6106050610542297,
"learning_rate": 9.496569296836301e-05,
"loss": 0.208,
"step": 1720
},
{
"epoch": 0.6935257566646622,
"grad_norm": 0.6652440428733826,
"learning_rate": 9.486318170911508e-05,
"loss": 0.2112,
"step": 1730
},
{
"epoch": 0.6975345760673481,
"grad_norm": 0.5508642792701721,
"learning_rate": 9.475969372600246e-05,
"loss": 0.2299,
"step": 1740
},
{
"epoch": 0.7015433954700341,
"grad_norm": 0.5851196050643921,
"learning_rate": 9.465523127207938e-05,
"loss": 0.2283,
"step": 1750
},
{
"epoch": 0.70555221487272,
"grad_norm": 0.6574164628982544,
"learning_rate": 9.454979662161547e-05,
"loss": 0.2149,
"step": 1760
},
{
"epoch": 0.7095610342754058,
"grad_norm": 0.562202513217926,
"learning_rate": 9.444339207004626e-05,
"loss": 0.2162,
"step": 1770
},
{
"epoch": 0.7135698536780918,
"grad_norm": 0.5654606223106384,
"learning_rate": 9.433601993392308e-05,
"loss": 0.2283,
"step": 1780
},
{
"epoch": 0.7175786730807777,
"grad_norm": 0.5194072127342224,
"learning_rate": 9.422768255086274e-05,
"loss": 0.2266,
"step": 1790
},
{
"epoch": 0.7215874924834637,
"grad_norm": 0.651335597038269,
"learning_rate": 9.411838227949663e-05,
"loss": 0.1999,
"step": 1800
},
{
"epoch": 0.7255963118861495,
"grad_norm": 0.6659877300262451,
"learning_rate": 9.400812149941932e-05,
"loss": 0.2148,
"step": 1810
},
{
"epoch": 0.7296051312888354,
"grad_norm": 0.6771412491798401,
"learning_rate": 9.389690261113672e-05,
"loss": 0.2233,
"step": 1820
},
{
"epoch": 0.7336139506915214,
"grad_norm": 0.8170326948165894,
"learning_rate": 9.378472803601397e-05,
"loss": 0.2282,
"step": 1830
},
{
"epoch": 0.7376227700942073,
"grad_norm": 0.6430959701538086,
"learning_rate": 9.36716002162226e-05,
"loss": 0.2036,
"step": 1840
},
{
"epoch": 0.7416315894968931,
"grad_norm": 0.6288866996765137,
"learning_rate": 9.355752161468731e-05,
"loss": 0.2223,
"step": 1850
},
{
"epoch": 0.745640408899579,
"grad_norm": 0.7772784233093262,
"learning_rate": 9.344249471503259e-05,
"loss": 0.2183,
"step": 1860
},
{
"epoch": 0.749649228302265,
"grad_norm": 0.6505045890808105,
"learning_rate": 9.332652202152833e-05,
"loss": 0.2126,
"step": 1870
},
{
"epoch": 0.7536580477049509,
"grad_norm": 0.5706261992454529,
"learning_rate": 9.320960605903553e-05,
"loss": 0.2107,
"step": 1880
},
{
"epoch": 0.7576668671076368,
"grad_norm": 0.5667653679847717,
"learning_rate": 9.309174937295126e-05,
"loss": 0.2036,
"step": 1890
},
{
"epoch": 0.7616756865103227,
"grad_norm": 0.6292815208435059,
"learning_rate": 9.297295452915323e-05,
"loss": 0.2038,
"step": 1900
},
{
"epoch": 0.7656845059130086,
"grad_norm": 0.6061923503875732,
"learning_rate": 9.285322411394393e-05,
"loss": 0.2183,
"step": 1910
},
{
"epoch": 0.7696933253156946,
"grad_norm": 0.7514089941978455,
"learning_rate": 9.273256073399434e-05,
"loss": 0.2135,
"step": 1920
},
{
"epoch": 0.7737021447183804,
"grad_norm": 0.6030351519584656,
"learning_rate": 9.261096701628718e-05,
"loss": 0.2098,
"step": 1930
},
{
"epoch": 0.7777109641210663,
"grad_norm": 0.7148683667182922,
"learning_rate": 9.248844560805969e-05,
"loss": 0.2085,
"step": 1940
},
{
"epoch": 0.7817197835237523,
"grad_norm": 0.7136949300765991,
"learning_rate": 9.236499917674606e-05,
"loss": 0.1998,
"step": 1950
},
{
"epoch": 0.7857286029264382,
"grad_norm": 0.7132196426391602,
"learning_rate": 9.224063040991924e-05,
"loss": 0.2082,
"step": 1960
},
{
"epoch": 0.789737422329124,
"grad_norm": 0.5503913164138794,
"learning_rate": 9.211534201523255e-05,
"loss": 0.2238,
"step": 1970
},
{
"epoch": 0.79374624173181,
"grad_norm": 0.7679104804992676,
"learning_rate": 9.198913672036072e-05,
"loss": 0.1971,
"step": 1980
},
{
"epoch": 0.7977550611344959,
"grad_norm": 0.9002260565757751,
"learning_rate": 9.186201727294036e-05,
"loss": 0.1998,
"step": 1990
},
{
"epoch": 0.8017638805371818,
"grad_norm": 0.5790923833847046,
"learning_rate": 9.173398644051035e-05,
"loss": 0.2113,
"step": 2000
},
{
"epoch": 0.8057726999398677,
"grad_norm": 0.6548293828964233,
"learning_rate": 9.160504701045145e-05,
"loss": 0.1969,
"step": 2010
},
{
"epoch": 0.8097815193425536,
"grad_norm": 0.6647776961326599,
"learning_rate": 9.147520178992563e-05,
"loss": 0.1979,
"step": 2020
},
{
"epoch": 0.8137903387452395,
"grad_norm": 0.6299743056297302,
"learning_rate": 9.134445360581503e-05,
"loss": 0.206,
"step": 2030
},
{
"epoch": 0.8177991581479255,
"grad_norm": 0.6221920847892761,
"learning_rate": 9.121280530466027e-05,
"loss": 0.1889,
"step": 2040
},
{
"epoch": 0.8218079775506113,
"grad_norm": 0.6568713784217834,
"learning_rate": 9.108025975259869e-05,
"loss": 0.2094,
"step": 2050
},
{
"epoch": 0.8258167969532972,
"grad_norm": 0.8146998882293701,
"learning_rate": 9.094681983530173e-05,
"loss": 0.2159,
"step": 2060
},
{
"epoch": 0.8298256163559832,
"grad_norm": 0.6871969103813171,
"learning_rate": 9.081248845791227e-05,
"loss": 0.1827,
"step": 2070
},
{
"epoch": 0.8338344357586691,
"grad_norm": 0.7771655321121216,
"learning_rate": 9.067726854498127e-05,
"loss": 0.1995,
"step": 2080
},
{
"epoch": 0.8378432551613549,
"grad_norm": 0.8692470192909241,
"learning_rate": 9.054116304040416e-05,
"loss": 0.202,
"step": 2090
},
{
"epoch": 0.8418520745640409,
"grad_norm": 0.5309840440750122,
"learning_rate": 9.040417490735676e-05,
"loss": 0.2159,
"step": 2100
},
{
"epoch": 0.8458608939667268,
"grad_norm": 0.6645334362983704,
"learning_rate": 9.026630712823072e-05,
"loss": 0.2175,
"step": 2110
},
{
"epoch": 0.8498697133694127,
"grad_norm": 0.613962709903717,
"learning_rate": 9.012756270456861e-05,
"loss": 0.2081,
"step": 2120
},
{
"epoch": 0.8538785327720986,
"grad_norm": 0.6764446496963501,
"learning_rate": 8.99879446569986e-05,
"loss": 0.213,
"step": 2130
},
{
"epoch": 0.8578873521747845,
"grad_norm": 0.6048487424850464,
"learning_rate": 8.984745602516865e-05,
"loss": 0.1879,
"step": 2140
},
{
"epoch": 0.8618961715774704,
"grad_norm": 0.5892179608345032,
"learning_rate": 8.970609986768035e-05,
"loss": 0.1827,
"step": 2150
},
{
"epoch": 0.8659049909801564,
"grad_norm": 0.7431573867797852,
"learning_rate": 8.956387926202234e-05,
"loss": 0.2055,
"step": 2160
},
{
"epoch": 0.8699138103828422,
"grad_norm": 0.6326702833175659,
"learning_rate": 8.942079730450335e-05,
"loss": 0.206,
"step": 2170
},
{
"epoch": 0.8739226297855281,
"grad_norm": 0.6847805976867676,
"learning_rate": 8.927685711018467e-05,
"loss": 0.2161,
"step": 2180
},
{
"epoch": 0.8779314491882141,
"grad_norm": 0.636877179145813,
"learning_rate": 8.913206181281248e-05,
"loss": 0.2014,
"step": 2190
},
{
"epoch": 0.8819402685909,
"grad_norm": 0.756361722946167,
"learning_rate": 8.89864145647495e-05,
"loss": 0.2063,
"step": 2200
},
{
"epoch": 0.8859490879935858,
"grad_norm": 0.5681055784225464,
"learning_rate": 8.883991853690646e-05,
"loss": 0.1997,
"step": 2210
},
{
"epoch": 0.8899579073962718,
"grad_norm": 0.6439403891563416,
"learning_rate": 8.869257691867296e-05,
"loss": 0.2029,
"step": 2220
},
{
"epoch": 0.8939667267989577,
"grad_norm": 0.6258695721626282,
"learning_rate": 8.854439291784813e-05,
"loss": 0.2062,
"step": 2230
},
{
"epoch": 0.8979755462016437,
"grad_norm": 0.6915255188941956,
"learning_rate": 8.839536976057075e-05,
"loss": 0.2008,
"step": 2240
},
{
"epoch": 0.9019843656043295,
"grad_norm": 0.7225965857505798,
"learning_rate": 8.824551069124898e-05,
"loss": 0.1915,
"step": 2250
},
{
"epoch": 0.9059931850070154,
"grad_norm": 0.784816563129425,
"learning_rate": 8.809481897248983e-05,
"loss": 0.1897,
"step": 2260
},
{
"epoch": 0.9100020044097014,
"grad_norm": 0.7496415972709656,
"learning_rate": 8.7943297885028e-05,
"loss": 0.198,
"step": 2270
},
{
"epoch": 0.9140108238123873,
"grad_norm": 0.6198856830596924,
"learning_rate": 8.779095072765453e-05,
"loss": 0.2055,
"step": 2280
},
{
"epoch": 0.9180196432150731,
"grad_norm": 0.6876329183578491,
"learning_rate": 8.763778081714498e-05,
"loss": 0.1969,
"step": 2290
},
{
"epoch": 0.9220284626177591,
"grad_norm": 0.7026522159576416,
"learning_rate": 8.748379148818722e-05,
"loss": 0.1811,
"step": 2300
},
{
"epoch": 0.926037282020445,
"grad_norm": 0.6701675057411194,
"learning_rate": 8.732898609330875e-05,
"loss": 0.1902,
"step": 2310
},
{
"epoch": 0.9300461014231309,
"grad_norm": 0.6713166236877441,
"learning_rate": 8.717336800280386e-05,
"loss": 0.2093,
"step": 2320
},
{
"epoch": 0.9340549208258168,
"grad_norm": 0.7247043251991272,
"learning_rate": 8.701694060466014e-05,
"loss": 0.1916,
"step": 2330
},
{
"epoch": 0.9380637402285027,
"grad_norm": 0.6550298929214478,
"learning_rate": 8.685970730448475e-05,
"loss": 0.2034,
"step": 2340
},
{
"epoch": 0.9420725596311886,
"grad_norm": 0.7075363397598267,
"learning_rate": 8.670167152543026e-05,
"loss": 0.1823,
"step": 2350
},
{
"epoch": 0.9460813790338746,
"grad_norm": 0.7122249603271484,
"learning_rate": 8.654283670812017e-05,
"loss": 0.1941,
"step": 2360
},
{
"epoch": 0.9500901984365604,
"grad_norm": 0.6687220335006714,
"learning_rate": 8.638320631057397e-05,
"loss": 0.1933,
"step": 2370
},
{
"epoch": 0.9540990178392463,
"grad_norm": 0.635455310344696,
"learning_rate": 8.622278380813186e-05,
"loss": 0.1967,
"step": 2380
},
{
"epoch": 0.9581078372419323,
"grad_norm": 0.7970702052116394,
"learning_rate": 8.606157269337906e-05,
"loss": 0.1901,
"step": 2390
},
{
"epoch": 0.9621166566446182,
"grad_norm": 0.7364137768745422,
"learning_rate": 8.589957647606988e-05,
"loss": 0.1945,
"step": 2400
},
{
"epoch": 0.966125476047304,
"grad_norm": 0.7844299674034119,
"learning_rate": 8.573679868305114e-05,
"loss": 0.1821,
"step": 2410
},
{
"epoch": 0.97013429544999,
"grad_norm": 0.8092600703239441,
"learning_rate": 8.557324285818552e-05,
"loss": 0.1934,
"step": 2420
},
{
"epoch": 0.9741431148526759,
"grad_norm": 0.66877281665802,
"learning_rate": 8.540891256227437e-05,
"loss": 0.2021,
"step": 2430
},
{
"epoch": 0.9781519342553618,
"grad_norm": 0.7711961269378662,
"learning_rate": 8.524381137298014e-05,
"loss": 0.1801,
"step": 2440
},
{
"epoch": 0.9821607536580477,
"grad_norm": 0.6817704439163208,
"learning_rate": 8.507794288474856e-05,
"loss": 0.1928,
"step": 2450
},
{
"epoch": 0.9861695730607336,
"grad_norm": 0.8401746153831482,
"learning_rate": 8.491131070873038e-05,
"loss": 0.1884,
"step": 2460
},
{
"epoch": 0.9901783924634195,
"grad_norm": 0.7808353900909424,
"learning_rate": 8.474391847270265e-05,
"loss": 0.1966,
"step": 2470
},
{
"epoch": 0.9941872118661055,
"grad_norm": 0.6367965340614319,
"learning_rate": 8.45757698209899e-05,
"loss": 0.1892,
"step": 2480
},
{
"epoch": 0.9981960312687913,
"grad_norm": 0.7107962369918823,
"learning_rate": 8.440686841438462e-05,
"loss": 0.1961,
"step": 2490
},
{
"epoch": 1.002004409701343,
"grad_norm": 0.593016505241394,
"learning_rate": 8.423721793006775e-05,
"loss": 0.1773,
"step": 2500
},
{
"epoch": 1.006013229104029,
"grad_norm": 0.7551445364952087,
"learning_rate": 8.406682206152845e-05,
"loss": 0.1733,
"step": 2510
},
{
"epoch": 1.0100220485067148,
"grad_norm": 0.8561877608299255,
"learning_rate": 8.389568451848382e-05,
"loss": 0.1594,
"step": 2520
},
{
"epoch": 1.0140308679094008,
"grad_norm": 0.8644078969955444,
"learning_rate": 8.372380902679804e-05,
"loss": 0.179,
"step": 2530
},
{
"epoch": 1.0180396873120865,
"grad_norm": 0.778167188167572,
"learning_rate": 8.355119932840129e-05,
"loss": 0.1616,
"step": 2540
},
{
"epoch": 1.0220485067147724,
"grad_norm": 0.7065404057502747,
"learning_rate": 8.337785918120837e-05,
"loss": 0.1768,
"step": 2550
},
{
"epoch": 1.0260573261174584,
"grad_norm": 0.8743630051612854,
"learning_rate": 8.320379235903668e-05,
"loss": 0.1687,
"step": 2560
},
{
"epoch": 1.0300661455201443,
"grad_norm": 1.0897860527038574,
"learning_rate": 8.302900265152427e-05,
"loss": 0.1558,
"step": 2570
},
{
"epoch": 1.0340749649228302,
"grad_norm": 0.7313379645347595,
"learning_rate": 8.285349386404722e-05,
"loss": 0.16,
"step": 2580
},
{
"epoch": 1.0380837843255162,
"grad_norm": 0.8040058016777039,
"learning_rate": 8.267726981763682e-05,
"loss": 0.1571,
"step": 2590
},
{
"epoch": 1.0420926037282021,
"grad_norm": 0.8637468218803406,
"learning_rate": 8.250033434889637e-05,
"loss": 0.16,
"step": 2600
},
{
"epoch": 1.046101423130888,
"grad_norm": 0.7505359053611755,
"learning_rate": 8.232269130991769e-05,
"loss": 0.1597,
"step": 2610
},
{
"epoch": 1.0501102425335738,
"grad_norm": 0.8430061340332031,
"learning_rate": 8.214434456819725e-05,
"loss": 0.1723,
"step": 2620
},
{
"epoch": 1.0541190619362597,
"grad_norm": 0.66597580909729,
"learning_rate": 8.196529800655188e-05,
"loss": 0.1751,
"step": 2630
},
{
"epoch": 1.0581278813389456,
"grad_norm": 0.8823577761650085,
"learning_rate": 8.178555552303437e-05,
"loss": 0.1701,
"step": 2640
},
{
"epoch": 1.0621367007416316,
"grad_norm": 0.9401513338088989,
"learning_rate": 8.160512103084851e-05,
"loss": 0.1564,
"step": 2650
},
{
"epoch": 1.0661455201443175,
"grad_norm": 0.7342818379402161,
"learning_rate": 8.142399845826394e-05,
"loss": 0.1507,
"step": 2660
},
{
"epoch": 1.0701543395470035,
"grad_norm": 0.8487102389335632,
"learning_rate": 8.12421917485306e-05,
"loss": 0.1633,
"step": 2670
},
{
"epoch": 1.0741631589496894,
"grad_norm": 0.8836720585823059,
"learning_rate": 8.105970485979295e-05,
"loss": 0.1682,
"step": 2680
},
{
"epoch": 1.0781719783523753,
"grad_norm": 0.6858396530151367,
"learning_rate": 8.087654176500366e-05,
"loss": 0.1723,
"step": 2690
},
{
"epoch": 1.082180797755061,
"grad_norm": 1.028981328010559,
"learning_rate": 8.069270645183722e-05,
"loss": 0.1555,
"step": 2700
},
{
"epoch": 1.086189617157747,
"grad_norm": 0.9475600719451904,
"learning_rate": 8.050820292260313e-05,
"loss": 0.1591,
"step": 2710
},
{
"epoch": 1.090198436560433,
"grad_norm": 0.683160126209259,
"learning_rate": 8.032303519415874e-05,
"loss": 0.1703,
"step": 2720
},
{
"epoch": 1.0942072559631189,
"grad_norm": 0.8751930594444275,
"learning_rate": 8.013720729782173e-05,
"loss": 0.1489,
"step": 2730
},
{
"epoch": 1.0982160753658048,
"grad_norm": 0.8032315373420715,
"learning_rate": 7.995072327928243e-05,
"loss": 0.1439,
"step": 2740
},
{
"epoch": 1.1022248947684907,
"grad_norm": 0.7631738185882568,
"learning_rate": 7.976358719851579e-05,
"loss": 0.1676,
"step": 2750
},
{
"epoch": 1.1062337141711767,
"grad_norm": 0.7207862734794617,
"learning_rate": 7.957580312969283e-05,
"loss": 0.1494,
"step": 2760
},
{
"epoch": 1.1102425335738626,
"grad_norm": 0.6857604384422302,
"learning_rate": 7.938737516109207e-05,
"loss": 0.1594,
"step": 2770
},
{
"epoch": 1.1142513529765483,
"grad_norm": 1.0340170860290527,
"learning_rate": 7.919830739501043e-05,
"loss": 0.1621,
"step": 2780
},
{
"epoch": 1.1182601723792343,
"grad_norm": 0.7190383672714233,
"learning_rate": 7.900860394767402e-05,
"loss": 0.1638,
"step": 2790
},
{
"epoch": 1.1222689917819202,
"grad_norm": 0.8485333919525146,
"learning_rate": 7.881826894914846e-05,
"loss": 0.1619,
"step": 2800
},
{
"epoch": 1.1262778111846061,
"grad_norm": 0.8466002345085144,
"learning_rate": 7.862730654324899e-05,
"loss": 0.1448,
"step": 2810
},
{
"epoch": 1.130286630587292,
"grad_norm": 0.7490071058273315,
"learning_rate": 7.843572088745019e-05,
"loss": 0.1649,
"step": 2820
},
{
"epoch": 1.134295449989978,
"grad_norm": 0.7291231751441956,
"learning_rate": 7.824351615279557e-05,
"loss": 0.1604,
"step": 2830
},
{
"epoch": 1.138304269392664,
"grad_norm": 1.1249662637710571,
"learning_rate": 7.80506965238067e-05,
"loss": 0.1383,
"step": 2840
},
{
"epoch": 1.1423130887953499,
"grad_norm": 0.8020785450935364,
"learning_rate": 7.785726619839212e-05,
"loss": 0.1565,
"step": 2850
},
{
"epoch": 1.1463219081980356,
"grad_norm": 0.9652583599090576,
"learning_rate": 7.766322938775589e-05,
"loss": 0.1513,
"step": 2860
},
{
"epoch": 1.1503307276007215,
"grad_norm": 0.8806086182594299,
"learning_rate": 7.746859031630605e-05,
"loss": 0.1607,
"step": 2870
},
{
"epoch": 1.1543395470034075,
"grad_norm": 0.9319799542427063,
"learning_rate": 7.72733532215625e-05,
"loss": 0.1588,
"step": 2880
},
{
"epoch": 1.1583483664060934,
"grad_norm": 0.9107722640037537,
"learning_rate": 7.707752235406485e-05,
"loss": 0.1445,
"step": 2890
},
{
"epoch": 1.1623571858087793,
"grad_norm": 0.9413526654243469,
"learning_rate": 7.688110197727975e-05,
"loss": 0.1589,
"step": 2900
},
{
"epoch": 1.1663660052114653,
"grad_norm": 0.9594728350639343,
"learning_rate": 7.668409636750828e-05,
"loss": 0.1584,
"step": 2910
},
{
"epoch": 1.1703748246141512,
"grad_norm": 0.7484379410743713,
"learning_rate": 7.648650981379264e-05,
"loss": 0.1582,
"step": 2920
},
{
"epoch": 1.174383644016837,
"grad_norm": 0.840965747833252,
"learning_rate": 7.628834661782288e-05,
"loss": 0.1563,
"step": 2930
},
{
"epoch": 1.1783924634195229,
"grad_norm": 0.7504467368125916,
"learning_rate": 7.608961109384321e-05,
"loss": 0.145,
"step": 2940
},
{
"epoch": 1.1824012828222088,
"grad_norm": 1.0212056636810303,
"learning_rate": 7.589030756855813e-05,
"loss": 0.1562,
"step": 2950
},
{
"epoch": 1.1864101022248947,
"grad_norm": 0.9360294342041016,
"learning_rate": 7.569044038103813e-05,
"loss": 0.156,
"step": 2960
},
{
"epoch": 1.1904189216275807,
"grad_norm": 0.649131178855896,
"learning_rate": 7.549001388262535e-05,
"loss": 0.1713,
"step": 2970
},
{
"epoch": 1.1944277410302666,
"grad_norm": 1.106505274772644,
"learning_rate": 7.528903243683874e-05,
"loss": 0.1475,
"step": 2980
},
{
"epoch": 1.1984365604329525,
"grad_norm": 0.8083673119544983,
"learning_rate": 7.508750041927914e-05,
"loss": 0.1512,
"step": 2990
},
{
"epoch": 1.2024453798356385,
"grad_norm": 0.7395840287208557,
"learning_rate": 7.488542221753394e-05,
"loss": 0.1481,
"step": 3000
},
{
"epoch": 1.2064541992383244,
"grad_norm": 0.923462986946106,
"learning_rate": 7.46828022310816e-05,
"loss": 0.1537,
"step": 3010
},
{
"epoch": 1.2104630186410101,
"grad_norm": 0.8510660529136658,
"learning_rate": 7.44796448711959e-05,
"loss": 0.1525,
"step": 3020
},
{
"epoch": 1.214471838043696,
"grad_norm": 0.881767749786377,
"learning_rate": 7.427595456084981e-05,
"loss": 0.1641,
"step": 3030
},
{
"epoch": 1.218480657446382,
"grad_norm": 0.8366743326187134,
"learning_rate": 7.407173573461934e-05,
"loss": 0.1502,
"step": 3040
},
{
"epoch": 1.222489476849068,
"grad_norm": 0.8755321502685547,
"learning_rate": 7.386699283858683e-05,
"loss": 0.1495,
"step": 3050
},
{
"epoch": 1.2264982962517539,
"grad_norm": 0.841222882270813,
"learning_rate": 7.366173033024428e-05,
"loss": 0.1423,
"step": 3060
},
{
"epoch": 1.2305071156544398,
"grad_norm": 0.8285235166549683,
"learning_rate": 7.345595267839621e-05,
"loss": 0.1632,
"step": 3070
},
{
"epoch": 1.2345159350571258,
"grad_norm": 0.764156699180603,
"learning_rate": 7.324966436306246e-05,
"loss": 0.1466,
"step": 3080
},
{
"epoch": 1.2385247544598115,
"grad_norm": 1.1134533882141113,
"learning_rate": 7.30428698753806e-05,
"loss": 0.1393,
"step": 3090
},
{
"epoch": 1.2425335738624974,
"grad_norm": 0.8127875328063965,
"learning_rate": 7.283557371750813e-05,
"loss": 0.1597,
"step": 3100
},
{
"epoch": 1.2465423932651833,
"grad_norm": 0.8257074356079102,
"learning_rate": 7.262778040252455e-05,
"loss": 0.1659,
"step": 3110
},
{
"epoch": 1.2505512126678693,
"grad_norm": 0.7807098031044006,
"learning_rate": 7.2419494454333e-05,
"loss": 0.1476,
"step": 3120
},
{
"epoch": 1.2545600320705552,
"grad_norm": 0.7114003300666809,
"learning_rate": 7.221072040756188e-05,
"loss": 0.1467,
"step": 3130
},
{
"epoch": 1.2585688514732412,
"grad_norm": 0.7870392203330994,
"learning_rate": 7.2001462807466e-05,
"loss": 0.1471,
"step": 3140
},
{
"epoch": 1.262577670875927,
"grad_norm": 0.6909427046775818,
"learning_rate": 7.179172620982774e-05,
"loss": 0.1575,
"step": 3150
},
{
"epoch": 1.266586490278613,
"grad_norm": 0.8754594922065735,
"learning_rate": 7.158151518085776e-05,
"loss": 0.155,
"step": 3160
},
{
"epoch": 1.270595309681299,
"grad_norm": 0.7454276084899902,
"learning_rate": 7.137083429709573e-05,
"loss": 0.1431,
"step": 3170
},
{
"epoch": 1.274604129083985,
"grad_norm": 0.9142866134643555,
"learning_rate": 7.115968814531052e-05,
"loss": 0.1342,
"step": 3180
},
{
"epoch": 1.2786129484866706,
"grad_norm": 0.8666753768920898,
"learning_rate": 7.09480813224005e-05,
"loss": 0.142,
"step": 3190
},
{
"epoch": 1.2826217678893566,
"grad_norm": 0.8461101651191711,
"learning_rate": 7.073601843529333e-05,
"loss": 0.1396,
"step": 3200
},
{
"epoch": 1.2866305872920425,
"grad_norm": 0.8602980375289917,
"learning_rate": 7.052350410084574e-05,
"loss": 0.1435,
"step": 3210
},
{
"epoch": 1.2906394066947284,
"grad_norm": 1.0527535676956177,
"learning_rate": 7.031054294574303e-05,
"loss": 0.1474,
"step": 3220
},
{
"epoch": 1.2946482260974144,
"grad_norm": 0.84455806016922,
"learning_rate": 7.009713960639826e-05,
"loss": 0.1565,
"step": 3230
},
{
"epoch": 1.2986570455001,
"grad_norm": 0.7223050594329834,
"learning_rate": 6.98832987288514e-05,
"loss": 0.1482,
"step": 3240
},
{
"epoch": 1.302665864902786,
"grad_norm": 0.8750767111778259,
"learning_rate": 6.966902496866807e-05,
"loss": 0.1611,
"step": 3250
},
{
"epoch": 1.306674684305472,
"grad_norm": 0.7444009184837341,
"learning_rate": 6.945432299083834e-05,
"loss": 0.1647,
"step": 3260
},
{
"epoch": 1.310683503708158,
"grad_norm": 1.013881802558899,
"learning_rate": 6.9239197469675e-05,
"loss": 0.1412,
"step": 3270
},
{
"epoch": 1.3146923231108438,
"grad_norm": 0.8479213118553162,
"learning_rate": 6.902365308871193e-05,
"loss": 0.1369,
"step": 3280
},
{
"epoch": 1.3187011425135298,
"grad_norm": 0.8772777318954468,
"learning_rate": 6.880769454060201e-05,
"loss": 0.1501,
"step": 3290
},
{
"epoch": 1.3227099619162157,
"grad_norm": 0.8388547301292419,
"learning_rate": 6.859132652701514e-05,
"loss": 0.1402,
"step": 3300
},
{
"epoch": 1.3267187813189016,
"grad_norm": 0.8197916746139526,
"learning_rate": 6.837455375853561e-05,
"loss": 0.1351,
"step": 3310
},
{
"epoch": 1.3307276007215876,
"grad_norm": 0.9061885476112366,
"learning_rate": 6.815738095455984e-05,
"loss": 0.139,
"step": 3320
},
{
"epoch": 1.3347364201242735,
"grad_norm": 0.721653938293457,
"learning_rate": 6.793981284319339e-05,
"loss": 0.1556,
"step": 3330
},
{
"epoch": 1.3387452395269592,
"grad_norm": 0.9494278430938721,
"learning_rate": 6.772185416114814e-05,
"loss": 0.1423,
"step": 3340
},
{
"epoch": 1.3427540589296452,
"grad_norm": 0.8513092994689941,
"learning_rate": 6.750350965363919e-05,
"loss": 0.1393,
"step": 3350
},
{
"epoch": 1.346762878332331,
"grad_norm": 0.8258860111236572,
"learning_rate": 6.728478407428151e-05,
"loss": 0.146,
"step": 3360
},
{
"epoch": 1.350771697735017,
"grad_norm": 0.8146616220474243,
"learning_rate": 6.706568218498639e-05,
"loss": 0.148,
"step": 3370
},
{
"epoch": 1.354780517137703,
"grad_norm": 0.9726580381393433,
"learning_rate": 6.684620875585787e-05,
"loss": 0.1404,
"step": 3380
},
{
"epoch": 1.358789336540389,
"grad_norm": 1.0220385789871216,
"learning_rate": 6.662636856508887e-05,
"loss": 0.1504,
"step": 3390
},
{
"epoch": 1.3627981559430746,
"grad_norm": 0.9221115708351135,
"learning_rate": 6.640616639885708e-05,
"loss": 0.1407,
"step": 3400
},
{
"epoch": 1.3668069753457606,
"grad_norm": 0.9321884512901306,
"learning_rate": 6.618560705122086e-05,
"loss": 0.1286,
"step": 3410
},
{
"epoch": 1.3708157947484465,
"grad_norm": 0.8789135217666626,
"learning_rate": 6.596469532401483e-05,
"loss": 0.1478,
"step": 3420
},
{
"epoch": 1.3748246141511324,
"grad_norm": 0.8220512270927429,
"learning_rate": 6.574343602674528e-05,
"loss": 0.1439,
"step": 3430
},
{
"epoch": 1.3788334335538184,
"grad_norm": 1.0369560718536377,
"learning_rate": 6.552183397648555e-05,
"loss": 0.1323,
"step": 3440
},
{
"epoch": 1.3828422529565043,
"grad_norm": 1.0133991241455078,
"learning_rate": 6.529989399777109e-05,
"loss": 0.1472,
"step": 3450
},
{
"epoch": 1.3868510723591903,
"grad_norm": 0.9306389093399048,
"learning_rate": 6.507762092249448e-05,
"loss": 0.1446,
"step": 3460
},
{
"epoch": 1.3908598917618762,
"grad_norm": 1.021039366722107,
"learning_rate": 6.485501958980016e-05,
"loss": 0.1341,
"step": 3470
},
{
"epoch": 1.3948687111645621,
"grad_norm": 0.7612369656562805,
"learning_rate": 6.463209484597913e-05,
"loss": 0.1437,
"step": 3480
},
{
"epoch": 1.398877530567248,
"grad_norm": 0.7720378041267395,
"learning_rate": 6.440885154436344e-05,
"loss": 0.1184,
"step": 3490
},
{
"epoch": 1.4028863499699338,
"grad_norm": 0.9269343614578247,
"learning_rate": 6.418529454522051e-05,
"loss": 0.1474,
"step": 3500
},
{
"epoch": 1.4068951693726197,
"grad_norm": 0.8597378730773926,
"learning_rate": 6.396142871564731e-05,
"loss": 0.1395,
"step": 3510
},
{
"epoch": 1.4109039887753057,
"grad_norm": 0.9362756013870239,
"learning_rate": 6.373725892946443e-05,
"loss": 0.1476,
"step": 3520
},
{
"epoch": 1.4149128081779916,
"grad_norm": 0.8636417388916016,
"learning_rate": 6.351279006710994e-05,
"loss": 0.1333,
"step": 3530
},
{
"epoch": 1.4189216275806775,
"grad_norm": 0.9320933818817139,
"learning_rate": 6.328802701553313e-05,
"loss": 0.1464,
"step": 3540
},
{
"epoch": 1.4229304469833635,
"grad_norm": 1.1692008972167969,
"learning_rate": 6.306297466808818e-05,
"loss": 0.1515,
"step": 3550
},
{
"epoch": 1.4269392663860492,
"grad_norm": 0.7800849676132202,
"learning_rate": 6.283763792442751e-05,
"loss": 0.1414,
"step": 3560
},
{
"epoch": 1.4309480857887351,
"grad_norm": 1.0798330307006836,
"learning_rate": 6.261202169039526e-05,
"loss": 0.1478,
"step": 3570
},
{
"epoch": 1.434956905191421,
"grad_norm": 0.8681895136833191,
"learning_rate": 6.23861308779203e-05,
"loss": 0.1413,
"step": 3580
},
{
"epoch": 1.438965724594107,
"grad_norm": 1.3371766805648804,
"learning_rate": 6.21599704049095e-05,
"loss": 0.132,
"step": 3590
},
{
"epoch": 1.442974543996793,
"grad_norm": 0.923513650894165,
"learning_rate": 6.19335451951405e-05,
"loss": 0.1435,
"step": 3600
},
{
"epoch": 1.4469833633994789,
"grad_norm": 0.9107206463813782,
"learning_rate": 6.170686017815456e-05,
"loss": 0.1219,
"step": 3610
},
{
"epoch": 1.4509921828021648,
"grad_norm": 0.9753092527389526,
"learning_rate": 6.147992028914926e-05,
"loss": 0.1426,
"step": 3620
},
{
"epoch": 1.4550010022048507,
"grad_norm": 0.9150570631027222,
"learning_rate": 6.125273046887106e-05,
"loss": 0.1342,
"step": 3630
},
{
"epoch": 1.4590098216075367,
"grad_norm": 1.0572060346603394,
"learning_rate": 6.10252956635077e-05,
"loss": 0.1274,
"step": 3640
},
{
"epoch": 1.4630186410102226,
"grad_norm": 0.7989734411239624,
"learning_rate": 6.079762082458049e-05,
"loss": 0.1385,
"step": 3650
},
{
"epoch": 1.4670274604129083,
"grad_norm": 0.8875731229782104,
"learning_rate": 6.056971090883665e-05,
"loss": 0.1413,
"step": 3660
},
{
"epoch": 1.4710362798155943,
"grad_norm": 0.9534810185432434,
"learning_rate": 6.0341570878141184e-05,
"loss": 0.1267,
"step": 3670
},
{
"epoch": 1.4750450992182802,
"grad_norm": 0.7729069590568542,
"learning_rate": 6.0113205699369056e-05,
"loss": 0.1469,
"step": 3680
},
{
"epoch": 1.4790539186209661,
"grad_norm": 0.6528967022895813,
"learning_rate": 5.988462034429692e-05,
"loss": 0.1314,
"step": 3690
},
{
"epoch": 1.483062738023652,
"grad_norm": 1.0471932888031006,
"learning_rate": 5.965581978949494e-05,
"loss": 0.1294,
"step": 3700
},
{
"epoch": 1.487071557426338,
"grad_norm": 0.8370137810707092,
"learning_rate": 5.942680901621842e-05,
"loss": 0.1507,
"step": 3710
},
{
"epoch": 1.4910803768290237,
"grad_norm": 0.7025067210197449,
"learning_rate": 5.9197593010299377e-05,
"loss": 0.1386,
"step": 3720
},
{
"epoch": 1.4950891962317097,
"grad_norm": 0.9664121866226196,
"learning_rate": 5.8968176762037985e-05,
"loss": 0.145,
"step": 3730
},
{
"epoch": 1.4990980156343956,
"grad_norm": 0.8898931741714478,
"learning_rate": 5.87385652660939e-05,
"loss": 0.1386,
"step": 3740
},
{
"epoch": 1.5031068350370815,
"grad_norm": 0.750616192817688,
"learning_rate": 5.850876352137759e-05,
"loss": 0.153,
"step": 3750
},
{
"epoch": 1.5071156544397675,
"grad_norm": 1.0957409143447876,
"learning_rate": 5.827877653094144e-05,
"loss": 0.1329,
"step": 3760
},
{
"epoch": 1.5111244738424534,
"grad_norm": 0.8789597749710083,
"learning_rate": 5.8048609301870816e-05,
"loss": 0.1329,
"step": 3770
},
{
"epoch": 1.5151332932451393,
"grad_norm": 0.7944477200508118,
"learning_rate": 5.781826684517515e-05,
"loss": 0.1256,
"step": 3780
},
{
"epoch": 1.5191421126478253,
"grad_norm": 0.8657981753349304,
"learning_rate": 5.758775417567878e-05,
"loss": 0.1266,
"step": 3790
},
{
"epoch": 1.5231509320505112,
"grad_norm": 0.8267760276794434,
"learning_rate": 5.73570763119117e-05,
"loss": 0.1269,
"step": 3800
},
{
"epoch": 1.5271597514531972,
"grad_norm": 0.9449699521064758,
"learning_rate": 5.7126238276000474e-05,
"loss": 0.1331,
"step": 3810
},
{
"epoch": 1.531168570855883,
"grad_norm": 1.0582398176193237,
"learning_rate": 5.689524509355873e-05,
"loss": 0.1277,
"step": 3820
},
{
"epoch": 1.5351773902585688,
"grad_norm": 0.8139535784721375,
"learning_rate": 5.6664101793577865e-05,
"loss": 0.1275,
"step": 3830
},
{
"epoch": 1.5391862096612547,
"grad_norm": 0.7074098587036133,
"learning_rate": 5.643281340831745e-05,
"loss": 0.1307,
"step": 3840
},
{
"epoch": 1.5431950290639407,
"grad_norm": 0.858897864818573,
"learning_rate": 5.6201384973195825e-05,
"loss": 0.1296,
"step": 3850
},
{
"epoch": 1.5472038484666266,
"grad_norm": 0.984902560710907,
"learning_rate": 5.596982152668029e-05,
"loss": 0.1315,
"step": 3860
},
{
"epoch": 1.5512126678693123,
"grad_norm": 0.9450563192367554,
"learning_rate": 5.5738128110177523e-05,
"loss": 0.1275,
"step": 3870
},
{
"epoch": 1.5552214872719983,
"grad_norm": 1.13248610496521,
"learning_rate": 5.550630976792385e-05,
"loss": 0.1364,
"step": 3880
},
{
"epoch": 1.5592303066746842,
"grad_norm": 0.9023851752281189,
"learning_rate": 5.5274371546875304e-05,
"loss": 0.1262,
"step": 3890
},
{
"epoch": 1.5632391260773701,
"grad_norm": 0.9542123079299927,
"learning_rate": 5.5042318496597876e-05,
"loss": 0.1398,
"step": 3900
},
{
"epoch": 1.567247945480056,
"grad_norm": 0.8645676374435425,
"learning_rate": 5.4810155669157495e-05,
"loss": 0.1356,
"step": 3910
},
{
"epoch": 1.571256764882742,
"grad_norm": 0.8348353505134583,
"learning_rate": 5.457788811901008e-05,
"loss": 0.1431,
"step": 3920
},
{
"epoch": 1.575265584285428,
"grad_norm": 0.8592683672904968,
"learning_rate": 5.434552090289145e-05,
"loss": 0.1243,
"step": 3930
},
{
"epoch": 1.579274403688114,
"grad_norm": 0.9037445187568665,
"learning_rate": 5.411305907970734e-05,
"loss": 0.1201,
"step": 3940
},
{
"epoch": 1.5832832230907998,
"grad_norm": 0.7110516428947449,
"learning_rate": 5.3880507710423134e-05,
"loss": 0.1331,
"step": 3950
},
{
"epoch": 1.5872920424934858,
"grad_norm": 0.8847816586494446,
"learning_rate": 5.3647871857953735e-05,
"loss": 0.1224,
"step": 3960
},
{
"epoch": 1.5913008618961717,
"grad_norm": 0.9340296983718872,
"learning_rate": 5.341515658705339e-05,
"loss": 0.1315,
"step": 3970
},
{
"epoch": 1.5953096812988576,
"grad_norm": 0.9499775767326355,
"learning_rate": 5.318236696420534e-05,
"loss": 0.1338,
"step": 3980
},
{
"epoch": 1.5993185007015434,
"grad_norm": 0.9325523972511292,
"learning_rate": 5.294950805751158e-05,
"loss": 0.1277,
"step": 3990
},
{
"epoch": 1.6033273201042293,
"grad_norm": 0.9514039158821106,
"learning_rate": 5.271658493658245e-05,
"loss": 0.1287,
"step": 4000
},
{
"epoch": 1.6073361395069152,
"grad_norm": 1.022368311882019,
"learning_rate": 5.248360267242637e-05,
"loss": 0.1363,
"step": 4010
},
{
"epoch": 1.6113449589096012,
"grad_norm": 0.8409161567687988,
"learning_rate": 5.2250566337339326e-05,
"loss": 0.1341,
"step": 4020
},
{
"epoch": 1.6153537783122869,
"grad_norm": 1.0613347291946411,
"learning_rate": 5.201748100479452e-05,
"loss": 0.1329,
"step": 4030
},
{
"epoch": 1.6193625977149728,
"grad_norm": 0.8661359548568726,
"learning_rate": 5.178435174933188e-05,
"loss": 0.119,
"step": 4040
},
{
"epoch": 1.6233714171176588,
"grad_norm": 0.9642584919929504,
"learning_rate": 5.15511836464476e-05,
"loss": 0.1279,
"step": 4050
},
{
"epoch": 1.6273802365203447,
"grad_norm": 0.9616632461547852,
"learning_rate": 5.131798177248357e-05,
"loss": 0.1294,
"step": 4060
},
{
"epoch": 1.6313890559230306,
"grad_norm": 1.1416373252868652,
"learning_rate": 5.108475120451702e-05,
"loss": 0.1394,
"step": 4070
},
{
"epoch": 1.6353978753257166,
"grad_norm": 0.9488154649734497,
"learning_rate": 5.085149702024977e-05,
"loss": 0.1222,
"step": 4080
},
{
"epoch": 1.6394066947284025,
"grad_norm": 1.030707597732544,
"learning_rate": 5.061822429789788e-05,
"loss": 0.1304,
"step": 4090
},
{
"epoch": 1.6434155141310884,
"grad_norm": 1.0803980827331543,
"learning_rate": 5.038493811608095e-05,
"loss": 0.1326,
"step": 4100
},
{
"epoch": 1.6474243335337744,
"grad_norm": 0.8971238136291504,
"learning_rate": 5.015164355371164e-05,
"loss": 0.1163,
"step": 4110
},
{
"epoch": 1.6514331529364603,
"grad_norm": 0.7943403124809265,
"learning_rate": 4.9918345689885035e-05,
"loss": 0.1268,
"step": 4120
},
{
"epoch": 1.6554419723391463,
"grad_norm": 1.109113097190857,
"learning_rate": 4.968504960376815e-05,
"loss": 0.1289,
"step": 4130
},
{
"epoch": 1.6594507917418322,
"grad_norm": 1.1698325872421265,
"learning_rate": 4.945176037448923e-05,
"loss": 0.1138,
"step": 4140
},
{
"epoch": 1.663459611144518,
"grad_norm": 1.1132344007492065,
"learning_rate": 4.9218483081027284e-05,
"loss": 0.1244,
"step": 4150
},
{
"epoch": 1.6674684305472038,
"grad_norm": 0.8619892001152039,
"learning_rate": 4.8985222802101475e-05,
"loss": 0.1296,
"step": 4160
},
{
"epoch": 1.6714772499498898,
"grad_norm": 1.010392427444458,
"learning_rate": 4.875198461606047e-05,
"loss": 0.1307,
"step": 4170
},
{
"epoch": 1.6754860693525757,
"grad_norm": 0.8872926831245422,
"learning_rate": 4.851877360077203e-05,
"loss": 0.1241,
"step": 4180
},
{
"epoch": 1.6794948887552614,
"grad_norm": 1.035994052886963,
"learning_rate": 4.828559483351233e-05,
"loss": 0.112,
"step": 4190
},
{
"epoch": 1.6835037081579474,
"grad_norm": 1.1755554676055908,
"learning_rate": 4.805245339085548e-05,
"loss": 0.1198,
"step": 4200
},
{
"epoch": 1.6875125275606333,
"grad_norm": 1.008541226387024,
"learning_rate": 4.781935434856299e-05,
"loss": 0.1348,
"step": 4210
},
{
"epoch": 1.6915213469633192,
"grad_norm": 1.0429742336273193,
"learning_rate": 4.758630278147327e-05,
"loss": 0.1205,
"step": 4220
},
{
"epoch": 1.6955301663660052,
"grad_norm": 0.8936703205108643,
"learning_rate": 4.735330376339111e-05,
"loss": 0.119,
"step": 4230
},
{
"epoch": 1.6995389857686911,
"grad_norm": 0.9886868596076965,
"learning_rate": 4.712036236697728e-05,
"loss": 0.1084,
"step": 4240
},
{
"epoch": 1.703547805171377,
"grad_norm": 0.9149814248085022,
"learning_rate": 4.6887483663638084e-05,
"loss": 0.1303,
"step": 4250
},
{
"epoch": 1.707556624574063,
"grad_norm": 0.9031015634536743,
"learning_rate": 4.665467272341484e-05,
"loss": 0.109,
"step": 4260
},
{
"epoch": 1.711565443976749,
"grad_norm": 1.041288137435913,
"learning_rate": 4.6421934614873654e-05,
"loss": 0.1246,
"step": 4270
},
{
"epoch": 1.7155742633794349,
"grad_norm": 0.9827173352241516,
"learning_rate": 4.6189274404994984e-05,
"loss": 0.1252,
"step": 4280
},
{
"epoch": 1.7195830827821208,
"grad_norm": 1.0415915250778198,
"learning_rate": 4.595669715906333e-05,
"loss": 0.1122,
"step": 4290
},
{
"epoch": 1.7235919021848067,
"grad_norm": 1.0126681327819824,
"learning_rate": 4.572420794055698e-05,
"loss": 0.1213,
"step": 4300
},
{
"epoch": 1.7276007215874924,
"grad_norm": 0.9639745354652405,
"learning_rate": 4.549181181103778e-05,
"loss": 0.1279,
"step": 4310
},
{
"epoch": 1.7316095409901784,
"grad_norm": 1.1144078969955444,
"learning_rate": 4.5259513830040875e-05,
"loss": 0.1189,
"step": 4320
},
{
"epoch": 1.7356183603928643,
"grad_norm": 1.139124870300293,
"learning_rate": 4.502731905496463e-05,
"loss": 0.1112,
"step": 4330
},
{
"epoch": 1.7396271797955503,
"grad_norm": 1.0518343448638916,
"learning_rate": 4.479523254096055e-05,
"loss": 0.1321,
"step": 4340
},
{
"epoch": 1.743635999198236,
"grad_norm": 0.7808403968811035,
"learning_rate": 4.456325934082302e-05,
"loss": 0.1391,
"step": 4350
},
{
"epoch": 1.747644818600922,
"grad_norm": 1.047770619392395,
"learning_rate": 4.433140450487962e-05,
"loss": 0.1302,
"step": 4360
},
{
"epoch": 1.7516536380036078,
"grad_norm": 0.9837223291397095,
"learning_rate": 4.409967308088091e-05,
"loss": 0.1193,
"step": 4370
},
{
"epoch": 1.7556624574062938,
"grad_norm": 1.0093597173690796,
"learning_rate": 4.3868070113890626e-05,
"loss": 0.1163,
"step": 4380
},
{
"epoch": 1.7596712768089797,
"grad_norm": 1.1313358545303345,
"learning_rate": 4.36366006461759e-05,
"loss": 0.1274,
"step": 4390
},
{
"epoch": 1.7636800962116657,
"grad_norm": 0.9579795598983765,
"learning_rate": 4.340526971709735e-05,
"loss": 0.1103,
"step": 4400
},
{
"epoch": 1.7676889156143516,
"grad_norm": 1.0444706678390503,
"learning_rate": 4.317408236299952e-05,
"loss": 0.1121,
"step": 4410
},
{
"epoch": 1.7716977350170375,
"grad_norm": 0.9483968019485474,
"learning_rate": 4.2943043617101134e-05,
"loss": 0.1086,
"step": 4420
},
{
"epoch": 1.7757065544197235,
"grad_norm": 1.0954207181930542,
"learning_rate": 4.2712158509385495e-05,
"loss": 0.1166,
"step": 4430
},
{
"epoch": 1.7797153738224094,
"grad_norm": 1.169009804725647,
"learning_rate": 4.2481432066491114e-05,
"loss": 0.1164,
"step": 4440
},
{
"epoch": 1.7837241932250953,
"grad_norm": 0.9690777063369751,
"learning_rate": 4.2250869311602124e-05,
"loss": 0.1237,
"step": 4450
},
{
"epoch": 1.7877330126277813,
"grad_norm": 1.0763111114501953,
"learning_rate": 4.2020475264338966e-05,
"loss": 0.1382,
"step": 4460
},
{
"epoch": 1.791741832030467,
"grad_norm": 0.924728274345398,
"learning_rate": 4.179025494064916e-05,
"loss": 0.104,
"step": 4470
},
{
"epoch": 1.795750651433153,
"grad_norm": 0.9748139977455139,
"learning_rate": 4.156021335269806e-05,
"loss": 0.1071,
"step": 4480
},
{
"epoch": 1.7997594708358389,
"grad_norm": 1.1556870937347412,
"learning_rate": 4.133035550875968e-05,
"loss": 0.1137,
"step": 4490
},
{
"epoch": 1.8037682902385248,
"grad_norm": 1.1552350521087646,
"learning_rate": 4.110068641310775e-05,
"loss": 0.1207,
"step": 4500
},
{
"epoch": 1.8077771096412105,
"grad_norm": 1.115271806716919,
"learning_rate": 4.0871211065906786e-05,
"loss": 0.1205,
"step": 4510
},
{
"epoch": 1.8117859290438965,
"grad_norm": 0.9051127433776855,
"learning_rate": 4.0641934463103054e-05,
"loss": 0.1123,
"step": 4520
},
{
"epoch": 1.8157947484465824,
"grad_norm": 1.0964293479919434,
"learning_rate": 4.0412861596316013e-05,
"loss": 0.1092,
"step": 4530
},
{
"epoch": 1.8198035678492683,
"grad_norm": 1.308677315711975,
"learning_rate": 4.0183997452729534e-05,
"loss": 0.1182,
"step": 4540
},
{
"epoch": 1.8238123872519543,
"grad_norm": 0.9863505959510803,
"learning_rate": 3.99553470149833e-05,
"loss": 0.1138,
"step": 4550
},
{
"epoch": 1.8278212066546402,
"grad_norm": 0.9477949142456055,
"learning_rate": 3.9726915261064426e-05,
"loss": 0.123,
"step": 4560
},
{
"epoch": 1.8318300260573261,
"grad_norm": 1.130746841430664,
"learning_rate": 3.9498707164198984e-05,
"loss": 0.1096,
"step": 4570
},
{
"epoch": 1.835838845460012,
"grad_norm": 1.0901241302490234,
"learning_rate": 3.927072769274377e-05,
"loss": 0.1062,
"step": 4580
},
{
"epoch": 1.839847664862698,
"grad_norm": 0.79862380027771,
"learning_rate": 3.904298181007817e-05,
"loss": 0.1117,
"step": 4590
},
{
"epoch": 1.843856484265384,
"grad_norm": 0.8396957516670227,
"learning_rate": 3.881547447449606e-05,
"loss": 0.1247,
"step": 4600
},
{
"epoch": 1.84786530366807,
"grad_norm": 1.0613499879837036,
"learning_rate": 3.858821063909782e-05,
"loss": 0.1101,
"step": 4610
},
{
"epoch": 1.8518741230707558,
"grad_norm": 1.147533655166626,
"learning_rate": 3.8361195251682614e-05,
"loss": 0.1141,
"step": 4620
},
{
"epoch": 1.8558829424734415,
"grad_norm": 1.1135718822479248,
"learning_rate": 3.8134433254640576e-05,
"loss": 0.1266,
"step": 4630
},
{
"epoch": 1.8598917618761275,
"grad_norm": 1.0798869132995605,
"learning_rate": 3.790792958484522e-05,
"loss": 0.1132,
"step": 4640
},
{
"epoch": 1.8639005812788134,
"grad_norm": 0.9285503029823303,
"learning_rate": 3.7681689173545984e-05,
"loss": 0.1059,
"step": 4650
},
{
"epoch": 1.8679094006814991,
"grad_norm": 1.1934738159179688,
"learning_rate": 3.745571694626088e-05,
"loss": 0.1013,
"step": 4660
},
{
"epoch": 1.871918220084185,
"grad_norm": 1.0734087228775024,
"learning_rate": 3.7230017822669204e-05,
"loss": 0.1056,
"step": 4670
},
{
"epoch": 1.875927039486871,
"grad_norm": 0.9423579573631287,
"learning_rate": 3.700459671650452e-05,
"loss": 0.1193,
"step": 4680
},
{
"epoch": 1.879935858889557,
"grad_norm": 0.9041392803192139,
"learning_rate": 3.677945853544755e-05,
"loss": 0.1098,
"step": 4690
},
{
"epoch": 1.8839446782922429,
"grad_norm": 1.1040509939193726,
"learning_rate": 3.6554608181019465e-05,
"loss": 0.1195,
"step": 4700
},
{
"epoch": 1.8879534976949288,
"grad_norm": 1.2079628705978394,
"learning_rate": 3.633005054847514e-05,
"loss": 0.12,
"step": 4710
},
{
"epoch": 1.8919623170976148,
"grad_norm": 0.9661321640014648,
"learning_rate": 3.6105790526696445e-05,
"loss": 0.1128,
"step": 4720
},
{
"epoch": 1.8959711365003007,
"grad_norm": 1.2310171127319336,
"learning_rate": 3.588183299808604e-05,
"loss": 0.1165,
"step": 4730
},
{
"epoch": 1.8999799559029866,
"grad_norm": 0.9907431602478027,
"learning_rate": 3.565818283846089e-05,
"loss": 0.1037,
"step": 4740
},
{
"epoch": 1.9039887753056726,
"grad_norm": 0.9235789775848389,
"learning_rate": 3.543484491694615e-05,
"loss": 0.0974,
"step": 4750
},
{
"epoch": 1.9079975947083585,
"grad_norm": 1.1032791137695312,
"learning_rate": 3.521182409586925e-05,
"loss": 0.1223,
"step": 4760
},
{
"epoch": 1.9120064141110444,
"grad_norm": 1.138131856918335,
"learning_rate": 3.4989125230653965e-05,
"loss": 0.1085,
"step": 4770
},
{
"epoch": 1.9160152335137302,
"grad_norm": 1.0244325399398804,
"learning_rate": 3.476675316971466e-05,
"loss": 0.0997,
"step": 4780
},
{
"epoch": 1.920024052916416,
"grad_norm": 1.141847014427185,
"learning_rate": 3.454471275435083e-05,
"loss": 0.1054,
"step": 4790
},
{
"epoch": 1.924032872319102,
"grad_norm": 0.9330345988273621,
"learning_rate": 3.4323008818641696e-05,
"loss": 0.1065,
"step": 4800
},
{
"epoch": 1.928041691721788,
"grad_norm": 0.9627101421356201,
"learning_rate": 3.410164618934082e-05,
"loss": 0.0913,
"step": 4810
},
{
"epoch": 1.9320505111244737,
"grad_norm": 0.9817176461219788,
"learning_rate": 3.388062968577124e-05,
"loss": 0.1243,
"step": 4820
},
{
"epoch": 1.9360593305271596,
"grad_norm": 1.1931806802749634,
"learning_rate": 3.3659964119720356e-05,
"loss": 0.1068,
"step": 4830
},
{
"epoch": 1.9400681499298456,
"grad_norm": 1.1554603576660156,
"learning_rate": 3.3439654295335274e-05,
"loss": 0.1116,
"step": 4840
},
{
"epoch": 1.9440769693325315,
"grad_norm": 1.0284534692764282,
"learning_rate": 3.321970500901819e-05,
"loss": 0.1021,
"step": 4850
},
{
"epoch": 1.9480857887352174,
"grad_norm": 0.9820400476455688,
"learning_rate": 3.3000121049321956e-05,
"loss": 0.093,
"step": 4860
},
{
"epoch": 1.9520946081379034,
"grad_norm": 0.9649590849876404,
"learning_rate": 3.2780907196845845e-05,
"loss": 0.105,
"step": 4870
},
{
"epoch": 1.9561034275405893,
"grad_norm": 1.1404318809509277,
"learning_rate": 3.256206822413145e-05,
"loss": 0.1028,
"step": 4880
},
{
"epoch": 1.9601122469432752,
"grad_norm": 0.8916597366333008,
"learning_rate": 3.234360889555884e-05,
"loss": 0.114,
"step": 4890
},
{
"epoch": 1.9641210663459612,
"grad_norm": 1.0824633836746216,
"learning_rate": 3.2125533967242704e-05,
"loss": 0.1047,
"step": 4900
},
{
"epoch": 1.9681298857486471,
"grad_norm": 1.3187285661697388,
"learning_rate": 3.190784818692897e-05,
"loss": 0.1035,
"step": 4910
},
{
"epoch": 1.972138705151333,
"grad_norm": 1.2455309629440308,
"learning_rate": 3.169055629389132e-05,
"loss": 0.1032,
"step": 4920
},
{
"epoch": 1.976147524554019,
"grad_norm": 0.8298673629760742,
"learning_rate": 3.147366301882805e-05,
"loss": 0.1028,
"step": 4930
},
{
"epoch": 1.9801563439567047,
"grad_norm": 1.0020873546600342,
"learning_rate": 3.1257173083759086e-05,
"loss": 0.1167,
"step": 4940
},
{
"epoch": 1.9841651633593906,
"grad_norm": 1.1114490032196045,
"learning_rate": 3.104109120192317e-05,
"loss": 0.0998,
"step": 4950
},
{
"epoch": 1.9881739827620766,
"grad_norm": 1.0216310024261475,
"learning_rate": 3.082542207767523e-05,
"loss": 0.1189,
"step": 4960
},
{
"epoch": 1.9921828021647625,
"grad_norm": 1.2210197448730469,
"learning_rate": 3.0610170406384045e-05,
"loss": 0.1088,
"step": 4970
},
{
"epoch": 1.9961916215674482,
"grad_norm": 1.0631357431411743,
"learning_rate": 3.0395340874329837e-05,
"loss": 0.1098,
"step": 4980
},
{
"epoch": 2.0,
"grad_norm": 1.2607094049453735,
"learning_rate": 3.0180938158602483e-05,
"loss": 0.1112,
"step": 4990
},
{
"epoch": 2.004008819402686,
"grad_norm": 1.0189579725265503,
"learning_rate": 2.996696692699952e-05,
"loss": 0.0646,
"step": 5000
},
{
"epoch": 2.008017638805372,
"grad_norm": 1.3740028142929077,
"learning_rate": 2.9753431837924545e-05,
"loss": 0.083,
"step": 5010
},
{
"epoch": 2.012026458208058,
"grad_norm": 1.5331498384475708,
"learning_rate": 2.9540337540285868e-05,
"loss": 0.0717,
"step": 5020
},
{
"epoch": 2.0160352776107437,
"grad_norm": 1.4585199356079102,
"learning_rate": 2.9327688673395236e-05,
"loss": 0.071,
"step": 5030
},
{
"epoch": 2.0200440970134297,
"grad_norm": 1.2551562786102295,
"learning_rate": 2.911548986686683e-05,
"loss": 0.0805,
"step": 5040
},
{
"epoch": 2.0240529164161156,
"grad_norm": 0.9197141528129578,
"learning_rate": 2.890374574051654e-05,
"loss": 0.0747,
"step": 5050
},
{
"epoch": 2.0280617358188016,
"grad_norm": 1.359002947807312,
"learning_rate": 2.869246090426131e-05,
"loss": 0.0746,
"step": 5060
},
{
"epoch": 2.0320705552214875,
"grad_norm": 1.1969698667526245,
"learning_rate": 2.8481639958018758e-05,
"loss": 0.0703,
"step": 5070
},
{
"epoch": 2.036079374624173,
"grad_norm": 1.179650902748108,
"learning_rate": 2.827128749160715e-05,
"loss": 0.0744,
"step": 5080
},
{
"epoch": 2.040088194026859,
"grad_norm": 1.3521727323532104,
"learning_rate": 2.8061408084645358e-05,
"loss": 0.0712,
"step": 5090
},
{
"epoch": 2.044097013429545,
"grad_norm": 1.1871998310089111,
"learning_rate": 2.78520063064532e-05,
"loss": 0.0617,
"step": 5100
},
{
"epoch": 2.048105832832231,
"grad_norm": 1.5966202020645142,
"learning_rate": 2.7643086715951964e-05,
"loss": 0.0822,
"step": 5110
},
{
"epoch": 2.0521146522349167,
"grad_norm": 1.5227017402648926,
"learning_rate": 2.7434653861565175e-05,
"loss": 0.0782,
"step": 5120
},
{
"epoch": 2.0561234716376027,
"grad_norm": 1.2805331945419312,
"learning_rate": 2.7226712281119448e-05,
"loss": 0.065,
"step": 5130
},
{
"epoch": 2.0601322910402886,
"grad_norm": 1.0091133117675781,
"learning_rate": 2.701926650174592e-05,
"loss": 0.0771,
"step": 5140
},
{
"epoch": 2.0641411104429745,
"grad_norm": 1.114707589149475,
"learning_rate": 2.6812321039781507e-05,
"loss": 0.0796,
"step": 5150
},
{
"epoch": 2.0681499298456605,
"grad_norm": 1.2408510446548462,
"learning_rate": 2.6605880400670573e-05,
"loss": 0.0624,
"step": 5160
},
{
"epoch": 2.0721587492483464,
"grad_norm": 1.6427772045135498,
"learning_rate": 2.639994907886697e-05,
"loss": 0.0682,
"step": 5170
},
{
"epoch": 2.0761675686510324,
"grad_norm": 1.5963749885559082,
"learning_rate": 2.61945315577361e-05,
"loss": 0.0594,
"step": 5180
},
{
"epoch": 2.0801763880537183,
"grad_norm": 1.3586746454238892,
"learning_rate": 2.5989632309457318e-05,
"loss": 0.0764,
"step": 5190
},
{
"epoch": 2.0841852074564042,
"grad_norm": 1.2391273975372314,
"learning_rate": 2.5785255794926573e-05,
"loss": 0.0554,
"step": 5200
},
{
"epoch": 2.08819402685909,
"grad_norm": 1.079005241394043,
"learning_rate": 2.558140646365929e-05,
"loss": 0.0618,
"step": 5210
},
{
"epoch": 2.092202846261776,
"grad_norm": 1.494195580482483,
"learning_rate": 2.537808875369351e-05,
"loss": 0.0745,
"step": 5220
},
{
"epoch": 2.0962116656644616,
"grad_norm": 1.301558256149292,
"learning_rate": 2.5175307091493255e-05,
"loss": 0.0661,
"step": 5230
},
{
"epoch": 2.1002204850671475,
"grad_norm": 1.9365872144699097,
"learning_rate": 2.497306589185212e-05,
"loss": 0.0726,
"step": 5240
},
{
"epoch": 2.1042293044698335,
"grad_norm": 1.3354028463363647,
"learning_rate": 2.4771369557797264e-05,
"loss": 0.0742,
"step": 5250
},
{
"epoch": 2.1082381238725194,
"grad_norm": 1.2166097164154053,
"learning_rate": 2.4570222480493437e-05,
"loss": 0.0763,
"step": 5260
},
{
"epoch": 2.1122469432752053,
"grad_norm": 1.8834477663040161,
"learning_rate": 2.4369629039147458e-05,
"loss": 0.0657,
"step": 5270
},
{
"epoch": 2.1162557626778913,
"grad_norm": 1.174580693244934,
"learning_rate": 2.416959360091283e-05,
"loss": 0.0725,
"step": 5280
},
{
"epoch": 2.120264582080577,
"grad_norm": 1.636049747467041,
"learning_rate": 2.397012052079469e-05,
"loss": 0.0677,
"step": 5290
},
{
"epoch": 2.124273401483263,
"grad_norm": 1.0291727781295776,
"learning_rate": 2.3771214141554932e-05,
"loss": 0.072,
"step": 5300
},
{
"epoch": 2.128282220885949,
"grad_norm": 1.4774961471557617,
"learning_rate": 2.3572878793617785e-05,
"loss": 0.0626,
"step": 5310
},
{
"epoch": 2.132291040288635,
"grad_norm": 1.3647609949111938,
"learning_rate": 2.3375118794975436e-05,
"loss": 0.0822,
"step": 5320
},
{
"epoch": 2.136299859691321,
"grad_norm": 1.2686548233032227,
"learning_rate": 2.3177938451093994e-05,
"loss": 0.0654,
"step": 5330
},
{
"epoch": 2.140308679094007,
"grad_norm": 1.0904122591018677,
"learning_rate": 2.298134205481986e-05,
"loss": 0.0788,
"step": 5340
},
{
"epoch": 2.144317498496693,
"grad_norm": 1.5554347038269043,
"learning_rate": 2.278533388628621e-05,
"loss": 0.0618,
"step": 5350
},
{
"epoch": 2.148326317899379,
"grad_norm": 1.4490818977355957,
"learning_rate": 2.2589918212819787e-05,
"loss": 0.0714,
"step": 5360
},
{
"epoch": 2.1523351373020647,
"grad_norm": 1.056925892829895,
"learning_rate": 2.2395099288848066e-05,
"loss": 0.0787,
"step": 5370
},
{
"epoch": 2.1563439567047507,
"grad_norm": 1.642364740371704,
"learning_rate": 2.2200881355806565e-05,
"loss": 0.0766,
"step": 5380
},
{
"epoch": 2.1603527761074366,
"grad_norm": 1.0628970861434937,
"learning_rate": 2.2007268642046476e-05,
"loss": 0.0557,
"step": 5390
},
{
"epoch": 2.164361595510122,
"grad_norm": 0.9546886086463928,
"learning_rate": 2.181426536274277e-05,
"loss": 0.0591,
"step": 5400
},
{
"epoch": 2.168370414912808,
"grad_norm": 1.6997793912887573,
"learning_rate": 2.1621875719802258e-05,
"loss": 0.069,
"step": 5410
},
{
"epoch": 2.172379234315494,
"grad_norm": 1.2331558465957642,
"learning_rate": 2.1430103901772135e-05,
"loss": 0.0765,
"step": 5420
},
{
"epoch": 2.17638805371818,
"grad_norm": 1.4007397890090942,
"learning_rate": 2.1238954083748887e-05,
"loss": 0.0759,
"step": 5430
},
{
"epoch": 2.180396873120866,
"grad_norm": 1.1436303853988647,
"learning_rate": 2.1048430427287304e-05,
"loss": 0.0681,
"step": 5440
},
{
"epoch": 2.1844056925235518,
"grad_norm": 0.89713454246521,
"learning_rate": 2.085853708030991e-05,
"loss": 0.0701,
"step": 5450
},
{
"epoch": 2.1884145119262377,
"grad_norm": 1.5042600631713867,
"learning_rate": 2.0669278177016664e-05,
"loss": 0.0654,
"step": 5460
},
{
"epoch": 2.1924233313289236,
"grad_norm": 1.211078405380249,
"learning_rate": 2.0480657837794963e-05,
"loss": 0.069,
"step": 5470
},
{
"epoch": 2.1964321507316096,
"grad_norm": 0.9574674367904663,
"learning_rate": 2.0292680169129828e-05,
"loss": 0.0623,
"step": 5480
},
{
"epoch": 2.2004409701342955,
"grad_norm": 1.1876091957092285,
"learning_rate": 2.0105349263514728e-05,
"loss": 0.0637,
"step": 5490
},
{
"epoch": 2.2044497895369815,
"grad_norm": 1.3990014791488647,
"learning_rate": 1.991866919936226e-05,
"loss": 0.0659,
"step": 5500
},
{
"epoch": 2.2084586089396674,
"grad_norm": 0.9827179312705994,
"learning_rate": 1.9732644040915427e-05,
"loss": 0.0603,
"step": 5510
},
{
"epoch": 2.2124674283423533,
"grad_norm": 1.0140836238861084,
"learning_rate": 1.9547277838159222e-05,
"loss": 0.0574,
"step": 5520
},
{
"epoch": 2.2164762477450393,
"grad_norm": 1.0066441297531128,
"learning_rate": 1.936257462673238e-05,
"loss": 0.0693,
"step": 5530
},
{
"epoch": 2.220485067147725,
"grad_norm": 1.0355478525161743,
"learning_rate": 1.9178538427839537e-05,
"loss": 0.0623,
"step": 5540
},
{
"epoch": 2.2244938865504107,
"grad_norm": 1.0241667032241821,
"learning_rate": 1.8995173248163716e-05,
"loss": 0.0575,
"step": 5550
},
{
"epoch": 2.2285027059530966,
"grad_norm": 1.0945931673049927,
"learning_rate": 1.8812483079779008e-05,
"loss": 0.0617,
"step": 5560
},
{
"epoch": 2.2325115253557826,
"grad_norm": 1.2159384489059448,
"learning_rate": 1.863047190006375e-05,
"loss": 0.0764,
"step": 5570
},
{
"epoch": 2.2365203447584685,
"grad_norm": 1.3158025741577148,
"learning_rate": 1.8449143671613962e-05,
"loss": 0.0663,
"step": 5580
},
{
"epoch": 2.2405291641611544,
"grad_norm": 1.1542671918869019,
"learning_rate": 1.8268502342156918e-05,
"loss": 0.064,
"step": 5590
},
{
"epoch": 2.2445379835638404,
"grad_norm": 1.233852744102478,
"learning_rate": 1.808855184446535e-05,
"loss": 0.0708,
"step": 5600
},
{
"epoch": 2.2485468029665263,
"grad_norm": 1.0461921691894531,
"learning_rate": 1.7909296096271783e-05,
"loss": 0.0611,
"step": 5610
},
{
"epoch": 2.2525556223692123,
"grad_norm": 1.2904634475708008,
"learning_rate": 1.773073900018321e-05,
"loss": 0.0598,
"step": 5620
},
{
"epoch": 2.256564441771898,
"grad_norm": 1.213394284248352,
"learning_rate": 1.7552884443596168e-05,
"loss": 0.0608,
"step": 5630
},
{
"epoch": 2.260573261174584,
"grad_norm": 1.203125,
"learning_rate": 1.73757362986121e-05,
"loss": 0.0638,
"step": 5640
},
{
"epoch": 2.26458208057727,
"grad_norm": 1.0718966722488403,
"learning_rate": 1.7199298421952987e-05,
"loss": 0.0628,
"step": 5650
},
{
"epoch": 2.268590899979956,
"grad_norm": 1.5006955862045288,
"learning_rate": 1.7023574654877482e-05,
"loss": 0.0591,
"step": 5660
},
{
"epoch": 2.272599719382642,
"grad_norm": 1.0694504976272583,
"learning_rate": 1.684856882309729e-05,
"loss": 0.0699,
"step": 5670
},
{
"epoch": 2.276608538785328,
"grad_norm": 1.068630337715149,
"learning_rate": 1.6674284736693713e-05,
"loss": 0.0599,
"step": 5680
},
{
"epoch": 2.280617358188014,
"grad_norm": 0.9531617760658264,
"learning_rate": 1.6500726190034888e-05,
"loss": 0.0595,
"step": 5690
},
{
"epoch": 2.2846261775906997,
"grad_norm": 1.1300429105758667,
"learning_rate": 1.6327896961693086e-05,
"loss": 0.0704,
"step": 5700
},
{
"epoch": 2.2886349969933857,
"grad_norm": 1.248582124710083,
"learning_rate": 1.6155800814362475e-05,
"loss": 0.0591,
"step": 5710
},
{
"epoch": 2.292643816396071,
"grad_norm": 1.2277759313583374,
"learning_rate": 1.598444149477718e-05,
"loss": 0.0644,
"step": 5720
},
{
"epoch": 2.296652635798757,
"grad_norm": 1.4432833194732666,
"learning_rate": 1.5813822733629745e-05,
"loss": 0.0715,
"step": 5730
},
{
"epoch": 2.300661455201443,
"grad_norm": 1.1492823362350464,
"learning_rate": 1.5643948245489836e-05,
"loss": 0.0525,
"step": 5740
},
{
"epoch": 2.304670274604129,
"grad_norm": 1.4520362615585327,
"learning_rate": 1.547482172872351e-05,
"loss": 0.0536,
"step": 5750
},
{
"epoch": 2.308679094006815,
"grad_norm": 1.236132025718689,
"learning_rate": 1.530644686541258e-05,
"loss": 0.0584,
"step": 5760
},
{
"epoch": 2.312687913409501,
"grad_norm": 1.4177806377410889,
"learning_rate": 1.5138827321274435e-05,
"loss": 0.0597,
"step": 5770
},
{
"epoch": 2.316696732812187,
"grad_norm": 1.0297455787658691,
"learning_rate": 1.497196674558235e-05,
"loss": 0.0627,
"step": 5780
},
{
"epoch": 2.3207055522148727,
"grad_norm": 1.1963504552841187,
"learning_rate": 1.4805868771085946e-05,
"loss": 0.0627,
"step": 5790
},
{
"epoch": 2.3247143716175587,
"grad_norm": 1.5588128566741943,
"learning_rate": 1.4640537013932121e-05,
"loss": 0.0609,
"step": 5800
},
{
"epoch": 2.3287231910202446,
"grad_norm": 1.5374112129211426,
"learning_rate": 1.4475975073586345e-05,
"loss": 0.0716,
"step": 5810
},
{
"epoch": 2.3327320104229305,
"grad_norm": 1.6463807821273804,
"learning_rate": 1.431218653275424e-05,
"loss": 0.0737,
"step": 5820
},
{
"epoch": 2.3367408298256165,
"grad_norm": 1.3641928434371948,
"learning_rate": 1.4149174957303629e-05,
"loss": 0.0672,
"step": 5830
},
{
"epoch": 2.3407496492283024,
"grad_norm": 1.259701132774353,
"learning_rate": 1.398694389618696e-05,
"loss": 0.0759,
"step": 5840
},
{
"epoch": 2.3447584686309884,
"grad_norm": 1.2060563564300537,
"learning_rate": 1.3825496881363864e-05,
"loss": 0.0628,
"step": 5850
},
{
"epoch": 2.348767288033674,
"grad_norm": 1.3083685636520386,
"learning_rate": 1.3664837427724431e-05,
"loss": 0.0578,
"step": 5860
},
{
"epoch": 2.35277610743636,
"grad_norm": 1.3214398622512817,
"learning_rate": 1.3504969033012615e-05,
"loss": 0.06,
"step": 5870
},
{
"epoch": 2.3567849268390457,
"grad_norm": 1.1904963254928589,
"learning_rate": 1.3345895177750094e-05,
"loss": 0.0617,
"step": 5880
},
{
"epoch": 2.3607937462417317,
"grad_norm": 1.1517525911331177,
"learning_rate": 1.3187619325160483e-05,
"loss": 0.0528,
"step": 5890
},
{
"epoch": 2.3648025656444176,
"grad_norm": 1.424729824066162,
"learning_rate": 1.3030144921093979e-05,
"loss": 0.0652,
"step": 5900
},
{
"epoch": 2.3688113850471035,
"grad_norm": 1.4582880735397339,
"learning_rate": 1.2873475393952245e-05,
"loss": 0.0641,
"step": 5910
},
{
"epoch": 2.3728202044497895,
"grad_norm": 1.2188777923583984,
"learning_rate": 1.2717614154613877e-05,
"loss": 0.067,
"step": 5920
},
{
"epoch": 2.3768290238524754,
"grad_norm": 1.2932417392730713,
"learning_rate": 1.2562564596360144e-05,
"loss": 0.0535,
"step": 5930
},
{
"epoch": 2.3808378432551613,
"grad_norm": 1.2565412521362305,
"learning_rate": 1.2408330094800974e-05,
"loss": 0.0642,
"step": 5940
},
{
"epoch": 2.3848466626578473,
"grad_norm": 1.1354554891586304,
"learning_rate": 1.225491400780162e-05,
"loss": 0.0518,
"step": 5950
},
{
"epoch": 2.388855482060533,
"grad_norm": 1.0824904441833496,
"learning_rate": 1.2102319675409491e-05,
"loss": 0.0593,
"step": 5960
},
{
"epoch": 2.392864301463219,
"grad_norm": 1.3248436450958252,
"learning_rate": 1.1950550419781414e-05,
"loss": 0.0606,
"step": 5970
},
{
"epoch": 2.396873120865905,
"grad_norm": 1.3530750274658203,
"learning_rate": 1.1799609545111363e-05,
"loss": 0.058,
"step": 5980
},
{
"epoch": 2.400881940268591,
"grad_norm": 1.5529499053955078,
"learning_rate": 1.1649500337558478e-05,
"loss": 0.066,
"step": 5990
},
{
"epoch": 2.404890759671277,
"grad_norm": 0.9849441647529602,
"learning_rate": 1.15002260651755e-05,
"loss": 0.0657,
"step": 6000
},
{
"epoch": 2.408899579073963,
"grad_norm": 1.6223032474517822,
"learning_rate": 1.1351789977837696e-05,
"loss": 0.0687,
"step": 6010
},
{
"epoch": 2.412908398476649,
"grad_norm": 1.4085158109664917,
"learning_rate": 1.1204195307172094e-05,
"loss": 0.0608,
"step": 6020
},
{
"epoch": 2.416917217879335,
"grad_norm": 1.329626441001892,
"learning_rate": 1.1057445266487016e-05,
"loss": 0.0619,
"step": 6030
},
{
"epoch": 2.4209260372820203,
"grad_norm": 1.2898280620574951,
"learning_rate": 1.091154305070226e-05,
"loss": 0.0653,
"step": 6040
},
{
"epoch": 2.424934856684706,
"grad_norm": 1.290812611579895,
"learning_rate": 1.0766491836279486e-05,
"loss": 0.0636,
"step": 6050
},
{
"epoch": 2.428943676087392,
"grad_norm": 1.226360559463501,
"learning_rate": 1.0622294781153036e-05,
"loss": 0.0486,
"step": 6060
},
{
"epoch": 2.432952495490078,
"grad_norm": 1.4300650358200073,
"learning_rate": 1.047895502466122e-05,
"loss": 0.0711,
"step": 6070
},
{
"epoch": 2.436961314892764,
"grad_norm": 1.4043900966644287,
"learning_rate": 1.0336475687477964e-05,
"loss": 0.0625,
"step": 6080
},
{
"epoch": 2.44097013429545,
"grad_norm": 1.0565260648727417,
"learning_rate": 1.0194859871544831e-05,
"loss": 0.0561,
"step": 6090
},
{
"epoch": 2.444978953698136,
"grad_norm": 1.1184768676757812,
"learning_rate": 1.0054110660003551e-05,
"loss": 0.0584,
"step": 6100
},
{
"epoch": 2.448987773100822,
"grad_norm": 1.2461347579956055,
"learning_rate": 9.914231117128841e-06,
"loss": 0.0709,
"step": 6110
},
{
"epoch": 2.4529965925035078,
"grad_norm": 1.2430334091186523,
"learning_rate": 9.77522428826173e-06,
"loss": 0.0606,
"step": 6120
},
{
"epoch": 2.4570054119061937,
"grad_norm": 1.2371201515197754,
"learning_rate": 9.637093199743236e-06,
"loss": 0.0627,
"step": 6130
},
{
"epoch": 2.4610142313088796,
"grad_norm": 1.1069798469543457,
"learning_rate": 9.499840858848497e-06,
"loss": 0.0564,
"step": 6140
},
{
"epoch": 2.4650230507115656,
"grad_norm": 1.2197552919387817,
"learning_rate": 9.363470253721268e-06,
"loss": 0.0611,
"step": 6150
},
{
"epoch": 2.4690318701142515,
"grad_norm": 1.005348563194275,
"learning_rate": 9.227984353308926e-06,
"loss": 0.0513,
"step": 6160
},
{
"epoch": 2.4730406895169375,
"grad_norm": 1.237045407295227,
"learning_rate": 9.09338610729773e-06,
"loss": 0.0598,
"step": 6170
},
{
"epoch": 2.477049508919623,
"grad_norm": 1.4536449909210205,
"learning_rate": 8.959678446048725e-06,
"loss": 0.0587,
"step": 6180
},
{
"epoch": 2.481058328322309,
"grad_norm": 1.0354303121566772,
"learning_rate": 8.826864280533853e-06,
"loss": 0.0589,
"step": 6190
},
{
"epoch": 2.485067147724995,
"grad_norm": 1.427465796470642,
"learning_rate": 8.694946502272628e-06,
"loss": 0.0482,
"step": 6200
},
{
"epoch": 2.4890759671276808,
"grad_norm": 0.8941324353218079,
"learning_rate": 8.563927983269154e-06,
"loss": 0.0635,
"step": 6210
},
{
"epoch": 2.4930847865303667,
"grad_norm": 1.0785568952560425,
"learning_rate": 8.433811575949618e-06,
"loss": 0.0622,
"step": 6220
},
{
"epoch": 2.4970936059330526,
"grad_norm": 1.0514103174209595,
"learning_rate": 8.304600113100181e-06,
"loss": 0.0566,
"step": 6230
},
{
"epoch": 2.5011024253357386,
"grad_norm": 1.5485719442367554,
"learning_rate": 8.1762964078053e-06,
"loss": 0.051,
"step": 6240
},
{
"epoch": 2.5051112447384245,
"grad_norm": 1.6941360235214233,
"learning_rate": 8.048903253386515e-06,
"loss": 0.0497,
"step": 6250
},
{
"epoch": 2.5091200641411104,
"grad_norm": 1.2973785400390625,
"learning_rate": 7.922423423341551e-06,
"loss": 0.0544,
"step": 6260
},
{
"epoch": 2.5131288835437964,
"grad_norm": 1.2612277269363403,
"learning_rate": 7.796859671284045e-06,
"loss": 0.0614,
"step": 6270
},
{
"epoch": 2.5171377029464823,
"grad_norm": 1.012540340423584,
"learning_rate": 7.672214730883565e-06,
"loss": 0.0655,
"step": 6280
},
{
"epoch": 2.5211465223491683,
"grad_norm": 1.5126792192459106,
"learning_rate": 7.548491315806011e-06,
"loss": 0.055,
"step": 6290
},
{
"epoch": 2.525155341751854,
"grad_norm": 1.1852082014083862,
"learning_rate": 7.425692119654648e-06,
"loss": 0.0621,
"step": 6300
},
{
"epoch": 2.52916416115454,
"grad_norm": 1.29707670211792,
"learning_rate": 7.3038198159114005e-06,
"loss": 0.0605,
"step": 6310
},
{
"epoch": 2.533172980557226,
"grad_norm": 1.53734290599823,
"learning_rate": 7.1828770578786616e-06,
"loss": 0.0581,
"step": 6320
},
{
"epoch": 2.537181799959912,
"grad_norm": 1.385482668876648,
"learning_rate": 7.062866478621538e-06,
"loss": 0.0601,
"step": 6330
},
{
"epoch": 2.541190619362598,
"grad_norm": 1.2017461061477661,
"learning_rate": 6.943790690910512e-06,
"loss": 0.0504,
"step": 6340
},
{
"epoch": 2.545199438765284,
"grad_norm": 1.387803077697754,
"learning_rate": 6.825652287164541e-06,
"loss": 0.0574,
"step": 6350
},
{
"epoch": 2.54920825816797,
"grad_norm": 1.3186421394348145,
"learning_rate": 6.708453839394657e-06,
"loss": 0.0585,
"step": 6360
},
{
"epoch": 2.5532170775706553,
"grad_norm": 1.398294448852539,
"learning_rate": 6.592197899147984e-06,
"loss": 0.0694,
"step": 6370
},
{
"epoch": 2.5572258969733412,
"grad_norm": 1.121980905532837,
"learning_rate": 6.476886997452092e-06,
"loss": 0.0513,
"step": 6380
},
{
"epoch": 2.561234716376027,
"grad_norm": 1.4234181642532349,
"learning_rate": 6.362523644760016e-06,
"loss": 0.0546,
"step": 6390
},
{
"epoch": 2.565243535778713,
"grad_norm": 1.428277611732483,
"learning_rate": 6.24911033089548e-06,
"loss": 0.0598,
"step": 6400
},
{
"epoch": 2.569252355181399,
"grad_norm": 1.793627381324768,
"learning_rate": 6.1366495249988275e-06,
"loss": 0.0624,
"step": 6410
},
{
"epoch": 2.573261174584085,
"grad_norm": 1.2697495222091675,
"learning_rate": 6.0251436754731495e-06,
"loss": 0.058,
"step": 6420
},
{
"epoch": 2.577269993986771,
"grad_norm": 1.402320384979248,
"learning_rate": 5.914595209931006e-06,
"loss": 0.0523,
"step": 6430
},
{
"epoch": 2.581278813389457,
"grad_norm": 1.0470867156982422,
"learning_rate": 5.805006535141621e-06,
"loss": 0.0645,
"step": 6440
},
{
"epoch": 2.585287632792143,
"grad_norm": 1.0179369449615479,
"learning_rate": 5.6963800369784385e-06,
"loss": 0.0579,
"step": 6450
},
{
"epoch": 2.5892964521948287,
"grad_norm": 1.298664927482605,
"learning_rate": 5.588718080367195e-06,
"loss": 0.0596,
"step": 6460
},
{
"epoch": 2.5933052715975147,
"grad_norm": 1.2758408784866333,
"learning_rate": 5.4820230092344385e-06,
"loss": 0.0635,
"step": 6470
},
{
"epoch": 2.5973140910002,
"grad_norm": 1.3012737035751343,
"learning_rate": 5.376297146456488e-06,
"loss": 0.0542,
"step": 6480
},
{
"epoch": 2.601322910402886,
"grad_norm": 1.1228282451629639,
"learning_rate": 5.271542793808837e-06,
"loss": 0.0547,
"step": 6490
},
{
"epoch": 2.605331729805572,
"grad_norm": 1.2128888368606567,
"learning_rate": 5.1677622319161125e-06,
"loss": 0.0582,
"step": 6500
},
{
"epoch": 2.609340549208258,
"grad_norm": 1.4961844682693481,
"learning_rate": 5.064957720202374e-06,
"loss": 0.0548,
"step": 6510
},
{
"epoch": 2.613349368610944,
"grad_norm": 1.5457652807235718,
"learning_rate": 4.963131496841878e-06,
"loss": 0.069,
"step": 6520
},
{
"epoch": 2.61735818801363,
"grad_norm": 0.805304229259491,
"learning_rate": 4.862285778710462e-06,
"loss": 0.0454,
"step": 6530
},
{
"epoch": 2.621367007416316,
"grad_norm": 1.2888411283493042,
"learning_rate": 4.762422761337182e-06,
"loss": 0.0531,
"step": 6540
},
{
"epoch": 2.6253758268190017,
"grad_norm": 1.2126528024673462,
"learning_rate": 4.663544618856575e-06,
"loss": 0.0688,
"step": 6550
},
{
"epoch": 2.6293846462216877,
"grad_norm": 1.269785761833191,
"learning_rate": 4.565653503961281e-06,
"loss": 0.0551,
"step": 6560
},
{
"epoch": 2.6333934656243736,
"grad_norm": 1.2016196250915527,
"learning_rate": 4.468751547855215e-06,
"loss": 0.0692,
"step": 6570
},
{
"epoch": 2.6374022850270595,
"grad_norm": 1.2045531272888184,
"learning_rate": 4.372840860207123e-06,
"loss": 0.0468,
"step": 6580
},
{
"epoch": 2.6414111044297455,
"grad_norm": 0.6648709177970886,
"learning_rate": 4.2779235291047105e-06,
"loss": 0.053,
"step": 6590
},
{
"epoch": 2.6454199238324314,
"grad_norm": 1.4050956964492798,
"learning_rate": 4.184001621009137e-06,
"loss": 0.054,
"step": 6600
},
{
"epoch": 2.6494287432351173,
"grad_norm": 1.4628591537475586,
"learning_rate": 4.091077180710029e-06,
"loss": 0.0633,
"step": 6610
},
{
"epoch": 2.6534375626378033,
"grad_norm": 0.8363884091377258,
"learning_rate": 3.9991522312809945e-06,
"loss": 0.0523,
"step": 6620
},
{
"epoch": 2.657446382040489,
"grad_norm": 1.3806829452514648,
"learning_rate": 3.908228774035544e-06,
"loss": 0.057,
"step": 6630
},
{
"epoch": 2.661455201443175,
"grad_norm": 1.4262441396713257,
"learning_rate": 3.818308788483533e-06,
"loss": 0.0458,
"step": 6640
},
{
"epoch": 2.665464020845861,
"grad_norm": 1.5106767416000366,
"learning_rate": 3.72939423228808e-06,
"loss": 0.0636,
"step": 6650
},
{
"epoch": 2.669472840248547,
"grad_norm": 1.71439790725708,
"learning_rate": 3.6414870412229184e-06,
"loss": 0.0601,
"step": 6660
},
{
"epoch": 2.673481659651233,
"grad_norm": 1.167715072631836,
"learning_rate": 3.5545891291302704e-06,
"loss": 0.0486,
"step": 6670
},
{
"epoch": 2.6774904790539185,
"grad_norm": 1.4082682132720947,
"learning_rate": 3.4687023878791857e-06,
"loss": 0.0543,
"step": 6680
},
{
"epoch": 2.6814992984566044,
"grad_norm": 1.1505839824676514,
"learning_rate": 3.3838286873243197e-06,
"loss": 0.0512,
"step": 6690
},
{
"epoch": 2.6855081178592903,
"grad_norm": 1.6079833507537842,
"learning_rate": 3.2999698752652685e-06,
"loss": 0.0567,
"step": 6700
},
{
"epoch": 2.6895169372619763,
"grad_norm": 1.8533159494400024,
"learning_rate": 3.2171277774063204e-06,
"loss": 0.0588,
"step": 6710
},
{
"epoch": 2.693525756664662,
"grad_norm": 1.6835551261901855,
"learning_rate": 3.1353041973166965e-06,
"loss": 0.0619,
"step": 6720
},
{
"epoch": 2.697534576067348,
"grad_norm": 1.7107609510421753,
"learning_rate": 3.054500916391312e-06,
"loss": 0.0581,
"step": 6730
},
{
"epoch": 2.701543395470034,
"grad_norm": 1.131338119506836,
"learning_rate": 2.9747196938119614e-06,
"loss": 0.0501,
"step": 6740
},
{
"epoch": 2.70555221487272,
"grad_norm": 1.9655126333236694,
"learning_rate": 2.8959622665090338e-06,
"loss": 0.049,
"step": 6750
},
{
"epoch": 2.709561034275406,
"grad_norm": 1.6402628421783447,
"learning_rate": 2.818230349123724e-06,
"loss": 0.0671,
"step": 6760
},
{
"epoch": 2.713569853678092,
"grad_norm": 1.2333426475524902,
"learning_rate": 2.741525633970665e-06,
"loss": 0.0526,
"step": 6770
},
{
"epoch": 2.717578673080778,
"grad_norm": 1.035477638244629,
"learning_rate": 2.665849791001074e-06,
"loss": 0.0479,
"step": 6780
},
{
"epoch": 2.7215874924834638,
"grad_norm": 1.5815109014511108,
"learning_rate": 2.591204467766456e-06,
"loss": 0.0659,
"step": 6790
},
{
"epoch": 2.7255963118861493,
"grad_norm": 1.1497759819030762,
"learning_rate": 2.517591289382676e-06,
"loss": 0.0502,
"step": 6800
},
{
"epoch": 2.729605131288835,
"grad_norm": 1.2210969924926758,
"learning_rate": 2.4450118584946002e-06,
"loss": 0.0589,
"step": 6810
},
{
"epoch": 2.733613950691521,
"grad_norm": 1.4234390258789062,
"learning_rate": 2.373467755241221e-06,
"loss": 0.0577,
"step": 6820
},
{
"epoch": 2.737622770094207,
"grad_norm": 1.0173630714416504,
"learning_rate": 2.302960537221227e-06,
"loss": 0.0418,
"step": 6830
},
{
"epoch": 2.741631589496893,
"grad_norm": 1.2278695106506348,
"learning_rate": 2.2334917394590873e-06,
"loss": 0.0429,
"step": 6840
},
{
"epoch": 2.745640408899579,
"grad_norm": 1.5566179752349854,
"learning_rate": 2.1650628743716874e-06,
"loss": 0.0504,
"step": 6850
},
{
"epoch": 2.749649228302265,
"grad_norm": 1.4304202795028687,
"learning_rate": 2.097675431735341e-06,
"loss": 0.0547,
"step": 6860
},
{
"epoch": 2.753658047704951,
"grad_norm": 1.333284854888916,
"learning_rate": 2.0313308786533647e-06,
"loss": 0.0629,
"step": 6870
},
{
"epoch": 2.7576668671076368,
"grad_norm": 1.5218358039855957,
"learning_rate": 1.966030659524182e-06,
"loss": 0.0514,
"step": 6880
},
{
"epoch": 2.7616756865103227,
"grad_norm": 1.650320291519165,
"learning_rate": 1.9017761960098302e-06,
"loss": 0.0574,
"step": 6890
},
{
"epoch": 2.7656845059130086,
"grad_norm": 1.31277334690094,
"learning_rate": 1.838568887005021e-06,
"loss": 0.0574,
"step": 6900
},
{
"epoch": 2.7696933253156946,
"grad_norm": 1.2339560985565186,
"learning_rate": 1.776410108606702e-06,
"loss": 0.0568,
"step": 6910
},
{
"epoch": 2.7737021447183805,
"grad_norm": 1.1014797687530518,
"learning_rate": 1.7153012140840808e-06,
"loss": 0.0633,
"step": 6920
},
{
"epoch": 2.7777109641210664,
"grad_norm": 1.3516331911087036,
"learning_rate": 1.6552435338491544e-06,
"loss": 0.0444,
"step": 6930
},
{
"epoch": 2.7817197835237524,
"grad_norm": 1.180059552192688,
"learning_rate": 1.596238375427772e-06,
"loss": 0.0529,
"step": 6940
},
{
"epoch": 2.7857286029264383,
"grad_norm": 1.1580662727355957,
"learning_rate": 1.538287023431162e-06,
"loss": 0.0567,
"step": 6950
},
{
"epoch": 2.7897374223291243,
"grad_norm": 1.3882676362991333,
"learning_rate": 1.4813907395279214e-06,
"loss": 0.0631,
"step": 6960
},
{
"epoch": 2.79374624173181,
"grad_norm": 0.8586792349815369,
"learning_rate": 1.4255507624166109e-06,
"loss": 0.0487,
"step": 6970
},
{
"epoch": 2.797755061134496,
"grad_norm": 1.3401018381118774,
"learning_rate": 1.3707683077987588e-06,
"loss": 0.059,
"step": 6980
},
{
"epoch": 2.801763880537182,
"grad_norm": 1.160061240196228,
"learning_rate": 1.3170445683523769e-06,
"loss": 0.0511,
"step": 6990
},
{
"epoch": 2.8057726999398676,
"grad_norm": 1.5262839794158936,
"learning_rate": 1.264380713706037e-06,
"loss": 0.0571,
"step": 7000
}
],
"logging_steps": 10,
"max_steps": 7482,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.328918701667516e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}