robertou2 commited on
Commit
102e276
·
verified ·
1 Parent(s): 7d86599

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d2263abc52214918cb0243613829263c5eb866b6277704478283989215eeae3
3
  size 738232680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b50522bbf97853647b3e7dd976a7b77143060a9d6ae9f84a139704035d1b37e
3
  size 738232680
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21ed021ca7796b559e38f4de1c6f075d653c3c73fdf265bca596f2ba21ee61c8
3
  size 1476611275
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db096db6ef6cca049587aa70a5476910a9efc056a7da2ade2fd3a3e127d7b24
3
  size 1476611275
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d416d016b635652b44c8f24b86395735e0658c83adbca0c05503d6f290df3a8
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc40fda2dc7baa0544c9cb1a8fdefc63f3880e97dda10b6d0305d014ae7c30be
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f91f56974adaa6f012d64b7fe0783f94fe00a197b4ccc0cf01788db9b8df0028
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:600c82ad1c2a74114e7bf057a4638d5bdb5e6d378a5ca371abf2b3eea8ba107f
3
  size 1465
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 60,
3
- "best_metric": 0.5306870341300964,
4
- "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-60",
5
- "epoch": 3.1578947368421053,
6
  "eval_steps": 1,
7
- "global_step": 60,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -908,6 +908,831 @@
908
  "eval_samples_per_second": 33.566,
909
  "eval_steps_per_second": 4.475,
910
  "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
  }
912
  ],
913
  "logging_steps": 1,
@@ -927,7 +1752,7 @@
927
  "attributes": {}
928
  }
929
  },
930
- "total_flos": 2557484965533696.0,
931
  "train_batch_size": 1,
932
  "trial_name": null,
933
  "trial_params": null
 
1
  {
2
+ "best_global_step": 115,
3
+ "best_metric": 0.09458151459693909,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-115",
5
+ "epoch": 6.052631578947368,
6
  "eval_steps": 1,
7
+ "global_step": 115,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
908
  "eval_samples_per_second": 33.566,
909
  "eval_steps_per_second": 4.475,
910
  "step": 60
911
+ },
912
+ {
913
+ "epoch": 3.2105263157894735,
914
+ "grad_norm": 2.3808937072753906,
915
+ "learning_rate": 0.0004561103900854401,
916
+ "loss": 0.5372,
917
+ "step": 61
918
+ },
919
+ {
920
+ "epoch": 3.2105263157894735,
921
+ "eval_loss": 0.535223662853241,
922
+ "eval_runtime": 0.8966,
923
+ "eval_samples_per_second": 33.459,
924
+ "eval_steps_per_second": 4.461,
925
+ "step": 61
926
+ },
927
+ {
928
+ "epoch": 3.263157894736842,
929
+ "grad_norm": 1.8272178173065186,
930
+ "learning_rate": 0.0004542005660466094,
931
+ "loss": 0.5399,
932
+ "step": 62
933
+ },
934
+ {
935
+ "epoch": 3.263157894736842,
936
+ "eval_loss": 0.5316082239151001,
937
+ "eval_runtime": 0.8994,
938
+ "eval_samples_per_second": 33.354,
939
+ "eval_steps_per_second": 4.447,
940
+ "step": 62
941
+ },
942
+ {
943
+ "epoch": 3.3157894736842106,
944
+ "grad_norm": 2.0635435581207275,
945
+ "learning_rate": 0.0004522542485937369,
946
+ "loss": 0.5531,
947
+ "step": 63
948
+ },
949
+ {
950
+ "epoch": 3.3157894736842106,
951
+ "eval_loss": 0.5134085416793823,
952
+ "eval_runtime": 0.8937,
953
+ "eval_samples_per_second": 33.567,
954
+ "eval_steps_per_second": 4.476,
955
+ "step": 63
956
+ },
957
+ {
958
+ "epoch": 3.3684210526315788,
959
+ "grad_norm": 2.268183708190918,
960
+ "learning_rate": 0.0004502717855601809,
961
+ "loss": 0.5291,
962
+ "step": 64
963
+ },
964
+ {
965
+ "epoch": 3.3684210526315788,
966
+ "eval_loss": 0.5419598817825317,
967
+ "eval_runtime": 0.8959,
968
+ "eval_samples_per_second": 33.486,
969
+ "eval_steps_per_second": 4.465,
970
+ "step": 64
971
+ },
972
+ {
973
+ "epoch": 3.4210526315789473,
974
+ "grad_norm": 1.8800358772277832,
975
+ "learning_rate": 0.0004482535312390058,
976
+ "loss": 0.5501,
977
+ "step": 65
978
+ },
979
+ {
980
+ "epoch": 3.4210526315789473,
981
+ "eval_loss": 0.5209227800369263,
982
+ "eval_runtime": 0.8927,
983
+ "eval_samples_per_second": 33.606,
984
+ "eval_steps_per_second": 4.481,
985
+ "step": 65
986
+ },
987
+ {
988
+ "epoch": 3.473684210526316,
989
+ "grad_norm": 3.1507558822631836,
990
+ "learning_rate": 0.00044619984631966527,
991
+ "loss": 0.5309,
992
+ "step": 66
993
+ },
994
+ {
995
+ "epoch": 3.473684210526316,
996
+ "eval_loss": 0.536996603012085,
997
+ "eval_runtime": 0.8951,
998
+ "eval_samples_per_second": 33.517,
999
+ "eval_steps_per_second": 4.469,
1000
+ "step": 66
1001
+ },
1002
+ {
1003
+ "epoch": 3.526315789473684,
1004
+ "grad_norm": 3.5700478553771973,
1005
+ "learning_rate": 0.0004441110978235418,
1006
+ "loss": 0.7223,
1007
+ "step": 67
1008
+ },
1009
+ {
1010
+ "epoch": 3.526315789473684,
1011
+ "eval_loss": 0.5140640139579773,
1012
+ "eval_runtime": 0.8962,
1013
+ "eval_samples_per_second": 33.474,
1014
+ "eval_steps_per_second": 4.463,
1015
+ "step": 67
1016
+ },
1017
+ {
1018
+ "epoch": 3.5789473684210527,
1019
+ "grad_norm": 1.758971929550171,
1020
+ "learning_rate": 0.0004419876590383554,
1021
+ "loss": 0.6927,
1022
+ "step": 68
1023
+ },
1024
+ {
1025
+ "epoch": 3.5789473684210527,
1026
+ "eval_loss": 0.47072505950927734,
1027
+ "eval_runtime": 0.9127,
1028
+ "eval_samples_per_second": 32.87,
1029
+ "eval_steps_per_second": 4.383,
1030
+ "step": 68
1031
+ },
1032
+ {
1033
+ "epoch": 3.6315789473684212,
1034
+ "grad_norm": 1.5274709463119507,
1035
+ "learning_rate": 0.00043982990945145146,
1036
+ "loss": 0.4762,
1037
+ "step": 69
1038
+ },
1039
+ {
1040
+ "epoch": 3.6315789473684212,
1041
+ "eval_loss": 0.4518219828605652,
1042
+ "eval_runtime": 0.8967,
1043
+ "eval_samples_per_second": 33.456,
1044
+ "eval_steps_per_second": 4.461,
1045
+ "step": 69
1046
+ },
1047
+ {
1048
+ "epoch": 3.6842105263157894,
1049
+ "grad_norm": 1.7685797214508057,
1050
+ "learning_rate": 0.0004376382346819819,
1051
+ "loss": 0.5629,
1052
+ "step": 70
1053
+ },
1054
+ {
1055
+ "epoch": 3.6842105263157894,
1056
+ "eval_loss": 0.40707579255104065,
1057
+ "eval_runtime": 0.8934,
1058
+ "eval_samples_per_second": 33.581,
1059
+ "eval_steps_per_second": 4.478,
1060
+ "step": 70
1061
+ },
1062
+ {
1063
+ "epoch": 3.736842105263158,
1064
+ "grad_norm": 1.6618574857711792,
1065
+ "learning_rate": 0.00043541302641198946,
1066
+ "loss": 0.5877,
1067
+ "step": 71
1068
+ },
1069
+ {
1070
+ "epoch": 3.736842105263158,
1071
+ "eval_loss": 0.3780651390552521,
1072
+ "eval_runtime": 0.9024,
1073
+ "eval_samples_per_second": 33.246,
1074
+ "eval_steps_per_second": 4.433,
1075
+ "step": 71
1076
+ },
1077
+ {
1078
+ "epoch": 3.7894736842105265,
1079
+ "grad_norm": 1.542702317237854,
1080
+ "learning_rate": 0.00043315468231640834,
1081
+ "loss": 0.5222,
1082
+ "step": 72
1083
+ },
1084
+ {
1085
+ "epoch": 3.7894736842105265,
1086
+ "eval_loss": 0.3732970356941223,
1087
+ "eval_runtime": 0.9166,
1088
+ "eval_samples_per_second": 32.73,
1089
+ "eval_steps_per_second": 4.364,
1090
+ "step": 72
1091
+ },
1092
+ {
1093
+ "epoch": 3.8421052631578947,
1094
+ "grad_norm": 1.8039391040802002,
1095
+ "learning_rate": 0.00043086360599199516,
1096
+ "loss": 0.5238,
1097
+ "step": 73
1098
+ },
1099
+ {
1100
+ "epoch": 3.8421052631578947,
1101
+ "eval_loss": 0.3568810820579529,
1102
+ "eval_runtime": 0.9031,
1103
+ "eval_samples_per_second": 33.218,
1104
+ "eval_steps_per_second": 4.429,
1105
+ "step": 73
1106
+ },
1107
+ {
1108
+ "epoch": 3.8947368421052633,
1109
+ "grad_norm": 1.6215863227844238,
1110
+ "learning_rate": 0.0004285402068852002,
1111
+ "loss": 0.6504,
1112
+ "step": 74
1113
+ },
1114
+ {
1115
+ "epoch": 3.8947368421052633,
1116
+ "eval_loss": 0.3885921835899353,
1117
+ "eval_runtime": 0.896,
1118
+ "eval_samples_per_second": 33.483,
1119
+ "eval_steps_per_second": 4.464,
1120
+ "step": 74
1121
+ },
1122
+ {
1123
+ "epoch": 3.9473684210526314,
1124
+ "grad_norm": 1.5152952671051025,
1125
+ "learning_rate": 0.00042618490021899383,
1126
+ "loss": 0.5694,
1127
+ "step": 75
1128
+ },
1129
+ {
1130
+ "epoch": 3.9473684210526314,
1131
+ "eval_loss": 0.38745489716529846,
1132
+ "eval_runtime": 0.8939,
1133
+ "eval_samples_per_second": 33.562,
1134
+ "eval_steps_per_second": 4.475,
1135
+ "step": 75
1136
+ },
1137
+ {
1138
+ "epoch": 4.0,
1139
+ "grad_norm": 2.6989200115203857,
1140
+ "learning_rate": 0.00042379810691866064,
1141
+ "loss": 0.5849,
1142
+ "step": 76
1143
+ },
1144
+ {
1145
+ "epoch": 4.0,
1146
+ "eval_loss": 0.42535698413848877,
1147
+ "eval_runtime": 0.9073,
1148
+ "eval_samples_per_second": 33.066,
1149
+ "eval_steps_per_second": 4.409,
1150
+ "step": 76
1151
+ },
1152
+ {
1153
+ "epoch": 4.052631578947368,
1154
+ "grad_norm": 1.7381691932678223,
1155
+ "learning_rate": 0.00042138025353657407,
1156
+ "loss": 0.3779,
1157
+ "step": 77
1158
+ },
1159
+ {
1160
+ "epoch": 4.052631578947368,
1161
+ "eval_loss": 0.37115439772605896,
1162
+ "eval_runtime": 0.9112,
1163
+ "eval_samples_per_second": 32.922,
1164
+ "eval_steps_per_second": 4.39,
1165
+ "step": 77
1166
+ },
1167
+ {
1168
+ "epoch": 4.105263157894737,
1169
+ "grad_norm": 2.188385248184204,
1170
+ "learning_rate": 0.00041893177217596633,
1171
+ "loss": 0.44,
1172
+ "step": 78
1173
+ },
1174
+ {
1175
+ "epoch": 4.105263157894737,
1176
+ "eval_loss": 0.2926563322544098,
1177
+ "eval_runtime": 0.8982,
1178
+ "eval_samples_per_second": 33.401,
1179
+ "eval_steps_per_second": 4.453,
1180
+ "step": 78
1181
+ },
1182
+ {
1183
+ "epoch": 4.157894736842105,
1184
+ "grad_norm": 2.3652961254119873,
1185
+ "learning_rate": 0.0004164531004137049,
1186
+ "loss": 0.3639,
1187
+ "step": 79
1188
+ },
1189
+ {
1190
+ "epoch": 4.157894736842105,
1191
+ "eval_loss": 0.2751067876815796,
1192
+ "eval_runtime": 0.9146,
1193
+ "eval_samples_per_second": 32.8,
1194
+ "eval_steps_per_second": 4.373,
1195
+ "step": 79
1196
+ },
1197
+ {
1198
+ "epoch": 4.2105263157894735,
1199
+ "grad_norm": 2.165874719619751,
1200
+ "learning_rate": 0.0004139446812220924,
1201
+ "loss": 0.2683,
1202
+ "step": 80
1203
+ },
1204
+ {
1205
+ "epoch": 4.2105263157894735,
1206
+ "eval_loss": 0.2685202360153198,
1207
+ "eval_runtime": 0.9124,
1208
+ "eval_samples_per_second": 32.881,
1209
+ "eval_steps_per_second": 4.384,
1210
+ "step": 80
1211
+ },
1212
+ {
1213
+ "epoch": 4.2631578947368425,
1214
+ "grad_norm": 1.7391912937164307,
1215
+ "learning_rate": 0.0004114069628897006,
1216
+ "loss": 0.2993,
1217
+ "step": 81
1218
+ },
1219
+ {
1220
+ "epoch": 4.2631578947368425,
1221
+ "eval_loss": 0.33646491169929504,
1222
+ "eval_runtime": 0.8952,
1223
+ "eval_samples_per_second": 33.51,
1224
+ "eval_steps_per_second": 4.468,
1225
+ "step": 81
1226
+ },
1227
+ {
1228
+ "epoch": 4.315789473684211,
1229
+ "grad_norm": 3.65714693069458,
1230
+ "learning_rate": 0.0004088403989412559,
1231
+ "loss": 0.4252,
1232
+ "step": 82
1233
+ },
1234
+ {
1235
+ "epoch": 4.315789473684211,
1236
+ "eval_loss": 0.2839888632297516,
1237
+ "eval_runtime": 0.9057,
1238
+ "eval_samples_per_second": 33.123,
1239
+ "eval_steps_per_second": 4.416,
1240
+ "step": 82
1241
+ },
1242
+ {
1243
+ "epoch": 4.368421052631579,
1244
+ "grad_norm": 2.1762771606445312,
1245
+ "learning_rate": 0.00040624544805658794,
1246
+ "loss": 0.3304,
1247
+ "step": 83
1248
+ },
1249
+ {
1250
+ "epoch": 4.368421052631579,
1251
+ "eval_loss": 0.27002134919166565,
1252
+ "eval_runtime": 0.8939,
1253
+ "eval_samples_per_second": 33.562,
1254
+ "eval_steps_per_second": 4.475,
1255
+ "step": 83
1256
+ },
1257
+ {
1258
+ "epoch": 4.421052631578947,
1259
+ "grad_norm": 2.1018354892730713,
1260
+ "learning_rate": 0.00040362257398865713,
1261
+ "loss": 0.4506,
1262
+ "step": 84
1263
+ },
1264
+ {
1265
+ "epoch": 4.421052631578947,
1266
+ "eval_loss": 0.2557659149169922,
1267
+ "eval_runtime": 0.8969,
1268
+ "eval_samples_per_second": 33.45,
1269
+ "eval_steps_per_second": 4.46,
1270
+ "step": 84
1271
+ },
1272
+ {
1273
+ "epoch": 4.473684210526316,
1274
+ "grad_norm": 1.7509180307388306,
1275
+ "learning_rate": 0.00040097224548067613,
1276
+ "loss": 0.3731,
1277
+ "step": 85
1278
+ },
1279
+ {
1280
+ "epoch": 4.473684210526316,
1281
+ "eval_loss": 0.26859304308891296,
1282
+ "eval_runtime": 0.9009,
1283
+ "eval_samples_per_second": 33.299,
1284
+ "eval_steps_per_second": 4.44,
1285
+ "step": 85
1286
+ },
1287
+ {
1288
+ "epoch": 4.526315789473684,
1289
+ "grad_norm": 1.971816897392273,
1290
+ "learning_rate": 0.0003982949361823388,
1291
+ "loss": 0.38,
1292
+ "step": 86
1293
+ },
1294
+ {
1295
+ "epoch": 4.526315789473684,
1296
+ "eval_loss": 0.2624681293964386,
1297
+ "eval_runtime": 0.8949,
1298
+ "eval_samples_per_second": 33.524,
1299
+ "eval_steps_per_second": 4.47,
1300
+ "step": 86
1301
+ },
1302
+ {
1303
+ "epoch": 4.578947368421053,
1304
+ "grad_norm": 1.4714068174362183,
1305
+ "learning_rate": 0.0003955911245651726,
1306
+ "loss": 0.3944,
1307
+ "step": 87
1308
+ },
1309
+ {
1310
+ "epoch": 4.578947368421053,
1311
+ "eval_loss": 0.23652420938014984,
1312
+ "eval_runtime": 0.8952,
1313
+ "eval_samples_per_second": 33.511,
1314
+ "eval_steps_per_second": 4.468,
1315
+ "step": 87
1316
+ },
1317
+ {
1318
+ "epoch": 4.631578947368421,
1319
+ "grad_norm": 2.6970834732055664,
1320
+ "learning_rate": 0.0003928612938370292,
1321
+ "loss": 0.3374,
1322
+ "step": 88
1323
+ },
1324
+ {
1325
+ "epoch": 4.631578947368421,
1326
+ "eval_loss": 0.2716277241706848,
1327
+ "eval_runtime": 0.8932,
1328
+ "eval_samples_per_second": 33.588,
1329
+ "eval_steps_per_second": 4.478,
1330
+ "step": 88
1331
+ },
1332
+ {
1333
+ "epoch": 4.684210526315789,
1334
+ "grad_norm": 1.9066615104675293,
1335
+ "learning_rate": 0.00039010593185572867,
1336
+ "loss": 0.2442,
1337
+ "step": 89
1338
+ },
1339
+ {
1340
+ "epoch": 4.684210526315789,
1341
+ "eval_loss": 0.2999991476535797,
1342
+ "eval_runtime": 0.8939,
1343
+ "eval_samples_per_second": 33.559,
1344
+ "eval_steps_per_second": 4.475,
1345
+ "step": 89
1346
+ },
1347
+ {
1348
+ "epoch": 4.7368421052631575,
1349
+ "grad_norm": 2.6232354640960693,
1350
+ "learning_rate": 0.00038732553104187296,
1351
+ "loss": 0.2857,
1352
+ "step": 90
1353
+ },
1354
+ {
1355
+ "epoch": 4.7368421052631575,
1356
+ "eval_loss": 0.2302989959716797,
1357
+ "eval_runtime": 0.8938,
1358
+ "eval_samples_per_second": 33.564,
1359
+ "eval_steps_per_second": 4.475,
1360
+ "step": 90
1361
+ },
1362
+ {
1363
+ "epoch": 4.7894736842105265,
1364
+ "grad_norm": 2.0710129737854004,
1365
+ "learning_rate": 0.0003845205882908432,
1366
+ "loss": 0.4195,
1367
+ "step": 91
1368
+ },
1369
+ {
1370
+ "epoch": 4.7894736842105265,
1371
+ "eval_loss": 0.21816590428352356,
1372
+ "eval_runtime": 0.9251,
1373
+ "eval_samples_per_second": 32.429,
1374
+ "eval_steps_per_second": 4.324,
1375
+ "step": 91
1376
+ },
1377
+ {
1378
+ "epoch": 4.842105263157895,
1379
+ "grad_norm": 1.8006062507629395,
1380
+ "learning_rate": 0.0003816916048839979,
1381
+ "loss": 0.2859,
1382
+ "step": 92
1383
+ },
1384
+ {
1385
+ "epoch": 4.842105263157895,
1386
+ "eval_loss": 0.21071405708789825,
1387
+ "eval_runtime": 0.8965,
1388
+ "eval_samples_per_second": 33.462,
1389
+ "eval_steps_per_second": 4.462,
1390
+ "step": 92
1391
+ },
1392
+ {
1393
+ "epoch": 4.894736842105263,
1394
+ "grad_norm": 1.6352888345718384,
1395
+ "learning_rate": 0.0003788390863990875,
1396
+ "loss": 0.4275,
1397
+ "step": 93
1398
+ },
1399
+ {
1400
+ "epoch": 4.894736842105263,
1401
+ "eval_loss": 0.20206846296787262,
1402
+ "eval_runtime": 0.9052,
1403
+ "eval_samples_per_second": 33.144,
1404
+ "eval_steps_per_second": 4.419,
1405
+ "step": 93
1406
+ },
1407
+ {
1408
+ "epoch": 4.947368421052632,
1409
+ "grad_norm": 1.6399378776550293,
1410
+ "learning_rate": 0.00037596354261990007,
1411
+ "loss": 0.389,
1412
+ "step": 94
1413
+ },
1414
+ {
1415
+ "epoch": 4.947368421052632,
1416
+ "eval_loss": 0.19467315077781677,
1417
+ "eval_runtime": 0.8973,
1418
+ "eval_samples_per_second": 33.435,
1419
+ "eval_steps_per_second": 4.458,
1420
+ "step": 94
1421
+ },
1422
+ {
1423
+ "epoch": 5.0,
1424
+ "grad_norm": 1.5680173635482788,
1425
+ "learning_rate": 0.0003730654874451569,
1426
+ "loss": 0.395,
1427
+ "step": 95
1428
+ },
1429
+ {
1430
+ "epoch": 5.0,
1431
+ "eval_loss": 0.19546455144882202,
1432
+ "eval_runtime": 0.91,
1433
+ "eval_samples_per_second": 32.968,
1434
+ "eval_steps_per_second": 4.396,
1435
+ "step": 95
1436
+ },
1437
+ {
1438
+ "epoch": 5.052631578947368,
1439
+ "grad_norm": 1.0308386087417603,
1440
+ "learning_rate": 0.00037014543879667093,
1441
+ "loss": 0.1384,
1442
+ "step": 96
1443
+ },
1444
+ {
1445
+ "epoch": 5.052631578947368,
1446
+ "eval_loss": 0.18969732522964478,
1447
+ "eval_runtime": 0.9021,
1448
+ "eval_samples_per_second": 33.258,
1449
+ "eval_steps_per_second": 4.434,
1450
+ "step": 96
1451
+ },
1452
+ {
1453
+ "epoch": 5.105263157894737,
1454
+ "grad_norm": 1.4042502641677856,
1455
+ "learning_rate": 0.0003672039185267878,
1456
+ "loss": 0.2291,
1457
+ "step": 97
1458
+ },
1459
+ {
1460
+ "epoch": 5.105263157894737,
1461
+ "eval_loss": 0.16800740361213684,
1462
+ "eval_runtime": 0.8938,
1463
+ "eval_samples_per_second": 33.563,
1464
+ "eval_steps_per_second": 4.475,
1465
+ "step": 97
1466
+ },
1467
+ {
1468
+ "epoch": 5.157894736842105,
1469
+ "grad_norm": 1.6313552856445312,
1470
+ "learning_rate": 0.00036424145232512333,
1471
+ "loss": 0.1736,
1472
+ "step": 98
1473
+ },
1474
+ {
1475
+ "epoch": 5.157894736842105,
1476
+ "eval_loss": 0.16714099049568176,
1477
+ "eval_runtime": 0.9009,
1478
+ "eval_samples_per_second": 33.301,
1479
+ "eval_steps_per_second": 4.44,
1480
+ "step": 98
1481
+ },
1482
+ {
1483
+ "epoch": 5.2105263157894735,
1484
+ "grad_norm": 1.8922698497772217,
1485
+ "learning_rate": 0.0003612585696246158,
1486
+ "loss": 0.1677,
1487
+ "step": 99
1488
+ },
1489
+ {
1490
+ "epoch": 5.2105263157894735,
1491
+ "eval_loss": 0.179762065410614,
1492
+ "eval_runtime": 0.9039,
1493
+ "eval_samples_per_second": 33.188,
1494
+ "eval_steps_per_second": 4.425,
1495
+ "step": 99
1496
+ },
1497
+ {
1498
+ "epoch": 5.2631578947368425,
1499
+ "grad_norm": 2.409526824951172,
1500
+ "learning_rate": 0.0003582558035069091,
1501
+ "loss": 0.2379,
1502
+ "step": 100
1503
+ },
1504
+ {
1505
+ "epoch": 5.2631578947368425,
1506
+ "eval_loss": 0.1902371197938919,
1507
+ "eval_runtime": 0.9097,
1508
+ "eval_samples_per_second": 32.98,
1509
+ "eval_steps_per_second": 4.397,
1510
+ "step": 100
1511
+ },
1512
+ {
1513
+ "epoch": 5.315789473684211,
1514
+ "grad_norm": 2.084869146347046,
1515
+ "learning_rate": 0.0003552336906070838,
1516
+ "loss": 0.2165,
1517
+ "step": 101
1518
+ },
1519
+ {
1520
+ "epoch": 5.315789473684211,
1521
+ "eval_loss": 0.17252177000045776,
1522
+ "eval_runtime": 0.8948,
1523
+ "eval_samples_per_second": 33.528,
1524
+ "eval_steps_per_second": 4.47,
1525
+ "step": 101
1526
+ },
1527
+ {
1528
+ "epoch": 5.368421052631579,
1529
+ "grad_norm": 1.655718207359314,
1530
+ "learning_rate": 0.000352192771017753,
1531
+ "loss": 0.223,
1532
+ "step": 102
1533
+ },
1534
+ {
1535
+ "epoch": 5.368421052631579,
1536
+ "eval_loss": 0.18867380917072296,
1537
+ "eval_runtime": 0.8956,
1538
+ "eval_samples_per_second": 33.495,
1539
+ "eval_steps_per_second": 4.466,
1540
+ "step": 102
1541
+ },
1542
+ {
1543
+ "epoch": 5.421052631578947,
1544
+ "grad_norm": 2.672633409500122,
1545
+ "learning_rate": 0.0003491335881925407,
1546
+ "loss": 0.161,
1547
+ "step": 103
1548
+ },
1549
+ {
1550
+ "epoch": 5.421052631578947,
1551
+ "eval_loss": 0.1944020837545395,
1552
+ "eval_runtime": 0.8924,
1553
+ "eval_samples_per_second": 33.616,
1554
+ "eval_steps_per_second": 4.482,
1555
+ "step": 103
1556
+ },
1557
+ {
1558
+ "epoch": 5.473684210526316,
1559
+ "grad_norm": 1.9712008237838745,
1560
+ "learning_rate": 0.0003460566888489593,
1561
+ "loss": 0.2525,
1562
+ "step": 104
1563
+ },
1564
+ {
1565
+ "epoch": 5.473684210526316,
1566
+ "eval_loss": 0.17671068012714386,
1567
+ "eval_runtime": 0.897,
1568
+ "eval_samples_per_second": 33.446,
1569
+ "eval_steps_per_second": 4.459,
1570
+ "step": 104
1571
+ },
1572
+ {
1573
+ "epoch": 5.526315789473684,
1574
+ "grad_norm": 2.2153072357177734,
1575
+ "learning_rate": 0.00034296262287070335,
1576
+ "loss": 0.2105,
1577
+ "step": 105
1578
+ },
1579
+ {
1580
+ "epoch": 5.526315789473684,
1581
+ "eval_loss": 0.1715732216835022,
1582
+ "eval_runtime": 0.8951,
1583
+ "eval_samples_per_second": 33.514,
1584
+ "eval_steps_per_second": 4.469,
1585
+ "step": 105
1586
+ },
1587
+ {
1588
+ "epoch": 5.578947368421053,
1589
+ "grad_norm": 1.8106168508529663,
1590
+ "learning_rate": 0.0003398519432093782,
1591
+ "loss": 0.259,
1592
+ "step": 106
1593
+ },
1594
+ {
1595
+ "epoch": 5.578947368421053,
1596
+ "eval_loss": 0.1465868353843689,
1597
+ "eval_runtime": 0.9077,
1598
+ "eval_samples_per_second": 33.051,
1599
+ "eval_steps_per_second": 4.407,
1600
+ "step": 106
1601
+ },
1602
+ {
1603
+ "epoch": 5.631578947368421,
1604
+ "grad_norm": 2.1159439086914062,
1605
+ "learning_rate": 0.0003367252057856802,
1606
+ "loss": 0.2065,
1607
+ "step": 107
1608
+ },
1609
+ {
1610
+ "epoch": 5.631578947368421,
1611
+ "eval_loss": 0.14219093322753906,
1612
+ "eval_runtime": 0.9049,
1613
+ "eval_samples_per_second": 33.154,
1614
+ "eval_steps_per_second": 4.42,
1615
+ "step": 107
1616
+ },
1617
+ {
1618
+ "epoch": 5.684210526315789,
1619
+ "grad_norm": 1.4467761516571045,
1620
+ "learning_rate": 0.00033358296939004547,
1621
+ "loss": 0.2083,
1622
+ "step": 108
1623
+ },
1624
+ {
1625
+ "epoch": 5.684210526315789,
1626
+ "eval_loss": 0.1406753957271576,
1627
+ "eval_runtime": 0.8954,
1628
+ "eval_samples_per_second": 33.505,
1629
+ "eval_steps_per_second": 4.467,
1630
+ "step": 108
1631
+ },
1632
+ {
1633
+ "epoch": 5.7368421052631575,
1634
+ "grad_norm": 1.3671239614486694,
1635
+ "learning_rate": 0.00033042579558278717,
1636
+ "loss": 0.1825,
1637
+ "step": 109
1638
+ },
1639
+ {
1640
+ "epoch": 5.7368421052631575,
1641
+ "eval_loss": 0.13007155060768127,
1642
+ "eval_runtime": 0.8998,
1643
+ "eval_samples_per_second": 33.342,
1644
+ "eval_steps_per_second": 4.446,
1645
+ "step": 109
1646
+ },
1647
+ {
1648
+ "epoch": 5.7894736842105265,
1649
+ "grad_norm": 1.479944109916687,
1650
+ "learning_rate": 0.00032725424859373687,
1651
+ "loss": 0.2244,
1652
+ "step": 110
1653
+ },
1654
+ {
1655
+ "epoch": 5.7894736842105265,
1656
+ "eval_loss": 0.12692232429981232,
1657
+ "eval_runtime": 0.901,
1658
+ "eval_samples_per_second": 33.298,
1659
+ "eval_steps_per_second": 4.44,
1660
+ "step": 110
1661
+ },
1662
+ {
1663
+ "epoch": 5.842105263157895,
1664
+ "grad_norm": 1.5173969268798828,
1665
+ "learning_rate": 0.0003240688952214085,
1666
+ "loss": 0.2273,
1667
+ "step": 111
1668
+ },
1669
+ {
1670
+ "epoch": 5.842105263157895,
1671
+ "eval_loss": 0.12454597651958466,
1672
+ "eval_runtime": 0.8987,
1673
+ "eval_samples_per_second": 33.382,
1674
+ "eval_steps_per_second": 4.451,
1675
+ "step": 111
1676
+ },
1677
+ {
1678
+ "epoch": 5.894736842105263,
1679
+ "grad_norm": 2.7870988845825195,
1680
+ "learning_rate": 0.00032087030473170445,
1681
+ "loss": 0.2101,
1682
+ "step": 112
1683
+ },
1684
+ {
1685
+ "epoch": 5.894736842105263,
1686
+ "eval_loss": 0.12002909928560257,
1687
+ "eval_runtime": 0.893,
1688
+ "eval_samples_per_second": 33.593,
1689
+ "eval_steps_per_second": 4.479,
1690
+ "step": 112
1691
+ },
1692
+ {
1693
+ "epoch": 5.947368421052632,
1694
+ "grad_norm": 1.3659342527389526,
1695
+ "learning_rate": 0.00031765904875617973,
1696
+ "loss": 0.1882,
1697
+ "step": 113
1698
+ },
1699
+ {
1700
+ "epoch": 5.947368421052632,
1701
+ "eval_loss": 0.10573837906122208,
1702
+ "eval_runtime": 0.8956,
1703
+ "eval_samples_per_second": 33.496,
1704
+ "eval_steps_per_second": 4.466,
1705
+ "step": 113
1706
+ },
1707
+ {
1708
+ "epoch": 6.0,
1709
+ "grad_norm": 1.8464044332504272,
1710
+ "learning_rate": 0.00031443570118988356,
1711
+ "loss": 0.2285,
1712
+ "step": 114
1713
+ },
1714
+ {
1715
+ "epoch": 6.0,
1716
+ "eval_loss": 0.10221625119447708,
1717
+ "eval_runtime": 0.8955,
1718
+ "eval_samples_per_second": 33.501,
1719
+ "eval_steps_per_second": 4.467,
1720
+ "step": 114
1721
+ },
1722
+ {
1723
+ "epoch": 6.052631578947368,
1724
+ "grad_norm": 1.3894392251968384,
1725
+ "learning_rate": 0.00031120083808879663,
1726
+ "loss": 0.1115,
1727
+ "step": 115
1728
+ },
1729
+ {
1730
+ "epoch": 6.052631578947368,
1731
+ "eval_loss": 0.09458151459693909,
1732
+ "eval_runtime": 0.8981,
1733
+ "eval_samples_per_second": 33.405,
1734
+ "eval_steps_per_second": 4.454,
1735
+ "step": 115
1736
  }
1737
  ],
1738
  "logging_steps": 1,
 
1752
  "attributes": {}
1753
  }
1754
  },
1755
+ "total_flos": 4901149662148608.0,
1756
  "train_batch_size": 1,
1757
  "trial_name": null,
1758
  "trial_params": null