aleegis committed
Commit 3389001 · verified · 1 Parent(s): c4a433e

Training in progress, step 3000, checkpoint

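Together, the files below form a complete transformers Trainer checkpoint: the adapter weights (adapter_model.safetensors), the optimizer and LR-scheduler state, the RNG state, and the trainer bookkeeping in trainer_state.json. A minimal resume sketch follows; it is not the author's actual training script, and the model name, LoRA config, and dataset are hypothetical placeholders — only the checkpoint files themselves come from this commit.

```python
# Minimal sketch of resuming from this checkpoint. "base-model" and
# "some-dataset" are hypothetical placeholders, not from this commit.
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained("base-model")        # placeholder
model = get_peft_model(model, LoraConfig(task_type="CAUSAL_LM"))  # placeholder config
train_dataset = load_dataset("some-dataset", split="train")       # placeholder

args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=16,  # matches "train_batch_size": 16 below
    logging_steps=10,                # matches "logging_steps": 10 below
)

trainer = Trainer(model=model, args=args, train_dataset=train_dataset)

# Restores optimizer.pt, scheduler.pt, rng_state.pth, and trainer_state.json
# from the checkpoint directory before continuing from global step 3000.
trainer.train(resume_from_checkpoint="last-checkpoint")
```

Note that this checkpoint records "should_training_stop": true with the learning rate decayed to ~3.4e-11 (see the trainer_state.json hunks below), so step 3000 is the end of the schedule; resuming from it would terminate immediately rather than continue.
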
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ba5ad2fb2cb4b2a5bd7986862a8166911fc3384611e2f392b642528de144efef
+oid sha256:cfe33153bfb4efaa9f968d0812bfb37b362b38c9c025435d7a7bfe3297a0d0cc
 size 101752088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2807bfac567254a682fcee8ad37ca39304c0da0fefc87ffe5dcb5421a22106b
+oid sha256:5be5b53988a320341390e32e0c3213d49731dee11836050bbd36accdfa86a9f2
 size 203719079
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:864067d528288ac71cd598d44e76d3e8aa962e0c46a88d68dee7762bfa3899db
+oid sha256:2e02d071ee6920b7ece49ae76afe405eaacca52e7327794299c581d3989e02b4
 size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fb69807b0cc213740e86e6add3784f51b695f07900aa8a435d08a7fff4f32bd7
+oid sha256:095865ee0dba2422fa75ed17304220ae17502f490054466dcc6d644f9f447b2a
 size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.4276888959290353,
+  "epoch": 0.4752098843655948,
   "eval_steps": 500,
-  "global_step": 2700,
+  "global_step": 3000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1898,6 +1898,216 @@
       "learning_rate": 3.0352986867686007e-06,
       "loss": 1.2348,
       "step": 2700
+    },
+    {
+      "epoch": 0.42927292887692065,
+      "grad_norm": 0.7749059796333313,
+      "learning_rate": 2.8388671026199522e-06,
+      "loss": 1.254,
+      "step": 2710
+    },
+    {
+      "epoch": 0.43085696182480593,
+      "grad_norm": 0.7517712712287903,
+      "learning_rate": 2.6488203809326207e-06,
+      "loss": 1.3453,
+      "step": 2720
+    },
+    {
+      "epoch": 0.43244099477269127,
+      "grad_norm": 0.7981254458427429,
+      "learning_rate": 2.4651842509905487e-06,
+      "loss": 1.3381,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4340250277205766,
+      "grad_norm": 0.6932268738746643,
+      "learning_rate": 2.2879835741861586e-06,
+      "loss": 1.2535,
+      "step": 2740
+    },
+    {
+      "epoch": 0.4356090606684619,
+      "grad_norm": 0.8161441683769226,
+      "learning_rate": 2.1172423406545516e-06,
+      "loss": 1.2834,
+      "step": 2750
+    },
+    {
+      "epoch": 0.43719309361634723,
+      "grad_norm": 0.848377525806427,
+      "learning_rate": 1.9529836660256096e-06,
+      "loss": 1.3685,
+      "step": 2760
+    },
+    {
+      "epoch": 0.4387771265642325,
+      "grad_norm": 0.7809950113296509,
+      "learning_rate": 1.7952297882945003e-06,
+      "loss": 1.3941,
+      "step": 2770
+    },
+    {
+      "epoch": 0.44036115951211785,
+      "grad_norm": 0.7642554044723511,
+      "learning_rate": 1.6440020648110067e-06,
+      "loss": 1.3021,
+      "step": 2780
+    },
+    {
+      "epoch": 0.4419451924600032,
+      "grad_norm": 0.6984448432922363,
+      "learning_rate": 1.4993209693881183e-06,
+      "loss": 1.339,
+      "step": 2790
+    },
+    {
+      "epoch": 0.44352922540788847,
+      "grad_norm": 0.9434962272644043,
+      "learning_rate": 1.3612060895301759e-06,
+      "loss": 1.4138,
+      "step": 2800
+    },
+    {
+      "epoch": 0.4451132583557738,
+      "grad_norm": 0.9491485953330994,
+      "learning_rate": 1.2296761237810207e-06,
+      "loss": 1.3785,
+      "step": 2810
+    },
+    {
+      "epoch": 0.4466972913036591,
+      "grad_norm": 0.8270556330680847,
+      "learning_rate": 1.104748879192552e-06,
+      "loss": 1.2682,
+      "step": 2820
+    },
+    {
+      "epoch": 0.44828132425154443,
+      "grad_norm": 0.7466399669647217,
+      "learning_rate": 9.864412689139123e-07,
+      "loss": 1.2793,
+      "step": 2830
+    },
+    {
+      "epoch": 0.44986535719942977,
+      "grad_norm": 0.796255350112915,
+      "learning_rate": 8.747693099017129e-07,
+      "loss": 1.2731,
+      "step": 2840
+    },
+    {
+      "epoch": 0.45144939014731505,
+      "grad_norm": 0.6511625647544861,
+      "learning_rate": 7.697481207516289e-07,
+      "loss": 1.3496,
+      "step": 2850
+    },
+    {
+      "epoch": 0.4530334230952004,
+      "grad_norm": 0.8428515195846558,
+      "learning_rate": 6.713919196515317e-07,
+      "loss": 1.3259,
+      "step": 2860
+    },
+    {
+      "epoch": 0.4546174560430857,
+      "grad_norm": 0.7594891786575317,
+      "learning_rate": 5.797140224566122e-07,
+      "loss": 1.3121,
+      "step": 2870
+    },
+    {
+      "epoch": 0.456201488990971,
+      "grad_norm": 0.8366693258285522,
+      "learning_rate": 4.947268408866113e-07,
+      "loss": 1.4236,
+      "step": 2880
+    },
+    {
+      "epoch": 0.45778552193885635,
+      "grad_norm": 0.7590285539627075,
+      "learning_rate": 4.1644188084548063e-07,
+      "loss": 1.3011,
+      "step": 2890
+    },
+    {
+      "epoch": 0.45936955488674164,
+      "grad_norm": 0.8987306952476501,
+      "learning_rate": 3.4486974086366253e-07,
+      "loss": 1.2998,
+      "step": 2900
+    },
+    {
+      "epoch": 0.460953587834627,
+      "grad_norm": 0.7959816455841064,
+      "learning_rate": 2.800201106632205e-07,
+      "loss": 1.3055,
+      "step": 2910
+    },
+    {
+      "epoch": 0.46253762078251226,
+      "grad_norm": 0.9299723505973816,
+      "learning_rate": 2.219017698460002e-07,
+      "loss": 1.3327,
+      "step": 2920
+    },
+    {
+      "epoch": 0.4641216537303976,
+      "grad_norm": 0.9437219500541687,
+      "learning_rate": 1.7052258670501308e-07,
+      "loss": 1.3535,
+      "step": 2930
+    },
+    {
+      "epoch": 0.4657056866782829,
+      "grad_norm": 0.858355700969696,
+      "learning_rate": 1.2588951715921116e-07,
+      "loss": 1.456,
+      "step": 2940
+    },
+    {
+      "epoch": 0.4672897196261682,
+      "grad_norm": 0.8487655520439148,
+      "learning_rate": 8.800860381173448e-08,
+      "loss": 1.1843,
+      "step": 2950
+    },
+    {
+      "epoch": 0.46887375257405356,
+      "grad_norm": 0.7240117788314819,
+      "learning_rate": 5.688497513188229e-08,
+      "loss": 1.352,
+      "step": 2960
+    },
+    {
+      "epoch": 0.47045778552193884,
+      "grad_norm": 0.7505178451538086,
+      "learning_rate": 3.2522844760762836e-08,
+      "loss": 1.3472,
+      "step": 2970
+    },
+    {
+      "epoch": 0.4720418184698242,
+      "grad_norm": 0.8503928184509277,
+      "learning_rate": 1.4925510940844156e-08,
+      "loss": 1.3577,
+      "step": 2980
+    },
+    {
+      "epoch": 0.47362585141770946,
+      "grad_norm": 0.8528128862380981,
+      "learning_rate": 4.095356069439005e-09,
+      "loss": 1.4115,
+      "step": 2990
+    },
+    {
+      "epoch": 0.4752098843655948,
+      "grad_norm": 0.9052889943122864,
+      "learning_rate": 3.384637615733155e-11,
+      "loss": 1.3026,
+      "step": 3000
     }
   ],
   "logging_steps": 10,
@@ -1912,12 +2122,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 8.89026842198016e+17,
+  "total_flos": 9.8780760244224e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null