msb-roshan's picture
Duplicate from MolGen/llama_ZINC_1B-raw_tokenizer_bpe_None_SAFE_500_0_0.1_f025f239
605e343 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
2
],
"tokens": [
"<bos>"
]
},
"<eos>": {
"id": "<eos>",
"ids": [
3
],
"tokens": [
"<eos>"
]
}
}
},
"decoder": {
"type": "BPEDecoder",
"suffix": "</w>"
},
"model": {
"type": "BPE",
"dropout": 0.1,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<unk>": 0,
"<pad>": 1,
"<bos>": 2,
"<eos>": 3,
"#": 4,
"%": 5,
"(": 6,
")": 7,
"+": 8,
"-": 9,
".": 10,
"0": 11,
"1": 12,
"2": 13,
"3": 14,
"4": 15,
"5": 16,
"6": 17,
"7": 18,
"8": 19,
"9": 20,
"=": 21,
"B": 22,
"C": 23,
"F": 24,
"H": 25,
"I": 26,
"N": 27,
"O": 28,
"P": 29,
"S": 30,
"[": 31,
"]": 32,
"c": 33,
"e": 34,
"i": 35,
"l": 36,
"n": 37,
"o": 38,
"r": 39,
"s": 40,
"CC": 41,
".C": 42,
"=O": 43,
"cc": 44,
".N": 45,
"c1": 46,
")C": 47,
"(C": 48,
"1.C": 49,
"=O.C": 50,
"%1": 51,
"CCC": 52,
"(=O": 53,
".c1": 54,
"%10": 55,
"CC1": 56,
".N3": 57,
"(C)": 58,
"C1": 59,
".N4": 60,
"(C)C": 61,
"(=O)C": 62,
".C1": 63,
"(F": 64,
".C4": 65,
".O": 66,
"ccc": 67,
"(F)": 68,
".CC": 69,
"N1": 70,
"(=O)": 71,
".C5": 72,
"6C": 73,
".C3": 74,
"cn": 75,
"CC2": 76,
"1.C1": 77,
"5C": 78,
"7C": 79,
".N1": 80,
".N5": 81,
"n1": 82,
".N35": 83,
".N46": 84,
"cccc": 85,
"CCN": 86,
".N2": 87,
"(O": 88,
"1.C4": 89,
"8C": 90,
"%11": 91,
"7=O": 92,
"[n": 93,
"H]": 94,
"[nH]": 95,
"CCCC": 96,
"=O.C5": 97,
".C6": 98,
"=O.C4": 99,
".c17": 100,
"nc": 101,
"=O.C1": 102,
"4C": 103,
".c18": 104,
"8=O": 105,
"=C": 106,
"cc1": 107,
"CCC1": 108,
"1.C5": 109,
"9C": 110,
".c16": 111,
"(F)F": 112,
"6=O": 113,
"CCO": 114,
"(Cl": 115,
"c2": 116,
"1.C3": 117,
"(Cl)": 118,
".N36": 119,
"N14": 120,
".N45": 121,
"c17": 122,
"(=O)CC": 123,
"CCC2": 124,
"(O)": 125,
"(C)(C)C": 126,
"nn": 127,
"3=O": 128,
"Cc1": 129,
"=O.CO": 130,
"c18": 131,
".C2": 132,
"(O)C": 133,
".C7": 134,
".c19": 135,
"c1C": 136,
".N47": 137,
"N13": 138,
"=O.C6": 139,
".O=C": 140,
".N57": 141,
".C17": 142,
"C2": 143,
"c16": 144,
"N)": 145,
".N6": 146,
"8CC": 147,
"6CC": 148,
".n1": 149,
".N24": 150,
".N56": 151,
"9=O": 152,
".N14": 153,
".CCC": 154,
"(F)C": 155,
"ccc2": 156,
"Br": 157,
"(C)CC": 158,
"5CC": 159,
".N13": 160,
"4=O": 161,
"CN": 162,
".C=": 163,
".C4(=O)C": 164,
"ccccc1": 165,
".N34": 166,
"7CC": 167,
"ncc": 168,
"56": 169,
"c(": 170,
"c19": 171,
".S": 172,
"%12": 173,
".C18": 174,
".C3(=O)C": 175,
".N37": 176,
"1.C4(=O)C": 177,
"N4": 178,
"9CC": 179,
".C16": 180,
"(F)(F)F": 181,
"nc1": 182,
"%10C": 183,
"=O.C3": 184,
".O=C3": 185,
"c(C)": 186,
".O56": 187,
".C6C": 188,
"cs": 189,
".C47": 190,
".c15": 191,
".C58": 192,
".N35.N46": 193,
".C8": 194,
".CC1": 195,
"(N)": 196,
".N25": 197,
"5=O": 198,
"(CC": 199,
"3=O.C4": 200,
".C5C": 201,
"Br)": 202,
".CO": 203,
".C47=O": 204,
".Cc1": 205,
"ccn": 206,
"CC12": 207,
"67": 208,
".c1%10": 209,
"3=O.C1": 210,
"CCN4": 211,
"CCCN": 212,
"cc2": 213,
"(C1": 214,
"1.C17": 215,
"c15": 216,
"C4": 217,
")C1": 218,
".C48": 219,
".O4": 220,
"1.C6": 221,
"N15": 222,
"C3": 223,
".N48": 224,
"=CC": 225,
"4=O.C5": 226,
"C5": 227,
".C69": 228,
"1.C18": 229,
".C37": 230,
".C19": 231,
"=N": 232,
"N3": 233,
".C7C": 234,
"1.C3(=O)C": 235,
".CC5": 236,
"1.C5(=O)C": 237,
"C17": 238,
"cnc": 239,
".N58": 240,
"cnn": 241,
"%10CC": 242,
"(Br)": 243,
"5C2": 244,
".N24.N35": 245,
"[N": 246,
"(=O)C1": 247,
"1.Cc1": 248,
"nc2": 249,
")CC": 250,
")C2": 251,
"=O.CO5": 252,
".C59": 253,
".O45": 254,
".C57": 255,
"C18": 256,
"3=O.C": 257,
".O3": 258,
"6=O.N35": 259,
"C16": 260,
"(=O)N": 261,
"CCCO": 262,
"cc8": 263,
".c17ccc": 264,
".C36": 265,
"cc(C)": 266,
"C12": 267,
"c1Cl": 268,
"cc1C": 269,
"1.C16": 270,
"=7": 271,
"+]": 272,
"(C2": 273,
"-]": 274,
"cc(Cl)": 275,
"4C2": 276,
"=O.C7": 277,
"5CC1": 278,
".C6%10": 279,
"nn1": 280,
".N15": 281,
"c(F)": 282,
".O67": 283,
"cc(F)": 284,
".O5C": 285,
".C9": 286,
".C37=O": 287,
".c18ccc": 288,
"c[nH]": 289,
".N26": 290,
"[N+]": 291,
".CC6": 292,
"c1%10": 293,
"(=O)CCC": 294,
"cn1": 295,
"=8": 296,
"cc7": 297,
"8=O.N46": 298,
"(C#": 299,
"1.C19": 300,
"no": 301,
".N67": 302,
"=O.CO6": 303,
".C68": 304,
"CCOC": 305,
"c(Cl)": 306,
"OC": 307,
"4=O.C1": 308,
".N14CC": 309,
"5C1": 310,
".N13CC": 311,
".C46": 312,
"(F)(F)C": 313,
"n[nH]": 314,
"2CC": 315,
".C8C": 316,
"CC17": 317,
"1.C2": 318,
".C5(=O)C": 319,
"(C)C1": 320,
"cc9": 321,
".c16ccc": 322,
".CC8": 323,
"ccc1": 324,
".CC4": 325,
".C38": 326,
"CC1C": 327,
"CC2(": 328,
"S(=O)": 329,
"=O.C58": 330,
"[O": 331,
"[O-]": 332,
"6C2": 333,
"%10=O": 334,
"78": 335,
".C48=O": 336,
"cccc1": 337,
"C=": 338,
"s1": 339,
"1.C=": 340,
".CC9": 341,
".C15": 342,
"(=O)=O": 343,
"co": 344,
"(C)O": 345,
"ccccc2": 346,
".CC7": 347,
".C36=O": 348,
".C2(=O)C": 349,
"CC16": 350,
"C19": 351,
"#CC": 352,
"(=O)[O-]": 353,
"[N+](=O)[O-]": 354,
".C7%10": 355,
"CC18": 356,
".C37=O.C4": 357,
"=O.C47=O": 358,
"6C1": 359,
"c8": 360,
"ccc(F)": 361,
"1.C6C": 362,
"2C": 363,
"7C1": 364,
".C1%10": 365,
"7=O.N46": 366,
"(C#N)": 367,
".N35C": 368,
".O4C": 369,
"1.C48": 370,
"c7": 371,
".N36.N47": 372,
".N14CCC": 373,
"(O)CC": 374,
"1.C47": 375,
".N68": 376,
"5C6": 377,
"n1C": 378,
".C49": 379,
"ncn": 380,
".CCCC": 381,
".O6C": 382,
"C6": 383,
"C15": 384,
"[nH]1": 385,
"CC1CN": 386,
"cccc2": 387,
"4=O.C": 388,
"nc(C)": 389,
"CCOCC": 390,
"ccc8": 391,
"N=": 392,
".n18": 393,
".C17CC": 394,
"CCN4C": 395,
"7(C)C": 396,
".n17": 397,
"CCC2(": 398,
".C79": 399,
".N46C": 400,
"N14CC": 401,
"ccs": 402,
"N2": 403,
"ccc(Cl)": 404,
"=O.C8": 405,
"N4C": 406,
"1.C7C": 407,
"=O.CO4": 408,
"8C1": 409,
"CC2C": 410,
"(C)CCC": 411,
"N13CC": 412,
".N38": 413,
"3=O.C4(=O)C": 414,
"ccc7": 415,
"%11C": 416,
".N13CCC": 417,
"(N)=O": 418,
"%11CC": 419,
".c14": 420,
"cc1F": 421,
"c1.C1": 422,
"1.C1%10": 423,
".C48=O.C5": 424,
".N16": 425,
"8CC1": 426,
"c14": 427,
"c2ccccc2": 428,
"1.C5C": 429,
"=O.C69": 430,
"9CC1": 431,
"CCN3": 432,
"O1": 433,
"7C2": 434,
".C46=O": 435,
".C9C": 436,
"N5C": 437,
".c17cn": 438,
".C38=O": 439,
"CC2CC": 440,
"(C1)": 441,
"cc1Cl": 442,
"1.CO": 443,
"c9": 444,
"NC": 445,
"#N": 446,
".O=C4": 447,
"C3(=O)C": 448,
"(C)c1": 449,
"CCN5C": 450,
"1.C8C": 451,
"N4C1": 452,
".CO5": 453,
".c18cn": 454,
"7CC1": 455,
".C18CC": 456,
".C16CC": 457,
"o1": 458,
"[N+](=O)[O-])": 459,
"cc(Br)": 460,
".CC(C)C": 461,
".c1%11": 462,
"n2": 463,
"ccnc": 464,
".N46.N57": 465,
".C%10": 466,
"1.C7": 467,
"=6": 468,
"S(C)": 469,
"cc6": 470,
"1.C4(=O)CC": 471,
"c1F": 472,
"C#": 473,
")C(C)C": 474,
".C4(=O)CC": 475,
"ccc9": 476,
"CC19": 477,
"=O.C47": 478,
"1.C48=O.C5": 479,
"=O.CS": 480,
"N5": 481,
"N13CCC": 482,
"=O.C6C": 483,
"5=O.C6": 484,
"C17C": 485,
"c2c1": 486,
".C4C": 487,
"=O.C9": 488,
"(O)C1": 489,
"c(O)": 490,
"CC1(C)C": 491,
".N59": 492,
".N36.N45": 493,
".C3(=O)CC": 494,
"CC2)C": 495,
".N15CC": 496,
"1.C37": 497,
".N23": 498,
".CC%10": 499
},
"merges": [
"C C",
". C",
"= O",
"c c",
". N",
"c 1",
") C",
"( C",
"1 .C",
"=O .C",
"% 1",
"CC C",
"( =O",
". c1",
"%1 0",
"CC 1",
".N 3",
"(C )",
"C 1",
".N 4",
"(C )C",
"(=O )C",
".C 1",
"( F",
".C 4",
". O",
"cc c",
"(F )",
". CC",
"N 1",
"(=O )",
".C 5",
"6 C",
".C 3",
"c n",
"CC 2",
"1.C 1",
"5 C",
"7 C",
".N 1",
".N 5",
"n 1",
".N3 5",
".N4 6",
"cc cc",
"CC N",
".N 2",
"( O",
"1.C 4",
"8 C",
"%1 1",
"7 =O",
"[ n",
"H ]",
"[n H]",
"CC CC",
"=O.C 5",
".C 6",
"=O.C 4",
".c1 7",
"n c",
"=O.C 1",
"4 C",
".c1 8",
"8 =O",
"= C",
"cc 1",
"CCC 1",
"1.C 5",
"9 C",
".c1 6",
"(F) F",
"6 =O",
"CC O",
"(C l",
"c 2",
"1.C 3",
"(Cl )",
".N3 6",
"N1 4",
".N4 5",
"c1 7",
"(=O) CC",
"CCC 2",
"(O )",
"(C) (C)C",
"n n",
"3 =O",
"C c1",
"=O.C O",
"c1 8",
".C 2",
"(O )C",
".C 7",
".c1 9",
"c1 C",
".N4 7",
"N1 3",
"=O.C 6",
".O =C",
".N5 7",
".C1 7",
"C 2",
"c1 6",
"N )",
".N 6",
"8 CC",
"6 CC",
". n1",
".N2 4",
".N5 6",
"9 =O",
".N1 4",
". CCC",
"(F )C",
"ccc 2",
"B r",
"(C) CC",
"5 CC",
".N1 3",
"4 =O",
"C N",
".C =",
".C4 (=O)C",
"cccc c1",
".N3 4",
"7 CC",
"n cc",
"5 6",
"c (",
"c1 9",
". S",
"%1 2",
".C1 8",
".C3 (=O)C",
".N3 7",
"1.C4 (=O)C",
"N 4",
"9 CC",
".C1 6",
"(F) (F)F",
"n c1",
"%10 C",
"=O.C 3",
".O=C 3",
"c (C)",
".O 56",
".C 6C",
"c s",
".C4 7",
".c1 5",
".C5 8",
".N35 .N46",
".C 8",
". CC1",
"( N)",
".N2 5",
"5 =O",
"( CC",
"3 =O.C4",
".C5 C",
"Br )",
".C O",
".C4 7=O",
".C c1",
"cc n",
"CC1 2",
"6 7",
".c1 %10",
"3 =O.C1",
"CCN 4",
"CCC N",
"cc 2",
"(C 1",
"1.C1 7",
"c1 5",
"C 4",
")C 1",
".C4 8",
".O 4",
"1.C 6",
"N1 5",
"C 3",
".N4 8",
"= CC",
"4 =O.C5",
"C 5",
".C6 9",
"1.C1 8",
".C3 7",
".C1 9",
"= N",
"N 3",
".C 7C",
"1.C3 (=O)C",
".CC 5",
"1.C5 (=O)C",
"C1 7",
"cn c",
".N5 8",
"cn n",
"%10 CC",
"( Br)",
"5C 2",
".N24 .N35",
"[ N",
"(=O)C 1",
"1.C c1",
"nc 2",
") CC",
")C 2",
"=O.CO 5",
".C5 9",
".O4 5",
".C5 7",
"C1 8",
"3 =O.C",
".O 3",
"6=O .N35",
"C1 6",
"(=O) N",
"CCC O",
"cc 8",
".c17 ccc",
".C3 6",
"cc (C)",
"C1 2",
"c1C l",
"cc1 C",
"1.C1 6",
"= 7",
"+ ]",
"(C 2",
"- ]",
"cc (Cl)",
"4C 2",
"=O.C 7",
"5 CC1",
".C6 %10",
"n n1",
".N1 5",
"c (F)",
".O 67",
"cc (F)",
".O 5C",
".C 9",
".C3 7=O",
".c18 ccc",
"c [nH]",
".N2 6",
"[N +]",
".CC 6",
"c1 %10",
"(=O) CCC",
"cn 1",
"= 8",
"cc 7",
"8=O .N46",
"(C #",
"1.C1 9",
"n o",
".N6 7",
"=O.CO 6",
".C6 8",
"CCO C",
"c (Cl)",
"O C",
"4 =O.C1",
".N14 CC",
"5 C1",
".N13 CC",
".C4 6",
"(F) (F)C",
"n [nH]",
"2 CC",
".C 8C",
"CC1 7",
"1.C 2",
".C5 (=O)C",
"(C)C 1",
"cc 9",
".c16 ccc",
".CC 8",
"cc c1",
".CC 4",
".C3 8",
"CC1 C",
"CC2 (",
"S (=O)",
"=O.C5 8",
"[ O",
"[O -]",
"6C 2",
"%10 =O",
"7 8",
".C4 8=O",
"cccc 1",
"C =",
"s 1",
"1.C =",
".CC 9",
".C1 5",
"(=O) =O",
"c o",
"(C) O",
"cc ccc2",
".CC 7",
".C3 6=O",
".C2 (=O)C",
"CC1 6",
"C1 9",
"# CC",
"(=O) [O-]",
"[N+] (=O)[O-]",
".C7 %10",
"CC1 8",
".C37 =O.C4",
"=O.C4 7=O",
"6 C1",
"c 8",
"ccc (F)",
"1.C 6C",
"2 C",
"7 C1",
".C1 %10",
"7=O .N46",
"(C# N)",
".N3 5C",
".O 4C",
"1.C4 8",
"c 7",
".N36 .N47",
".N14 CCC",
"(O) CC",
"1.C4 7",
".N6 8",
"5C 6",
"n1 C",
".C4 9",
"n cn",
".CC CC",
".O 6C",
"C 6",
"C1 5",
"[nH] 1",
"CC1 CN",
"cccc 2",
"4 =O.C",
"nc (C)",
"CCO CC",
"ccc 8",
"N =",
".n1 8",
".C17 CC",
"CCN 4C",
"7 (C)C",
".n1 7",
"CCC2 (",
".C7 9",
".N4 6C",
"N14 CC",
"cc s",
"N 2",
"ccc (Cl)",
"=O.C 8",
"N 4C",
"1.C 7C",
"=O.CO 4",
"8 C1",
"CC2 C",
"(C) CCC",
"N13 CC",
".N3 8",
"3=O.C4 (=O)C",
"ccc 7",
"%11 C",
".N13 CCC",
"(N) =O",
"%11 CC",
".c1 4",
"cc1 F",
"c1 .C1",
"1.C1 %10",
".C48 =O.C5",
".N1 6",
"8 CC1",
"c1 4",
"c2 ccccc2",
"1.C 5C",
"=O.C6 9",
"9 CC1",
"CCN 3",
"O 1",
"7C 2",
".C4 6=O",
".C 9C",
"N 5C",
".c17 cn",
".C3 8=O",
"CC2 CC",
"(C1 )",
"cc1C l",
"1.C O",
"c 9",
"N C",
"# N",
".O=C 4",
"C3 (=O)C",
"(C) c1",
"CCN 5C",
"1.C 8C",
"N4 C1",
".CO 5",
".c18 cn",
"7 CC1",
".C1 8CC",
".C1 6CC",
"o 1",
"[N+](=O)[O-] )",
"cc (Br)",
".CC (C)C",
".c1 %11",
"n 2",
"cc nc",
".N46 .N57",
".C %10",
"1.C 7",
"= 6",
"S (C)",
"cc 6",
"1.C4 (=O)CC",
"c1 F",
"C #",
")C (C)C",
".C4 (=O)CC",
"ccc 9",
"CC1 9",
"=O.C4 7",
"1.C48 =O.C5",
"=O.C S",
"N 5",
"N13 CCC",
"=O.C 6C",
"5 =O.C6",
"C1 7C",
"c2 c1",
".C4 C",
"=O.C 9",
"(O)C 1",
"c (O)",
"CC1 (C)C",
".N5 9",
".N36 .N45",
".C3 (=O)CC",
"CC2 )C",
".N1 5CC",
"1.C3 7",
".N2 3",
".CC %10"
]
}
}