use_wandb = 1
seed = 3
style = "dit"
d_adapter = 768
normalize_embeddings = 1
depth = 3
embs = ["gte", "gtr", "stella", "sentence-t5", "e5", "sbert", "clip", "stella-big", "snowflake", "ember"]
n_embs_per_batch = 2
max_seq_length = 512
depth_transform = 6
lr = 5e-5
bs = 64
save_every = 400
epochs = 5.0
dataset = "nomic_unsupervised"
max_grad_norm = 10.0
gradient_accumulation_steps = 1
loss_coefficient_vsp = 0
loss_coefficient_contrastive = 1
loss_coefficient_trans = 1
loss_coefficient_cc = 0
eval_steps = 99999999999999999
cluster_size = 1024
cluster_strategy = "cluster_within_domain"
warmup_steps = 100
wandb_project = "edx-2"
wandb_name = "dit-pretrain-md-3"
save_dir = "checkpoints/{}/"
state_dict_dir = "checkpoints/dit-pretrain-md-2/model.pt"
num_params = 1140467800
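For reference, a minimal sketch of consuming this file from Python, assuming it is saved as config.toml (the path and the use of the "{}" placeholder in save_dir are assumptions, not taken from the training code; tomllib is in the standard library as of Python 3.11):

```python
import tomllib  # stdlib TOML parser, Python 3.11+

# Hypothetical path; adjust to wherever this config actually lives.
with open("config.toml", "rb") as f:  # tomllib requires binary mode
    cfg = tomllib.load(f)

# The "{}" in save_dir suggests it is formatted with the run name;
# this is an assumption about how the training code resolves it.
save_dir = cfg["save_dir"].format(cfg["wandb_name"])

print(cfg["style"], cfg["lr"], save_dir)
# dit 5e-05 checkpoints/dit-pretrain-md-3/
```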