apapagi committed
Commit 4f7dce5 · verified · Parent: 64f9a31

Upload 2 files

Files changed (2)
  1. inference.py +21 -0
  2. train.py +85 -0
inference.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+
+ # Path of the fine-tuned model
+ model_path = "./fine_tuned_model"
+
+ # Load tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = AutoModelForCausalLM.from_pretrained(model_path)
+
+ # Create chatbot pipeline
+ chatbot = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     device=0 if torch.cuda.is_available() else -1,  # use GPU if available
+ )
+
+ # Example usage
+ prompt = "Hello, can you tell me some fun facts about European legislation?"
+ response = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
+ print(response[0]["generated_text"])
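
Note: by default the transformers text-generation pipeline returns the prompt together with the completion in generated_text. A minimal sketch of getting the reply alone, using the pipeline's return_full_text parameter:

response = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7, return_full_text=False)
print(response[0]["generated_text"])  # completion only, prompt not echoed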
train.py ADDED
@@ -0,0 +1,85 @@
+ import os
+
+ import torch
+ from datasets import load_dataset
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     DataCollatorForLanguageModeling,
+     Trainer,
+     TrainingArguments,
+ )
+
+
+ def main():
+     # torchrun sets these environment variables for every worker
+     local_rank = int(os.environ["LOCAL_RANK"])
+     rank = int(os.environ["RANK"])
+     world_size = int(os.environ["WORLD_SIZE"])
+
+     torch.distributed.init_process_group("nccl")
+     print(f"Rank {rank} (local rank {local_rank}) of {world_size}")
+
+     # Load the JSONL file
+     dataset = load_dataset("json", data_files="../../data/m500_clean.jsonl", split="train")
+
+     # Load a model
+     model_name = "FacebookAI/roberta-base"
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     # RoBERTa is an encoder model; is_decoder=True is required to use it as a causal LM
+     model = AutoModelForCausalLM.from_pretrained(model_name, is_decoder=True)
+
+     # Set pad token if not set
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # Tokenize the dataset
+     def tokenize_function(examples):
+         return tokenizer(examples["text"], truncation=True, max_length=512)
+
+     tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+     # Split the dataset into training and validation sets
+     split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
+
+     # Data collator: pads inputs to the longest sequence in the batch;
+     # mlm=False selects causal language modeling
+     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+     # Training
+     training_args = TrainingArguments(
+         output_dir="./results",
+         overwrite_output_dir=True,
+         num_train_epochs=3,
+         per_device_train_batch_size=4,
+         per_device_eval_batch_size=4,
+         dataloader_num_workers=8,
+         evaluation_strategy="steps",  # required for eval_steps to take effect
+         eval_steps=500,
+         save_steps=1000,
+         warmup_steps=500,
+         prediction_loss_only=True,
+         logging_dir="./logs",
+         logging_steps=100,
+         learning_rate=5e-5,
+         fp16=True,  # mixed precision; requires a GPU
+     )
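+
+     # Under DDP the effective global batch size is
+     # per_device_train_batch_size * world_size (4 * world_size here),
+     # times gradient_accumulation_steps if that is set.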
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=split_dataset["train"],
+         eval_dataset=split_dataset["test"],
+         data_collator=data_collator,
+     )
+
+     # Start training
+     trainer.train()
+
+     torch.distributed.destroy_process_group()
+
+     # Save the model and tokenizer (only from the main process, so that
+     # several ranks do not write the same files at once)
+     if rank == 0:
+         model.save_pretrained("./fine_tuned_model")
+         tokenizer.save_pretrained("./fine_tuned_model")
+
+
+ if __name__ == "__main__":
+     main()
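
Because the script reads LOCAL_RANK, RANK and WORLD_SIZE from the environment and initializes an NCCL process group, it is meant to be launched with torchrun rather than plain python. A minimal sketch, assuming a single node with two GPUs:

torchrun --nproc_per_node=2 train.py

torchrun sets those environment variables for each worker process, and the Trainer picks up the distributed context from them.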