# veda-programming / train.py
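"""Training script for the Veda programming assistant.

Fits VedaTokenizer on a small inline chat corpus, trains VedaProgrammingLLM
on next-token prediction, and saves weights, tokenizer, and model config to
MODEL_DIR."""
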
import os
import json

import numpy as np
import tensorflow as tf

from model import VedaProgrammingLLM
from tokenizer import VedaTokenizer
from config import MODEL_DIR, VOCAB_SIZE, MAX_LENGTH

TRAINING_DATA = """
<USER> Hi
<ASSISTANT> Hello! How can I help you today?
<USER> Who are you?
<ASSISTANT> I am Veda, an AI programming assistant.
<USER> What is Python?
<ASSISTANT> Python is a high-level programming language known for its simplicity and readability.
<USER> Write hello world in python
<ASSISTANT> <CODE>print("Hello, World!")<ENDCODE>
<USER> Write a function to add numbers
<ASSISTANT> <CODE>
def add(a, b):
    return a + b
<ENDCODE>
<USER> Explain recursion
<ASSISTANT> Recursion is when a function calls itself to solve a problem. It needs a base case to stop.
"""
class VedaTrainer:
    def __init__(self):
        self.tokenizer = VedaTokenizer(VOCAB_SIZE)
        self.model = None

    def train(self, epochs=10, extra_data=""):
        data = TRAINING_DATA + "\n" + extra_data
        self.tokenizer.fit([data])
        tokens = self.tokenizer.encode(data)

        # Slice the token stream into overlapping windows of MAX_LENGTH + 1
        # tokens (inputs plus one shifted target), stepping 50 tokens at a time.
        seqs = []
        for i in range(0, len(tokens) - MAX_LENGTH, 50):
            seqs.append(tokens[i:i + MAX_LENGTH + 1])
        if not seqs:
            # Corpus shorter than one window: keep a single sequence, padded
            # to full length (assumes id 0 is safe to use as padding).
            seq = list(tokens[:MAX_LENGTH + 1])
            seqs = [seq + [0] * (MAX_LENGTH + 1 - len(seq))]
        arr = np.array(seqs)
        ds = tf.data.Dataset.from_tensor_slices((arr[:, :-1], arr[:, 1:])).batch(4)
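        # Inputs are arr[:, :-1] and targets are arr[:, 1:], i.e. each position
        # is trained to predict the next token in the window.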
        self.model = VedaProgrammingLLM(self.tokenizer.vocabulary_size)
        self.model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        )
        # Run one dummy batch through the model so its weights are built;
        # token inputs are integer ids, hence the int32 dtype.
        self.model(tf.zeros((1, MAX_LENGTH), dtype=tf.int32))
        self.model.fit(ds, epochs=epochs)
        # Save weights, tokenizer, and config so the model can be reloaded.
        os.makedirs(MODEL_DIR, exist_ok=True)
        self.model.save_weights(os.path.join(MODEL_DIR, "weights.h5"))
        self.tokenizer.save(os.path.join(MODEL_DIR, "tokenizer.json"))
        with open(os.path.join(MODEL_DIR, "config.json"), 'w') as f:
            json.dump(self.model.get_config(), f)
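
# A sketch of reloading for inference; VedaTokenizer.load is an assumption,
# mirroring the tokenizer.save call above:
#   tok = VedaTokenizer.load(os.path.join(MODEL_DIR, "tokenizer.json"))
#   model = VedaProgrammingLLM(tok.vocabulary_size)
#   model(tf.zeros((1, MAX_LENGTH), dtype=tf.int32))  # build before loading
#   model.load_weights(os.path.join(MODEL_DIR, "weights.h5"))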
if __name__ == "__main__":
VedaTrainer().train(epochs=20)
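    # To train on more dialogue, pass extra_data (hypothetical path shown):
    #   VedaTrainer().train(epochs=5, extra_data=open("extra_dialogues.txt").read())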