{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "UiO1w9dn_ZHb" }, "source": [ "# 라이브러리 설치" ] }, { "cell_type": "markdown", "metadata": { "id": "2sepsyCU_b--" }, "source": [ "- 양자화, 실행 최적화, 효율적 파인튜닝, 데이터셋 처리, 시각화 라이브러리 불러오기" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DyzIFdONrhtC", "outputId": "1ef1c80a-92e1-45c8-b89b-5a7c8c1f3d11" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.0/76.0 MB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m411.0/411.0 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m491.2/491.2 kB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m105.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m183.9/183.9 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.5/143.5 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m80.2 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m79.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m45.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m35.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m92.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.8/194.8 kB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", "tensorflow 2.18.0 requires tensorboard<2.19,>=2.18, but you have tensorboard 2.19.0 which is incompatible.\n", "gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.12.0 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0m" ] } ], "source": [ "!pip install -q -U bitsandbytes accelerate peft datasets tensorboard" ] }, { "cell_type": "markdown", "metadata": { "id": "I15Xh4DNr1nw" }, "source": [ "# HuggingFace 로그인 및 데이터 업로드" ] }, { "cell_type": "markdown", "metadata": { "id": "MgVSOYxQ-Oi4" }, "source": [ "- 구글 드라이브를 마운트해서 데이터를 불러오는 방식이 아닌 HuggingFace에 업로드해서 다시 불러오는 방식으로 데이터 사용" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZLbzElKyrn-r" }, "outputs": [], "source": [
"# Authenticate with the Hugging Face Hub so the dataset can be uploaded.\n",
"# The token is taken from the HF_TOKEN environment variable and, if that is not set,\n",
"# from an interactive prompt, so no credential is ever stored in the notebook itself.\n",
"import os\n",
"from getpass import getpass\n",
"\n",
"from huggingface_hub import login\n",
"\n",
"login(token=os.environ.get(\"HF_TOKEN\") or getpass(\"Hugging Face token: \"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Wu8rsKbIr9-N" }, "outputs": [], "source": [
"import os\n",
"import shutil\n",
"\n",
"# Name of the local staging directory (also used as the dataset repository name)\n",
"dataset_name = \"empathy_chat_couple_data\"\n",
"\n",
"# Recreate the staging directory from scratch so stale files are never uploaded\n",
"if os.path.exists(dataset_name):\n",
"    shutil.rmtree(dataset_name)\n",
"os.makedirs(dataset_name)\n",
"\n",
"# Copy the preprocessed JSONL file into the staging directory;\n",
"# this is the file that gets uploaded to the Hugging Face Hub\n",
"shutil.copy(\"/content/converted_format.jsonl\", f\"{dataset_name}/data.jsonl\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yE1BpErpr_Go" }, "outputs": [], "source": [
"from huggingface_hub import create_repo, upload_folder\n",
"\n",
"# Create the dataset repository on the Hugging Face Hub (no error if it already exists)\n",
"create_repo(repo_id=f\"shjun/{dataset_name}\", repo_type=\"dataset\", exist_ok=True)\n",
"\n",
"# Upload the whole staging directory\n",
"upload_folder(\n",
"    repo_id=f\"shjun/{dataset_name}\",\n",
"    folder_path=dataset_name,\n",
"    repo_type=\"dataset\"\n",
")" ] }, { "cell_type": "markdown", "metadata": { "id": "d-YCX6nN9_dq" }, "source": [ "![image.png]()" ] }, { "cell_type": "markdown", 
"metadata": { "id": "5mxWSYaAsLfg" }, "source": [ "# HuggingFace에서 데이터셋 가져오기" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 190, "referenced_widgets": [ "f3f714a45faa410aa8fe97afbfad4bc9", "81452c23ddc74b73b00683e6e0ffd62c", "5a457b732e6f41d6bdd60d3e319e16b3", "3ec60119a667432591ff84d689498d64", "ef3bfefab1894c639c1efa361e02ad1c", "93bc3ada38bb4544a20337de5b480c80", "3997e390204e4f93ad584bc43a763092", "e3d10614ae6840b8978dfe947d52794e", "25859ea43ecd4c36ababe00bd9d9a9ee", "ea835a895f704f68a5f2632a0ab69a5e", "fb9b3e0f77724dd49226198162394de9", "c3a5b2d6e7014e6485397b1cd800b858", "ea1fc5e0e5b445cfbe8f3546014975de", "3efe0e4cfa204ff5bef1bb1460149e2e", "322b8fffe4e5498188bc56ba62ecccf4", "cfd3b1e036dc40c4ad31e15e523236e7", "98ad3eaf942348e59aea1964abd2202c", "5d4799296ede4114a5bad2648def9ca1", "cbbb14ab13854610858fadba2e4ca5c2", "afc57fb7a5264dea966c34249a91bacf", "ef23cddf9b8748c0a7722500c2af0410", "720b980a7f644059ae593129c7e8e0ed" ] }, "id": "8h-5xlBJsAwb", "outputId": "3598fb76-3221-4ed2-9323-6945dcb5225d" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f3f714a45faa410aa8fe97afbfad4bc9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "data.jsonl: 0%| | 0.00/12.0M [00:00 EEVE 계열 모델 구조에 특화\n", "- lora_dropout=0 // 드롭아웃 비활성화 —> 안정적 학습\n", "- 
bias=\"none\" // LoRA에서 bias 학습 안 함\n", "- use_rslora=False //Rank Stabilized LoRA 사용 안 함 (보통 False가 안정적)\n", "\n", "### 토큰 설정\n", "- 생성 결과 에서 발화자 구분이 필요할 때, 생성 멈춤 조건을 걸때, 생성 후 후처리 파싱할때 활용하기 위해 미리 확인차 설정\n", "- 각 항목에 대한 숫자는 해당 모델에 학습하며 이미 설정되어 있음\n", "\n", "### 학습 파라미터 결과\n", "- all params : 해당 모델의 전체 파라미터 수\n", "- trainable params : 이 중에서 LoRA로 학습 가능한 파라미터 수\n", "- trainable% : 전체 파라미터 중에서서 학습되는 비율" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EXfC3BpLtN2f", "outputId": "f4267ffc-bc6d-4465-cbd8-8f0bf80c51a8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 62914560 || all params: 5633347584 || trainable%: 1.1168236836422412\n" ] } ], "source": [
"# Enable gradient checkpointing (a memory optimization) and prepare the model\n",
"# for k-bit training. Enabling it once is enough; the original cell repeated the call.\n",
"model.gradient_checkpointing_enable()\n",
"model = prepare_model_for_kbit_training(model)\n",
"\n",
"\n",
"def print_trainable_parameters(model):\n",
"    \"\"\"Print the trainable and total parameter counts of ``model`` and their ratio.\n",
"\n",
"    Counts are summed from ``param.numel()`` over ``model.named_parameters()``; a\n",
"    parameter counts as trainable when ``param.requires_grad`` is true. A model with\n",
"    no parameters reports 0.0% instead of raising ``ZeroDivisionError``.\n",
"\n",
"    NOTE(review): ``numel()`` is taken at face value. For bitsandbytes 4-bit weights it\n",
"    may report packed storage rather than logical parameters, which would undercount\n",
"    'all params' -- confirm, or compare with PEFT's ``model.print_trainable_parameters()``.\n",
"    \"\"\"\n",
"    trainable_params = 0\n",
"    all_param = 0\n",
"    for _, param in model.named_parameters():\n",
"        count = param.numel()\n",
"        all_param += count\n",
"        if param.requires_grad:\n",
"            trainable_params += count\n",
"    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0\n",
"    print(\n",
"        f\"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}\"\n",
"    )\n",
"\n",
"\n",
"# LoRA configuration: rank-16 adapters on the listed projection modules\n",
"config = LoraConfig(\n",
"    r=16,\n",
"    lora_alpha=32,\n",
"    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
"                    \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
"    lora_dropout=0,\n",
"    bias=\"none\",\n",
"    use_rslora=False,\n",
")\n",
"\n",
"# Attach the LoRA adapters to the base model and report how much of it is trainable\n",
"# (the recorded run of this cell shows about 1.1% of the counted parameters).\n",
"model = get_peft_model(model, config)\n",
"print_trainable_parameters(model)\n",
"\n",
"# Token ids of the speaker markers and of the colon that follows them;\n",
"# maskTrainer uses these ids to locate the turns inside the labels.\n",
"tokenNum_ai = 33626 # \"남자\"\n",
"tokenNum_human = 33269 # \"여자\"\n",
"tokenNum_com = 714 # \":\"" ] }, { "cell_type": "markdown", "metadata": { "id": "1If0pHV6OPFO" }, "source": [ "##" ] }, { "cell_type": "markdown", "metadata": { "id": "1bp452wGYAhZ" }, "source": [ "## 학습 손실 계산을 재정의\n", "\n", "- '남자' 뒤에 나오는 부분만 학습하도록 손실을 마스킹하는 것\n", "- 즉, 모델은 '여자' 뒤의 내용은 예측하지 않도록 -100 마스크를 씌움" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RY0ITNMitO_I" }, "outputs": [], "source": [
"class maskTrainer(Trainer):\n",
"    \"\"\"Trainer that excludes the human ('여자') turns from the language-modeling loss.\n",
"\n",
"    A turn marker is a speaker token (tokenNum_human or tokenNum_ai) immediately\n",
"    followed by the colon token tokenNum_com. For every human marker, the labels\n",
"    starting two positions after it, up to and including the next AI marker and its\n",
"    colon, are set to -100; when no AI marker follows, the labels are masked through\n",
"    the end of the sequence. Labels before the first human marker are left unchanged.\n",
"    \"\"\"\n",
"\n",
"    @staticmethod\n",
"    def _marker_positions(token_ids, speaker_token):\n",
"        \"\"\"Return each index i where token_ids[i] is speaker_token and token_ids[i + 1] is the colon token.\"\"\"\n",
"        # Stopping at len - 1 means a speaker token in the very last position is\n",
"        # never treated as a marker (the original indexed one past the end there).\n",
"        return [\n",
"            i\n",
"            for i in range(len(token_ids) - 1)\n",
"            if token_ids[i] == speaker_token and token_ids[i + 1] == tokenNum_com\n",
"        ]\n",
"\n",
"    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):\n",
"        \"\"\"Mask the human turns in inputs['labels'] in place, then return the model's loss.\n",
"\n",
"        Args:\n",
"            model: called as model(**inputs); its output must expose 'loss'.\n",
"            inputs: batch dict holding a 'labels' tensor indexed as labels[row][position].\n",
"            return_outputs: when true, return (loss, outputs) instead of only the loss.\n",
"            num_items_in_batch: accepted because recent transformers versions pass it to\n",
"                compute_loss; it is not used here. NOTE(review): if\n",
"                gradient_accumulation_steps is raised above 1, confirm the loss scaling.\n",
"\n",
"        Returns:\n",
"            The loss, or the tuple (loss, outputs) when return_outputs is true.\n",
"        \"\"\"\n",
"        labels = inputs['labels']\n",
"        for row in range(len(labels)):\n",
"            # Work on plain Python ints so no numpy/torch mixing is needed.\n",
"            token_ids = labels[row].tolist()\n",
"            human_positions = self._marker_positions(token_ids, tokenNum_human)\n",
"            ai_positions = self._marker_positions(token_ids, tokenNum_ai)\n",
"\n",
"            for human_pos in human_positions:\n",
"                # First AI marker that comes after this human marker, if any.\n",
"                next_ai = next((pos for pos in ai_positions if pos > human_pos), None)\n",
"                mask_end = len(token_ids) if next_ai is None else next_ai + 2\n",
"                # +2 skips the speaker token and its colon.\n",
"                labels[row][human_pos + 2:mask_end] = -100\n",
"\n",
"        outputs = model(**inputs)\n",
"        loss = outputs['loss']\n",
"        return (loss, outputs) if return_outputs else loss\n",
"\n",
"# tokenizer.pad_token = tokenizer.eos_token" ] }, { "cell_type": "markdown", "metadata": { "id": "svg6ImYiZE0V" }, "source": [ "# 모델 학습 및 파라미터 설정" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { 
"base_uri": "https://localhost:8080/", "height": 553 }, "id": "iBWy6eGztQL-", "outputId": "d12504b6-3cf2-4dda-ba52-06c08cfaac09" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [7292/7292 1:49:40, Epoch 2/2]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
5002.231800
10002.214900
15002.218900
20002.181500
25002.195300
30002.144400
35002.145200
40001.834900
45001.688400
50001.673200
55001.657700
60001.630900
65001.614700
70001.612100

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"NUM_EPOCHS = 2\n",
"\n",
"training_args = TrainingArguments(\n",
"    per_device_train_batch_size=1,   # one sample per device per step\n",
"    gradient_accumulation_steps=1,   # update the weights after every step\n",
"    fp16=True,                       # float16 mixed precision to save memory\n",
"    output_dir=\"outputs\",            # where checkpoints are written\n",
"    save_total_limit=2,              # keep at most two checkpoints\n",
"    logging_steps=500,               # log every 500 steps\n",
"    report_to=[\"tensorboard\"],       # send logs to TensorBoard\n",
"    num_train_epochs=NUM_EPOCHS,     # number of passes over the training set\n",
"    learning_rate=2e-4,              # relatively high learning rate (a value suited to LoRA)\n",
"    lr_scheduler_type=\"cosine\",      # learning rate decreases gradually\n",
"    label_names=[\"labels\"],          # name the label key explicitly; the recorded run warned\n",
"                                     # that none could be inferred for PeftModel\n",
")\n",
"\n",
"# NOTE(review): this cell builds the stock Trainer, so the loss masking implemented by\n",
"# maskTrainer (defined earlier in this notebook) is NOT applied in this run. If masked\n",
"# training is intended, use maskTrainer(...) here after confirming that its compute_loss\n",
"# signature is compatible with the installed transformers version.\n",
"trainer = Trainer(\n",
"    model=model,\n",
"    train_dataset=data[\"train\"],\n",
"    args=training_args,\n",
"    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
")\n",
"\n",
"model.config.use_cache = False\n",
"trainer.train()\n",
"\n",
"# Save the trained model to a local directory\n",
"model.save_pretrained(f\"./saved/EVEE/10.8B/{NUM_EPOCHS}epoch\")" ] }, { "cell_type": "markdown", "metadata": { "id": "eRgmQi4ev5f8" }, "source": [ "# HuggingFace에 push하기" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2m0NF7-_wF8y", "outputId": "06ca6b8a-e7e1-4d2e-8d72-76b66599de5b" }, "outputs": [ { "data": { "text/plain": [ "('./saved/EVEE/10.8B/2epoch/tokenizer_config.json',\n", " './saved/EVEE/10.8B/2epoch/special_tokens_map.json',\n", " './saved/EVEE/10.8B/2epoch/tokenizer.json')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_path = f\"./saved/EVEE/10.8B/{NUM_EPOCHS}epoch\"\n", "\n", "model.save_pretrained(output_path)\n", "tokenizer.save_pretrained(output_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 103, "referenced_widgets": [ "88ddd9672305452fa558d592aee0f30c", "d3ab5d075be349cc8af47a3f6b422b11", "5d0655e6412c4c7599eedc23af8c2df5", 
"d338b7988f6e4b979610c1be46101b50", "98f540f7aa4f49c4859612a0db8c7bc6", "783c546fa7584065987402f4197c758e", "8fc2951ab45a46a294d10ad459f93ddd", "bfd06e21c7334d2bafa0261e1ce918dc", "3c455257a8e0404b9cc758baaae76d86", "e17b538116f74b0ab5b1ce65ebf85312", "122cfefc839b43c29da85d76407ccf79" ] }, "id": "sF5h09M3tRS9", "outputId": "02e5d435-0861-47a4-92ff-f9cd6d13e6bd" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "88ddd9672305452fa558d592aee0f30c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "adapter_model.safetensors: 0%| | 0.00/252M [00:00