{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting pandas\n", " Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)\n", "Collecting numpy>=1.23.2 (from pandas)\n", " Downloading numpy-2.2.4-cp311-cp311-win_amd64.whl.metadata (60 kB)\n", " ---------------------------------------- 0.0/60.8 kB ? eta -:--:--\n", " -------------------------- ------------- 41.0/60.8 kB ? eta -:--:--\n", " ---------------------------------------- 60.8/60.8 kB 1.1 MB/s eta 0:00:00\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\user\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.11_qbz5n2kfra8p0\\localcache\\local-packages\\python311\\site-packages (from pandas) (2.9.0.post0)\n", "Collecting pytz>=2020.1 (from pandas)\n", " Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n", "Collecting tzdata>=2022.7 (from pandas)\n", " Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\user\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.11_qbz5n2kfra8p0\\localcache\\local-packages\\python311\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", "Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)\n", " ---------------------------------------- 0.0/11.6 MB ? eta -:--:--\n", " - -------------------------------------- 0.4/11.6 MB 24.3 MB/s eta 0:00:01\n", " --- ------------------------------------ 1.0/11.6 MB 20.5 MB/s eta 0:00:01\n", " --------- ------------------------------ 2.8/11.6 MB 22.0 MB/s eta 0:00:01\n", " ----------- ---------------------------- 3.3/11.6 MB 20.9 MB/s eta 0:00:01\n", " ------------------ --------------------- 5.3/11.6 MB 28.1 MB/s eta 0:00:01\n", " ----------------------- ---------------- 6.8/11.6 MB 27.1 MB/s eta 0:00:01\n", " ------------------------ --------------- 7.2/11.6 MB 22.9 MB/s eta 0:00:01\n", " ----------------------------- ---------- 8.6/11.6 MB 26.0 MB/s eta 0:00:01\n", " -------------------------------------- - 11.3/11.6 MB 31.1 MB/s eta 0:00:01\n", " ---------------------------------------- 11.6/11.6 MB 28.5 MB/s eta 0:00:00\n", "Downloading numpy-2.2.4-cp311-cp311-win_amd64.whl (12.9 MB)\n", " ---------------------------------------- 0.0/12.9 MB ? eta -:--:--\n", " ----------- ---------------------------- 3.7/12.9 MB 120.6 MB/s eta 0:00:01\n", " ------------------- -------------------- 6.2/12.9 MB 79.0 MB/s eta 0:00:01\n", " ------------------------ --------------- 7.9/12.9 MB 63.4 MB/s eta 0:00:01\n", " ------------------------ --------------- 8.0/12.9 MB 56.5 MB/s eta 0:00:01\n", " ------------------------- -------------- 8.3/12.9 MB 40.8 MB/s eta 0:00:01\n", " ---------------------------- ----------- 9.2/12.9 MB 34.5 MB/s eta 0:00:01\n", " ------------------------------------ --- 11.7/12.9 MB 34.4 MB/s eta 0:00:01\n", " ---------------------------------------- 12.9/12.9 MB 31.2 MB/s eta 0:00:00\n", "Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)\n", " ---------------------------------------- 0.0/509.2 kB ? eta -:--:--\n", " --------------------------------------- 509.2/509.2 kB 31.2 MB/s eta 0:00:00\n", "Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)\n", " ---------------------------------------- 0.0/347.8 kB ? eta -:--:--\n", " --------------------------------------- 347.8/347.8 kB 21.1 MB/s eta 0:00:00\n", "Installing collected packages: pytz, tzdata, numpy, pandas\n", "Successfully installed numpy-2.2.4 pandas-2.2.3 pytz-2025.2 tzdata-2025.2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "[notice] A new release of pip is available: 24.0 -> 25.0.1\n", "[notice] To update, run: C:\\Users\\USER\\AppData\\Local\\Microsoft\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\python.exe -m pip install --upgrade pip\n" ] } ], "source": [ "!pip install pandas" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ 변환 완료: 376개의 대화가 sad_couple.jsonl에 저장됨.\n" ] } ], "source": [ "import pandas as pd\n", "import json\n", "import glob\n", "import os\n", "\n", "# TSV 파일들이 있는 폴더 경로\n", "folder_path = r\"C:\\Users\\USER\\Desktop\\empathy_data\\data\\046.공감형 대화 (연인만)\\046.공감형 대화 (연인만)\\01-1.정식개방데이터\\Training\\01.원천데이터\\TS_슬픔_연인\"\n", "\n", "# 폴더 내 모든 .tsv 파일 찾기\n", "tsv_files = glob.glob(os.path.join(folder_path, \"*.tsv\"))\n", "\n", "# 결과 저장할 딕셔너리\n", "conversations = {}\n", "\n", "# 모든 TSV 파일 처리\n", "for file in tsv_files:\n", " df = pd.read_csv(file, sep=\"\\t\") # TSV 파일 읽기\n", " \n", " for _, row in df.iterrows():\n", " conv_id = row[\"id\"] # 대화 ID\n", " utterance_type = \"user\" if row[\"utterance_type\"] == 0 else \"assistant\"\n", " \n", " # 새로운 대화 ID가 나오면 리스트 초기화\n", " if conv_id not in conversations:\n", " conversations[conv_id] = []\n", " \n", " # 발화 추가\n", " conversations[conv_id].append({\n", " \"role\": utterance_type,\n", " \"content\": row[\"utterance_text\"]\n", " })\n", "\n", "# JSONL 파일로 저장\n", "output_file = \"sad_couple.jsonl\"\n", "with open(output_file, \"w\", encoding=\"utf-8\") as f:\n", " for conv_id, convo in conversations.items():\n", " json.dump({\"conversation\": convo}, f, ensure_ascii=False)\n", " f.write(\"\\n\")\n", "\n", "print(f\"✅ 변환 완료: {len(conversations)}개의 대화가 {output_file}에 저장됨.\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## combine" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ 변환 완료: 6개의 JSONL 파일이 merged_train.jsonl에 합쳐졌습니다.\n" ] } ], "source": [ "import json\n", "import glob\n", "import os\n", "\n", "# JSONL 파일들이 있는 폴더 경로\n", "folder_path = r\"C:\\Users\\USER\\Desktop\\empathy_data\\data\"\n", "\n", "# 폴더 내 모든 .jsonl 파일 찾기\n", "jsonl_files = glob.glob(os.path.join(folder_path, \"*.jsonl\"))\n", "\n", "# 결과를 저장할 최종 JSONL 파일 경로\n", "output_file = \"merged_train.jsonl\"\n", "\n", "# 모든 JSONL 파일을 하나로 합침\n", "with open(output_file, \"w\", encoding=\"utf-8\") as outfile:\n", " for file in jsonl_files:\n", " with open(file, \"r\", encoding=\"utf-8\") as infile:\n", " # 파일 내용 읽고 그대로 출력 파일에 작성\n", " for line in infile:\n", " outfile.write(line)\n", "\n", "print(f\"✅ 변환 완료: {len(jsonl_files)}개의 JSONL 파일이 {output_file}에 합쳐졌습니다.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }