qingy2024 committed on
Commit
c64a1ae
·
verified ·
1 Parent(s): 779cffa

Training in progress, step 100

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/upload-checkpoint.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import numpy as np
4
+ import time
5
+ import shutil
6
+ import json
7
+ import matplotlib.pyplot as plt
8
+ from huggingface_hub import login, create_repo, upload_folder, HfFolder
9
+ from pathlib import Path # Using pathlib for easier path manipulation
10
+
11
# --- Configuration Constants ---
# Hub identifiers: the base model written into adapter_config.json / README,
# and the repository that receives every uploaded checkpoint.
BASE_MODEL_NAME: str = "Qwen/Qwen2.5-7B-Instruct"
TARGET_REPO_NAME: str = "qingy2024/Gradience-T1-7B-Checkpoint"  # Specify your target repo

# Length of the full training run (update if necessary); denominator of the
# progress figure shown in the generated README.
TOTAL_STEPS: int = 4918

# Names of the files read from / written into each checkpoint folder.
README_FILENAME: str = "README.md"
ADAPTER_CONFIG_FILENAME: str = "adapter_config.json"
TRAINER_STATE_FILENAME: str = "trainer_state.json"
LOSS_PLOT_FILENAME: str = "loss.png"

# Window size of the running average drawn on the loss plot.
LOSS_SMOOTHING_WINDOW: int = 10

# Monitoring: which directory names count as checkpoints, how often to look
# for them, and how long to wait after spotting one before processing it.
CHECKPOINT_DIR_PATTERN = re.compile(r"^checkpoint-(\d+)$")
POLL_INTERVAL_SECONDS: int = 30
PRE_UPLOAD_DELAY_SECONDS: int = 10

# --- Global State ---
# Checkpoint folders (as Path objects) that have already been uploaded.
uploaded_checkpoints: set = set()
36
+
37
+ # --- Helper Functions ---
38
+
39
def get_huggingface_token():
    """Return a Hugging Face access token.

    The HUGGINGFACE_TOKEN environment variable is consulted first. When it is
    unset or empty, the token returned by ``HfFolder.get_token()`` is used.

    Raises:
        ValueError: If neither source yields a token.
    """
    env_token = os.getenv('HUGGINGFACE_TOKEN')
    if env_token:
        print("Using Hugging Face token from HUGGINGFACE_TOKEN environment variable.")
        return env_token

    saved_token = HfFolder.get_token()
    if not saved_token:
        raise ValueError("Hugging Face token not found. Set HUGGINGFACE_TOKEN environment variable or login using `huggingface-cli login`.")

    print("Using Hugging Face token from saved credentials.")
    return saved_token
50
+
51
def update_adapter_config(config_path: Path, base_model_name: str):
    """
    Reads adapter_config.json, sets its 'base_model_name_or_path' field to
    the given base model name, and saves it back.

    Failures are reported on stdout instead of being raised, so a missing or
    malformed config never aborts the caller. The file is left untouched
    unless the new content could be fully serialized.

    Args:
        config_path (Path): Path to the adapter_config.json file.
        base_model_name (str): The base model name to set.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, 'r', encoding='utf-8') as file:
            config = json.load(file)

        if not isinstance(config, dict):
            print(f"Error: Expected a JSON object in {config_path}, found {type(config).__name__}")
            return

        config['base_model_name_or_path'] = base_model_name

        # Serialize BEFORE opening for writing: mode 'w' truncates the file,
        # so a serialization error must surface while the original content
        # is still intact on disk.
        serialized = json.dumps(config, indent=2)
        with open(config_path, 'w', encoding='utf-8') as file:
            file.write(serialized)
        print(f"Updated 'base_model_name_or_path' in {config_path}")

    except FileNotFoundError:
        print(f"Error: Adapter config file not found at {config_path}")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {config_path}. Is it valid?")
    except Exception as e:
        print(f"An unexpected error occurred while updating {config_path}: {e}")
78
+
79
def generate_readme_content(checkpoint_number: int, total_steps: int, base_model: str, loss_plot_filename: str) -> str:
    """Generates the README content with updated progress.

    The README consists of a metadata header naming the base model, a title
    with the checkpoint step, an inline-styled HTML progress bar, and an
    image reference to the loss plot.

    Args:
        checkpoint_number (int): Step number of the checkpoint being described.
        total_steps (int): Total expected steps; values <= 0 yield 0% progress.
        base_model (str): Base model identifier written into the header.
        loss_plot_filename (str): File name of the loss plot image to embed.

    Returns:
        str: The README text, with surrounding whitespace stripped.
    """
    if total_steps <= 0:
        progress_percentage = 0.0
    else:
        # Clamp to [0, 100]: the value becomes a CSS width, so it must be
        # neither negative nor larger than the bar's container.
        progress_percentage = max(0.0, min(100.0, (checkpoint_number / total_steps) * 100))

    progress_width = f"{progress_percentage:.2f}%"
    progress_text = f"Progress: {checkpoint_number} out of {total_steps} steps"

    # Literal CSS braces are doubled ({{ }}) because this is an f-string.
    readme_template = f"""
---
base_model: {base_model}
library_name: peft
---
# Gradience T1 7B (Step {checkpoint_number} Checkpoint)

> [!NOTE]
> Training in progress...

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Progress Bar Example</title>
<style>
.progress-container {{
width: 100%;
background-color: #e0e0e0;
border-radius: 25px;
overflow: hidden;
margin: 20px 0;
}}
.progress-bar {{
height: 30px;
width: 0;
background-color: #44965a;
text-align: center;
line-height: 30px;
color: white;
border-radius: 25px 0 0 25px;
}}
.progress-text {{
margin-top: 10px;
font-size: 16px;
font-family: Arial, sans-serif;
}}
</style>
</head>
<body>
<div style="width: 100%; background-color: #e0e0e0; border-radius: 25px; overflow: hidden; margin: 20px 0;">
<div style="height: 30px; width: {progress_width}; background-color: #76c7c0; text-align: center; line-height: 30px; color: white; border-radius: 25px 0 0 25px;">
<!-- {progress_percentage:.2f}% -->
</div>
</div>
<p style="font-family: Arial, sans-serif; font-size: 16px;">{progress_text}</p>
</body>
</html>

## Training Loss
![Training Loss Chart]({loss_plot_filename})
""".strip()
    return readme_template
143
+
144
def plot_loss_from_json(
    json_file_path: Path,
    output_image_path: Path,
    smooth_steps: int = LOSS_SMOOTHING_WINDOW
):
    """
    Reads training log data from a JSON file (trainer_state.json),
    extracts loss and step values, plots the original loss and a smoothed
    version (running average), and saves the plot to a PNG file.

    Problems with the input (missing file, invalid JSON, no usable
    'log_history' entries) and errors while saving are reported on stdout
    and end the function without raising.

    Args:
        json_file_path (Path): Path to the input trainer_state.json file.
        output_image_path (Path): Path where the output PNG plot will be saved.
        smooth_steps (int): Window size for running average smoothing.
                            If <= 0, no smoothing is applied.
    """
    print(f"Reading training log data from: {json_file_path}")
    print(f"Smoothing window: {smooth_steps if smooth_steps > 0 else 'Disabled'}")

    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}. Is it valid?")
        return
    except Exception as e:
        print(f"An unexpected error occurred while reading {json_file_path}: {e}")
        return

    # Only a JSON object can carry a 'log_history' key; any other top-level
    # value (list, number, ...) is treated the same as a missing key.
    log_history = data.get("log_history") if isinstance(data, dict) else None
    if not isinstance(log_history, list):
        print(f"Error: 'log_history' key not found or not a list in {json_file_path}")
        return

    steps, losses = [], []
    for entry in log_history:
        if not (isinstance(entry, dict) and "step" in entry and "loss" in entry and entry["loss"] is not None):
            continue
        try:
            # Convert BOTH values before appending either, so a bad loss
            # cannot leave `steps` one element longer than `losses`.
            step_value = int(entry["step"])
            loss_value = float(entry["loss"])
        except (ValueError, TypeError):
            print(f"Warning: Skipping entry with non-numeric step/loss: {entry}")
            continue
        steps.append(step_value)
        losses.append(loss_value)

    if not steps:
        print("No valid step/loss data found in the log history to plot.")
        return

    # Convert to numpy arrays and sort by step so the line is drawn left to right.
    steps = np.array(steps)
    losses = np.array(losses)
    sorted_indices = np.argsort(steps)
    steps = steps[sorted_indices]
    losses = losses[sorted_indices]

    print(f"Found {len(steps)} valid data points to plot.")

    # Calculate Running Average
    smoothed_losses = None
    smoothed_steps = None
    apply_smoothing = smooth_steps > 0 and len(losses) >= smooth_steps

    if apply_smoothing:
        try:
            weights = np.ones(smooth_steps) / smooth_steps
            smoothed_losses = np.convolve(losses, weights, mode='valid')
            # 'valid' mode drops the first (window - 1) points; align steps to match.
            smoothed_steps = steps[smooth_steps - 1:]
            print(f"Calculated smoothed loss over {len(smoothed_steps)} points.")
        except Exception as e:
            print(f"Warning: Could not calculate smoothed loss. Error: {e}")
            apply_smoothing = False  # Disable if calculation fails
    elif smooth_steps > 0:
        print(f"Warning: Not enough data points ({len(losses)}) for smoothing window ({smooth_steps}). Skipping smoothing.")

    # Plotting
    # The seaborn style sheets were renamed across matplotlib releases; pick
    # whichever name this installation knows instead of raising on lookup.
    available_styles = plt.style.available
    style_name = next(
        (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid') if name in available_styles),
        'default',
    )

    # style.context restores rcParams on exit, so the style does not leak
    # into other plots made by this process.
    with plt.style.context(style_name):
        fig = plt.figure(figsize=(10, 6))
        try:
            plt.plot(steps, losses, linestyle='-', color='skyblue', alpha=0.5, label='Original Loss')

            if apply_smoothing and smoothed_losses is not None and smoothed_steps is not None:
                plt.plot(smoothed_steps, smoothed_losses, linestyle='-', color='dodgerblue', alpha=1.0, linewidth=1.5,
                         label=f'Smoothed Loss ({smooth_steps}-step avg)')

            plt.xlabel("Step")
            plt.ylabel("Loss")
            plt.title("Training Loss Progression")
            plt.legend()
            plt.tight_layout()

            # Saving
            try:
                fig.savefig(output_image_path, format='png', dpi=150)
                print(f"Plot successfully saved to: {output_image_path}")
            except Exception as e:
                print(f"Error saving plot to {output_image_path}: {e}")
        finally:
            # Close on every path (including plotting errors) to free memory.
            plt.close(fig)
246
+
247
def prepare_checkpoint_folder(checkpoint_path: Path, checkpoint_number: int):
    """
    Gets a checkpoint folder ready for upload: rewrites the base model name
    in adapter_config.json, renders the loss plot from trainer_state.json,
    and writes a README.md describing training progress.

    Args:
        checkpoint_path (Path): The checkpoint folder to modify in place.
        checkpoint_number (int): Step number used for the README progress.
    """
    print(f"Preparing checkpoint folder: {checkpoint_path}")

    # Step 1: point the adapter config at the base model name.
    update_adapter_config(checkpoint_path / ADAPTER_CONFIG_FILENAME, BASE_MODEL_NAME)

    # Step 2: render the loss curve next to the trainer state it is read from.
    plot_loss_from_json(
        checkpoint_path / TRAINER_STATE_FILENAME,
        checkpoint_path / LOSS_PLOT_FILENAME,
        smooth_steps=LOSS_SMOOTHING_WINDOW,
    )

    # Step 3: build the README text and store it in the folder.
    readme_path = checkpoint_path / README_FILENAME
    readme_text = generate_readme_content(checkpoint_number, TOTAL_STEPS, BASE_MODEL_NAME, LOSS_PLOT_FILENAME)
    try:
        with open(readme_path, 'w', encoding='utf-8') as readme_file:
            readme_file.write(readme_text)
        print(f"Generated and saved {README_FILENAME} in {checkpoint_path}")
    except Exception as e:
        print(f"Error writing README file to {readme_path}: {e}")
272
+
273
+ # --- Core Logic ---
274
+
275
def find_new_checkpoint(current_dir: Path = Path('.')) -> tuple[int, Path] | None:
    """
    Scans a directory for checkpoint folders that have not been uploaded yet
    and picks the one with the highest step number.

    Args:
        current_dir (Path): The directory to scan for checkpoints.

    Returns:
        tuple[int, Path] | None: (checkpoint_number, folder_path) of the newest
                                 pending checkpoint, or None when there is none
                                 or the directory could not be scanned.
    """
    candidates = []
    try:
        for entry in current_dir.iterdir():
            if not entry.is_dir():
                continue
            name_match = CHECKPOINT_DIR_PATTERN.match(entry.name)
            # Skip anything that is not a checkpoint folder or was already uploaded.
            if name_match is None or entry in uploaded_checkpoints:
                continue
            candidates.append((int(name_match.group(1)), entry))
    except FileNotFoundError:
        print(f"Error: Directory not found: {current_dir}")
        return None
    except Exception as e:
        print(f"Error scanning directory {current_dir}: {e}")
        return None

    if not candidates:
        return None
    # Highest step number wins.
    return max(candidates, key=lambda pair: pair[0])
307
+
308
def upload_checkpoint_to_hf(folder_path: Path, checkpoint_number: int, repo_id: str):
    """
    Pushes a prepared checkpoint folder to the Hugging Face Hub and, once the
    upload has succeeded, removes the folder from local disk.

    Args:
        folder_path (Path): Path to the local checkpoint folder.
        checkpoint_number (int): The checkpoint step number (used in the commit message).
        repo_id (str): The Hugging Face repository ID (e.g., "username/repo-name").

    Returns:
        bool: True when the upload succeeded (even if local deletion failed),
              False when the upload failed; the folder is then kept.
    """
    print(f"\nAttempting to upload {folder_path.name} to Hugging Face repository: {repo_id}...")

    try:
        # Make sure the target repository is there before pushing to it.
        create_repo(repo_id, repo_type="model", exist_ok=True)
        print(f"Repository {repo_id} exists or was created.")

        upload_folder(
            folder_path=str(folder_path),  # passed as a plain string path
            repo_id=repo_id,
            commit_message=f"Upload checkpoint {checkpoint_number}",
            repo_type="model",
        )
        print(f"Successfully uploaded contents of {folder_path.name} to {repo_id}.")

        # Local cleanup happens only after the upload went through; a failed
        # deletion is reported but does not change the outcome.
        try:
            shutil.rmtree(folder_path)
            print(f"Successfully deleted local folder: {folder_path}")
        except OSError as cleanup_error:
            print(f"Error deleting local folder {folder_path}: {cleanup_error}. Please delete manually.")
        return True

    except Exception as upload_error:
        print(f"ERROR during Hugging Face upload for {folder_path.name}: {upload_error}")
        print("Upload failed. Local folder will not be deleted.")
        return False
347
+
348
+ # --- Main Execution ---
349
+
350
def main():
    """
    Main loop to monitor for new checkpoints, prepare them, upload them to
    Hugging Face Hub, and clean up locally.

    Logs in first and returns without monitoring if no token is available or
    the login fails. Otherwise polls the current directory indefinitely; the
    loop only ends when the process is interrupted.
    """
    try:
        hf_token = get_huggingface_token()
        login(hf_token)
        print("\nSuccessfully logged into Hugging Face Hub.")
    except ValueError as e:
        print(f"Error: {e}")
        return  # Exit if no token could be found
    except Exception as e:
        print(f"An unexpected error occurred during Hugging Face login: {e}")
        return

    print("\nStarting checkpoint monitor...")
    print(f"Will check for new checkpoints matching '{CHECKPOINT_DIR_PATTERN.pattern}' every {POLL_INTERVAL_SECONDS} seconds.")
    print(f"Target repository: {TARGET_REPO_NAME}")
    print(f"Found checkpoints will be tracked (not re-uploaded): {uploaded_checkpoints or 'None yet'}")
    print("-" * 30)

    while True:
        new_checkpoint_info = find_new_checkpoint()

        if new_checkpoint_info:
            checkpoint_number, folder_path = new_checkpoint_info
            print(f"\nFound new checkpoint: {folder_path.name} (Step {checkpoint_number})")

            # Wait a bit in case files are still being written
            print(f"Waiting {PRE_UPLOAD_DELAY_SECONDS} seconds before processing...")
            time.sleep(PRE_UPLOAD_DELAY_SECONDS)

            # Prepare the folder (update README, config, generate plot)
            prepare_checkpoint_folder(folder_path, checkpoint_number)

            # Attempt upload and deletion
            upload_successful = upload_checkpoint_to_hf(
                folder_path=folder_path,
                checkpoint_number=checkpoint_number,
                repo_id=TARGET_REPO_NAME
            )

            if upload_successful:
                # Record only successful uploads so failed ones are retried
                # on a later poll.
                uploaded_checkpoints.add(folder_path)
                print(f"Added {folder_path.name} to the set of processed checkpoints.")

            print("-" * 30)  # Separator after processing a checkpoint

        else:
            # '\r' rewrites the same status line. Flush explicitly: with no
            # trailing newline the text would otherwise sit in the stdout
            # buffer and not be shown before the sleep below.
            print(f"\rNo new checkpoints found. Checking again in {POLL_INTERVAL_SECONDS} seconds... ", end="", flush=True)

        # Wait before the next check
        time.sleep(POLL_INTERVAL_SECONDS)
406
+
407
if __name__ == "__main__":
    # Run the monitor only when executed as a script. Ctrl+C is the normal
    # way to stop it, so report that instead of showing a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nMonitoring stopped by user.")
adapter_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "./Qwen-2.5-7B-Instruct",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": false,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "q_proj",
27
+ "o_proj",
28
+ "gate_proj",
29
+ "up_proj",
30
+ "down_proj",
31
+ "k_proj",
32
+ "v_proj"
33
+ ],
34
+ "task_type": "CAUSAL_LM",
35
+ "use_dora": false,
36
+ "use_rslora": false
37
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e3353418dbd5964c39e5c93c99e56d574d216efe02d8f38dcee8e4c0ea3124a
3
+ size 161533192
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 128000,
204
+ "pad_token": "<|endoftext|>",
205
+ "padding_side": "right",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:037e3609d0a8e029fa9ca5c2f0ef2b54b5b2444009261370eb659c500ce44612
3
+ size 5688
upload.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import numpy as np
4
+ import time
5
+ import shutil
6
+ import json
7
+ import matplotlib.pyplot as plt
8
+ from huggingface_hub import login, create_repo, upload_folder, HfFolder
9
+ from pathlib import Path # Using pathlib for easier path manipulation
10
+
11
# --- Configuration Constants ---
# Hub identifiers: the base model written into adapter_config.json / README,
# and the repository that receives every uploaded checkpoint.
BASE_MODEL_NAME: str = "Qwen/Qwen2.5-7B-Instruct"
TARGET_REPO_NAME: str = "qingy2024/Gradience-T1-7B-Checkpoint"  # Specify your target repo

# Length of the full training run (update if necessary); denominator of the
# progress figure shown in the generated README.
TOTAL_STEPS: int = 4918

# Names of the files read from / written into each checkpoint folder.
README_FILENAME: str = "README.md"
ADAPTER_CONFIG_FILENAME: str = "adapter_config.json"
TRAINER_STATE_FILENAME: str = "trainer_state.json"
LOSS_PLOT_FILENAME: str = "loss.png"

# Window size of the running average drawn on the loss plot.
LOSS_SMOOTHING_WINDOW: int = 10

# Monitoring: which directory names count as checkpoints, how often to look
# for them, and how long to wait after spotting one before processing it.
CHECKPOINT_DIR_PATTERN = re.compile(r"^checkpoint-(\d+)$")
POLL_INTERVAL_SECONDS: int = 30
PRE_UPLOAD_DELAY_SECONDS: int = 10

# --- Global State ---
# Checkpoint folders (as Path objects) that have already been uploaded.
uploaded_checkpoints: set = set()
36
+
37
+ # --- Helper Functions ---
38
+
39
def get_huggingface_token():
    """Return a Hugging Face access token.

    The HUGGINGFACE_TOKEN environment variable is consulted first. When it is
    unset or empty, the token returned by ``HfFolder.get_token()`` is used.

    Raises:
        ValueError: If neither source yields a token.
    """
    env_token = os.getenv('HUGGINGFACE_TOKEN')
    if env_token:
        print("Using Hugging Face token from HUGGINGFACE_TOKEN environment variable.")
        return env_token

    saved_token = HfFolder.get_token()
    if not saved_token:
        raise ValueError("Hugging Face token not found. Set HUGGINGFACE_TOKEN environment variable or login using `huggingface-cli login`.")

    print("Using Hugging Face token from saved credentials.")
    return saved_token
50
+
51
def update_adapter_config(config_path: Path, base_model_name: str):
    """
    Reads adapter_config.json, sets its 'base_model_name_or_path' field to
    the given base model name, and saves it back.

    Failures are reported on stdout instead of being raised, so a missing or
    malformed config never aborts the caller. The file is left untouched
    unless the new content could be fully serialized.

    Args:
        config_path (Path): Path to the adapter_config.json file.
        base_model_name (str): The base model name to set.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, 'r', encoding='utf-8') as file:
            config = json.load(file)

        if not isinstance(config, dict):
            print(f"Error: Expected a JSON object in {config_path}, found {type(config).__name__}")
            return

        config['base_model_name_or_path'] = base_model_name

        # Serialize BEFORE opening for writing: mode 'w' truncates the file,
        # so a serialization error must surface while the original content
        # is still intact on disk.
        serialized = json.dumps(config, indent=2)
        with open(config_path, 'w', encoding='utf-8') as file:
            file.write(serialized)
        print(f"Updated 'base_model_name_or_path' in {config_path}")

    except FileNotFoundError:
        print(f"Error: Adapter config file not found at {config_path}")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {config_path}. Is it valid?")
    except Exception as e:
        print(f"An unexpected error occurred while updating {config_path}: {e}")
78
+
79
def generate_readme_content(checkpoint_number: int, total_steps: int, base_model: str, loss_plot_filename: str) -> str:
    """Generates the README content with updated progress.

    The README consists of a metadata header naming the base model, a title
    with the checkpoint step, an inline-styled HTML progress bar, and an
    image reference to the loss plot.

    Args:
        checkpoint_number (int): Step number of the checkpoint being described.
        total_steps (int): Total expected steps; values <= 0 yield 0% progress.
        base_model (str): Base model identifier written into the header.
        loss_plot_filename (str): File name of the loss plot image to embed.

    Returns:
        str: The README text, with surrounding whitespace stripped.
    """
    if total_steps <= 0:
        progress_percentage = 0.0
    else:
        # Clamp to [0, 100]: the value becomes a CSS width, so it must be
        # neither negative nor larger than the bar's container.
        progress_percentage = max(0.0, min(100.0, (checkpoint_number / total_steps) * 100))

    progress_width = f"{progress_percentage:.2f}%"
    progress_text = f"Progress: {checkpoint_number} out of {total_steps} steps"

    # Literal CSS braces are doubled ({{ }}) because this is an f-string.
    readme_template = f"""
---
base_model: {base_model}
library_name: peft
---
# Gradience T1 7B (Step {checkpoint_number} Checkpoint)

> [!NOTE]
> Training in progress...

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Progress Bar Example</title>
<style>
.progress-container {{
width: 100%;
background-color: #e0e0e0;
border-radius: 25px;
overflow: hidden;
margin: 20px 0;
}}
.progress-bar {{
height: 30px;
width: 0;
background-color: #44965a;
text-align: center;
line-height: 30px;
color: white;
border-radius: 25px 0 0 25px;
}}
.progress-text {{
margin-top: 10px;
font-size: 16px;
font-family: Arial, sans-serif;
}}
</style>
</head>
<body>
<div style="width: 100%; background-color: #e0e0e0; border-radius: 25px; overflow: hidden; margin: 20px 0;">
<div style="height: 30px; width: {progress_width}; background-color: #76c7c0; text-align: center; line-height: 30px; color: white; border-radius: 25px 0 0 25px;">
<!-- {progress_percentage:.2f}% -->
</div>
</div>
<p style="font-family: Arial, sans-serif; font-size: 16px;">{progress_text}</p>
</body>
</html>

## Training Loss
![Training Loss Chart]({loss_plot_filename})
""".strip()
    return readme_template
143
+
144
def _extract_loss_points(log_history: list) -> tuple[list[int], list[float]]:
    """Collect aligned step/loss values from log entries, skipping unusable ones."""
    steps, losses = [], []
    for entry in log_history:
        if not isinstance(entry, dict) or "step" not in entry or entry.get("loss") is None:
            continue
        try:
            # Convert BOTH values before storing either one. Appending the step
            # first and then failing on the loss would leave the two lists with
            # different lengths and misalign every later point.
            step, loss = int(entry["step"]), float(entry["loss"])
        except (ValueError, TypeError):
            print(f"Warning: Skipping entry with non-numeric step/loss: {entry}")
            continue
        steps.append(step)
        losses.append(loss)
    return steps, losses


def plot_loss_from_json(
    json_file_path: Path,
    output_image_path: Path,
    smooth_steps: int = LOSS_SMOOTHING_WINDOW
):
    """
    Plot training loss from a trainer_state.json file and save it as a PNG.

    Reads the 'log_history' list, keeps entries that carry a numeric 'step'
    and a non-null numeric 'loss', sorts them by step, and draws the raw loss
    plus (when enough points exist) a running average over smooth_steps points.
    Read/parse problems and missing data are reported on stdout and the
    function returns without producing a file.

    Args:
        json_file_path (Path): Path to the input trainer_state.json file.
        output_image_path (Path): Path where the output PNG plot will be saved.
        smooth_steps (int): Window size for running average smoothing.
                            If <= 0, no smoothing is applied.
    """
    print(f"Reading training log data from: {json_file_path}")
    print(f"Smoothing window: {smooth_steps if smooth_steps > 0 else 'Disabled'}")

    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}. Is it valid?")
        return
    except Exception as e:
        print(f"An unexpected error occurred while reading {json_file_path}: {e}")
        return

    # A non-object JSON root has no .get(); treat it like a missing 'log_history'.
    log_history = data.get("log_history") if isinstance(data, dict) else None
    if not isinstance(log_history, list):
        print(f"Error: 'log_history' key not found or not a list in {json_file_path}")
        return

    steps, losses = _extract_loss_points(log_history)
    if not steps:
        print("No valid step/loss data found in the log history to plot.")
        return

    # Convert to numpy arrays and order both by step so the line is drawn left to right.
    steps = np.array(steps)
    losses = np.array(losses)
    sorted_indices = np.argsort(steps)
    steps = steps[sorted_indices]
    losses = losses[sorted_indices]

    print(f"Found {len(steps)} valid data points to plot.")

    # Running average (only when the window fits inside the data).
    smoothed_losses = None
    smoothed_steps = None
    apply_smoothing = smooth_steps > 0 and len(losses) >= smooth_steps

    if apply_smoothing:
        try:
            weights = np.ones(smooth_steps) / smooth_steps
            smoothed_losses = np.convolve(losses, weights, mode='valid')
            # 'valid' mode drops the first smooth_steps - 1 outputs, so each
            # average is plotted at the last step of its window.
            smoothed_steps = steps[smooth_steps - 1:]
            print(f"Calculated smoothed loss over {len(smoothed_steps)} points.")
        except Exception as e:
            print(f"Warning: Could not calculate smoothed loss. Error: {e}")
            apply_smoothing = False  # Disable if calculation fails
    elif smooth_steps > 0:
        print(f"Warning: Not enough data points ({len(losses)}) for smoothing window ({smooth_steps}). Skipping smoothing.")

    # Plotting
    plt.style.use('seaborn-v0_8-darkgrid')
    plt.figure(figsize=(10, 6))
    try:
        plt.plot(steps, losses, linestyle='-', color='skyblue', alpha=0.5, label='Original Loss')

        if apply_smoothing and smoothed_losses is not None and smoothed_steps is not None:
            plt.plot(smoothed_steps, smoothed_losses, linestyle='-', color='dodgerblue', alpha=1.0, linewidth=1.5,
                     label=f'Smoothed Loss ({smooth_steps}-step avg)')

        plt.xlabel("Step")
        plt.ylabel("Loss")
        plt.title("Training Loss Progression")
        plt.legend()
        plt.tight_layout()

        # Saving
        try:
            plt.savefig(output_image_path, format='png', dpi=150)
            print(f"Plot successfully saved to: {output_image_path}")
        except Exception as e:
            print(f"Error saving plot to {output_image_path}: {e}")
    finally:
        # Close the figure even if a drawing call above raised, so repeated
        # calls from a long-running process do not accumulate open figures.
        plt.close()
246
+
247
def prepare_checkpoint_folder(checkpoint_path: Path, checkpoint_number: int):
    """
    Get a checkpoint folder ready for upload.

    Runs three steps in order inside checkpoint_path: rewrite the adapter
    config's base model field, render the loss plot from the trainer state,
    and write a freshly generated README. A README write failure is printed
    rather than raised.

    Args:
        checkpoint_path (Path): The checkpoint folder to modify in place.
        checkpoint_number (int): Step number shown in the generated README.
    """
    print(f"Preparing checkpoint folder: {checkpoint_path}")

    # Step 1: point the adapter config at the base model.
    update_adapter_config(checkpoint_path / ADAPTER_CONFIG_FILENAME, BASE_MODEL_NAME)

    # Step 2: render the loss curve next to the checkpoint files.
    plot_loss_from_json(
        checkpoint_path / TRAINER_STATE_FILENAME,
        checkpoint_path / LOSS_PLOT_FILENAME,
        smooth_steps=LOSS_SMOOTHING_WINDOW,
    )

    # Step 3: write the README that references the plot by file name.
    readme_path = checkpoint_path / README_FILENAME
    readme_text = generate_readme_content(checkpoint_number, TOTAL_STEPS, BASE_MODEL_NAME, LOSS_PLOT_FILENAME)
    try:
        readme_path.write_text(readme_text, encoding='utf-8')
        print(f"Generated and saved {README_FILENAME} in {checkpoint_path}")
    except Exception as e:
        print(f"Error writing README file to {readme_path}: {e}")
272
+
273
+ # --- Core Logic ---
274
+
275
def find_new_checkpoint(current_dir: Path = Path('.')) -> tuple[int, Path] | None:
    """
    Pick the not-yet-uploaded checkpoint folder with the largest step number.

    Scans the immediate children of current_dir, keeps directories whose name
    matches CHECKPOINT_DIR_PATTERN and that are absent from
    uploaded_checkpoints, and reads the step number from the pattern's first
    capture group.

    Args:
        current_dir (Path): The directory to scan for checkpoints.

    Returns:
        tuple[int, Path] | None: (checkpoint_number, folder_path) for the
                                 highest step found, or None when nothing new
                                 exists or the scan itself failed.
    """
    candidates = []
    try:
        for entry in current_dir.iterdir():
            if not entry.is_dir():
                continue
            matched = CHECKPOINT_DIR_PATTERN.match(entry.name)
            # Skip anything that is not a checkpoint folder or was already handled.
            if matched is None or entry in uploaded_checkpoints:
                continue
            candidates.append((int(matched.group(1)), entry))
    except FileNotFoundError:
        print(f"Error: Directory not found: {current_dir}")
        return None
    except Exception as e:
        print(f"Error scanning directory {current_dir}: {e}")
        return None

    if not candidates:
        return None
    # Highest step number wins.
    return max(candidates, key=lambda pair: pair[0])
307
+
308
def upload_checkpoint_to_hf(folder_path: Path, checkpoint_number: int, repo_id: str):
    """
    Push a prepared checkpoint folder to the Hub, then remove it locally.

    The model repository is created if needed, the folder contents are
    uploaded in one commit, and only after that is the local folder deleted.

    Args:
        folder_path (Path): Path to the local checkpoint folder.
        checkpoint_number (int): The checkpoint step number (used in the commit message).
        repo_id (str): The Hugging Face repository ID (e.g., "username/repo-name").

    Returns:
        bool: True when the upload went through (even if the local deletion
              then failed), False when repository creation or upload raised.
    """
    print(f"\nAttempting to upload {folder_path.name} to Hugging Face repository: {repo_id}...")

    try:
        # Make sure the destination exists before pushing to it.
        create_repo(repo_id, repo_type="model", exist_ok=True)
        print(f"Repository {repo_id} exists or was created.")

        upload_folder(
            folder_path=str(folder_path),  # passed as a plain string path
            repo_id=repo_id,
            commit_message=f"Upload checkpoint {checkpoint_number}",
            repo_type="model",
        )
        print(f"Successfully uploaded contents of {folder_path.name} to {repo_id}.")

        # Local cleanup happens only once the upload has gone through.
        try:
            shutil.rmtree(folder_path)
            print(f"Successfully deleted local folder: {folder_path}")
        except OSError as e:
            print(f"Error deleting local folder {folder_path}: {e}. Please delete manually.")

        # The upload succeeded either way; a failed deletion does not change the outcome.
        return True

    except Exception as e:
        print(f"ERROR during Hugging Face upload for {folder_path.name}: {e}")
        print("Upload failed. Local folder will not be deleted.")
        return False
347
+
348
+ # --- Main Execution ---
349
+
350
def main():
    """
    Monitor for new checkpoints and upload each one to the Hugging Face Hub.

    Logs in once, then loops forever: look for the newest unprocessed
    checkpoint folder, wait PRE_UPLOAD_DELAY_SECONDS, prepare the folder
    (config, plot, README), upload it, and record it as processed when the
    upload succeeded. Every iteration ends with a POLL_INTERVAL_SECONDS sleep.
    Returns early, without monitoring, if no token is available or login fails.
    """
    try:
        hf_token = get_huggingface_token()
        login(hf_token)
        print("\nSuccessfully logged into Hugging Face Hub.")
    except ValueError as e:
        print(f"Error: {e}")
        return  # Exit if login fails
    except Exception as e:
        print(f"An unexpected error occurred during Hugging Face login: {e}")
        return

    print("\nStarting checkpoint monitor...")
    print(f"Will check for new checkpoints matching '{CHECKPOINT_DIR_PATTERN.pattern}' every {POLL_INTERVAL_SECONDS} seconds.")
    print(f"Target repository: {TARGET_REPO_NAME}")
    print(f"Found checkpoints will be tracked (not re-uploaded): {uploaded_checkpoints or 'None yet'}")
    print("-" * 30)

    while True:
        new_checkpoint_info = find_new_checkpoint()

        if new_checkpoint_info:
            checkpoint_number, folder_path = new_checkpoint_info
            print(f"\nFound new checkpoint: {folder_path.name} (Step {checkpoint_number})")

            # Wait a bit in case files are still being written.
            print(f"Waiting {PRE_UPLOAD_DELAY_SECONDS} seconds before processing...")
            time.sleep(PRE_UPLOAD_DELAY_SECONDS)

            # Prepare the folder (update README, config, generate plot)
            prepare_checkpoint_folder(folder_path, checkpoint_number)

            # Attempt upload and deletion
            upload_successful = upload_checkpoint_to_hf(
                folder_path=folder_path,
                checkpoint_number=checkpoint_number,
                repo_id=TARGET_REPO_NAME
            )

            if upload_successful:
                # Record only successful uploads; a failed one stays eligible
                # and is picked up again on a later poll.
                uploaded_checkpoints.add(folder_path)
                print(f"Added {folder_path.name} to the set of processed checkpoints.")

            print("-" * 30)  # Separator after processing a checkpoint

        else:
            # \r rewrites the same line on each idle poll. With end="" no newline
            # is emitted, so flush explicitly or the status may stay in the buffer.
            print(f"\rNo new checkpoints found. Checking again in {POLL_INTERVAL_SECONDS} seconds... ", end="", flush=True)

        # Wait before the next check
        time.sleep(POLL_INTERVAL_SECONDS)
406
+
407
# Script entry point: start the monitor only when run directly, not on import.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # main() polls in an endless loop, so Ctrl+C is the way to stop it;
        # print a short notice on its own line instead of a traceback.
        print("\nMonitoring stopped by user.")
vocab.json ADDED
The diff for this file is too large to render. See raw diff