""" Test a fine-tuned Unsloth/LLM review-label model on 1 000 random samples taken *after* the first 20 000 rows of the original CSV. Assumes your fine-tuned model + tokenizer are already on disk (e.g. in ./ft_model) and CUDA is available. pip install datasets pandas numpy torch transformers unsloth """ import re import random import torch import pandas as pd import numpy as np from datasets import Dataset from unsloth.chat_templates import get_chat_template from unsloth import FastLanguageModel # noqa: Unsloth ≥0.6 from transformers import AutoTokenizer, AutoModelForCausalLM # --------------------------------------------------------------------- # 1. Load rows AFTER the first 20 000 and keep 1 000 random samples # --------------------------------------------------------------------- CSV_PATH = "fake reviews dataset.csv" START_ROW = 20_000 # zero-based slice point N_SAMPLES = 1_000 RNG_SEED = 42 df = pd.read_csv(CSV_PATH, skiprows=range(1, START_ROW + 1)) # skip header + 20 000 rows df = df[["text_", "label"]].dropna() df = df[df["label"].isin(["CG", "OR"])] # Map CG→1, OR→0 (ground-truth) df["true_label"] = (df["label"] == "CG").astype(int) # If fewer than N_SAMPLES rows, use all of them sample_df = df.sample( n=min(N_SAMPLES, len(df)), random_state=RNG_SEED ).reset_index(drop=True) # --------------------------------------------------------------------- # 2. Load tokenizer / model and prepare the chat template # --------------------------------------------------------------------- MODEL_DIR = "./lora_model" # path to your fine-tuned model folder tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( MODEL_DIR, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True ) # tell Unsloth to use the Qwen-2.5 chat schema (same one used in fine-tuning) tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5") FastLanguageModel.for_inference(model) SYSTEM_PROMPT = ( "You are ReviewLabeler, an expert at distinguishing computer-generated " "reviews (label 1) from authentic human reviews (label 0)." ) # --------------------------------------------------------------------- # 3. Helper: run one review through the model and extract predicted label # --------------------------------------------------------------------- LABEL_RE = re.compile(r"label\s*:\s*([01])", re.I) @torch.inference_mode() def predict_label(review_text: str) -> int: """Return 0 or 1 extracted from model's answer (defaults to −1 on failure).""" messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": f"The review you need to label carefully: {review_text}"}, ] input_ids = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", ).to(model.device) gen_ids = model.generate( input_ids=input_ids, max_new_tokens=128, use_cache=True, temperature=0.8, top_p=0.9, ) completion = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0] match = LABEL_RE.search(completion) if match: return int(match.group(1)) # Fallback: if “computer generated” / “human written” appear alone if "computer generated" in completion.lower(): return 1 if "human written" in completion.lower(): return 0 return -1 # could not parse # --------------------------------------------------------------------- # 4. 
from tqdm.auto import tqdm  # progress bar

y_true, y_pred, failures = [], [], 0

progress_bar = tqdm(
    sample_df.itertuples(index=False),
    total=len(sample_df),
    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
)

for row in progress_bar:
    true_lbl = row.true_label
    pred_lbl = predict_label(row.text_)

    if pred_lbl == -1:  # could not parse model output
        failures += 1
        progress_bar.set_postfix_str("⚠ parse-fail")
        continue

    y_true.append(true_lbl)
    y_pred.append(pred_lbl)

    # Live accuracy over the samples scored so far
    acc_so_far = (np.array(y_true) == np.array(y_pred)).mean()
    progress_bar.set_postfix(acc=acc_so_far, fails=failures)

# ----------------- Final metrics -----------------
y_true = np.array(y_true)
y_pred = np.array(y_pred)

accuracy = (y_true == y_pred).mean()
# Precision: of all reviews predicted as label 1, the fraction that are truly label 1
precision = (y_true[y_pred == 1] == 1).mean() if (y_pred == 1).any() else 0.0
# Recall: of all reviews truly labelled 1, the fraction predicted as label 1
recall = (y_pred[y_true == 1] == 1).sum() / max((y_true == 1).sum(), 1)
f1 = (2 * precision * recall) / max(precision + recall, 1e-8)

print("\n=== FINAL RESULTS ===")
print(f"Samples evaluated  : {len(y_true)} / {len(sample_df)}")
print(f"Parse failures     : {failures}")
print(f"Accuracy           : {accuracy:6.3%}")
print(f"Precision (label 1): {precision:6.3%}")
print(f"Recall    (label 1): {recall:6.3%}")
print(f"F1        (label 1): {f1:6.3%}")
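
# ----------------- Optional sanity check (sketch) -----------------
# A minimal cross-check of the hand-rolled metrics above against
# scikit-learn, assuming scikit-learn is available (it is not listed in
# the pip line at the top of this file). Purely illustrative; safe to delete.
try:
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    sk_acc = accuracy_score(y_true, y_pred)
    sk_prec, sk_rec, sk_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", pos_label=1, zero_division=0
    )
    print("\n=== sklearn cross-check ===")
    print(f"Accuracy : {sk_acc:6.3%}")
    print(f"Precision: {sk_prec:6.3%}")
    print(f"Recall   : {sk_rec:6.3%}")
    print(f"F1       : {sk_f1:6.3%}")
except ImportError:
    pass  # scikit-learn not installed; skip the cross-check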