"""virginia-legal-rag-lexva / generate_test_cases.py

Generate paired answers (with and without RAG) for the Virginia Statute QA
dataset and save them as JSONL records for later DeepEval evaluation.
"""
import json
import os
from pathlib import Path
from typing import Any, Dict, List

from tqdm import tqdm

from datasets import load_dataset
from deepeval.metrics import GEval, FaithfulnessMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from chat_engine import LLMChatEngine
MODEL_NAME = "Qwen/Qwen3-14B"
SQLITE_PATH = "/content/drive/MyDrive/LegalRag/va_code.db"

# Chat engine wraps the LLM plus the statute retriever backed by the SQLite DB.
model_chat = LLMChatEngine(SQLITE_PATH, MODEL_NAME, enable_thinking=True)
retriever = model_chat.retriever

# Virginia Statute QA: question/answer pairs annotated with gold section_ids.
ds = load_dataset("dcrodriguez/Virginia-Statute-QA")
train_ds = ds["train"]
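# Each row is expected to carry at least these fields (illustrative values;
# the authoritative schema is the dcrodriguez/Virginia-Statute-QA dataset card):
#   {"question": "...", "answer": "...", "section_ids": [12345]}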
# -------- CONFIG --------
N = 300  # how many items from the dataset to use
OUTPUT_PATH = Path("/content/drive/MyDrive/LegalRag/llm_test_cases_qwen3-14b-thinking.jsonl")
def build_context_from_section_ids(section_ids: List[int]) -> List[str]:
    """
    Given a list of section_ids (ints), use the retriever to fetch the
    corresponding statute text and return a list of context strings.
    """
    # retriever._fetch_sections returns a dict keyed by doc_id
    sections: Dict[int, Dict[str, Any]] = retriever._fetch_sections(section_ids)
    contexts: List[str] = []
    for sec in sections.values():
        # Enrich the context with title/chapter/section names so the judge
        # sees where each statute excerpt comes from.
        ctx = (
            f"Virginia Code § {sec['doc_id'].replace('VA:', '')} \n"
            f"Title: {sec['title']} \n"
            f"Chapter: {sec['chapter']} \n"
            f"Section: {sec['section']} \n\n"
            f"Text:\n{sec['text']}"
            "\n\n\n----------------------\n\n\n"
        )
        contexts.append(ctx)
    return contexts
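# Quick sanity check (hypothetical section id; any id present in va_code.db works):
#   contexts = build_context_from_section_ids([12345])
#   print(contexts[0][:300])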
def build_test_cases_for_first_n(n: int = 10, start: int = 0):
    """Build in-memory DeepEval test cases for dataset rows [start, start + n)."""
    test_cases_rag = []
    test_cases_no_rag = []
    for i in tqdm(range(start, start + n), desc="Building test cases"):
        row = train_ds[i]
        section_ids = row["section_ids"]  # e.g. [12345]
        question = row["question"]
        gold_answer = row["answer"]

        # Context for the judge (gold statute text from section_ids)
        retrieval_context = build_context_from_section_ids(section_ids)

        # Generate answers with and without RAG, clearing chat history between runs
        answer_rag = model_chat.chat(question, rag=True)
        model_chat.clear_history()
        answer_no_rag = model_chat.chat(question, rag=False)
        model_chat.clear_history()

        # RAG test case
        tc_rag = LLMTestCase(
            input=question,
            actual_output=answer_rag,
            # What the judge reads as the "source of truth"
            retrieval_context=retrieval_context,
            # Gold answer, for answer-correctness style metrics
            expected_output=gold_answer,
            # LLMTestCase takes arbitrary extras via additional_metadata
            additional_metadata={"section_ids": section_ids, "mode": "rag"},
        )
        test_cases_rag.append(tc_rag)

        # No-RAG test case
        tc_no_rag = LLMTestCase(
            input=question,
            actual_output=answer_no_rag,
            retrieval_context=retrieval_context,
            expected_output=gold_answer,
            additional_metadata={"section_ids": section_ids, "mode": "no_rag"},
        )
        test_cases_no_rag.append(tc_no_rag)
    return test_cases_rag, test_cases_no_rag
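# Sketch: scoring the cases with the metrics imported above. Left commented
# out because it needs a configured judge model (e.g. OPENAI_API_KEY), and
# the criteria/threshold values here are illustrative, not from this repo.
#
# from deepeval import evaluate
#
# correctness = GEval(
#     name="Correctness",
#     criteria="Does the actual output agree with the expected output?",
#     evaluation_params=[
#         LLMTestCaseParams.ACTUAL_OUTPUT,
#         LLMTestCaseParams.EXPECTED_OUTPUT,
#     ],
# )
# faithfulness = FaithfulnessMetric(threshold=0.7)
#
# rag_cases, no_rag_cases = build_test_cases_for_first_n(n=10)
# evaluate(test_cases=rag_cases, metrics=[correctness, faithfulness])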
# Ensure output directory exists
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

with OUTPUT_PATH.open("w", encoding="utf-8") as f_out:
    for i in tqdm(range(min(N, len(train_ds))), desc="Generating test cases"):
        row = train_ds[i]
        section_ids = row["section_ids"]  # e.g. [12345]
        question = row["question"]
        gold_answer = row["answer"]

        retrieval_context = build_context_from_section_ids(section_ids)

        # Run model with and without RAG
        answer_rag = model_chat.chat(question, rag=True)
        model_chat.clear_history()
        answer_no_rag = model_chat.chat(question, rag=False)
        model_chat.clear_history()

        record = {
            "idx": i,                                # index in the dataset
            "section_ids": section_ids,              # list[int]
            "question": question,
            "gold_answer": gold_answer,
            "retrieval_context": retrieval_context,  # list[str]
            "answer_rag": answer_rag,
            "answer_no_rag": answer_no_rag,
        }
        json.dump(record, f_out, ensure_ascii=False)
        f_out.write("\n")

        # Flush and fsync after every record so a Colab disconnect
        # cannot lose already-generated answers.
        f_out.flush()
        os.fsync(f_out.fileno())
print(f"Saved test cases to: {OUTPUT_PATH.resolve()}")