"""virginia-legal-rag-lexva / generate_test_cases.py

Generate paired answers (with and without RAG) for the Virginia Statute QA
dataset and save them as JSONL records for later DeepEval evaluation.
"""
import json
import os
from pathlib import Path
from typing import Any, Dict, List

from tqdm import tqdm

from datasets import load_dataset
from deepeval.metrics import GEval, FaithfulnessMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from chat_engine import LLMChatEngine
MODEL_NAME = "Qwen/Qwen3-14B"
SQLITE_PATH = "/content/drive/MyDrive/LegalRag/va_code.db"

# Chat engine wraps the LLM plus the statute retriever backed by the SQLite DB.
model_chat = LLMChatEngine(SQLITE_PATH, MODEL_NAME, enable_thinking=True)
retriever = model_chat.retriever

# Virginia Statute QA: question/answer pairs annotated with gold section_ids.
ds = load_dataset("dcrodriguez/Virginia-Statute-QA")
train_ds = ds["train"]
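# Each row is expected to carry at least these fields (illustrative values;
# the authoritative schema is the dcrodriguez/Virginia-Statute-QA dataset card):
#   {"question": "...", "answer": "...", "section_ids": [12345]}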
# -------- CONFIG --------
N = 300  # how many items from the dataset to use
OUTPUT_PATH = Path("/content/drive/MyDrive/LegalRag/llm_test_cases_qwen3-14b-thinking.jsonl")
def build_context_from_section_ids(section_ids: List[int]) -> List[str]:
    """
    Given a list of section_ids (ints), use the retriever to fetch the
    corresponding statute text and return a list of context strings.
    """
    # retriever._fetch_sections returns a dict keyed by doc_id
    sections: Dict[int, Dict[str, Any]] = retriever._fetch_sections(section_ids)
    contexts: List[str] = []
    for sec in sections.values():
        # Enrich the context with title/chapter/section names so the judge
        # sees where each statute excerpt comes from.
        ctx = (
            f"Virginia Code § {sec['doc_id'].replace('VA:', '')} \n"
            f"Title: {sec['title']} \n"
            f"Chapter: {sec['chapter']} \n"
            f"Section: {sec['section']} \n\n"
            f"Text:\n{sec['text']}"
            "\n\n\n----------------------\n\n\n"
        )
        contexts.append(ctx)
    return contexts
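# Quick sanity check (hypothetical section id; any id present in va_code.db works):
#   contexts = build_context_from_section_ids([12345])
#   print(contexts[0][:300])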
def build_test_cases_for_first_n(n: int = 10, start: int = 0):
    """Build in-memory DeepEval test cases for dataset rows [start, start + n)."""
    test_cases_rag = []
    test_cases_no_rag = []
    for i in tqdm(range(start, start + n), desc="Building test cases"):
        row = train_ds[i]
        section_ids = row["section_ids"]  # e.g. [12345]
        question = row["question"]
        gold_answer = row["answer"]

        # Context for the judge (gold statute text from section_ids)
        retrieval_context = build_context_from_section_ids(section_ids)

        # Generate answers with and without RAG, clearing chat history between runs
        answer_rag = model_chat.chat(question, rag=True)
        model_chat.clear_history()
        answer_no_rag = model_chat.chat(question, rag=False)
        model_chat.clear_history()

        # RAG test case
        tc_rag = LLMTestCase(
            input=question,
            actual_output=answer_rag,
            # What the judge reads as the "source of truth"
            retrieval_context=retrieval_context,
            # Gold answer, for answer-correctness style metrics
            expected_output=gold_answer,
            # LLMTestCase takes arbitrary extras via additional_metadata
            additional_metadata={"section_ids": section_ids, "mode": "rag"},
        )
        test_cases_rag.append(tc_rag)

        # No-RAG test case
        tc_no_rag = LLMTestCase(
            input=question,
            actual_output=answer_no_rag,
            retrieval_context=retrieval_context,
            expected_output=gold_answer,
            additional_metadata={"section_ids": section_ids, "mode": "no_rag"},
        )
        test_cases_no_rag.append(tc_no_rag)
    return test_cases_rag, test_cases_no_rag
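# Sketch: scoring the cases with the metrics imported above. Left commented
# out because it needs a configured judge model (e.g. OPENAI_API_KEY), and
# the criteria/threshold values here are illustrative, not from this repo.
#
# from deepeval import evaluate
#
# correctness = GEval(
#     name="Correctness",
#     criteria="Does the actual output agree with the expected output?",
#     evaluation_params=[
#         LLMTestCaseParams.ACTUAL_OUTPUT,
#         LLMTestCaseParams.EXPECTED_OUTPUT,
#     ],
# )
# faithfulness = FaithfulnessMetric(threshold=0.7)
#
# rag_cases, no_rag_cases = build_test_cases_for_first_n(n=10)
# evaluate(test_cases=rag_cases, metrics=[correctness, faithfulness])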
# Ensure output directory exists
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

with OUTPUT_PATH.open("w", encoding="utf-8") as f_out:
    for i in tqdm(range(min(N, len(train_ds))), desc="Generating test cases"):
        row = train_ds[i]
        section_ids = row["section_ids"]  # e.g. [12345]
        question = row["question"]
        gold_answer = row["answer"]

        retrieval_context = build_context_from_section_ids(section_ids)

        # Run model with and without RAG
        answer_rag = model_chat.chat(question, rag=True)
        model_chat.clear_history()
        answer_no_rag = model_chat.chat(question, rag=False)
        model_chat.clear_history()

        record = {
            "idx": i,                                # index in the dataset
            "section_ids": section_ids,              # list[int]
            "question": question,
            "gold_answer": gold_answer,
            "retrieval_context": retrieval_context,  # list[str]
            "answer_rag": answer_rag,
            "answer_no_rag": answer_no_rag,
        }
        json.dump(record, f_out, ensure_ascii=False)
        f_out.write("\n")

        # Flush and fsync after every record so a Colab disconnect
        # cannot lose already-generated answers.
        f_out.flush()
        os.fsync(f_out.fileno())
print(f"Saved test cases to: {OUTPUT_PATH.resolve()}")