import csv
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
import requests
from datasets import load_dataset
from tqdm import tqdm

# deepeval classes: LLMTestCase is used when building cases below; the metric
# classes are imported for the scoring step that follows generation.
from deepeval.metrics import GEval, FaithfulnessMetric
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from chat_engine import LLMChatEngine
|
MODEL_NAME = "qwen3-14b"  # short tag, used in the output filename
sqlite_path = "/content/drive/MyDrive/LegalRag/va_code.db"

# Chat engine wraps Qwen3-14B (thinking mode enabled) plus a SQLite-backed
# retriever over the Virginia Code database.
model_chat = LLMChatEngine(sqlite_path, "Qwen/Qwen3-14B", enable_thinking=True)
retriever = model_chat.retriever

# QA pairs over Virginia statutes; each row carries a question, a gold answer,
# and the ids of the statute sections that support it.
ds = load_dataset("dcrodriguez/Virginia-Statute-QA")
train_ds = ds["train"]

N = 300  # number of rows to export
OUTPUT_PATH = Path(
    f"/content/drive/MyDrive/LegalRag/llm_test_cases_{MODEL_NAME}-thinking.jsonl"
)
|
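# Quick, illustrative sanity check of the dataset schema assumed throughout:
# each row should expose "question", "answer", and "section_ids".
_row = train_ds[0]
print(_row["question"])
print(_row["answer"][:80])
print(_row["section_ids"])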
|
def build_context_from_section_ids(section_ids: List[int]) -> List[str]:
    """
    Given a list of section_ids, fetch the corresponding statute text through
    the engine's retriever and return one formatted context string per section.
    """
    # _fetch_sections is the retriever's internal lookup; it returns a dict
    # keyed by section id.
    sections: Dict[int, Dict[str, Any]] = retriever._fetch_sections(section_ids)

    contexts: List[str] = []
    for sec in sections.values():
        ctx = (
            f"Virginia Code § {sec['doc_id'].replace('VA:', '')} \n"
            f"Title: {sec['title']} \n"
            f"Chapter: {sec['chapter']} \n"
            f"Section: {sec['section']} \n\n"
            f"Text:\n{sec['text']}"
            "\n\n\n----------------------\n\n\n"
        )
        contexts.append(ctx)

    return contexts
|
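# Illustrative check of the context builder, using section ids taken straight
# from the dataset (nothing is invented here); prints the first 200 characters
# of the first context block.
_demo_contexts = build_context_from_section_ids(train_ds[0]["section_ids"])
print(f"{len(_demo_contexts)} context block(s); first block begins:")
print(_demo_contexts[0][:200])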
|
def build_test_cases_for_first_n(n: int = 10, start: int = 0):
    """Build paired RAG / no-RAG deepeval test cases for rows [start, start + n)."""
    test_cases_rag = []
    test_cases_no_rag = []

    for i in tqdm(range(start, start + n), desc="Building test cases"):
        row = train_ds[i]
        section_ids = row["section_ids"]
        question = row["question"]
        gold_answer = row["answer"]

        retrieval_context = build_context_from_section_ids(section_ids)

        # Answer the same question with and without retrieval, clearing chat
        # history between calls so the two runs stay independent.
        answer_rag = model_chat.chat(question, rag=True)
        model_chat.clear_history()
        answer_no_rag = model_chat.chat(question, rag=False)
        model_chat.clear_history()

        tc_rag = LLMTestCase(
            input=question,
            actual_output=answer_rag,
            retrieval_context=retrieval_context,
            expected_output=gold_answer,
            additional_metadata={"section_ids": section_ids, "mode": "rag"},
        )
        test_cases_rag.append(tc_rag)

        # The no-RAG case keeps the same retrieval_context so faithfulness can
        # still be scored against the gold statutes.
        tc_no_rag = LLMTestCase(
            input=question,
            actual_output=answer_no_rag,
            retrieval_context=retrieval_context,
            expected_output=gold_answer,
            additional_metadata={"section_ids": section_ids, "mode": "no_rag"},
        )
        test_cases_no_rag.append(tc_no_rag)

    return test_cases_rag, test_cases_no_rag
|
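# Example invocation, left commented out so the export loop below stays the
# only place model calls happen:
# rag_cases, no_rag_cases = build_test_cases_for_first_n(n=10, start=0)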
|
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

with OUTPUT_PATH.open("w", encoding="utf-8") as f_out:
    for i in tqdm(range(min(N, len(train_ds))), desc="Generating test cases"):
        row = train_ds[i]
        section_ids = row["section_ids"]
        question = row["question"]
        gold_answer = row["answer"]

        retrieval_context = build_context_from_section_ids(section_ids)

        # Same paired RAG / no-RAG generation as in the helper above.
        answer_rag = model_chat.chat(question, rag=True)
        model_chat.clear_history()
        answer_no_rag = model_chat.chat(question, rag=False)
        model_chat.clear_history()

        record = {
            "idx": i,
            "section_ids": section_ids,
            "question": question,
            "gold_answer": gold_answer,
            "retrieval_context": retrieval_context,
            "answer_rag": answer_rag,
            "answer_no_rag": answer_no_rag,
        }

        # One JSON object per line, flushed and fsynced so progress survives
        # a Colab disconnect.
        json.dump(record, f_out, ensure_ascii=False)
        f_out.write("\n")
        f_out.flush()
        os.fsync(f_out.fileno())

print(f"Saved test cases to: {OUTPUT_PATH.resolve()}")
|
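# Sketch of the scoring step the deepeval imports above point to: reload the
# saved JSONL and score each RAG answer with FaithfulnessMetric plus a GEval
# correctness check. Assumes a judge model is configured for deepeval (e.g.
# via an OpenAI key); the helper name is ours, not part of any library.
def evaluate_saved_cases(path: Path) -> None:
    correctness = GEval(
        name="Correctness",
        criteria="Does the actual output agree with the expected output?",
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
            LLMTestCaseParams.EXPECTED_OUTPUT,
        ],
    )
    faithfulness = FaithfulnessMetric(threshold=0.5)

    with path.open(encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            tc = LLMTestCase(
                input=rec["question"],
                actual_output=rec["answer_rag"],
                expected_output=rec["gold_answer"],
                retrieval_context=rec["retrieval_context"],
            )
            correctness.measure(tc)
            faithfulness.measure(tc)
            print(rec["idx"], correctness.score, faithfulness.score)

# evaluate_saved_cases(OUTPUT_PATH)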
|
|