import fitz
import gradio as gr
import torch
import json
import re
import nltk
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Download all required NLTK resources
nltk.download(['punkt', 'punkt_tab'], quiet=True)

# Initialize model and tokenizer once
model_path = "model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


def extract_resume_text(pdf_path):
    """Extract text from a PDF, block by block."""
    full_text = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # get_text("blocks") returns tuples; the block text is at index 4
            blocks = page.get_text("blocks")
            text_blocks = [block[4] for block in blocks if block[4].strip()]
            full_text.extend(text_blocks)
    return "\n".join(full_text)


def clean_resume_content(text):
    """Clean and normalize resume text with regex."""
    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode()
    # Collapse blank lines and mark section headers
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(EDUCATION|EXPERIENCE|PROJECTS):', r'\nSECTION: \1\n',
                  text, flags=re.IGNORECASE)
    return text[:3000]  # Limit to first 3000 characters


def generate_structured_output(model_input):
    """Generate JSON text with beam-search decoding."""
    input_ids = tokenizer.encode(
        model_input,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(device)

    outputs = model.generate(
        input_ids,
        max_length=512,
        num_beams=4,
        early_stopping=True,
        temperature=0.7,
        no_repeat_ngram_size=2
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def repair_json(json_str):
    """Best-effort JSON repair using several regex heuristics."""
    # First strategy: isolate the outermost JSON object
    json_match = re.search(r'\{.*\}', json_str, re.DOTALL)
    if json_match:
        json_str = json_match.group(0)

    # Fix common formatting issues
    replacements = [
        (r'([{,]\s*)(\w+)\s*:', r'\1"\2":'),            # Quote bare keys (leave quoted keys alone)
        (r':\s*([^"\s{\[]+)(\s*[,}])', r': "\1"\2'),    # Quote bare scalar values
        (r"'", '"'),                                    # Replace single quotes
        (r'\bTrue\b', 'true'), (r'\bFalse\b', 'false')  # Fix Python-style booleans
    ]
    for pattern, replacement in replacements:
        json_str = re.sub(pattern, replacement, json_str)

    # Ensure unquoted array elements are quoted
    json_str = re.sub(r'(\[|,)\s*([^"\]]+?)\s*(?=,|\])', r'\1"\2"', json_str)
    return json_str


def parse_resume(pdf_file):
    # Initialize all variables up front so the error handler can inspect them
    raw_text = clean_text = raw_output = ""

    try:
        # 1. Extract and clean text
        #    gr.File(type="filepath") passes a plain path string, not a file object
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        raw_text = extract_resume_text(pdf_path)
        clean_text = clean_resume_content(raw_text)

        # 2. Create structured prompt
        prompt = (
            "Generate valid JSON resume with these EXACT fields:\n"
            "Name, Contact{Phone, Email, GitHub, LinkedIn}, "
            "Education[Institution, Degree, Duration, Grade], "
            f"Resume Content:\n{clean_text}"
        )

        # 3. Generate model output
        raw_output = generate_structured_output(prompt)

        # 4. Repair and parse JSON
        repaired_json = repair_json(raw_output)
        parsed_data = json.loads(repaired_json)

        return json.dumps(parsed_data, indent=2)

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        if raw_text:  # Report extraction status
            error_msg += f"\n\nPDF Extraction: Success ({len(raw_text)} chars)"
        else:
            error_msg += "\n\nPDF Extraction: Failed"
        if raw_output:  # Include the raw model output if available
            error_msg += f"\n\nModel Output:\n{raw_output[:500]}..."
        return error_msg


# Configure Gradio interface
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume PDF", type="filepath"),
    outputs=gr.Textbox(label="Structured Output"),
    title="Professional Resume Parser",
    description="Converts PDF resumes to structured JSON format with AI",
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch(server_port=7860, show_error=True)