import os
import base64
import requests
from io import BytesIO
from typing import List, Union

from PIL import Image
import pypdfium2 as pdfium
import gradio as gr

# Read the OCR API token from the environment (variable name: ocr_model).
HF_API_TOKEN = os.environ.get("ocr_model")
if HF_API_TOKEN is None:
    raise RuntimeError(
        "环境变量 ocr_model 未设置,请在 Space 的 Settings -> Variables 中添加一个名为 ocr_model 的 Secret。"
    )

# OCR model served through the Hugging Face Inference API (model name kept as-is).
MODEL_ID = "tencent/HunyuanOCR"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}

# Keys that commonly carry the recognized text in Inference API JSON responses.
_TEXT_KEYS = ("generated_text", "text", "output", "label")


def image_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as a base64 string of its PNG bytes."""
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def _extract_text(data) -> str:
    """Best-effort extraction of OCR text from a parsed JSON response.

    The Inference API returns different shapes depending on model/task, so
    probe the common list-of-dicts and plain-dict layouts before falling
    back to ``str()`` of the raw payload.
    """
    if isinstance(data, list) and len(data) > 0:
        first = data[0]
        if isinstance(first, dict):
            for key in _TEXT_KEYS:
                value = first.get(key)
                if isinstance(value, str):
                    return value.strip()
            return str(first)
    if isinstance(data, dict):
        for key in _TEXT_KEYS:
            value = data.get(key)
            if isinstance(value, str):
                return value.strip()
        return str(data)
    return str(data)


def call_ocr_model(image: Image.Image) -> str:
    """Run HunyuanOCR on a single image and return the recognized text.

    Network/HTTP and JSON-decoding failures are reported as human-readable
    error strings instead of raising, so the Gradio UI always receives text.
    """
    payload = {
        # Most image-text-to-text models accept this payload structure.
        "inputs": {
            "image": image_to_base64(image)
        }
    }
    try:
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=120)
        response.raise_for_status()
    except Exception as e:
        return f"[调用模型出错] {type(e).__name__}: {e}"
    try:
        data = response.json()
    except Exception as e:
        return f"[解析返回结果出错] {type(e).__name__}: {e}\n原始返回:{response.text[:1000]}"
    return _extract_text(data)


def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> List[Image.Image]:
    """Render every page of a PDF into a list of PIL images.

    PDF's native resolution is 72 dpi; the scale factor upsamples each
    page to the requested *dpi*.
    """
    pdf = pdfium.PdfDocument(pdf_bytes)
    try:
        scale = dpi / 72
        images: List[Image.Image] = []
        for i in range(len(pdf)):
            images.append(pdf[i].render(scale=scale).to_pil())
        return images
    finally:
        # Release the underlying pdfium handle (previously leaked).
        pdf.close()


def run_ocr(file: Union[bytes, None], image: Union[Image.Image, None]) -> str:
    """Entry point for the UI: OCR an uploaded PDF and/or a single image.

    - If a PDF is uploaded, every page is OCR'd and results are separated
      per page.
    - If only an image is uploaded, just that image is OCR'd.
    - If neither is provided, a usage hint is returned.
    """
    if file is None and image is None:
        return "请上传 PDF 文件或图片。"

    results = []

    # 1. PDF upload: render pages, then OCR each one.
    if file is not None:
        try:
            pages = pdf_to_images(file)
        except Exception as e:
            return f"[解析 PDF 出错] {type(e).__name__}: {e}"
        if not pages:
            return "PDF 中未检测到页面。"
        for idx, page_img in enumerate(pages, start=1):
            text = call_ocr_model(page_img)
            results.append(f"===== 第 {idx} 页 =====\n{text}\n")

    # 2. Image upload: OCR the single image, labeled only when mixed with PDF output.
    if image is not None:
        text = call_ocr_model(image)
        if results:
            results.append("===== 图片识别结果 =====\n" + text)
        else:
            results.append(text)

    return "\n".join(results)


with gr.Blocks() as demo:
    gr.Markdown(
        f"""# 文档 OCR Demo(HunyuanOCR)

使用模型:`{MODEL_ID}`

你可以:

- 上传 **PDF 文件**(多页会逐页识别,并按页分隔)
- 或上传 **单张图片**(截图、拍照等)
"""
    )

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                label="上传 PDF 文件(可选)",
                file_types=[".pdf"],
                type="binary",
            )
            image_input = gr.Image(
                type="pil",
                label="上传图片(可选)",
            )
            run_button = gr.Button("开始识别")
        with gr.Column():
            output_text = gr.Textbox(label="识别结果", lines=25)

    run_button.click(
        fn=run_ocr,
        inputs=[pdf_input, image_input],
        outputs=output_text,
    )


if __name__ == "__main__":
    demo.launch()