# app.py — Hugging Face Space "lab" by yakki233 (commit f7bd5ba)
import os
import base64
import requests
from io import BytesIO
from typing import List, Union
from PIL import Image
import pypdfium2 as pdfium
import gradio as gr
# Read the OCR API token from the environment (Space secret named "ocr_model").
HF_API_TOKEN = os.environ.get("ocr_model")
if HF_API_TOKEN is None:
    # Fail fast at startup so the misconfiguration is visible in the Space logs.
    raise RuntimeError(
        "环境变量 ocr_model 未设置,请在 Space 的 Settings -> Variables 中添加一个名为 ocr_model 的 Secret。"
    )

# OCR model served through the Hugging Face Inference API (model ID kept as-is).
MODEL_ID = "tencent/HunyuanOCR"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}
def image_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as a base64 string of its PNG bytes."""
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
def call_ocr_model(image: Image.Image) -> str:
    """Run HunyuanOCR on a single image via the HF Inference API.

    Returns the recognized text on success, or a human-readable error
    string on failure (callers display the return value directly, so
    this function never raises).
    """
    # Most image-text-to-text endpoints accept this payload structure.
    payload = {"inputs": {"image": image_to_base64(image)}}
    try:
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=120)
        response.raise_for_status()
    # RequestException covers connection errors, timeouts, and the
    # HTTPError raised by raise_for_status().
    except requests.RequestException as e:
        return f"[调用模型出错] {type(e).__name__}: {e}"
    try:
        data = response.json()
    # requests raises a ValueError subclass (JSONDecodeError) for bad JSON.
    except ValueError as e:
        return f"[解析返回结果出错] {type(e).__name__}: {e}\n原始返回:{response.text[:1000]}"
    return _extract_ocr_text(data)


def _extract_ocr_text(data) -> str:
    """Pull the OCR text out of a decoded API response.

    The Inference API may return either a list of dicts or a single dict;
    try the common text-bearing keys and fall back to str() so the user
    always sees something.
    """
    candidate = data[0] if isinstance(data, list) and data else data
    if isinstance(candidate, dict):
        for key in ("generated_text", "text", "output", "label"):
            value = candidate.get(key)
            if isinstance(value, str):
                return value.strip()
    return str(candidate)
def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> List[Image.Image]:
    """Render every page of a PDF into a list of PIL images.

    Parameters
    ----------
    pdf_bytes:
        Raw PDF file content.
    dpi:
        Target render resolution. PDF's native resolution is 72 dpi,
        so pages are scaled by ``dpi / 72``.
    """
    pdf = pdfium.PdfDocument(pdf_bytes)
    try:
        return [pdf[i].render(scale=dpi / 72).to_pil() for i in range(len(pdf))]
    finally:
        # Release the underlying pdfium handle promptly instead of
        # relying on garbage-collection finalizers.
        pdf.close()
def run_ocr(file: Union[bytes, None], image: Union[Image.Image, None]) -> str:
    """
    Main entry point; accepts a PDF and/or a single image.
    - a PDF (``file``) is OCR'd page by page, results separated per page
    - an image is OCR'd directly
    - with neither input, a usage hint is returned
    """
    if file is None and image is None:
        return "请上传 PDF 文件或图片。"

    sections = []

    # PDF branch: render pages, then OCR each one.
    if file is not None:
        try:
            pages = pdf_to_images(file)
        except Exception as e:
            return f"[解析 PDF 出错] {type(e).__name__}: {e}"
        if not pages:
            return "PDF 中未检测到页面。"
        for page_no, page_image in enumerate(pages, start=1):
            sections.append(f"===== 第 {page_no} 页 =====\n{call_ocr_model(page_image)}\n")

    # Image branch: label the result only when it follows PDF output.
    if image is not None:
        recognized = call_ocr_model(image)
        if sections:
            sections.append("===== 图片识别结果 =====\n" + recognized)
        else:
            sections.append(recognized)

    return "\n".join(sections)
# Gradio UI: inputs (PDF file and/or image) on the left, OCR text on the right.
with gr.Blocks() as demo:
    gr.Markdown(
        f"""# 文档 OCR Demo(HunyuanOCR)
使用模型:`{MODEL_ID}`
你可以:
- 上传 **PDF 文件**(多页会逐页识别,并按页分隔)
- 或上传 **单张图片**(截图、拍照等)
"""
    )
    with gr.Row():
        with gr.Column():
            # type="binary" makes gr.File hand run_ocr the raw PDF bytes,
            # matching its `file: Union[bytes, None]` parameter.
            pdf_input = gr.File(
                label="上传 PDF 文件(可选)",
                file_types=[".pdf"],
                type="binary",
            )
            # type="pil" delivers a PIL.Image, as call_ocr_model expects.
            image_input = gr.Image(
                type="pil",
                label="上传图片(可选)",
            )
            run_button = gr.Button("开始识别")
        with gr.Column():
            output_text = gr.Textbox(label="识别结果", lines=25)
    run_button.click(
        fn=run_ocr,
        inputs=[pdf_input, image_input],
        outputs=output_text,
    )

if __name__ == "__main__":
    demo.launch()