|
|
<!doctype html> |
|
|
<html> |
|
|
<head> |
|
|
<meta charset="utf-8" /> |
|
|
<meta name="viewport" content="width=device-width,initial-scale=1" /> |
|
|
<title>WebGPU · Transformers.js · Image Captioning</title> |
|
|
<style> |
|
|
body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; } |
|
|
.card { border:1px solid #4443; border-radius:12px; padding:16px; } |
|
|
.log { white-space:pre-wrap; background:#111; color:#0f0; padding:12px; border-radius:8px; min-height:80px; } |
|
|
img { max-width:100%; border-radius:8px; margin-top:10px; } |
|
|
.muted { opacity:.75; font-size:14px; } |
|
|
button,input { font:inherit; } |
|
|
</style> |
|
|
</head> |
|
|
<body> |
|
|
<h2>Image → Text in your browser (Transformers.js + WebGPU)</h2> |
|
|
<p id="env">Probing environment…</p> |
|
|
|
|
|
<div class="card"> |
|
|
<h3>Caption an image (file upload)</h3> |
|
|
<input id="file" type="file" accept="image/*" /> |
|
|
<button id="run" disabled>Caption</button> |
|
|
<div><img id="preview" alt="preview will appear here" /></div> |
|
|
<h4>Output</h4> |
|
|
<div id="log" class="log">Loading model…</div> |
|
|
<p class="muted"> |
|
|
Model: <code>Xenova/vit-gpt2-image-captioning</code><br /> |
|
|
Backend: <span id="backend">…</span> |
|
|
</p> |
|
|
</div> |
|
|
|
|
|
<script type="module"> |
|
|
// Shorthand lookup for the handful of elements this page drives.
const $ = (id) => document.getElementById(id);

const envEl = $('env');          // environment/status line under the heading
const fileEl = $('file');        // <input type="file"> picker
const runBtn = $('run');         // "Caption" button (enabled once model loads)
const logEl = $('log');          // green-on-black output panel
const imgEl = $('preview');      // preview <img>
const backendEl = $('backend');  // backend label in the footer

// Prefer the WebGPU backend when the browser exposes navigator.gpu;
// otherwise run on the WASM (CPU) backend.
const hasWebGPU = 'gpu' in navigator;
let device = hasWebGPU ? 'webgpu' : 'wasm';

backendEl.textContent = device.toUpperCase();
envEl.textContent = hasWebGPU
  ? '✅ WebGPU detected (will fallback if slow)…'
  : '⚠️ Using WASM (CPU).';
|
|
|
|
|
|
|
|
// Load Transformers.js lazily from the CDN (ES-module dynamic import).
const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');

// Give the WebGPU model load this long before falling back to WASM.
const LOAD_TIMEOUT_MS = 25000;
let captioner;

/**
 * Build an image-to-text pipeline on the requested backend and reflect
 * the choice in the UI while it loads.
 * @param {'webgpu'|'wasm'} targetDevice - Transformers.js execution backend.
 * @returns {Promise<Function>} the ready captioning pipeline.
 */
async function buildPipeline(targetDevice) {
  logEl.textContent = `Loading model… device=${targetDevice}`;
  backendEl.textContent = targetDevice.toUpperCase();
  return await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', { device: targetDevice });
}

try {
  if (device === 'webgpu') {
    const webgpuPromise = buildPipeline('webgpu');
    // BUGFIX: if the timeout wins the race below, webgpuPromise keeps running
    // and may reject later with no listener attached → unhandled promise
    // rejection. Swallow its eventual rejection explicitly; Promise.race
    // still surfaces an early rejection through the catch below.
    webgpuPromise.catch(() => {});
    let timerId;
    const timeout = new Promise((_, rej) => {
      timerId = setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS);
    });
    try {
      captioner = await Promise.race([webgpuPromise, timeout]);
    } finally {
      // BUGFIX: clear the timer so a successful WebGPU load doesn't leave a
      // 25 s timeout pending (and its rejection firing into the void).
      clearTimeout(timerId);
    }
  } else {
    captioner = await buildPipeline('wasm');
  }
} catch (e) {
  // Only fall back when the failure is plausibly WebGPU-specific; anything
  // else (network error, bad model id) is surfaced and rethrown.
  if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).toLowerCase().includes('webgpu'))) {
    envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
    device = 'wasm';
    captioner = await buildPipeline('wasm');
  } else {
    logEl.textContent = 'Error loading model: ' + e;
    throw e;
  }
}

logEl.textContent = `Model ready · device=${device}`;
runBtn.disabled = false;
|
|
|
|
|
|
|
// Data URL of the currently selected image; stays null until a file is read.
let imgDataURL = null;

fileEl.addEventListener('change', () => {
  logEl.textContent = 'Image selected. Preparing preview…';

  const chosen = fileEl.files?.[0];
  if (!chosen) {
    logEl.textContent = 'No file chosen.';
    return;
  }

  // Reject non-image MIME types before touching FileReader.
  if (!chosen.type.startsWith('image/')) {
    logEl.textContent = `Unsupported file type: ${chosen.type || 'unknown'}. Use JPG/PNG.`;
    return;
  }

  const reader = new FileReader();

  reader.onerror = () => {
    logEl.textContent = 'Failed to read file. Try another image.';
  };

  reader.onload = async () => {
    // Show the preview and remember the data URL for inference later.
    imgDataURL = reader.result;
    imgEl.src = imgDataURL;
    try {
      // Where supported, wait for the image to fully decode before
      // declaring the preview ready.
      if (imgEl.decode) await imgEl.decode();
      logEl.textContent = 'Preview ready. Click “Caption”.';
    } catch {
      logEl.textContent = 'Could not decode image. Try a JPG/PNG under ~5 MB.';
    }
  };

  reader.readAsDataURL(chosen);
});
|
|
|
|
|
|
|
|
|
|
|
runBtn.addEventListener('click', async () => {
  // Guard clauses: model must be loaded and an image selected.
  if (!captioner) return;
  if (!imgDataURL) { logEl.textContent = 'Pick an image first.'; return; }

  // BUGFIX: disable the button for the duration of inference so a
  // double-click cannot launch overlapping captioner runs that race on logEl.
  runBtn.disabled = true;
  logEl.textContent = 'Running…';
  try {
    // Deterministic beam search (no sampling) with mild repetition control.
    const out = await captioner(imgDataURL, {
      max_new_tokens: 48,
      num_beams: 5,
      do_sample: false,
      no_repeat_ngram_size: 3
    });
    // BUGFIX: guard against an empty/unexpected result shape instead of
    // throwing a TypeError on out[0].generated_text.
    logEl.textContent = out?.[0]?.generated_text ?? 'No caption produced.';
  } catch (e) {
    logEl.textContent = 'Inference error: ' + e;
    console.error(e);
  } finally {
    runBtn.disabled = false;
  }
});
|
|
</script> |
|
|
</body> |
|
|
</html> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|