<!-- index.html — updated by Javedalam (commit e74a613, verified) -->
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>WebGPU · Transformers.js · Image Captioning</title>
<style>
body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
.card { border:1px solid #4443; border-radius:12px; padding:16px; }
.log { white-space:pre-wrap; background:#111; color:#0f0; padding:12px; border-radius:8px; min-height:80px; }
img { max-width:100%; border-radius:8px; margin-top:10px; }
.muted { opacity:.75; font-size:14px; }
button,input { font:inherit; }
</style>
</head>
<body>
<h2>Image → Text in your browser (Transformers.js + WebGPU)</h2>
<p id="env">Probing environment…</p>
<div class="card">
<h3>Caption an image (file upload)</h3>
<input id="file" type="file" accept="image/*" />
<button id="run" disabled>Caption</button>
<div><img id="preview" alt="preview will appear here" /></div>
<h4>Output</h4>
<div id="log" class="log">Loading model…</div>
<p class="muted">
Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
Backend: <span id="backend"></span>
</p>
</div>
<script type="module">
// Cache every DOM node the rest of the script reads or writes.
const byId = (id) => document.getElementById(id);
const envEl = byId('env');
const fileEl = byId('file');
const runBtn = byId('run');
const logEl = byId('log');
const imgEl = byId('preview');
const backendEl = byId('backend');
// Prefer WebGPU; fall back to WASM if unavailable/slow
const hasWebGPU = 'gpu' in navigator;
let device = hasWebGPU ? 'webgpu' : 'wasm';
backendEl.textContent = device.toUpperCase();
envEl.textContent = hasWebGPU
  ? '✅ WebGPU detected (will fallback if slow)…'
  : '⚠️ Using WASM (CPU).';
// Load Transformers.js v3
const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
// Watchdog: if WebGPU load takes too long, retry on WASM
const LOAD_TIMEOUT_MS = 25000;
let captioner;
/**
 * Create an image-to-text pipeline on the requested backend,
 * reflecting the chosen backend in the UI while it loads.
 * @param {string} targetDevice - 'webgpu' or 'wasm'
 * @returns {Promise<Function>} the ready captioning pipeline
 */
async function buildPipeline(targetDevice) {
  logEl.textContent = `Loading model… device=${targetDevice}`;
  backendEl.textContent = targetDevice.toUpperCase();
  const task = 'image-to-text';
  const model = 'Xenova/vit-gpt2-image-captioning';
  return await pipeline(task, model, { device: targetDevice });
}
// Build the captioner, preferring WebGPU with a timed fallback to WASM.
try {
  if (device === 'webgpu') {
    // Race the WebGPU load against a watchdog. Keep the timer id so it
    // can be cleared once the race settles — otherwise it fires (and
    // allocates a rejection) 25 s after a perfectly good load.
    let watchdogId;
    const timeout = new Promise((_, rej) => {
      watchdogId = setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS);
    });
    try {
      captioner = await Promise.race([buildPipeline('webgpu'), timeout]);
    } finally {
      clearTimeout(watchdogId);
    }
  } else {
    captioner = await buildPipeline('wasm');
  }
} catch (e) {
  if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).toLowerCase().includes('webgpu'))) {
    envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
    device = 'wasm';
    try {
      captioner = await buildPipeline('wasm');
    } catch (e2) {
      // Surface a fallback failure too — otherwise the page is stuck
      // on "Loading model…" with no explanation.
      logEl.textContent = 'Error loading model: ' + e2;
      throw e2;
    }
  } else {
    logEl.textContent = 'Error loading model: ' + e;
    throw e;
  }
}
logEl.textContent = `Model ready · device=${device}`;
runBtn.disabled = false;
// ---------- Robust file load (FileReader → data URL, with checks) ----------
// Holds the data URL of the *last successfully decoded* image, or null.
let imgDataURL = null;
fileEl.addEventListener('change', () => {
  logEl.textContent = 'Image selected. Preparing preview…';
  // Invalidate any previous image up front so a failed read/decode can
  // never silently caption a stale picture.
  imgDataURL = null;
  const f = fileEl.files?.[0];
  if (!f) { logEl.textContent = 'No file chosen.'; return; }
  // Some Android cameras save HEIC/HEIF which many browsers can’t decode.
  if (!f.type.startsWith('image/')) {
    logEl.textContent = `Unsupported file type: ${f.type || 'unknown'}. Use JPG/PNG.`;
    return;
  }
  const reader = new FileReader();
  reader.onerror = () => {
    logEl.textContent = 'Failed to read file. Try another image.';
  };
  reader.onload = async () => {
    const dataURL = reader.result; // base64 data URL
    imgEl.src = dataURL;
    try {
      // Ensure the browser can decode it before we allow "Caption".
      if (imgEl.decode) await imgEl.decode();
      // Commit only after a successful decode (bug fix: the URL used to
      // be stored unconditionally, letting undecodable images through).
      imgDataURL = dataURL;
      logEl.textContent = 'Preview ready. Click “Caption”.';
    } catch {
      logEl.textContent = 'Could not decode image. Try a JPG/PNG under ~5 MB.';
    }
  };
  reader.readAsDataURL(f);
});
// --------------------------------------------------------------------------
// Run captioning (beam search for better captions)
runBtn.addEventListener('click', async () => {
  if (!captioner) return;
  if (!imgDataURL) { logEl.textContent = 'Pick an image first.'; return; }
  // Disable while inference is in flight so a double-click cannot start
  // two overlapping runs on the same pipeline.
  runBtn.disabled = true;
  logEl.textContent = 'Running…';
  try {
    const out = await captioner(imgDataURL, {
      max_new_tokens: 48,
      num_beams: 5,            // beam search → better captions than greedy
      do_sample: false,        // deterministic output
      no_repeat_ngram_size: 3  // suppress repeated trigrams
    });
    // Guard against an unexpectedly empty/odd result shape.
    logEl.textContent = out?.[0]?.generated_text ?? 'No caption produced.';
  } catch (e) {
    logEl.textContent = 'Inference error: ' + e;
    console.error(e);
  } finally {
    runBtn.disabled = false;
  }
});
</script>
</body>
</html>