<!-- index.html — updated by Javedalam (commit e74a613, verified) -->
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>WebGPU · Transformers.js · Image Captioning</title>
<style>
body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
.card { border:1px solid #4443; border-radius:12px; padding:16px; }
.log { white-space:pre-wrap; background:#111; color:#0f0; padding:12px; border-radius:8px; min-height:80px; }
img { max-width:100%; border-radius:8px; margin-top:10px; }
.muted { opacity:.75; font-size:14px; }
button,input { font:inherit; }
</style>
</head>
<body>
<h2>Image → Text in your browser (Transformers.js + WebGPU)</h2>
<p id="env">Probing environment…</p>
<div class="card">
<h3>Caption an image (file upload)</h3>
<input id="file" type="file" accept="image/*" />
<button id="run" disabled>Caption</button>
<div><img id="preview" alt="preview will appear here" /></div>
<h4>Output</h4>
<div id="log" class="log">Loading model…</div>
<p class="muted">
Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
Backend: <span id="backend"></span>
</p>
</div>
<script type="module">
// Cache every DOM node the rest of the script reads or writes.
const byId = (id) => document.getElementById(id);
const envEl = byId('env');
const fileEl = byId('file');
const runBtn = byId('run');
const logEl = byId('log');
const imgEl = byId('preview');
const backendEl = byId('backend');
// Prefer WebGPU; fall back to WASM if unavailable/slow
const hasWebGPU = 'gpu' in navigator;
let device = hasWebGPU ? 'webgpu' : 'wasm';
backendEl.textContent = device.toUpperCase();
envEl.textContent = hasWebGPU
  ? '✅ WebGPU detected (will fallback if slow)…'
  : '⚠️ Using WASM (CPU).';
// Load Transformers.js v3
const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
// Watchdog: if WebGPU load takes too long, retry on WASM
const LOAD_TIMEOUT_MS = 25000;
let captioner;
/**
 * Create an image-to-text pipeline on the requested backend,
 * reflecting the chosen backend in the UI while it loads.
 * @param {string} targetDevice - 'webgpu' or 'wasm'
 * @returns {Promise<Function>} the ready captioning pipeline
 */
async function buildPipeline(targetDevice) {
  logEl.textContent = `Loading model… device=${targetDevice}`;
  backendEl.textContent = targetDevice.toUpperCase();
  const task = 'image-to-text';
  const model = 'Xenova/vit-gpt2-image-captioning';
  return await pipeline(task, model, { device: targetDevice });
}
// Build the captioner, preferring WebGPU with a timed fallback to WASM.
try {
  if (device === 'webgpu') {
    // Race the WebGPU load against a watchdog. Keep the timer id so it
    // can be cleared once the race settles — otherwise it fires (and
    // allocates a rejection) 25 s after a perfectly good load.
    let watchdogId;
    const timeout = new Promise((_, rej) => {
      watchdogId = setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS);
    });
    try {
      captioner = await Promise.race([buildPipeline('webgpu'), timeout]);
    } finally {
      clearTimeout(watchdogId);
    }
  } else {
    captioner = await buildPipeline('wasm');
  }
} catch (e) {
  if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).toLowerCase().includes('webgpu'))) {
    envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
    device = 'wasm';
    try {
      captioner = await buildPipeline('wasm');
    } catch (e2) {
      // Surface a fallback failure too — otherwise the page is stuck
      // on "Loading model…" with no explanation.
      logEl.textContent = 'Error loading model: ' + e2;
      throw e2;
    }
  } else {
    logEl.textContent = 'Error loading model: ' + e;
    throw e;
  }
}
logEl.textContent = `Model ready · device=${device}`;
runBtn.disabled = false;
// ---------- Robust file load (FileReader → data URL, with checks) ----------
// Holds the data URL of the *last successfully decoded* image, or null.
let imgDataURL = null;
fileEl.addEventListener('change', () => {
  logEl.textContent = 'Image selected. Preparing preview…';
  // Invalidate any previous image up front so a failed read/decode can
  // never silently caption a stale picture.
  imgDataURL = null;
  const f = fileEl.files?.[0];
  if (!f) { logEl.textContent = 'No file chosen.'; return; }
  // Some Android cameras save HEIC/HEIF which many browsers can’t decode.
  if (!f.type.startsWith('image/')) {
    logEl.textContent = `Unsupported file type: ${f.type || 'unknown'}. Use JPG/PNG.`;
    return;
  }
  const reader = new FileReader();
  reader.onerror = () => {
    logEl.textContent = 'Failed to read file. Try another image.';
  };
  reader.onload = async () => {
    const dataURL = reader.result; // base64 data URL
    imgEl.src = dataURL;
    try {
      // Ensure the browser can decode it before we allow "Caption".
      if (imgEl.decode) await imgEl.decode();
      // Commit only after a successful decode (bug fix: the URL used to
      // be stored unconditionally, letting undecodable images through).
      imgDataURL = dataURL;
      logEl.textContent = 'Preview ready. Click “Caption”.';
    } catch {
      logEl.textContent = 'Could not decode image. Try a JPG/PNG under ~5 MB.';
    }
  };
  reader.readAsDataURL(f);
});
// --------------------------------------------------------------------------
// Run captioning (beam search for better captions)
runBtn.addEventListener('click', async () => {
  if (!captioner) return;
  if (!imgDataURL) { logEl.textContent = 'Pick an image first.'; return; }
  // Disable while inference is in flight so a double-click cannot start
  // two overlapping runs on the same pipeline.
  runBtn.disabled = true;
  logEl.textContent = 'Running…';
  try {
    const out = await captioner(imgDataURL, {
      max_new_tokens: 48,
      num_beams: 5,            // beam search → better captions than greedy
      do_sample: false,        // deterministic output
      no_repeat_ngram_size: 3  // suppress repeated trigrams
    });
    // Guard against an unexpectedly empty/odd result shape.
    logEl.textContent = out?.[0]?.generated_text ?? 'No caption produced.';
  } catch (e) {
    logEl.textContent = 'Inference error: ' + e;
    console.error(e);
  } finally {
    runBtn.disabled = false;
  }
});
</script>
</body>
</html>