|
|
import json
import os

from transformers import AutoProcessor, ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding
|
|
class FlamingoProcessor(ProcessorMixin):
    """
    Custom processor that combines an image processor and a tokenizer.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None, **kwargs):
        """
        Main processing method that handles both text and images.

        Args:
            text: Text input(s) to tokenize
            images: Image input(s) to process
            **kwargs: Additional arguments passed to tokenizer/image_processor

        Returns:
            Dictionary with processed inputs
        """
        if text is None and images is None:
            raise ValueError("You need to specify either text or images")

        encoding = {}

        if text is not None:
            # Prepend the "<image>" placeholder to every text input so the model
            # knows where the visual features are injected.
            if isinstance(text, str):
                all_text = "<image> " + text
            elif isinstance(text[0], str):
                all_text = ["<image> " + _text for _text in text]
            else:
                all_text = ["<image> " + " ".join(_text) for _text in text]
            text_encoding = self.tokenizer(all_text, **kwargs)

            if "offset_mapping" in text_encoding:
                # Shift character offsets so they refer to the original text rather
                # than the text with the "<image> " prefix prepended above. When the
                # tokenizer returns tensors, only the first sequence is handled here.
                offset_mapping = text_encoding["offset_mapping"]
                if not isinstance(offset_mapping, list):
                    offset_mapping = offset_mapping[0].tolist()
                # End offset of the first token, i.e. the span consumed by the prefix.
                true_offset = offset_mapping[0][-1]
                new_offsets = []
                for start, end in offset_mapping:
                    if start == 0:
                        # Special tokens and the "<image>" prefix map to (0, 0).
                        new_offsets.append((0, 0))
                    else:
                        new_offsets.append((start - true_offset, end - true_offset))
                text_encoding["offset_mapping"] = new_offsets

            encoding.update(text_encoding)

        if images is not None:
            image_encoding = self.image_processor(images, **kwargs)

            for key, value in image_encoding.items():
                # Keep "pixel_values" under its standard name; prefix everything else
                # so image keys cannot clash with the text encoding keys.
                if key == "pixel_values":
                    encoding[key] = value
                else:
                    encoding[f"image_{key}"] = value

        return BatchEncoding(encoding)

    def batch_decode(self, *args, **kwargs):
        """
        Delegate batch decoding to the tokenizer.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Delegate decoding to the tokenizer.
        """
        return self.tokenizer.decode(*args, **kwargs)
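

# Minimal usage sketch: builds the processor from an AutoTokenizer and an
# AutoImageProcessor and runs a single text/image pair through __call__.
# The checkpoint names below are placeholders, not ones shipped with this repo;
# substitute whichever tokenizer / image processor the project actually uses.
if __name__ == "__main__":
    from PIL import Image
    from transformers import AutoImageProcessor, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

    processor = FlamingoProcessor(image_processor=image_processor, tokenizer=tokenizer)

    # Note: unless "<image>" is registered as a special token, the tokenizer
    # splits the placeholder into ordinary subword tokens.
    image = Image.new("RGB", (224, 224))
    inputs = processor(text="a photo of a cat", images=image, return_tensors="pt")
    print(sorted(inputs.keys()))  # e.g. ['attention_mask', 'input_ids', 'pixel_values']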