Upload 14 files

Browse files

Files changed (14) hide show

added_tokens.json +209 -0
chat_template.json +3 -0
config.json +78 -0
configuration_maira2.py +32 -0
generation_config.json +7 -0
model.safetensors +3 -0
modeling_maira2.py +359 -0
preprocessor_config.json +31 -0
processing_maira2.py +729 -0
processor_config.json +14 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +1701 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "</box>": 32203,
+  "</obj>": 32001,
+  "<box>": 32202,
+  "<image>": 32204,
+  "<lat_image>": 32206,
+  "<obj>": 32000,
+  "<prev_im>": 32205,
+  "<x0>": 32002,
+  "<x10>": 32012,
+  "<x11>": 32013,
+  "<x12>": 32014,
+  "<x13>": 32015,
+  "<x14>": 32016,
+  "<x15>": 32017,
+  "<x16>": 32018,
+  "<x17>": 32019,
+  "<x18>": 32020,
+  "<x19>": 32021,
+  "<x1>": 32003,
+  "<x20>": 32022,
+  "<x21>": 32023,
+  "<x22>": 32024,
+  "<x23>": 32025,
+  "<x24>": 32026,
+  "<x25>": 32027,
+  "<x26>": 32028,
+  "<x27>": 32029,
+  "<x28>": 32030,
+  "<x29>": 32031,
+  "<x2>": 32004,
+  "<x30>": 32032,
+  "<x31>": 32033,
+  "<x32>": 32034,
+  "<x33>": 32035,
+  "<x34>": 32036,
+  "<x35>": 32037,
+  "<x36>": 32038,
+  "<x37>": 32039,
+  "<x38>": 32040,
+  "<x39>": 32041,
+  "<x3>": 32005,
+  "<x40>": 32042,
+  "<x41>": 32043,
+  "<x42>": 32044,
+  "<x43>": 32045,
+  "<x44>": 32046,
+  "<x45>": 32047,
+  "<x46>": 32048,
+  "<x47>": 32049,
+  "<x48>": 32050,
+  "<x49>": 32051,
+  "<x4>": 32006,
+  "<x50>": 32052,
+  "<x51>": 32053,
+  "<x52>": 32054,
+  "<x53>": 32055,
+  "<x54>": 32056,
+  "<x55>": 32057,
+  "<x56>": 32058,
+  "<x57>": 32059,
+  "<x58>": 32060,
+  "<x59>": 32061,
+  "<x5>": 32007,
+  "<x60>": 32062,
+  "<x61>": 32063,
+  "<x62>": 32064,
+  "<x63>": 32065,
+  "<x64>": 32066,
+  "<x65>": 32067,
+  "<x66>": 32068,
+  "<x67>": 32069,
+  "<x68>": 32070,
+  "<x69>": 32071,
+  "<x6>": 32008,
+  "<x70>": 32072,
+  "<x71>": 32073,
+  "<x72>": 32074,
+  "<x73>": 32075,
+  "<x74>": 32076,
+  "<x75>": 32077,
+  "<x76>": 32078,
+  "<x77>": 32079,
+  "<x78>": 32080,
+  "<x79>": 32081,
+  "<x7>": 32009,
+  "<x80>": 32082,
+  "<x81>": 32083,
+  "<x82>": 32084,
+  "<x83>": 32085,
+  "<x84>": 32086,
+  "<x85>": 32087,
+  "<x86>": 32088,
+  "<x87>": 32089,
+  "<x88>": 32090,
+  "<x89>": 32091,
+  "<x8>": 32010,
+  "<x90>": 32092,
+  "<x91>": 32093,
+  "<x92>": 32094,
+  "<x93>": 32095,
+  "<x94>": 32096,
+  "<x95>": 32097,
+  "<x96>": 32098,
+  "<x97>": 32099,
+  "<x98>": 32100,
+  "<x99>": 32101,
+  "<x9>": 32011,
+  "<y0>": 32102,
+  "<y10>": 32112,
+  "<y11>": 32113,
+  "<y12>": 32114,
+  "<y13>": 32115,
+  "<y14>": 32116,
+  "<y15>": 32117,
+  "<y16>": 32118,
+  "<y17>": 32119,
+  "<y18>": 32120,
+  "<y19>": 32121,
+  "<y1>": 32103,
+  "<y20>": 32122,
+  "<y21>": 32123,
+  "<y22>": 32124,
+  "<y23>": 32125,
+  "<y24>": 32126,
+  "<y25>": 32127,
+  "<y26>": 32128,
+  "<y27>": 32129,
+  "<y28>": 32130,
+  "<y29>": 32131,
+  "<y2>": 32104,
+  "<y30>": 32132,
+  "<y31>": 32133,
+  "<y32>": 32134,
+  "<y33>": 32135,
+  "<y34>": 32136,
+  "<y35>": 32137,
+  "<y36>": 32138,
+  "<y37>": 32139,
+  "<y38>": 32140,
+  "<y39>": 32141,
+  "<y3>": 32105,
+  "<y40>": 32142,
+  "<y41>": 32143,
+  "<y42>": 32144,
+  "<y43>": 32145,
+  "<y44>": 32146,
+  "<y45>": 32147,
+  "<y46>": 32148,
+  "<y47>": 32149,
+  "<y48>": 32150,
+  "<y49>": 32151,
+  "<y4>": 32106,
+  "<y50>": 32152,
+  "<y51>": 32153,
+  "<y52>": 32154,
+  "<y53>": 32155,
+  "<y54>": 32156,
+  "<y55>": 32157,
+  "<y56>": 32158,
+  "<y57>": 32159,
+  "<y58>": 32160,
+  "<y59>": 32161,
+  "<y5>": 32107,
+  "<y60>": 32162,
+  "<y61>": 32163,
+  "<y62>": 32164,
+  "<y63>": 32165,
+  "<y64>": 32166,
+  "<y65>": 32167,
+  "<y66>": 32168,
+  "<y67>": 32169,
+  "<y68>": 32170,
+  "<y69>": 32171,
+  "<y6>": 32108,
+  "<y70>": 32172,
+  "<y71>": 32173,
+  "<y72>": 32174,
+  "<y73>": 32175,
+  "<y74>": 32176,
+  "<y75>": 32177,
+  "<y76>": 32178,
+  "<y77>": 32179,
+  "<y78>": 32180,
+  "<y79>": 32181,
+  "<y7>": 32109,
+  "<y80>": 32182,
+  "<y81>": 32183,
+  "<y82>": 32184,
+  "<y83>": 32185,
+  "<y84>": 32186,
+  "<y85>": 32187,
+  "<y86>": 32188,
+  "<y87>": 32189,
+  "<y88>": 32190,
+  "<y89>": 32191,
+  "<y8>": 32110,
+  "<y90>": 32192,
+  "<y91>": 32193,
+  "<y92>": 32194,
+  "<y93>": 32195,
+  "<y94>": 32196,
+  "<y95>": 32197,
+  "<y96>": 32198,
+  "<y97>": 32199,
+  "<y98>": 32200,
+  "<y99>": 32201,
+  "<y9>": 32111
+}

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}You are an expert radiology assistant tasked with interpreting a chest X-ray study.  {% for message in messages %}{% if message[\"role\"] == \"user\" %}USER:  {% else %}ASSISTANT: {% endif %}{% for item in message[\"content\"] %}{% if item[\"type\"] == \"text\" %}{{ item[\"text\"] }}{% elif item[\"type\"] == \"image\" %}<image>{% endif %}{% endfor %}{% if message[\"role\"] == \"user\" %}  {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"
+}

config.json ADDED Viewed

	@@ -0,0 +1,78 @@

+{
+  "_name_or_path": "/home/ea/work/my_optimum_intel/optimum-intel/maira2",
+  "architectures": [
+    "Maira2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_maira2.Maira2Config",
+    "AutoModelForCausalLM": "modeling_maira2.Maira2ForConditionalGeneration",
+    "AutoModelForVision2Seq": "modeling_maira2.Maira2ForConditionalGeneration"
+  },
+  "hidden_size": 16,
+  "ignore_index": -100,
+  "image_seq_length": 4,
+  "image_token_index": 32204,
+  "model_type": "maira2",
+  "multimodal_projector_bias": true,
+  "pad_token_id": 0,
+  "projector_hidden_act": "gelu",
+  "projector_n_layers": 4,
+  "text_config": {
+    "_name_or_path": "HuggingFaceM4/tiny-random-LlamaForCausalLM",
+    "architectures": [
+      "LlamaForCausalLM"
+    ],
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "head_dim": 4,
+    "hidden_size": 16,
+    "intermediate_size": 64,
+    "model_type": "llama",
+    "num_attention_heads": 4,
+    "num_hidden_layers": 2,
+    "num_key_value_heads": 4,
+    "pad_token_id": 2,
+    "torch_dtype": "bfloat16",
+    "vocab_size": 32207
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "vision_config": {
+    "apply_layernorm": true,
+    "architectures": [
+      "Dinov2Model"
+    ],
+    "attention_probs_dropout_prob": 0.0,
+    "drop_path_rate": 0.0,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 16,
+    "image_size": 30,
+    "layer_norm_eps": 1e-06,
+    "layerscale_value": 1.0,
+    "mlp_ratio": 4,
+    "model_type": "dinov2",
+    "num_attention_heads": 4,
+    "num_hidden_layers": 4,
+    "out_features": [
+      "stage4"
+    ],
+    "out_indices": [
+      4
+    ],
+    "patch_size": 2,
+    "qkv_bias": true,
+    "reshape_hidden_states": false,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "torch_dtype": "float32",
+    "use_swiglu_ffn": false
+  },
+  "vision_feature_layer": -1,
+  "vision_feature_select_strategy": "default"
+}

configuration_maira2.py ADDED Viewed

	@@ -0,0 +1,32 @@

+#  Copyright 2024 Microsoft. All rights reserved.
+#  Licensed under the MSRLA License. See LICENSE in the repo root for license information.
+from typing import Any
+from transformers import LlavaConfig
+class Maira2Config(LlavaConfig):
+    """
+    This is the configuration class to store the configuration of a `Maira2ForConditionalGeneration` model. It is
+    used to instantiate a MAIRA-2 model according to the specified arguments, defining the model architecture.
+    It inherits from `LlavaConfig`. In addition to the inherited attributes, it adds the
+    ability to customize the multimodal projector through following attributes:
+    Args:
+        projector_n_layers (`int`, *optional*, defaults to 4):
+            Number of layers in the multimodal projector.
+    """
+    model_type = "maira2"
+    def __init__(
+        self,
+        projector_n_layers: int = 4,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.hidden_size = self.text_config.hidden_size
+        self.projector_n_layers = projector_n_layers

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.48.3"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3bfb1d6f0ec0f0949cd84df187a2bfb571242c4ca9bdd519c4af512716ae23a
+size 4240896

modeling_maira2.py ADDED Viewed

	@@ -0,0 +1,359 @@

+#  Copyright 2024 Microsoft. All rights reserved.
+#  Licensed under the MSRLA License. See LICENSE in the repo root for license information.
+from typing import Optional, List, Tuple, Union
+import torch
+from torch.nn import Linear, Module, Sequential
+from transformers import AutoBackbone, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaPreTrainedModel
+from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
+from transformers.activations import ACT2FN
+from transformers.utils import check_min_version
+from .configuration_maira2 import Maira2Config
+class Maira2MultiModalProjector(Module):
+    """
+    This class implements the multimodal projector for MAIRA-2 model. It projects the image features to the text
+    hidden size via a series of linear layers (4 layers in MAIRA-2).
+    """
+    def __init__(self, config: Maira2Config):
+        super().__init__()
+        n_layers = config.projector_n_layers
+        if n_layers < 1:
+            raise ValueError(f"Number of layers should be at least 1, got {n_layers=}")
+        text_hidden_size = config.text_config.hidden_size
+        vision_hidden_size = config.vision_config.hidden_size
+        _layers = [Linear(vision_hidden_size, text_hidden_size, bias=True)]
+        for _ in range(n_layers - 1):
+            _layers.append(ACT2FN[config.projector_hidden_act])
+            _layers.append(Linear(text_hidden_size, text_hidden_size, bias=True))
+        self.layers = Sequential(*_layers)
+    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
+        hidden_states = self.layers(image_features)
+        return hidden_states  # type: ignore[no-any-return]
+class Maira2ForConditionalGeneration(LlavaForConditionalGeneration):
+    """
+    This model implements the multimodal model MAIRA-2. It consists of a vision backbone, a multimodal projector, and a
+    language model. The model can be used for grounded and ungrounded report generation tasks as well as phrase grounding.
+    This class inherits from `LlavaForConditionalGeneration`, defining a custom multimodal projector and changing image
+    feature selection.
+    """
+    config_class = Maira2Config
+    def __init__(self, config: Maira2Config) -> None:
+        """
+        Build the vision backbone, the multimodal projector and the language model described by `config`.
+        Args:
+            config (`Maira2Config`):
+                Supplies `vision_config`, `text_config`, `pad_token_id` and the projector settings.
+        """
+        # Check transformers version is at least 4.46.0.dev0, otherwise the model fails
+        # silently since get_image_features is not called in the forward pass
+        check_min_version("4.46.0.dev0")
+        # Start the MRO lookup *after* `LlavaPreTrainedModel`. Assuming `LlavaForConditionalGeneration`
+        # precedes it in the MRO, this skips the parent's `__init__` so the sub-modules can be created here.
+        super(LlavaPreTrainedModel, self).__init__(config)
+        # Vision tower is an `AutoBackbone` built from the vision config.
+        self.vision_tower = AutoBackbone.from_config(config.vision_config)
+        self.multi_modal_projector = Maira2MultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+        self.language_model = AutoModelForCausalLM.from_config(
+            config.text_config,
+            attn_implementation=config._attn_implementation,
+        )
+        # Fall back to -1 when the config defines no padding token.
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()
+    def get_image_features(
+        self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str
+    ) -> torch.Tensor:
+        """
+        This method extracts the image features from the vision backbone using the specified feature layer and
+        selection strategy. This is custom to MAIRA-2 model since we want to use the `feature_maps` from the Dinov2Backbone
+        class instead of the `hidden_states` which are used in the default implementation of `get_image_features` in LlavaForConditionalGeneration.
+        The feature_maps returned by Dinov2Backbone are the hideen_states with a layernorm applied to them.
+        """
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        selected_image_feature = image_outputs.feature_maps[vision_feature_layer]
+        if vision_feature_select_strategy == "default":
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+        image_features = self.multi_modal_projector(selected_image_feature)
+        return image_features  # type: ignore[no-any-return]
+    # modification from original, added forward from transformers 4.46 to prevent new preprocessing
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[int] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
+    ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+            num_logits_to_keep (`int`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+        Returns:
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration
+        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if pixel_values is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
+            )
+        legacy_processing = False
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
+            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
+            # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
+            legacy_processing = (
+                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
+            ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
+        if pixel_values is not None:
+            image_features = self.get_image_features(
+                pixel_values=pixel_values,
+                vision_feature_layer=vision_feature_layer,
+                vision_feature_select_strategy=vision_feature_select_strategy,
+            )
+            print(image_features.shape)
+            if legacy_processing:
+                # prefill stage vs decoding stage (legacy behavior copied)
+                if input_ids.shape[1] != 1:
+                    inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+                        image_features, inputs_embeds, input_ids, attention_mask, labels
+                    )
+                    cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
+                else:
+                    # Retrieve the first layer to inspect the logits and mask out the hidden states
+                    # that are set to 0
+                    first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+                    # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+                    # Get the target length
+                    target_length = input_ids.shape[1]
+                    past_length = first_layer_past_key_value.shape[-1]
+                    extended_attention_mask = torch.ones(
+                        (attention_mask.shape[0], past_length),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    # Filter out only the tokens that can be un-attended, this can happen
+                    # if one uses Llava + Fused modules where the cache on the
+                    # first iteration is already big enough, or if one passes custom cache
+                    valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                    new_batch_index = batch_index[valid_indices]
+                    new_non_attended_tokens = non_attended_tokens[valid_indices]
+                    # Zero-out the places where we don't need to attend
+                    extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+                    attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
+                    position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+                    cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[
+                        -target_length:
+                    ]
+            # TODO: @raushan retain only the new behavior after v4.47
+            else:
+                special_image_mask = (
+                    (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+                )
+                image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            num_logits_to_keep=num_logits_to_keep,
+        )
+        logits = outputs[0]
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = torch.nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return LlavaCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+    def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
+        """
+        Expand every image placeholder token in `input_ids` into `num_image_patches` positions and build the
+        merged embedding sequence (legacy processing path).
+        Args:
+            image_features: tensor unpacked as `(num_images, num_image_patches, embed_dim)`.
+            inputs_embeds: text embeddings indexed by `(batch, position)`.
+            input_ids: tensor unpacked as `(batch_size, sequence_length)`.
+            attention_mask: mask aligned with `input_ids`.
+            labels: optional labels aligned with `input_ids`, or `None`.
+        Returns:
+            Tuple `(final_embedding, final_attention_mask, final_labels, position_ids)`; `final_labels` is
+            `None` when `labels` is `None`.
+        Raises:
+            ValueError: if the number of positions reserved for images differs from the number of image
+            feature vectors supplied.
+        """
+        num_images, num_image_patches, embed_dim = image_features.shape
+        batch_size, sequence_length = input_ids.shape
+        # Treated as left padding when no sequence ends with the pad token.
+        left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
+        # 1. Create a mask to know where special image tokens are
+        special_image_token_mask = input_ids == self.config.image_token_index
+        num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
+        # Compute the maximum merged sequence length (each image token grows into `num_image_patches` slots)
+        max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
+        batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)
+        # 2. Compute the positions where text should be written
+        # Calculate new positions for text tokens in merged image-text sequence.
+        # `special_image_token_mask` identifies image tokens. Each image token occupies `num_image_patches`
+        # positions in the merged sequence, i.e. it adds `num_image_patches - 1` extra positions.
+        # `torch.cumsum` computes how each image token shifts subsequent text token positions.
+        # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
+        new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+        # Per-sample count of unused trailing positions relative to the longest merged sequence.
+        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
+        if left_padding:
+            new_token_positions += nb_image_pad[:, None]  # offset for left padding
+        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
+        # 3. Create the full embedding, already padded to the maximum position
+        final_embedding = torch.zeros(
+            batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+        final_attention_mask = torch.zeros(
+            batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+        )
+        if labels is not None:
+            final_labels = torch.full(
+                (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device
+            )
+        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
+        # set the corresponding tensors into their correct target device.
+        target_device = inputs_embeds.device
+        batch_indices, non_image_indices, text_to_overwrite = (
+            batch_indices.to(target_device),
+            non_image_indices.to(target_device),
+            text_to_overwrite.to(target_device),
+        )
+        attention_mask = attention_mask.to(target_device)
+        # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
+        # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
+        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
+        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+        if labels is not None:
+            final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
+        # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
+        image_to_overwrite = torch.full(
+            (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
+        )
+        image_to_overwrite[batch_indices, text_to_overwrite] = False
+        # Skip the first `nb_image_pad` free slots of each row: those are padding, not image positions.
+        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+        # The number of free slots must equal the number of image feature vectors (num_images * num_image_patches).
+        if image_to_overwrite.sum() != image_features.shape[:-1].numel():
+            raise ValueError(
+                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
+                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
+            )
+        final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        final_attention_mask |= image_to_overwrite
+        # Positions count attended tokens; non-attended slots get the placeholder value 1.
+        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+        # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
+        batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id)
+        indices_to_mask = new_token_positions[batch_indices, pad_indices]
+        final_embedding[batch_indices, indices_to_mask] = 0
+        if labels is None:
+            final_labels = None
+        return final_embedding, final_attention_mask, final_labels, position_ids

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_maira2.Maira2Processor"
+  },
+  "crop_size": {
+    "height": 30,
+    "width": 30
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5307,
+    0.5307,
+    0.5307
+  ],
+  "image_processor_type": "BitImageProcessor",
+  "image_std": [
+    0.2583,
+    0.2583,
+    0.2583
+  ],
+  "processor_class": "Maira2Processor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 30
+  }
+}

processing_maira2.py ADDED Viewed

	@@ -0,0 +1,729 @@

+#  Copyright 2024 Microsoft. All rights reserved.
+#  Licensed under the MSRLA License. See LICENSE in the repo root for license information.
+import re
+from typing import Any, TypeAlias, Union, List
+import numpy as np
+from PIL import Image
+from transformers import BaseImageProcessor, LlavaProcessor, PreTrainedTokenizer
+from transformers.models.llava.processing_llava import LlavaProcessorKwargs
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+SingleChatMessageType: TypeAlias = dict[str, str | int | None]
+ChatMessageListType: TypeAlias = list[dict[str, str | list[SingleChatMessageType]]]
+BoxType: TypeAlias = tuple[float, float, float, float]
+class Maira2Processor(LlavaProcessor):
+    """
+    Constructs a Maira2 processor similar to LlavaProcessor but with additional arguments and functions to support
+    multi-image grounded and non-grounded radiology report generation.
+    In addition to the arguments of LlavaProcessor, Maira2Processor has the following extra arguments:
+    Args:
+        phrase_start_token (`str`, *optional*, defaults to `"<obj>"`):
+            Special token used to denote the start of a grounded phrase (with or without box).
+        phrase_end_token (`str`, *optional*, defaults to `"</obj>"`):
+            Special token used to denote the end of a grounded phrase.
+        box_start_token (`str`, *optional*, defaults to `"<box>"`):
+            Special token used to denote the start of a bounding box.
+        box_end_token (`str`, *optional*, defaults to `"</box>"`):
+            Special token used to denote the end of a bounding box.
+        num_box_coord_bins (`int`, *optional*, defaults to `100`):
+            Number of bins used to represent the bounding box coordinates.
+    """
+    valid_kwargs = [
+        "chat_template",
+        "patch_size",
+        "vision_feature_select_strategy",
+        "image_token",
+        "phrase_start_token",
+        "phrase_end_token",
+        "box_start_token",
+        "box_end_token",
+        "num_box_coord_bins",
+    ]
+    def __init__(
+        self,
+        image_processor: BaseImageProcessor | None = None,
+        tokenizer: PreTrainedTokenizer | None = None,
+        patch_size: int | None = None,
+        vision_feature_select_strategy: str | None = None,
+        chat_template: str | None = None,
+        image_token: str = "<image>",
+        phrase_start_token: str = "<obj>",
+        phrase_end_token: str = "</obj>",
+        box_start_token: str = "<box>",
+        box_end_token: str = "</box>",
+        num_box_coord_bins: int = 100,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize the processor: forward the generic arguments to the parent constructor and store the
+        grounding-specific options as instance attributes.
+        Args:
+            image_processor, tokenizer, patch_size, vision_feature_select_strategy, chat_template, image_token:
+                Forwarded unchanged to the parent constructor.
+            phrase_start_token (str):
+                Token marking the start of a grounded phrase; stored as `self.phrase_start_token`.
+            phrase_end_token (str):
+                Token marking the end of a grounded phrase; stored as `self.phrase_end_token`.
+            box_start_token (str):
+                Token marking the start of a bounding box; stored as `self.box_start_token`.
+            box_end_token (str):
+                Token marking the end of a bounding box; stored as `self.box_end_token`.
+            num_box_coord_bins (int):
+                Number of bins used for box coordinates; stored as `self.num_box_coord_bins`.
+            **kwargs:
+                Any further keyword arguments, passed through to the parent constructor.
+        """
+        super().__init__(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            patch_size=patch_size,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            chat_template=chat_template,
+            image_token=image_token,
+            **kwargs,
+        )
+        # Grounding options are read later when parsing model output into phrases and boxes.
+        self.phrase_start_token = phrase_start_token
+        self.phrase_end_token = phrase_end_token
+        self.box_start_token = box_start_token
+        self.box_end_token = box_end_token
+        self.num_box_coord_bins = num_box_coord_bins
+    @staticmethod
+    def _normalize_image(image: Image.Image) -> Image.Image:
+        """
+        This function normalizes the input image to have pixel values in the range [0, 255].
+        Args:
+            image (Image.Image | np.ndarray):
+                The input image to be normalized.
+        Returns:
+            Image.Image: The normalized image in grayscale.
+        """
+        image_np = np.array(image.convert("L"))
+        image_np = image_np.astype(float)
+        image_np -= image_np.min()
+        image_np /= image_np.max()
+        image_np *= 255
+        image_np = image_np.astype(np.uint8)
+        return Image.fromarray(image_np).convert("L")
+    def _normalize_and_stack_images(
+        self,
+        current_frontal: Image.Image,
+        current_lateral: Image.Image | None,
+        prior_frontal: Image.Image | None,
+    ) -> list[Image.Image]:
+        """
+        This function normalizes the input images and stacks them together. The images are stacked in the order of
+        current_frontal, current_lateral, and prior_frontal. The order of images is important, since it must match the
+        order of the images in the prompt, which is frontal, then lateral then prior.
+        Args:
+            current_frontal (Image.Image):
+                The current frontal image.
+            current_lateral (Image.Image | None):
+                The current lateral image.
+            prior_frontal (Image.Image | None):
+                The prior frontal image.
+        Returns:
+            list[Image.Image]: The normalized images stacked together.
+        """
+        images = [self._normalize_image(current_frontal)]
+        if current_lateral is not None:
+            images.append(self._normalize_image(current_lateral))
+        if prior_frontal is not None:
+            images.append(self._normalize_image(prior_frontal))
+        return images
+    @staticmethod
+    def _get_section_text_or_missing_text(section: str | None) -> str:
+        """
+        This function returns the input section text if it is not None and not empty, otherwise it returns a missing
+        section text "N/A".
+        Args:
+            section (str | None):
+                The input section text.
+        Returns:
+            str: The section text if it is not None and not empty, otherwise "N/A".
+        """
+        missing_section_text = "N/A"
+        if not isinstance(section, str) or len(section) == 0:
+            return missing_section_text
+        return section
+    @staticmethod
+    def _construct_image_chat_messages_for_reporting(has_prior: bool, has_lateral: bool) -> list[SingleChatMessageType]:
+        """
+        This function constructs user chat messages based on the presence of the prior and lateral images.
+        Args:
+            has_prior (bool):
+                A boolean indicating whether the prior image is present.
+            has_lateral (bool):
+                A boolean indicating whether the lateral image is present.
+        Returns:
+            list[SingleChatMessageType]: The image prompt messages in the form of a list of dictionaries.
+        Example:
+        ```python
+        >>> _construct_image_chat_messages_for_reporting(has_prior=True, has_lateral=True)
+        >>> # [
+        >>> #     {"index": None, "text": "Given the current frontal image", "type": "text"},
+        >>> #     {"index": 0, "text": None, "type": "image"},
+        >>> #     {"index": None, "text": " the current lateral image", "type": "text"},
+        >>> #     {"index": 1, "text": None, "type": "image"},
+        >>> #     {"index": None, "text": " and the prior frontal image", "type": "text"},
+        >>> #     {"index": 2, "text": None, "type": "image"},
+        >>> # ]
+        ```
+        """
+        def _add_single_image_to_chat_messages(prompt_text: str, image_index: int) -> None:
+            image_prompt.extend(
+                [
+                    {"index": None, "text": prompt_text, "type": "text"},
+                    {"index": image_index, "text": None, "type": "image"},
+                ]
+            )
+        image_prompt: list[SingleChatMessageType] = []
+        image_index = 0
+        if not has_prior and not has_lateral:
+            _add_single_image_to_chat_messages("Given the current frontal image only", image_index)
+        else:
+            _add_single_image_to_chat_messages("Given the current frontal image", image_index)
+            image_index += 1
+            if has_prior:
+                if has_lateral:
+                    _add_single_image_to_chat_messages(" the current lateral image", image_index)
+                    image_index += 1
+                _add_single_image_to_chat_messages(" and the prior frontal image", image_index)
+            else:
+                if has_lateral:
+                    _add_single_image_to_chat_messages(" and the current lateral image", image_index)
+        return image_prompt
+    def _construct_chat_messages_reporting(
+        self,
+        has_prior: bool,
+        has_lateral: bool,
+        indication: str | None,
+        technique: str | None,
+        comparison: str | None,
+        prior_report: str | None,
+        get_grounding: bool = False,
+        assistant_text: str | None = None,
+    ) -> ChatMessageListType:
+        """
+        This function constructs the chat messages for reporting used in the grounded and non-grounded reporting tasks.
+        Args:
+            has_prior (bool):
+                A boolean indicating whether the prior image is present.
+            has_lateral (bool):
+                A boolean indicating whether the lateral image is present.
+            indication (str | None):
+                The indication section text.
+            technique (str | None):
+                The technique section text.
+            comparison (str | None):
+                The comparison section text.
+            prior_report (str | None):
+                The prior report section text.
+            get_grounding (bool):
+                A boolean indicating whether to get the grounding information.
+            assistant_text (str | None):
+                The assistant text (can be set to None for ordinary inference).
+        Returns:
+            ChatMessageListType: The chat messages for reporting in the form of a list of dictionaries.
+        Example:
+        ```python
+        >>> _construct_chat_messages_reporting(
+        >>>     has_prior=True,
+        >>>     has_lateral=True,
+        >>>     indication="indication text from report goes here",
+        >>>     technique="technique text from report goes here",
+        >>>     comparison="comparison text from report goes here",
+        >>>     prior_report="prior reporting text goes here",
+        >>>     get_grounding=False,
+        >>>     assistant_text=None,
+        >>> )
+        >>> # [
+        >>> #     {"index": None, "text": "Given the current frontal image", "type": "text"},
+        >>> #     {"index": 0, "text": None, "type": "image"},
+        >>> #     {"index": None, "text": " the current lateral image", "type": "text"},
+        >>> #     {"index": 1, "text": None, "type": "image"},
+        >>> #     {"index": None, "text": " and the prior frontal image", "type": "text"},
+        >>> #     {"index": 2, "text": None, "type": "image"},
+        >>> #     {"index": None, "text": " PRIOR_REPORT: prior reporting text goes here", "type": "text"},
+        >>> #     {"index": None, "text": " Provide a description of the findings in the radiology study in comparison to the "
+        >>> #     "prior frontal image. INDICATION: indication text from report goes here TECHNIQUE: technique text from report "
+        >>> #     "goes here COMPARISON: comparison text from report goes here", "type": "text"},
+        >>> # ]
+        ```
+        """
+        indication = self._get_section_text_or_missing_text(indication)
+        technique = self._get_section_text_or_missing_text(technique)
+        comparison = self._get_section_text_or_missing_text(comparison)
+        prior_report = self._get_section_text_or_missing_text(prior_report)
+        prompt = self._construct_image_chat_messages_for_reporting(has_prior=has_prior, has_lateral=has_lateral)
+        if has_prior:
+            prompt.append({"index": None, "text": f" PRIOR_REPORT: {prior_report}", "type": "text"})
+        if get_grounding:
+            prompt.append(
+                {
+                    "index": None,
+                    "text": " Provide a description of the findings in the radiology study in comparison to the "
+                    "prior frontal image. Each finding should be described as a self-contained plain-text sentence."
+                    " If the finding is groundable, locate the finding in the current frontal chest X-ray image, "
+                    "with bounding boxes indicating all locations where it can be seen in the current frontal "
+                    "image. Otherwise, generate just the ungrounded finding without bounding boxes. INDICATION: "
+                    f"{indication} TECHNIQUE: {technique} COMPARISON: {comparison}",
+                    "type": "text",
+                }
+            )
+        else:
+            prompt.append(
+                {
+                    "index": None,
+                    "text": " Provide a description of the findings in the radiology study in comparison to the "
+                    f"prior frontal image. INDICATION: {indication} TECHNIQUE: {technique} COMPARISON: "
+                    f"{comparison}",
+                    "type": "text",
+                }
+            )
+        messages: ChatMessageListType = [{"content": prompt, "role": "user"}]
+        if assistant_text is not None:
+            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
+        return messages
+    def _construct_chat_messages_phrase_grounding(
+        self, phrase: str, assistant_text: str | None = None
+    ) -> ChatMessageListType:
+        """
+        This function constructs the chat messages for phrase grounding used in the phrase grounding task.
+        Args:
+            phrase (str):
+                The phrase to be grounded.
+            assistant_text (str | None):
+                The assistant text (can be set to None for ordinary inference).
+        Returns:
+            ChatMessageListType: The chat messages for phrase grounding in the form of a list of dictionaries.
+        """
+        prompt: list[SingleChatMessageType] = [
+            {"index": None, "text": "Given the current frontal image", "type": "text"},
+            {"index": 0, "text": None, "type": "image"},
+            {
+                "index": None,
+                "text": f" Repeat the following finding as a grounded phrase with bounding boxes indicating all "
+                f"locations where it can be seen in the given chest X-ray image. Finding: {phrase}",
+                "type": "text",
+            },
+        ]
+        messages: ChatMessageListType = [{"content": prompt, "role": "user"}]
+        if assistant_text is not None:
+            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
+        return messages
+    def format_reporting_input(
+        self,
+        current_frontal: Image.Image,
+        current_lateral: Image.Image | None,
+        prior_frontal: Image.Image | None,
+        indication: str | None,
+        technique: str | None,
+        comparison: str | None,
+        prior_report: str | None,
+        get_grounding: bool = False,
+        assistant_text: str | None = None,
+    ) -> tuple[str, list[Image.Image]]:
+        """
+        This function formats the reporting prompt for the grounded and non-grounded reporting tasks from the given
+        input images and text sections. The images are normalized and stacked together in the right order.
+        Args:
+            current_frontal (Image.Image):
+                The current frontal image.
+            current_lateral (Image.Image | None):
+                The current lateral image.
+            prior_frontal (Image.Image | None):
+                The prior frontal image.
+            indication (str | None):
+                The indication section text.
+            technique (str | None):
+                The technique section text.
+            comparison (str | None):
+                The comparison section text.
+            prior_report (str | None):
+                The prior report section text.
+            get_grounding (bool):
+                A boolean indicating whether to construct the prompt for grounded or non-grounded reporting.
+            assistant_text (str | None): The assistant text (can be set to None for ordinary inference).
+        Returns:
+            tuple[str, list[Image.Image]]: The formatted prompt text and the normalized images stacked in the right order.
+        """
+        images = self._normalize_and_stack_images(
+            current_frontal=current_frontal,
+            current_lateral=current_lateral,
+            prior_frontal=prior_frontal,
+        )
+        messages = self._construct_chat_messages_reporting(
+            has_prior=prior_frontal is not None,
+            has_lateral=current_lateral is not None,
+            indication=indication,
+            technique=technique,
+            comparison=comparison,
+            prior_report=prior_report,
+            get_grounding=get_grounding,
+            assistant_text=assistant_text,
+        )
+        add_generation_prompt = assistant_text is None
+        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
+        return text, images
+    def format_phrase_grounding_input(
+        self,
+        frontal_image: Image.Image,
+        phrase: str,
+        assistant_text: str | None = None,
+    ) -> tuple[str, list[Image.Image]]:
+        """
+        This function formats the phrase grounding prompt for the phrase grounding task from the given input
+        image and phrase.
+        Args:
+            frontal_image (Image.Image):
+                The frontal image.
+            phrase (str):
+                The phrase to be grounded.
+            assistant_text (str | None):
+                The assistant text (can be set to None for ordinary inference).
+        Returns:
+            tuple[str, list[Image.Image]]: The formatted phrase grounding prompt text and the normalized image.
+        """
+        images = self._normalize_and_stack_images(
+            current_frontal=frontal_image,
+            current_lateral=None,
+            prior_frontal=None,
+        )
+        messages = self._construct_chat_messages_phrase_grounding(phrase)
+        add_generation_prompt = assistant_text is None
+        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
+        return text, images
+    def format_and_preprocess_reporting_input(
+        self,
+        current_frontal: Image.Image,
+        current_lateral: Image.Image | None,
+        prior_frontal: Image.Image | None,
+        indication: str | None,
+        technique: str | None,
+        comparison: str | None,
+        prior_report: str | None,
+        get_grounding: bool = False,
+        assistant_text: str | None = None,
+        **kwargs: Any,
+    ) -> BatchFeature:
+        """
+        This function formats and then preprocesses the input for the grounded and non-grounded reporting tasks from
+        the given input images and text sections and returns the batch feature for the model. It calls format_reporting_input
+        internally to format the input prompt and stack the images together in the right order.
+        Args:
+            current_frontal (Image.Image):
+                The current frontal image.
+            current_lateral (Image.Image | None):
+                The current lateral image.
+            prior_frontal (Image.Image | None):
+                The prior frontal image.
+            indication (str | None):
+                The indication section text.
+            technique (str | None):
+                The technique section text.
+            comparison (str | None):
+                The comparison section text.
+            prior_report (str | None):
+                The prior report section text.
+            get_grounding (bool):
+                A boolean indicating whether to preprocess the input for grounded or non-grounded reporting.
+            assistant_text (str | None):
+                The assistant text (can be set to None for ordinary inference).
+        Returns:
+            BatchFeature: The batch feature for the model, ready to be passed to the model.
+        """
+        text, images = self.format_reporting_input(
+            current_frontal=current_frontal,
+            current_lateral=current_lateral,
+            prior_frontal=prior_frontal,
+            indication=indication,
+            technique=technique,
+            comparison=comparison,
+            prior_report=prior_report,
+            get_grounding=get_grounding,
+            assistant_text=assistant_text,
+        )
+        return self(text=text, images=images, **kwargs)
+    def format_and_preprocess_phrase_grounding_input(
+        self,
+        frontal_image: Image.Image,
+        phrase: str,
+        assistant_text: str | None = None,
+        **kwargs: Any,
+    ) -> BatchFeature:
+        """
+        This function formats and then processes the input for the phrase grounding task from the given input image and
+        phrase and returns the batch feature for the model. It calls format_phrase_grounding_input internally to format
+        the input prompt and normalize the image.
+        Args:
+            frontal_image (Image.Image):
+                The frontal image.
+            phrase (str):
+                The phrase to be grounded.
+            assistant_text (str | None):
+                The assistant text (can be set to None for ordinary inference).
+        Returns:
+            BatchFeature: The batch feature for the model, ready to be passed to the model.
+        """
+        text, images = self.format_phrase_grounding_input(
+            frontal_image=frontal_image,
+            phrase=phrase,
+            assistant_text=assistant_text,
+        )
+        return self(text=text, images=images, **kwargs)
+    def _get_text_between_delimiters(self, text: str, begin_token: str, end_token: str) -> list[str]:
+        """
+        This function splits the input text into a list of substrings beased on the given begin and end tokens.
+        Args:
+            text (str):
+                The input text to be split.
+            begin_token (str):
+                The begin token.
+            end_token (str):
+                The end token.
+        Returns:
+            list[str]: The list of substrings between the given begin and end tokens.
+        Example:
+        ```python
+        >>> _get_text_between_delimiters("<obj>This is a grounded phrase</obj>. <obj>This is another grounded phrase</obj>.", "<obj>", "</obj>")
+        >>> # ["grounded phrase", "This is another grounded phrase"]
+        >>> _get_text_between_delimiters("<box><x10><y20><x30><y40></box><box><x50><y60><x70><y80></box>", "<box>", "</box>")
+        >>> # ["<x10><y20><x30><y40>", "<x50><y60><x70><y80>"]
+        ```
+        """
+        split_text = []
+        while begin_token in text:
+            assert text.startswith(begin_token)
+            end_index = text.find(end_token)
+            assert end_index != -1
+            split_text.append(text[len(begin_token) : end_index])
+            text = text[end_index + len(end_token) :]
+        assert len(text) == 0
+        return split_text
+    def convert_output_to_plaintext_or_grounded_sequence(
+        self, text: str
+    ) -> str | list[tuple[str, list[BoxType] | None]]:
+        """
+        This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
+        boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
+        Args:
+            text (str):
+                The input text to be converted.
+        Returns:
+            str | list[tuple[str, list[BoxType] | None]]: The grounded sequence.
+        Example:
+        ```python
+        >>> convert_output_to_plaintext_or_grounded_sequence("<obj>grounded phrase <box><x55><y45><x70><y56></box></obj><obj>ungrounded phrase</obj>")
+        >>> # [
+        >>> #     ("grounded phrase", [(0.55, 0.45, 0.70, 0.56)]),
+        >>> #     ("ungrounded phrase", None),
+        >>> # ]
+        >>> convert_output_to_plaintext_or_grounded_sequence("plain text")
+        >>> # "plain text"
+        ```
+        """
+        text = text.strip()
+        # Plain text
+        if not any(
+            [
+                self.phrase_start_token in text,
+                self.phrase_end_token in text,
+                self.box_start_token in text,
+                self.box_end_token in text,
+            ]
+        ):
+            return text
+        # One or more grounded phrases
+        grounded_phrase_texts = self._get_text_between_delimiters(text, self.phrase_start_token, self.phrase_end_token)
+        grounded_phrases: list[tuple[str, list[BoxType] | None]] = []
+        for grounded_phrase_text in grounded_phrase_texts:
+            if self.box_start_token in grounded_phrase_text or self.box_end_token in grounded_phrase_text:
+                first_box_start_index = grounded_phrase_text.find(self.box_start_token)
+                phrase_text = grounded_phrase_text[:first_box_start_index].strip()
+                boxes_text = grounded_phrase_text[first_box_start_index:]
+                boxes_text_list = self._get_text_between_delimiters(
+                    boxes_text, self.box_start_token, self.box_end_token
+                )
+                boxes: list[BoxType] = []
+                for box_text in boxes_text_list:
+                    # extract from <x_><y_><x_><y_>
+                    regex = r"<x(\d+?)><y(\d+?)><x(\d+?)><y(\d+?)>"
+                    match = re.search(regex, box_text)
+                    if match:
+                        x_min, y_min, x_max, y_max = match.groups()
+                        box: BoxType = tuple(  # type: ignore[assignment]
+                            (int(coord) + 0.5) / self.num_box_coord_bins for coord in (x_min, y_min, x_max, y_max)
+                        )
+                        assert all(0 <= coord <= 1 for coord in box), f"Invalid box coordinates: {box}"
+                        boxes.append(box)
+                    else:
+                        raise ValueError(f"Invalid box coordinates: {box_text} not matching regex {regex}")
+                grounded_phrases.append((phrase_text, boxes))
+            else:
+                grounded_phrases.append((grounded_phrase_text.lstrip(), None))
+        return grounded_phrases
+    @staticmethod
+    def adjust_box_for_original_image_size(box: BoxType, width: int, height: int) -> BoxType:
+        """
+        This function adjusts the bounding boxes from the MAIRA-2 model output to account for the image processor
+        cropping the image to be square prior to the model forward pass. The box coordinates are adjusted to be
+        relative to the original shape of the image assuming the image processor cropped the image based on the length
+        of the shortest side.
+        Args:
+            box (BoxType):
+                The box to be adjusted, normalised to (0, 1).
+            width (int):
+                Original width of the image, in pixels.
+            height (int):
+                Original height of the image, in pixels.
+        Returns:
+            BoxType: The box normalised relative to the original size of the image.
+        """
+        crop_width = crop_height = min(width, height)
+        x_offset = (width - crop_width) // 2
+        y_offset = (height - crop_height) // 2
+        norm_x_min, norm_y_min, norm_x_max, norm_y_max = box
+        abs_x_min = int(norm_x_min * crop_width + x_offset)
+        abs_x_max = int(norm_x_max * crop_width + x_offset)
+        abs_y_min = int(norm_y_min * crop_height + y_offset)
+        abs_y_max = int(norm_y_max * crop_height + y_offset)
+        adjusted_norm_x_min = abs_x_min / width
+        adjusted_norm_x_max = abs_x_max / width
+        adjusted_norm_y_min = abs_y_min / height
+        adjusted_norm_y_max = abs_y_max / height
+        return (adjusted_norm_x_min, adjusted_norm_y_min, adjusted_norm_x_max, adjusted_norm_y_max)
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[LlavaProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of `images` or `text`.")
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+        output_kwargs = self._merge_kwargs(
+            LlavaProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+        else:
+            image_inputs = {}
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = text
+        if image_inputs.get("pixel_values") is not None:
+            if self.patch_size is not None and self.vision_feature_select_strategy is not None:
+                # Replace the image token with the expanded image token sequence
+                pixel_values = image_inputs["pixel_values"]
+                height, width = get_image_size(to_numpy_array(pixel_values[0]))
+                num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
+                if self.vision_feature_select_strategy == "default":
+                    num_image_tokens -= 1
+                prompt_strings = []
+                for sample in text:
+                    sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
+                    prompt_strings.append(sample)
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+        return BatchFeature(data={**text_inputs, **image_inputs})

processor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "box_end_token": "</box>",
+  "box_start_token": "<box>",
+  "image_token": "<image>",
+  "num_box_coord_bins": 100,
+  "patch_size": 2,
+  "phrase_end_token": "</obj>",
+  "phrase_start_token": "<obj>",
+  "processor_class": "Maira2Processor",
+  "vision_feature_select_strategy": "default",
+  "auto_map": {
+    "AutoProcessor": "processing_maira2.Maira2Processor"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,1701 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<obj>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32001": {
+      "content": "</obj>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32002": {
+      "content": "<x0>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32003": {
+      "content": "<x1>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32004": {
+      "content": "<x2>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32005": {
+      "content": "<x3>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32006": {
+      "content": "<x4>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32007": {
+      "content": "<x5>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32008": {
+      "content": "<x6>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32009": {
+      "content": "<x7>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32010": {
+      "content": "<x8>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32011": {
+      "content": "<x9>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32012": {
+      "content": "<x10>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32013": {
+      "content": "<x11>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32014": {
+      "content": "<x12>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32015": {
+      "content": "<x13>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32016": {
+      "content": "<x14>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32017": {
+      "content": "<x15>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32018": {
+      "content": "<x16>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32019": {
+      "content": "<x17>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32020": {
+      "content": "<x18>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32021": {
+      "content": "<x19>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32022": {
+      "content": "<x20>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32023": {
+      "content": "<x21>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32024": {
+      "content": "<x22>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32025": {
+      "content": "<x23>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32026": {
+      "content": "<x24>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32027": {
+      "content": "<x25>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32028": {
+      "content": "<x26>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32029": {
+      "content": "<x27>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32030": {
+      "content": "<x28>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32031": {
+      "content": "<x29>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32032": {
+      "content": "<x30>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32033": {
+      "content": "<x31>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32034": {
+      "content": "<x32>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32035": {
+      "content": "<x33>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32036": {
+      "content": "<x34>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32037": {
+      "content": "<x35>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32038": {
+      "content": "<x36>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32039": {
+      "content": "<x37>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32040": {
+      "content": "<x38>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32041": {
+      "content": "<x39>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32042": {
+      "content": "<x40>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32043": {
+      "content": "<x41>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32044": {
+      "content": "<x42>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32045": {
+      "content": "<x43>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32046": {
+      "content": "<x44>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32047": {
+      "content": "<x45>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32048": {
+      "content": "<x46>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32049": {
+      "content": "<x47>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32050": {
+      "content": "<x48>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32051": {
+      "content": "<x49>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32052": {
+      "content": "<x50>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32053": {
+      "content": "<x51>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32054": {
+      "content": "<x52>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32055": {
+      "content": "<x53>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32056": {
+      "content": "<x54>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32057": {
+      "content": "<x55>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32058": {
+      "content": "<x56>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32059": {
+      "content": "<x57>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32060": {
+      "content": "<x58>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32061": {
+      "content": "<x59>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32062": {
+      "content": "<x60>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32063": {
+      "content": "<x61>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32064": {
+      "content": "<x62>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32065": {
+      "content": "<x63>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32066": {
+      "content": "<x64>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32067": {
+      "content": "<x65>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32068": {
+      "content": "<x66>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32069": {
+      "content": "<x67>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32070": {
+      "content": "<x68>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32071": {
+      "content": "<x69>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32072": {
+      "content": "<x70>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32073": {
+      "content": "<x71>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32074": {
+      "content": "<x72>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32075": {
+      "content": "<x73>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32076": {
+      "content": "<x74>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32077": {
+      "content": "<x75>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32078": {
+      "content": "<x76>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32079": {
+      "content": "<x77>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32080": {
+      "content": "<x78>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32081": {
+      "content": "<x79>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32082": {
+      "content": "<x80>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32083": {
+      "content": "<x81>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32084": {
+      "content": "<x82>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32085": {
+      "content": "<x83>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32086": {
+      "content": "<x84>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32087": {
+      "content": "<x85>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32088": {
+      "content": "<x86>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32089": {
+      "content": "<x87>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32090": {
+      "content": "<x88>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32091": {
+      "content": "<x89>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32092": {
+      "content": "<x90>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32093": {
+      "content": "<x91>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32094": {
+      "content": "<x92>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32095": {
+      "content": "<x93>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32096": {
+      "content": "<x94>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32097": {
+      "content": "<x95>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32098": {
+      "content": "<x96>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32099": {
+      "content": "<x97>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32100": {
+      "content": "<x98>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32101": {
+      "content": "<x99>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32102": {
+      "content": "<y0>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32103": {
+      "content": "<y1>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32104": {
+      "content": "<y2>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32105": {
+      "content": "<y3>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32106": {
+      "content": "<y4>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32107": {
+      "content": "<y5>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32108": {
+      "content": "<y6>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32109": {
+      "content": "<y7>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32110": {
+      "content": "<y8>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32111": {
+      "content": "<y9>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32112": {
+      "content": "<y10>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32113": {
+      "content": "<y11>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32114": {
+      "content": "<y12>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32115": {
+      "content": "<y13>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32116": {
+      "content": "<y14>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32117": {
+      "content": "<y15>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32118": {
+      "content": "<y16>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32119": {
+      "content": "<y17>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32120": {
+      "content": "<y18>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32121": {
+      "content": "<y19>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32122": {
+      "content": "<y20>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32123": {
+      "content": "<y21>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32124": {
+      "content": "<y22>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32125": {
+      "content": "<y23>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32126": {
+      "content": "<y24>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32127": {
+      "content": "<y25>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32128": {
+      "content": "<y26>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32129": {
+      "content": "<y27>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32130": {
+      "content": "<y28>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32131": {
+      "content": "<y29>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32132": {
+      "content": "<y30>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32133": {
+      "content": "<y31>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32134": {
+      "content": "<y32>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32135": {
+      "content": "<y33>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32136": {
+      "content": "<y34>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32137": {
+      "content": "<y35>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32138": {
+      "content": "<y36>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32139": {
+      "content": "<y37>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32140": {
+      "content": "<y38>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32141": {
+      "content": "<y39>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32142": {
+      "content": "<y40>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32143": {
+      "content": "<y41>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32144": {
+      "content": "<y42>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32145": {
+      "content": "<y43>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32146": {
+      "content": "<y44>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32147": {
+      "content": "<y45>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32148": {
+      "content": "<y46>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32149": {
+      "content": "<y47>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32150": {
+      "content": "<y48>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32151": {
+      "content": "<y49>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32152": {
+      "content": "<y50>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32153": {
+      "content": "<y51>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32154": {
+      "content": "<y52>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32155": {
+      "content": "<y53>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32156": {
+      "content": "<y54>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32157": {
+      "content": "<y55>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32158": {
+      "content": "<y56>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32159": {
+      "content": "<y57>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32160": {
+      "content": "<y58>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32161": {
+      "content": "<y59>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32162": {
+      "content": "<y60>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32163": {
+      "content": "<y61>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32164": {
+      "content": "<y62>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32165": {
+      "content": "<y63>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32166": {
+      "content": "<y64>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32167": {
+      "content": "<y65>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32168": {
+      "content": "<y66>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32169": {
+      "content": "<y67>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32170": {
+      "content": "<y68>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32171": {
+      "content": "<y69>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32172": {
+      "content": "<y70>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32173": {
+      "content": "<y71>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32174": {
+      "content": "<y72>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32175": {
+      "content": "<y73>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32176": {
+      "content": "<y74>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32177": {
+      "content": "<y75>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32178": {
+      "content": "<y76>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32179": {
+      "content": "<y77>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32180": {
+      "content": "<y78>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32181": {
+      "content": "<y79>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32182": {
+      "content": "<y80>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32183": {
+      "content": "<y81>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32184": {
+      "content": "<y82>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32185": {
+      "content": "<y83>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32186": {
+      "content": "<y84>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32187": {
+      "content": "<y85>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32188": {
+      "content": "<y86>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32189": {
+      "content": "<y87>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32190": {
+      "content": "<y88>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32191": {
+      "content": "<y89>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32192": {
+      "content": "<y90>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32193": {
+      "content": "<y91>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32194": {
+      "content": "<y92>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32195": {
+      "content": "<y93>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32196": {
+      "content": "<y94>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32197": {
+      "content": "<y95>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32198": {
+      "content": "<y96>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32199": {
+      "content": "<y97>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32200": {
+      "content": "<y98>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32201": {
+      "content": "<y99>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32202": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32203": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32204": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32205": {
+      "content": "<prev_im>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32206": {
+      "content": "<lat_image>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}You are an expert radiology assistant tasked with interpreting a chest X-ray study.  {% for message in messages %}{% if message[\"role\"] == \"user\" %}USER:  {% else %}ASSISTANT: {% endif %}{% for item in message[\"content\"] %}{% if item[\"type\"] == \"text\" %}{{ item[\"text\"] }}{% elif item[\"type\"] == \"image\" %}<image>{% endif %}{% endfor %}{% if message[\"role\"] == \"user\" %}  {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<unk>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}