benwiesel committed on Apr 14

Commit

1ebe0da

verified ·

1 Parent(s): f8d4d25

Upload folder using huggingface_hub

Browse files

Files changed (21) hide show

README.md +0 -0
adapter_config.json +176 -0
adapter_model.safetensors +3 -0
added_tokens.json +3 -0
chat_template.jinja +180 -0
config.json +219 -0
configuration.py +65 -0
downsampling.py +426 -0
generation_config.json +7 -0
merges.txt +0 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling.py +955 -0
preprocessor_config.json +144 -0
processing.py +55 -0
processor_config.json +11 -0
special_tokens_map.json +33 -0
tokenizer.json +0 -0
tokenizer_config.json +796 -0
vocab.json +0 -0

README.md ADDED Viewed

File without changes

adapter_config.json ADDED Viewed

	@@ -0,0 +1,176 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Granite4VisionForConditionalGeneration",
+    "parent_library": "modeling"
+  },
+  "base_model_name_or_path": "granite-vision-dev/granite-4.1-3b-vision",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [
+    "model.downsampler.0",
+    "model.downsampler.1",
+    "model.downsampler.2",
+    "model.downsampler.3",
+    "model.multi_modal_projector.0",
+    "model.multi_modal_projector.1",
+    "model.multi_modal_projector.2",
+    "model.multi_modal_projector.3",
+    "model.image_newline"
+  ],
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 256,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "28.self_attn.v_proj",
+    "27.self_attn.q_proj",
+    "language_model.layers.7.self_attn.q_proj",
+    "language_model.layers.19.self_attn.k_proj",
+    "31.self_attn.k_proj",
+    "language_model.layers.25.self_attn.k_proj",
+    "language_model.layers.14.self_attn.v_proj",
+    "language_model.layers.12.self_attn.q_proj",
+    "28.self_attn.q_proj",
+    "language_model.layers.2.self_attn.q_proj",
+    "language_model.layers.2.self_attn.v_proj",
+    "language_model.layers.15.self_attn.k_proj",
+    "language_model.layers.14.self_attn.k_proj",
+    "language_model.layers.6.self_attn.v_proj",
+    "language_model.layers.21.self_attn.k_proj",
+    "language_model.layers.3.self_attn.q_proj",
+    "30.self_attn.v_proj",
+    "language_model.layers.8.self_attn.k_proj",
+    "27.self_attn.k_proj",
+    "language_model.layers.1.self_attn.k_proj",
+    "language_model.layers.18.self_attn.q_proj",
+    "down_proj",
+    "29.self_attn.v_proj",
+    "38.self_attn.v_proj",
+    "up_proj",
+    "language_model.layers.9.self_attn.k_proj",
+    "language_model.layers.11.self_attn.q_proj",
+    "language_model.layers.5.self_attn.k_proj",
+    "35.self_attn.k_proj",
+    "language_model.layers.25.self_attn.q_proj",
+    "language_model.layers.19.self_attn.v_proj",
+    "language_model.layers.13.self_attn.q_proj",
+    "33.self_attn.v_proj",
+    "language_model.layers.9.self_attn.v_proj",
+    "37.self_attn.k_proj",
+    "language_model.layers.24.self_attn.v_proj",
+    "33.self_attn.q_proj",
+    "31.self_attn.v_proj",
+    "gate_proj",
+    "34.self_attn.v_proj",
+    "language_model.layers.21.self_attn.v_proj",
+    "o_proj",
+    "language_model.layers.22.self_attn.v_proj",
+    "language_model.layers.26.self_attn.k_proj",
+    "language_model.layers.4.self_attn.q_proj",
+    "language_model.layers.26.self_attn.v_proj",
+    "language_model.layers.23.self_attn.q_proj",
+    "language_model.layers.21.self_attn.q_proj",
+    "language_model.layers.20.self_attn.q_proj",
+    "language_model.layers.23.self_attn.v_proj",
+    "language_model.layers.16.self_attn.q_proj",
+    "37.self_attn.v_proj",
+    "language_model.layers.5.self_attn.v_proj",
+    "language_model.layers.16.self_attn.v_proj",
+    "language_model.layers.7.self_attn.k_proj",
+    "language_model.layers.15.self_attn.v_proj",
+    "36.self_attn.v_proj",
+    "language_model.layers.16.self_attn.k_proj",
+    "language_model.layers.1.self_attn.q_proj",
+    "language_model.layers.4.self_attn.k_proj",
+    "language_model.layers.14.self_attn.q_proj",
+    "30.self_attn.q_proj",
+    "language_model.layers.19.self_attn.q_proj",
+    "language_model.layers.25.self_attn.v_proj",
+    "language_model.layers.13.self_attn.v_proj",
+    "language_model.layers.18.self_attn.k_proj",
+    "language_model.layers.0.self_attn.v_proj",
+    "language_model.layers.23.self_attn.k_proj",
+    "language_model.layers.10.self_attn.v_proj",
+    "language_model.layers.17.self_attn.q_proj",
+    "36.self_attn.q_proj",
+    "35.self_attn.v_proj",
+    "30.self_attn.k_proj",
+    "language_model.layers.8.self_attn.v_proj",
+    "language_model.layers.20.self_attn.k_proj",
+    "32.self_attn.q_proj",
+    "38.self_attn.q_proj",
+    "language_model.layers.15.self_attn.q_proj",
+    "language_model.layers.24.self_attn.q_proj",
+    "language_model.layers.10.self_attn.q_proj",
+    "34.self_attn.k_proj",
+    "language_model.layers.3.self_attn.v_proj",
+    "language_model.layers.11.self_attn.v_proj",
+    "language_model.layers.22.self_attn.k_proj",
+    "38.self_attn.k_proj",
+    "language_model.layers.7.self_attn.v_proj",
+    "39.self_attn.v_proj",
+    "language_model.layers.10.self_attn.k_proj",
+    "language_model.layers.13.self_attn.k_proj",
+    "language_model.layers.12.self_attn.k_proj",
+    "37.self_attn.q_proj",
+    "34.self_attn.q_proj",
+    "language_model.layers.22.self_attn.q_proj",
+    "33.self_attn.k_proj",
+    "28.self_attn.k_proj",
+    "language_model.layers.6.self_attn.k_proj",
+    "language_model.layers.6.self_attn.q_proj",
+    "language_model.layers.18.self_attn.v_proj",
+    "language_model.layers.0.self_attn.k_proj",
+    "36.self_attn.k_proj",
+    "language_model.layers.5.self_attn.q_proj",
+    "31.self_attn.q_proj",
+    "language_model.layers.3.self_attn.k_proj",
+    "language_model.layers.24.self_attn.k_proj",
+    "29.self_attn.k_proj",
+    "language_model.layers.2.self_attn.k_proj",
+    "language_model.layers.20.self_attn.v_proj",
+    "language_model.layers.17.self_attn.v_proj",
+    "language_model.layers.4.self_attn.v_proj",
+    "language_model.layers.0.self_attn.q_proj",
+    "language_model.layers.11.self_attn.k_proj",
+    "39.self_attn.q_proj",
+    "29.self_attn.q_proj",
+    "language_model.layers.1.self_attn.v_proj",
+    "language_model.layers.26.self_attn.q_proj",
+    "language_model.layers.9.self_attn.q_proj",
+    "language_model.layers.17.self_attn.k_proj",
+    "language_model.layers.12.self_attn.v_proj",
+    "27.self_attn.v_proj",
+    "32.self_attn.v_proj",
+    "35.self_attn.q_proj",
+    "32.self_attn.k_proj",
+    "language_model.layers.8.self_attn.q_proj",
+    "39.self_attn.k_proj"
+  ],
+  "target_parameters": [],
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdcddb6ba09c12b7ab299fbc6e805291e6744f2fee0be60e0fb281bd9b55cb56
+size 1328498208

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image>": 100352
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,180 @@

+{#- ===== Task tag prompt constants ===== -#}
+{%- set chart2code_prompt = "Generate code that recreates the chart as best as possible." -%}
+{%- set chart2csv_prompt = "Please examine this chart image. Consider you are a data visualization expert, and extract the data into a CSV table.\n\nYour CSV should:\n- Include a header row with clear column names\n- Represent all data series/categories shown in the chart\n- Use numeric values that match the chart as closely as possible\n\nOutput only the CSV data, nothing else." -%}
+{%- set chart2summary_prompt = "Can you describe this chart image?" -%}
+{%- set tables_json_prompt = "Identify and extract the tabls schema\n Extruct the schema of all the tables in the image sorted according to the reading order.\nThe output must be a valid JSON object containing a list of dictionaries with the following structure:\n\n                {\n                    \"dimensions\": {\n                        \"rows\": <number of data rows (excluding header rows)>,\n                        \"columns\": <number of columns>,\n                        \"header_rows\": <number of header rows>,\n                        \"total_rows\": <total number of rows including headers>\n                    },\n                    \"cells\": [\n                        {\n                        \"row\": <row index starting at 1>,\n                        \"col\": <column index starting at 1>,\n                        \"colspan\": <number of columns spanned>,\n                        \"rowspan\": <number of rows spanned>,\n                        \"type\": \"<'header' or 'data'>\",\n                        \"header_level\": <header nesting level if type=header, else omit or null>,\n                        \"content\": \"<string content of the cell>\"\n                        },\n                        ...\n                    ]\n                }" -%}
+{%- set tables_html_prompt = "Identify and extract the tabls schema\n Extruct the schema of all the tables in the image sorted according to the reading order.\nThe output must be a list of valid HTML tables" -%}
+{%- set tables_otsl_prompt = "Identify and extract the tabls schema\n Extruct the schema of all the tables in the image sorted according to the reading order.\nThe output must be a list of valid OTSL objects, each consists of the following fields: \n                        <fcel> - a cell with content in it\n                        <ecel> - an empty cell\n                        <lcel> - a cell that is merged with the cell to its left\n                        <ucel> - a cell that is merged with the cell above it\n                        <xcel> - a cell that is merged with both the cell above it and the cell to its left\n                        <nl> - a new line\n                        <ched> - a clumn header\n                        <otsl> - the beginning of the OTSL table\n                        </otsl> - the end of the OTSL table\n\n                        An example for an output:\n                        [\n                        <otsl><ched>first table header1<ched>first table header2<nl><fcel>data1<fcel>data2<nl><fcel>data with horizontal span<lcel><nl><fcell>data with vertical span<ecel><nl><ucel><fcel>data3<nl></otsl>,\n                        <otsl><ched>second table header1<ched>second table header2<nl><fcel>data1<fcel>data2<nl><fcel>data with horizontal span<lcel><nl><fcell>data with vertical span<ecel><nl><ucel><fcel>data3<nl></otsl>\n                        ]" -%}
+{#- ===== Tag expansion dispatcher ===== -#}
+{%- macro expand_tags(text) -%}
+{#- Expands a task tag found in `text` into the matching prompt constant defined above. -#}
+{#- Tags are tested in a fixed order (chart2code, chart2csv, chart2summary, tables_json, tables_html, tables_otsl); only the first match is used. -#}
+{#- When a tag matches, the output is the prompt plus at most one <image> placeholder; any other text in `text` is dropped. -#}
+{#- When no tag matches, `text` is emitted unchanged. -#}
+{%- set has_image = "<image>" in text -%}
+{#- Determine image position: prefix if <image> appears before the tag, suffix if after -#}
+{%- if has_image -%}
+  {%- set img_idx = text.index("<image>") -%}
+  {%- if "<chart2code>" in text -%}{%- set tag_idx = text.index("<chart2code>") -%}
+  {%- elif "<chart2csv>" in text -%}{%- set tag_idx = text.index("<chart2csv>") -%}
+  {%- elif "<chart2summary>" in text -%}{%- set tag_idx = text.index("<chart2summary>") -%}
+  {%- elif "<tables_json>" in text -%}{%- set tag_idx = text.index("<tables_json>") -%}
+  {%- elif "<tables_html>" in text -%}{%- set tag_idx = text.index("<tables_html>") -%}
+  {%- elif "<tables_otsl>" in text -%}{%- set tag_idx = text.index("<tables_otsl>") -%}
+  {%- else -%}{%- set tag_idx = 999999 -%}
+  {%- endif -%}
+  {#- 999999 is a no-tag sentinel; in that case the prefix/suffix computed below are never emitted because the untagged branch outputs `text` verbatim -#}
+  {#- Both comparisons use the FIRST occurrence of <image> and of the tag (str.index) -#}
+  {%- set img_prefix = "<image>\n" if img_idx < tag_idx else "" -%}
+  {%- set img_suffix = "\n<image>" if img_idx >= tag_idx else "" -%}
+{%- else -%}
+  {%- set img_prefix = "" -%}
+  {%- set img_suffix = "" -%}
+{%- endif -%}
+{#- Dispatch: same tag order as the index lookup above, so the tag whose position was measured is the one expanded -#}
+{%- if "<chart2code>" in text -%}
+  {{- img_prefix + chart2code_prompt + img_suffix -}}
+{%- elif "<chart2csv>" in text -%}
+  {{- img_prefix + chart2csv_prompt + img_suffix -}}
+{%- elif "<chart2summary>" in text -%}
+  {{- img_prefix + chart2summary_prompt + img_suffix -}}
+{%- elif "<tables_json>" in text -%}
+  {{- img_prefix + tables_json_prompt + img_suffix -}}
+{%- elif "<tables_html>" in text -%}
+  {{- img_prefix + tables_html_prompt + img_suffix -}}
+{%- elif "<tables_otsl>" in text -%}
+  {{- img_prefix + tables_otsl_prompt + img_suffix -}}
+{%- else -%}
+  {{- text -}}
+{%- endif -%}
+{%- endmacro -%}
+{#- ===== Original chat template ===== -#}
+{% macro render_content(x) %}
+    {%- if x is string %}
+        {{ x }}
+    {%- else %}
+        {%- for chunk in x %}
+            {%- if chunk['type'] == 'text' -%}
+                {{ chunk['text']}}
+            {%- elif chunk['type'] == 'image' -%}
+                {{- "<image>
+" }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {% endmacro %}
+    {%- set tools_system_message_prefix = 'You are a helpful assistant with access to the following tools. You may call one or more tools to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>'  %}
+{%- set tools_system_message_suffix = '\n</tools>\n\nFor each tool call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.' %}
+{%- set documents_system_message_prefix = 'You are a helpful assistant with access to the following documents. You may use one or more documents to assist with the user query.\n\nYou are given a list of documents within <documents></documents> XML tags:\n<documents>' %}
+{%- set documents_system_message_suffix = '\n</documents>\n\nWrite the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.' %}
+{%- set g4_default_system_message = 'You are a helpful assistant. Please ensure responses are professional, accurate, and safe.' %}
+{%- if available_tools is defined and available_tools %}
+    {%- set tools = available_tools %}
+{%- endif %}
+{%- set ns = namespace(tools_system_message=tools_system_message_prefix,
+                       documents_system_message=documents_system_message_prefix,
+                       default_system_message=g4_default_system_message,
+                       system_message=''
+                       ) %}
+{%- if tools %}
+    {%- for tool in tools %}
+        {%- set ns.tools_system_message = ns.tools_system_message + '\n' + (tool | tojson) %}
+    {%- endfor %}
+    {%- set ns.tools_system_message = ns.tools_system_message + tools_system_message_suffix %}
+{%- else %}
+    {%- set ns.tools_system_message = '' %}
+{%- endif %}
+{%- if documents %}
+    {%- for document in documents %}
+        {%- set ns.documents_system_message = ns.documents_system_message + '\n' + (document | tojson) %}
+    {%- endfor %}
+    {%- set ns.documents_system_message = ns.documents_system_message + documents_system_message_suffix %}
+{%- else %}
+    {%- set ns.documents_system_message = '' %}
+{%- endif %}
+{%- if messages[0].role == 'system' %}
+    {%- if messages[0].content is string %}
+        {%- set ns.system_message = messages[0].content %}
+    {%- elif messages[0].content is iterable %}
+        {%- for entry in messages[0].content %}
+            {%- if entry.type== 'text' %}
+                {%- if ns.system_message != '' %}
+                    {%- set ns.system_message = ns.system_message + '\n' %}
+                {%- endif %}
+                {%- set ns.system_message = ns.system_message + entry.text %}
+            {%- endif %}
+        {%- endfor %}
+    {%- endif %}
+    {%- if tools and documents %}
+        {%- set ns.system_message = ns.system_message + '\n\n' +  ns.tools_system_message + '\n\n' + ns.documents_system_message %}
+    {%- elif tools %}
+        {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message %}
+    {%- elif documents %}
+        {%- set ns.system_message = ns.system_message + '\n\n' + ns.documents_system_message %}
+    {%- endif %}
+{%- else %}
+    {%- if tools and documents %}
+        {%- set ns.system_message = ns.tools_system_message + '\n\n' + ns.documents_system_message  %}
+    {%- elif tools %}
+        {%- set ns.system_message = ns.tools_system_message %}
+    {%- elif documents %}
+        {%- set ns.system_message = ns.documents_system_message %}
+    {%- endif %}
+{%- endif %}
+{%- if ns.system_message %}
+    {{- '<|start_of_role|>system<|end_of_role|>' + ns.system_message + '<|end_of_text|>\n' }}
+{%- else %}
+    {{- '<|start_of_role|>system<|end_of_role|>' + ns.default_system_message + '<|end_of_text|>\n' }}
+{%- endif %}
+{%- for message in messages %}
+    {%- set content = namespace(val='') %}
+    {%- if render_content(message['content']) is string %}
+        {%- set content.val = render_content(message['content']) %}
+    {%- else %}
+        {%- if render_content(message['content']) is iterable %}
+            {%- for entry in render_content(message['content']) %}
+                {%- if entry.type== 'text' %}
+                    {%- if content.val != '' %}
+                        {%- set content.val = content.val + '\n' %}
+                    {%- endif %}
+                    {%- set content.val = content.val + entry.text %}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+    {%- endif %}
+    {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) %}
+        {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + expand_tags(content.val) + '<|end_of_text|>\n' }}
+    {%- elif message.role == 'assistant' %}
+        {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val }}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content.val) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|end_of_text|>\n' }}
+    {%- elif message.role == 'tool' %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
+            {{- '<|start_of_role|>user<|end_of_role|>' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content.val }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
+            {{- '<|end_of_text|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_of_role|>assistant<|end_of_role|>' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,219 @@

+{
+  "architectures": [
+    "Granite4VisionForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration.Granite4VisionConfig",
+    "AutoModel": "modeling.Granite4VisionForConditionalGeneration",
+    "AutoModelForVision2Seq": "modeling.Granite4VisionForConditionalGeneration",
+    "AutoModelForImageTextToText": "modeling.Granite4VisionForConditionalGeneration",
+    "AutoProcessor": "processing.Granite4VisionProcessor"
+  },
+  "checkerboard_llm_layers": [
+    12,
+    15,
+    18,
+    21
+  ],
+  "checkerboard_stride": 2,
+  "checkerboard_vision_layer": -1,
+  "dave_encoder": null,
+  "downsample_method": "window_qformer",
+  "downsample_rate": "4/8",
+  "dtype": "bfloat16",
+  "image_grid_pinpoints": [
+    [
+      384,
+      384
+    ],
+    [
+      384,
+      768
+    ],
+    [
+      384,
+      1152
+    ],
+    [
+      384,
+      1536
+    ],
+    [
+      384,
+      1920
+    ],
+    [
+      384,
+      2304
+    ],
+    [
+      384,
+      2688
+    ],
+    [
+      384,
+      3072
+    ],
+    [
+      384,
+      3456
+    ],
+    [
+      384,
+      3840
+    ],
+    [
+      768,
+      384
+    ],
+    [
+      768,
+      768
+    ],
+    [
+      768,
+      1152
+    ],
+    [
+      768,
+      1536
+    ],
+    [
+      768,
+      1920
+    ],
+    [
+      1152,
+      384
+    ],
+    [
+      1152,
+      768
+    ],
+    [
+      1152,
+      1152
+    ],
+    [
+      1536,
+      384
+    ],
+    [
+      1536,
+      768
+    ],
+    [
+      1920,
+      384
+    ],
+    [
+      1920,
+      768
+    ],
+    [
+      2304,
+      384
+    ],
+    [
+      2688,
+      384
+    ],
+    [
+      3072,
+      384
+    ],
+    [
+      3456,
+      384
+    ],
+    [
+      3840,
+      384
+    ]
+  ],
+  "image_seq_length": 576,
+  "image_token_index": 100352,
+  "initializer_range": 0.02,
+  "model_type": "granite4_vision",
+  "multimodal_projector_bias": true,
+  "pretrained_language_model": "",
+  "pretrained_vision_tower": "",
+  "projector_dropout": 0.1,
+  "projector_hidden_act": "gelu",
+  "simplified_qformer": false,
+  "text_config": {
+    "_name_or_path": "/proj/mmfm/users/avihu/dmf/granite-4.1-3b/r260401a/",
+    "architectures": [
+      "GraniteForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attention_multiplier": 0.015625,
+    "bos_token_id": 100257,
+    "dtype": "bfloat16",
+    "embedding_multiplier": 12,
+    "eos_token_id": 100257,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.1,
+    "intermediate_size": 8192,
+    "logits_scaling": 10,
+    "max_position_embeddings": 131072,
+    "mlp_bias": false,
+    "model_type": "granite",
+    "num_attention_heads": 40,
+    "num_hidden_layers": 40,
+    "num_key_value_heads": 8,
+    "pad_token_id": 100256,
+    "residual_multiplier": 0.22,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "rope_theta": 10000000,
+    "tie_word_embeddings": true,
+    "use_cache": false,
+    "vocab_size": 100353
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.3",
+  "use_checkerboard_sampling": true,
+  "use_image_newline_parameter": true,
+  "use_quadrant_sampling": false,
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 384,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 16
+  },
+  "vision_feature_layer": [
+    -24,
+    -20,
+    -12,
+    -1
+  ],
+  "vision_feature_select_strategy": "full",
+  "vision_layer_to_llm_layer": [
+    [
+      -19,
+      9
+    ],
+    [
+      -13,
+      6
+    ],
+    [
+      -7,
+      3
+    ],
+    [
+      -1,
+      0
+    ]
+  ]
+}

configuration.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from typing import Optional
+import logging
+from transformers import LlavaNextConfig
+logger = logging.getLogger(__name__)
+class Granite4VisionConfig(LlavaNextConfig):
+    model_type = "granite4_vision"
+    def __init__(
+        self,
+        pretrained_vision_tower: str = "",
+        pretrained_language_model: str = "",
+        downsample_rate=None,
+        downsample_method="interpolate",
+        use_image_newline_parameter=True,
+        simplified_qformer=True,
+        dave_encoder=None,
+        vision_layer_to_llm_layer: Optional[list] = None,
+        use_checkerboard_sampling: bool = False,
+        checkerboard_stride: int = 2,
+        checkerboard_vision_layer: int = -1,
+        checkerboard_llm_layers: Optional[list] = None,
+        use_quadrant_sampling=False,
+        projector_dropout=0.1,
+        **kwargs
+    ):
+        self.pretrained_vision_tower = pretrained_vision_tower
+        self.pretrained_language_model = pretrained_language_model
+        self.downsample_method = downsample_method
+        self.downsample_rate = downsample_rate
+        self.use_image_newline_parameter = use_image_newline_parameter
+        self.dave_encoder = dave_encoder
+        self.projector_dropout = projector_dropout
+        # List of tuples mapping vision tower layer indices to LLM layer indices
+        # e.g., [(-8, 0), (-1, 0)] extracts vision layers -8 and -1, both inject at LLM input (layer 0)
+        # e.g., [(-8, 8), (-1, 0)] extracts vision layers -8 and -1, inject at LLM layers 8 and 0 respectively
+        # e.g., [(-1, 0), (-1, 4)] extracts vision layer -1 twice with different projectors, inject at layers 0 and 4
+        # None means use default single-layer behavior (vision_feature_layer)
+        if vision_layer_to_llm_layer is not None:
+            self.vision_layer_to_llm_layer = [(int(v), int(l)) for v, l in vision_layer_to_llm_layer]
+            # Validate for redundant (vision_layer, llm_layer) pairs
+            assert len(self.vision_layer_to_llm_layer) == len(set(self.vision_layer_to_llm_layer)), "expecting no duplicates"
+        else:
+            self.vision_layer_to_llm_layer = None
+        # Checkerboard sampling configuration
+        # When enabled, extracts 4 groups from a vision layer using spatial sampling
+        # Each group uses a different offset in a sampling pattern
+        self.use_checkerboard_sampling = use_checkerboard_sampling
+        self.checkerboard_stride = checkerboard_stride  # Stride for sampling (e.g., 4 means sample every 4th position)
+        self.checkerboard_vision_layer = checkerboard_vision_layer  # Which vision layer to apply this to (e.g., -1)
+        self.checkerboard_llm_layers = checkerboard_llm_layers or [0, 10, 20, 30]  # LLM layers for the 4 groups
+        self.simplified_qformer = simplified_qformer
+        # Sampling strategy: False = block sampling (full coverage, local continuity)
+        #                    True = quadrant sampling (maximum continuity, limited coverage)
+        self.use_quadrant_sampling = bool(use_quadrant_sampling)
+        # Note: With list of tuples, we now allow multiple projections to the same LLM layer
+        # So we remove the conflict validation that prevented this
+        super().__init__(**kwargs)
+class Granite4VisionConfigNaflex(Granite4VisionConfig):
+    """Variant of ``Granite4VisionConfig`` that only overrides the ``model_type`` identifier."""
+    model_type = "granite4_vision_naflex"

downsampling.py ADDED Viewed

	@@ -0,0 +1,426 @@

+from typing import Any
+import torch
+from torch import nn
+import math
+from fractions import Fraction
+from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig
+from transformers.models.blip_2.modeling_blip_2 import Blip2QFormerModel
+import torch.nn.functional as F
+class QFormerCrossAttention(nn.Module):
+    """Multi-headed cross-attention for QFormer with SDPA/Flash Attention support"""
+    def __init__(self, hidden_size, num_heads, attn_bias=False, attention_dropout=0.05, final_dropout=0.05):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.attention_dropout = attention_dropout
+        if self.head_dim * num_heads != hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {hidden_size} "
+                f"and `num_heads`: {num_heads})."
+            )
+        # Q from queries, K and V from encoder
+        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
+        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
+        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
+        self.o_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
+        self.dropout = nn.Dropout(final_dropout)
+    def forward(self, hidden_states, encoder_hidden_states, attention_mask=None):
+        """
+        Args:
+            hidden_states: (B, query_len, hidden_size) - queries
+            encoder_hidden_states: (B, encoder_len, hidden_size) - keys and values
+            attention_mask: optional attention mask
+        Returns:
+            (B, query_len, hidden_size)
+        """
+        batch_size, query_len, _ = hidden_states.shape
+        encoder_len = encoder_hidden_states.shape[1]
+        # Project queries from hidden_states
+        query_states = self.q_proj(hidden_states).view(
+            batch_size, query_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        # Project keys and values from encoder_hidden_states
+        key_states = self.k_proj(encoder_hidden_states).view(
+            batch_size, encoder_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        value_states = self.v_proj(encoder_hidden_states).view(
+            batch_size, encoder_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        # Use PyTorch's scaled_dot_product_attention (SDPA)
+        # This automatically uses Flash Attention when available
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=False,
+        )
+        # Reshape back to (B, query_len, hidden_size)
+        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, query_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        attn_output = self.dropout(attn_output)
+        return attn_output
+class QFormerMLP(nn.Module):
+    """Feed-forward network (MLP) for QFormer with SiLU activation"""
+    def __init__(self, hidden_size, mlp_hidden_size, mlp_bias=False, dropout_prob=0.05):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.fc1 = nn.Linear(hidden_size, mlp_hidden_size, bias=mlp_bias)
+        self.act = nn.SiLU()
+        self.fc2 = nn.Linear(mlp_hidden_size, hidden_size, bias=mlp_bias)
+        self.dropout = nn.Dropout(dropout_prob)
+    def forward(self, hidden_states):
+        """
+        Args:
+            hidden_states: (B, seq_len, hidden_size)
+        Returns:
+            (B, seq_len, hidden_size)
+        """
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dropout(self.fc2(hidden_states))
+        return hidden_states
+class SimplifiedQFormer(nn.Module):
+    """
+    Simplified QFormer with a single cross-attention layer followed by an MLP.
+    Lightweight design: queries attend to encoder hidden states via cross-attention,
+    then pass through a feed-forward network, similar to a transformer block.
+    """
+    def __init__(self, hidden_size, num_heads=8, mlp_hidden_size=2048, mlp_bias=False, attn_bias=False, eps=1e-6):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        # Cross-attention block
+        self.attn_norm = nn.LayerNorm(hidden_size, eps=eps)
+        self.cross_attention = QFormerCrossAttention(
+            hidden_size, num_heads, attn_bias=attn_bias,
+        )
+        # MLP block (feed-forward network)
+        self.mlp_norm = nn.LayerNorm(hidden_size, eps=eps)
+        self.mlp = QFormerMLP(hidden_size, mlp_hidden_size, mlp_bias=mlp_bias)
+    def forward(self, query_embeds, encoder_hidden_states):
+        """
+        Args:
+            query_embeds: (B, num_queries, hidden_size) - learnable queries
+            encoder_hidden_states: (B, num_tokens, hidden_size) - input features
+        Returns:
+            (B, num_queries, hidden_size) - output features
+        """
+        # Cross-attention block with residual and pre-norm
+        residual = query_embeds
+        hidden_states = self.attn_norm(query_embeds)
+        hidden_states = self.cross_attention(hidden_states, encoder_hidden_states)
+        hidden_states = residual + hidden_states
+        # MLP block with residual and pre-norm
+        residual = hidden_states
+        hidden_states = self.mlp_norm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+class InterpolateDownsampler:
+    def __init__(self, config, mode="area"):
+        self.orig_image_side = config.vision_config.image_size // config.vision_config.patch_size
+        self.new_image_side = int(self.orig_image_side * Fraction(config.downsample_rate))
+        self.mode = mode
+    def __call__(self, image_features):
+        batch_size, _, dim = image_features.size()
+        up_shape = [batch_size] + [self.orig_image_side] * 2 + [dim]
+        # interpolate expects B,C,H,W
+        large_image_permuted = image_features.view(up_shape).permute(0,3,1,2)
+        small_image_permuted = torch.nn.functional.interpolate(
+                large_image_permuted, size=(self.new_image_side, self.new_image_side),
+                mode=self.mode,
+        )
+        # back to B,H*W,C
+        final = small_image_permuted.permute(0,2,3,1).flatten(1,2)
+        return final
+class SpatialOffsetDownsampler:
+    """
+    Downsampler that samples with local block continuity pattern.
+    Instead of global strided [1,0,1,0], creates local 2x2 blocks where sampling
+    creates continuity: within each 2x2 block, adjacent samples are spatially adjacent.
+    """
+    def __init__(self, config, offset=0):
+        """
+        Args:
+            config: Model configuration
+            offset: Integer offset (0, 1, 2, or 3) for position within each 2x2 block
+                   0: top-left, 1: top-right, 2: bottom-left, 3: bottom-right
+        """
+        self.orig_image_side = config.vision_config.image_size // config.vision_config.patch_size
+        self.new_image_side = self.orig_image_side // 2  # downsample by 2x
+        self.offset = offset
+        # Map offset to position within 2x2 blocks
+        self.offsets = [(0, 0), (0, 1), (1, 0), (1, 1)]
+        self.offset_h, self.offset_w = self.offsets[offset]
+    def __call__(self, image_features):
+        """
+        Extract features by sampling one position from each 2x2 block across the image.
+        This maintains full spatial coverage while creating local continuity.
+        For a 4x4 image with offset=0 (top-left of each 2x2 block):
+        Original:        Sampled (raster order):
+        [A B | C D]      [A C]
+        [E F | G H]  ->  [I K]
+        [---+---]
+        [I J | K L]
+        [M N | O P]
+        Result in sequence: [A, C, I, K] - maintains spatial structure
+        Args:
+            image_features: Tensor of shape [batch, height*width, hidden_dim]
+        Returns:
+            Downsampled features of shape [batch, (height/2)*(width/2), hidden_dim]
+        """
+        batch_size, seq_len, hidden_dim = image_features.shape
+        # Reshape to [batch, height, width, hidden_dim]
+        features_2d = image_features.reshape(batch_size, self.orig_image_side, self.orig_image_side, hidden_dim)
+        # Reshape into 2x2 blocks: [batch, n_blocks_h, 2, n_blocks_w, 2, hidden_dim]
+        n_blocks = self.new_image_side
+        features_blocks = features_2d.reshape(
+            batch_size, n_blocks, 2, n_blocks, 2, hidden_dim
+        )
+        # Select the specified position from each 2x2 block
+        # This maintains spatial coverage while creating local continuity
+        sampled = features_blocks[:, :, self.offset_h, :, self.offset_w, :]
+        # Flatten spatial dimensions back to [batch, n_blocks*n_blocks, hidden_dim]
+        sampled = sampled.reshape(batch_size, -1, hidden_dim)
+        return sampled
+class SpatialQuadrantDownsampler:
+    """
+    Alternative downsampler that samples contiguous spatial quadrants.
+    Takes a full quadrant of the image rather than sampling across the entire image.
+    This creates maximum local continuity but only covers 1/4 of the spatial extent.
+    Use case: When you want queries to focus on a specific region with maximum
+    local coherence, trading off global spatial coverage.
+    """
+    def __init__(self, config, offset=0):
+        """
+        Args:
+            config: Model configuration
+            offset: Integer offset (0, 1, 2, or 3) for quadrant selection
+                   0: top-left, 1: top-right, 2: bottom-left, 3: bottom-right
+        """
+        self.orig_image_side = config.vision_config.image_size // config.vision_config.patch_size
+        self.new_image_side = self.orig_image_side // 2  # downsample by 2x
+        self.offset = offset
+        # Map offset to quadrant starting positions
+        self.offsets = [
+            (0, 0),  # top-left
+            (0, self.new_image_side),  # top-right
+            (self.new_image_side, 0),  # bottom-left
+            (self.new_image_side, self.new_image_side)  # bottom-right
+        ]
+        self.start_h, self.start_w = self.offsets[offset]
+    def __call__(self, image_features):
+        """
+        Extract a contiguous quadrant from the image.
+        For a 4x4 image with offset=0 (top-left quadrant):
+        Original:        Sampled:
+        [A B | C D]      [A B]
+        [E F | G H]  ->  [E F]
+        [---+---]
+        [I J | K L]
+        [M N | O P]
+        Result in sequence: [A, B, E, F] - maximum local continuity
+        Args:
+            image_features: Tensor of shape [batch, height*width, hidden_dim]
+        Returns:
+            Downsampled features of shape [batch, (height/2)*(width/2), hidden_dim]
+        """
+        batch_size, seq_len, hidden_dim = image_features.shape
+        # Reshape to [batch, height, width, hidden_dim]
+        features_2d = image_features.reshape(batch_size, self.orig_image_side, self.orig_image_side, hidden_dim)
+        # Extract contiguous quadrant
+        sampled = features_2d[:, self.start_h:self.start_h + self.new_image_side,
+                              self.start_w:self.start_w + self.new_image_side, :]
+        # Flatten spatial dimensions back to [batch, new_height*new_width, hidden_dim]
+        sampled = sampled.reshape(batch_size, -1, hidden_dim)
+        return sampled
+class WindowQFormerDownsampler(nn.Module):
+    """
+    Window-wise QFormer that compresses vision features and projects them to the LLM width.
+    The (image_side x image_side) patch grid is cut into windows of window_side x window_side
+    tokens. A downsampled copy of the same features is cut into matching windows of
+    query_side x query_side tokens, added to a learned query, and used as the QFormer queries
+    that attend to the tokens of the corresponding full-resolution window. The per-window
+    outputs are stitched back into raster order and linearly projected to the text hidden size.
+    `config.downsample_rate` is parsed as the string "query_side/window_side".
+    """
+    def __init__(self, config, checkerboard_offset=None, use_quadrant_sampling=False):
+        """
+        Args:
+            config: Model configuration; reads text_config.hidden_size, vision_config
+                (hidden_size, image_size, patch_size), projector_dropout, simplified_qformer
+                and downsample_rate
+            checkerboard_offset: If None, queries are initialised from an interpolated copy of
+                the features; otherwise an offset (0-3) handed to a spatial sampler
+            use_quadrant_sampling: Only used when checkerboard_offset is given; selects the
+                quadrant sampler instead of the 2x2-block sampler
+        """
+        super().__init__()
+        llm_hidden_size = config.text_config.hidden_size
+        vision_hidden_size = config.vision_config.hidden_size
+        # Dropout rates for robustness (conservative approach)
+        self.dropout = nn.Dropout(config.projector_dropout)
+        # Choose downsampler based on parameters
+        if checkerboard_offset is not None:
+            if use_quadrant_sampling:
+                # Use quadrant sampling: maximum local continuity, limited spatial coverage
+                self.downsampler = SpatialQuadrantDownsampler(config, offset=checkerboard_offset)
+            else:
+                # Use block sampling: balanced continuity and full spatial coverage (default)
+                self.downsampler = SpatialOffsetDownsampler(config, offset=checkerboard_offset)
+        else:
+            self.downsampler = InterpolateDownsampler(config)
+        self.use_simplified_qformer = config.simplified_qformer
+        # Choose between SimplifiedQFormer and Blip2QFormerModel
+        if self.use_simplified_qformer:
+            # Use our simplified QFormer: one cross-attention block followed by an MLP
+            self.qformer = SimplifiedQFormer(
+                hidden_size=vision_hidden_size,
+                num_heads=vision_hidden_size // 64,
+                mlp_hidden_size=3072,
+                mlp_bias=True,
+                attn_bias=True
+            )
+        else:
+            # Use original Blip2QFormerModel with cross-attention
+            configuration = Blip2QFormerConfig(
+                hidden_size=vision_hidden_size,
+                num_attention_heads=vision_hidden_size // 64,
+                intermediate_size=3072,
+                num_hidden_layers=1,
+                encoder_hidden_size=vision_hidden_size,
+                cross_attention_frequency=1,
+                max_position_embeddings=2048,
+                use_qformer_text_input=False,
+            )
+            self.qformer = Blip2QFormerModel(configuration)
+        self.image_side = config.vision_config.image_size // config.vision_config.patch_size
+        # downsample_rate is a "q/w" string: q queries per side for every window of w tokens per side
+        q, w = config.downsample_rate.split("/")
+        self.query_side, self.window_side = int(q), int(w)
+        # query length is a perfect square (query_side ** 2) for seamless integration with llava next
+        self.query_length = self.query_side ** 2
+        embed_std = 1 / math.sqrt(vision_hidden_size)
+        self.norm = nn.LayerNorm(vision_hidden_size, eps=1e-6)
+        # Learned query shared by every window; broadcast over the window batch in forward()
+        self.query = nn.Parameter(torch.randn(1, self.query_length, vision_hidden_size) * embed_std)
+        # qformer model doesn't have positional embeddings, adding to the flat patches
+        self.image_positions = nn.Parameter(torch.randn(1, self.window_side ** 2, vision_hidden_size) * embed_std)
+        self.out_linear = nn.Linear(vision_hidden_size, llm_hidden_size, bias=True)
+    def _win(self, x, side, win):
+        """
+        (B, side*side, C) raster -> (B*n*n, win*win, C) where n=side//win
+        windows are raster-ordered, and tokens inside each window are raster-ordered.
+        """
+        B, _, C = x.shape
+        n = side // win
+        return (
+            x.view(B, side, side, C)
+            .view(B, n, win, n, win, C)
+            .transpose(2, 3)          # (B, n, n, win, win, C)
+            .flatten(0, 2)            # (B*n*n, win, win, C)
+            .flatten(1, 2)            # (B*n*n, win*win, C)
+        )
+    def _unwin(self, xw, n, win):
+        """
+        (B*n*n, win*win, C) -> (B, (n*win)^2, C) raster
+        Inverse of `_win`: n is the number of windows per side, win the window side length.
+        """
+        Bnn, _, C = xw.shape
+        # The leading dimension must hold a whole number of images (n*n windows each)
+        assert Bnn % (n * n) == 0
+        B = Bnn // (n * n)
+        side = n * win
+        return (
+            xw.view(B, n, n, win, win, C)
+            .transpose(2, 3)                 # (B, n, win, n, win, C)
+            .contiguous()
+            .view(B, side, side, C)
+            .flatten(1, 2)
+        )
+    def forward(self, image_features):
+        """
+        Args:
+            image_features: (B, image_side**2, C) vision features in raster order
+        Returns:
+            (B, (n*query_side)**2, llm_hidden_size) with n = image_side // window_side
+        """
+        B, HW, C = image_features.shape
+        assert HW == self.image_side * self.image_side
+        # Number of windows along one side of the patch grid
+        n = self.image_side // self.window_side
+        image_features = self.norm(image_features)
+        enc = self._win(image_features, self.image_side, self.window_side)  # (B*n^2, w^2, C)
+        # Apply downsampling (either spatial offset or interpolation)
+        downsampled = self.downsampler(image_features)  # (B, new_side^2, C) raster
+        # NOTE(review): the downsampler's output side is assumed to equal n * query_side;
+        # nothing here checks that its own configuration agrees with downsample_rate.
+        new_side = n * self.query_side
+        downsampled_w = self._win(downsampled, new_side, self.query_side)  # (B*n^2, q^2, C)
+        # Apply QFormer based on the chosen mechanism
+        if self.use_simplified_qformer:
+            # SimplifiedQFormer: queries cross-attend to the tokens of their window
+            # self.query has a leading dim of 1 and broadcasts over the B*n^2 windows
+            # Apply dropout to embeddings for robustness
+            query_embeds = self.dropout(self.query + downsampled_w)
+            encoder_embeds = self.dropout(enc + self.image_positions)
+            out_w = self.qformer(
+                query_embeds=query_embeds,
+                encoder_hidden_states=encoder_embeds
+            )  # (B*n^2, q^2, C)
+        else:
+            # Blip2QFormerModel: cross-attention mechanism
+            # Apply dropout to the encoder embeddings only (see the note on the queries below)
+            query_embeds = self.query + downsampled_w # blip already dropouts the queries
+            encoder_embeds = self.dropout(enc + self.image_positions)
+            out_w = self.qformer(
+                query_embeds=query_embeds,
+                encoder_hidden_states=encoder_embeds,
+                return_dict=True,
+            ).last_hidden_state  # (B*n^2, q^2, C)
+        out = self._unwin(out_w, n=n, win=self.query_side)  # (B, new_side^2, C) raster
+        # Apply output dropout before final projection
+        out = self.dropout(out)
+        return self.out_linear(out)

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 100257,
+  "eos_token_id": 100257,
+  "pad_token_id": 100256,
+  "transformers_version": "4.57.3"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:656dafec476cf8227a1c77b4f4ce7905776dca3e09c6a9cddc06657edc8b81a9
+size 4963943712

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b04b421e73d926f897cc21ba00bdf7b382f426793322f476d7952db9c0307f75
+size 3544424944

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling.py ADDED Viewed

	@@ -0,0 +1,955 @@

+import math
+from dataclasses import dataclass
+from fractions import Fraction
+from typing import Optional, Union
+import numpy as np
+import torch
+from torch import nn
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    LlavaNextForConditionalGeneration,
+)
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.models.granitemoehybrid.modeling_granitemoehybrid import (
+    HybridMambaAttentionDynamicCache,
+    MoeModelOutputWithPast,
+)
+from transformers.models.llava_next.modeling_llava_next import (
+    LlavaNextCausalLMOutputWithPast,
+    LlavaNextModelOutputWithPast,
+    LlavaNextPreTrainedModel,
+    get_anyres_image_grid_shape,
+    image_size_to_num_patches,
+    unpad_image,
+)
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, can_return_tuple, logging
+from .configuration import Granite4VisionConfig
+from .downsampling import InterpolateDownsampler, WindowQFormerDownsampler
+IGNORE_INDEX = -100
+logger = logging.get_logger(__name__)
+# Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextMultiModalProjector
+# Modified to handle vision_layer_to_llm_layer config
+class Granite4VisionMultiModalProjector(nn.Module):
+    def __init__(self, config: Granite4VisionConfig):
+        super().__init__()
+        # When using vision_layer_to_llm_layer, each projector handles a single vision layer
+        # Otherwise, use the original logic that can handle concatenated multi-layer features
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size,
+            config.text_config.hidden_size,
+            bias=config.multimodal_projector_bias,
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+@dataclass
+class Granite4VisionModelOutputWithPast(LlavaNextModelOutputWithPast):
+    r"""
+    Output of the Granite4 vision base model: the LlavaNext model output plus one extra field.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    balancing_loss (`torch.FloatTensor`, *optional*):
+        Extra loss tensor carried alongside the outputs; `None` unless the producer sets it.
+        NOTE(review): presumably an auxiliary load-balancing loss from the language model - confirm.
+    """
+    balancing_loss: Optional[torch.FloatTensor] = None
+@dataclass
+class Granite4VisionCausalLMOutputWithPast(LlavaNextCausalLMOutputWithPast):
+    r"""
+    Output of the Granite4 vision causal LM: the LlavaNext causal-LM output plus one extra field.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    balancing_loss (`torch.FloatTensor`, *optional*):
+        Extra loss tensor carried alongside the outputs; `None` unless the producer sets it.
+        NOTE(review): presumably an auxiliary load-balancing loss from the language model - confirm.
+    """
+    balancing_loss: Optional[torch.FloatTensor] = None
+class ParamWrapper(nn.Module):
+    """
+    Minimal module that stores a single object under the attribute `param`.
+    When `param` is an `nn.Parameter`, `nn.Module` attribute assignment registers it, so the
+    parameter is reachable as a submodule entry (`<name>.param`) instead of a bare parameter.
+    """
+    def __init__(self, param):
+        super().__init__()
+        self.param = param
+class Granite4VisionForConditionalGeneration(LlavaNextForConditionalGeneration):
+    """
+    Granite4 vision-language model with a language-modeling head.
+    Wraps a `Granite4VisionModel` (vision tower, downsamplers/projectors and language model)
+    and an `lm_head`. Logits are divided by `config.text_config.logits_scaling`.
+    """
+    config_class = Granite4VisionConfig
+    def __init__(self, config: Granite4VisionConfig):
+        """
+        Build the model and optionally load pretrained sub-models.
+        If `config.pretrained_vision_tower` / `config.pretrained_language_model` are set, the
+        matching sub-config is refreshed from that checkpoint, the weights are loaded, and the
+        config field is then reset to "" (so the reference is not stored in the config afterwards).
+        If `config.dave_encoder` is set, it is loaded as a state dict into the vision model.
+        """
+        # Update config with pretrained models if specified
+        if config.pretrained_vision_tower:
+            config.vision_config = AutoConfig.from_pretrained(
+                config.pretrained_vision_tower, **config.vision_config.to_dict()
+            )
+            # Some checkpoints expose a composite config; keep only its vision part
+            config.vision_config = (
+                config.vision_config.vision_config
+                if hasattr(config.vision_config, "vision_config")
+                else config.vision_config
+            )
+        if config.pretrained_language_model:
+            config.text_config = AutoConfig.from_pretrained(
+                config.pretrained_language_model, **config.text_config.to_dict()
+            )
+        # Initialize parent
+        # (the grandparent initialiser is called directly, skipping
+        # LlavaNextForConditionalGeneration.__init__, so the custom model below is the only one built)
+        LlavaNextPreTrainedModel.__init__(self, config)
+        # Create custom model instance
+        self.model = Granite4VisionModel(config)
+        # Create lm_head
+        self.lm_head = nn.Linear(
+            config.text_config.hidden_size, config.text_config.vocab_size, bias=False
+        )
+        # Load pretrained components if specified
+        if config.pretrained_vision_tower:
+            self._load_pretrained_vision_tower(config)
+            config.pretrained_vision_tower = ""
+        if config.dave_encoder:
+            # NOTE(review): torch.load unpickles the file; only point this at trusted checkpoints
+            # (or consider weights_only=True). The checkpoint must hold the state dict under "model".
+            dave_state_dict = torch.load(config.dave_encoder, map_location="cpu")["model"]
+            self.model.vision_tower.vision_model.load_state_dict(dave_state_dict)
+        if config.pretrained_language_model:
+            self._load_pretrained_language_model(config)
+            config.pretrained_language_model = ""
+        self.post_init()
+    def _load_pretrained_vision_tower(self, config):
+        """Load pretrained vision tower weights"""
+        # NOTE(review): progress is reported with print() although a module-level logger exists
+        print(f"Loading vision tower from: {config.pretrained_vision_tower}")
+        vision_tower = AutoModel.from_pretrained(
+            config.pretrained_vision_tower,
+            attn_implementation="flash_attention_2",
+            device_map="cpu",
+            dtype=torch.bfloat16,
+        )
+        self.model.vision_tower = self.model.vision_tower.to(torch.bfloat16)
+        # strict=False: mismatching keys do not raise; the missing keys are printed for inspection
+        print(self.model.vision_tower.load_state_dict(vision_tower.state_dict(), strict=False).missing_keys)
+        self.model.vision_tower.config._attn_implementation = "flash_attention_2"
+        # todo: (Avihu) would have done this but afraid - maybe something I'm missing
+        # self.model.vision_tower = vision_tower
+        self.config.vision_config = (
+            self.model.vision_tower.config.vision_config
+            if hasattr(self.model.vision_tower.config, "vision_config")
+            else self.model.vision_tower.config
+        )
+    def _load_pretrained_language_model(self, config):
+        """Load pretrained language model weights"""
+        print(f"Loading language model from: {config.pretrained_language_model}")
+        language_model = AutoModelForCausalLM.from_pretrained(
+            config.pretrained_language_model,
+            device_map="cpu",
+            attn_implementation="flash_attention_2",
+            dtype=torch.bfloat16,
+            # use_kernels=True,
+        )
+        # Make sure the image token id has a row in the embedding table
+        if self.config.image_token_index >= language_model.config.vocab_size:
+            language_model.resize_token_embeddings(self.config.image_token_index + 1)
+        # load weights in quantized mode with kernels
+        # The loaded causal LM is split: its backbone replaces the language model and its head
+        # replaces the lm_head created in __init__
+        self.model.language_model = language_model.model
+        self.lm_head = language_model.lm_head
+        # Load weights into the language model inside self.model
+        self.config.text_config = self.model.language_model.config
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        spatial_shapes: Optional[torch.LongTensor] = None,
+        pixel_attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Granite4VisionCausalLMOutputWithPast]:
+        """
+        Run the base model, then compute either the training loss or generation logits.
+        With `labels`: logits are computed only for positions whose next-token label differs
+        from IGNORE_INDEX, so the returned `logits` is 2-D (num_valid_positions, vocab_size)
+        and `logits_to_keep` is not used.
+        Without `labels`: logits are computed for the positions selected by `logits_to_keep`
+        (an int keeps the last N positions, 0 keeps all; a tensor is used as an index).
+        Unset `output_attentions`, `output_hidden_states`, `vision_feature_layer` and
+        `vision_feature_select_strategy` fall back to the values in `self.config`.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        outputs = self.model(
+            input_ids,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            spatial_shapes=spatial_shapes,
+            pixel_attention_mask=pixel_attention_mask,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        loss = None
+        if labels is not None:
+            # Shift by one: the hidden state at position t predicts the label at position t+1
+            hidden_for_pred = hidden_states[:, :-1, :].contiguous()
+            labels_to_pred = labels[:, 1:].contiguous()
+            valid_mask = labels_to_pred != IGNORE_INDEX
+            # key line! slicing only relevant last hidden states to compute the logits
+            relevant_hidden = hidden_for_pred[valid_mask]
+            relevant_labels = labels_to_pred[valid_mask]
+            # Compute logits only for relevant positions
+            logits = self.lm_head(relevant_hidden)
+            logits = logits / self.config.text_config.logits_scaling
+            # Compute loss with pre-shifted labels
+            loss = self.loss_function(
+                logits,
+                relevant_labels,
+                vocab_size=self.config.text_config.vocab_size,
+                shift_labels=relevant_labels,  # Pass pre-shifted labels to skip internal shifting
+                **kwargs,
+            )
+        else:
+            # Compute logits for generation
+            slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+            logits = self.lm_head(hidden_states[:, slice_indices, :])
+            logits = logits / self.config.text_config.logits_scaling
+        return Granite4VisionCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+            balancing_loss=outputs.balancing_loss
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        image_sizes=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        """
+        Build the per-step model inputs for generation.
+        Starts from the parent implementation, post-processes the result for
+        "granitemoehybrid" text models, and attaches `pixel_values` / `image_sizes` only on
+        the first step (when `cache_position[0] == 0`).
+        """
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+        if self.config.text_config.model_type == "granitemoehybrid":
+            model_inputs = self.prepare_inputs_for_generation_granite_moe(**model_inputs)
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model
+        # NOTE(review): cache_position is indexed unconditionally, so it must not be None here
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+            model_inputs["image_sizes"] = image_sizes
+        return model_inputs
+    # Avihu: would have used the GraniteMoeSharedForCausalLM method, but we don't store this object anymore (split the model / lm head)
+    def prepare_inputs_for_generation_granite_moe(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        """
+        Generation-input preparation for the hybrid Mamba/attention language model.
+        Creates a `HybridMambaAttentionDynamicCache` when no usable cache exists and
+        `use_cache` is set, trims `input_ids` to the unprocessed tokens when a cache is
+        present, derives `position_ids` from `attention_mask` when they are not given, and
+        returns the dict of model inputs (any extra kwargs are passed through unchanged).
+        """
+        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
+        # Note: (Avihu) in transformers v4, the past_key_values is already an empty DynamicCache object. Testing that too
+        empty_past_kv = past_key_values is None or (isinstance(past_key_values, DynamicCache) and past_key_values[0][0] is None)
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if not empty_past_kv:
+            if (
+                inputs_embeds is not None  # Exception 1
+                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        elif use_cache:
+            # NOTE(review): input_ids.shape is read here, so input_ids must be a tensor on the first step
+            past_key_values = HybridMambaAttentionDynamicCache(
+                self.model.language_model.config, input_ids.shape[0], self.dtype, device=self.device
+            )
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            # Masked-out (padding) positions get the placeholder position 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+            }
+        )
+        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+        return model_inputs
+class Granite4VisionModel(LlavaNextPreTrainedModel):
+    config_class = Granite4VisionConfig
+    def __init__(self, config: Granite4VisionConfig):
+        """
+        Build the vision tower, the per-layer downsamplers, the optional checkerboard
+        projectors, the optional image-newline parameter and the language model.
+        Requires `config.vision_layer_to_llm_layer` and `config.downsample_rate` to be set.
+        """
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+        self.multi_modal_projector = None
+        # Multi-layer vision injection: create list of projectors if enabled
+        # NOTE(review): these asserts are skipped when Python runs with -O
+        assert config.vision_layer_to_llm_layer is not None
+        assert config.downsample_rate is not None
+        # Downsampler(s) - create multiple if using multi-layer vision injection
+        self.downsampler = None
+        self.downsample_rate = config.downsample_rate
+        # Create separate downsampler for each (vision_layer, llm_layer) pair
+        num_projections = len(config.vision_layer_to_llm_layer)
+        downsamplers = []
+        for _ in range(num_projections):
+            # NOTE(review): both "interpolate" and "bilinear" build InterpolateDownsampler(config)
+            # without a mode argument, so the downsampler's default mode is used for either name.
+            # Any other downsample_method adds nothing, leaving the list empty.
+            if config.downsample_method in ["interpolate", "bilinear"]:
+                downsamplers.append(InterpolateDownsampler(config))
+            elif config.downsample_method == "window_qformer":
+                downsamplers.append(WindowQFormerDownsampler(config))
+        # NOTE(review): nn.ModuleList only accepts nn.Module instances - confirm every
+        # downsampler type appended above is one.
+        self.downsampler = nn.ModuleList(downsamplers)
+        # Checkerboard sampling projectors
+        self.multi_modal_projector = None
+        if config.use_checkerboard_sampling:
+            # Create 4 WindowQFormer projectors for the 4 spatial sampling groups
+            use_quadrant = getattr(config, 'use_quadrant_sampling', False)
+            self.multi_modal_projector = nn.ModuleList([
+                WindowQFormerDownsampler(config, checkerboard_offset=i, use_quadrant_sampling=use_quadrant)
+                for i in range(4)
+            ])
+        self.image_newline = None
+        if config.use_image_newline_parameter:
+            embed_std = 1 / math.sqrt(config.text_config.hidden_size)
+            image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std)
+            self.model_type = config.model_type
+            if self.model_type in ["gpt_vision", "granite4_vision"]:
+                # this hack allows to do lora training from scratch, so image_newline would be in modules_to_keep
+                # (wrapped in a module, the parameter lives at image_newline.param)
+                self.image_newline = ParamWrapper(image_newline)
+            else:
+                self.image_newline = image_newline
+        self.vocab_size = config.text_config.vocab_size
+        # with init_empty_weights(): # Avihu: hack to load the model faster
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+    def set_decoder(self, decoder):
+        """Replace the wrapped language model with `decoder`."""
+        self.language_model = decoder
+    def get_decoder(self):
+        """Return the wrapped language model."""
+        return self.language_model
+    def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
+        """
+        Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+        Args:
+            image_features (`list[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
+                List of image feature tensor, each contains all the visual feature of all patches.
+            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
+                Actual image size of each images (H, W).
+            vision_feature_select_strategy (`str`)
+                The feature selection strategy used to select the vision feature from the vision backbone.
+            image_newline (`torch.Tensor` of shape `(embed_dim)`)
+                New line embedding vector.
+        Returns:
+            image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
+            feature_lens (`list[int]`)
+                token length of each image in image_features
+        """
+        new_image_features = []
+        feature_lens = []
+        for image_idx, image_feature in enumerate(image_features):
+            if image_feature.shape[0] > 1:
+                base_image_feature = image_feature[0]
+                image_feature = image_feature[1:]
+                height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
+                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+                    image_sizes[image_idx],
+                    self.config.image_grid_pinpoints,
+                    self.config.vision_config.image_size,
+                )
+                if self.downsampler is not None:
+                    ds_rate = Fraction(self.downsample_rate)
+                    height = int(height  * ds_rate)
+                    width = int(width  * ds_rate)
+                if (
+                    np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
+                    and vision_feature_select_strategy == "default"
+                ):
+                    logger.warning_once(
+                        "Image feature shape does not line up with the provided patch size. "
+                        "You may be using the `default` vision_feature_select_strategy with a"
+                        " visual encoder that does not have CLS."
+                    )
+                image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
+                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+                image_feature = unpad_image(image_feature, image_sizes[image_idx])
+                if image_newline is not None:
+                    image_feature = torch.cat(
+                        (
+                            image_feature,
+                            image_newline[:, None, None]
+                            .expand(*image_feature.shape[:-1], 1)
+                            .to(image_feature.device, image_feature.dtype),
+                        ),
+                        dim=-1,
+                    )
+                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+            else:
+                image_feature = image_feature[0]
+                if image_newline is not None:
+                    image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
+            new_image_features.append(image_feature)
+            feature_lens.append(image_feature.size(0))
+        feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
+        return new_image_features, feature_lens
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`)
+               The tensors corresponding to the input images.
+            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
+                Actual image size of each images (H, W).
+            vision_feature_layer (`Union[int, list[int]]`, *optional*):
+                The index of the layer to select the vision feature. If multiple indices are provided,
+                the vision feature of the corresponding indices will be concatenated to form the
+                vision features.
+            vision_feature_select_strategy (`str`, *optional*):
+                The feature selection strategy used to select the vision feature from the vision backbone.
+                Can be one of `"default"` or `"full"`
+        Returns:
+            When vision_layer_to_llm_layer is None:
+                image_features (list[`torch.Tensor`]): List of image feature tensor, each contains all
+                the visual feature of all patches and are of shape `(num_patches, image_length, embed_dim)`)
+            When vision_layer_to_llm_layer is set:
+                dict mapping vision layer index → list of image features for that layer
+        """
+        # Multi-layer vision injection mode
+        if self.config.vision_layer_to_llm_layer is not None:
+            return self._get_image_features_multi_layer(
+                pixel_values, image_sizes, vision_feature_select_strategy
+            )
+    def _get_image_features_multi_layer(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        """
+        Extract and process multiple vision encoder layers separately.
+        Returns:
+            dict: Maps vision layer index (0, 1, ...) to list of image features for that layer
+        """
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        # Infer image_num_patches from image_sizes
+        image_num_patches = [
+            image_size_to_num_patches(
+                image_size=imsize,
+                grid_pinpoints=self.config.image_grid_pinpoints,
+                patch_size=self.config.vision_config.image_size,
+            )
+            for imsize in image_sizes
+        ]
+        image_newline = self.image_newline.param if self.model_type in ["gpt_vision", "granite4_vision"] else self.image_newline
+        # Process pixel values
+        if pixel_values.dim() == 5:
+            _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)]
+            pixel_values = torch.cat(_pixel_values_list, dim=0)
+        elif pixel_values.dim() != 4:
+            raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
+        # Get all hidden states from vision tower
+        vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        # Process each (vision_layer, llm_layer) pair separately
+        # Store as list of tuples: [(llm_layer, packed_features), ...]
+        multi_layer_features = []
+        for projection_idx, (vision_layer, llm_layer) in enumerate(self.config.vision_layer_to_llm_layer):
+            # Extract features from this vision layer
+            selected_feature = vision_outputs.hidden_states[vision_layer]
+            # Apply feature selection strategy (remove CLS if needed)
+            if vision_feature_select_strategy == "default":
+                selected_feature = selected_feature[:, 1:]
+            # Apply projection-specific downsampler if configured
+            projected_features = self.downsampler[projection_idx](selected_feature)
+            # Split by image
+            projected_features = torch.split(projected_features, image_num_patches, dim=0)
+            # Pack image features
+            packed_features, feature_lens = self.pack_image_features(
+                projected_features,
+                image_sizes,
+                vision_feature_select_strategy=vision_feature_select_strategy,
+                image_newline=image_newline,
+            )
+            # Store as tuple (llm_layer, packed_features)
+            multi_layer_features.append((llm_layer, packed_features))
+        # Process checkerboard sampling if enabled
+        if self.config.use_checkerboard_sampling:
+            # Extract the specified vision layer for checkerboard sampling
+            checkerboard_feature = vision_outputs.hidden_states[self.config.checkerboard_vision_layer]
+            # Apply feature selection strategy (remove CLS if needed)
+            if vision_feature_select_strategy == "default":
+                checkerboard_feature = checkerboard_feature[:, 1:]
+            # Process each checkerboard offset with its own WindowQFormer projector
+            for group_idx, llm_layer in enumerate(self.config.checkerboard_llm_layers):
+                # Apply WindowQFormer with checkerboard downsampling
+                # The projector handles both downsampling and projection
+                projected_group = self.multi_modal_projector[group_idx](checkerboard_feature)
+                # Split by image
+                projected_group_split = torch.split(projected_group, image_num_patches, dim=0)
+                # Pack image features
+                packed_group, _ = self.pack_image_features(
+                    projected_group_split,
+                    image_sizes,
+                    vision_feature_select_strategy=vision_feature_select_strategy,
+                    image_newline=image_newline,
+                )
+                # Add to multi_layer_features as tuple
+                multi_layer_features.append((llm_layer, packed_group))
+        return multi_layer_features
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
+            )
+        return special_image_mask
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        spatial_shapes: Optional[torch.LongTensor] = None,
+        pixel_attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Granite4VisionModelOutputWithPast]:
+        r"""
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
+            If `"full"`, the full vision features are used.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            print(input_ids, inputs_embeds, position_ids, pixel_values, image_sizes, kwargs, )
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        # Initialize variables for multi-layer vision injection
+        # List of tuples: [(llm_layer, concat_features), ...]
+        layerwise_image_features = []
+        vision_mask = None
+        ran_dummy_pass = False
+        if (pixel_values is not None and pixel_values.size(0) > 0) or (pixel_values is None and torch.is_grad_enabled()):
+            if pixel_values is not None and pixel_values.size(0) > 0:
+                image_features = self.get_image_features(
+                    pixel_values,
+                    image_sizes,
+                    vision_feature_layer=vision_feature_layer,
+                    vision_feature_select_strategy=vision_feature_select_strategy,
+                )
+            else:
+                image_features = self.run_dummy_encoder_forward(
+                    inputs_embeds, vision_feature_layer, vision_feature_select_strategy
+                )
+                ran_dummy_pass = True
+            # Multi-layer vision injection (image_features is a list of tuples: [(llm_layer, packed_features), ...])
+            # - Zero out image token positions in inputs_embeds (will be filled by LLM forward pass)
+            # - Store compact concat_features for each projection
+            for idx, (llm_layer_idx, packed_features) in enumerate(image_features):
+                # Concatenate all image features for this projection
+                concat_features = torch.cat(packed_features, dim=0).to(
+                    inputs_embeds.device, inputs_embeds.dtype
+                )
+                if idx == 0:
+                    if ran_dummy_pass:
+                        vision_mask = torch.zeros_like(inputs_embeds).bool()
+                        vision_mask[:, :concat_features.shape[0]] = True
+                    else:
+                        vision_mask = self.get_placeholder_mask(
+                            input_ids, inputs_embeds=inputs_embeds, image_features=concat_features
+                        )
+                        inputs_embeds = inputs_embeds.masked_fill(vision_mask, 0.0)
+                # Store as tuple (llm_layer, concat_features)
+                layerwise_image_features.append((llm_layer_idx, concat_features))
+        # Dispatch to model-specific forward
+        model_type = self.config.text_config.model_type
+        try:
+            if model_type == "granitemoehybrid":
+                outputs = self._forward_granitemoehybrid(
+                    inputs_embeds, attention_mask, position_ids, past_key_values,
+                    use_cache, output_attentions, output_hidden_states, cache_position,
+                    layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
+                )
+            elif model_type == "granite":
+                outputs = self._forward_granite(
+                    inputs_embeds, attention_mask, position_ids, past_key_values,
+                    use_cache, output_attentions, output_hidden_states, cache_position,
+                    layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
+                )
+            else:
+                raise ValueError(f"Unsupported text model type: {model_type}")
+        except Exception as e:
+            print(e)
+            print(attention_mask)
+            print(position_ids)
+            print(inputs_embeds)
+            print(input_ids)
+            print(kwargs)
+            raise e
+        return Granite4VisionModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+    def _inject_vision_features(self, hidden_states, layer_idx, layerwise_image_features, vision_mask, ran_dummy_pass):
+        """Inject vision features at the specified layer via masked_scatter."""
+        for llm_layer, image_features_for_layer in layerwise_image_features:
+            if layer_idx == llm_layer:
+                if ran_dummy_pass:
+                    image_features_for_layer = image_features_for_layer[:hidden_states.shape[1]]
+                hidden_states = hidden_states.masked_scatter(
+                    vision_mask,
+                    (hidden_states[vision_mask] + image_features_for_layer.flatten()).view(-1)
+                )
+        return hidden_states
+    def _forward_granitemoehybrid(
+        self, inputs_embeds, attention_mask, position_ids, past_key_values,
+        use_cache, output_attentions, output_hidden_states, cache_position,
+        layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
+    ):
+        hidden_states = inputs_embeds * self.language_model.embedding_multiplier
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = self.language_model._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        mamba_mask = self.language_model._update_mamba_mask(attention_mask, cache_position)
+        position_embeddings = None
+        if self.language_model.rotary_emb is not None:
+            position_embeddings = self.language_model.rotary_emb(hidden_states, position_ids)
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if kwargs.get('output_router_logits', False) else None
+        for layer_idx, decoder_layer in enumerate(self.language_model.layers):
+            hidden_states = self._inject_vision_features(
+                hidden_states, layer_idx, layerwise_image_features, vision_mask, ran_dummy_pass
+            )
+            layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=layer_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                output_router_logits=kwargs.get('output_router_logits', False),
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    all_self_attns += (layer_outputs[1],)
+            if kwargs.get('output_router_logits', False):
+                if layer_outputs[-1] is not None:
+                    all_router_logits += (layer_outputs[-1],)
+        hidden_states = self.language_model.norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+    def _forward_granite(
+        self, inputs_embeds, attention_mask, position_ids, past_key_values,
+        use_cache, output_attentions, output_hidden_states, cache_position,
+        layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
+    ):
+        hidden_states = inputs_embeds * self.language_model.embedding_multiplier
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.language_model.config)
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = create_causal_mask(
+            config=self.language_model.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        position_embeddings = self.language_model.rotary_emb(hidden_states, position_ids)
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        for layer_idx, decoder_layer in enumerate(self.language_model.layers):
+            hidden_states = self._inject_vision_features(
+                hidden_states, layer_idx, layerwise_image_features, vision_mask, ran_dummy_pass
+            )
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.language_model.norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+    def run_dummy_encoder_forward(self, inputs_embeds, vision_feature_layer, vision_feature_select_strategy):
+        print("no pixel values, using dummy data to get grads")
+        dummy_data = torch.zeros((3, 3, 384, 384), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
+        dummy_sizes = torch.tensor([[768, 384]], device=inputs_embeds.device)
+        image_features = self.get_image_features(
+            dummy_data, dummy_sizes,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy
+        )
+        return [(k, [v[0] * 0]) for k, v in image_features]

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,144 @@

+{
+  "crop_size": {
+    "height": 384,
+    "width": 384
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_grid_pinpoints": [
+    [
+      384,
+      384
+    ],
+    [
+      384,
+      768
+    ],
+    [
+      384,
+      1152
+    ],
+    [
+      384,
+      1536
+    ],
+    [
+      384,
+      1920
+    ],
+    [
+      384,
+      2304
+    ],
+    [
+      384,
+      2688
+    ],
+    [
+      384,
+      3072
+    ],
+    [
+      384,
+      3456
+    ],
+    [
+      384,
+      3840
+    ],
+    [
+      768,
+      384
+    ],
+    [
+      768,
+      768
+    ],
+    [
+      768,
+      1152
+    ],
+    [
+      768,
+      1536
+    ],
+    [
+      768,
+      1920
+    ],
+    [
+      1152,
+      384
+    ],
+    [
+      1152,
+      768
+    ],
+    [
+      1152,
+      1152
+    ],
+    [
+      1536,
+      384
+    ],
+    [
+      1536,
+      768
+    ],
+    [
+      1920,
+      384
+    ],
+    [
+      1920,
+      768
+    ],
+    [
+      2304,
+      384
+    ],
+    [
+      2688,
+      384
+    ],
+    [
+      3072,
+      384
+    ],
+    [
+      3456,
+      384
+    ],
+    [
+      3840,
+      384
+    ]
+  ],
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "LlavaNextImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "Granite4VisionProcessor",
+  "auto_map": {
+    "AutoProcessor": "processing.Granite4VisionProcessor"
+  },
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 384,
+    "width": 384
+  },
+  "window_side": 8
+}

processing.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from fractions import Fraction
+from transformers import LlavaNextProcessor
+from transformers.image_processing_utils import select_best_resolution
+class Granite4VisionProcessor(LlavaNextProcessor):
+    model_type = "granite4_vision"
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        patch_size=None,
+        vision_feature_select_strategy=None,
+        chat_template=None,
+        image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
+        num_additional_image_tokens=0,
+        downsample_rate=None,
+        **kwargs,
+    ):
+        super().__init__(image_processor=image_processor,
+                         tokenizer=tokenizer,
+                         patch_size=patch_size,
+                         vision_feature_select_strategy=vision_feature_select_strategy,
+                         chat_template=chat_template,
+                         image_token=image_token,
+                         num_additional_image_tokens=num_additional_image_tokens,
+                         )
+        self.downsample_rate = downsample_rate
+    def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
+        image_grid_pinpoints = self.image_processor.image_grid_pinpoints
+        height_best_resolution, width_best_resolution = select_best_resolution(
+            [orig_height, orig_width], image_grid_pinpoints
+        )
+        scale_height, scale_width = height_best_resolution // height, width_best_resolution // width
+        patches_height = height // self.patch_size
+        patches_width = width // self.patch_size
+        if self.downsample_rate is not None:
+            # todo: maybe add an assertion that it divides nicely?
+            ds_rate = Fraction(self.downsample_rate)
+            patches_height = int(patches_height * ds_rate)
+            patches_width = int(patches_width * ds_rate)
+        unpadded_features, newline_features = self._get_unpadded_features(
+            orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
+        )
+        # The base patch covers the entire image (+1 for the CLS)
+        base_features = patches_height * patches_width + self.num_additional_image_tokens
+        num_image_tokens = unpadded_features + newline_features + base_features
+        return num_image_tokens

processor_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "downsample_rate": "4/8",
+  "image_token": "<image>",
+  "num_additional_image_tokens": 0,
+  "patch_size": 16,
+  "processor_class": "Granite4VisionProcessor",
+  "auto_map": {
+    "AutoProcessor": "processing.Granite4VisionProcessor"
+  },
+  "vision_feature_select_strategy": "full"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<image>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|unk|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,796 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "100256": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100257": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100258": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100259": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100260": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100261": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100262": {
+      "content": "<|filename|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100263": {
+      "content": "<|reponame|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100264": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100265": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100266": {
+      "content": "<|unused_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100267": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100268": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100269": {
+      "content": "<|unk|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100270": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100271": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100272": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100273": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100274": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100275": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100276": {
+      "content": "<think_on>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100277": {
+      "content": "<think_off>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100278": {
+      "content": "<schema>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100279": {
+      "content": "</schema>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100280": {
+      "content": "<tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100281": {
+      "content": "</tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100282": {
+      "content": "<documents>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100283": {
+      "content": "</documents>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100284": {
+      "content": "<|unused_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100285": {
+      "content": "<|unused_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100286": {
+      "content": "<|unused_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100287": {
+      "content": "<|unused_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100288": {
+      "content": "<|unused_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100289": {
+      "content": "<|unused_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100290": {
+      "content": "<|unused_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100291": {
+      "content": "<|unused_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100292": {
+      "content": "<|unused_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100293": {
+      "content": "<|unused_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100294": {
+      "content": "<|unused_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100295": {
+      "content": "<|unused_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100296": {
+      "content": "<|unused_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100297": {
+      "content": "<|unused_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100298": {
+      "content": "<|unused_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100299": {
+      "content": "<|unused_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100300": {
+      "content": "<|unused_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100301": {
+      "content": "<|unused_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100302": {
+      "content": "<|unused_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100303": {
+      "content": "<|unused_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100304": {
+      "content": "<|unused_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100305": {
+      "content": "<|unused_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100306": {
+      "content": "<|unused_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100307": {
+      "content": "<|unused_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100308": {
+      "content": "<|unused_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100309": {
+      "content": "<|unused_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100310": {
+      "content": "<|unused_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100311": {
+      "content": "<|unused_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100312": {
+      "content": "<|unused_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100313": {
+      "content": "<|unused_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100314": {
+      "content": "<|unused_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100315": {
+      "content": "<|unused_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100316": {
+      "content": "<|unused_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100317": {
+      "content": "<|unused_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100318": {
+      "content": "<|unused_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100319": {
+      "content": "<|unused_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100320": {
+      "content": "<|unused_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100321": {
+      "content": "<|unused_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100322": {
+      "content": "<|unused_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100323": {
+      "content": "<|unused_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100324": {
+      "content": "<|unused_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100325": {
+      "content": "<|unused_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100326": {
+      "content": "<|unused_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100327": {
+      "content": "<|unused_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100328": {
+      "content": "<|unused_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100329": {
+      "content": "<|unused_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100330": {
+      "content": "<|unused_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100331": {
+      "content": "<|unused_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100332": {
+      "content": "<|unused_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100333": {
+      "content": "<|unused_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100334": {
+      "content": "<|unused_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100335": {
+      "content": "<|unused_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100336": {
+      "content": "<|unused_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100337": {
+      "content": "<|unused_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100338": {
+      "content": "<|unused_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100339": {
+      "content": "<|unused_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100340": {
+      "content": "<|unused_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100341": {
+      "content": "<|unused_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100342": {
+      "content": "<|unused_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100343": {
+      "content": "<|unused_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100344": {
+      "content": "<|unused_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100345": {
+      "content": "<|unused_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100346": {
+      "content": "<|unused_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100347": {
+      "content": "<|unused_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100348": {
+      "content": "<|unused_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100349": {
+      "content": "<|unused_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100350": {
+      "content": "<|unused_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100351": {
+      "content": "<|unused_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100352": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<image>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|pad|>",
+  "padding_side": "left",
+  "processor_class": "Granite4VisionProcessor",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|unk|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff