benwiesel committed on
Commit
1ebe0da
·
verified ·
1 Parent(s): f8d4d25

Upload folder using huggingface_hub

Browse files
README.md ADDED
File without changes
adapter_config.json ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Granite4VisionForConditionalGeneration",
7
+ "parent_library": "modeling"
8
+ },
9
+ "base_model_name_or_path": "granite-vision-dev/granite-4.1-3b-vision",
10
+ "bias": "none",
11
+ "corda_config": null,
12
+ "ensure_weight_tying": false,
13
+ "eva_config": null,
14
+ "exclude_modules": null,
15
+ "fan_in_fan_out": false,
16
+ "inference_mode": true,
17
+ "init_lora_weights": true,
18
+ "layer_replication": null,
19
+ "layers_pattern": null,
20
+ "layers_to_transform": null,
21
+ "loftq_config": {},
22
+ "lora_alpha": 256,
23
+ "lora_bias": false,
24
+ "lora_dropout": 0.1,
25
+ "megatron_config": null,
26
+ "megatron_core": "megatron.core",
27
+ "modules_to_save": [
28
+ "model.downsampler.0",
29
+ "model.downsampler.1",
30
+ "model.downsampler.2",
31
+ "model.downsampler.3",
32
+ "model.multi_modal_projector.0",
33
+ "model.multi_modal_projector.1",
34
+ "model.multi_modal_projector.2",
35
+ "model.multi_modal_projector.3",
36
+ "model.image_newline"
37
+ ],
38
+ "peft_type": "LORA",
39
+ "peft_version": "0.18.1",
40
+ "qalora_group_size": 16,
41
+ "r": 256,
42
+ "rank_pattern": {},
43
+ "revision": null,
44
+ "target_modules": [
45
+ "28.self_attn.v_proj",
46
+ "27.self_attn.q_proj",
47
+ "language_model.layers.7.self_attn.q_proj",
48
+ "language_model.layers.19.self_attn.k_proj",
49
+ "31.self_attn.k_proj",
50
+ "language_model.layers.25.self_attn.k_proj",
51
+ "language_model.layers.14.self_attn.v_proj",
52
+ "language_model.layers.12.self_attn.q_proj",
53
+ "28.self_attn.q_proj",
54
+ "language_model.layers.2.self_attn.q_proj",
55
+ "language_model.layers.2.self_attn.v_proj",
56
+ "language_model.layers.15.self_attn.k_proj",
57
+ "language_model.layers.14.self_attn.k_proj",
58
+ "language_model.layers.6.self_attn.v_proj",
59
+ "language_model.layers.21.self_attn.k_proj",
60
+ "language_model.layers.3.self_attn.q_proj",
61
+ "30.self_attn.v_proj",
62
+ "language_model.layers.8.self_attn.k_proj",
63
+ "27.self_attn.k_proj",
64
+ "language_model.layers.1.self_attn.k_proj",
65
+ "language_model.layers.18.self_attn.q_proj",
66
+ "down_proj",
67
+ "29.self_attn.v_proj",
68
+ "38.self_attn.v_proj",
69
+ "up_proj",
70
+ "language_model.layers.9.self_attn.k_proj",
71
+ "language_model.layers.11.self_attn.q_proj",
72
+ "language_model.layers.5.self_attn.k_proj",
73
+ "35.self_attn.k_proj",
74
+ "language_model.layers.25.self_attn.q_proj",
75
+ "language_model.layers.19.self_attn.v_proj",
76
+ "language_model.layers.13.self_attn.q_proj",
77
+ "33.self_attn.v_proj",
78
+ "language_model.layers.9.self_attn.v_proj",
79
+ "37.self_attn.k_proj",
80
+ "language_model.layers.24.self_attn.v_proj",
81
+ "33.self_attn.q_proj",
82
+ "31.self_attn.v_proj",
83
+ "gate_proj",
84
+ "34.self_attn.v_proj",
85
+ "language_model.layers.21.self_attn.v_proj",
86
+ "o_proj",
87
+ "language_model.layers.22.self_attn.v_proj",
88
+ "language_model.layers.26.self_attn.k_proj",
89
+ "language_model.layers.4.self_attn.q_proj",
90
+ "language_model.layers.26.self_attn.v_proj",
91
+ "language_model.layers.23.self_attn.q_proj",
92
+ "language_model.layers.21.self_attn.q_proj",
93
+ "language_model.layers.20.self_attn.q_proj",
94
+ "language_model.layers.23.self_attn.v_proj",
95
+ "language_model.layers.16.self_attn.q_proj",
96
+ "37.self_attn.v_proj",
97
+ "language_model.layers.5.self_attn.v_proj",
98
+ "language_model.layers.16.self_attn.v_proj",
99
+ "language_model.layers.7.self_attn.k_proj",
100
+ "language_model.layers.15.self_attn.v_proj",
101
+ "36.self_attn.v_proj",
102
+ "language_model.layers.16.self_attn.k_proj",
103
+ "language_model.layers.1.self_attn.q_proj",
104
+ "language_model.layers.4.self_attn.k_proj",
105
+ "language_model.layers.14.self_attn.q_proj",
106
+ "30.self_attn.q_proj",
107
+ "language_model.layers.19.self_attn.q_proj",
108
+ "language_model.layers.25.self_attn.v_proj",
109
+ "language_model.layers.13.self_attn.v_proj",
110
+ "language_model.layers.18.self_attn.k_proj",
111
+ "language_model.layers.0.self_attn.v_proj",
112
+ "language_model.layers.23.self_attn.k_proj",
113
+ "language_model.layers.10.self_attn.v_proj",
114
+ "language_model.layers.17.self_attn.q_proj",
115
+ "36.self_attn.q_proj",
116
+ "35.self_attn.v_proj",
117
+ "30.self_attn.k_proj",
118
+ "language_model.layers.8.self_attn.v_proj",
119
+ "language_model.layers.20.self_attn.k_proj",
120
+ "32.self_attn.q_proj",
121
+ "38.self_attn.q_proj",
122
+ "language_model.layers.15.self_attn.q_proj",
123
+ "language_model.layers.24.self_attn.q_proj",
124
+ "language_model.layers.10.self_attn.q_proj",
125
+ "34.self_attn.k_proj",
126
+ "language_model.layers.3.self_attn.v_proj",
127
+ "language_model.layers.11.self_attn.v_proj",
128
+ "language_model.layers.22.self_attn.k_proj",
129
+ "38.self_attn.k_proj",
130
+ "language_model.layers.7.self_attn.v_proj",
131
+ "39.self_attn.v_proj",
132
+ "language_model.layers.10.self_attn.k_proj",
133
+ "language_model.layers.13.self_attn.k_proj",
134
+ "language_model.layers.12.self_attn.k_proj",
135
+ "37.self_attn.q_proj",
136
+ "34.self_attn.q_proj",
137
+ "language_model.layers.22.self_attn.q_proj",
138
+ "33.self_attn.k_proj",
139
+ "28.self_attn.k_proj",
140
+ "language_model.layers.6.self_attn.k_proj",
141
+ "language_model.layers.6.self_attn.q_proj",
142
+ "language_model.layers.18.self_attn.v_proj",
143
+ "language_model.layers.0.self_attn.k_proj",
144
+ "36.self_attn.k_proj",
145
+ "language_model.layers.5.self_attn.q_proj",
146
+ "31.self_attn.q_proj",
147
+ "language_model.layers.3.self_attn.k_proj",
148
+ "language_model.layers.24.self_attn.k_proj",
149
+ "29.self_attn.k_proj",
150
+ "language_model.layers.2.self_attn.k_proj",
151
+ "language_model.layers.20.self_attn.v_proj",
152
+ "language_model.layers.17.self_attn.v_proj",
153
+ "language_model.layers.4.self_attn.v_proj",
154
+ "language_model.layers.0.self_attn.q_proj",
155
+ "language_model.layers.11.self_attn.k_proj",
156
+ "39.self_attn.q_proj",
157
+ "29.self_attn.q_proj",
158
+ "language_model.layers.1.self_attn.v_proj",
159
+ "language_model.layers.26.self_attn.q_proj",
160
+ "language_model.layers.9.self_attn.q_proj",
161
+ "language_model.layers.17.self_attn.k_proj",
162
+ "language_model.layers.12.self_attn.v_proj",
163
+ "27.self_attn.v_proj",
164
+ "32.self_attn.v_proj",
165
+ "35.self_attn.q_proj",
166
+ "32.self_attn.k_proj",
167
+ "language_model.layers.8.self_attn.q_proj",
168
+ "39.self_attn.k_proj"
169
+ ],
170
+ "target_parameters": [],
171
+ "task_type": null,
172
+ "trainable_token_indices": null,
173
+ "use_dora": false,
174
+ "use_qalora": false,
175
+ "use_rslora": false
176
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcddb6ba09c12b7ab299fbc6e805291e6744f2fee0be60e0fb281bd9b55cb56
3
+ size 1328498208
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image>": 100352
3
+ }
chat_template.jinja ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {#- ===== Task tag prompt constants ===== -#}
2
+ {%- set chart2code_prompt = "Generate code that recreates the chart as best as possible." -%}
3
+ {%- set chart2csv_prompt = "Please examine this chart image. Consider you are a data visualization expert, and extract the data into a CSV table.\n\nYour CSV should:\n- Include a header row with clear column names\n- Represent all data series/categories shown in the chart\n- Use numeric values that match the chart as closely as possible\n\nOutput only the CSV data, nothing else." -%}
4
+ {%- set chart2summary_prompt = "Can you describe this chart image?" -%}
5
+ {%- set tables_json_prompt = "Identify and extract the tabls schema\n Extruct the schema of all the tables in the image sorted according to the reading order.\nThe output must be a valid JSON object containing a list of dictionaries with the following structure:\n\n {\n \"dimensions\": {\n \"rows\": <number of data rows (excluding header rows)>,\n \"columns\": <number of columns>,\n \"header_rows\": <number of header rows>,\n \"total_rows\": <total number of rows including headers>\n },\n \"cells\": [\n {\n \"row\": <row index starting at 1>,\n \"col\": <column index starting at 1>,\n \"colspan\": <number of columns spanned>,\n \"rowspan\": <number of rows spanned>,\n \"type\": \"<'header' or 'data'>\",\n \"header_level\": <header nesting level if type=header, else omit or null>,\n \"content\": \"<string content of the cell>\"\n },\n ...\n ]\n }" -%}
6
+ {%- set tables_html_prompt = "Identify and extract the tabls schema\n Extruct the schema of all the tables in the image sorted according to the reading order.\nThe output must be a list of valid HTML tables" -%}
7
+ {%- set tables_otsl_prompt = "Identify and extract the tabls schema\n Extruct the schema of all the tables in the image sorted according to the reading order.\nThe output must be a list of valid OTSL objects, each consists of the following fields: \n <fcel> - a cell with content in it\n <ecel> - an empty cell\n <lcel> - a cell that is merged with the cell to its left\n <ucel> - a cell that is merged with the cell above it\n <xcel> - a cell that is merged with both the cell above it and the cell to its left\n <nl> - a new line\n <ched> - a clumn header\n <otsl> - the beginning of the OTSL table\n </otsl> - the end of the OTSL table\n\n An example for an output:\n [\n <otsl><ched>first table header1<ched>first table header2<nl><fcel>data1<fcel>data2<nl><fcel>data with horizontal span<lcel><nl><fcell>data with vertical span<ecel><nl><ucel><fcel>data3<nl></otsl>,\n <otsl><ched>second table header1<ched>second table header2<nl><fcel>data1<fcel>data2<nl><fcel>data with horizontal span<lcel><nl><fcell>data with vertical span<ecel><nl><ucel><fcel>data3<nl></otsl>\n ]" -%}
8
+
9
+
10
+ {#- ===== Tag expansion dispatcher ===== -#}
11
{%- macro expand_tags(text) -%}
    {#- Replace a task tag in `text` with its full prompt.
        Tags are checked in the fixed priority order listed in `tag_table`; the first one
        contained in `text` wins and the whole message is replaced by that tag's prompt.
        If `text` also contains "<image>", a single image marker is re-attached: before the
        prompt ("<image>\n") when the marker preceded the tag, otherwise after it ("\n<image>").
        When no tag is present, `text` is emitted unchanged. -#}
    {%- set tag_table = [
        ("<chart2code>", chart2code_prompt),
        ("<chart2csv>", chart2csv_prompt),
        ("<chart2summary>", chart2summary_prompt),
        ("<tables_json>", tables_json_prompt),
        ("<tables_html>", tables_html_prompt),
        ("<tables_otsl>", tables_otsl_prompt)
    ] -%}
    {#- A namespace is required because assignments inside a for-loop are loop-scoped. -#}
    {%- set hit = namespace(tag=none, prompt=none) -%}
    {%- for tag_name, tag_prompt in tag_table -%}
        {%- if hit.tag is none and tag_name in text -%}
            {%- set hit.tag = tag_name -%}
            {%- set hit.prompt = tag_prompt -%}
        {%- endif -%}
    {%- endfor -%}
    {%- if hit.tag is none -%}
        {{- text -}}
    {%- else -%}
        {%- set lead = "" -%}
        {%- set trail = "" -%}
        {%- if "<image>" in text -%}
            {%- if text.index("<image>") < text.index(hit.tag) -%}
                {%- set lead = "<image>\n" -%}
            {%- else -%}
                {%- set trail = "\n<image>" -%}
            {%- endif -%}
        {%- endif -%}
        {{- lead + hit.prompt + trail -}}
    {%- endif -%}
{%- endmacro -%}
46
+
47
+ {#- ===== Original chat template ===== -#}
48
+ {% macro render_content(x) %}{#- Renders message content as text: a string is emitted as-is; otherwise each chunk whose type is 'text' contributes its text and each chunk whose type is 'image' contributes the literal "<image>" followed by a newline. Chunks of any other type produce nothing. -#}
49
+ {%- if x is string %}
50
+ {{ x }}
51
+ {%- else %}
52
+ {%- for chunk in x %}
53
+ {%- if chunk['type'] == 'text' -%}
54
+ {{ chunk['text']}}
55
+ {%- elif chunk['type'] == 'image' -%}
56
+ {{- "<image>
57
+ " }}
58
+ {%- endif -%}
59
+ {%- endfor -%}
60
+ {%- endif -%}
61
+ {% endmacro %}
62
+
63
+ {%- set tools_system_message_prefix = 'You are a helpful assistant with access to the following tools. You may call one or more tools to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' %}
64
+ {%- set tools_system_message_suffix = '\n</tools>\n\nFor each tool call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.' %}
65
+ {%- set documents_system_message_prefix = 'You are a helpful assistant with access to the following documents. You may use one or more documents to assist with the user query.\n\nYou are given a list of documents within <documents></documents> XML tags:\n<documents>' %}
66
+ {%- set documents_system_message_suffix = '\n</documents>\n\nWrite the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.' %}
67
+ {%- set g4_default_system_message = 'You are a helpful assistant. Please ensure responses are professional, accurate, and safe.' %}
68
+ {%- if available_tools is defined and available_tools %}
69
+ {%- set tools = available_tools %}
70
+ {%- endif %}
71
+ {%- set ns = namespace(tools_system_message=tools_system_message_prefix,
72
+ documents_system_message=documents_system_message_prefix,
73
+ default_system_message=g4_default_system_message,
74
+ system_message=''
75
+ ) %}
76
+ {%- if tools %}
77
+ {%- for tool in tools %}
78
+ {%- set ns.tools_system_message = ns.tools_system_message + '\n' + (tool | tojson) %}
79
+ {%- endfor %}
80
+ {%- set ns.tools_system_message = ns.tools_system_message + tools_system_message_suffix %}
81
+ {%- else %}
82
+ {%- set ns.tools_system_message = '' %}
83
+ {%- endif %}
84
+ {%- if documents %}
85
+ {%- for document in documents %}
86
+ {%- set ns.documents_system_message = ns.documents_system_message + '\n' + (document | tojson) %}
87
+ {%- endfor %}
88
+ {%- set ns.documents_system_message = ns.documents_system_message + documents_system_message_suffix %}
89
+ {%- else %}
90
+ {%- set ns.documents_system_message = '' %}
91
+ {%- endif %}
92
+ {%- if messages[0].role == 'system' %}
93
+ {%- if messages[0].content is string %}
94
+ {%- set ns.system_message = messages[0].content %}
95
+ {%- elif messages[0].content is iterable %}
96
+ {%- for entry in messages[0].content %}
97
+ {%- if entry.type== 'text' %}
98
+ {%- if ns.system_message != '' %}
99
+ {%- set ns.system_message = ns.system_message + '\n' %}
100
+ {%- endif %}
101
+ {%- set ns.system_message = ns.system_message + entry.text %}
102
+ {%- endif %}
103
+ {%- endfor %}
104
+ {%- endif %}
105
+ {%- if tools and documents %}
106
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message + '\n\n' + ns.documents_system_message %}
107
+ {%- elif tools %}
108
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message %}
109
+ {%- elif documents %}
110
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.documents_system_message %}
111
+ {%- endif %}
112
+ {%- else %}
113
+ {%- if tools and documents %}
114
+ {%- set ns.system_message = ns.tools_system_message + '\n\n' + ns.documents_system_message %}
115
+ {%- elif tools %}
116
+ {%- set ns.system_message = ns.tools_system_message %}
117
+ {%- elif documents %}
118
+ {%- set ns.system_message = ns.documents_system_message %}
119
+ {%- endif %}
120
+ {%- endif %}
121
+ {%- if ns.system_message %}
122
+ {{- '<|start_of_role|>system<|end_of_role|>' + ns.system_message + '<|end_of_text|>\n' }}
123
+ {%- else %}
124
+ {{- '<|start_of_role|>system<|end_of_role|>' + ns.default_system_message + '<|end_of_text|>\n' }}
125
+ {%- endif %}
126
+ {%- for message in messages %}
127
+ {%- set content = namespace(val='') %}
128
+ {%- if render_content(message['content']) is string %}
129
+ {%- set content.val = render_content(message['content']) %}
130
+ {%- else %}
131
+ {%- if render_content(message['content']) is iterable %}
132
+ {%- for entry in render_content(message['content']) %}
133
+ {%- if entry.type== 'text' %}
134
+ {%- if content.val != '' %}
135
+ {%- set content.val = content.val + '\n' %}
136
+ {%- endif %}
137
+ {%- set content.val = content.val + entry.text %}
138
+ {%- endif %}
139
+ {%- endfor %}
140
+ {%- endif %}
141
+ {%- endif %}
142
+ {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) %}
143
+ {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + expand_tags(content.val) + '<|end_of_text|>\n' }}
144
+ {%- elif message.role == 'assistant' %}
145
+ {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val }}
146
+ {%- if message.tool_calls %}
147
+ {%- for tool_call in message.tool_calls %}
148
+ {%- if (loop.first and content.val) or (not loop.first) %}
149
+ {{- '\n' }}
150
+ {%- endif %}
151
+ {%- if tool_call.function %}
152
+ {%- set tool_call = tool_call.function %}
153
+ {%- endif %}
154
+ {{- '<tool_call>\n{"name": "' }}
155
+ {{- tool_call.name }}
156
+ {{- '", "arguments": ' }}
157
+ {%- if tool_call.arguments is string %}
158
+ {{- tool_call.arguments }}
159
+ {%- else %}
160
+ {{- tool_call.arguments | tojson }}
161
+ {%- endif %}
162
+ {{- '}\n</tool_call>' }}
163
+ {%- endfor %}
164
+ {%- endif %}
165
+ {{- '<|end_of_text|>\n' }}
166
+ {%- elif message.role == 'tool' %}
167
+ {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
168
+ {{- '<|start_of_role|>user<|end_of_role|>' }}
169
+ {%- endif %}
170
+ {{- '\n<tool_response>\n' }}
171
+ {{- content.val }}
172
+ {{- '\n</tool_response>' }}
173
+ {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
174
+ {{- '<|end_of_text|>\n' }}
175
+ {%- endif %}
176
+ {%- endif %}
177
+ {%- endfor %}
178
+ {%- if add_generation_prompt %}
179
+ {{- '<|start_of_role|>assistant<|end_of_role|>' }}
180
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Granite4VisionForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration.Granite4VisionConfig",
7
+ "AutoModel": "modeling.Granite4VisionForConditionalGeneration",
8
+ "AutoModelForVision2Seq": "modeling.Granite4VisionForConditionalGeneration",
9
+ "AutoModelForImageTextToText": "modeling.Granite4VisionForConditionalGeneration",
10
+ "AutoProcessor": "processing.Granite4VisionProcessor"
11
+ },
12
+ "checkerboard_llm_layers": [
13
+ 12,
14
+ 15,
15
+ 18,
16
+ 21
17
+ ],
18
+ "checkerboard_stride": 2,
19
+ "checkerboard_vision_layer": -1,
20
+ "dave_encoder": null,
21
+ "downsample_method": "window_qformer",
22
+ "downsample_rate": "4/8",
23
+ "dtype": "bfloat16",
24
+ "image_grid_pinpoints": [
25
+ [
26
+ 384,
27
+ 384
28
+ ],
29
+ [
30
+ 384,
31
+ 768
32
+ ],
33
+ [
34
+ 384,
35
+ 1152
36
+ ],
37
+ [
38
+ 384,
39
+ 1536
40
+ ],
41
+ [
42
+ 384,
43
+ 1920
44
+ ],
45
+ [
46
+ 384,
47
+ 2304
48
+ ],
49
+ [
50
+ 384,
51
+ 2688
52
+ ],
53
+ [
54
+ 384,
55
+ 3072
56
+ ],
57
+ [
58
+ 384,
59
+ 3456
60
+ ],
61
+ [
62
+ 384,
63
+ 3840
64
+ ],
65
+ [
66
+ 768,
67
+ 384
68
+ ],
69
+ [
70
+ 768,
71
+ 768
72
+ ],
73
+ [
74
+ 768,
75
+ 1152
76
+ ],
77
+ [
78
+ 768,
79
+ 1536
80
+ ],
81
+ [
82
+ 768,
83
+ 1920
84
+ ],
85
+ [
86
+ 1152,
87
+ 384
88
+ ],
89
+ [
90
+ 1152,
91
+ 768
92
+ ],
93
+ [
94
+ 1152,
95
+ 1152
96
+ ],
97
+ [
98
+ 1536,
99
+ 384
100
+ ],
101
+ [
102
+ 1536,
103
+ 768
104
+ ],
105
+ [
106
+ 1920,
107
+ 384
108
+ ],
109
+ [
110
+ 1920,
111
+ 768
112
+ ],
113
+ [
114
+ 2304,
115
+ 384
116
+ ],
117
+ [
118
+ 2688,
119
+ 384
120
+ ],
121
+ [
122
+ 3072,
123
+ 384
124
+ ],
125
+ [
126
+ 3456,
127
+ 384
128
+ ],
129
+ [
130
+ 3840,
131
+ 384
132
+ ]
133
+ ],
134
+ "image_seq_length": 576,
135
+ "image_token_index": 100352,
136
+ "initializer_range": 0.02,
137
+ "model_type": "granite4_vision",
138
+ "multimodal_projector_bias": true,
139
+ "pretrained_language_model": "",
140
+ "pretrained_vision_tower": "",
141
+ "projector_dropout": 0.1,
142
+ "projector_hidden_act": "gelu",
143
+ "simplified_qformer": false,
144
+ "text_config": {
145
+ "_name_or_path": "/proj/mmfm/users/avihu/dmf/granite-4.1-3b/r260401a/",
146
+ "architectures": [
147
+ "GraniteForCausalLM"
148
+ ],
149
+ "attention_bias": false,
150
+ "attention_dropout": 0.0,
151
+ "attention_multiplier": 0.015625,
152
+ "bos_token_id": 100257,
153
+ "dtype": "bfloat16",
154
+ "embedding_multiplier": 12,
155
+ "eos_token_id": 100257,
156
+ "hidden_act": "silu",
157
+ "hidden_size": 2560,
158
+ "initializer_range": 0.1,
159
+ "intermediate_size": 8192,
160
+ "logits_scaling": 10,
161
+ "max_position_embeddings": 131072,
162
+ "mlp_bias": false,
163
+ "model_type": "granite",
164
+ "num_attention_heads": 40,
165
+ "num_hidden_layers": 40,
166
+ "num_key_value_heads": 8,
167
+ "pad_token_id": 100256,
168
+ "residual_multiplier": 0.22,
169
+ "rms_norm_eps": 1e-05,
170
+ "rope_scaling": null,
171
+ "rope_theta": 10000000,
172
+ "tie_word_embeddings": true,
173
+ "use_cache": false,
174
+ "vocab_size": 100353
175
+ },
176
+ "tie_word_embeddings": true,
177
+ "transformers_version": "4.57.3",
178
+ "use_checkerboard_sampling": true,
179
+ "use_image_newline_parameter": true,
180
+ "use_quadrant_sampling": false,
181
+ "vision_config": {
182
+ "attention_dropout": 0.0,
183
+ "hidden_act": "gelu_pytorch_tanh",
184
+ "hidden_size": 1152,
185
+ "image_size": 384,
186
+ "intermediate_size": 4304,
187
+ "layer_norm_eps": 1e-06,
188
+ "model_type": "siglip_vision_model",
189
+ "num_attention_heads": 16,
190
+ "num_channels": 3,
191
+ "num_hidden_layers": 27,
192
+ "patch_size": 16
193
+ },
194
+ "vision_feature_layer": [
195
+ -24,
196
+ -20,
197
+ -12,
198
+ -1
199
+ ],
200
+ "vision_feature_select_strategy": "full",
201
+ "vision_layer_to_llm_layer": [
202
+ [
203
+ -19,
204
+ 9
205
+ ],
206
+ [
207
+ -13,
208
+ 6
209
+ ],
210
+ [
211
+ -7,
212
+ 3
213
+ ],
214
+ [
215
+ -1,
216
+ 0
217
+ ]
218
+ ]
219
+ }
configuration.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import logging
3
+ from transformers import LlavaNextConfig
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
class Granite4VisionConfig(LlavaNextConfig):
    """Configuration for the Granite 4 Vision model, extending ``LlavaNextConfig``.

    Unless noted otherwise, each argument is stored unchanged on the instance
    under the same name. Remaining keyword arguments are forwarded to
    ``LlavaNextConfig.__init__`` after the Granite-specific fields are set.

    Args:
        pretrained_vision_tower: Stored as given (empty string by default).
        pretrained_language_model: Stored as given (empty string by default).
        downsample_rate: Stored as given.
        downsample_method: Stored as given (``"interpolate"`` by default).
        use_image_newline_parameter: Stored as given.
        simplified_qformer: Stored as given.
        dave_encoder: Stored as given.
        vision_layer_to_llm_layer: Optional list of ``(vision_layer, llm_layer)``
            pairs mapping vision tower layer indices to LLM layer indices, e.g.
            ``[(-8, 0), (-1, 0)]`` takes vision layers -8 and -1 and injects both
            at LLM layer 0, while ``[(-1, 0), (-1, 4)]`` uses vision layer -1
            twice with injection at layers 0 and 4. Every pair is coerced to a
            tuple of ints. ``None`` keeps the default single-layer behaviour
            (``vision_feature_layer``).
        use_checkerboard_sampling: When enabled, 4 groups are extracted from a
            vision layer using spatial sampling, each with a different offset.
        checkerboard_stride: Sampling stride (e.g. 4 = every 4th position).
        checkerboard_vision_layer: Vision layer the sampling applies to.
        checkerboard_llm_layers: LLM layers for the 4 groups; any falsy value
            (``None`` or an empty list) becomes ``[0, 10, 20, 30]``.
        use_quadrant_sampling: Coerced with ``bool``. ``False`` selects block
            sampling, ``True`` selects quadrant sampling.
        projector_dropout: Stored as given.
        **kwargs: Passed through to the parent configuration.

    Raises:
        AssertionError: If ``vision_layer_to_llm_layer`` contains the same
            ``(vision_layer, llm_layer)`` pair more than once.
    """

    model_type = "granite4_vision"

    def __init__(
        self,
        pretrained_vision_tower: str = "",
        pretrained_language_model: str = "",
        downsample_rate=None,
        downsample_method="interpolate",
        use_image_newline_parameter=True,
        simplified_qformer=True,
        dave_encoder=None,
        vision_layer_to_llm_layer: Optional[list] = None,
        use_checkerboard_sampling: bool = False,
        checkerboard_stride: int = 2,
        checkerboard_vision_layer: int = -1,
        checkerboard_llm_layers: Optional[list] = None,
        use_quadrant_sampling=False,
        projector_dropout=0.1,
        **kwargs
    ):
        # --- Pretrained component paths ---
        self.pretrained_vision_tower = pretrained_vision_tower
        self.pretrained_language_model = pretrained_language_model

        # --- Downsampling / projector options ---
        self.downsample_method = downsample_method
        self.downsample_rate = downsample_rate
        self.use_image_newline_parameter = use_image_newline_parameter
        self.dave_encoder = dave_encoder
        self.projector_dropout = projector_dropout

        # --- Vision-layer -> LLM-layer routing ---
        # Normalise to a list of (int, int) tuples so entries are hashable and
        # exact duplicates can be detected. Several pairs may still share an
        # LLM layer; only identical pairs are rejected.
        layer_pairs = None
        if vision_layer_to_llm_layer is not None:
            layer_pairs = [
                (int(vision_idx), int(llm_idx))
                for vision_idx, llm_idx in vision_layer_to_llm_layer
            ]
        self.vision_layer_to_llm_layer = layer_pairs
        if layer_pairs is not None:
            assert len(set(layer_pairs)) == len(layer_pairs), "expecting no duplicates"

        # --- Checkerboard sampling ---
        self.use_checkerboard_sampling = use_checkerboard_sampling
        self.checkerboard_stride = checkerboard_stride
        self.checkerboard_vision_layer = checkerboard_vision_layer
        # Any falsy value (None, empty list) falls back to the four default layers.
        self.checkerboard_llm_layers = (
            checkerboard_llm_layers if checkerboard_llm_layers else [0, 10, 20, 30]
        )
        self.simplified_qformer = simplified_qformer
        # False = block sampling (full coverage, local continuity);
        # True = quadrant sampling (maximum continuity, limited coverage).
        self.use_quadrant_sampling = bool(use_quadrant_sampling)

        super().__init__(**kwargs)
62
+
63
+
64
+ class Granite4VisionConfigNaflex(Granite4VisionConfig):  # Inherits every field and default from Granite4VisionConfig unchanged.
65
+ model_type = "granite4_vision_naflex"  # The only override: a distinct model_type string for this variant.
downsampling.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ import torch
3
+ from torch import nn
4
+ import math
5
+ from fractions import Fraction
6
+ from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig
7
+ from transformers.models.blip_2.modeling_blip_2 import Blip2QFormerModel
8
+ import torch.nn.functional as F
9
+
10
+
11
+ class QFormerCrossAttention(nn.Module):
12
+ """Multi-headed cross-attention for QFormer with SDPA/Flash Attention support"""
13
+
14
+ def __init__(self, hidden_size, num_heads, attn_bias=False, attention_dropout=0.05, final_dropout=0.05):
15
+ super().__init__()
16
+ self.hidden_size = hidden_size
17
+ self.num_heads = num_heads
18
+ self.head_dim = hidden_size // num_heads
19
+ self.attention_dropout = attention_dropout
20
+
21
+ if self.head_dim * num_heads != hidden_size:
22
+ raise ValueError(
23
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {hidden_size} "
24
+ f"and `num_heads`: {num_heads})."
25
+ )
26
+
27
+ # Q from queries, K and V from encoder
28
+ self.q_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
29
+ self.k_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
30
+ self.v_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
31
+ self.o_proj = nn.Linear(hidden_size, hidden_size, bias=attn_bias)
32
+ self.dropout = nn.Dropout(final_dropout)
33
+
34
+ def forward(self, hidden_states, encoder_hidden_states, attention_mask=None):
35
+ """
36
+ Args:
37
+ hidden_states: (B, query_len, hidden_size) - queries
38
+ encoder_hidden_states: (B, encoder_len, hidden_size) - keys and values
39
+ attention_mask: optional attention mask
40
+ Returns:
41
+ (B, query_len, hidden_size)
42
+ """
43
+ batch_size, query_len, _ = hidden_states.shape
44
+ encoder_len = encoder_hidden_states.shape[1]
45
+
46
+ # Project queries from hidden_states
47
+ query_states = self.q_proj(hidden_states).view(
48
+ batch_size, query_len, self.num_heads, self.head_dim
49
+ ).transpose(1, 2)
50
+
51
+ # Project keys and values from encoder_hidden_states
52
+ key_states = self.k_proj(encoder_hidden_states).view(
53
+ batch_size, encoder_len, self.num_heads, self.head_dim
54
+ ).transpose(1, 2)
55
+ value_states = self.v_proj(encoder_hidden_states).view(
56
+ batch_size, encoder_len, self.num_heads, self.head_dim
57
+ ).transpose(1, 2)
58
+
59
+ # Use PyTorch's scaled_dot_product_attention (SDPA)
60
+ # This automatically uses Flash Attention when available
61
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
62
+ query_states,
63
+ key_states,
64
+ value_states,
65
+ attn_mask=attention_mask,
66
+ dropout_p=self.attention_dropout if self.training else 0.0,
67
+ is_causal=False,
68
+ )
69
+
70
+ # Reshape back to (B, query_len, hidden_size)
71
+ attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, query_len, self.hidden_size)
72
+ attn_output = self.o_proj(attn_output)
73
+
74
+ attn_output = self.dropout(attn_output)
75
+ return attn_output
76
+
77
+
78
+ class QFormerMLP(nn.Module):
79
+ """Feed-forward network (MLP) for QFormer with SiLU activation"""
80
+
81
+ def __init__(self, hidden_size, mlp_hidden_size, mlp_bias=False, dropout_prob=0.05):
82
+ super().__init__()
83
+ self.hidden_size = hidden_size
84
+
85
+ self.fc1 = nn.Linear(hidden_size, mlp_hidden_size, bias=mlp_bias)
86
+ self.act = nn.SiLU()
87
+ self.fc2 = nn.Linear(mlp_hidden_size, hidden_size, bias=mlp_bias)
88
+ self.dropout = nn.Dropout(dropout_prob)
89
+
90
+ def forward(self, hidden_states):
91
+ """
92
+ Args:
93
+ hidden_states: (B, seq_len, hidden_size)
94
+
95
+ Returns:
96
+ (B, seq_len, hidden_size)
97
+ """
98
+ hidden_states = self.fc1(hidden_states)
99
+ hidden_states = self.act(hidden_states)
100
+ hidden_states = self.dropout(self.fc2(hidden_states))
101
+ return hidden_states
102
+
103
+
104
+ class SimplifiedQFormer(nn.Module):
105
+ """
106
+ Simplified QFormer with a single cross-attention layer followed by an MLP.
107
+ Lightweight design: queries attend to encoder hidden states via cross-attention,
108
+ then pass through a feed-forward network, similar to a transformer block.
109
+ """
110
+
111
+ def __init__(self, hidden_size, num_heads=8, mlp_hidden_size=2048, mlp_bias=False, attn_bias=False, eps=1e-6):
112
+ super().__init__()
113
+ self.hidden_size = hidden_size
114
+ self.num_heads = num_heads
115
+
116
+ # Cross-attention block
117
+ self.attn_norm = nn.LayerNorm(hidden_size, eps=eps)
118
+ self.cross_attention = QFormerCrossAttention(
119
+ hidden_size, num_heads, attn_bias=attn_bias,
120
+ )
121
+
122
+ # MLP block (feed-forward network)
123
+ self.mlp_norm = nn.LayerNorm(hidden_size, eps=eps)
124
+ self.mlp = QFormerMLP(hidden_size, mlp_hidden_size, mlp_bias=mlp_bias)
125
+
126
+ def forward(self, query_embeds, encoder_hidden_states):
127
+ """
128
+ Args:
129
+ query_embeds: (B, num_queries, hidden_size) - learnable queries
130
+ encoder_hidden_states: (B, num_tokens, hidden_size) - input features
131
+
132
+ Returns:
133
+ (B, num_queries, hidden_size) - output features
134
+ """
135
+ # Cross-attention block with residual and pre-norm
136
+ residual = query_embeds
137
+ hidden_states = self.attn_norm(query_embeds)
138
+ hidden_states = self.cross_attention(hidden_states, encoder_hidden_states)
139
+ hidden_states = residual + hidden_states
140
+
141
+ # MLP block with residual and pre-norm
142
+ residual = hidden_states
143
+ hidden_states = self.mlp_norm(hidden_states)
144
+ hidden_states = self.mlp(hidden_states)
145
+ hidden_states = residual + hidden_states
146
+
147
+ return hidden_states
148
+
149
+
150
+
151
class InterpolateDownsampler(nn.Module):
    """Parameter-free downsampler that resizes the patch grid with ``interpolate``.

    The flat token sequence is viewed as a square ``orig_image_side`` x
    ``orig_image_side`` grid, resized to ``new_image_side`` x ``new_image_side``
    and flattened back to raster order.

    This is an ``nn.Module`` so that instances can be stored in containers such
    as ``nn.ModuleList``, which reject objects that are not modules. It owns no
    parameters or buffers, so state dicts are unaffected.
    """

    def __init__(self, config, mode="area"):
        """
        Args:
            config: Model configuration. Reads ``vision_config.image_size``,
                ``vision_config.patch_size`` and ``downsample_rate`` (anything
                accepted by ``fractions.Fraction``, e.g. ``"1/2"``).
            mode: Interpolation mode forwarded to ``torch.nn.functional.interpolate``.
        """
        super().__init__()
        self.orig_image_side = config.vision_config.image_size // config.vision_config.patch_size
        # Exact rational arithmetic avoids float rounding before truncation.
        self.new_image_side = int(self.orig_image_side * Fraction(config.downsample_rate))
        self.mode = mode

    def forward(self, image_features):
        """
        Args:
            image_features: (B, orig_image_side**2, C) tokens in raster order.

        Returns:
            (B, new_image_side**2, C) tokens in raster order.
        """
        batch_size, _, dim = image_features.size()
        up_shape = [batch_size] + [self.orig_image_side] * 2 + [dim]
        # interpolate expects B,C,H,W
        large_image_permuted = image_features.view(up_shape).permute(0, 3, 1, 2)
        small_image_permuted = torch.nn.functional.interpolate(
            large_image_permuted,
            size=(self.new_image_side, self.new_image_side),
            mode=self.mode,
        )
        # back to B,H*W,C
        return small_image_permuted.permute(0, 2, 3, 1).flatten(1, 2)
169
+
170
+
171
class SpatialOffsetDownsampler:
    """
    2x downsampler that keeps one fixed cell out of every 2x2 block of patches.

    The patch grid is tiled into non-overlapping 2x2 blocks and the cell at the
    configured position is taken from each block, so the output still spans the
    whole image at half the resolution per side.
    """

    def __init__(self, config, offset=0):
        """
        Args:
            config: Model configuration (reads ``vision_config.image_size`` and
                ``vision_config.patch_size``).
            offset: Index into ``self.offsets`` choosing the cell kept from each
                2x2 block: 0 top-left, 1 top-right, 2 bottom-left, 3 bottom-right.
        """
        vision_cfg = config.vision_config
        self.orig_image_side = vision_cfg.image_size // vision_cfg.patch_size
        self.new_image_side = self.orig_image_side // 2  # one kept cell per 2x2 block
        self.offset = offset
        # (row, col) inside a block, in the order documented above.
        self.offsets = [(row, col) for row in (0, 1) for col in (0, 1)]
        self.offset_h, self.offset_w = self.offsets[offset]

    def __call__(self, image_features):
        """
        Pick the configured cell from every 2x2 block.

        Example for a 4x4 grid with offset=0:
            [A B | C D]
            [E F | G H]        [A C]
            [---+---]    ->    [I K]
            [I J | K L]
            [M N | O P]

        Output sequence (raster order): [A, C, I, K]

        Args:
            image_features: Tensor of shape [batch, height*width, hidden_dim]

        Returns:
            Tensor of shape [batch, (height/2)*(width/2), hidden_dim]
        """
        batch_size, _, hidden_dim = image_features.shape
        side = self.orig_image_side
        blocks_per_side = self.new_image_side

        # [batch, height, width, hidden_dim]
        grid = image_features.reshape(batch_size, side, side, hidden_dim)

        # [batch, block_row, row_in_block, block_col, col_in_block, hidden_dim]
        blocks = grid.reshape(batch_size, blocks_per_side, 2, blocks_per_side, 2, hidden_dim)

        # Fix the in-block position; the block axes remain: [batch, block_row, block_col, hidden_dim]
        picked = blocks[:, :, self.offset_h, :, self.offset_w]

        return picked.reshape(batch_size, -1, hidden_dim)
231
+
232
+
233
class SpatialQuadrantDownsampler:
    """
    Downsampler that crops one contiguous quadrant of the patch grid.

    Unlike block-wise sampling, the output covers only a quarter of the image
    area, but neighbouring output tokens are neighbouring patches in the input.
    """

    def __init__(self, config, offset=0):
        """
        Args:
            config: Model configuration (reads ``vision_config.image_size`` and
                ``vision_config.patch_size``).
            offset: Index into ``self.offsets`` choosing the quadrant:
                0 top-left, 1 top-right, 2 bottom-left, 3 bottom-right.
        """
        vision_cfg = config.vision_config
        self.orig_image_side = vision_cfg.image_size // vision_cfg.patch_size
        self.new_image_side = self.orig_image_side // 2  # quadrant side length
        self.offset = offset
        half = self.new_image_side
        # (start_row, start_col) of each quadrant, in the order documented above.
        self.offsets = [(row, col) for row in (0, half) for col in (0, half)]
        self.start_h, self.start_w = self.offsets[offset]

    def __call__(self, image_features):
        """
        Crop the configured quadrant.

        Example for a 4x4 grid with offset=0:
            [A B | C D]
            [E F | G H]        [A B]
            [---+---]    ->    [E F]
            [I J | K L]
            [M N | O P]

        Output sequence (raster order): [A, B, E, F]

        Args:
            image_features: Tensor of shape [batch, height*width, hidden_dim]

        Returns:
            Tensor of shape [batch, (height/2)*(width/2), hidden_dim]
        """
        batch_size, _, hidden_dim = image_features.shape
        side = self.orig_image_side

        # [batch, height, width, hidden_dim]
        grid = image_features.reshape(batch_size, side, side, hidden_dim)

        rows = slice(self.start_h, self.start_h + self.new_image_side)
        cols = slice(self.start_w, self.start_w + self.new_image_side)
        quadrant = grid[:, rows, cols, :]

        return quadrant.reshape(batch_size, -1, hidden_dim)
294
+
295
+
296
+
297
class WindowQFormerDownsampler(nn.Module):
    """Window-wise QFormer that compresses vision tokens and projects them to the LLM width.

    The patch grid is cut into ``window_side`` x ``window_side`` windows. For each
    window, ``query_side ** 2`` queries (a learned query plus the matching cell of
    a downsampled copy of the features) cross-attend to that window's tokens. The
    per-window outputs are stitched back into raster order and mapped to the
    language-model hidden size by ``out_linear``.

    ``config.downsample_rate`` is a ``"q/w"`` string giving ``query_side`` and
    ``window_side``.
    """

    def __init__(self, config, checkerboard_offset=None, use_quadrant_sampling=False):
        """
        Args:
            config: Model configuration. Reads ``text_config.hidden_size``,
                ``vision_config.hidden_size``, ``vision_config.image_size``,
                ``vision_config.patch_size``, ``projector_dropout``,
                ``simplified_qformer`` and ``downsample_rate``.
            checkerboard_offset: ``None`` selects ``InterpolateDownsampler`` for the
                features added to the queries; an integer selects one of the 2x
                spatial samplers and is forwarded to it as ``offset``.
            use_quadrant_sampling: With an integer ``checkerboard_offset``, use
                ``SpatialQuadrantDownsampler`` instead of ``SpatialOffsetDownsampler``.
        """
        super().__init__()
        llm_hidden_size = config.text_config.hidden_size
        vision_hidden_size = config.vision_config.hidden_size
        attention_heads = vision_hidden_size // 64

        # A single dropout module is reused for the embeddings and the output.
        self.dropout = nn.Dropout(config.projector_dropout)

        # Source of the per-query features added to the learned queries.
        if checkerboard_offset is None:
            self.downsampler = InterpolateDownsampler(config)
        elif use_quadrant_sampling:
            # Contiguous quadrant: maximal locality, a quarter of the image.
            self.downsampler = SpatialQuadrantDownsampler(config, offset=checkerboard_offset)
        else:
            # One cell per 2x2 block: whole-image coverage.
            self.downsampler = SpatialOffsetDownsampler(config, offset=checkerboard_offset)

        self.use_simplified_qformer = config.simplified_qformer

        if self.use_simplified_qformer:
            # In-file block: one cross-attention layer followed by an MLP.
            self.qformer = SimplifiedQFormer(
                hidden_size=vision_hidden_size,
                num_heads=attention_heads,
                mlp_hidden_size=3072,
                mlp_bias=True,
                attn_bias=True,
            )
        else:
            # Single-layer BLIP-2 QFormer with cross-attention in every layer.
            self.qformer = Blip2QFormerModel(
                Blip2QFormerConfig(
                    hidden_size=vision_hidden_size,
                    num_attention_heads=attention_heads,
                    intermediate_size=3072,
                    num_hidden_layers=1,
                    encoder_hidden_size=vision_hidden_size,
                    cross_attention_frequency=1,
                    max_position_embeddings=2048,
                    use_qformer_text_input=False,
                )
            )

        self.image_side = config.vision_config.image_size // config.vision_config.patch_size
        self.query_side, self.window_side = map(int, config.downsample_rate.split("/"))
        # Queries per window form a square grid so outputs can be re-tiled into an image.
        self.query_length = self.query_side ** 2
        embed_std = 1 / math.sqrt(vision_hidden_size)
        self.norm = nn.LayerNorm(vision_hidden_size, eps=1e-6)
        self.query = nn.Parameter(torch.randn(1, self.query_length, vision_hidden_size) * embed_std)
        # Learned positions for the tokens of one window; shared by all windows.
        self.image_positions = nn.Parameter(torch.randn(1, self.window_side ** 2, vision_hidden_size) * embed_std)
        self.out_linear = nn.Linear(vision_hidden_size, llm_hidden_size, bias=True)

    def _win(self, x, side, win):
        """
        Split a raster grid into windows.

        (B, side*side, C) -> (B*n*n, win*win, C) with n = side // win. Windows are
        emitted in raster order and tokens inside each window are in raster order.
        """
        batch, _, channels = x.shape
        n = side // win
        grid = x.view(batch, side, side, channels)
        # (B, n, win, n, win, C) -> (B, n, n, win, win, C)
        tiles = grid.view(batch, n, win, n, win, channels).permute(0, 1, 3, 2, 4, 5)
        return tiles.reshape(batch * n * n, win * win, channels)

    def _unwin(self, xw, n, win):
        """
        Inverse of ``_win``: (B*n*n, win*win, C) -> (B, (n*win)^2, C) in raster order.
        """
        total, _, channels = xw.shape
        assert total % (n * n) == 0
        batch = total // (n * n)
        side = n * win
        # (B, n, n, win, win, C) -> (B, n, win, n, win, C)
        grid = xw.view(batch, n, n, win, win, channels).permute(0, 1, 3, 2, 4, 5).contiguous()
        return grid.view(batch, side * side, channels)

    def forward(self, image_features):
        """
        Args:
            image_features: (B, image_side**2, vision_hidden_size) tokens in raster order.

        Returns:
            (B, (n*query_side)**2, llm_hidden_size) with n = image_side // window_side.
        """
        _, num_tokens, _ = image_features.shape
        assert num_tokens == self.image_side * self.image_side
        n = self.image_side // self.window_side

        normed = self.norm(image_features)
        enc = self._win(normed, self.image_side, self.window_side)  # (B*n^2, w^2, C)

        # The downsampled grid must have side n * query_side for the windowing below;
        # the 2x spatial samplers always return side image_side // 2.
        downsampled = self.downsampler(normed)  # (B, new_side^2, C) raster
        downsampled_w = self._win(downsampled, n * self.query_side, self.query_side)  # (B*n^2, q^2, C)

        # self.query and self.image_positions broadcast over the window-batch axis.
        query_embeds = self.query + downsampled_w
        encoder_embeds = enc + self.image_positions

        if self.use_simplified_qformer:
            # Dropout on both streams: queries first, then encoder tokens.
            query_embeds = self.dropout(query_embeds)
            encoder_embeds = self.dropout(encoder_embeds)
            out_w = self.qformer(
                query_embeds=query_embeds,
                encoder_hidden_states=encoder_embeds,
            )  # (B*n^2, q^2, C)
        else:
            # blip already dropouts the queries, so only the encoder tokens get it here.
            encoder_embeds = self.dropout(encoder_embeds)
            out_w = self.qformer(
                query_embeds=query_embeds,
                encoder_hidden_states=encoder_embeds,
                return_dict=True,
            ).last_hidden_state  # (B*n^2, q^2, C)

        out = self._unwin(out_w, n=n, win=self.query_side)  # (B, new_side^2, C) raster

        # Output dropout before the final projection.
        return self.out_linear(self.dropout(out))
426
+
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100257,
4
+ "eos_token_id": 100257,
5
+ "pad_token_id": 100256,
6
+ "transformers_version": "4.57.3"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:656dafec476cf8227a1c77b4f4ce7905776dca3e09c6a9cddc06657edc8b81a9
3
+ size 4963943712
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b04b421e73d926f897cc21ba00bdf7b382f426793322f476d7952db9c0307f75
3
+ size 3544424944
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling.py ADDED
@@ -0,0 +1,955 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+ from fractions import Fraction
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from transformers import (
10
+ AutoConfig,
11
+ AutoModel,
12
+ AutoModelForCausalLM,
13
+ LlavaNextForConditionalGeneration,
14
+ )
15
+ from transformers.activations import ACT2FN
16
+ from transformers.cache_utils import Cache, DynamicCache
17
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
18
+ from transformers.models.granitemoehybrid.modeling_granitemoehybrid import (
19
+ HybridMambaAttentionDynamicCache,
20
+ MoeModelOutputWithPast,
21
+ )
22
+ from transformers.models.llava_next.modeling_llava_next import (
23
+ LlavaNextCausalLMOutputWithPast,
24
+ LlavaNextModelOutputWithPast,
25
+ LlavaNextPreTrainedModel,
26
+ get_anyres_image_grid_shape,
27
+ image_size_to_num_patches,
28
+ unpad_image,
29
+ )
30
+ from transformers.masking_utils import create_causal_mask
31
+ from transformers.modeling_outputs import BaseModelOutputWithPast
32
+ from transformers.processing_utils import Unpack
33
+ from transformers.utils import TransformersKwargs, can_return_tuple, logging
34
+
35
+ from .configuration import Granite4VisionConfig
36
+ from .downsampling import InterpolateDownsampler, WindowQFormerDownsampler
37
+
38
+ IGNORE_INDEX = -100
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextMultiModalProjector
43
+ # Modified to handle vision_layer_to_llm_layer config
44
class Granite4VisionMultiModalProjector(nn.Module):
    """Two-layer MLP mapping vision features to the language-model hidden size.

    ``linear_1`` takes features of width ``vision_config.hidden_size``, i.e. the
    output of a single vision layer (no concatenation of several layers), and
    maps them to ``text_config.hidden_size``; ``linear_2`` keeps that width.
    """

    def __init__(self, config: Granite4VisionConfig):
        """
        Args:
            config: Reads ``vision_config.hidden_size``, ``text_config.hidden_size``,
                ``multimodal_projector_bias`` and ``projector_hidden_act``.
        """
        super().__init__()
        vision_width = config.vision_config.hidden_size
        text_width = config.text_config.hidden_size
        use_bias = config.multimodal_projector_bias

        self.linear_1 = nn.Linear(vision_width, text_width, bias=use_bias)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(text_width, text_width, bias=use_bias)

    def forward(self, image_features):
        """Apply ``linear_2(act(linear_1(image_features)))``."""
        return self.linear_2(self.act(self.linear_1(image_features)))
65
+
66
@dataclass
class Granite4VisionModelOutputWithPast(LlavaNextModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    balancing_loss (`torch.FloatTensor`, *optional*):
        Additional loss value carried next to the standard LLaVA-NeXT outputs; `None` when not set.
        NOTE(review): presumably an auxiliary expert load-balancing loss from the language model - confirm
        where it is populated.
    """

    # Extra field on top of the LLaVA-NeXT output; defaults to None.
    balancing_loss: Optional[torch.FloatTensor] = None
80
+
81
@dataclass
class Granite4VisionCausalLMOutputWithPast(LlavaNextCausalLMOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    balancing_loss (`torch.FloatTensor`, *optional*):
        Additional loss value copied from the base model output; `None` when not set.
        NOTE(review): presumably an auxiliary expert load-balancing loss from the language model - confirm
        where it is populated.
    """

    # Extra field on top of the LLaVA-NeXT causal-LM output; defaults to None.
    balancing_loss: Optional[torch.FloatTensor] = None
95
+
96
+
97
class ParamWrapper(nn.Module):
    """Minimal module that holds a single value under the attribute ``param``.

    Wrapping a bare parameter in a module gives it a module path of its own, so
    it can be addressed by tooling that selects whole modules by name.
    """

    def __init__(self, param):
        """
        Args:
            param: Object to store as ``self.param``. If it is an ``nn.Parameter``,
                standard ``nn.Module`` attribute assignment registers it as a
                parameter of this module.
        """
        super().__init__()
        self.param = param
101
+
102
+ class Granite4VisionForConditionalGeneration(LlavaNextForConditionalGeneration):
103
+ config_class = Granite4VisionConfig
104
+
105
+ def __init__(self, config: Granite4VisionConfig):
106
+ # Update config with pretrained models if specified
107
+ if config.pretrained_vision_tower:
108
+ config.vision_config = AutoConfig.from_pretrained(
109
+ config.pretrained_vision_tower, **config.vision_config.to_dict()
110
+ )
111
+ config.vision_config = (
112
+ config.vision_config.vision_config
113
+ if hasattr(config.vision_config, "vision_config")
114
+ else config.vision_config
115
+ )
116
+ if config.pretrained_language_model:
117
+ config.text_config = AutoConfig.from_pretrained(
118
+ config.pretrained_language_model, **config.text_config.to_dict()
119
+ )
120
+
121
+ # Initialize parent
122
+ LlavaNextPreTrainedModel.__init__(self, config)
123
+
124
+ # Create custom model instance
125
+ self.model = Granite4VisionModel(config)
126
+
127
+ # Create lm_head
128
+ self.lm_head = nn.Linear(
129
+ config.text_config.hidden_size, config.text_config.vocab_size, bias=False
130
+ )
131
+
132
+ # Load pretrained components if specified
133
+ if config.pretrained_vision_tower:
134
+ self._load_pretrained_vision_tower(config)
135
+ config.pretrained_vision_tower = ""
136
+ if config.dave_encoder:
137
+ dave_state_dict = torch.load(config.dave_encoder, map_location="cpu")["model"]
138
+ self.model.vision_tower.vision_model.load_state_dict(dave_state_dict)
139
+
140
+ if config.pretrained_language_model:
141
+ self._load_pretrained_language_model(config)
142
+ config.pretrained_language_model = ""
143
+
144
+ self.post_init()
145
+
146
def _load_pretrained_vision_tower(self, config):
    """Load pretrained vision tower weights

    Instantiates the checkpoint named by ``config.pretrained_vision_tower`` on CPU
    in bfloat16 and copies its weights into the already-built
    ``self.model.vision_tower`` (non-strict), then refreshes
    ``self.config.vision_config`` from the tower's config.

    Args:
        config: Model configuration; ``config.pretrained_vision_tower`` is the
            checkpoint identifier passed to ``AutoModel.from_pretrained``.
    """
    print(f"Loading vision tower from: {config.pretrained_vision_tower}")
    # NOTE(review): the attention backend is hard-coded; presumably this requires
    # flash-attention to be available in the environment - confirm.
    vision_tower = AutoModel.from_pretrained(
        config.pretrained_vision_tower,
        attn_implementation="flash_attention_2",
        device_map="cpu",
        dtype=torch.bfloat16,

    )
    # Cast the existing tower to the dtype the checkpoint was loaded in before copying.
    self.model.vision_tower = self.model.vision_tower.to(torch.bfloat16)
    # strict=False: mismatching keys do not raise; only the missing keys are printed.
    print(self.model.vision_tower.load_state_dict(vision_tower.state_dict(), strict=False).missing_keys)
    self.model.vision_tower.config._attn_implementation = "flash_attention_2"
    # todo: (Avihu) would have done this but afraid - maybe something I'm missing
    # self.model.vision_tower = vision_tower
    # Keep only the vision sub-config when the tower's config nests one.
    self.config.vision_config = (
        self.model.vision_tower.config.vision_config
        if hasattr(self.model.vision_tower.config, "vision_config")
        else self.model.vision_tower.config
    )
166
+
167
def _load_pretrained_language_model(self, config):
    """Load pretrained language model weights

    Instantiates the causal LM named by ``config.pretrained_language_model`` on
    CPU in bfloat16 and adopts its parts directly: the decoder replaces
    ``self.model.language_model`` and its head replaces ``self.lm_head``.
    ``self.config.text_config`` is then pointed at the decoder's config.

    Args:
        config: Model configuration; ``config.pretrained_language_model`` is the
            checkpoint identifier passed to ``AutoModelForCausalLM.from_pretrained``.
    """
    print(f"Loading language model from: {config.pretrained_language_model}")
    # NOTE(review): the attention backend is hard-coded; presumably this requires
    # flash-attention to be available in the environment - confirm.
    language_model = AutoModelForCausalLM.from_pretrained(
        config.pretrained_language_model,
        device_map="cpu",
        attn_implementation="flash_attention_2",
        dtype=torch.bfloat16,
        # use_kernels=True,
    )
    # Grow the embedding table when the image token id lies outside the loaded vocabulary.
    if self.config.image_token_index >= language_model.config.vocab_size:
        language_model.resize_token_embeddings(self.config.image_token_index + 1)
    # load weights in quantized mode with kernels
    # The loaded modules are assigned as-is (no state-dict copy).
    self.model.language_model = language_model.model
    self.lm_head = language_model.lm_head
    # Load weights into the language model inside self.model
    self.config.text_config = self.model.language_model.config
184
+
185
+
186
@can_return_tuple
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    image_sizes: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    vision_feature_layer: Optional[Union[int, list[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    spatial_shapes: Optional[torch.LongTensor] = None,
    pixel_attention_mask: Optional[torch.Tensor] = None,
    **kwargs: Unpack[TransformersKwargs],
) -> Union[tuple, Granite4VisionCausalLMOutputWithPast]:
    """Run the multimodal backbone and the LM head.

    Arguments left as ``None`` for ``output_attentions``, ``output_hidden_states``,
    ``vision_feature_layer`` and ``vision_feature_select_strategy`` fall back to
    the values on ``self.config``.

    With ``labels``, logits are computed only for positions whose next-token
    label differs from ``IGNORE_INDEX``, so the returned ``logits`` are 2-D
    ``(num_supervised_positions, vocab_size)`` and ``logits_to_keep`` is not
    used. Without ``labels``, logits are computed for the positions selected by
    ``logits_to_keep`` and ``loss`` is ``None``. In both cases logits are divided
    by ``config.text_config.logits_scaling``.
    """
    cfg = self.config
    if output_attentions is None:
        output_attentions = cfg.output_attentions
    if output_hidden_states is None:
        output_hidden_states = cfg.output_hidden_states
    if vision_feature_layer is None:
        vision_feature_layer = cfg.vision_feature_layer
    if vision_feature_select_strategy is None:
        vision_feature_select_strategy = cfg.vision_feature_select_strategy

    outputs = self.model(
        input_ids,
        pixel_values=pixel_values,
        image_sizes=image_sizes,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=True,
        cache_position=cache_position,
        spatial_shapes=spatial_shapes,
        pixel_attention_mask=pixel_attention_mask,
        **kwargs,
    )
    hidden_states = outputs.last_hidden_state

    if labels is None:
        # Generation / inference path.
        loss = None
        keep = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, keep, :])
        logits = logits / self.config.text_config.logits_scaling
    else:
        # Next-token alignment: hidden state t predicts label t + 1.
        shifted_hidden = hidden_states[:, :-1, :].contiguous()
        shifted_labels = labels[:, 1:].contiguous()
        supervised = shifted_labels != IGNORE_INDEX
        # Project only the supervised positions through the LM head.
        target_ids = shifted_labels[supervised]
        logits = self.lm_head(shifted_hidden[supervised])
        logits = logits / self.config.text_config.logits_scaling
        # The labels are already shifted; passing them as shift_labels skips the internal shift.
        loss = self.loss_function(
            logits,
            target_ids,
            vocab_size=self.config.text_config.vocab_size,
            shift_labels=target_ids,
            **kwargs,
        )

    return Granite4VisionCausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        image_hidden_states=outputs.image_hidden_states,
        balancing_loss=outputs.balancing_loss,
    )
279
+
280
def prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values=None,
    inputs_embeds=None,
    pixel_values=None,
    image_sizes=None,
    attention_mask=None,
    cache_position=None,
    logits_to_keep=None,
    **kwargs,
):
    """Build the per-step model inputs for generation.

    Text inputs are prepared by the parent implementation and, for a
    ``granitemoehybrid`` language model, post-processed by
    ``prepare_inputs_for_generation_granite_moe``. Image inputs are attached only
    on the first step (``cache_position[0] == 0``); on cached decoding steps the
    input ids no longer contain the special image token, so they are left out.
    """
    text_arguments = {
        "past_key_values": past_key_values,
        "inputs_embeds": inputs_embeds,
        "attention_mask": attention_mask,
        "cache_position": cache_position,
        "logits_to_keep": logits_to_keep,
    }
    model_inputs = super().prepare_inputs_for_generation(input_ids, **text_arguments, **kwargs)

    if self.config.text_config.model_type == "granitemoehybrid":
        model_inputs = self.prepare_inputs_for_generation_granite_moe(**model_inputs)

    is_first_step = cache_position[0] == 0
    if is_first_step:
        model_inputs["pixel_values"] = pixel_values
        model_inputs["image_sizes"] = image_sizes

    return model_inputs
312
+
313
+ # Avihu: would have used the GraniteMoeSharedForCausalLM method, but we don't store this object anymore (split the model / lm head)
314
def prepare_inputs_for_generation_granite_moe(
    self,
    input_ids,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    cache_position=None,
    position_ids=None,
    use_cache=True,
    **kwargs,
):
    """Generation-input preparation for the hybrid language model.

    Needed because that model uses its own cache type,
    ``HybridMambaAttentionDynamicCache``, which is created here on the first
    step when ``use_cache`` is set.

    Returns:
        dict with ``input_ids`` or ``inputs_embeds``, then ``position_ids``,
        ``past_key_values``, ``use_cache``, ``attention_mask``, ``cache_position``
        and any extra ``kwargs`` whose key is not already present.
    """
    # Note: (Avihu) in transformers v4, the past_key_values is already an empty DynamicCache object,
    # so an un-filled DynamicCache counts as "no cache" too.
    empty_past_kv = past_key_values is None or (
        isinstance(past_key_values, DynamicCache) and past_key_values[0][0] is None
    )
    has_cache = not empty_past_kv

    if has_cache:
        # Keep only the tokens that have not been processed yet.
        # - inputs_embeds given: input_ids may be missing entries
        # - synced GPUs: cache_position may run past input_ids; only a dummy token is wanted
        #   (this cannot be checked while compiling)
        # - lengths already equal: another generation method sliced input_ids, nothing to do
        if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:
            input_ids = input_ids[:, -cache_position.shape[0] :]
        elif input_ids.shape[1] != cache_position.shape[0]:
            input_ids = input_ids[:, cache_position]
    elif use_cache:
        past_key_values = HybridMambaAttentionDynamicCache(
            self.model.language_model.config, input_ids.shape[0], self.dtype, device=self.device
        )

    if attention_mask is not None and position_ids is None:
        # Derive positions from the mask for batched generation; masked slots get position 1.
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if has_cache:
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # inputs_embeds are only used on the first generation step.
    if inputs_embeds is not None and empty_past_kv:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        # `contiguous()` needed for compilation use cases
        model_inputs = {"input_ids": input_ids.contiguous()}

    model_inputs["position_ids"] = position_ids
    model_inputs["past_key_values"] = past_key_values
    model_inputs["use_cache"] = use_cache
    model_inputs["attention_mask"] = attention_mask
    model_inputs["cache_position"] = cache_position

    # Pass through every remaining kwarg without overriding the entries set above.
    for key, value in kwargs.items():
        model_inputs.setdefault(key, value)

    return model_inputs
376
+
377
+
378
+ class Granite4VisionModel(LlavaNextPreTrainedModel):
379
+ config_class = Granite4VisionConfig
380
+
381
+ def __init__(self, config: Granite4VisionConfig):
382
+ super().__init__(config)
383
+ self.vision_tower = AutoModel.from_config(config.vision_config)
384
+ self.multi_modal_projector = None
385
+ # Multi-layer vision injection: create list of projectors if enabled
386
+ assert config.vision_layer_to_llm_layer is not None
387
+ assert config.downsample_rate is not None
388
+ # Downsampler(s) - create multiple if using multi-layer vision injection
389
+ self.downsampler = None
390
+ self.downsample_rate = config.downsample_rate
391
+
392
+ # Create separate downsampler for each (vision_layer, llm_layer) pair
393
+ num_projections = len(config.vision_layer_to_llm_layer)
394
+ downsamplers = []
395
+ for _ in range(num_projections):
396
+ if config.downsample_method in ["interpolate", "bilinear"]:
397
+ downsamplers.append(InterpolateDownsampler(config))
398
+ elif config.downsample_method == "window_qformer":
399
+ downsamplers.append(WindowQFormerDownsampler(config))
400
+ self.downsampler = nn.ModuleList(downsamplers)
401
+
402
+ # Checkerboard sampling projectors
403
+ self.multi_modal_projector = None
404
+ if config.use_checkerboard_sampling:
405
+ # Create 4 WindowQFormer projectors for the 4 spatial sampling groups
406
+ use_quadrant = getattr(config, 'use_quadrant_sampling', False)
407
+ self.multi_modal_projector = nn.ModuleList([
408
+ WindowQFormerDownsampler(config, checkerboard_offset=i, use_quadrant_sampling=use_quadrant)
409
+ for i in range(4)
410
+ ])
411
+
412
+ self.image_newline = None
413
+ if config.use_image_newline_parameter:
414
+ embed_std = 1 / math.sqrt(config.text_config.hidden_size)
415
+ image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std)
416
+ self.model_type = config.model_type
417
+ if self.model_type in ["gpt_vision", "granite4_vision"]:
418
+ # this hack allows to do lora training from scratch, so image_newline would be in modules_to_keep
419
+ self.image_newline = ParamWrapper(image_newline)
420
+ else:
421
+ self.image_newline = image_newline
422
+ self.vocab_size = config.text_config.vocab_size
423
+
424
+ # with init_empty_weights(): # Avihu: hack to load the model faster
425
+ self.language_model = AutoModel.from_config(config.text_config)
426
+
427
+ self.post_init()
428
+
429
+ def get_input_embeddings(self):
430
+ return self.language_model.get_input_embeddings()
431
+
432
+ def set_input_embeddings(self, value):
433
+ self.language_model.set_input_embeddings(value)
434
+
435
+ def set_decoder(self, decoder):
436
+ self.language_model = decoder
437
+
438
+ def get_decoder(self):
439
+ return self.language_model
440
+
441
+
442
+ def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
443
+ """
444
+ Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
445
+
446
+ Args:
447
+ image_features (`list[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
448
+ List of image feature tensor, each contains all the visual feature of all patches.
449
+ image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
450
+ Actual image size of each images (H, W).
451
+ vision_feature_select_strategy (`str`)
452
+ The feature selection strategy used to select the vision feature from the vision backbone.
453
+ image_newline (`torch.Tensor` of shape `(embed_dim)`)
454
+ New line embedding vector.
455
+ Returns:
456
+ image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
457
+ feature_lens (`list[int]`)
458
+ token length of each image in image_features
459
+ """
460
+ new_image_features = []
461
+ feature_lens = []
462
+ for image_idx, image_feature in enumerate(image_features):
463
+ if image_feature.shape[0] > 1:
464
+ base_image_feature = image_feature[0]
465
+ image_feature = image_feature[1:]
466
+ height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
467
+
468
+ num_patch_height, num_patch_width = get_anyres_image_grid_shape(
469
+ image_sizes[image_idx],
470
+ self.config.image_grid_pinpoints,
471
+ self.config.vision_config.image_size,
472
+ )
473
+ if self.downsampler is not None:
474
+ ds_rate = Fraction(self.downsample_rate)
475
+ height = int(height * ds_rate)
476
+ width = int(width * ds_rate)
477
+
478
+ if (
479
+ np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
480
+ and vision_feature_select_strategy == "default"
481
+ ):
482
+ logger.warning_once(
483
+ "Image feature shape does not line up with the provided patch size. "
484
+ "You may be using the `default` vision_feature_select_strategy with a"
485
+ " visual encoder that does not have CLS."
486
+ )
487
+
488
+ image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
489
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
490
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
491
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
492
+ if image_newline is not None:
493
+ image_feature = torch.cat(
494
+ (
495
+ image_feature,
496
+ image_newline[:, None, None]
497
+ .expand(*image_feature.shape[:-1], 1)
498
+ .to(image_feature.device, image_feature.dtype),
499
+ ),
500
+ dim=-1,
501
+ )
502
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
503
+ image_feature = torch.cat((base_image_feature, image_feature), dim=0)
504
+ else:
505
+ image_feature = image_feature[0]
506
+ if image_newline is not None:
507
+ image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
508
+ new_image_features.append(image_feature)
509
+ feature_lens.append(image_feature.size(0))
510
+ feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
511
+ return new_image_features, feature_lens
512
+
513
+ def get_image_features(
514
+ self,
515
+ pixel_values: torch.FloatTensor,
516
+ image_sizes: torch.Tensor,
517
+ vision_feature_layer: Optional[Union[int, list[int]]] = None,
518
+ vision_feature_select_strategy: Optional[str] = None,
519
+ ):
520
+ """
521
+ Obtains image last hidden states from the vision tower and apply multimodal projection.
522
+
523
+ Args:
524
+ pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`)
525
+ The tensors corresponding to the input images.
526
+ image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
527
+ Actual image size of each images (H, W).
528
+ vision_feature_layer (`Union[int, list[int]]`, *optional*):
529
+ The index of the layer to select the vision feature. If multiple indices are provided,
530
+ the vision feature of the corresponding indices will be concatenated to form the
531
+ vision features.
532
+ vision_feature_select_strategy (`str`, *optional*):
533
+ The feature selection strategy used to select the vision feature from the vision backbone.
534
+ Can be one of `"default"` or `"full"`
535
+ Returns:
536
+ When vision_layer_to_llm_layer is None:
537
+ image_features (list[`torch.Tensor`]): List of image feature tensor, each contains all
538
+ the visual feature of all patches and are of shape `(num_patches, image_length, embed_dim)`)
539
+ When vision_layer_to_llm_layer is set:
540
+ dict mapping vision layer index → list of image features for that layer
541
+ """
542
+ # Multi-layer vision injection mode
543
+ if self.config.vision_layer_to_llm_layer is not None:
544
+ return self._get_image_features_multi_layer(
545
+ pixel_values, image_sizes, vision_feature_select_strategy
546
+ )
547
+
548
+
549
+ def _get_image_features_multi_layer(
550
+ self,
551
+ pixel_values: torch.FloatTensor,
552
+ image_sizes: torch.Tensor,
553
+ vision_feature_select_strategy: Optional[str] = None,
554
+ ):
555
+ """
556
+ Extract and process multiple vision encoder layers separately.
557
+
558
+ Returns:
559
+ dict: Maps vision layer index (0, 1, ...) to list of image features for that layer
560
+ """
561
+ vision_feature_select_strategy = (
562
+ vision_feature_select_strategy
563
+ if vision_feature_select_strategy is not None
564
+ else self.config.vision_feature_select_strategy
565
+ )
566
+
567
+ # Infer image_num_patches from image_sizes
568
+ image_num_patches = [
569
+ image_size_to_num_patches(
570
+ image_size=imsize,
571
+ grid_pinpoints=self.config.image_grid_pinpoints,
572
+ patch_size=self.config.vision_config.image_size,
573
+ )
574
+ for imsize in image_sizes
575
+ ]
576
+
577
+ image_newline = self.image_newline.param if self.model_type in ["gpt_vision", "granite4_vision"] else self.image_newline
578
+ # Process pixel values
579
+ if pixel_values.dim() == 5:
580
+ _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)]
581
+ pixel_values = torch.cat(_pixel_values_list, dim=0)
582
+ elif pixel_values.dim() != 4:
583
+ raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
584
+
585
+ # Get all hidden states from vision tower
586
+ vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
587
+
588
+ # Process each (vision_layer, llm_layer) pair separately
589
+ # Store as list of tuples: [(llm_layer, packed_features), ...]
590
+ multi_layer_features = []
591
+ for projection_idx, (vision_layer, llm_layer) in enumerate(self.config.vision_layer_to_llm_layer):
592
+ # Extract features from this vision layer
593
+ selected_feature = vision_outputs.hidden_states[vision_layer]
594
+
595
+ # Apply feature selection strategy (remove CLS if needed)
596
+ if vision_feature_select_strategy == "default":
597
+ selected_feature = selected_feature[:, 1:]
598
+
599
+ # Apply projection-specific downsampler if configured
600
+ projected_features = self.downsampler[projection_idx](selected_feature)
601
+
602
+ # Split by image
603
+ projected_features = torch.split(projected_features, image_num_patches, dim=0)
604
+
605
+ # Pack image features
606
+ packed_features, feature_lens = self.pack_image_features(
607
+ projected_features,
608
+ image_sizes,
609
+ vision_feature_select_strategy=vision_feature_select_strategy,
610
+ image_newline=image_newline,
611
+ )
612
+
613
+ # Store as tuple (llm_layer, packed_features)
614
+ multi_layer_features.append((llm_layer, packed_features))
615
+
616
+ # Process checkerboard sampling if enabled
617
+ if self.config.use_checkerboard_sampling:
618
+ # Extract the specified vision layer for checkerboard sampling
619
+ checkerboard_feature = vision_outputs.hidden_states[self.config.checkerboard_vision_layer]
620
+
621
+ # Apply feature selection strategy (remove CLS if needed)
622
+ if vision_feature_select_strategy == "default":
623
+ checkerboard_feature = checkerboard_feature[:, 1:]
624
+
625
+ # Process each checkerboard offset with its own WindowQFormer projector
626
+ for group_idx, llm_layer in enumerate(self.config.checkerboard_llm_layers):
627
+ # Apply WindowQFormer with checkerboard downsampling
628
+ # The projector handles both downsampling and projection
629
+ projected_group = self.multi_modal_projector[group_idx](checkerboard_feature)
630
+
631
+ # Split by image
632
+ projected_group_split = torch.split(projected_group, image_num_patches, dim=0)
633
+
634
+ # Pack image features
635
+ packed_group, _ = self.pack_image_features(
636
+ projected_group_split,
637
+ image_sizes,
638
+ vision_feature_select_strategy=vision_feature_select_strategy,
639
+ image_newline=image_newline,
640
+ )
641
+
642
+ # Add to multi_layer_features as tuple
643
+ multi_layer_features.append((llm_layer, packed_group))
644
+
645
+ return multi_layer_features
646
+
647
+ def get_placeholder_mask(
648
+ self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
649
+ ):
650
+ """
651
+ Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
652
+ equal to the length of multimodal features. If the lengths are different, an error is raised.
653
+ """
654
+ if input_ids is None:
655
+ special_image_mask = inputs_embeds == self.get_input_embeddings()(
656
+ torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
657
+ )
658
+ special_image_mask = special_image_mask.all(-1)
659
+ else:
660
+ special_image_mask = input_ids == self.config.image_token_id
661
+
662
+ n_image_tokens = special_image_mask.sum()
663
+ special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
664
+ if inputs_embeds[special_image_mask].numel() != image_features.numel():
665
+ raise ValueError(
666
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
667
+ )
668
+ return special_image_mask
669
+
670
+ @can_return_tuple
671
+ def forward(
672
+ self,
673
+ input_ids: Optional[torch.LongTensor] = None,
674
+ pixel_values: Optional[torch.FloatTensor] = None,
675
+ image_sizes: Optional[torch.LongTensor] = None,
676
+ attention_mask: Optional[torch.Tensor] = None,
677
+ position_ids: Optional[torch.LongTensor] = None,
678
+ past_key_values: Optional[Cache] = None,
679
+ inputs_embeds: Optional[torch.FloatTensor] = None,
680
+ vision_feature_layer: Optional[Union[int, list[int]]] = None,
681
+ vision_feature_select_strategy: Optional[str] = None,
682
+ use_cache: Optional[bool] = None,
683
+ output_attentions: Optional[bool] = None,
684
+ output_hidden_states: Optional[bool] = None,
685
+ return_dict: Optional[bool] = None,
686
+ cache_position: Optional[torch.LongTensor] = None,
687
+ spatial_shapes: Optional[torch.LongTensor] = None,
688
+ pixel_attention_mask: Optional[torch.Tensor] = None,
689
+ **kwargs: Unpack[FlashAttentionKwargs],
690
+ ) -> Union[tuple, Granite4VisionModelOutputWithPast]:
691
+ r"""
692
+ vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
693
+ The feature selection strategy used to select the vision feature from the vision backbone.
694
+ Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
695
+ If `"full"`, the full vision features are used.
696
+ """
697
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
698
+ output_hidden_states = (
699
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
700
+ )
701
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
702
+ vision_feature_layer = (
703
+ vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
704
+ )
705
+ vision_feature_select_strategy = (
706
+ vision_feature_select_strategy
707
+ if vision_feature_select_strategy is not None
708
+ else self.config.vision_feature_select_strategy
709
+ )
710
+
711
+ if (input_ids is None) ^ (inputs_embeds is not None):
712
+ print(input_ids, inputs_embeds, position_ids, pixel_values, image_sizes, kwargs, )
713
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
714
+
715
+ if inputs_embeds is None:
716
+ inputs_embeds = self.get_input_embeddings()(input_ids)
717
+
718
+ # Initialize variables for multi-layer vision injection
719
+ # List of tuples: [(llm_layer, concat_features), ...]
720
+ layerwise_image_features = []
721
+ vision_mask = None
722
+ ran_dummy_pass = False
723
+ if (pixel_values is not None and pixel_values.size(0) > 0) or (pixel_values is None and torch.is_grad_enabled()):
724
+ if pixel_values is not None and pixel_values.size(0) > 0:
725
+ image_features = self.get_image_features(
726
+ pixel_values,
727
+ image_sizes,
728
+ vision_feature_layer=vision_feature_layer,
729
+ vision_feature_select_strategy=vision_feature_select_strategy,
730
+ )
731
+ else:
732
+ image_features = self.run_dummy_encoder_forward(
733
+ inputs_embeds, vision_feature_layer, vision_feature_select_strategy
734
+ )
735
+ ran_dummy_pass = True
736
+
737
+ # Multi-layer vision injection (image_features is a list of tuples: [(llm_layer, packed_features), ...])
738
+ # - Zero out image token positions in inputs_embeds (will be filled by LLM forward pass)
739
+ # - Store compact concat_features for each projection
740
+ for idx, (llm_layer_idx, packed_features) in enumerate(image_features):
741
+ # Concatenate all image features for this projection
742
+ concat_features = torch.cat(packed_features, dim=0).to(
743
+ inputs_embeds.device, inputs_embeds.dtype
744
+ )
745
+ if idx == 0:
746
+ if ran_dummy_pass:
747
+ vision_mask = torch.zeros_like(inputs_embeds).bool()
748
+ vision_mask[:, :concat_features.shape[0]] = True
749
+ else:
750
+ vision_mask = self.get_placeholder_mask(
751
+ input_ids, inputs_embeds=inputs_embeds, image_features=concat_features
752
+ )
753
+ inputs_embeds = inputs_embeds.masked_fill(vision_mask, 0.0)
754
+ # Store as tuple (llm_layer, concat_features)
755
+ layerwise_image_features.append((llm_layer_idx, concat_features))
756
+ # Dispatch to model-specific forward
757
+ model_type = self.config.text_config.model_type
758
+ try:
759
+ if model_type == "granitemoehybrid":
760
+ outputs = self._forward_granitemoehybrid(
761
+ inputs_embeds, attention_mask, position_ids, past_key_values,
762
+ use_cache, output_attentions, output_hidden_states, cache_position,
763
+ layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
764
+ )
765
+ elif model_type == "granite":
766
+ outputs = self._forward_granite(
767
+ inputs_embeds, attention_mask, position_ids, past_key_values,
768
+ use_cache, output_attentions, output_hidden_states, cache_position,
769
+ layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
770
+ )
771
+ else:
772
+ raise ValueError(f"Unsupported text model type: {model_type}")
773
+ except Exception as e:
774
+ print(e)
775
+ print(attention_mask)
776
+ print(position_ids)
777
+ print(inputs_embeds)
778
+ print(input_ids)
779
+ print(kwargs)
780
+ raise e
781
+ return Granite4VisionModelOutputWithPast(
782
+ last_hidden_state=outputs.last_hidden_state,
783
+ past_key_values=outputs.past_key_values,
784
+ hidden_states=outputs.hidden_states,
785
+ attentions=outputs.attentions,
786
+ image_hidden_states=image_features if pixel_values is not None else None,
787
+ )
788
+
789
+ def _inject_vision_features(self, hidden_states, layer_idx, layerwise_image_features, vision_mask, ran_dummy_pass):
790
+ """Inject vision features at the specified layer via masked_scatter."""
791
+ for llm_layer, image_features_for_layer in layerwise_image_features:
792
+ if layer_idx == llm_layer:
793
+ if ran_dummy_pass:
794
+ image_features_for_layer = image_features_for_layer[:hidden_states.shape[1]]
795
+ hidden_states = hidden_states.masked_scatter(
796
+ vision_mask,
797
+ (hidden_states[vision_mask] + image_features_for_layer.flatten()).view(-1)
798
+ )
799
+ return hidden_states
800
+
801
+ def _forward_granitemoehybrid(
802
+ self, inputs_embeds, attention_mask, position_ids, past_key_values,
803
+ use_cache, output_attentions, output_hidden_states, cache_position,
804
+ layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
805
+ ):
806
+ hidden_states = inputs_embeds * self.language_model.embedding_multiplier
807
+
808
+ if cache_position is None:
809
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
810
+ cache_position = torch.arange(
811
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
812
+ )
813
+ if position_ids is None:
814
+ position_ids = cache_position.unsqueeze(0)
815
+
816
+ causal_mask = self.language_model._update_causal_mask(
817
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
818
+ )
819
+ mamba_mask = self.language_model._update_mamba_mask(attention_mask, cache_position)
820
+
821
+ position_embeddings = None
822
+ if self.language_model.rotary_emb is not None:
823
+ position_embeddings = self.language_model.rotary_emb(hidden_states, position_ids)
824
+
825
+ all_hidden_states = () if output_hidden_states else None
826
+ all_self_attns = () if output_attentions else None
827
+ all_router_logits = () if kwargs.get('output_router_logits', False) else None
828
+
829
+ for layer_idx, decoder_layer in enumerate(self.language_model.layers):
830
+ hidden_states = self._inject_vision_features(
831
+ hidden_states, layer_idx, layerwise_image_features, vision_mask, ran_dummy_pass
832
+ )
833
+
834
+ layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask
835
+
836
+ if output_hidden_states:
837
+ all_hidden_states += (hidden_states,)
838
+
839
+ layer_outputs = decoder_layer(
840
+ hidden_states,
841
+ attention_mask=layer_mask,
842
+ past_key_values=past_key_values,
843
+ output_attentions=output_attentions,
844
+ use_cache=use_cache,
845
+ cache_position=cache_position,
846
+ output_router_logits=kwargs.get('output_router_logits', False),
847
+ position_embeddings=position_embeddings,
848
+ **kwargs,
849
+ )
850
+
851
+ hidden_states = layer_outputs[0]
852
+
853
+ if output_attentions:
854
+ if layer_outputs[1] is not None:
855
+ all_self_attns += (layer_outputs[1],)
856
+
857
+ if kwargs.get('output_router_logits', False):
858
+ if layer_outputs[-1] is not None:
859
+ all_router_logits += (layer_outputs[-1],)
860
+
861
+ hidden_states = self.language_model.norm(hidden_states)
862
+
863
+ if output_hidden_states:
864
+ all_hidden_states += (hidden_states,)
865
+
866
+ if past_key_values and not past_key_values.has_previous_state:
867
+ past_key_values.has_previous_state = True
868
+
869
+ return MoeModelOutputWithPast(
870
+ last_hidden_state=hidden_states,
871
+ past_key_values=past_key_values,
872
+ hidden_states=all_hidden_states,
873
+ attentions=all_self_attns,
874
+ router_logits=all_router_logits,
875
+ )
876
+
877
+ def _forward_granite(
878
+ self, inputs_embeds, attention_mask, position_ids, past_key_values,
879
+ use_cache, output_attentions, output_hidden_states, cache_position,
880
+ layerwise_image_features, vision_mask, ran_dummy_pass, **kwargs,
881
+ ):
882
+ hidden_states = inputs_embeds * self.language_model.embedding_multiplier
883
+
884
+ if use_cache and past_key_values is None:
885
+ past_key_values = DynamicCache(config=self.language_model.config)
886
+
887
+ if cache_position is None:
888
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
889
+ cache_position = torch.arange(
890
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
891
+ )
892
+ if position_ids is None:
893
+ position_ids = cache_position.unsqueeze(0)
894
+
895
+ causal_mask = create_causal_mask(
896
+ config=self.language_model.config,
897
+ input_embeds=inputs_embeds,
898
+ attention_mask=attention_mask,
899
+ cache_position=cache_position,
900
+ past_key_values=past_key_values,
901
+ position_ids=position_ids,
902
+ )
903
+
904
+ position_embeddings = self.language_model.rotary_emb(hidden_states, position_ids)
905
+
906
+ all_hidden_states = () if output_hidden_states else None
907
+ all_self_attns = () if output_attentions else None
908
+
909
+ for layer_idx, decoder_layer in enumerate(self.language_model.layers):
910
+ hidden_states = self._inject_vision_features(
911
+ hidden_states, layer_idx, layerwise_image_features, vision_mask, ran_dummy_pass
912
+ )
913
+
914
+ if output_hidden_states:
915
+ all_hidden_states += (hidden_states,)
916
+
917
+ layer_outputs = decoder_layer(
918
+ hidden_states,
919
+ attention_mask=causal_mask,
920
+ position_ids=position_ids,
921
+ past_key_values=past_key_values,
922
+ output_attentions=output_attentions,
923
+ use_cache=use_cache,
924
+ cache_position=cache_position,
925
+ position_embeddings=position_embeddings,
926
+ **kwargs,
927
+ )
928
+
929
+ hidden_states = layer_outputs[0]
930
+
931
+ if output_attentions:
932
+ all_self_attns += (layer_outputs[1],)
933
+
934
+ hidden_states = self.language_model.norm(hidden_states)
935
+
936
+ if output_hidden_states:
937
+ all_hidden_states += (hidden_states,)
938
+
939
+ return BaseModelOutputWithPast(
940
+ last_hidden_state=hidden_states,
941
+ past_key_values=past_key_values if use_cache else None,
942
+ hidden_states=all_hidden_states,
943
+ attentions=all_self_attns,
944
+ )
945
+
946
+ def run_dummy_encoder_forward(self, inputs_embeds, vision_feature_layer, vision_feature_select_strategy):
947
+ print("no pixel values, using dummy data to get grads")
948
+ dummy_data = torch.zeros((3, 3, 384, 384), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
949
+ dummy_sizes = torch.tensor([[768, 384]], device=inputs_embeds.device)
950
+ image_features = self.get_image_features(
951
+ dummy_data, dummy_sizes,
952
+ vision_feature_layer=vision_feature_layer,
953
+ vision_feature_select_strategy=vision_feature_select_strategy
954
+ )
955
+ return [(k, [v[0] * 0]) for k, v in image_features]
preprocessor_config.json ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 384,
4
+ "width": 384
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_pad": true,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "image_grid_pinpoints": [
13
+ [
14
+ 384,
15
+ 384
16
+ ],
17
+ [
18
+ 384,
19
+ 768
20
+ ],
21
+ [
22
+ 384,
23
+ 1152
24
+ ],
25
+ [
26
+ 384,
27
+ 1536
28
+ ],
29
+ [
30
+ 384,
31
+ 1920
32
+ ],
33
+ [
34
+ 384,
35
+ 2304
36
+ ],
37
+ [
38
+ 384,
39
+ 2688
40
+ ],
41
+ [
42
+ 384,
43
+ 3072
44
+ ],
45
+ [
46
+ 384,
47
+ 3456
48
+ ],
49
+ [
50
+ 384,
51
+ 3840
52
+ ],
53
+ [
54
+ 768,
55
+ 384
56
+ ],
57
+ [
58
+ 768,
59
+ 768
60
+ ],
61
+ [
62
+ 768,
63
+ 1152
64
+ ],
65
+ [
66
+ 768,
67
+ 1536
68
+ ],
69
+ [
70
+ 768,
71
+ 1920
72
+ ],
73
+ [
74
+ 1152,
75
+ 384
76
+ ],
77
+ [
78
+ 1152,
79
+ 768
80
+ ],
81
+ [
82
+ 1152,
83
+ 1152
84
+ ],
85
+ [
86
+ 1536,
87
+ 384
88
+ ],
89
+ [
90
+ 1536,
91
+ 768
92
+ ],
93
+ [
94
+ 1920,
95
+ 384
96
+ ],
97
+ [
98
+ 1920,
99
+ 768
100
+ ],
101
+ [
102
+ 2304,
103
+ 384
104
+ ],
105
+ [
106
+ 2688,
107
+ 384
108
+ ],
109
+ [
110
+ 3072,
111
+ 384
112
+ ],
113
+ [
114
+ 3456,
115
+ 384
116
+ ],
117
+ [
118
+ 3840,
119
+ 384
120
+ ]
121
+ ],
122
+ "image_mean": [
123
+ 0.5,
124
+ 0.5,
125
+ 0.5
126
+ ],
127
+ "image_processor_type": "LlavaNextImageProcessor",
128
+ "image_std": [
129
+ 0.5,
130
+ 0.5,
131
+ 0.5
132
+ ],
133
+ "processor_class": "Granite4VisionProcessor",
134
+ "auto_map": {
135
+ "AutoProcessor": "processing.Granite4VisionProcessor"
136
+ },
137
+ "resample": 3,
138
+ "rescale_factor": 0.00392156862745098,
139
+ "size": {
140
+ "height": 384,
141
+ "width": 384
142
+ },
143
+ "window_side": 8
144
+ }
processing.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fractions import Fraction
2
+
3
+ from transformers import LlavaNextProcessor
4
+ from transformers.image_processing_utils import select_best_resolution
5
+
6
+
7
+
8
class Granite4VisionProcessor(LlavaNextProcessor):
    """LLaVA-NeXT processor whose image-token count accounts for an optional downsample rate."""

    model_type = "granite4_vision"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        patch_size=None,
        vision_feature_select_strategy=None,
        chat_template=None,
        image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
        num_additional_image_tokens=0,
        downsample_rate=None,
        **kwargs,
    ):
        """Initialise the base processor and remember `downsample_rate`.

        Extra `**kwargs` are accepted but not forwarded to the base class.
        """
        base_arguments = dict(
            image_processor=image_processor,
            tokenizer=tokenizer,
            patch_size=patch_size,
            vision_feature_select_strategy=vision_feature_select_strategy,
            chat_template=chat_template,
            image_token=image_token,
            num_additional_image_tokens=num_additional_image_tokens,
        )
        super().__init__(**base_arguments)
        self.downsample_rate = downsample_rate

    def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
        """Return the number of image tokens for an image of the given original and patch size.

        The per-patch grid (`height // patch_size` by `width // patch_size`) is scaled by
        `downsample_rate` (parsed with `Fraction`, truncated to int) when one is set.
        """
        best_height, best_width = select_best_resolution(
            [orig_height, orig_width], self.image_processor.image_grid_pinpoints
        )
        scale_height = best_height // height
        scale_width = best_width // width

        patches_height = height // self.patch_size
        patches_width = width // self.patch_size
        if self.downsample_rate is not None:
            # todo: maybe add an assertion that it divides nicely?
            ratio = Fraction(self.downsample_rate)
            patches_height = int(patches_height * ratio)
            patches_width = int(patches_width * ratio)

        unpadded_features, newline_features = self._get_unpadded_features(
            orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
        )
        # The base patch covers the entire image, plus any additional image tokens.
        base_features = patches_height * patches_width + self.num_additional_image_tokens
        return unpadded_features + newline_features + base_features
processor_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "downsample_rate": "4/8",
3
+ "image_token": "<image>",
4
+ "num_additional_image_tokens": 0,
5
+ "patch_size": 16,
6
+ "processor_class": "Granite4VisionProcessor",
7
+ "auto_map": {
8
+ "AutoProcessor": "processing.Granite4VisionProcessor"
9
+ },
10
+ "vision_feature_select_strategy": "full"
11
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<image>"
4
+ ],
5
+ "bos_token": {
6
+ "content": "<|end_of_text|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eos_token": {
13
+ "content": "<|end_of_text|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "pad_token": {
20
+ "content": "<|pad|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<|unk|>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,796 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "100256": {
6
+ "content": "<|pad|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "100257": {
14
+ "content": "<|end_of_text|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "100258": {
22
+ "content": "<|fim_prefix|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "100259": {
30
+ "content": "<|fim_middle|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "100260": {
38
+ "content": "<|fim_suffix|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "100261": {
46
+ "content": "<|fim_pad|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "100262": {
54
+ "content": "<|filename|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "100263": {
62
+ "content": "<|reponame|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "100264": {
70
+ "content": "<|start_of_role|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "100265": {
78
+ "content": "<|end_of_role|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "100266": {
86
+ "content": "<|unused_1|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "100267": {
94
+ "content": "<|start_of_plugin|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "100268": {
102
+ "content": "<|end_of_plugin|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "100269": {
110
+ "content": "<|unk|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "100270": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "100271": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "100272": {
134
+ "content": "<tool_response>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "100273": {
142
+ "content": "</tool_response>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "100274": {
150
+ "content": "<think>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "100275": {
158
+ "content": "</think>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "100276": {
166
+ "content": "<think_on>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "100277": {
174
+ "content": "<think_off>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "100278": {
182
+ "content": "<schema>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "100279": {
190
+ "content": "</schema>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "100280": {
198
+ "content": "<tools>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "100281": {
206
+ "content": "</tools>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "100282": {
214
+ "content": "<documents>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "100283": {
222
+ "content": "</documents>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "100284": {
230
+ "content": "<|unused_15|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "100285": {
238
+ "content": "<|unused_16|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "100286": {
246
+ "content": "<|unused_17|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "100287": {
254
+ "content": "<|unused_18|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "100288": {
262
+ "content": "<|unused_19|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "100289": {
270
+ "content": "<|unused_20|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "100290": {
278
+ "content": "<|unused_21|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "100291": {
286
+ "content": "<|unused_22|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "100292": {
294
+ "content": "<|unused_23|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "100293": {
302
+ "content": "<|unused_24|>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "100294": {
310
+ "content": "<|unused_25|>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "100295": {
318
+ "content": "<|unused_26|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "100296": {
326
+ "content": "<|unused_27|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "100297": {
334
+ "content": "<|unused_28|>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "100298": {
342
+ "content": "<|unused_29|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "100299": {
350
+ "content": "<|unused_30|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "100300": {
358
+ "content": "<|unused_31|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "100301": {
366
+ "content": "<|unused_32|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "100302": {
374
+ "content": "<|unused_33|>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "100303": {
382
+ "content": "<|unused_34|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "100304": {
390
+ "content": "<|unused_35|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "100305": {
398
+ "content": "<|unused_36|>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "100306": {
406
+ "content": "<|unused_37|>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "100307": {
414
+ "content": "<|unused_38|>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "100308": {
422
+ "content": "<|unused_39|>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "100309": {
430
+ "content": "<|unused_40|>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "100310": {
438
+ "content": "<|unused_41|>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "100311": {
446
+ "content": "<|unused_42|>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "100312": {
454
+ "content": "<|unused_43|>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "100313": {
462
+ "content": "<|unused_44|>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "100314": {
470
+ "content": "<|unused_45|>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "100315": {
478
+ "content": "<|unused_46|>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "100316": {
486
+ "content": "<|unused_47|>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "100317": {
494
+ "content": "<|unused_48|>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "100318": {
502
+ "content": "<|unused_49|>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "100319": {
510
+ "content": "<|unused_50|>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "100320": {
518
+ "content": "<|unused_51|>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "100321": {
526
+ "content": "<|unused_52|>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "100322": {
534
+ "content": "<|unused_53|>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "100323": {
542
+ "content": "<|unused_54|>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "100324": {
550
+ "content": "<|unused_55|>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "100325": {
558
+ "content": "<|unused_56|>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "100326": {
566
+ "content": "<|unused_57|>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "100327": {
574
+ "content": "<|unused_58|>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "100328": {
582
+ "content": "<|unused_59|>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "100329": {
590
+ "content": "<|unused_60|>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "100330": {
598
+ "content": "<|unused_61|>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "100331": {
606
+ "content": "<|unused_62|>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "100332": {
614
+ "content": "<|unused_63|>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "100333": {
622
+ "content": "<|unused_64|>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "100334": {
630
+ "content": "<|unused_65|>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "100335": {
638
+ "content": "<|unused_66|>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "100336": {
646
+ "content": "<|unused_67|>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "100337": {
654
+ "content": "<|unused_68|>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "100338": {
662
+ "content": "<|unused_69|>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "100339": {
670
+ "content": "<|unused_70|>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "100340": {
678
+ "content": "<|unused_71|>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "100341": {
686
+ "content": "<|unused_72|>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "100342": {
694
+ "content": "<|unused_73|>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "100343": {
702
+ "content": "<|unused_74|>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "100344": {
710
+ "content": "<|unused_75|>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "100345": {
718
+ "content": "<|unused_76|>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "100346": {
726
+ "content": "<|unused_77|>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "100347": {
734
+ "content": "<|unused_78|>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "100348": {
742
+ "content": "<|unused_79|>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "100349": {
750
+ "content": "<|unused_80|>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "100350": {
758
+ "content": "<|unused_81|>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "100351": {
766
+ "content": "<|unused_82|>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "100352": {
774
+ "content": "<image>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ }
781
+ },
782
+ "additional_special_tokens": [
783
+ "<image>"
784
+ ],
785
+ "bos_token": "<|end_of_text|>",
786
+ "clean_up_tokenization_spaces": false,
787
+ "eos_token": "<|end_of_text|>",
788
+ "errors": "replace",
789
+ "extra_special_tokens": {},
790
+ "model_max_length": 1000000000000000019884624838656,
791
+ "pad_token": "<|pad|>",
792
+ "padding_side": "left",
793
+ "processor_class": "Granite4VisionProcessor",
794
+ "tokenizer_class": "GPT2Tokenizer",
795
+ "unk_token": "<|unk|>"
796
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff