Fix HF inference to match native OLMo outputs exactly
Browse files

Five fixes for bitwise parity: chat template format, GPU pixel normalization, RoPE on-device recomputation, SDPA causal mask, and GQA repeat_interleave.
- chat_template.jinja +1 -1
- image_processing_molmo2.py +17 -2
- modeling_molmo2.py +34 -3
chat_template.jinja
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{% set DEMO_STYLES = ['point_count','pointing','cosyn_point','user_qa','long_caption','short_caption','video_long_caption','video_short_caption','video_point_track_per_frame','video_point_track_start_end','video_point_track_all_frames','video_single_point_track_start_end','video_transcript','video_clip_caption_start_end','video_clip_caption_start_end_in_seconds','video_clip_transcript_start_end','video_clip_transcript_start_end_in_seconds','video_frame_caption_timestamp','video_frame_caption_timestamp_in_seconds','correction_qa','text_sft','video_point','video_point_count','video_count','video_count_point','multi_image_pointing','multi_image_counting','multi_image_point_then_count','multi_image_count_then_point','demo','a_okvqa_mc','ai2_diagram_no_letter','ai2_diagram','science_qa','multi_image_mc','multi_image_mc_exp','mantis_instruct_mc','video_multiple_choice','video_multiple_choice_count_without_pointing','video_multiple_choice_multiple_correct','video_multiple_choice_w_subtitle'] %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set has_subtitle = messages and messages[0]['role'].lower() == 'subtitle' %}{% for message in messages %}{% if message['content'] is not string %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% elif content['type'] == 'video' or 'video' in content or 'video_url' in content %}{% set video_count.value = video_count.value + 1 %}{% endif %}{% endfor %}{% endif %}{% endfor %}{% if image_count.value == 1 %}{{ '<|image|>' }}{% elif image_count.value > 1 %}{% for i in range(image_count.value) %}{{ 'Image ' ~ (i + 1) ~ '<|image|>' }}{% endfor %}{% endif %}{% for _ in range(video_count.value) %}{{ '<|video|>' }}{% endfor %}{% if has_subtitle %}{{ messages[0]['content'] }}{% endif %}{% for message in messages %}{% set role = message['role'].lower() %}{% if role == 'subtitle' %}{% 
continue %}{% endif %}{% set conv_index = loop.index - (1 if has_subtitle else 0) %}{%- if (conv_index % 2 == 1 and role != 'user') or (conv_index % 2 == 0 and role != 'assistant') -%}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{%- endif -%}{% if message['content'] is string %}{% set text_content = message['content'] %}{% else %}{% set m = namespace(text='') %}{% for content in message['content'] %}{% if content['type'] == 'text' %}{% if content['style'] is defined and content['style'] not in DEMO_STYLES %}{% set seg = content['style'] ~ ': ' ~ content['text'] %}{% else %}{% set seg = content['text'] %}{% endif %}{% set m.text = m.text ~ ('' if not m.text else ' ') ~ seg %}{% endif %}{% endfor %}{% set text_content = m.text %}{% endif %}{% if role == 'user' %}
|
|
|
|
| 1 |
+
{% set DEMO_STYLES = ['point_count','pointing','cosyn_point','user_qa','long_caption','short_caption','video_long_caption','video_short_caption','video_point_track_per_frame','video_point_track_start_end','video_point_track_all_frames','video_single_point_track_start_end','video_transcript','video_clip_caption_start_end','video_clip_caption_start_end_in_seconds','video_clip_transcript_start_end','video_clip_transcript_start_end_in_seconds','video_frame_caption_timestamp','video_frame_caption_timestamp_in_seconds','correction_qa','text_sft','video_point','video_point_count','video_count','video_count_point','multi_image_pointing','multi_image_counting','multi_image_point_then_count','multi_image_count_then_point','demo','a_okvqa_mc','ai2_diagram_no_letter','ai2_diagram','science_qa','multi_image_mc','multi_image_mc_exp','mantis_instruct_mc','video_multiple_choice','video_multiple_choice_count_without_pointing','video_multiple_choice_multiple_correct','video_multiple_choice_w_subtitle'] %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set has_subtitle = messages and messages[0]['role'].lower() == 'subtitle' %}{% for message in messages %}{% if message['content'] is not string %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% elif content['type'] == 'video' or 'video' in content or 'video_url' in content %}{% set video_count.value = video_count.value + 1 %}{% endif %}{% endfor %}{% endif %}{% endfor %}{% if image_count.value == 1 %}{{ '<|image|>' }}{% elif image_count.value > 1 %}{% for i in range(image_count.value) %}{{ 'Image ' ~ (i + 1) ~ '<|image|>' }}{% endfor %}{% endif %}{% for _ in range(video_count.value) %}{{ '<|video|>' }}{% endfor %}{% if has_subtitle %}{{ messages[0]['content'] }}{% endif %}{% for message in messages %}{% set role = message['role'].lower() %}{% if role == 'subtitle' %}{% 
continue %}{% endif %}{% set conv_index = loop.index - (1 if has_subtitle else 0) %}{%- if (conv_index % 2 == 1 and role != 'user') or (conv_index % 2 == 0 and role != 'assistant') -%}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{%- endif -%}{% if message['content'] is string %}{% set text_content = message['content'] %}{% else %}{% set m = namespace(text='') %}{% for content in message['content'] %}{% if content['type'] == 'text' %}{% if content['style'] is defined and content['style'] not in DEMO_STYLES %}{% set seg = content['style'] ~ ': ' ~ content['text'] %}{% else %}{% set seg = content['text'] %}{% endif %}{% set m.text = m.text ~ ('' if not m.text else ' ') ~ seg %}{% endif %}{% endfor %}{% set text_content = m.text %}{% endif %}{% if role == 'user' %}User: {{ text_content }}{% else %} {{ text_content }}{% endif %}{% endfor %}{% if add_generation_prompt %} Assistant:{% endif %}
|
image_processing_molmo2.py
CHANGED
|
@@ -29,7 +29,10 @@ def normalize_image(
|
|
| 29 |
image: np.ndarray,
|
| 30 |
image_mean: list[float],
|
| 31 |
image_std: list[float],
|
|
|
|
| 32 |
) -> np.ndarray:
|
|
|
|
|
|
|
| 33 |
image -= np.array(image_mean, dtype=np.float32)[None, None, :]
|
| 34 |
image /= np.array(image_std, dtype=np.float32)[None, None, :]
|
| 35 |
return image
|
|
@@ -110,11 +113,12 @@ def build_resized_image(
|
|
| 110 |
image_mean: list[float],
|
| 111 |
image_std: list[float],
|
| 112 |
image_patch_size: int,
|
|
|
|
| 113 |
) -> tuple[np.ndarray, np.ndarray]:
|
| 114 |
resized = resize_image(
|
| 115 |
image, base_image_input_size, resample,
|
| 116 |
)
|
| 117 |
-
resized = normalize_image(resized, image_mean, image_std)
|
| 118 |
if len(resized.shape) == 3:
|
| 119 |
resized = np.expand_dims(resized, 0)
|
| 120 |
crop_patch_w = base_image_input_size[1] // image_patch_size
|
|
@@ -132,6 +136,7 @@ def build_overlapping_crops(
|
|
| 132 |
image_mean: list[float],
|
| 133 |
image_std: list[float],
|
| 134 |
image_patch_size: int,
|
|
|
|
| 135 |
) -> tuple[np.ndarray, np.ndarray]:
|
| 136 |
"""Decompose an image into a set of overlapping crops
|
| 137 |
|
|
@@ -167,7 +172,7 @@ def build_overlapping_crops(
|
|
| 167 |
[tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels],
|
| 168 |
resample,
|
| 169 |
)
|
| 170 |
-
src = normalize_image(src, image_mean, image_std)
|
| 171 |
|
| 172 |
# Now we have to split the image into crops, and track what patches came from
|
| 173 |
# where in `patch_idx_arr`
|
|
@@ -259,6 +264,7 @@ def image_to_patches_and_grids(
|
|
| 259 |
image_patch_size: int,
|
| 260 |
image_pooling_w: int,
|
| 261 |
image_pooling_h: int,
|
|
|
|
| 262 |
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
|
| 263 |
"""
|
| 264 |
:return image_grids, the shape of each (low-res, high-res) image after pooling
|
|
@@ -284,6 +290,7 @@ def image_to_patches_and_grids(
|
|
| 284 |
image_mean,
|
| 285 |
image_std,
|
| 286 |
image_patch_size,
|
|
|
|
| 287 |
)
|
| 288 |
pooling_idx = arange_for_pooling(patch_idx_arr, pooling_h, pooling_w)
|
| 289 |
h, w = pooling_idx.shape[:2]
|
|
@@ -297,6 +304,7 @@ def image_to_patches_and_grids(
|
|
| 297 |
image_mean,
|
| 298 |
image_std,
|
| 299 |
image_patch_size,
|
|
|
|
| 300 |
)
|
| 301 |
crop_arr = np.concatenate([resized, crop_arr], 0)
|
| 302 |
|
|
@@ -390,6 +398,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
|
|
| 390 |
image_mean: Optional[Union[float, list[float]]] = None,
|
| 391 |
image_std: Optional[Union[float, list[float]]] = None,
|
| 392 |
do_convert_rgb: Optional[bool] = None,
|
|
|
|
| 393 |
max_crops: Optional[int] = None,
|
| 394 |
overlap_margins: Optional[list[int]] = None,
|
| 395 |
patch_size: Optional[int] = None,
|
|
@@ -448,6 +457,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
|
|
| 448 |
image_mean = image_mean or self.image_mean
|
| 449 |
image_std = image_std or self.image_std
|
| 450 |
do_convert_rgb = do_convert_rgb or self.do_convert_rgb
|
|
|
|
| 451 |
|
| 452 |
max_crops = max_crops or self.max_crops
|
| 453 |
overlap_margins = overlap_margins or self.overlap_margins
|
|
@@ -491,6 +501,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
|
|
| 491 |
patch_size,
|
| 492 |
image_pooling_w,
|
| 493 |
image_pooling_h,
|
|
|
|
| 494 |
)
|
| 495 |
batch_grids.append(image_grid)
|
| 496 |
batch_crops.append(crops)
|
|
@@ -498,6 +509,10 @@ class Molmo2ImageProcessor(BaseImageProcessor):
|
|
| 498 |
batch_num_crops.append(crops.shape[0])
|
| 499 |
|
| 500 |
pixel_values = np.concatenate(batch_crops, 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
|
| 502 |
image_grids = np.concatenate(batch_grids, 0)
|
| 503 |
image_num_crops = np.array(batch_num_crops)
|
|
|
|
| 29 |
image: np.ndarray,
|
| 30 |
image_mean: list[float],
|
| 31 |
image_std: list[float],
|
| 32 |
+
do_normalize: bool = True,
|
| 33 |
) -> np.ndarray:
|
| 34 |
+
if not do_normalize:
|
| 35 |
+
return image
|
| 36 |
image -= np.array(image_mean, dtype=np.float32)[None, None, :]
|
| 37 |
image /= np.array(image_std, dtype=np.float32)[None, None, :]
|
| 38 |
return image
|
|
|
|
| 113 |
image_mean: list[float],
|
| 114 |
image_std: list[float],
|
| 115 |
image_patch_size: int,
|
| 116 |
+
do_normalize: bool = True,
|
| 117 |
) -> tuple[np.ndarray, np.ndarray]:
|
| 118 |
resized = resize_image(
|
| 119 |
image, base_image_input_size, resample,
|
| 120 |
)
|
| 121 |
+
resized = normalize_image(resized, image_mean, image_std, do_normalize=do_normalize)
|
| 122 |
if len(resized.shape) == 3:
|
| 123 |
resized = np.expand_dims(resized, 0)
|
| 124 |
crop_patch_w = base_image_input_size[1] // image_patch_size
|
|
|
|
| 136 |
image_mean: list[float],
|
| 137 |
image_std: list[float],
|
| 138 |
image_patch_size: int,
|
| 139 |
+
do_normalize: bool = True,
|
| 140 |
) -> tuple[np.ndarray, np.ndarray]:
|
| 141 |
"""Decompose an image into a set of overlapping crops
|
| 142 |
|
|
|
|
| 172 |
[tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels],
|
| 173 |
resample,
|
| 174 |
)
|
| 175 |
+
src = normalize_image(src, image_mean, image_std, do_normalize=do_normalize)
|
| 176 |
|
| 177 |
# Now we have to split the image into crops, and track what patches came from
|
| 178 |
# where in `patch_idx_arr`
|
|
|
|
| 264 |
image_patch_size: int,
|
| 265 |
image_pooling_w: int,
|
| 266 |
image_pooling_h: int,
|
| 267 |
+
do_normalize: bool = True,
|
| 268 |
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
|
| 269 |
"""
|
| 270 |
:return image_grids, the shape of each (low-res, high-res) image after pooling
|
|
|
|
| 290 |
image_mean,
|
| 291 |
image_std,
|
| 292 |
image_patch_size,
|
| 293 |
+
do_normalize=do_normalize,
|
| 294 |
)
|
| 295 |
pooling_idx = arange_for_pooling(patch_idx_arr, pooling_h, pooling_w)
|
| 296 |
h, w = pooling_idx.shape[:2]
|
|
|
|
| 304 |
image_mean,
|
| 305 |
image_std,
|
| 306 |
image_patch_size,
|
| 307 |
+
do_normalize=do_normalize,
|
| 308 |
)
|
| 309 |
crop_arr = np.concatenate([resized, crop_arr], 0)
|
| 310 |
|
|
|
|
| 398 |
image_mean: Optional[Union[float, list[float]]] = None,
|
| 399 |
image_std: Optional[Union[float, list[float]]] = None,
|
| 400 |
do_convert_rgb: Optional[bool] = None,
|
| 401 |
+
do_normalize: Optional[bool] = None,
|
| 402 |
max_crops: Optional[int] = None,
|
| 403 |
overlap_margins: Optional[list[int]] = None,
|
| 404 |
patch_size: Optional[int] = None,
|
|
|
|
| 457 |
image_mean = image_mean or self.image_mean
|
| 458 |
image_std = image_std or self.image_std
|
| 459 |
do_convert_rgb = do_convert_rgb or self.do_convert_rgb
|
| 460 |
+
do_normalize = do_normalize if do_normalize is not None else False
|
| 461 |
|
| 462 |
max_crops = max_crops or self.max_crops
|
| 463 |
overlap_margins = overlap_margins or self.overlap_margins
|
|
|
|
| 501 |
patch_size,
|
| 502 |
image_pooling_w,
|
| 503 |
image_pooling_h,
|
| 504 |
+
do_normalize=do_normalize,
|
| 505 |
)
|
| 506 |
batch_grids.append(image_grid)
|
| 507 |
batch_crops.append(crops)
|
|
|
|
| 509 |
batch_num_crops.append(crops.shape[0])
|
| 510 |
|
| 511 |
pixel_values = np.concatenate(batch_crops, 0)
|
| 512 |
+
if not do_normalize:
|
| 513 |
+
# Convert to uint8 so the model can normalize on GPU with exact
|
| 514 |
+
# native precision (CPU and GPU float32 /255 differ for ~half of uint8 values).
|
| 515 |
+
pixel_values = np.clip(pixel_values * 255 + 0.5, 0, 255).astype(np.uint8)
|
| 516 |
image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
|
| 517 |
image_grids = np.concatenate(batch_grids, 0)
|
| 518 |
image_num_crops = np.array(batch_num_crops)
|
modeling_molmo2.py
CHANGED
|
@@ -440,7 +440,11 @@ class Molmo2VisionBackbone(nn.Module):
|
|
| 440 |
|
| 441 |
# image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim)
|
| 442 |
batch_size, num_image = images.shape[:2]
|
| 443 |
-
images = images.to(device=self.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
image_features = self.encode_image(images)
|
| 445 |
|
| 446 |
image_features = self.image_feature_dropout(image_features)
|
|
@@ -543,7 +547,14 @@ class Molmo2RotaryEmbedding(nn.Module):
|
|
| 543 |
|
| 544 |
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
| 545 |
with torch.autocast(device_type=device_type, enabled=False): # Force float32
|
| 546 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
emb = torch.cat((freqs, freqs), dim=-1)
|
| 548 |
cos = emb.cos() * self.attention_scaling
|
| 549 |
sin = emb.sin() * self.attention_scaling
|
|
@@ -710,16 +721,36 @@ class Molmo2Attention(nn.Module):
|
|
| 710 |
if self.config._attn_implementation != "eager":
|
| 711 |
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
| 712 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 713 |
attn_output, attn_weights = attention_interface(
|
| 714 |
self,
|
| 715 |
query_states,
|
| 716 |
key_states,
|
| 717 |
value_states,
|
| 718 |
-
|
| 719 |
dropout=0.0 if not self.training else self.attention_dropout,
|
| 720 |
scaling=self.scaling,
|
| 721 |
**kwargs,
|
| 722 |
)
|
|
|
|
|
|
|
| 723 |
|
| 724 |
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
|
| 725 |
attn_output = self.attn_out(attn_output)
|
|
|
|
| 440 |
|
| 441 |
# image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim)
|
| 442 |
batch_size, num_image = images.shape[:2]
|
| 443 |
+
images = images.to(device=self.device)
|
| 444 |
+
# Normalize pixel values on GPU: uint8 [0,255] -> float [-1,1]
|
| 445 |
+
# This matches native OLMo's normalize_on_gpu path exactly.
|
| 446 |
+
images = images.float().div_(255.0).mul_(2.0).sub_(1.0)
|
| 447 |
+
images = images.to(dtype=self.dtype)
|
| 448 |
image_features = self.encode_image(images)
|
| 449 |
|
| 450 |
image_features = self.image_feature_dropout(image_features)
|
|
|
|
| 547 |
|
| 548 |
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
| 549 |
with torch.autocast(device_type=device_type, enabled=False): # Force float32
|
| 550 |
+
# Recompute inv_freq directly on the target device to avoid CPU/GPU
|
| 551 |
+
# float32 rounding differences when inv_freq is initialized on CPU.
|
| 552 |
+
dim = self.inv_freq.shape[0] * 2
|
| 553 |
+
inv_freq = 1.0 / (self.config.rope_theta ** (
|
| 554 |
+
torch.arange(0, dim, 2, dtype=torch.float, device=x.device) / dim
|
| 555 |
+
))
|
| 556 |
+
seq = position_ids[0].float()
|
| 557 |
+
freqs = torch.einsum("i , j -> i j", seq, inv_freq).unsqueeze(0)
|
| 558 |
emb = torch.cat((freqs, freqs), dim=-1)
|
| 559 |
cos = emb.cos() * self.attention_scaling
|
| 560 |
sin = emb.sin() * self.attention_scaling
|
|
|
|
| 721 |
if self.config._attn_implementation != "eager":
|
| 722 |
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
| 723 |
|
| 724 |
+
# During prefill with SDPA, drop the explicit attention mask so SDPA uses
|
| 725 |
+
# is_causal=True internally. This matches native OLMo's behavior and avoids
|
| 726 |
+
# numerical differences from explicit-mask vs is_causal code paths.
|
| 727 |
+
sdpa_mask = attention_mask
|
| 728 |
+
if self.config._attn_implementation == "sdpa" and query_states.shape[2] > 1:
|
| 729 |
+
sdpa_mask = None
|
| 730 |
+
|
| 731 |
+
# Expand GQA key/value heads to match query heads via repeat_interleave,
|
| 732 |
+
# matching native OLMo's approach. This avoids the enable_gqa=True SDPA
|
| 733 |
+
# path which uses a different kernel and produces different float32 results.
|
| 734 |
+
# Temporarily set num_key_value_groups=1 so the HF SDPA wrapper doesn't
|
| 735 |
+
# try to handle GQA again on already-expanded tensors.
|
| 736 |
+
saved_groups = self.num_key_value_groups
|
| 737 |
+
if saved_groups > 1:
|
| 738 |
+
key_states = key_states.repeat_interleave(saved_groups, dim=1)
|
| 739 |
+
value_states = value_states.repeat_interleave(saved_groups, dim=1)
|
| 740 |
+
self.num_key_value_groups = 1
|
| 741 |
+
|
| 742 |
attn_output, attn_weights = attention_interface(
|
| 743 |
self,
|
| 744 |
query_states,
|
| 745 |
key_states,
|
| 746 |
value_states,
|
| 747 |
+
sdpa_mask,
|
| 748 |
dropout=0.0 if not self.training else self.attention_dropout,
|
| 749 |
scaling=self.scaling,
|
| 750 |
**kwargs,
|
| 751 |
)
|
| 752 |
+
|
| 753 |
+
self.num_key_value_groups = saved_groups
|
| 754 |
|
| 755 |
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
|
| 756 |
attn_output = self.attn_out(attn_output)
|