Liliang Ren committed
Commit · 057c6c3 · Parent(s): 5aeaea6

revert to V0 as the latest vLLM has removed V0 support and V1 integration is still in progress

Files changed:
- README.md (+2 -2)
- config.json (+0 -6)
- configuration_phi4flash.py (+5 -11)
- modeling_phi4flash.py (+9 -9)
README.md
CHANGED

@@ -25,7 +25,7 @@ The model belongs to the Phi-4 model family and supports 64K token context length
 π [Training Codebase](https://github.com/microsoft/ArchScale) <br>
 π©βπ³ [Phi Cookbook](https://github.com/microsoft/PhiCookBook) <br>
 π‘ [Phi Portal](https://azure.microsoft.com/en-us/products/phi) <br>
-π
+π vLLM Inference: V0: [PR](https://github.com/vllm-project/vllm/pull/20702) | [Branch](https://github.com/congcongchen123/vllm/tree/congcongchen/phi4-mini-shadow) V1: [PR](https://github.com/vllm-project/vllm/pull/23996) <br>
 π₯οΈ Try It [Azure](https://ai.azure.com/explore/models/Phi-4-mini-flash-reasoning/version/1/registry/azureml-phi-prod) [Nvidia NIM](https://build.nvidia.com/microsoft/phi-4-mini-flash-reasoning)<br>

@@ -236,4 +236,4 @@ Benchmark datasets
 We evaluate the model with three of the most popular math benchmarks on which the strongest reasoning models compete. Specifically:
 + Math-500: This benchmark consists of 500 challenging math problems designed to test the model's ability to perform complex mathematical reasoning and problem-solving.
 + AIME 2024/AIME 2025: The American Invitational Mathematics Examination (AIME) is a highly regarded math competition that features a series of difficult problems aimed at assessing advanced mathematical skills and logical reasoning. We evaluate the models on problems from both the 2024 and 2025 examinations.
-+ GPQA Diamond: The Graduate-Level Google-Proof Q&A (GPQA) Diamond benchmark focuses on evaluating the model's ability to understand and solve a wide range of mathematical questions, including both straightforward calculations and more intricate problem-solving tasks.
++ GPQA Diamond: The Graduate-Level Google-Proof Q&A (GPQA) Diamond benchmark focuses on evaluating the model's ability to understand and solve a wide range of mathematical questions, including both straightforward calculations and more intricate problem-solving tasks.
config.json
CHANGED

@@ -26,12 +26,6 @@
   "num_key_value_heads": 20,
   "resid_pdrop": 0.0,
   "sliding_window": 512,
-  "layer_types": [
-    "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention",
-    "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention",
-    "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
-    "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention"
-  ],
   "torch_dtype": "bfloat16",
   "tie_word_embeddings": true,
   "transformers_version": "4.46.1",
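For readers verifying the revert locally, a minimal sanity-check sketch is shown below. It assumes the reverted config.json sits in the current directory; the keys and values are the ones visible in the hunk above.

```python
import json

# Minimal sanity check of the reverted config.json (assumes the file is in
# the current directory). The "layer_types" list is gone; the scalar
# "sliding_window" shown in the hunk above is all that remains.
with open("config.json") as f:
    cfg = json.load(f)

assert "layer_types" not in cfg
assert cfg["sliding_window"] == 512
print("config.json matches the reverted layout")
```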
configuration_phi4flash.py
CHANGED

@@ -112,7 +112,6 @@ class Phi4FlashConfig(PretrainedConfig):
        bos_token_id=1,
        eos_token_id=2,
        sliding_window=2047,
-        layer_types=None,
        mb_per_layer= 2,
        mamba_d_state=16,
        mamba_d_conv=4,

@@ -142,16 +141,11 @@
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.mb_per_layer = mb_per_layer
-        self.sliding_window =
-
+        self.sliding_window = [
+            sliding_window if layer_idx < num_hidden_layers // 2 and layer_idx % 2 == 1 else None
+            for layer_idx in range(num_hidden_layers)
+        ]

-        if self.layer_types is None:
-            is_sliding = lambda i: i < num_hidden_layers // 2 and i % 2 == 1
-            self.layer_types = [
-                "sliding_attention" if is_sliding(layer_idx) else "full_attention"
-                for layer_idx in range(num_hidden_layers)
-            ]
-
        self.mamba_d_state = mamba_d_state
        self.mamba_d_conv = mamba_d_conv
        self.mamba_expand = mamba_expand

@@ -176,4 +170,4 @@ class Phi4FlashConfig(PretrainedConfig):
            else:
                layer_block_type = "mamba"
            layer_block_types.append(layer_block_type)
-        return layer_block_types
+        return layer_block_types
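To make the effect of the reverted constructor concrete, here is a minimal sketch (not part of the repo) of the per-layer `sliding_window` list it builds, assuming 32 hidden layers (matching the 32 entries in the removed `layer_types` in config.json) and the `sliding_window: 512` value from the same file:

```python
# Minimal sketch of the per-layer sliding_window list built by the reverted
# Phi4FlashConfig constructor, using sliding_window=512 from config.json and
# 32 hidden layers (matching the 32 entries in the removed "layer_types").
num_hidden_layers = 32
sliding_window = 512

per_layer_window = [
    sliding_window if layer_idx < num_hidden_layers // 2 and layer_idx % 2 == 1 else None
    for layer_idx in range(num_hidden_layers)
]

# Odd-indexed layers in the first half (1, 3, ..., 15) keep the 512-token
# window; every other layer, including the entire second half, gets None,
# i.e. full attention.
assert per_layer_window[:4] == [None, 512, None, 512]
assert all(w is None for w in per_layer_window[num_hidden_layers // 2:])
```

The resulting pattern (a 512-token window on odd-indexed layers in the first half, full attention everywhere else) is exactly what the removed `layer_types` field spelled out explicitly.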
modeling_phi4flash.py
CHANGED

@@ -129,7 +129,7 @@ def _get_cache(
        cache_to_check = self._cache.self_attention_cache if requires_cross_attention_cache else self._cache

        if cache_implementation == "sliding_window":
-            max_cache_len = min(self.config.sliding_window, max_cache_len)
+            max_cache_len = min(self.config.sliding_window[1], max_cache_len)

        need_new_cache = (
            not hasattr(self, "_cache")

@@ -243,7 +243,7 @@ class SambaYCache(Cache):
        sliding_cache_shape = (
            self.max_batch_size,
            self.num_key_value_heads,
-            min(config.sliding_window, max_cache_len),
+            min(config.sliding_window[1], max_cache_len),
            self.head_dim,
        )
        conv_cache_shape = (self.max_batch_size, intermediate_size, conv_kernel_size)

@@ -573,7 +573,7 @@ class SambaYFlashAttention2(SambaYAttention):
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

-        use_sliding_windows = self.config.sliding_window is not None and self.config.
+        use_sliding_windows = self.config.sliding_window is not None and self.config.sliding_window[self.layer_idx] is not None

        if past_key_value is not None:

@@ -710,8 +710,8 @@ class SambaYFlashAttention2(SambaYAttention):
                softmax_scale=softmax_scale,
                causal=causal,
                window_size=(
-                    self.config.sliding_window -1,
-                    self.config.sliding_window -1,
+                    self.config.sliding_window[self.layer_idx] -1,
+                    self.config.sliding_window[self.layer_idx] -1,
                ),
            )

@@ -735,8 +735,8 @@ class SambaYFlashAttention2(SambaYAttention):
                softmax_scale=softmax_scale,
                causal=causal,
                window_size=(
-                    self.config.sliding_window -1,
-                    self.config.sliding_window -1,
+                    self.config.sliding_window[self.layer_idx] -1,
+                    self.config.sliding_window[self.layer_idx] -1,
                ),
            )

@@ -1085,9 +1085,9 @@ class SambaYDecoderLayer(nn.Module):
            residual = residual.to(torch.float32)
            self_attn_weights = None
        else:
-            if self.config.sliding_window is not None and self.config.
+            if self.config.sliding_window is not None and self.config.sliding_window[self.layer_idx] is not None and attention_mask is not None: # efficient SDPA and no padding
                if past_key_value is not None and cache_position[0] > 0: # when decoding
-                    attention_mask = attention_mask[:, -self.config.sliding_window:]
+                    attention_mask = attention_mask[:, -self.config.sliding_window[self.layer_idx]:]
            #hidden_states = self.input_layernorm2(hidden_states.to(dtype=self.input_layernorm2.weight.dtype))
            # Self Attention
            attn_outputs, self_attn_weights, yoco_key_values = self.attn(
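Taken together, the modeling changes replace the scalar `self.config.sliding_window` with per-layer lookups into that list. Below is a minimal sketch of the two access patterns; the helper names are hypothetical, and the real logic lives inside `SambaYCache`, `SambaYFlashAttention2`, and `SambaYDecoderLayer` as shown in the hunks above.

```python
from typing import List, Optional, Tuple

def sliding_cache_len(sliding_window: List[Optional[int]], max_cache_len: int) -> int:
    # The cache code indexes position 1 because, under the pattern above,
    # layer 1 is the first sliding-attention layer, so sliding_window[1]
    # holds the shared window size (512 here).
    return min(sliding_window[1], max_cache_len)

def flash_attn_window(sliding_window: List[Optional[int]], layer_idx: int) -> Optional[Tuple[int, int]]:
    # Per-layer check mirroring use_sliding_windows: full-attention layers
    # carry None and skip the windowed path; sliding layers pass
    # (w - 1, w - 1) as the flash-attn window_size argument.
    w = sliding_window[layer_idx]
    return None if w is None else (w - 1, w - 1)

# With the 32-layer per_layer_window list from the previous sketch:
#   sliding_cache_len(per_layer_window, 65536)  -> 512
#   flash_attn_window(per_layer_window, 1)      -> (511, 511)
#   flash_attn_window(per_layer_window, 0)      -> None
```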