Update README
- README.md +2 -0
- inference/model.py +0 -2
README.md
CHANGED

```diff
@@ -79,7 +79,9 @@ This experimental release represents our ongoing research into more efficient tr
 | SWE-bench Multilingual | 57.8 | 57.9 |
 | Terminal-bench | 36.7 | 37.7 |
 
+## Update
 
+- 2025.11.17: **We have identified that previous versions of the inference demo code contained an implementation discrepancy in Rotary Position Embedding (RoPE) within the indexer module, potentially leading to degraded model performance.** Specifically, the input tensor to RoPE in the indexer module requires a non-interleaved layout, whereas RoPE in the MLA module expects an interleaved layout. This issue has now been resolved. Please refer to the updated version of the inference demo code and take note of this implementation detail.
 
 ## How to Run Locally
 
```
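For context on the note above: "interleaved" RoPE rotates adjacent channel pairs (x0, x1), (x2, x3), ..., while "non-interleaved" RoPE pairs each channel i in the first half with channel i + d/2. The sketch below is illustrative, not the repository's implementation (function names and shapes are assumptions); it shows why mixing up the two layouts degrades quality silently: the wrong variant still produces the right shapes and raises no error, it just rotates the wrong channel pairs.

```python
# Minimal sketch of the two RoPE pairing conventions (illustrative only).
# `cos` and `sin` have shape (..., d/2) and broadcast over x's leading dims.
import torch

def rope_interleaved(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """Rotate adjacent channel pairs: (x0, x1), (x2, x3), ..."""
    x_even, x_odd = x[..., 0::2], x[..., 1::2]
    out = torch.empty_like(x)
    out[..., 0::2] = x_even * cos - x_odd * sin
    out[..., 1::2] = x_even * sin + x_odd * cos
    return out

def rope_non_interleaved(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """Rotate half-split channel pairs: (x_i, x_{i + d/2}) for i < d/2."""
    half = x.shape[-1] // 2
    x_lo, x_hi = x[..., :half], x[..., half:]
    return torch.cat((x_lo * cos - x_hi * sin,
                      x_lo * sin + x_hi * cos), dim=-1)

# The two layouts differ only by a fixed channel permutation, so a tensor
# prepared for one convention can be re-laid-out for the other:
d, half = 8, 4
x = torch.randn(1, d)
angles = torch.randn(half)
cos, sin = angles.cos(), angles.sin()
x_as_interleaved = torch.stack((x[..., :half], x[..., half:]), dim=-1).flatten(-2)
assert torch.allclose(rope_non_interleaved(x, cos, sin)[..., :half],
                      rope_interleaved(x_as_interleaved, cos, sin)[..., 0::2])
```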
inference/model.py
CHANGED

```diff
@@ -281,7 +281,6 @@ class RMSNorm(nn.Module):
         super().__init__()
         self.dim = dim
         self.eps = eps
-        # rmsnorm in the checkpoint is stored in bf16, while the parameter here is stored in fp32 for convenient.
         self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))
 
     def forward(self, x: torch.Tensor, residual: Optional[torch.Tensor] = None):
@@ -315,7 +314,6 @@ class LayerNorm(nn.Module):
         super().__init__()
         self.dim = dim
         self.eps = eps
-        # layernorm in the checkpoint is stored in bf16, while the parameters here are stored in fp32 for convenient.
         self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))
         self.bias = nn.Parameter(torch.zeros(dim, dtype=torch.float32))
 
```
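The deleted comments in both hunks documented a detail that still holds in the code: the checkpoint stores the norm weights in bf16, while the module constructs them in fp32. As a hedged illustration (the fields below mirror the RMSNorm hunk, but the `eps` default, the forward pass, and the loading snippet are my own sketch, not the repository's code), PyTorch's `load_state_dict` copies checkpoint values into the existing parameter tensor and casts to its dtype, so the parameter remains fp32 after loading bf16 weights:

```python
# Sketch (assumed details): bf16 checkpoint values land in an fp32 parameter,
# because load_state_dict copies into the existing tensor, casting on copy.
import torch
from torch import nn

class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Accumulate the mean of squares in fp32, then cast back to x's dtype.
        h = x.float()
        h = h * torch.rsqrt(h.pow(2).mean(-1, keepdim=True) + self.eps)
        return (h * self.weight).type_as(x)

norm = RMSNorm(4)
state = {"weight": torch.full((4,), 0.5, dtype=torch.bfloat16)}  # as stored on disk
norm.load_state_dict(state)
print(norm.weight.dtype)  # torch.float32 -- the bf16 values were upcast on copy
```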