training: fix dtype mismatch in MoE output when training (#6)

- convert MoE results to the input dtype (827ce49e9f70f875ec446521564bdb5acd03f534)

Co-authored-by: Chen <Jack477@users.noreply.huggingface.co>

Files changed (1) hide show

modeling_deepseek.py CHANGED Viewed

@@ -577,6 +577,7 @@ class DeepseekV2MoE(nn.Module):
             for i, expert in enumerate(self.experts):
                 y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
             y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
             y = y.view(*orig_shape)
             y = AddAuxiliaryLoss.apply(y, aux_loss)
         else:

             for i, expert in enumerate(self.experts):
                 y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
             y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+            y = y.type(hidden_states.dtype)
             y = y.view(*orig_shape)
             y = AddAuxiliaryLoss.apply(y, aux_loss)
         else: