utdawn committed on
Commit
276abc9
·
verified ·
1 Parent(s): a8af0fb

Update configuration_llada2_moe.py

Browse files
Files changed (1) hide show
  1. configuration_llada2_moe.py +5 -2
configuration_llada2_moe.py CHANGED
@@ -16,7 +16,7 @@ class LLaDA2MoeConfig(PretrainedConfig):
16
  num_key_value_heads=0,
17
  hidden_act="silu",
18
  use_qkv_bias=False, # llada2 only
19
- use_qk_norm=False,
20
  use_bias=True, # llada2 only
21
  rms_norm_eps=1e-05,
22
  norm_head=False, # llada2 only
@@ -54,6 +54,7 @@ class LLaDA2MoeConfig(PretrainedConfig):
54
  self.num_key_value_heads = num_key_value_heads
55
  self.hidden_act = hidden_act
56
  self.use_qkv_bias = use_qkv_bias
 
57
  self.use_bias = use_bias
58
  self.norm_head = norm_head
59
  self.rms_norm_eps = rms_norm_eps
@@ -82,4 +83,6 @@ class LLaDA2MoeConfig(PretrainedConfig):
82
  self.routed_scaling_factor = routed_scaling_factor
83
  self.partial_rotary_factor = partial_rotary_factor
84
 
85
- super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)
 
 
 
16
  num_key_value_heads=0,
17
  hidden_act="silu",
18
  use_qkv_bias=False, # llada2 only
19
+ use_qk_norm=True,
20
  use_bias=True, # llada2 only
21
  rms_norm_eps=1e-05,
22
  norm_head=False, # llada2 only
 
54
  self.num_key_value_heads = num_key_value_heads
55
  self.hidden_act = hidden_act
56
  self.use_qkv_bias = use_qkv_bias
57
+ self.use_qk_norm = use_qk_norm
58
  self.use_bias = use_bias
59
  self.norm_head = norm_head
60
  self.rms_norm_eps = rms_norm_eps
 
83
  self.routed_scaling_factor = routed_scaling_factor
84
  self.partial_rotary_factor = partial_rotary_factor
85
 
86
+ super().__init__(
87
+ pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
88
+ )