RxT-Beta-Decoder-SMAT / config.json
AdamF92's picture
Epoch 0 - Val loss 2.7051
b57921b verified
{
"att_groups": 4,
"att_heads": 16,
"att_query_groups": 8,
"cross_att_type": "sqa",
"dense_layer_dim": 1536,
"embed_dim": 512,
"ff_activation": "silu",
"ff_dim": 192,
"ff_dropout": 0.0,
"final_stateless_layers_config": [
"moe",
"moe"
],
"head_norm_type": "rms_norm",
"moe_bias_mode": "global",
"moe_grouped_gemm": true,
"moe_shared_experts_bias_mode": "global",
"moe_top_k": 10,
"moe_use_cutlass_grouped_gemm": true,
"moe_use_weighted_shared_experts": false,
"num_experts": 384,
"num_layers": 21,
"num_shared_experts": 2,
"rope_base": 100000,
"router_amp": true,
"router_dtype": "bfloat16",
"self_att_type": "sqa",
"seq_len": 8192,
"shared_expert_dim": 384,
"stateless_layers_config": [
"dense",
"moe"
],
"stm_size": 4096,
"use_attention_output_bias": false,
"use_flash_attention": true,
"use_gated": true,
"use_gated_attention": true,
"use_gated_cross_attention": false,
"use_head_norm": true,
"use_moe": true,
"use_vectorized_moe": true,
"vocab_size": 65536
}