{
  "activation_type": "swiglu",
  "alibi": false,
  "alibi_bias_max": 8.0,
  "architectures": [
    "AIGCodeXMoEForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layer_norm": false,
  "attention_layer_norm_with_affine": false,
  "batch_size": 4,
  "bias_for_layer_norm": false,
  "block_group_size": 1,
  "block_type": "sequential",
  "clip_qkv": null,
  "d_model": 4096,
  "deepnorm": false,
  "embedding_dropout": 0.0,
  "embedding_size": 65280,
  "encoder_decoder": false,
  "eos_token_id": 2,
  "eval_max_sequence_length": null,
  "exp_dim_ratio": 1,
  "flash_attention": false,
  "gate_level": "token",
  "gate_sample_ratio": 1,
  "gate_softmax_temperature": 8.0,
  "gshard": false,
  "include_bias": false,
  "init_cutoff_factor": null,
  "init_device": "meta",
  "init_fn": "normal",
  "init_std": 0.01,
  "intermediate_size": 16384,
  "latent_attention": false,
  "latent_attention_dim": 512,
  "layer_norm_eps": 1e-05,
  "layer_norm_type": "default",
  "layer_norm_with_affine": false,
  "layer_share": false,
  "layer_share_mlp_version": 1,
  "layer_std_check": false,
  "max_sequence_length": 4096,
  "mlp_hidden_size": null,
  "mlp_ratio": 4,
  "mobile_llm_repeat_num": 1,
  "model_type": "hf_aigcodexmoe",
  "moe_act_ckpt_ratio": 1,
  "moe_auxiliary_loss": false,
  "moe_auxiliary_loss_weight": 0.0,
  "moe_batch_prioritized_routing": false,
  "moe_eval_capacity_token_fraction": 0.25,
  "moe_expert_count": 4,
  "moe_expert_count_mluti_level": null,
  "moe_freq": 2,
  "moe_freq_pos": 0,
  "moe_gate_input_type": "concat",
  "moe_gate_loss_combine_method": "average",
  "moe_gate_loss_weight": 0.0,
  "moe_gate_no_grad": false,
  "moe_gating_use_fp32": true,
  "moe_logging": false,
  "moe_normalize_gate_prob_before_dropping": false,
  "moe_second_expert_policy": "sampling",
  "moe_share_expert_count": 0,
  "moe_top1_expert": true,
  "moe_topn_expert": 1,
  "moe_version": 1,
  "multi_query_attention": false,
  "n_heads": 32,
  "n_kv_heads": null,
  "n_layers": 22,
  "pad_token_id": 0,
  "ple_layer_num": 0,
  "ple_layernorm": false,
  "precision": "amp_bf16",
  "residual_dropout": 0.0,
  "rope": true,
  "rope_base": 30000,
  "rope_ext_ratio": 1,
  "rope_full_precision": true,
  "scale_logits": false,
  "sft_ans_mask": false,
  "share_layer_groups": 1,
  "share_moe_groups": 1,
  "torch_dtype": "float32",
  "transformers_version": "4.40.2",
  "use_cache": true,
  "use_mobile_llm": false,
  "use_moe": false,
  "use_ple": false,
  "use_xmoe": true,
  "vocab_size": 64000,
  "weight_tying": false
}