diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..52373fe24473b1aa44333d318f578ae6bf04b49b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1ad0488f90862b77498f734270508db723f8d025 --- /dev/null +++ b/LICENSE @@ -0,0 +1,51 @@ 
+--- +title: MIT License +spdx-id: MIT +featured: true +hidden: false + +description: A short and simple permissive license with conditions only requiring preservation of copyright and license notices. Licensed works, modifications, and larger works may be distributed under different terms and without source code. + +how: Create a text file (typically named LICENSE or LICENSE.txt) in the root of your source code and copy the text of the license into the file. Replace [year] with the current year and [fullname] with the name (or names) of the copyright holders. + +using: + Babel: https://github.com/babel/babel/blob/master/LICENSE + .NET: https://github.com/dotnet/runtime/blob/main/LICENSE.TXT + Rails: https://github.com/rails/rails/blob/master/MIT-LICENSE + +permissions: + - commercial-use + - modifications + - distribution + - private-use + +conditions: + - include-copyright + +limitations: + - liability + - warranty + +--- + +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..434289f9ddebf8e24b719ac230415f5ea39dee42 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +--- +language: en +library_name: mlx +tags: +- quantized +- mlx +base_model: +- zai-org/GLM-5 +pipeline_tag: text-generation +--- +**See GLM-5 MLX in action - [demonstration video](https://youtu.be/3XCYruBYr-0)** + +#### Tested on an M3 Ultra with 512GB RAM using [Inferencer app v1.10](https://inferencer.com) +- Single inference ~16.6 tokens/s @ 1000 tokens +- Batched inference ~31.8 total tokens/s across six inferences +- Memory usage: ~417 GiB + +*q4.8bit quant typically achieves 1.281 perplexity in our coding test* +| Quantization | Perplexity | Token Accuracy | Missed Divergence | +|:------------:|:----------:|:--------------:|:-----------------:| +| **q3.5** | 168.0 | 43.45% | 72.57% | +| **q4.5** | 1.33593 | 91.65% | 27.61% | +| **q4.8** | 1.28125 | 93.75% | 21.15% | +| **q5.5** | 1.23437 | 95.05% | 17.28% | +| **q6.5** | 1.21875 | 96.95% | 12.03% | +| **q8.5** | 1.21093 | 97.55% | 10.50% | +| **q9** | 1.21093 | 97.55% | 10.50% | +| **Base** | 1.20312 | 100.0% | 0.000% | + +- Perplexity: Measures the confidence for predicting base tokens (lower is better) +- Token Accuracy: The percentage of correctly generated base tokens +- Missed Divergence: Measures severity of misses; how much the token was missed by + +##### Quantized with a modified version of [MLX](https://github.com/ml-explore/mlx) +##### For more details see [demonstration video](https://youtu.be/3XCYruBYr-0) or visit [GLM-5](https://huggingface.co/zai-org/GLM-5). + +## Disclaimer + +We are not the creator, originator, or owner of any model listed. 
Each model is created and provided by third parties. Models may not always be accurate or contextually appropriate. You are responsible for verifying the information before making important decisions. We are not liable for any damages, losses, or issues arising from its use, including data loss or inaccuracies in AI-generated content. \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..2ab98ef068d62829d17c5ade1827b9f013fa2bbf --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,86 @@ +[gMASK]<sop> +{%- if tools -%} +<|system|> +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within <tools></tools> XML tags: +<tools> +{% for tool in tools %} +{{ tool | tojson(ensure_ascii=False) }} +{% endfor %} +</tools> + +For each function call, output the function name and arguments within the following XML format: +<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%} +{#- visible_text(content): emits a string as-is; for a non-mapping iterable, concatenates the `text` of each mapping item whose type is 'text' plus any plain string items; anything else is emitted unchanged. -#}{%- macro visible_text(content) -%} + {%- if content is string -%} + {{- content }} + {%- elif content is iterable and content is not mapping -%} + {%- for item in content -%} + {%- if item is mapping and item.type == 'text' -%} + {{- item.text }} + {%- elif item is string -%} + {{- item }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{- content }} + {%- endif -%} +{%- endmacro -%} +{%- set ns = namespace(last_user_index=-1) %} +{%- for m in messages %} + {%- if m.role == 'user' %} + {% set ns.last_user_index = loop.index0 -%} + {%- endif %} +{%- endfor %} +{% for m in messages %} +{%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }} +{%- elif m.role == 'assistant' -%} +<|assistant|> +{%- set reasoning_content = '' %} +{%- set content = visible_text(m.content) %} +{%- if m.reasoning_content is string %} + {%- set reasoning_content = m.reasoning_content %} +{%- else %} + {#- No string reasoning_content on the message: recover it from an inline <think>...</think> span in the text and keep only what follows the last </think> as visible content. -#}{%- if '</think>' in content %} + {%- set reasoning_content = 
content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} +{%- endif %} +{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%} +{{ '<think>' + reasoning_content.strip() + '</think>'}} +{%- else -%} +{{ '</think>' }} +{%- endif -%} +{%- if content.strip() -%} +{{ content.strip() }} +{%- endif -%} +{% if m.tool_calls %} +{% for tc in m.tool_calls %} +{%- if tc.function %} + {%- set tc = tc.function %} +{%- endif %} +{{- '<tool_call>' + tc.name -}} +{% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %} +{% endif %} +{%- elif m.role == 'tool' -%} +{%- if m.content is string -%} +{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|observation|>' }} +{%- endif %} +{{- '<tool_response>' }} +{{- m.content }} +{{- '</tool_response>' }} +{%- else -%} +<|observation|>{% for tr in m.content %} +<tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%} +{% endif -%} +{%- elif m.role == 'system' -%} +<|system|>{{ visible_text(m.content) }} +{%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + <|assistant|>{{- '</think>' if (enable_thinking is defined and not enable_thinking) else '<think>' -}} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a7d7c39089a4a04696e5fdcedab0573c92325bd4 --- /dev/null +++ b/config.json @@ -0,0 +1,10140 @@ +{ + "architectures": [ + "GlmMoeDsaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "ep_size": 1, + "first_k_dense_replace": 3, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 6144, + "index_head_dim": 128, + "index_n_heads": 32, + "index_topk": 2048, + "indexer_rope_interleave": true, + "initializer_range": 0.02, + 
"intermediate_size": 12288, + "kv_lora_rank": 512, + "max_position_embeddings": 202752, + "mlx-sanitized": "0.30.7", + "model_type": "glm_moe_dsa", + "moe_intermediate_size": 2048, + "moe_layer_freq": 1, + "n_group": 1, + "n_routed_experts": 256, + "n_shared_experts": 1, + "norm_topk_prob": true, + "num_attention_heads": 64, + "num_experts_per_tok": 8, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "num_nextn_predict_layers": 1, + "pad_token_id": 154820, + "pretraining_tp": 1, + "q_lora_rank": 2048, + "qk_head_dim": 256, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "quantization": { + "group_size": 64, + "bits": 6, + "mode": "affine", + "model.embed_tokens": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.0.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.1.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.1.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.1.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.2.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.2.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.3.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + 
"mode": "affine" + }, + "model.layers.3.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.3.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.4.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.4.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, 
+ "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.5.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.5.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.6.self_attn.q_a_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.6.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.7.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.7.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.weights_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.8.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.8.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.down_proj": { + 
"group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.9.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wk": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.11.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.11.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.12.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.13.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.14.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.down_proj": { + "group_size": null, 
+ "bits": 6, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.15.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.16.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.17.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + 
"mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.18.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_a_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.20.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.20.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.21.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.up_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.23.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wq_b": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.24.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.24.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 
6, + "mode": "affine" + }, + "model.layers.26.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.27.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.up_proj": { + "group_size": 
null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.29.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.29.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.30.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.30.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.down_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.32.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.33.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wk": 
{ + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.34.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.35.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.36.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.36.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.down_proj": { + "group_size": null, 
+ "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.38.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.39.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.39.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + 
"mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.41.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.42.self_attn.q_a_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.43.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.44.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.up_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.45.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wq_b": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.47.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + 
"model.layers.47.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.48.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 
4, + "mode": "affine" + }, + "model.layers.49.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.50.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.up_proj": { + "group_size": 
null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.51.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.52.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.53.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.53.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.54.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.down_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.56.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wk": 
{ + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.57.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.57.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.59.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.59.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.60.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.down_proj": { + "group_size": null, 
+ "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.62.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.62.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.63.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + 
"mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_a_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.65.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.66.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.66.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.up_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.68.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.69.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.69.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wq_b": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.70.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.70.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + 
"model.layers.70.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.71.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.71.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.72.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 
6, + "mode": "affine" + }, + "model.layers.72.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.73.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.73.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.up_proj": { + "group_size": 
null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.74.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.74.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.75.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.75.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.75.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.76.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.76.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.76.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.77.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.77.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.down_proj": { + 
"group_size": null, + "bits": 6, + "mode": "affine" + }, + "lm_head": { + "group_size": null, + "bits": 6, + "mode": "affine" + } + }, + "quantization_config": { + "group_size": 64, + "bits": 6, + "mode": "affine", + "model.embed_tokens": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.0.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.1.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.1.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.weights_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.2.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.2.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.3.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.3.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.weights_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.4.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.4.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.4.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.5.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.5.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.6.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.6.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.6.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.7.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.7.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.7.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.8.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.8.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.9.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.9.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.down_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.11.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.12.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.12.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + 
"mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.14.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.15.self_attn.q_a_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.16.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.17.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.up_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.18.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wq_b": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.20.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + 
"model.layers.20.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.21.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 
4, + "mode": "affine" + }, + "model.layers.22.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.23.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.up_proj": { + "group_size": 
null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.24.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.25.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.26.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.26.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.27.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.down_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.29.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wk": 
{ + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.30.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.30.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.32.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.32.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.33.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.down_proj": { + "group_size": null, 
+ "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.35.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.35.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.36.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + 
"mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_a_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.38.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.39.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.39.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.up_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.41.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.42.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wq_b": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.43.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.44.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.45.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 
4, + "mode": "affine" + }, + "model.layers.45.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.up_proj": { + "group_size": 
null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.47.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.48.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.48.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.49.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.50.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.down_proj": { + 
"group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.51.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wk": 
{ + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.53.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.53.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.54.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.55.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.56.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.down_proj": { + "group_size": null, 
+ "bits": 6, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.57.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.58.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.59.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + 
"mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.60.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_a_proj": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.62.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.62.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.63.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.up_proj": { + "group_size": null, + 
"bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.65.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wq_b": { + 
"group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.66.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.66.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 
6, + "mode": "affine" + }, + "model.layers.68.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.69.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.69.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.up_proj": { + "group_size": 
null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.70.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.70.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.71.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.71.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.71.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.72.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.72.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": 
"affine" + }, + "model.layers.72.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.73.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.73.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.down_proj": { + 
"group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.74.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.74.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.75.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.75.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wk": 
{ + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.76.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.76.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + 
"model.layers.76.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.77.self_attn.q_a_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.q_b_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.kv_a_proj_with_mqa": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.77.self_attn.o_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wq_b": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wk": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.weights_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.gate_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.up_proj": { + "group_size": null, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.down_proj": { + "group_size": null, + "bits": 6, + "mode": "affine" + }, + "lm_head": { + "group_size": null, + "bits": 6, + "mode": "affine" + } + }, + "rms_norm_eps": 1e-05, + "rope_interleave": true, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "routed_scaling_factor": 2.5, + "scoring_func": "sigmoid", + "tie_word_embeddings": false, + "topk_group": 1, + "topk_method": "noaux_tc", + "transformers_version": "5.0.2.dev0", + 
"use_cache": true, + "v_head_dim": 256, + "vocab_size": 154880 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..640e99c64d2f17d76e2f1f13af219fb369e1004e --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "pad_token_id": 154820, + "temperature": 1.0, + "top_p": 0.95, + "transformers_version": "5.0.2.dev0" +} diff --git a/model-00001-of-00046.safetensors b/model-00001-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac5a7add6695a03c8aa33e5e93dbfb990ed44df4 --- /dev/null +++ b/model-00001-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd8781b9a77b8d4395aaf8e0e000849deb1cb4bd3f224efd5a39e32b5435c0d +size 9655246191 diff --git a/model-00002-of-00046.safetensors b/model-00002-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..431ed9a1e2dbe96c86bcdcfb80ffc4214d51729c --- /dev/null +++ b/model-00002-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e16538f1d77ceadf6f7dcc932e502a5083bcd8795f6f76b44e416390d963099 +size 10717291508 diff --git a/model-00003-of-00046.safetensors b/model-00003-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae22aacd8d4ca6819627a5165125bf0d840eed1c --- /dev/null +++ b/model-00003-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c16d8230a3b15275a90a5a1aefe0427a21c189a07d42c4811ecf15a64f43ab1d +size 10260955651 diff --git a/model-00004-of-00046.safetensors b/model-00004-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e4433a4a42cea946e8fba495e333e04cede3eb8 --- /dev/null +++ b/model-00004-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9dc36bf2ec51d79e720ec0ff299705c95231951b48616bbf9d91836357b5c96d +size 9152773973 diff --git a/model-00005-of-00046.safetensors b/model-00005-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9dfceb2a6fb9f75dee40567c3d4f55f241deb9d --- /dev/null +++ b/model-00005-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13e688b45819d088a15bfefafafe17a566c0b7b9c940d7cb7cb46a29a6668852 +size 9203311752 diff --git a/model-00006-of-00046.safetensors b/model-00006-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6bc9bc0b93145af06dbc1ce90c40433c2286a05e --- /dev/null +++ b/model-00006-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3adaa4d3ac81c0708e2efc9c8fe195bda2754a4b495d09215830d12afdbf0736 +size 10156261798 diff --git a/model-00007-of-00046.safetensors b/model-00007-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..278b0838abae49809a0bf67d9854dd46f4701ee4 --- /dev/null +++ b/model-00007-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44c5ac8bf1986b74c462744e740dd6879c68d3d2a7bbe8b2d96ef8d07e600aef +size 9347809579 diff --git a/model-00008-of-00046.safetensors b/model-00008-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3030a15f752a08dda09c1b84e82307b1c30b9405 --- /dev/null +++ b/model-00008-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09cdbb452fe2410e5dfa36e9a135cdf47a042eeb7924d8041e281f27e25ce78b +size 10011763881 diff --git a/model-00009-of-00046.safetensors b/model-00009-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d39017a255bbe67b0f8a8fb4d5338910992ca39 --- /dev/null +++ b/model-00009-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:fbd4c3fe3609dd044e4522a3881d8203d72673734ec31a7dd5d186279a84d862 +size 9347809721 diff --git a/model-00010-of-00046.safetensors b/model-00010-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9b7812e833814f1b8d9527b39e371275e5362bf --- /dev/null +++ b/model-00010-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4783e9a0455163374c91373d49fa92ffe35b519d94944cbc7aeda7b6467b7d10 +size 10155376967 diff --git a/model-00011-of-00046.safetensors b/model-00011-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4958c5462ebf51b790d5f2337888ed5de67c7377 --- /dev/null +++ b/model-00011-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af3aec5a3960f4bb7f1396f6ca4023c021218c6a7e2c96b4808df53bac50ba03 +size 9204196509 diff --git a/model-00012-of-00046.safetensors b/model-00012-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4efb57dd4134c4340540edfcd2357c7896a59840 --- /dev/null +++ b/model-00012-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0bf081f545cbc6b42d42c3535e7e0f672c933f43172cb376781ce05a47dbbfe +size 10155377033 diff --git a/model-00013-of-00046.safetensors b/model-00013-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2436a45b6392915d9bc843aaad33b3352fe9661 --- /dev/null +++ b/model-00013-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02ea268a23b2d9f78e247a9fd1411cb6cfe416a2afeddb7b8a91877364bd5e36 +size 10156261688 diff --git a/model-00014-of-00046.safetensors b/model-00014-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d3790b783746684c697915f067ca776b6e7f32f --- /dev/null +++ b/model-00014-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:7d59b5be057578f8916304da1bee1e4515591906213548ee0e630c5c504199aa +size 9203311778 diff --git a/model-00015-of-00046.safetensors b/model-00015-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..090bdd52d98d1b3f6ca199cf0579cbc227c421e2 --- /dev/null +++ b/model-00015-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1deb56fcf3a6407cec7d1d5db496ca0a320e5adc7105e54078b072c3a2db152 +size 10156261796 diff --git a/model-00016-of-00046.safetensors b/model-00016-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6528469b563cd73d4f7f65daf1569a9394fb4c6a --- /dev/null +++ b/model-00016-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be026465c25063434b217b61011f0b89ec4f547a66ddc356048c88921956ae49 +size 9347809641 diff --git a/model-00017-of-00046.safetensors b/model-00017-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..599ac19d422543fd10f8f4f3869a369feb314d50 --- /dev/null +++ b/model-00017-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe512cfe86eee10692f7b0849ff309e6ecbbc8057caa8b293975de43e39fcca +size 10011763873 diff --git a/model-00018-of-00046.safetensors b/model-00018-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d784d6ff2c98c22000ce1936e323947354060228 --- /dev/null +++ b/model-00018-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935c68b01eb25becec0925639ae31a598c652e06a6e322491e04cf04b8cf790c +size 9347809659 diff --git a/model-00019-of-00046.safetensors b/model-00019-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3304c79459ce82c8753faa9b7122a948e2af142d --- /dev/null +++ b/model-00019-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9422f9b6475843cb6b545d3357c9a68fd6b2d2f0906be8c6699284b25a415ee2 +size 10155376959 diff --git a/model-00020-of-00046.safetensors b/model-00020-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..542ad5c88e7f8bbbdc751649d75b642e815f0f36 --- /dev/null +++ b/model-00020-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe7c1110281d4560d9d9b842298cac740b9019a99dc3bc84c2f3da47bf5a504 +size 9204196503 diff --git a/model-00021-of-00046.safetensors b/model-00021-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23091cd79badd3671eefc8235e3ca645a971db19 --- /dev/null +++ b/model-00021-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0176db04d61b4c7c89833f76c04a807c6ede2289ccf31540927f393584324240 +size 10155377089 diff --git a/model-00022-of-00046.safetensors b/model-00022-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..edb25091a384e8e15830a6611e894286ef0df9d2 --- /dev/null +++ b/model-00022-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8acaa5b8df2c350f64694788066f3bd56093c4a1983ed7fe8971991d09ccd15 +size 10156261692 diff --git a/model-00023-of-00046.safetensors b/model-00023-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85ae886f562c8fa3cb0487b2f296b6bdcd7bcb93 --- /dev/null +++ b/model-00023-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb2df2941ccec486bd463a63cc2b28be9d380900aa4456104c63eea8a2b5a6d +size 9203311774 diff --git a/model-00024-of-00046.safetensors b/model-00024-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9be71ff3fc78118f4f703423278307fdb54d9a0b --- /dev/null +++ b/model-00024-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cb3735e43602e890a1c144cd99dbafcde26480e0080ade87a9cbd9e423aacadf +size 10156261730 diff --git a/model-00025-of-00046.safetensors b/model-00025-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7c17d680d5b10cfc1df543f04989712e4c7641bb --- /dev/null +++ b/model-00025-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403334f35e415601bc41fe7b0964a926c9891d243b6d5988ad239a7101a00020 +size 9347809601 diff --git a/model-00026-of-00046.safetensors b/model-00026-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..853bd2486d1dce008cfc54872f64285538f33b5e --- /dev/null +++ b/model-00026-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1ef7b2552738fbde0ccf7a646768820368d79ce103a42e047d4b4f282e9339 +size 10011763873 diff --git a/model-00027-of-00046.safetensors b/model-00027-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99ce05864cb65b3fae470533bbd05ed36bb65864 --- /dev/null +++ b/model-00027-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdea26e1f6acc225e4dd12590cc1af2504665eecde4894503f5ecaa37de696f3 +size 9347809663 diff --git a/model-00028-of-00046.safetensors b/model-00028-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfc84666f03adba85a37b0efe09ceb37da654274 --- /dev/null +++ b/model-00028-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c03dc95c03f57085671bd78caad80d6f17dd2ab883cda7b5f3a6caa3e8a824be +size 10155376967 diff --git a/model-00029-of-00046.safetensors b/model-00029-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0c9ee433e700d8af8ecce707c22e53bdaf8ac176 --- /dev/null +++ b/model-00029-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9aa7f31fa574004b31e1b07e3b9a9414c3e3cfc07a398aad70c28f500f979c02 +size 9204196513 diff --git a/model-00030-of-00046.safetensors b/model-00030-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4c49686c195995d1f071768628e362647b34e0b2 --- /dev/null +++ b/model-00030-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e66db4b616759ef952a5fda2e3a16ef782d04d923de85b0bab03adcc3db6552 +size 10155377087 diff --git a/model-00031-of-00046.safetensors b/model-00031-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c94f5726d9ea21d24e6d5b7d7d1b3ebdaab13782 --- /dev/null +++ b/model-00031-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f75dbb652e4f4cf09ab4ccd6dc4691eca605d43c346b7587597faafca2519a74 +size 10156261688 diff --git a/model-00032-of-00046.safetensors b/model-00032-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5ea1c9e5d7b8b837ebea8d96062ce42bd25fea7a --- /dev/null +++ b/model-00032-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe55622290db104adebfbc5984b27a7f5f17a6c1919b244e5d0d93767153420 +size 9203311778 diff --git a/model-00033-of-00046.safetensors b/model-00033-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a038fb1ff9e85ac7997d1c5be89e6cf7a4a3919 --- /dev/null +++ b/model-00033-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6034239e01fcd817de6f3d52b6ef826cca37691544e6f7c44298d45e65eb2e5 +size 10156261848 diff --git a/model-00034-of-00046.safetensors b/model-00034-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a14810ed3bd1909877dcc4f6cf86a9c217520798 --- /dev/null +++ b/model-00034-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:87b0867a3926308386eee4387bbae96a03ceae6452228c7738be652dab880ceb +size 9347809599 diff --git a/model-00035-of-00046.safetensors b/model-00035-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b912a5dfe0a3551e19f5b76a1dcb0a4d6a1423f7 --- /dev/null +++ b/model-00035-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c49af3fb161be1ce11ad6b65e933e979eb583e72bb2d327731e64e66cf2cb642 +size 10011763877 diff --git a/model-00036-of-00046.safetensors b/model-00036-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04605b4b95c340a4a89f5921384037953ed5e3c2 --- /dev/null +++ b/model-00036-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711bd7e7fd787faebb4685bb4553d6ed6d3e73f47cc2e77198bb49fad91e3a39 +size 9347809671 diff --git a/model-00037-of-00046.safetensors b/model-00037-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..81a4723a97d798e19fb9018d0e458aa4a786e6e0 --- /dev/null +++ b/model-00037-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb2ce49ab2af2340a5da262ecd9bbd22f928e2205b9c992e61ec64b488add3e2 +size 10155376951 diff --git a/model-00038-of-00046.safetensors b/model-00038-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfc10b9613485b854613505861421a61799e50e8 --- /dev/null +++ b/model-00038-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed671a32202f3b54a4937272c3214e085c7e3da727953d157f166febf1dae31f +size 9204196517 diff --git a/model-00039-of-00046.safetensors b/model-00039-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..45d2977e572cd5cd184cb63f91b7a65298152974 --- /dev/null +++ b/model-00039-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:39adcf7cd339a02667e0fcefcb5f880f6b06f4e65dac94a8bbaf3381298f792e +size 10155377053 diff --git a/model-00040-of-00046.safetensors b/model-00040-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28f7216c8746447a635ca72fdb57af79b5ce3cd8 --- /dev/null +++ b/model-00040-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dca946cae6e5c0aadc4b4cb508d26573beb3d2ae2cd66ce1cee577ad76449d6 +size 10157146433 diff --git a/model-00041-of-00046.safetensors b/model-00041-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cb7edf133beae4a7156c6fcb119abf1aa9678d5 --- /dev/null +++ b/model-00041-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e83bc722da574eac7ebca1f5b82e23838b99fb548bf54e16a23c9fbbe658e70 +size 10012648598 diff --git a/model-00042-of-00046.safetensors b/model-00042-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29f040415a5b256b51c893cbb403a2d0c87859fa --- /dev/null +++ b/model-00042-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152551d2b6fe6116b37623b3b9d6e88485309d46f31446c398e697dd60d34c12 +size 9153658816 diff --git a/model-00043-of-00046.safetensors b/model-00043-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2b64842ad29e80d33e5f440f0f99ef95f80d8ae --- /dev/null +++ b/model-00043-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f67f4ce2d3558a42f5401d3301d9ded1015504f8d20e5dbf0ec5a4d312bf9127 +size 10012648542 diff --git a/model-00044-of-00046.safetensors b/model-00044-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2ab7e2e4659bb08a20a507609c31f26ea8660c1 --- /dev/null +++ b/model-00044-of-00046.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:90199160428104b9838637f552e94fe5f837fda6eed4ac99a224e19d01e89c2b +size 9153658804 diff --git a/model-00045-of-00046.safetensors b/model-00045-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4ee569771ae5e7eb826b7eba422888a908821093 --- /dev/null +++ b/model-00045-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb09d896f4081a37eb5a9d13125d3bb7a7ec1070c0518cee4c63a262fb4c5d5 +size 10012648612 diff --git a/model-00046-of-00046.safetensors b/model-00046-of-00046.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8799e0dea4af73a4b387694702f1047e3c8b9d48 --- /dev/null +++ b/model-00046-of-00046.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ab13f2ad70526cd404d4ad8f0aa4964e04508f370fe6ffc04147168220951 +size 9806741170 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d95ba29bbb68720af1d31ce5742c2951982c40f2 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,3804 @@ +{ + "metadata": { + "total_size": 448891367424, + "total_parameters": 743911218432 + }, + "weight_map": { + "lm_head.biases": "model-00046-of-00046.safetensors", + "lm_head.scales": "model-00046-of-00046.safetensors", + "lm_head.weight": "model-00046-of-00046.safetensors", + "model.embed_tokens.biases": "model-00001-of-00046.safetensors", + "model.embed_tokens.scales": "model-00001-of-00046.safetensors", + "model.embed_tokens.weight": "model-00001-of-00046.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.down_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.down_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00046.safetensors", + 
"model.layers.0.mlp.gate_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.up_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.up_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.embed_q.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.k_norm.bias": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.k_norm.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.wk.biases": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.wk.scales": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.wk.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.wq_b.biases": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.wq_b.scales": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.indexer.wq_b.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.kv_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.o_proj.biases": 
"model-00001-of-00046.safetensors", + "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.q_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.q_a_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.q_a_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.q_b_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.q_b_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.q_b_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.0.self_attn.unembed_out.weight": "model-00001-of-00046.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.down_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.down_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.up_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.up_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.embed_q.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.k_norm.bias": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.k_norm.weight": "model-00001-of-00046.safetensors", + 
"model.layers.1.self_attn.indexer.weights_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.wk.biases": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.wk.scales": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.wk.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.wq_b.biases": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.wq_b.scales": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.indexer.wq_b.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.kv_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.q_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.q_a_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.q_a_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.q_a_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.q_b_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.q_b_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.1.self_attn.q_b_proj.weight": "model-00001-of-00046.safetensors", + 
"model.layers.1.self_attn.unembed_out.weight": "model-00001-of-00046.safetensors", + "model.layers.10.input_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.gate.e_score_correction_bias": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.gate.weight": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.weight": "model-00005-of-00046.safetensors", + 
"model.layers.10.post_attention_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.10.self_attn.embed_q.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.k_norm.bias": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.k_norm.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.wk.biases": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.wk.scales": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.wk.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.wq_b.biases": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.wq_b.scales": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.indexer.wq_b.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.kv_a_layernorm.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.biases": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.scales": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.o_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.o_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.q_a_layernorm.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.q_a_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.q_a_proj.scales": 
"model-00005-of-00046.safetensors", + "model.layers.10.self_attn.q_a_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.q_b_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.q_b_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.q_b_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.10.self_attn.unembed_out.weight": "model-00005-of-00046.safetensors", + "model.layers.11.input_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.gate.e_score_correction_bias": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.gate.weight": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00046.safetensors", + 
"model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.embed_q.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.k_norm.bias": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.k_norm.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.wk.biases": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.wk.scales": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.wk.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.wq_b.biases": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.wq_b.scales": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.indexer.wq_b.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.kv_a_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.biases": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.scales": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.o_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.o_proj.scales": 
"model-00006-of-00046.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.q_a_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.q_a_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.q_a_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.q_a_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.q_b_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.q_b_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.q_b_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.11.self_attn.unembed_out.weight": "model-00006-of-00046.safetensors", + "model.layers.12.input_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.gate.e_score_correction_bias": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.gate.weight": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.biases": "model-00007-of-00046.safetensors", + 
"model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.12.self_attn.embed_q.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.k_norm.bias": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.k_norm.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.wk.biases": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.wk.scales": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.wk.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.wq_b.biases": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.wq_b.scales": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.indexer.wq_b.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.kv_a_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.biases": 
"model-00006-of-00046.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.scales": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.o_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.o_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.q_a_layernorm.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.q_a_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.q_a_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.q_a_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.q_b_proj.biases": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.q_b_proj.scales": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.q_b_proj.weight": "model-00006-of-00046.safetensors", + "model.layers.12.self_attn.unembed_out.weight": "model-00006-of-00046.safetensors", + "model.layers.13.input_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.gate.e_score_correction_bias": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.gate.weight": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.biases": 
"model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.embed_q.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.k_norm.bias": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.k_norm.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.wk.biases": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.wk.scales": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.wk.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.wq_b.biases": "model-00007-of-00046.safetensors", + 
"model.layers.13.self_attn.indexer.wq_b.scales": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.indexer.wq_b.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.kv_a_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.o_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.o_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.q_a_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.q_a_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.q_a_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.q_a_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.q_b_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.q_b_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.q_b_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.13.self_attn.unembed_out.weight": "model-00007-of-00046.safetensors", + "model.layers.14.input_layernorm.weight": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.gate.e_score_correction_bias": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.gate.weight": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00008-of-00046.safetensors", + 
"model.layers.14.mlp.shared_experts.gate_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00008-of-00046.safetensors", + "model.layers.14.self_attn.embed_q.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.k_norm.bias": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.k_norm.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.weight": "model-00007-of-00046.safetensors", + 
"model.layers.14.self_attn.indexer.wk.biases": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.wk.scales": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.wk.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.wq_b.biases": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.wq_b.scales": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.indexer.wq_b.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.kv_a_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.o_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.o_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.q_a_layernorm.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.q_a_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.q_a_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.q_a_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.q_b_proj.biases": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.q_b_proj.scales": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.q_b_proj.weight": "model-00007-of-00046.safetensors", + "model.layers.14.self_attn.unembed_out.weight": "model-00007-of-00046.safetensors", + "model.layers.15.input_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.gate.e_score_correction_bias": "model-00009-of-00046.safetensors", + 
"model.layers.15.mlp.gate.weight": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.15.self_attn.embed_q.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.k_norm.bias": "model-00008-of-00046.safetensors", + 
"model.layers.15.self_attn.indexer.k_norm.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.wk.biases": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.wk.scales": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.wk.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.wq_b.biases": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.wq_b.scales": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.indexer.wq_b.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.kv_a_layernorm.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.biases": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.scales": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.o_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.o_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.q_a_layernorm.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.q_a_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.q_a_proj.scales": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.q_a_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.q_b_proj.biases": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.q_b_proj.scales": 
"model-00008-of-00046.safetensors", + "model.layers.15.self_attn.q_b_proj.weight": "model-00008-of-00046.safetensors", + "model.layers.15.self_attn.unembed_out.weight": "model-00008-of-00046.safetensors", + "model.layers.16.input_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.gate.e_score_correction_bias": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.gate.weight": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.scales": "model-00009-of-00046.safetensors", + 
"model.layers.16.mlp.switch_mlp.up_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.embed_q.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.k_norm.bias": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.k_norm.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.wk.biases": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.wk.scales": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.wk.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.wq_b.biases": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.wq_b.scales": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.indexer.wq_b.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.kv_a_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.biases": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.scales": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.o_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.o_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.q_a_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.q_a_proj.biases": 
"model-00009-of-00046.safetensors", + "model.layers.16.self_attn.q_a_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.q_a_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.q_b_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.q_b_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.q_b_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.16.self_attn.unembed_out.weight": "model-00009-of-00046.safetensors", + "model.layers.17.input_layernorm.weight": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.gate.e_score_correction_bias": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.gate.weight": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00046.safetensors", + 
"model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00010-of-00046.safetensors", + "model.layers.17.self_attn.embed_q.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.k_norm.bias": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.k_norm.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.wk.biases": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.wk.scales": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.wk.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.wq_b.biases": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.wq_b.scales": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.indexer.wq_b.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.kv_a_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.biases": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.scales": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.o_proj.biases": 
"model-00009-of-00046.safetensors", + "model.layers.17.self_attn.o_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.q_a_layernorm.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.q_a_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.q_a_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.q_a_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.q_b_proj.biases": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.q_b_proj.scales": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.q_b_proj.weight": "model-00009-of-00046.safetensors", + "model.layers.17.self_attn.unembed_out.weight": "model-00009-of-00046.safetensors", + "model.layers.18.input_layernorm.weight": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.gate.e_score_correction_bias": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.gate.weight": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.biases": 
"model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.embed_q.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.k_norm.bias": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.k_norm.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.wk.biases": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.wk.scales": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.wk.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.wq_b.biases": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.wq_b.scales": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.indexer.wq_b.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.kv_a_layernorm.weight": "model-00010-of-00046.safetensors", + 
"model.layers.18.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.o_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.o_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.q_a_layernorm.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.q_a_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.q_a_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.q_a_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.q_b_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.q_b_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.q_b_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.18.self_attn.unembed_out.weight": "model-00010-of-00046.safetensors", + "model.layers.19.input_layernorm.weight": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.gate.e_score_correction_bias": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.gate.weight": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00011-of-00046.safetensors", + 
"model.layers.19.mlp.shared_experts.up_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00011-of-00046.safetensors", + "model.layers.19.self_attn.embed_q.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.k_norm.bias": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.k_norm.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.wk.biases": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.wk.scales": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.wk.weight": "model-00010-of-00046.safetensors", + 
"model.layers.19.self_attn.indexer.wq_b.biases": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.wq_b.scales": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.indexer.wq_b.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.kv_a_layernorm.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.o_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.o_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.q_a_layernorm.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.q_a_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.q_a_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.q_a_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.q_b_proj.biases": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.q_b_proj.scales": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.q_b_proj.weight": "model-00010-of-00046.safetensors", + "model.layers.19.self_attn.unembed_out.weight": "model-00010-of-00046.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.down_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.down_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.gate_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.gate_proj.scales": 
"model-00001-of-00046.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.up_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.up_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.embed_q.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.k_norm.bias": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.k_norm.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.wk.biases": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.wk.scales": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.wk.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.wq_b.biases": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.wq_b.scales": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.indexer.wq_b.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.kv_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00046.safetensors", + 
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.q_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.q_a_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.q_a_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.q_a_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.q_b_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.q_b_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.q_b_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.2.self_attn.unembed_out.weight": "model-00001-of-00046.safetensors", + "model.layers.20.input_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.gate.e_score_correction_bias": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.gate.weight": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00012-of-00046.safetensors", + 
"model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.20.self_attn.embed_q.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.k_norm.bias": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.k_norm.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.wk.biases": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.wk.scales": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.wk.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.wq_b.biases": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.wq_b.scales": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.indexer.wq_b.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.kv_a_layernorm.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.biases": "model-00011-of-00046.safetensors", + 
"model.layers.20.self_attn.kv_a_proj_with_mqa.scales": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.o_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.o_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.q_a_layernorm.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.q_a_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.q_a_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.q_a_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.q_b_proj.biases": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.q_b_proj.scales": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.q_b_proj.weight": "model-00011-of-00046.safetensors", + "model.layers.20.self_attn.unembed_out.weight": "model-00011-of-00046.safetensors", + "model.layers.21.input_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.gate.e_score_correction_bias": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.gate.weight": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.biases": "model-00012-of-00046.safetensors", + 
"model.layers.21.mlp.shared_experts.up_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.embed_q.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.k_norm.bias": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.k_norm.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.wk.biases": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.wk.scales": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.wk.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.wq_b.biases": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.wq_b.scales": 
"model-00012-of-00046.safetensors", + "model.layers.21.self_attn.indexer.wq_b.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.kv_a_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.biases": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.scales": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.o_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.o_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.q_a_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.q_a_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.q_a_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.q_a_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.q_b_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.q_b_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.q_b_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.21.self_attn.unembed_out.weight": "model-00012-of-00046.safetensors", + "model.layers.22.input_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.gate.e_score_correction_bias": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.gate.weight": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.biases": 
"model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.22.self_attn.embed_q.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.k_norm.bias": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.k_norm.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.wk.biases": 
"model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.wk.scales": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.wk.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.wq_b.biases": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.wq_b.scales": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.indexer.wq_b.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.kv_a_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.biases": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.scales": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.o_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.o_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.q_a_layernorm.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.q_a_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.q_a_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.q_a_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.q_b_proj.biases": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.q_b_proj.scales": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.q_b_proj.weight": "model-00012-of-00046.safetensors", + "model.layers.22.self_attn.unembed_out.weight": "model-00012-of-00046.safetensors", + "model.layers.23.input_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.gate.e_score_correction_bias": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.gate.weight": "model-00013-of-00046.safetensors", + 
"model.layers.23.mlp.shared_experts.down_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.embed_q.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.k_norm.bias": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.k_norm.weight": "model-00013-of-00046.safetensors", + 
"model.layers.23.self_attn.indexer.weights_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.wk.biases": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.wk.scales": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.wk.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.wq_b.biases": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.wq_b.scales": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.indexer.wq_b.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.kv_a_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.o_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.o_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.q_a_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.q_a_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.q_a_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.q_a_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.q_b_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.q_b_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.23.self_attn.q_b_proj.weight": "model-00013-of-00046.safetensors", + 
"model.layers.23.self_attn.unembed_out.weight": "model-00013-of-00046.safetensors", + "model.layers.24.input_layernorm.weight": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.gate.e_score_correction_bias": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.gate.weight": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.weight": "model-00014-of-00046.safetensors", + 
"model.layers.24.post_attention_layernorm.weight": "model-00014-of-00046.safetensors", + "model.layers.24.self_attn.embed_q.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.k_norm.bias": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.k_norm.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.wk.biases": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.wk.scales": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.wk.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.wq_b.biases": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.wq_b.scales": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.indexer.wq_b.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.kv_a_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.o_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.o_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.q_a_layernorm.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.q_a_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.q_a_proj.scales": 
"model-00013-of-00046.safetensors", + "model.layers.24.self_attn.q_a_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.q_b_proj.biases": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.q_b_proj.scales": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.q_b_proj.weight": "model-00013-of-00046.safetensors", + "model.layers.24.self_attn.unembed_out.weight": "model-00013-of-00046.safetensors", + "model.layers.25.input_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.gate.e_score_correction_bias": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.gate.weight": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00046.safetensors", + 
"model.layers.25.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.25.self_attn.embed_q.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.k_norm.bias": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.k_norm.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.wk.biases": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.wk.scales": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.wk.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.wq_b.biases": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.wq_b.scales": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.indexer.wq_b.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.kv_a_layernorm.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.biases": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.scales": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.o_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.o_proj.scales": 
"model-00014-of-00046.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.q_a_layernorm.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.q_a_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.q_a_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.q_a_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.q_b_proj.biases": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.q_b_proj.scales": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.q_b_proj.weight": "model-00014-of-00046.safetensors", + "model.layers.25.self_attn.unembed_out.weight": "model-00014-of-00046.safetensors", + "model.layers.26.input_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.gate.e_score_correction_bias": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.gate.weight": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.biases": "model-00015-of-00046.safetensors", + 
"model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.embed_q.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.k_norm.bias": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.k_norm.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.wk.biases": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.wk.scales": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.wk.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.wq_b.biases": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.wq_b.scales": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.indexer.wq_b.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.kv_a_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.biases": 
"model-00015-of-00046.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.scales": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.o_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.o_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.q_a_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.q_a_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.q_a_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.q_a_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.q_b_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.q_b_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.q_b_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.26.self_attn.unembed_out.weight": "model-00015-of-00046.safetensors", + "model.layers.27.input_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.gate.e_score_correction_bias": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.gate.weight": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.biases": 
"model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.27.self_attn.embed_q.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.k_norm.bias": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.k_norm.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.wk.biases": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.wk.scales": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.wk.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.wq_b.biases": "model-00015-of-00046.safetensors", + 
"model.layers.27.self_attn.indexer.wq_b.scales": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.indexer.wq_b.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.kv_a_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.biases": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.scales": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.o_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.o_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.q_a_layernorm.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.q_a_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.q_a_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.q_a_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.q_b_proj.biases": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.q_b_proj.scales": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.q_b_proj.weight": "model-00015-of-00046.safetensors", + "model.layers.27.self_attn.unembed_out.weight": "model-00015-of-00046.safetensors", + "model.layers.28.input_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.gate.e_score_correction_bias": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.gate.weight": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.weight": "model-00016-of-00046.safetensors", + 
"model.layers.28.mlp.shared_experts.gate_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.embed_q.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.k_norm.bias": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.k_norm.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.weight": "model-00016-of-00046.safetensors", + 
"model.layers.28.self_attn.indexer.wk.biases": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.wk.scales": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.wk.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.wq_b.biases": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.wq_b.scales": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.indexer.wq_b.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.kv_a_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.o_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.o_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.q_a_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.q_a_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.q_a_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.q_a_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.q_b_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.q_b_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.q_b_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.28.self_attn.unembed_out.weight": "model-00016-of-00046.safetensors", + "model.layers.29.input_layernorm.weight": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.gate.e_score_correction_bias": "model-00017-of-00046.safetensors", + 
"model.layers.29.mlp.gate.weight": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00017-of-00046.safetensors", + "model.layers.29.self_attn.embed_q.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.k_norm.bias": "model-00016-of-00046.safetensors", + 
"model.layers.29.self_attn.indexer.k_norm.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.wk.biases": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.wk.scales": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.wk.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.wq_b.biases": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.wq_b.scales": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.indexer.wq_b.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.kv_a_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.o_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.o_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.q_a_layernorm.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.q_a_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.q_a_proj.scales": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.q_a_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.q_b_proj.biases": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.q_b_proj.scales": 
"model-00016-of-00046.safetensors", + "model.layers.29.self_attn.q_b_proj.weight": "model-00016-of-00046.safetensors", + "model.layers.29.self_attn.unembed_out.weight": "model-00016-of-00046.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.gate.e_score_correction_bias": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.gate.weight": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.scales": "model-00001-of-00046.safetensors", + 
"model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.embed_q.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.k_norm.bias": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.k_norm.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.wk.biases": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.wk.scales": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.wk.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.wq_b.biases": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.wq_b.scales": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.indexer.wq_b.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.kv_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.q_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.q_a_proj.biases": "model-00001-of-00046.safetensors", + 
"model.layers.3.self_attn.q_a_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.q_a_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.q_b_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.q_b_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.q_b_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.3.self_attn.unembed_out.weight": "model-00001-of-00046.safetensors", + "model.layers.30.input_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.gate.e_score_correction_bias": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.gate.weight": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.scales": 
"model-00017-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.30.self_attn.embed_q.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.k_norm.bias": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.k_norm.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.wk.biases": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.wk.scales": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.wk.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.wq_b.biases": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.wq_b.scales": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.indexer.wq_b.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.kv_a_layernorm.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.biases": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.scales": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.o_proj.biases": "model-00017-of-00046.safetensors", + 
"model.layers.30.self_attn.o_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.q_a_layernorm.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.q_a_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.q_a_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.q_a_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.q_b_proj.biases": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.q_b_proj.scales": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.q_b_proj.weight": "model-00017-of-00046.safetensors", + "model.layers.30.self_attn.unembed_out.weight": "model-00017-of-00046.safetensors", + "model.layers.31.input_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.gate.e_score_correction_bias": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.gate.weight": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.biases": "model-00018-of-00046.safetensors", + 
"model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.embed_q.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.k_norm.bias": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.k_norm.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.wk.biases": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.wk.scales": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.wk.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.wq_b.biases": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.wq_b.scales": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.indexer.wq_b.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.kv_a_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.biases": 
"model-00018-of-00046.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.scales": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.o_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.o_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.q_a_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.q_a_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.q_a_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.q_a_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.q_b_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.q_b_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.q_b_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.31.self_attn.unembed_out.weight": "model-00018-of-00046.safetensors", + "model.layers.32.input_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.gate.e_score_correction_bias": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.gate.weight": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.biases": 
"model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.32.self_attn.embed_q.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.k_norm.bias": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.k_norm.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.wk.biases": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.wk.scales": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.wk.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.wq_b.biases": "model-00018-of-00046.safetensors", + 
"model.layers.32.self_attn.indexer.wq_b.scales": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.indexer.wq_b.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.kv_a_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.biases": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.scales": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.o_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.o_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.q_a_layernorm.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.q_a_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.q_a_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.q_a_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.q_b_proj.biases": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.q_b_proj.scales": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.q_b_proj.weight": "model-00018-of-00046.safetensors", + "model.layers.32.self_attn.unembed_out.weight": "model-00018-of-00046.safetensors", + "model.layers.33.input_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.gate.e_score_correction_bias": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.gate.weight": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.weight": "model-00019-of-00046.safetensors", + 
"model.layers.33.mlp.shared_experts.gate_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.embed_q.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.k_norm.bias": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.k_norm.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.weight": "model-00019-of-00046.safetensors", + 
"model.layers.33.self_attn.indexer.wk.biases": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.wk.scales": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.wk.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.wq_b.biases": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.wq_b.scales": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.indexer.wq_b.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.kv_a_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.o_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.o_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.q_a_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.q_a_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.q_a_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.q_a_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.q_b_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.q_b_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.q_b_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.33.self_attn.unembed_out.weight": "model-00019-of-00046.safetensors", + "model.layers.34.input_layernorm.weight": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.gate.e_score_correction_bias": "model-00020-of-00046.safetensors", + 
"model.layers.34.mlp.gate.weight": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00020-of-00046.safetensors", + "model.layers.34.self_attn.embed_q.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.k_norm.bias": "model-00019-of-00046.safetensors", + 
"model.layers.34.self_attn.indexer.k_norm.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.wk.biases": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.wk.scales": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.wk.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.wq_b.biases": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.wq_b.scales": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.indexer.wq_b.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.kv_a_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.o_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.o_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.q_a_layernorm.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.q_a_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.q_a_proj.scales": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.q_a_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.q_b_proj.biases": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.q_b_proj.scales": 
"model-00019-of-00046.safetensors", + "model.layers.34.self_attn.q_b_proj.weight": "model-00019-of-00046.safetensors", + "model.layers.34.self_attn.unembed_out.weight": "model-00019-of-00046.safetensors", + "model.layers.35.input_layernorm.weight": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.gate.e_score_correction_bias": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.gate.weight": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00020-of-00046.safetensors", + 
"model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00021-of-00046.safetensors", + "model.layers.35.self_attn.embed_q.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.k_norm.bias": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.k_norm.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.wk.biases": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.wk.scales": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.wk.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.wq_b.biases": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.wq_b.scales": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.indexer.wq_b.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.kv_a_layernorm.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.biases": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.scales": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.o_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.o_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.q_a_layernorm.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.q_a_proj.biases": 
"model-00020-of-00046.safetensors", + "model.layers.35.self_attn.q_a_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.q_a_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.q_b_proj.biases": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.q_b_proj.scales": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.q_b_proj.weight": "model-00020-of-00046.safetensors", + "model.layers.35.self_attn.unembed_out.weight": "model-00020-of-00046.safetensors", + "model.layers.36.input_layernorm.weight": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.gate.e_score_correction_bias": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.gate.weight": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00046.safetensors", + 
"model.layers.36.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.embed_q.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.k_norm.bias": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.k_norm.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.wk.biases": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.wk.scales": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.wk.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.wq_b.biases": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.wq_b.scales": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.indexer.wq_b.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.kv_a_layernorm.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.biases": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.scales": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.o_proj.biases": 
"model-00021-of-00046.safetensors", + "model.layers.36.self_attn.o_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.q_a_layernorm.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.q_a_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.q_a_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.q_a_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.q_b_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.q_b_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.q_b_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.36.self_attn.unembed_out.weight": "model-00021-of-00046.safetensors", + "model.layers.37.input_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.gate.e_score_correction_bias": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.gate.weight": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.biases": 
"model-00022-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.37.self_attn.embed_q.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.k_norm.bias": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.k_norm.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.wk.biases": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.wk.scales": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.wk.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.wq_b.biases": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.wq_b.scales": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.indexer.wq_b.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.kv_a_layernorm.weight": "model-00021-of-00046.safetensors", + 
"model.layers.37.self_attn.kv_a_proj_with_mqa.biases": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.scales": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.o_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.o_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.q_a_layernorm.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.q_a_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.q_a_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.q_a_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.q_b_proj.biases": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.q_b_proj.scales": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.q_b_proj.weight": "model-00021-of-00046.safetensors", + "model.layers.37.self_attn.unembed_out.weight": "model-00021-of-00046.safetensors", + "model.layers.38.input_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.gate.e_score_correction_bias": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.gate.weight": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.weight": "model-00022-of-00046.safetensors", + 
"model.layers.38.mlp.shared_experts.up_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.embed_q.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.k_norm.bias": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.k_norm.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.wk.biases": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.wk.scales": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.wk.weight": "model-00022-of-00046.safetensors", + 
"model.layers.38.self_attn.indexer.wq_b.biases": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.wq_b.scales": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.indexer.wq_b.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.kv_a_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.o_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.o_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.q_a_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.q_a_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.q_a_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.q_a_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.q_b_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.q_b_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.q_b_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.38.self_attn.unembed_out.weight": "model-00022-of-00046.safetensors", + "model.layers.39.input_layernorm.weight": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.gate.e_score_correction_bias": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.gate.weight": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.scales": "model-00023-of-00046.safetensors", + 
"model.layers.39.mlp.shared_experts.down_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00023-of-00046.safetensors", + "model.layers.39.self_attn.embed_q.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.k_norm.bias": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.k_norm.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.scales": "model-00022-of-00046.safetensors", + 
"model.layers.39.self_attn.indexer.weights_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.wk.biases": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.wk.scales": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.wk.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.wq_b.biases": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.wq_b.scales": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.indexer.wq_b.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.kv_a_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.o_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.o_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.q_a_layernorm.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.q_a_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.q_a_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.q_a_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.q_b_proj.biases": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.q_b_proj.scales": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.q_b_proj.weight": "model-00022-of-00046.safetensors", + "model.layers.39.self_attn.unembed_out.weight": "model-00022-of-00046.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00046.safetensors", + 
"model.layers.4.mlp.gate.e_score_correction_bias": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.gate.weight": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00046.safetensors", + "model.layers.4.self_attn.embed_q.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.k_norm.bias": 
"model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.k_norm.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.wk.biases": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.wk.scales": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.wk.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.wq_b.biases": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.wq_b.scales": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.indexer.wq_b.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.kv_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.o_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.o_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.q_a_layernorm.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.q_a_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.q_a_proj.scales": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.q_a_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.q_b_proj.biases": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.q_b_proj.scales": 
"model-00001-of-00046.safetensors", + "model.layers.4.self_attn.q_b_proj.weight": "model-00001-of-00046.safetensors", + "model.layers.4.self_attn.unembed_out.weight": "model-00001-of-00046.safetensors", + "model.layers.40.input_layernorm.weight": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.gate.e_score_correction_bias": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.gate.weight": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00023-of-00046.safetensors", + 
"model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00024-of-00046.safetensors", + "model.layers.40.self_attn.embed_q.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.k_norm.bias": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.k_norm.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.wk.biases": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.wk.scales": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.wk.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.wq_b.biases": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.wq_b.scales": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.indexer.wq_b.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.kv_a_layernorm.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.biases": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.scales": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.o_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.o_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.o_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.q_a_layernorm.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.q_a_proj.biases": 
"model-00023-of-00046.safetensors", + "model.layers.40.self_attn.q_a_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.q_a_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.q_b_proj.biases": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.q_b_proj.scales": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.q_b_proj.weight": "model-00023-of-00046.safetensors", + "model.layers.40.self_attn.unembed_out.weight": "model-00023-of-00046.safetensors", + "model.layers.41.input_layernorm.weight": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.gate.e_score_correction_bias": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.gate.weight": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.biases": "model-00024-of-00046.safetensors", + 
"model.layers.41.mlp.switch_mlp.gate_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.embed_q.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.k_norm.bias": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.k_norm.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.wk.biases": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.wk.scales": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.wk.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.wq_b.biases": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.wq_b.scales": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.indexer.wq_b.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.kv_a_layernorm.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.biases": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.scales": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.o_proj.biases": 
"model-00024-of-00046.safetensors", + "model.layers.41.self_attn.o_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.o_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.q_a_layernorm.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.q_a_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.q_a_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.q_a_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.q_b_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.q_b_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.q_b_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.41.self_attn.unembed_out.weight": "model-00024-of-00046.safetensors", + "model.layers.42.input_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.gate.e_score_correction_bias": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.gate.weight": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.biases": 
"model-00025-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.42.self_attn.embed_q.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.k_norm.bias": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.k_norm.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.wk.biases": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.wk.scales": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.wk.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.wq_b.biases": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.wq_b.scales": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.indexer.wq_b.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.kv_a_layernorm.weight": "model-00024-of-00046.safetensors", + 
"model.layers.42.self_attn.kv_a_proj_with_mqa.biases": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.scales": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.o_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.o_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.o_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.q_a_layernorm.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.q_a_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.q_a_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.q_a_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.q_b_proj.biases": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.q_b_proj.scales": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.q_b_proj.weight": "model-00024-of-00046.safetensors", + "model.layers.42.self_attn.unembed_out.weight": "model-00024-of-00046.safetensors", + "model.layers.43.input_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.gate.e_score_correction_bias": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.gate.weight": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.weight": "model-00025-of-00046.safetensors", + 
"model.layers.43.mlp.shared_experts.up_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.embed_q.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.k_norm.bias": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.k_norm.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.wk.biases": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.wk.scales": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.wk.weight": "model-00025-of-00046.safetensors", + 
"model.layers.43.self_attn.indexer.wq_b.biases": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.wq_b.scales": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.indexer.wq_b.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.kv_a_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.o_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.o_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.q_a_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.q_a_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.q_a_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.q_a_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.q_b_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.q_b_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.q_b_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.43.self_attn.unembed_out.weight": "model-00025-of-00046.safetensors", + "model.layers.44.input_layernorm.weight": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.gate.e_score_correction_bias": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.gate.weight": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.scales": "model-00026-of-00046.safetensors", + 
"model.layers.44.mlp.shared_experts.down_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00026-of-00046.safetensors", + "model.layers.44.self_attn.embed_q.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.k_norm.bias": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.k_norm.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.scales": "model-00025-of-00046.safetensors", + 
"model.layers.44.self_attn.indexer.weights_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.wk.biases": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.wk.scales": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.wk.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.wq_b.biases": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.wq_b.scales": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.indexer.wq_b.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.kv_a_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.o_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.o_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.o_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.q_a_layernorm.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.q_a_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.q_a_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.q_a_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.q_b_proj.biases": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.q_b_proj.scales": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.q_b_proj.weight": "model-00025-of-00046.safetensors", + "model.layers.44.self_attn.unembed_out.weight": "model-00025-of-00046.safetensors", + "model.layers.45.input_layernorm.weight": "model-00027-of-00046.safetensors", + 
"model.layers.45.mlp.gate.e_score_correction_bias": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.gate.weight": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00027-of-00046.safetensors", + "model.layers.45.self_attn.embed_q.weight": "model-00026-of-00046.safetensors", + 
"model.layers.45.self_attn.indexer.k_norm.bias": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.k_norm.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.wk.biases": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.wk.scales": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.wk.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.wq_b.biases": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.wq_b.scales": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.indexer.wq_b.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.kv_a_layernorm.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.biases": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.scales": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.o_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.o_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.q_a_layernorm.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.q_a_proj.biases": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.q_a_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.q_a_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.q_b_proj.biases": 
"model-00026-of-00046.safetensors", + "model.layers.45.self_attn.q_b_proj.scales": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.q_b_proj.weight": "model-00026-of-00046.safetensors", + "model.layers.45.self_attn.unembed_out.weight": "model-00026-of-00046.safetensors", + "model.layers.46.input_layernorm.weight": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.gate.e_score_correction_bias": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.gate.weight": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.biases": "model-00027-of-00046.safetensors", + 
"model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.embed_q.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.k_norm.bias": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.k_norm.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.wk.biases": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.wk.scales": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.wk.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.wq_b.biases": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.wq_b.scales": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.indexer.wq_b.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.kv_a_layernorm.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.biases": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.scales": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.o_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.o_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.o_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.q_a_layernorm.weight": 
"model-00027-of-00046.safetensors", + "model.layers.46.self_attn.q_a_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.q_a_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.q_a_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.q_b_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.q_b_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.q_b_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.46.self_attn.unembed_out.weight": "model-00027-of-00046.safetensors", + "model.layers.47.input_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.gate.e_score_correction_bias": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.gate.weight": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00028-of-00046.safetensors", + 
"model.layers.47.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.47.self_attn.embed_q.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.k_norm.bias": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.k_norm.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.wk.biases": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.wk.scales": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.wk.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.wq_b.biases": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.wq_b.scales": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.indexer.wq_b.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.kv_a_layernorm.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.biases": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.scales": "model-00027-of-00046.safetensors", + 
"model.layers.47.self_attn.kv_a_proj_with_mqa.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.o_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.o_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.q_a_layernorm.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.q_a_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.q_a_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.q_a_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.q_b_proj.biases": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.q_b_proj.scales": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.q_b_proj.weight": "model-00027-of-00046.safetensors", + "model.layers.47.self_attn.unembed_out.weight": "model-00027-of-00046.safetensors", + "model.layers.48.input_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.gate.e_score_correction_bias": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.gate.weight": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.scales": "model-00028-of-00046.safetensors", + 
"model.layers.48.mlp.shared_experts.up_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.embed_q.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.k_norm.bias": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.k_norm.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.wk.biases": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.wk.scales": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.wk.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.wq_b.biases": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.wq_b.scales": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.indexer.wq_b.weight": 
"model-00028-of-00046.safetensors", + "model.layers.48.self_attn.kv_a_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.o_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.o_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.o_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.q_a_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.q_a_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.q_a_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.q_a_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.q_b_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.q_b_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.q_b_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.48.self_attn.unembed_out.weight": "model-00028-of-00046.safetensors", + "model.layers.49.input_layernorm.weight": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.gate.e_score_correction_bias": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.gate.weight": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.scales": 
"model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00029-of-00046.safetensors", + "model.layers.49.self_attn.embed_q.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.k_norm.bias": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.k_norm.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.wk.biases": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.wk.scales": "model-00028-of-00046.safetensors", 
+ "model.layers.49.self_attn.indexer.wk.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.wq_b.biases": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.wq_b.scales": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.indexer.wq_b.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.kv_a_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.o_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.o_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.q_a_layernorm.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.q_a_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.q_a_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.q_a_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.q_b_proj.biases": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.q_b_proj.scales": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.q_b_proj.weight": "model-00028-of-00046.safetensors", + "model.layers.49.self_attn.unembed_out.weight": "model-00028-of-00046.safetensors", + "model.layers.5.input_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.gate.e_score_correction_bias": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.gate.weight": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.biases": "model-00003-of-00046.safetensors", + 
"model.layers.5.mlp.shared_experts.down_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.5.self_attn.embed_q.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.k_norm.bias": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.k_norm.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.biases": "model-00002-of-00046.safetensors", + 
"model.layers.5.self_attn.indexer.weights_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.wk.biases": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.wk.scales": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.wk.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.wq_b.biases": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.wq_b.scales": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.indexer.wq_b.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.kv_a_layernorm.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.biases": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.scales": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.o_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.o_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.q_a_layernorm.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.q_a_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.q_a_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.q_a_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.q_b_proj.biases": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.q_b_proj.scales": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.q_b_proj.weight": "model-00002-of-00046.safetensors", + "model.layers.5.self_attn.unembed_out.weight": "model-00002-of-00046.safetensors", + 
"model.layers.50.input_layernorm.weight": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.gate.e_score_correction_bias": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.gate.weight": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00030-of-00046.safetensors", + 
"model.layers.50.self_attn.embed_q.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.k_norm.bias": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.k_norm.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.wk.biases": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.wk.scales": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.wk.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.wq_b.biases": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.wq_b.scales": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.indexer.wq_b.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.kv_a_layernorm.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.biases": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.scales": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.o_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.o_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.o_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.q_a_layernorm.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.q_a_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.q_a_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.q_a_proj.weight": 
"model-00029-of-00046.safetensors", + "model.layers.50.self_attn.q_b_proj.biases": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.q_b_proj.scales": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.q_b_proj.weight": "model-00029-of-00046.safetensors", + "model.layers.50.self_attn.unembed_out.weight": "model-00029-of-00046.safetensors", + "model.layers.51.input_layernorm.weight": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.gate.e_score_correction_bias": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.gate.weight": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.weight": "model-00030-of-00046.safetensors", + 
"model.layers.51.mlp.switch_mlp.up_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.post_attention_layernorm.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.embed_q.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.k_norm.bias": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.k_norm.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.wk.biases": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.wk.scales": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.wk.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.wq_b.biases": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.wq_b.scales": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.indexer.wq_b.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.kv_a_layernorm.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.biases": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.scales": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.o_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.o_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.o_proj.weight": 
"model-00030-of-00046.safetensors", + "model.layers.51.self_attn.q_a_layernorm.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.q_a_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.q_a_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.q_a_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.q_b_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.q_b_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.q_b_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.51.self_attn.unembed_out.weight": "model-00030-of-00046.safetensors", + "model.layers.52.input_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.gate.e_score_correction_bias": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.gate.weight": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.scales": "model-00031-of-00046.safetensors", + 
"model.layers.52.mlp.switch_mlp.down_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.52.self_attn.embed_q.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.k_norm.bias": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.k_norm.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.wk.biases": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.wk.scales": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.wk.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.wq_b.biases": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.wq_b.scales": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.indexer.wq_b.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.kv_a_layernorm.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.biases": "model-00030-of-00046.safetensors", + 
"model.layers.52.self_attn.kv_a_proj_with_mqa.scales": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.o_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.o_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.q_a_layernorm.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.q_a_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.q_a_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.q_a_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.q_b_proj.biases": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.q_b_proj.scales": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.q_b_proj.weight": "model-00030-of-00046.safetensors", + "model.layers.52.self_attn.unembed_out.weight": "model-00030-of-00046.safetensors", + "model.layers.53.input_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.gate.e_score_correction_bias": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.gate.weight": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.biases": "model-00031-of-00046.safetensors", + 
"model.layers.53.mlp.shared_experts.up_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.embed_q.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.k_norm.bias": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.k_norm.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.wk.biases": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.wk.scales": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.wk.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.wq_b.biases": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.wq_b.scales": 
"model-00031-of-00046.safetensors", + "model.layers.53.self_attn.indexer.wq_b.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.kv_a_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.o_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.o_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.q_a_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.q_a_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.q_a_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.q_a_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.q_b_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.q_b_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.q_b_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.53.self_attn.unembed_out.weight": "model-00031-of-00046.safetensors", + "model.layers.54.input_layernorm.weight": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.gate.e_score_correction_bias": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.gate.weight": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.biases": 
"model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00032-of-00046.safetensors", + "model.layers.54.self_attn.embed_q.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.k_norm.bias": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.k_norm.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.wk.biases": 
"model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.wk.scales": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.wk.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.wq_b.biases": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.wq_b.scales": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.indexer.wq_b.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.kv_a_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.o_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.o_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.q_a_layernorm.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.q_a_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.q_a_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.q_a_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.q_b_proj.biases": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.q_b_proj.scales": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.q_b_proj.weight": "model-00031-of-00046.safetensors", + "model.layers.54.self_attn.unembed_out.weight": "model-00031-of-00046.safetensors", + "model.layers.55.input_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.gate.e_score_correction_bias": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.gate.weight": "model-00033-of-00046.safetensors", + 
"model.layers.55.mlp.shared_experts.down_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.55.self_attn.embed_q.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.k_norm.bias": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.k_norm.weight": "model-00032-of-00046.safetensors", + 
"model.layers.55.self_attn.indexer.weights_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.wk.biases": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.wk.scales": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.wk.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.wq_b.biases": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.wq_b.scales": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.indexer.wq_b.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.kv_a_layernorm.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.biases": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.scales": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.o_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.o_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.o_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.q_a_layernorm.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.q_a_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.q_a_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.q_a_proj.weight": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.q_b_proj.biases": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.q_b_proj.scales": "model-00032-of-00046.safetensors", + "model.layers.55.self_attn.q_b_proj.weight": "model-00032-of-00046.safetensors", + 
"model.layers.55.self_attn.unembed_out.weight": "model-00032-of-00046.safetensors", + "model.layers.56.input_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.gate.e_score_correction_bias": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.gate.weight": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.weight": "model-00033-of-00046.safetensors", + 
"model.layers.56.post_attention_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.embed_q.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.k_norm.bias": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.k_norm.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.wk.biases": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.wk.scales": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.wk.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.wq_b.biases": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.wq_b.scales": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.indexer.wq_b.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.kv_a_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.biases": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.scales": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.o_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.o_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.o_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.q_a_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.q_a_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.q_a_proj.scales": 
"model-00033-of-00046.safetensors", + "model.layers.56.self_attn.q_a_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.q_b_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.q_b_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.q_b_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.56.self_attn.unembed_out.weight": "model-00033-of-00046.safetensors", + "model.layers.57.input_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.gate.e_score_correction_bias": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.gate.weight": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00046.safetensors", + 
"model.layers.57.mlp.switch_mlp.gate_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.57.post_attention_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.57.self_attn.embed_q.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.k_norm.bias": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.k_norm.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.wk.biases": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.wk.scales": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.wk.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.wq_b.biases": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.wq_b.scales": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.indexer.wq_b.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.kv_a_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.biases": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.scales": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.o_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.o_proj.scales": 
"model-00033-of-00046.safetensors", + "model.layers.57.self_attn.o_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.q_a_layernorm.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.q_a_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.q_a_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.q_a_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.q_b_proj.biases": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.q_b_proj.scales": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.q_b_proj.weight": "model-00033-of-00046.safetensors", + "model.layers.57.self_attn.unembed_out.weight": "model-00033-of-00046.safetensors", + "model.layers.58.input_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.gate.e_score_correction_bias": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.gate.weight": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.biases": "model-00034-of-00046.safetensors", + 
"model.layers.58.mlp.switch_mlp.down_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.post_attention_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.embed_q.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.k_norm.bias": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.k_norm.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.wk.biases": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.wk.scales": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.wk.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.wq_b.biases": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.wq_b.scales": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.indexer.wq_b.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.kv_a_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.biases": 
"model-00034-of-00046.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.o_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.o_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.o_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.q_a_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.q_a_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.q_a_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.q_a_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.q_b_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.q_b_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.q_b_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.58.self_attn.unembed_out.weight": "model-00034-of-00046.safetensors", + "model.layers.59.input_layernorm.weight": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.gate.e_score_correction_bias": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.gate.weight": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.biases": 
"model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.59.post_attention_layernorm.weight": "model-00035-of-00046.safetensors", + "model.layers.59.self_attn.embed_q.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.k_norm.bias": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.k_norm.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.wk.biases": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.wk.scales": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.wk.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.wq_b.biases": "model-00034-of-00046.safetensors", + 
"model.layers.59.self_attn.indexer.wq_b.scales": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.indexer.wq_b.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.kv_a_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.o_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.o_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.o_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.q_a_layernorm.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.q_a_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.q_a_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.q_a_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.q_b_proj.biases": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.q_b_proj.scales": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.q_b_proj.weight": "model-00034-of-00046.safetensors", + "model.layers.59.self_attn.unembed_out.weight": "model-00034-of-00046.safetensors", + "model.layers.6.input_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.gate.e_score_correction_bias": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.gate.weight": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00003-of-00046.safetensors", + 
"model.layers.6.mlp.shared_experts.gate_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.embed_q.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.k_norm.bias": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.k_norm.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.weight": "model-00003-of-00046.safetensors", + 
"model.layers.6.self_attn.indexer.wk.biases": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.wk.scales": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.wk.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.wq_b.biases": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.wq_b.scales": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.indexer.wq_b.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.kv_a_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.biases": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.scales": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.o_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.o_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.q_a_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.q_a_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.q_a_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.q_a_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.q_b_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.q_b_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.q_b_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.6.self_attn.unembed_out.weight": "model-00003-of-00046.safetensors", + "model.layers.60.input_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.gate.e_score_correction_bias": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.gate.weight": 
"model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.60.post_attention_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.60.self_attn.embed_q.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.k_norm.bias": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.k_norm.weight": 
"model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.wk.biases": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.wk.scales": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.wk.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.wq_b.biases": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.wq_b.scales": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.indexer.wq_b.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.kv_a_layernorm.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.biases": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.scales": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.o_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.o_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.o_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.q_a_layernorm.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.q_a_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.q_a_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.q_a_proj.weight": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.q_b_proj.biases": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.q_b_proj.scales": "model-00035-of-00046.safetensors", + "model.layers.60.self_attn.q_b_proj.weight": 
"model-00035-of-00046.safetensors", + "model.layers.60.self_attn.unembed_out.weight": "model-00035-of-00046.safetensors", + "model.layers.61.input_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.gate.e_score_correction_bias": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.gate.weight": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.weight": "model-00036-of-00046.safetensors", + 
"model.layers.61.post_attention_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.embed_q.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.k_norm.bias": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.k_norm.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.wk.biases": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.wk.scales": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.wk.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.wq_b.biases": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.wq_b.scales": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.indexer.wq_b.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.kv_a_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.biases": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.scales": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.o_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.o_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.o_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.q_a_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.q_a_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.q_a_proj.scales": 
"model-00036-of-00046.safetensors", + "model.layers.61.self_attn.q_a_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.q_b_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.q_b_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.q_b_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.61.self_attn.unembed_out.weight": "model-00036-of-00046.safetensors", + "model.layers.62.input_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.gate.e_score_correction_bias": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.gate.weight": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.scales": "model-00036-of-00046.safetensors", + 
"model.layers.62.mlp.switch_mlp.gate_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.62.post_attention_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.62.self_attn.embed_q.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.k_norm.bias": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.k_norm.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.wk.biases": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.wk.scales": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.wk.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.wq_b.biases": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.wq_b.scales": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.indexer.wq_b.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.kv_a_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.biases": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.scales": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.o_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.o_proj.scales": 
"model-00036-of-00046.safetensors", + "model.layers.62.self_attn.o_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.q_a_layernorm.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.q_a_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.q_a_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.q_a_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.q_b_proj.biases": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.q_b_proj.scales": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.q_b_proj.weight": "model-00036-of-00046.safetensors", + "model.layers.62.self_attn.unembed_out.weight": "model-00036-of-00046.safetensors", + "model.layers.63.input_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.gate.e_score_correction_bias": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.gate.weight": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.biases": "model-00037-of-00046.safetensors", + 
"model.layers.63.mlp.switch_mlp.down_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.post_attention_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.embed_q.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.k_norm.bias": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.k_norm.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.wk.biases": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.wk.scales": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.wk.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.wq_b.biases": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.wq_b.scales": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.indexer.wq_b.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.kv_a_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.biases": 
"model-00037-of-00046.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.o_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.o_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.o_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.q_a_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.q_a_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.q_a_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.q_a_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.q_b_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.q_b_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.q_b_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.63.self_attn.unembed_out.weight": "model-00037-of-00046.safetensors", + "model.layers.64.input_layernorm.weight": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.gate.e_score_correction_bias": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.gate.weight": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.biases": 
"model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.64.post_attention_layernorm.weight": "model-00038-of-00046.safetensors", + "model.layers.64.self_attn.embed_q.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.k_norm.bias": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.k_norm.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.wk.biases": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.wk.scales": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.wk.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.wq_b.biases": "model-00037-of-00046.safetensors", + 
"model.layers.64.self_attn.indexer.wq_b.scales": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.indexer.wq_b.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.kv_a_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.o_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.o_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.o_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.q_a_layernorm.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.q_a_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.q_a_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.q_a_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.q_b_proj.biases": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.q_b_proj.scales": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.q_b_proj.weight": "model-00037-of-00046.safetensors", + "model.layers.64.self_attn.unembed_out.weight": "model-00037-of-00046.safetensors", + "model.layers.65.input_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.gate.e_score_correction_bias": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.gate.weight": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.weight": "model-00039-of-00046.safetensors", + 
"model.layers.65.mlp.shared_experts.gate_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.65.post_attention_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.65.self_attn.embed_q.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.k_norm.bias": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.k_norm.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.weight": "model-00038-of-00046.safetensors", + 
"model.layers.65.self_attn.indexer.wk.biases": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.wk.scales": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.wk.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.wq_b.biases": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.wq_b.scales": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.indexer.wq_b.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.kv_a_layernorm.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.biases": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.scales": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.o_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.o_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.o_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.q_a_layernorm.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.q_a_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.q_a_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.q_a_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.q_b_proj.biases": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.q_b_proj.scales": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.q_b_proj.weight": "model-00038-of-00046.safetensors", + "model.layers.65.self_attn.unembed_out.weight": "model-00038-of-00046.safetensors", + "model.layers.66.input_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.gate.e_score_correction_bias": "model-00039-of-00046.safetensors", + 
"model.layers.66.mlp.gate.weight": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.post_attention_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.embed_q.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.k_norm.bias": "model-00039-of-00046.safetensors", + 
"model.layers.66.self_attn.indexer.k_norm.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.wk.biases": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.wk.scales": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.wk.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.wq_b.biases": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.wq_b.scales": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.indexer.wq_b.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.kv_a_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.biases": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.scales": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.o_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.o_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.o_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.q_a_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.q_a_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.q_a_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.q_a_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.q_b_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.q_b_proj.scales": 
"model-00039-of-00046.safetensors", + "model.layers.66.self_attn.q_b_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.66.self_attn.unembed_out.weight": "model-00039-of-00046.safetensors", + "model.layers.67.input_layernorm.weight": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.gate.e_score_correction_bias": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.gate.weight": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.scales": "model-00040-of-00046.safetensors", + 
"model.layers.67.mlp.switch_mlp.up_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.67.post_attention_layernorm.weight": "model-00040-of-00046.safetensors", + "model.layers.67.self_attn.embed_q.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.k_norm.bias": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.k_norm.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.wk.biases": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.wk.scales": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.wk.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.wq_b.biases": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.wq_b.scales": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.indexer.wq_b.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.kv_a_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.biases": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.scales": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.o_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.o_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.o_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.q_a_layernorm.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.q_a_proj.biases": 
"model-00039-of-00046.safetensors", + "model.layers.67.self_attn.q_a_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.q_a_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.q_b_proj.biases": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.q_b_proj.scales": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.q_b_proj.weight": "model-00039-of-00046.safetensors", + "model.layers.67.self_attn.unembed_out.weight": "model-00039-of-00046.safetensors", + "model.layers.68.input_layernorm.weight": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.gate.e_score_correction_bias": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.gate.weight": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.biases": "model-00040-of-00046.safetensors", + 
"model.layers.68.mlp.switch_mlp.gate_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.post_attention_layernorm.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.embed_q.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.k_norm.bias": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.k_norm.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.wk.biases": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.wk.scales": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.wk.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.wq_b.biases": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.wq_b.scales": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.indexer.wq_b.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.kv_a_layernorm.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.o_proj.biases": 
"model-00040-of-00046.safetensors", + "model.layers.68.self_attn.o_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.o_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.q_a_layernorm.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.q_a_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.q_a_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.q_a_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.q_b_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.q_b_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.q_b_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.68.self_attn.unembed_out.weight": "model-00040-of-00046.safetensors", + "model.layers.69.input_layernorm.weight": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.gate.e_score_correction_bias": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.gate.weight": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.biases": 
"model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.69.post_attention_layernorm.weight": "model-00041-of-00046.safetensors", + "model.layers.69.self_attn.embed_q.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.k_norm.bias": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.k_norm.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.wk.biases": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.wk.scales": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.wk.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.wq_b.biases": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.wq_b.scales": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.indexer.wq_b.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.kv_a_layernorm.weight": "model-00040-of-00046.safetensors", + 
"model.layers.69.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.o_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.o_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.o_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.q_a_layernorm.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.q_a_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.q_a_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.q_a_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.q_b_proj.biases": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.q_b_proj.scales": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.q_b_proj.weight": "model-00040-of-00046.safetensors", + "model.layers.69.self_attn.unembed_out.weight": "model-00040-of-00046.safetensors", + "model.layers.7.input_layernorm.weight": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.gate.e_score_correction_bias": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.gate.weight": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00004-of-00046.safetensors", + 
"model.layers.7.mlp.shared_experts.up_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00046.safetensors", + "model.layers.7.self_attn.embed_q.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.k_norm.bias": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.k_norm.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.wk.biases": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.wk.scales": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.wk.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.wq_b.biases": 
"model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.wq_b.scales": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.indexer.wq_b.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.kv_a_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.biases": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.scales": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.o_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.o_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.q_a_layernorm.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.q_a_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.q_a_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.q_a_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.q_b_proj.biases": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.q_b_proj.scales": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.q_b_proj.weight": "model-00003-of-00046.safetensors", + "model.layers.7.self_attn.unembed_out.weight": "model-00003-of-00046.safetensors", + "model.layers.70.input_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.gate.e_score_correction_bias": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.gate.weight": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.weight": "model-00042-of-00046.safetensors", + 
"model.layers.70.mlp.shared_experts.gate_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.70.post_attention_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.70.self_attn.embed_q.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.k_norm.bias": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.k_norm.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.weight": "model-00041-of-00046.safetensors", + 
"model.layers.70.self_attn.indexer.wk.biases": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.wk.scales": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.wk.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.wq_b.biases": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.wq_b.scales": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.indexer.wq_b.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.kv_a_layernorm.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.biases": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.scales": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.o_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.o_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.o_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.q_a_layernorm.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.q_a_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.q_a_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.q_a_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.q_b_proj.biases": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.q_b_proj.scales": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.q_b_proj.weight": "model-00041-of-00046.safetensors", + "model.layers.70.self_attn.unembed_out.weight": "model-00041-of-00046.safetensors", + "model.layers.71.input_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.gate.e_score_correction_bias": "model-00042-of-00046.safetensors", + 
"model.layers.71.mlp.gate.weight": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.post_attention_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.embed_q.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.k_norm.bias": "model-00042-of-00046.safetensors", + 
"model.layers.71.self_attn.indexer.k_norm.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.wk.biases": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.wk.scales": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.wk.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.wq_b.biases": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.wq_b.scales": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.indexer.wq_b.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.kv_a_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.biases": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.scales": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.o_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.o_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.o_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.q_a_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.q_a_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.q_a_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.q_a_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.q_b_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.q_b_proj.scales": 
"model-00042-of-00046.safetensors", + "model.layers.71.self_attn.q_b_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.71.self_attn.unembed_out.weight": "model-00042-of-00046.safetensors", + "model.layers.72.input_layernorm.weight": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.gate.e_score_correction_bias": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.gate.weight": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.scales": "model-00043-of-00046.safetensors", + 
"model.layers.72.mlp.switch_mlp.up_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.72.post_attention_layernorm.weight": "model-00043-of-00046.safetensors", + "model.layers.72.self_attn.embed_q.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.k_norm.bias": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.k_norm.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.wk.biases": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.wk.scales": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.wk.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.wq_b.biases": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.wq_b.scales": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.indexer.wq_b.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.kv_a_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.biases": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.scales": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.o_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.o_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.o_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.q_a_layernorm.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.q_a_proj.biases": 
"model-00042-of-00046.safetensors", + "model.layers.72.self_attn.q_a_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.q_a_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.q_b_proj.biases": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.q_b_proj.scales": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.q_b_proj.weight": "model-00042-of-00046.safetensors", + "model.layers.72.self_attn.unembed_out.weight": "model-00042-of-00046.safetensors", + "model.layers.73.input_layernorm.weight": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.gate.e_score_correction_bias": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.gate.weight": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00046.safetensors", + 
"model.layers.73.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.73.post_attention_layernorm.weight": "model-00044-of-00046.safetensors", + "model.layers.73.self_attn.embed_q.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.k_norm.bias": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.k_norm.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.wk.biases": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.wk.scales": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.wk.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.wq_b.biases": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.wq_b.scales": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.indexer.wq_b.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.kv_a_layernorm.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.o_proj.biases": 
"model-00043-of-00046.safetensors", + "model.layers.73.self_attn.o_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.o_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.q_a_layernorm.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.q_a_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.q_a_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.q_a_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.q_b_proj.biases": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.q_b_proj.scales": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.q_b_proj.weight": "model-00043-of-00046.safetensors", + "model.layers.73.self_attn.unembed_out.weight": "model-00043-of-00046.safetensors", + "model.layers.74.input_layernorm.weight": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.gate.e_score_correction_bias": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.gate.weight": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.biases": 
"model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.post_attention_layernorm.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.embed_q.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.k_norm.bias": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.k_norm.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.wk.biases": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.wk.scales": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.wk.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.wq_b.biases": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.wq_b.scales": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.indexer.wq_b.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.kv_a_layernorm.weight": "model-00044-of-00046.safetensors", + 
"model.layers.74.self_attn.kv_a_proj_with_mqa.biases": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.scales": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.o_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.o_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.o_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.q_a_layernorm.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.q_a_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.q_a_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.q_a_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.q_b_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.q_b_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.q_b_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.74.self_attn.unembed_out.weight": "model-00044-of-00046.safetensors", + "model.layers.75.input_layernorm.weight": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.gate.e_score_correction_bias": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.gate.weight": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.weight": "model-00045-of-00046.safetensors", + 
"model.layers.75.mlp.shared_experts.up_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.75.post_attention_layernorm.weight": "model-00045-of-00046.safetensors", + "model.layers.75.self_attn.embed_q.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.k_norm.bias": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.k_norm.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.wk.biases": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.wk.scales": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.wk.weight": "model-00044-of-00046.safetensors", + 
"model.layers.75.self_attn.indexer.wq_b.biases": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.wq_b.scales": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.indexer.wq_b.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.kv_a_layernorm.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.biases": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.scales": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.o_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.o_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.o_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.q_a_layernorm.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.q_a_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.q_a_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.q_a_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.q_b_proj.biases": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.q_b_proj.scales": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.q_b_proj.weight": "model-00044-of-00046.safetensors", + "model.layers.75.self_attn.unembed_out.weight": "model-00044-of-00046.safetensors", + "model.layers.76.input_layernorm.weight": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.gate.e_score_correction_bias": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.gate.weight": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.scales": "model-00046-of-00046.safetensors", + 
"model.layers.76.mlp.shared_experts.down_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.76.post_attention_layernorm.weight": "model-00046-of-00046.safetensors", + "model.layers.76.self_attn.embed_q.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.k_norm.bias": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.k_norm.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.scales": "model-00045-of-00046.safetensors", + 
"model.layers.76.self_attn.indexer.weights_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.wk.biases": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.wk.scales": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.wk.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.wq_b.biases": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.wq_b.scales": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.indexer.wq_b.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.kv_a_layernorm.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.biases": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.scales": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.o_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.o_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.o_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.q_a_layernorm.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.q_a_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.q_a_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.q_a_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.q_b_proj.biases": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.q_b_proj.scales": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.q_b_proj.weight": "model-00045-of-00046.safetensors", + "model.layers.76.self_attn.unembed_out.weight": "model-00045-of-00046.safetensors", + "model.layers.77.input_layernorm.weight": "model-00046-of-00046.safetensors", + 
"model.layers.77.mlp.gate.e_score_correction_bias": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.gate.weight": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.post_attention_layernorm.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.embed_q.weight": "model-00046-of-00046.safetensors", + 
"model.layers.77.self_attn.indexer.k_norm.bias": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.k_norm.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.wk.biases": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.wk.scales": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.wk.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.wq_b.biases": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.wq_b.scales": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.indexer.wq_b.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.kv_a_layernorm.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.biases": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.scales": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.o_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.o_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.o_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.q_a_layernorm.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.q_a_proj.biases": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.q_a_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.q_a_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.q_b_proj.biases": 
"model-00046-of-00046.safetensors", + "model.layers.77.self_attn.q_b_proj.scales": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.q_b_proj.weight": "model-00046-of-00046.safetensors", + "model.layers.77.self_attn.unembed_out.weight": "model-00046-of-00046.safetensors", + "model.layers.8.input_layernorm.weight": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.gate.e_score_correction_bias": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.gate.weight": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.biases": "model-00004-of-00046.safetensors", + 
"model.layers.8.mlp.switch_mlp.up_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.embed_q.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.k_norm.bias": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.k_norm.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.wk.biases": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.wk.scales": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.wk.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.wq_b.biases": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.wq_b.scales": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.indexer.wq_b.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.kv_a_layernorm.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.o_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.o_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.q_a_layernorm.weight": "model-00004-of-00046.safetensors", + 
"model.layers.8.self_attn.q_a_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.q_a_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.q_a_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.q_b_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.q_b_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.q_b_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.8.self_attn.unembed_out.weight": "model-00004-of-00046.safetensors", + "model.layers.9.input_layernorm.weight": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.gate.e_score_correction_bias": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.gate.weight": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00046.safetensors", + 
"model.layers.9.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.biases": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.scales": "model-00005-of-00046.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.weight": "model-00005-of-00046.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00005-of-00046.safetensors", + "model.layers.9.self_attn.embed_q.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.k_norm.bias": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.k_norm.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.wk.biases": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.wk.scales": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.wk.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.wq_b.biases": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.wq_b.scales": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.indexer.wq_b.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.kv_a_layernorm.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.o_proj.biases": 
"model-00004-of-00046.safetensors", + "model.layers.9.self_attn.o_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.q_a_layernorm.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.q_a_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.q_a_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.q_a_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.q_b_proj.biases": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.q_b_proj.scales": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.q_b_proj.weight": "model-00004-of-00046.safetensors", + "model.layers.9.self_attn.unembed_out.weight": "model-00004-of-00046.safetensors", + "model.norm.weight": "model-00046-of-00046.safetensors" + } +} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..aba40197a4cdb5607f4ab7a05fb0a4ee8054fd6d --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e773648cb4e65de8660ea6365e10acca112d42a854923df93db4a6f333a82d +size 20217442 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa53776c9f7ac98333a470b78a5b732d5343d15 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,34 @@ +{ + "backend": "tokenizers", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|endoftext|>", + "extra_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>", + "<|begin_of_audio|>", + "<|end_of_audio|>", + "<|begin_of_transcription|>", + 
"<|end_of_transcription|>" + ], + "is_local": true, + "model_max_length": 202752, + "model_specific_special_tokens": {}, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "TokenizersBackend", + "tool_parser_type": "glm47" +}