yujiepan committed (verified)
Commit 82919c0 · Parent(s): d4a1eb6

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,276 @@
+ ---
+ library_name: transformers
+ base_model:
+ - zai-org/GLM-5
+ ---
+
+ This tiny model is intended for debugging. It is randomly initialized, using a configuration adapted from [zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5).
+
+ | File path | Size |
+ |------|------|
+ | model.safetensors | 9.0MB |
+
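+ The table above can be reproduced from the Hub metadata (a minimal sketch using `huggingface_hub`; sizes are reported in bytes):
+
+ ```python
+ from huggingface_hub import HfApi
+
+ # Fetch per-file metadata for this repo and print each file's size.
+ info = HfApi().model_info("tiny-random/glm-moe-dsa", files_metadata=True)
+ for sibling in info.siblings:
+     print(sibling.rfilename, sibling.size)
+ ```
+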
+ ### Example usage:
+
+ - vLLM
+
+ ```bash
+ # Multi-token prediction is supported
+ model_id=tiny-random/glm-moe-dsa
+ vllm serve $model_id \
+   --tensor-parallel-size 2 \
+   --speculative-config.method mtp \
+   --speculative-config.num_speculative_tokens 1 \
+   --tool-call-parser glm47 \
+   --reasoning-parser glm45 \
+   --enable-auto-tool-choice
+ ```
+
+ - SGLang
+
+ ```bash
+ # Multi-token prediction is supported
+ model_id=tiny-random/glm-moe-dsa
+ python3 -m sglang.launch_server --model-path $model_id --tp-size 2 \
+   --tool-call-parser glm47 \
+   --reasoning-parser glm45 \
+   --speculative-algorithm EAGLE \
+   --speculative-num-steps 3 \
+   --speculative-eagle-topk 1 \
+   --speculative-num-draft-tokens 4
+ ```
+
+ - Transformers
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "tiny-random/glm-moe-dsa"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ input_ids = torch.randint(1000, 2000, size=(1, 2333), dtype=torch.long).cuda()  # a long input to trigger DSA
+ # Or build a chat prompt instead:
+ # messages = [{"role": "user", "content": "hello"}]
+ # input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").cuda()
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     dtype=torch.bfloat16,
+     device_map="cuda",
+ )
+ generated_ids = model.generate(input_ids, max_new_tokens=32)
+ output_text = tokenizer.decode(generated_ids[0][input_ids.shape[1]:])
+ print(output_text)
+ ```
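+
+ Both the vLLM and SGLang servers above expose an OpenAI-compatible API. A minimal client-side sketch (assuming vLLM's default port 8000; SGLang defaults to 30000):
+
+ ```python
+ from openai import OpenAI
+
+ # Point the client at the locally served model; the API key is unused.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+ response = client.chat.completions.create(
+     model="tiny-random/glm-moe-dsa",
+     messages=[{"role": "user", "content": "hello"}],
+     max_tokens=32,
+ )
+ print(response.choices[0].message.content)
+ ```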
+
+ ### Code used to create this repo:
+
+ <details>
+ <summary>Click to expand</summary>
+
+ ```python
+ import json
+ from copy import deepcopy
+ from pathlib import Path
+
+ import accelerate
+ import torch
+ import torch.nn as nn
+ from huggingface_hub import file_exists, hf_hub_download
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     AutoProcessor,
+     GenerationConfig,
+     set_seed,
+ )
+
+ source_model_id = "zai-org/GLM-5"
+ save_folder = "/tmp/tiny-random/glm-moe-dsa"
+
+ # Reuse the source model's processor/tokenizer as-is.
+ processor = AutoProcessor.from_pretrained(
+     source_model_id, trust_remote_code=True)
+ processor.save_pretrained(save_folder)
+
+ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
+     config_json: dict = json.load(f)
+
+ # Shrink the architecture to tiny dimensions for debugging.
+ head_dim = 64
+ kv_lora_rank = 512
+ qk_nope_head_dim = 192
+ config_json.update({
+     "first_k_dense_replace": 1,
+     "mlp_layer_types": ['dense'] + ['sparse'],
+     "head_dim": head_dim,
+     "hidden_size": 8,
+     "index_head_dim": 32,
+     "index_n_heads": 4,
+     "intermediate_size": 32,
+     "moe_intermediate_size": 32,
+     "num_hidden_layers": 2,
+     'kv_lora_rank': kv_lora_rank,
+     "num_attention_heads": 4,
+     'num_key_value_heads': 4,
+     'q_lora_rank': 32,
+     "qk_head_dim": qk_nope_head_dim + head_dim,
+     'qk_nope_head_dim': qk_nope_head_dim,
+     'qk_rope_head_dim': head_dim,
+     'v_head_dim': qk_nope_head_dim + head_dim,
+     "tie_word_embeddings": True,
+ })
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
+     json.dump(config_json, f, indent=2)
+
+ config = AutoConfig.from_pretrained(
+     save_folder,
+     trust_remote_code=True,
+ )
+ print(config)
+ torch.set_default_dtype(torch.bfloat16)
+ model = AutoModelForCausalLM.from_config(config)
+ torch.set_default_dtype(torch.float32)
+
+ if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
+     model.generation_config = GenerationConfig.from_pretrained(
+         source_model_id, trust_remote_code=True,
+     )
+     model.generation_config.do_sample = True
+     print(model.generation_config)
+
+ # Reinitialize every weight with small random values.
+ model = model.cpu()
+ set_seed(42)
+ n_params = sum(p.numel() for p in model.parameters())
+ with torch.no_grad():
+     for name, p in sorted(model.named_parameters()):
+         torch.nn.init.normal_(p, 0, 0.1)
+         print(name, p.shape, p.numel() / n_params * 100, '%')
+ # MTP: append a multi-token-prediction layer as an extra entry in model.layers.
+ set_seed(42)
+ model.model.layers.append(nn.ModuleDict(dict(
+     shared_head=nn.ModuleDict(dict(
+         norm=nn.RMSNorm(config.hidden_size),
+         # head=deepcopy(model.model.embed_tokens),
+     )),
+     # embed_tokens=deepcopy(model.model.embed_tokens),
+     eh_proj=nn.Linear(config.hidden_size * 2,
+                       config.hidden_size, bias=False),
+     enorm=nn.RMSNorm(config.hidden_size),
+     hnorm=nn.RMSNorm(config.hidden_size),
+     input_layernorm=nn.RMSNorm(config.hidden_size),
+     post_attention_layernorm=nn.RMSNorm(config.hidden_size),
+     self_attn=deepcopy(model.model.layers[1].self_attn),
+     mlp=deepcopy(model.model.layers[1].mlp),
+ )))
+ for i in range(1, len(model.model.layers)):
+     model.model.layers[i].mlp.gate.e_score_correction_bias = torch.rand_like(
+         model.model.layers[i].mlp.gate.e_score_correction_bias).float()
+ model.save_pretrained(save_folder)
+ print(model)
+ ```
+
+ </details>
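+
+ As a quick smoke test before uploading, the saved folder can be reloaded and run end to end (a minimal sketch, assuming the `save_folder` path used above):
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM
+
+ save_folder = "/tmp/tiny-random/glm-moe-dsa"
+ # Reload the freshly saved tiny checkpoint and generate a few tokens.
+ model = AutoModelForCausalLM.from_pretrained(save_folder, dtype=torch.bfloat16)
+ input_ids = torch.randint(1000, 2000, size=(1, 16), dtype=torch.long)
+ print(model.generate(input_ids, max_new_tokens=8))
+ ```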
+
+ ### Printing the model:
+
+ <details><summary>Click to expand</summary>
+
+ ```text
+ GlmMoeDsaForCausalLM(
+   (model): GlmMoeDsaModel(
+     (embed_tokens): Embedding(154880, 8, padding_idx=154820)
+     (layers): ModuleList(
+       (0): GlmMoeDsaDecoderLayer(
+         (self_attn): GlmMoeDsaAttention(
+           (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
+           (q_a_layernorm): GlmMoeDsaRMSNorm((32,), eps=1e-06)
+           (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
+           (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
+           (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
+           (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
+           (o_proj): Linear(in_features=1024, out_features=8, bias=False)
+           (wq_b): Linear(in_features=32, out_features=1024, bias=False)
+           (wk): Linear(in_features=8, out_features=256, bias=False)
+           (k_norm): GlmMoeDsaRMSNorm((256,), eps=1e-06)
+           (weights_proj): Linear(in_features=8, out_features=4, bias=False)
+         )
+         (mlp): GlmMoeDsaMLP(
+           (gate_proj): Linear(in_features=8, out_features=32, bias=False)
+           (up_proj): Linear(in_features=8, out_features=32, bias=False)
+           (down_proj): Linear(in_features=32, out_features=8, bias=False)
+           (act_fn): SiLUActivation()
+         )
+         (input_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
+         (post_attention_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
+       )
+       (1): GlmMoeDsaDecoderLayer(
+         (self_attn): GlmMoeDsaAttention(
+           (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
+           (q_a_layernorm): GlmMoeDsaRMSNorm((32,), eps=1e-06)
+           (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
+           (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
+           (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
+           (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
+           (o_proj): Linear(in_features=1024, out_features=8, bias=False)
+           (wq_b): Linear(in_features=32, out_features=1024, bias=False)
+           (wk): Linear(in_features=8, out_features=256, bias=False)
+           (k_norm): GlmMoeDsaRMSNorm((256,), eps=1e-06)
+           (weights_proj): Linear(in_features=8, out_features=4, bias=False)
+         )
+         (mlp): GlmMoeDsaMoE(
+           (experts): GlmMoeDsaNaiveMoe(
+             (act_fn): SiLUActivation()
+           )
+           (gate): GlmMoeDsaTopkRouter()
+           (shared_experts): GlmMoeDsaMLP(
+             (gate_proj): Linear(in_features=8, out_features=32, bias=False)
+             (up_proj): Linear(in_features=8, out_features=32, bias=False)
+             (down_proj): Linear(in_features=32, out_features=8, bias=False)
+             (act_fn): SiLUActivation()
+           )
+         )
+         (input_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
+         (post_attention_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
+       )
+       (2): ModuleDict(
+         (shared_head): ModuleDict(
+           (norm): RMSNorm((8,), eps=None, elementwise_affine=True)
+         )
+         (eh_proj): Linear(in_features=16, out_features=8, bias=False)
+         (enorm): RMSNorm((8,), eps=None, elementwise_affine=True)
+         (hnorm): RMSNorm((8,), eps=None, elementwise_affine=True)
+         (input_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
+         (post_attention_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
+         (self_attn): GlmMoeDsaAttention(
+           (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
+           (q_a_layernorm): GlmMoeDsaRMSNorm((32,), eps=1e-06)
+           (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
+           (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
+           (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
+           (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
+           (o_proj): Linear(in_features=1024, out_features=8, bias=False)
+           (wq_b): Linear(in_features=32, out_features=1024, bias=False)
+           (wk): Linear(in_features=8, out_features=256, bias=False)
+           (k_norm): GlmMoeDsaRMSNorm((256,), eps=1e-06)
+           (weights_proj): Linear(in_features=8, out_features=4, bias=False)
+         )
+         (mlp): GlmMoeDsaMoE(
+           (experts): GlmMoeDsaNaiveMoe(
+             (act_fn): SiLUActivation()
+           )
+           (gate): GlmMoeDsaTopkRouter()
+           (shared_experts): GlmMoeDsaMLP(
+             (gate_proj): Linear(in_features=8, out_features=32, bias=False)
+             (up_proj): Linear(in_features=8, out_features=32, bias=False)
+             (down_proj): Linear(in_features=32, out_features=8, bias=False)
+             (act_fn): SiLUActivation()
+           )
+         )
+       )
+     )
+     (norm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
+     (rotary_emb): GlmMoeDsaRotaryEmbedding()
+   )
+   (lm_head): Linear(in_features=8, out_features=154880, bias=False)
+ )
+ ```
+
+ </details>
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
+ [gMASK]<sop>
+ {%- if tools -%}
+ <|system|>
+ # Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within <tools></tools> XML tags:
+ <tools>
+ {% for tool in tools %}
+ {{ tool | tojson(ensure_ascii=False) }}
+ {% endfor %}
+ </tools>
+
+ For each function call, output the function name and arguments within the following XML format:
+ <tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}
+ {%- macro visible_text(content) -%}
+ {%- if content is string -%}
+ {{- content }}
+ {%- elif content is iterable and content is not mapping -%}
+ {%- for item in content -%}
+ {%- if item is mapping and item.type == 'text' -%}
+ {{- item.text }}
+ {%- elif item is string -%}
+ {{- item }}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- else -%}
+ {{- content }}
+ {%- endif -%}
+ {%- endmacro -%}
+ {%- set ns = namespace(last_user_index=-1) %}
+ {%- for m in messages %}
+ {%- if m.role == 'user' %}
+ {% set ns.last_user_index = loop.index0 -%}
+ {%- endif %}
+ {%- endfor %}
+ {% for m in messages %}
+ {%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}
+ {%- elif m.role == 'assistant' -%}
+ <|assistant|>
+ {%- set reasoning_content = '' %}
+ {%- set content = visible_text(m.content) %}
+ {%- if m.reasoning_content is string %}
+ {%- set reasoning_content = m.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in content %}
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%}
+ {{ '<think>' + reasoning_content.strip() + '</think>'}}
+ {%- else -%}
+ {{ '</think>' }}
+ {%- endif -%}
+ {%- if content.strip() -%}
+ {{ content.strip() }}
+ {%- endif -%}
+ {% if m.tool_calls %}
+ {% for tc in m.tool_calls %}
+ {%- if tc.function %}
+ {%- set tc = tc.function %}
+ {%- endif %}
+ {{- '<tool_call>' + tc.name -}}
+ {% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}
+ {% endif %}
+ {%- elif m.role == 'tool' -%}
+ {%- if m.content is string -%}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|observation|>' }}
+ {%- endif %}
+ {{- '<tool_response>' }}
+ {{- m.content }}
+ {{- '</tool_response>' }}
+ {%- else -%}
+ <|observation|>{% for tr in m.content %}
+ <tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%}
+ {% endif -%}
+ {%- elif m.role == 'system' -%}
+ <|system|>{{ visible_text(m.content) }}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ <|assistant|>{{- '</think>' if (enable_thinking is defined and not enable_thinking) else '<think>' -}}
+ {%- endif -%}
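
The template above can be exercised without a server (a minimal sketch; the `get_weather` tool schema is a hypothetical example for illustration):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiny-random/glm-moe-dsa")
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool for illustration
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]
messages = [{"role": "user", "content": "What is the weather in Paris?"}]
# Render the prompt string the template produces for these inputs.
prompt = tokenizer.apply_chat_template(
    messages, tools=tools, add_generation_prompt=True, tokenize=False
)
print(prompt)
```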
config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "architectures": [
+     "GlmMoeDsaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dtype": "bfloat16",
+   "eos_token_id": [
+     154820,
+     154827,
+     154829
+   ],
+   "ep_size": 1,
+   "first_k_dense_replace": 1,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 8,
+   "index_head_dim": 32,
+   "index_n_heads": 4,
+   "index_topk": 2048,
+   "indexer_rope_interleave": true,
+   "initializer_range": 0.02,
+   "intermediate_size": 32,
+   "kv_lora_rank": 512,
+   "max_position_embeddings": 202752,
+   "mlp_layer_types": [
+     "dense",
+     "sparse"
+   ],
+   "model_type": "glm_moe_dsa",
+   "moe_intermediate_size": 32,
+   "moe_layer_freq": 1,
+   "n_group": 1,
+   "n_routed_experts": 256,
+   "n_shared_experts": 1,
+   "norm_topk_prob": true,
+   "num_attention_heads": 4,
+   "num_experts_per_tok": 8,
+   "num_hidden_layers": 2,
+   "num_key_value_heads": 4,
+   "num_nextn_predict_layers": 1,
+   "pad_token_id": 154820,
+   "pretraining_tp": 1,
+   "q_lora_rank": 32,
+   "qk_head_dim": 256,
+   "qk_nope_head_dim": 192,
+   "qk_rope_head_dim": 64,
+   "rms_norm_eps": 1e-05,
+   "rope_interleave": true,
+   "rope_parameters": {
+     "rope_theta": 1000000,
+     "rope_type": "default"
+   },
+   "routed_scaling_factor": 2.5,
+   "scoring_func": "sigmoid",
+   "tie_word_embeddings": true,
+   "topk_group": 1,
+   "topk_method": "noaux_tc",
+   "transformers_version": "5.2.0.dev0",
+   "use_cache": true,
+   "v_head_dim": 256,
+   "vocab_size": 154880
+ }
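
The attention projection shapes in the model printout follow from these config values (a quick consistency check; the relations assume the MLA-style factorization suggested by the module names):

```python
num_heads = 4
kv_lora_rank = 512
qk_nope_head_dim = 192
qk_rope_head_dim = 64
v_head_dim = 256

# q_b_proj: per-head query dim is qk_nope_head_dim + qk_rope_head_dim = 256.
assert num_heads * (qk_nope_head_dim + qk_rope_head_dim) == 1024
# kv_a_proj_with_mqa: compressed KV plus the shared rotary key.
assert kv_lora_rank + qk_rope_head_dim == 576
# kv_b_proj: expands compressed KV into per-head keys and values.
assert num_heads * (qk_nope_head_dim + v_head_dim) == 1792
# o_proj input: concatenated per-head value outputs.
assert num_heads * v_head_dim == 1024
```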
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_from_model_config": true,
+   "do_sample": true,
+   "eos_token_id": [
+     154820,
+     154827,
+     154829
+   ],
+   "pad_token_id": 154820,
+   "temperature": 1.0,
+   "top_p": 0.95,
+   "transformers_version": "5.2.0.dev0"
+ }
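
These sampling defaults are picked up automatically by `model.generate`; they can also be inspected directly (a minimal sketch):

```python
from transformers import GenerationConfig

# Load the repo's generation defaults shown above.
gen_config = GenerationConfig.from_pretrained("tiny-random/glm-moe-dsa")
print(gen_config.do_sample, gen_config.temperature, gen_config.top_p)
# Expected from the file above: True 1.0 0.95
```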
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ed734a563b043f3746f8922c9447c582f2084b63369e5a0a75347e0b210a90a
+ size 9455168
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19e773648cb4e65de8660ea6365e10acca112d42a854923df93db4a6f333a82d
+ size 20217442
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "backend": "tokenizers",
+   "clean_up_tokenization_spaces": false,
+   "do_lower_case": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": [
+     "<|endoftext|>",
+     "[MASK]",
+     "[gMASK]",
+     "[sMASK]",
+     "<sop>",
+     "<eop>",
+     "<|system|>",
+     "<|user|>",
+     "<|assistant|>",
+     "<|observation|>",
+     "<|begin_of_image|>",
+     "<|end_of_image|>",
+     "<|begin_of_video|>",
+     "<|end_of_video|>",
+     "<|begin_of_audio|>",
+     "<|end_of_audio|>",
+     "<|begin_of_transcription|>",
+     "<|end_of_transcription|>"
+   ],
+   "is_local": false,
+   "model_max_length": 202752,
+   "model_specific_special_tokens": {},
+   "pad_token": "<|endoftext|>",
+   "padding_side": "left",
+   "remove_space": false,
+   "tokenizer_class": "TokenizersBackend"
+ }
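
The special-token setup above can be verified after loading (a minimal sketch):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiny-random/glm-moe-dsa")
# Both eos and pad map to <|endoftext|>, per tokenizer_config.json.
print(tokenizer.eos_token, tokenizer.pad_token, tokenizer.model_max_length)
```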