HuggingFaceTB/training-guide-nanotron-configs
Viewer • Updated • 2 • 247 • 10
Thanks for your response! I'll check that out. 👏
Thanks for doing this! I had to train some tokenizers with v4, and it was indeed not straightforward to understand the behavior.
I had two questions:
Thanks for sharing! It's probably worth having a script to check for these gotchas:
import warnings
from transformers import AutoTokenizer
# Suppress warnings for cleaner output.
# NOTE: called without a category, this ignores every Python warning raised
# from this point on, not only tokenizer-related ones; drop it when debugging.
warnings.filterwarnings("ignore")
def _as_token_ids(encoded):
    """Return the token IDs held by *encoded* as a plain list.

    Accepts either a bare sequence of IDs or a dict-like tokenizer output
    (anything exposing ``keys()``), from which ``"input_ids"`` is taken.
    """
    if hasattr(encoded, "keys"):
        encoded = encoded["input_ids"]
    return list(encoded)


def check_tokenizer_gotchas(model_id, *, tokenizer=None, trust_remote_code=True):
    """Print a report of common special-token gotchas for one tokenizer.

    The report covers: whether a BOS token exists and is added by plain
    tokenization, whether plain tokenization appends EOS, whether the chat
    template ends with EOS, whether PAD and EOS share an ID, and whether
    re-tokenizing the rendered chat template gives the same IDs as asking
    the template for IDs directly.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            shown in the report header.
        tokenizer: Optional already-loaded tokenizer. When given, nothing is
            loaded and ``model_id`` is only used as the report label.
        trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``.

    Returns:
        None. The report is written to standard output. If the tokenizer
        cannot be loaded, the error is printed and the function returns early.
    """
    print(f"\n{'='*60}")
    print(f"Analyzing Tokenizer for: {model_id}")
    print(f"{'='*60}\n")

    if tokenizer is None:
        try:
            # NOTE(security): trust_remote_code=True allows the model repository
            # to execute its own Python code on this machine. It is kept as the
            # default because newer/custom models often need it; pass
            # trust_remote_code=False for repositories you do not trust.
            tokenizer = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=trust_remote_code
            )
        except Exception as e:
            # Report-and-continue boundary: one broken or gated model should
            # not abort a run over several models.
            print(f"Error loading tokenizer: {e}")
            return

    # Standard test input
    test_text = "Beautiful is better than ugly"
    # Standard test messages for Chat Templates
    messages = [
        {"role": "user", "content": "What is better than ugly?"},
        {"role": "assistant", "content": "Beautiful."},
    ]

    # Tokenize the test sentence once; the BOS and EOS checks both use it.
    encoded = _as_token_ids(tokenizer(test_text))
    eos_id = tokenizer.eos_token_id

    # --- GOTCHA 1 & 2: BOS Token Existence and Usage ---
    print("--- 1 & 2. BOS Token Analysis ---")
    if tokenizer.bos_token is None:
        print("⚠️ Gotcha #1: This tokenizer has NO BOS token defined.")
    else:
        print(f"✅ BOS token exists: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
        # Check usage in standard encoding
        if tokenizer.bos_token_id in encoded:
            print("✅ BOS token IS automatically added during standard tokenization.")
        else:
            print("⚠️ Gotcha #2: BOS exists but is NOT added automatically.")

    # --- GOTCHA 3: EOS Token in Standard Tokenization ---
    print("--- 3. Standard EOS Token Analysis ---")
    # Compare against None explicitly: 0 is a valid token ID, and a truthiness
    # test would wrongly treat an EOS with ID 0 as missing.
    if eos_id is None:
        print("❌ No EOS token defined in tokenizer.")
    elif encoded and encoded[-1] == eos_id:
        print("ℹ️ EOS token WAS added automatically (Uncommon behavior).")
    else:
        print("⚠️ Gotcha #3: Tokenization did NOT add the EOS token automatically.")

    # Render the chat template to IDs once (no generation prompt); sections 4
    # and 6 & 7 both inspect the same sequence.
    has_chat_template = bool(tokenizer.chat_template)
    direct_ids = []
    if has_chat_template:
        # The result is normalized because the call may hand back either a
        # bare list of IDs or a dict-like encoding with an "input_ids" entry
        # (NOTE(review): believed to be the default in newer transformers
        # releases — confirm against the installed version).
        direct_ids = _as_token_ids(
            tokenizer.apply_chat_template(
                messages, tokenize=True, add_generation_prompt=False
            )
        )

    # --- GOTCHA 4: EOS in Chat Templates ---
    print("--- 4. Chat Template EOS Analysis ---")
    if has_chat_template:
        if eos_id is None:
            print("❌ No EOS token defined in tokenizer.")
        elif direct_ids:
            last_id = direct_ids[-1]
            if last_id == eos_id:
                # The very last token is EOS
                print(f"✅ Chat template correctly appends EOS ({tokenizer.eos_token}) at the very end.")
            elif len(direct_ids) > 1 and direct_ids[-2] == eos_id:
                # EOS is second to last (common issue): show what follows it.
                # repr() keeps newlines visible in the printed output.
                trailing_repr = repr(tokenizer.decode([last_id]))
                print("⚠️ Gotcha #4: EOS is present but NOT at the end.")
                print(f" The actual last token is ID {last_id} ({trailing_repr}).")
                print(" (This is likely a trailing newline from the Jinja template).")
            else:
                print("⚠️ Gotcha #4: Chat template does NOT append the EOS token.")
    else:
        print("ℹ️ No chat template defined for this tokenizer.")

    # --- GOTCHA 5: PAD == EOS ---
    print("--- 5. Pad Token Collision Check ---")
    pad_id = tokenizer.pad_token_id
    if pad_id is not None and eos_id is not None:
        if pad_id == eos_id:
            print(f"⚠️ Gotcha #5: PAD token ID equals EOS token ID ({pad_id}).")
            print(" Warning: Masking logic `input_ids == pad_token_id` will unintentionally mask EOS tokens.")
        else:
            print(f"✅ PAD ({pad_id}) and EOS ({eos_id}) are distinct.")
    else:
        print("ℹ️ PAD or EOS token not defined for this tokenizer.")

    # --- GOTCHA 6 & 7: Composition and Double Special Tokens ---
    print("--- 6 & 7. Chat Template Composition ---")
    if has_chat_template:
        # Step 1 (correct way) is direct_ids, computed above.
        # Step 2: Apply template to string, THEN tokenize (incorrect way often used)
        str_template = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        composed_ids = _as_token_ids(tokenizer(str_template))
        if direct_ids != composed_ids:
            print("⚠️ Gotcha #7: Tokenizing the output of `apply_chat_template` ADDS extra special tokens.")
            print(f" Direct ID length: {len(direct_ids)} vs Re-tokenized length: {len(composed_ids)}")
        else:
            print("✅ Tokenization of chat template string matches direct ID generation.")
    else:
        print("ℹ️ No chat template defined for this tokenizer.")
# Model repositories mentioned in the text; one report is printed per entry,
# in the order listed here.
models = [
    "Qwen/Qwen2.5-0.5B",
    "microsoft/Phi-3-mini-128k-instruct",
    "CohereLabs/aya-expanse-8b",
    "meta-llama/Llama-3.2-1B-Instruct",
    "databricks/dbrx-instruct",
    "Qwen/Qwen2.5-0.5B-Instruct",
]

for model in models:
    check_tokenizer_gotchas(model)
Very interesting example regarding CamemBERT. These were actually what I was referring to when I said "with a few exception"; I didn't know it was much more common. Your point about how this biases the results makes much more sense now. Thanks for the clarifications!