MODEL = "bigcode/starcoderbase-1b"
DATASET = "smangrul/hf-stack-v1"
DATA_COLUMN = "content"
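
# Sequence length used when packing tokens into constant-length training examples.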
SEQ_LENGTH = 2048
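
# Training hyperparameters.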
MAX_STEPS = 2000
BATCH_SIZE = 8
GR_ACC_STEPS = 1
LR = 5e-4
LR_SCHEDULER_TYPE = "cosine"
WEIGHT_DECAY = 0.01
NUM_WARMUP_STEPS = 30
EVAL_FREQ = 100
SAVE_FREQ = 100
LOG_FREQ = 25
OUTPUT_DIR = "peft-starcoder-lora-a100"
BF16 = False
FP16 = False
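
# Fill-in-the-middle (FIM) transformation settings: how often FIM is applied to a sample,
# and how often a FIM-transformed sample uses the SPM (suffix-prefix-middle) ordering.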
FIM_RATE = 0.5
FIM_SPM_RATE = 0.5
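
# LoRA hyperparameters.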
LORA_R = 8
LORA_ALPHA = 32
LORA_DROPOUT = 0.0
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc"

SEED = 0

from huggingface_hub import login
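# Note: `login` lets you authenticate with the Hugging Face Hub; since push_to_hub=True is
# used below, run `login()` (or `huggingface-cli login`) with a write token before training.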

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig,
)

from datasets import load_dataset
import torch
from tqdm import tqdm
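
# Stream the dataset; reserve the first 4000 examples for validation and shuffle the rest for training.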
dataset = load_dataset(
    DATASET,
    data_dir="data",
    split="train",
    streaming=True,
)

valid_data = dataset.take(4000)
train_data = dataset.skip(4000)
train_data = train_data.shuffle(buffer_size=5000, seed=SEED)

set_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)


def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        total_characters += len(example[data_column])
        total_tokens += len(tokenizer(example[data_column]).tokens())

    return total_characters / total_tokens


chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

import functools
import numpy as np
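

# Look up the ids of the tokenizer's FIM special tokens (prefix, middle, suffix, pad);
# all four come back as None when the tokenizer does not define FIM tokens.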
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    try:
        FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map["additional_special_tokens"][1:5]
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
            tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
        )
    except KeyError:
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = None, None, None, None
    return suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id


def permute(
    sample,
    np_rng,
    suffix_tok_id,
    prefix_tok_id,
    middle_tok_id,
    pad_tok_id,
    fim_rate=0.5,
    fim_spm_rate=0.5,
    truncate_or_pad=False,
):
    """
    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
    PSM and SPM (with a probability of fim_spm_rate).
    """
    if np_rng.binomial(1, fim_rate):
        # Pick two random cut points and split the sample into prefix, middle, and suffix.
        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
        boundaries.sort()

        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)

        if truncate_or_pad:
            # Keep the transformed sample at its original length: trim the suffix if the
            # three sentinel tokens make it too long, or pad it with the FIM pad token.
            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
            diff = new_length - len(sample)

            if diff > 0:
                if suffix.shape[0] <= diff:
                    return sample, np_rng
                suffix = suffix[: suffix.shape[0] - diff]
            elif diff < 0:
                suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])

        if np_rng.binomial(1, fim_spm_rate):
            # SPM ordering: [prefix_tok, suffix_tok] + suffix + [middle_tok] + prefix + middle
            new_sample = np.concatenate(
                [
                    [prefix_tok_id, suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    prefix,
                    middle,
                ]
            )
        else:
            # PSM ordering: [prefix_tok] + prefix + [suffix_tok] + suffix + [middle_tok] + middle
            new_sample = np.concatenate(
                [
                    [prefix_tok_id],
                    prefix,
                    [suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    middle,
                ]
            )
    else:
        # Leave the sample unchanged with probability 1 - fim_rate.
        new_sample = sample

    return list(new_sample), np_rng


from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
import random


class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant-length chunks of tokens from a stream of text files.

    Args:
        tokenizer (Tokenizer): The tokenizer used for processing the data.
        dataset (dataset.Dataset): Dataset with text files.
        infinite (bool): If True, the iterator restarts when the dataset is exhausted; otherwise it stops.
        seq_length (int): Length of token sequences to return.
        num_of_sequences (int): Number of token sequences to keep in buffer.
        chars_per_token (float): Number of characters per token used to estimate the number of tokens in the text buffer.
        fim_rate (float): Probability (0.0 to 1.0) that a sample will be permuted with FIM.
        fim_spm_rate (float): Probability (0.0 to 1.0) that a FIM permutation uses the SPM ordering.
        seed (int): Seed for the random number generator.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field="content",
        fim_rate=0.5,
        fim_spm_rate=0.5,
        seed=0,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field
        self.fim_rate = fim_rate
        self.fim_spm_rate = fim_spm_rate
        self.seed = seed

        (
            self.suffix_tok_id,
            self.prefix_tok_id,
            self.middle_tok_id,
            self.pad_tok_id,
        ) = get_fim_token_ids(self.tokenizer)
        if self.suffix_tok_id is None and self.fim_rate > 0:
            print("FIM is not supported by tokenizer, disabling FIM")
            self.fim_rate = 0

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        np_rng = np.random.RandomState(seed=self.seed)
        while more_examples:
            # Fill a buffer with raw text until it holds roughly enough characters
            # for num_of_sequences packed sequences.
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break

            # Tokenize the buffer, optionally apply the FIM transformation, and join
            # everything into one long token stream separated by the EOS token.
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []

            for tokenized_input in tokenized_inputs:
                if self.fim_rate > 0:
                    tokenized_input, np_rng = permute(
                        tokenized_input,
                        np_rng,
                        self.suffix_tok_id,
                        self.prefix_tok_id,
                        self.middle_tok_id,
                        self.pad_tok_id,
                        fim_rate=self.fim_rate,
                        fim_spm_rate=self.fim_spm_rate,
                        truncate_or_pad=False,
                    )

                all_token_ids.extend(tokenized_input + [self.concat_token_id])

            # Slice the token stream into seq_length-sized examples and yield them in random order.
            examples = []
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    examples.append(input_ids)
            random.shuffle(examples)
            for example in examples:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(example),
                    "labels": torch.LongTensor(example),
                }
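

# Pack the streamed data into constant-length examples for training and evaluation.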
train_dataset = ConstantLengthDataset(
    tokenizer,
    train_data,
    infinite=True,
    seq_length=SEQ_LENGTH,
    chars_per_token=chars_per_token,
    content_field=DATA_COLUMN,
    fim_rate=FIM_RATE,
    fim_spm_rate=FIM_SPM_RATE,
    seed=SEED,
)

eval_dataset = ConstantLengthDataset(
    tokenizer,
    valid_data,
    infinite=False,
    seq_length=SEQ_LENGTH,
    chars_per_token=chars_per_token,
    content_field=DATA_COLUMN,
    fim_rate=FIM_RATE,
    fim_spm_rate=FIM_SPM_RATE,
    seed=SEED,
)
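
# Optional sanity check (not part of the original script): pull one packed example from the
# training stream and confirm it contains exactly SEQ_LENGTH token ids.
sample = next(iter(train_dataset))
print(sample["input_ids"].shape)  # expected: torch.Size([2048]) with the settings above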

import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.tuners.lora import LoraLayer
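
# Load the base model in full precision on the default device; use_cache is disabled
# because gradient checkpointing is turned on in the training arguments below.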
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map=None,
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
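
# Attach LoRA adapters to the attention and feed-forward projection layers of the base model.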
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=LORA_TARGET_MODULES.split(","),
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

train_data.start_iteration = 0
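
# Set up the training arguments. Note: output_dir doubles as the Hub repo id because
# push_to_hub=True; replace "limernyou" with your own Hugging Face username.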
training_args = TrainingArguments(
    output_dir=f"limernyou/{OUTPUT_DIR}",
    dataloader_drop_last=True,
    eval_strategy="steps",
    save_strategy="steps",
    max_steps=MAX_STEPS,
    eval_steps=EVAL_FREQ,
    save_steps=SAVE_FREQ,
    logging_steps=LOG_FREQ,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_steps=NUM_WARMUP_STEPS,
    gradient_accumulation_steps=GR_ACC_STEPS,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    gradient_checkpointing=True,
    fp16=FP16,
    bf16=BF16,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
    include_tokens_per_second=True,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)

print("Training...")
trainer.train()
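
# The FULL_STATE_DICT switch only applies when training with FSDP; guard it so this
# single-process setup does not fail on a missing fsdp_plugin.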
if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

trainer.save_model()
trainer.push_to_hub()
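

# Optional follow-up (not part of the original script): a minimal sketch of loading the
# trained LoRA adapter back onto the base model for generation. It assumes the adapter was
# saved/pushed under f"limernyou/{OUTPUT_DIR}"; swap in your own repo id or a local path.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(MODEL, trust_remote_code=True)
inference_model = PeftModel.from_pretrained(base_model, f"limernyou/{OUTPUT_DIR}")
inference_model.eval()

prompt = "def print_hello_world():"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    outputs = inference_model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))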