import os
import numpy as np
from transformers import AutoTokenizer
import random
import argparse

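# Example invocation (a sketch only; the script name and paths below are
# illustrative placeholders, not part of the original repo):
#
#   python prepare_data.py \
#       --data_dir ./raw \
#       --tokenizer_path ./tokenizer \
#       --out_dir ./out \
#       --file_name data.txt \
#       --block_size 512 \
#       --split_ratio 0.99
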
def parse_arguments():
    parser = argparse.ArgumentParser(description='Process the text data for tokenization.')
    parser.add_argument("--data_dir", type=str, required=True, help="Directory of the raw data.")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the trained AutoTokenizer.")
    parser.add_argument("--out_dir", type=str, required=True, help="Directory of output files.")
    parser.add_argument("--file_name", type=str, default="data.txt", help="Name of the raw data file inside `data_dir`.")
    parser.add_argument("--block_size", type=int, default=512, help="Max token length; longer lines are dropped.")
    # argparse's `type=bool` treats any non-empty string (including "False") as True,
    # so the boolean options are exposed as store_true flags instead.
    parser.add_argument("--is_start_with_eos", action="store_true", help="Whether each line already starts with `eos_token`.")
    parser.add_argument("--is_end_with_eos", action="store_true", help="Whether each line already ends with `eos_token`.")
    parser.add_argument("--split_ratio", type=float, default=0.99, help="Fraction of lines used for training (the rest go to validation).")
    return parser.parse_args()

def tokenize_and_save_lines(tokenizer, input_file, train_txt_file, val_txt_file, train_bin_file, val_bin_file,
                            is_start_with_eos, is_end_with_eos, block_size, split_ratio):
    train_ids = []
    val_ids = []
    train_lines = []
    val_lines = []

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Shuffle before splitting so train and validation draw from the same distribution.
    random.shuffle(lines)
    split_at = int(split_ratio * len(lines))
    train_lines_list = lines[:split_at]
    val_lines_list = lines[split_at:]

    for i, line in enumerate(train_lines_list):
        ids = tokenizer.encode(line)
        # Ensure each example carries an eos token (the hard-coded id 0 assumes the
        # tokenizer's eos token id is 0): append one if the line does not already
        # end with eos; otherwise prepend one if it does not already start with eos.
        if not is_end_with_eos:
            ids.append(0)
        elif not is_start_with_eos:
            ids.insert(0, 0)

        # Drop lines longer than block_size so every example fits in a single block.
        if len(ids) <= block_size:
            train_ids.extend(ids)
            train_lines.append(line.strip())
        if i % 1000000 == 0:
            print(f"now processing {i}...")

    for i, line in enumerate(val_lines_list):
        ids = tokenizer.encode(line)
        if not is_end_with_eos:
            ids.append(0)
        elif not is_start_with_eos:
            ids.insert(0, 0)

        # Same length filter as the training loop.
        if len(ids) <= block_size:
            val_ids.extend(ids)
            val_lines.append(line.strip())

    save_tokenized_data(train_ids, train_bin_file)
    save_tokenized_data(val_ids, val_bin_file)
    print("Tokenized data saved...")

    save_text_data(train_lines, train_txt_file)
    save_text_data(val_lines, val_txt_file)
    print("Text data saved...")

def save_tokenized_data(tokenized_data, file_path):
    # uint16 keeps the .bin files compact; this assumes the tokenizer vocabulary
    # has fewer than 65,536 entries, otherwise token ids would overflow.
    np_data = np.array(tokenized_data, dtype=np.uint16)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    np_data.tofile(file_path)

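# A minimal sketch of reading one of the saved .bin files back for training
# (assumes the same uint16 dtype used above; `train_bin_path` is a hypothetical path):
#
#   ids = np.memmap(train_bin_path, dtype=np.uint16, mode="r")
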
def save_text_data(text_data, file_path):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        for line in text_data:
            f.write(line + '\n')

def main():
    args = parse_arguments()

    raw_data_path = os.path.join(args.data_dir, args.file_name)
    train_txt_path = os.path.join(args.out_dir, 'train.txt')
    val_txt_path = os.path.join(args.out_dir, 'val.txt')
    train_bin_path = os.path.join(args.out_dir, 'train.bin')
    val_bin_path = os.path.join(args.out_dir, 'val.bin')
    print("Paths setup complete...")

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
    tokenize_and_save_lines(tokenizer, raw_data_path, train_txt_path, val_txt_path, train_bin_path, val_bin_path,
                            args.is_start_with_eos, args.is_end_with_eos, args.block_size, args.split_ratio)
    print("Tokenization and data saving complete...")


if __name__ == "__main__":
    main()