class CustomTokenizer:
    """Wrap a pretrained tokenizer and normalise numbers, URLs and e-mails.

    Before text is handed to the wrapped tokenizer, every URL, e-mail
    address and number is replaced with a placeholder token (``<url>``,
    ``<mail>``, ``<num>``).  The placeholders are registered with the
    wrapped tokenizer via ``add_tokens`` at construction time.

    Any attribute that is not defined on this wrapper is looked up on the
    wrapped tokenizer, so the wrapper can be used where the underlying
    tokenizer's methods are expected.
    """

    # Compiled once at class-definition time; still reachable as
    # ``self.number_pattern`` / ``self.url_pattern`` / ``self.mail_pattern``.
    number_pattern = re.compile(r'\b\d+\.?\d*\b')
    url_pattern = re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    )
    mail_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')

    # Default placeholder for each recognised entity kind.
    replacement_symbols = {
        "url": '<url>',
        "num": '<num>',
        "mail": '<mail>',
    }

    def __init__(self, pretrained_tokenizer_path, cache_dir):
        """Load the pretrained tokenizer and register the placeholder tokens.

        Args:
            pretrained_tokenizer_path: Name or path passed to
                ``AutoTokenizer.from_pretrained``.
            cache_dir: Cache directory passed to
                ``AutoTokenizer.from_pretrained``.
        """
        # NOTE(security): trust_remote_code=True lets the loaded repository
        # run its own Python code; only use with sources you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_tokenizer_path,
            cache_dir=cache_dir,
            trust_remote_code=True,
        )

        # Registration order is kept fixed so the ids assigned to the new
        # tokens do not change between versions of this class.
        new_tokens = ['<num>', '<url>', '<mail>']
        self.tokenizer.add_tokens(new_tokens)

        # Per-instance copy so that mutating one instance's mapping does not
        # leak into other instances through the class attribute.
        self.replacement_symbols = dict(type(self).replacement_symbols)

    def preprocess(self, text):
        """Replace URLs, e-mail addresses and numbers with placeholders.

        Args:
            text: A string, or a list/tuple (possibly nested) of strings.

        Returns:
            The text with placeholders substituted.  A list input yields a
            list and a tuple input yields a tuple, processed element-wise.

        Raises:
            TypeError: If ``text`` (or an element of it) is neither a string
                nor a list/tuple.
        """
        if isinstance(text, (list, tuple)):
            processed = [self.preprocess(item) for item in text]
            return processed if isinstance(text, list) else tuple(processed)

        symbols = self.replacement_symbols
        # Numbers must be replaced LAST: substituting them first would insert
        # "<num>" into URLs and e-mail addresses that contain digits, after
        # which the URL / mail patterns no longer match them.
        text = self.url_pattern.sub(symbols["url"], text)
        text = self.mail_pattern.sub(symbols["mail"], text)
        text = self.number_pattern.sub(symbols["num"], text)
        return text

    def __getattr__(self, attr):
        """Delegate unknown attribute lookups to the wrapped tokenizer.

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError: If the wrapped tokenizer has not been set yet, or
                if it does not have the requested attribute.
        """
        # If ``tokenizer`` itself is missing (e.g. while copy/pickle rebuilds
        # the object without running __init__), looking it up below would
        # re-enter this method forever and end in RecursionError.
        if attr == "tokenizer":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )
        return getattr(self.tokenizer, attr)

    def __call__(self, text, **kwargs):
        """Preprocess ``text`` and tokenize it with the wrapped tokenizer.

        Args:
            text: A string or a list/tuple of strings; see ``preprocess``.
            **kwargs: Forwarded unchanged to the wrapped tokenizer.  Values
                passed here are NOT preprocessed.

        Returns:
            Whatever the wrapped tokenizer returns for the preprocessed text.
        """
        preprocessed_text = self.preprocess(text)
        return self.tokenizer(preprocessed_text, **kwargs)