| import json
|
| import os
|
| from transformers import PreTrainedTokenizer
|
|
|
|
|
class NomTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for Nom text backed by a JSON vocabulary file.

    The vocabulary file maps each character to a value (presumably a list of
    candidate options — verify against the producer of ``vocab.json``).
    Ids are assigned by enumeration order starting at ``id_start``; id
    ``unk_token_id`` (default 0) is reserved for the unknown token, which is
    also reused as the padding token.
    """

    vocab_files_names = {'vocab_file': 'vocab.json'}

    def __init__(
        self,
        vocab_file,
        unk_token='<UNK>',
        unk_token_id=0,
        id_start=1,
        **kwargs,
    ):
        """Load the JSON vocabulary and build the char<->id lookup tables.

        Args:
            vocab_file: Path to a JSON file mapping characters to options.
            unk_token: Token string used for unknown characters (and padding).
            unk_token_id: Reserved id for the unknown/pad token.
            id_start: First id assigned to vocabulary characters.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        self.vocab_file = vocab_file
        self.id_start = id_start
        self.unk_token = unk_token
        self.unk_token_id = unk_token_id
        # Padding shares the unknown token/id: the vocab has no dedicated
        # pad entry.
        self.pad_token = unk_token
        self.pad_token_id = unk_token_id

        with open(vocab_file, 'r', encoding='utf-8') as f:
            self.vocab_dict = json.load(f)

        # Lookup tables must exist BEFORE super().__init__(), because the
        # base class may call get_vocab()/vocab_size during initialization.
        self.char2id = {}
        self.id2char = {}
        for i, char in enumerate(self.vocab_dict.keys(), start=id_start):
            self.char2id[char] = i
            self.id2char[i] = char
        # Parallel map from id to the vocab entry's value, in the same
        # enumeration order (dicts preserve insertion order).
        self.id_to_options = {
            idx: v
            for idx, v in enumerate(self.vocab_dict.values(), start=id_start)
        }

        # NOTE(review): special tokens are set as plain attributes above
        # rather than passed through super().__init__(unk_token=..., pad_token=...);
        # confirm this survives the installed transformers version's
        # special-token bookkeeping.
        super().__init__(**kwargs)

    def _tokenize(self, text):
        """Tokenize by splitting the text into individual characters."""
        return list(text)

    def _convert_token_to_id(self, token):
        """Map a character to its id, falling back to the unk id."""
        return self.char2id.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index):
        """Map an id back to its character, falling back to the unk token."""
        if index == self.unk_token_id:
            return self.unk_token
        return self.id2char.get(index, self.unk_token)

    @property
    def vocab_size(self):
        """Total number of token ids, including reserved ids below id_start.

        Ids span 0..id_start+len(char2id)-1 (id 0 is the unk/pad slot), so
        the count is ``id_start + len(char2id)``. The previous hard-coded
        ``len + 1`` was only correct for the default ``id_start=1``.
        """
        return self.id_start + len(self.char2id)

    def get_vocab(self):
        """Return the full token->id mapping, including added tokens."""
        return {**self.char2id, **self.added_tokens_encoder}

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary JSON into ``save_directory``.

        Args:
            save_directory: Target directory (created if missing).
            filename_prefix: Optional prefix for the vocab file name.

        Returns:
            A 1-tuple with the path of the written vocab file, per the
            ``PreTrainedTokenizer.save_vocabulary`` contract.
        """
        # Robustness fix: save_pretrained-style callers expect the directory
        # to be created on demand; previously this raised FileNotFoundError.
        os.makedirs(save_directory, exist_ok=True)

        if filename_prefix:
            vocab_file = os.path.join(
                save_directory, f'{filename_prefix}-vocab.json'
            )
        else:
            vocab_file = os.path.join(save_directory, 'vocab.json')

        # ensure_ascii=False keeps Nom characters human-readable in the file.
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab_dict, f, ensure_ascii=False)

        return (vocab_file,)
|
|
|