Update README.md
README.md CHANGED
@@ -16,11 +16,11 @@ class JiebaTokenizer(BertTokenizer):
         self.pre_tokenizer = pre_tokenizer
     def _tokenize(self, text, *arg, **kwargs):
         split_tokens = []
-        for
-        if
-        split_tokens.append(
+        for word in self.pre_tokenizer(text):
+            if word in self.vocab:
+                split_tokens.append(word)
             else:
-        split_tokens.extend(super()._tokenize(
+                split_tokens.extend(super()._tokenize(word))
         return split_tokens
 model = BigBirdModel.from_pretrained('Lowin/chinese-bigbird-base-4096')
 tokenizer = JiebaTokenizer.from_pretrained('Lowin/chinese-bigbird-base-4096')
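The commit restores the truncated body of `_tokenize`: text is first segmented by the `pre_tokenizer`, words already present in the vocabulary are kept whole, and out-of-vocabulary words fall back to the parent `BertTokenizer`'s WordPiece tokenization. A minimal self-contained sketch of the completed snippet follows; the imports and the `__init__` with its jieba-based `pre_tokenizer` default are assumptions, since the diff only covers lines 16-26 of the README:

# Sketch of the completed README snippet. The __init__ signature and the
# jieba-based default pre_tokenizer are assumptions; only _tokenize and the
# from_pretrained calls appear in the diff.
import jieba
from transformers import BertTokenizer, BigBirdModel

class JiebaTokenizer(BertTokenizer):
    def __init__(self, pre_tokenizer=lambda x: jieba.cut(x, HMM=False), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pre_tokenizer = pre_tokenizer  # splits raw text into coarse words

    def _tokenize(self, text, *arg, **kwargs):
        split_tokens = []
        for word in self.pre_tokenizer(text):  # jieba word segmentation first
            if word in self.vocab:
                split_tokens.append(word)  # whole word exists in the vocab: keep it
            else:
                # fall back to WordPiece for out-of-vocabulary words
                split_tokens.extend(super()._tokenize(word))
        return split_tokens

model = BigBirdModel.from_pretrained('Lowin/chinese-bigbird-base-4096')
tokenizer = JiebaTokenizer.from_pretrained('Lowin/chinese-bigbird-base-4096')

The jieba pass keeps common Chinese words as single tokens, producing shorter input sequences for BigBird's long-context window, while the WordPiece fallback guarantees that every word can still be tokenized.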