Tokenizer
# base_tokenizer
input: text
output: tokens
output = []
text = convert2unicode(text)
text = clean_text(text) # remove invalid characters: codepoint 0, 0xFFFD, and any char whose unicodedata.category is 'Cc' or 'Cf', except '\r', '\n', '\t', which are kept as whitespace
text = tokenize_chinese_char(text) # add a space before and after each Chinese (CJK) character
tokens = text.split()
for token in tokens:
    if do_lower_case:
        token = token.lower()
        token = run_strip_accents(token) # normalize to 'NFD' form, drop chars in the 'Mn' (nonspacing mark) unicodedata category
    sub_tokens = run_split_on_punc(token) # split the word at punctuation, keeping each punctuation char as its own token
    output.extend(sub_tokens)
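
A runnable Python sketch of these steps, assuming BERT-style behavior. The helper names mirror the pseudocode above; the CJK range check and the punctuation test are simplified (the reference implementation covers more CJK blocks and also counts ASCII symbols such as '$' and '~' as punctuation).

import unicodedata

def clean_text(text):
    # drop codepoint 0, 0xFFFD, and control/format chars ('Cc'/'Cf');
    # '\r', '\n', '\t' are kept but mapped to a plain space
    out = []
    for ch in text:
        cp = ord(ch)
        if cp == 0 or cp == 0xFFFD:
            continue
        if unicodedata.category(ch) in ("Cc", "Cf") and ch not in ("\r", "\n", "\t"):
            continue
        out.append(" " if ch in ("\r", "\n", "\t") else ch)
    return "".join(out)

def tokenize_chinese_char(text):
    # put spaces around CJK ideographs so each becomes its own token
    # (simplified range check; the reference covers several more CJK blocks)
    out = []
    for ch in text:
        if 0x4E00 <= ord(ch) <= 0x9FFF:
            out.extend([" ", ch, " "])
        else:
            out.append(ch)
    return "".join(out)

def run_strip_accents(token):
    # NFD-normalize, then drop combining marks (category 'Mn')
    token = unicodedata.normalize("NFD", token)
    return "".join(ch for ch in token if unicodedata.category(ch) != "Mn")

def run_split_on_punc(token):
    # break at every punctuation char, keeping it as its own piece,
    # e.g. "don't" -> ["don", "'", "t"]
    pieces, current = [], []
    for ch in token:
        if unicodedata.category(ch).startswith("P"):
            if current:
                pieces.append("".join(current))
                current = []
            pieces.append(ch)
        else:
            current.append(ch)
    if current:
        pieces.append("".join(current))
    return pieces

def basic_tokenize(text, do_lower_case=True):
    output = []
    text = tokenize_chinese_char(clean_text(text))
    for token in text.split():
        if do_lower_case:
            token = run_strip_accents(token.lower())
        output.extend(run_split_on_punc(token))
    return output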
# wordpiece tokenizer
input: token (one output token from the base tokenizer)
output: wordpieces
max_match: greedy longest-match-first against the vocabulary. Repeatedly take the longest prefix of the remaining characters that is in the vocab; every piece after the first is prefixed with '##'. If at some point no prefix matches, the whole token becomes [UNK].
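
A sketch of the max_match loop, assuming a plain Python set or dict as the vocabulary; the [UNK] token name and the 100-character word limit follow the common BERT defaults.

def wordpiece_tokenize(token, vocab, unk_token="[UNK]", max_chars_per_word=100):
    # greedy longest-match-first (max_match): repeatedly take the longest
    # prefix of the remaining characters that is in the vocab; every piece
    # after the first gets the '##' continuation prefix
    if len(token) > max_chars_per_word:
        return [unk_token]
    pieces, start = [], 0
    while start < len(token):
        end = len(token)
        cur_piece = None
        while start < end:
            piece = token[start:end]
            if start > 0:
                piece = "##" + piece
            if piece in vocab:
                cur_piece = piece
                break
            end -= 1
        if cur_piece is None:
            # no prefix of the remainder is in the vocab -> whole word is unknown
            return [unk_token]
        pieces.append(cur_piece)
        start = end
    return pieces

For example, with a toy vocab containing "un", "##aff", "##able", the call wordpiece_tokenize("unaffable", {"un", "##aff", "##able"}) returns ['un', '##aff', '##able'].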