Tokenizer
# base_tokenizer
input:  text
output: tokens

output = []
text = convert2unicode(text)
text = clean_text(text)              # drop invalid characters: codepoint 0, 0xFFFD, and any char whose
                                     # unicodedata.category is 'Cc' or 'Cf', except '\r', '\n', '\t'
text = tokenize_chinese_char(text)   # add a space before and after each Chinese character
tokens = text.split()
for token in tokens:
    if do_lower_case:
        token = token.lower()
        token = run_strip_accents(token)    # normalize with 'NFD', drop chars in the 'Mn' unicodedata category
    pieces = run_split_on_punc(token)       # end a word before each punctuation char, start a new word after it
    output.extend(pieces)                   # run_split_on_punc returns a list of pieces

# wordpiece_tokenizer
input:  token
output: wordpieces
max_match   # greedy longest-match-first against the vocabulary
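
For reference, run_strip_accents can be written in a few lines with the standard unicodedata module; this is a minimal sketch of the step described in the comment above (NFD-normalize, then drop combining marks):

    import unicodedata

    def run_strip_accents(text):
        # NFD-normalize, then drop combining marks (unicodedata category 'Mn'),
        # e.g. "café" -> "cafe".
        text = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in text if unicodedata.category(ch) != "Mn")

The wordpiece step ("max_match") is greedy longest-match-first against the vocabulary, with non-initial pieces marked by the "##" prefix. A minimal sketch under those assumptions; the names wordpiece_tokenize, vocab, unk_token, and max_chars_per_word are illustrative, not taken from the original notes:

    def wordpiece_tokenize(token, vocab, unk_token="[UNK]", max_chars_per_word=100):
        # Very long tokens are mapped to the unknown token outright.
        if len(token) > max_chars_per_word:
            return [unk_token]

        pieces = []
        start = 0
        while start < len(token):
            end = len(token)
            cur_piece = None
            # Shrink the window from the right until the substring is in the vocab
            # (longest match first).
            while start < end:
                piece = token[start:end]
                if start > 0:
                    piece = "##" + piece   # mark a non-initial piece
                if piece in vocab:
                    cur_piece = piece
                    break
                end -= 1
            if cur_piece is None:
                return [unk_token]         # no prefix matched: give up on the whole token
            pieces.append(cur_piece)
            start = end
        return pieces

    vocab = {"un", "##aff", "##able"}
    print(wordpiece_tokenize("unaffable", vocab))   # ['un', '##aff', '##able']
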