Tokenizer

  # base_tokenizer
  input: text
  output: tokens

  output = []
  text = convert2unicode(text)
  text = clean_text(text)             # remove invalid chars: 0x0, 0xFFFD, and any char whose unicodedata.category is 'Cc' or 'Cf', except '\r', '\n', '\t'
  text = tokenize_chinese_char(text)  # add a space before and after each Chinese character
  tokens = text.split()
  for token in tokens:
      if do_lower_case:
          token = token.lower()
          token = run_strip_accents(token)  # normalize with 'NFD', then remove chars in the 'Mn' unicodedata category
      pieces = run_split_on_punc(token)     # end a word before a punctuation char and start a new word after it
      output.extend(pieces)                 # each punctuation char and each word piece becomes its own token
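
A minimal runnable sketch of the steps above, in Python 3 (strings are already unicode, so convert2unicode is dropped). The helper names mirror the pseudocode; the CJK test covers only the core 0x4E00-0x9FFF block and punctuation is simplified to the Unicode 'P*' categories, so this approximates rather than reproduces BERT's BasicTokenizer:

    import unicodedata

    def clean_text(text):
        # Drop NUL, U+FFFD, and control/format chars ('Cc'/'Cf'),
        # keeping '\r', '\n', '\t' by mapping them to a plain space.
        out = []
        for ch in text:
            cp = ord(ch)
            if cp == 0 or cp == 0xFFFD or (
                unicodedata.category(ch) in ("Cc", "Cf") and ch not in ("\r", "\n", "\t")
            ):
                continue
            out.append(" " if ch in ("\r", "\n", "\t") else ch)
        return "".join(out)

    def tokenize_chinese_char(text):
        # Add a space before and after each CJK ideograph (core block only, for brevity).
        return "".join(
            " " + ch + " " if 0x4E00 <= ord(ch) <= 0x9FFF else ch for ch in text
        )

    def run_strip_accents(token):
        # NFD-normalize, then drop combining marks (category 'Mn').
        token = unicodedata.normalize("NFD", token)
        return "".join(ch for ch in token if unicodedata.category(ch) != "Mn")

    def run_split_on_punc(token):
        # End a word before a punctuation char and start a new word after it.
        pieces, current = [], []
        for ch in token:
            if unicodedata.category(ch).startswith("P"):
                if current:
                    pieces.append("".join(current))
                    current = []
                pieces.append(ch)
            else:
                current.append(ch)
        if current:
            pieces.append("".join(current))
        return pieces

    def base_tokenize(text, do_lower_case=True):
        text = clean_text(text)
        text = tokenize_chinese_char(text)
        output = []
        for token in text.split():
            if do_lower_case:
                token = token.lower()
                token = run_strip_accents(token)
            output.extend(run_split_on_punc(token))
        return output

    # base_tokenize("Héllo, 世界!") -> ['hello', ',', '世', '界', '!']
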
  # wordpiece_tokenizer
  input: token
  output: wordpieces
  algorithm: greedy longest-match-first ("max match") against the vocabulary
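
The max match step means: repeatedly take the longest prefix of the remaining characters that is in the vocabulary, write every non-initial piece with a '##' prefix, and fall back to an unknown token when nothing matches. A minimal sketch, assuming vocab is a set of wordpieces; unk_token and max_chars_per_word are illustrative parameter names (the latter mirrors BERT's per-word length cutoff):

    def wordpiece_tokenize(token, vocab, unk_token="[UNK]", max_chars_per_word=100):
        # Greedy longest-match-first: take the longest vocab prefix of what remains,
        # marking every non-initial piece with '##'.
        if len(token) > max_chars_per_word:
            return [unk_token]
        pieces = []
        start = 0
        while start < len(token):
            end = len(token)
            cur_piece = None
            while start < end:
                substr = token[start:end]
                if start > 0:
                    substr = "##" + substr
                if substr in vocab:
                    cur_piece = substr
                    break
                end -= 1
            if cur_piece is None:      # no prefix matches: the whole word is unknown
                return [unk_token]
            pieces.append(cur_piece)
            start = end
        return pieces

    # Example: with vocab = {"un", "##aff", "##able"},
    # wordpiece_tokenize("unaffable", vocab) -> ["un", "##aff", "##able"]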