[toc]

stringr

1. str_c粘贴,默认(sep = '')时相当于paste0;指定sep时行为同paste(paste0会把sep当作普通字符串拼接到末尾)

  1. > library(stringr)
  2. > str_c('a','b',sep = '-')
  3. [1] "a-b"
  4. > paste0('a','b',sep = '-')
  5. [1] "ab-"
  6. > paste('a','b',sep = '-')
  7. [1] "a-b"

2. str_length() 相当于nchar() 返回字符串长度

  1. > str_length("strinng")
  2. [1] 7
  3. > nchar("strinng")
  4. [1] 7

3.str_sub and substr() (in base package)

  1. > yxf <- 'yi xue fang'
  2. > str_sub(yxf, c(1,4,8), c(2,6,11)) # return value and change value directly
  3. [1] "yi" "xue" "fang"
  4. > substr(yxf, c(1,4,8), c(2,6,11))
  5. [1] "yi"
  6. > str_sub(yxf, 1,1) <- 'Y';yxf # change value
  7. [1] "Yi xue fang"

4.str_dup duplicate strings 重复字符串

  1. > fruit <- c("apple", "pear", "banana")
  2. > str_dup(fruit, 2)
  3. [1] "appleapple" "pearpear" "bananabanana"

5.去掉空白字符str_trim

  1. string <- ' Eternal love for YanQ '
  2. str_trim(string, side = 'both')

6.str_extract()以向量的形式返回提取值

str_extract_all()以列表的形式返回所有的值,

str_locate_all() 以列表的形式返回所有位置(stringi中对应的函数是stri_locate_all())

  1. > phones <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
  2. + "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
  3. + "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
  4. + "Home: 543.355.3679")
  5. > str_extract(phones, "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})");phones
  6. [1] "219 733 8965" "329-293-8753" NA "595 794 7569"
  7. [5] "387 287 6718" NA "233.398.9187" "482 952 3315"
  8. [9] "239 923 8115" "579-499-7527" NA "543.355.3679"
  9. [1] " 219 733 8965" "329-293-8753 "
  10. [3] "banana" "595 794 7569"
  11. [5] "387 287 6718" "apple"
  12. [7] "233.398.9187 " "482 952 3315"
  13. [9] "239 923 8115 and 842 566 4692" "Work: 579-499-7527"
  14. [11] "$1000" "Home: 543.355.3679"
  15. > shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
  16. > str_extract(shopping_list, "\\d")
  17. [1] "4" NA NA "2"
  18. > str_extract(shopping_list, "[a-z]+")
  19. [1] "apples" "bag" "bag" "milk"
  20. > str_extract(shopping_list, "[a-z]{1,4}")
  21. [1] "appl" "bag" "bag" "milk"
  22. > str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
  23. [1] NA "bag" "bag" "milk"
  24. > # Extract all matches
  25. > str_extract_all(shopping_list, "[a-z]+")
  26. [[1]]
  27. [1] "apples" "x"
  28. [[2]]
  29. [1] "bag" "of" "flour"
  30. [[3]]
  31. [1] "bag" "of" "sugar"
  32. [[4]]
  33. [1] "milk" "x"
  34. > str_extract_all(shopping_list, "\\b[a-z]+\\b")
  35. [[1]]
  36. [1] "apples"
  37. [[2]]
  38. [1] "bag" "of" "flour"
  39. [[3]]
  40. [1] "bag" "of" "sugar"
  41. [[4]]
  42. [1] "milk"
  43. > str_extract_all(shopping_list, "\\d")
  44. [[1]]
  45. [1] "4"
  46. [[2]]
  47. character(0)
  48. [[3]]
  49. character(0)
  50. [[4]]
  51. [1] "2"
  52. > # Simplify results into character matrix
  53. > str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
  54. [,1] [,2] [,3]
  55. [1,] "apples" "" ""
  56. [2,] "bag" "of" "flour"
  57. [3,] "bag" "of" "sugar"
  58. [4,] "milk" "" ""
  59. > str_extract_all(shopping_list, "\\d", simplify = TRUE)
  60. [,1]
  61. [1,] "4"
  62. [2,] ""
  63. [3,] ""
  64. [4,] "2"
  65. > # Extract all words
  66. > str_extract_all("This is, suprisingly, a sentence.", boundary("word"))
  67. [[1]]
  68. [1] "This" "is" "suprisingly" "a" "sentence"

7.str_replace_all(x,a,b)把x里的a全部替换为b

  1. > fruits <- c("one apple", "two pears", "three bananas")
  2. > str_replace_all(fruits, "[aeiou]", "-")
  3. [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"

stringi

1.连接 连接两个字符串,注意与paste paste0的区别

  1. > library(stringi)
  2. > stri_join(1:7, letters[1:7], sep='-')
  3. [1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"
  4. > stri_join(1:7, letters[1:7], collapse='-')
  5. [1] "1a-2b-3c-4d-5e-6f-7g"

2.compare字符串比较

  1. > stri_cmp_eq('AB','aB')
  2. [1] FALSE
  3. > stri_cmp_neq('AB','aB')
  4. [1] TRUE
  5. > stri_cmp_gt('a121','b221')
  6. [1] FALSE
  7. > stri_cmp_lt('a121','b221')
  8. [1] TRUE

3.stri_count(a,fixed/regex=b) 统计b在a中出现的次数,fixed为固定字符串匹配,regex为正则匹配

  1. > language <- c('Python','R', 'PHP', 'Ruby', 'Java',
  2. + 'JavaScript', 'C', 'Oracle', 'C++', 'C#', 'Spark', 'Go',
  3. + 'Room', 'Good', 'Pathon', 'ScriptJava', 'R2R', 'C+','C*',"r")
  4. > stri_count(language, fixed = 'R')
  5. [1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0
  6. > stri_count(language, regex = '^J')
  7. [1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

4.stri_count_boundaries计算边界

  1. > test <- 'The above-mentioned features are very useful.
  2. + Warm thanks to their developers.'
  3. > stri_count_boundaries(test, type="word")
  4. [1] 29
  5. > stri_count_boundaries(test, type="sentence")
  6. [1] 2
  7. > stri_count_boundaries(test, type="character")
  8. [1] 82
  9. > stri_count_words(test)
  10. [1] 12

5.stri_dup(a,b)是将a分别重复b次,stri_duplicated判断是否为重复值

  1. > stri_dup(c("abc", "pqrst"), c(4, 2))
  2. [1] "abcabcabcabc" "pqrstpqrst"
  3. > stri_duplicated(c("a", "b", "a", NA, "a", NA))
  4. [1] FALSE FALSE TRUE FALSE TRUE TRUE
  5. > stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
  6. [1] TRUE FALSE TRUE TRUE FALSE FALSE
  7. > stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
  8. [1] 3

6.stri_detect(a,b) 字符串a中是否含有b,返回逻辑值

fixed 是按字面(固定字符串)匹配,regex是正则匹配

  1. > stri_detect_fixed(c("stringi R", "REXAMINE", "123"), c('i', 'R', '0'))
  2. [1] TRUE TRUE FALSE
  3. > stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), '^ab')
  4. [1] TRUE TRUE TRUE TRUE TRUE
  5. > stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), 't\\b')
  6. [1] FALSE TRUE TRUE FALSE FALSE
  7. > stri_detect_regex(c('ABOUT','abort','AboVE'), '^ab', case_insensitive = TRUE)
  8. [1] TRUE TRUE TRUE
  9. > stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a")
  10. [1] TRUE TRUE FALSE TRUE FALSE
  11. > stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a1")
  12. [1] TRUE FALSE FALSE FALSE FALSE
  13. > stri_startswith_fixed(c("abaDc", "aabadc",'ababa'), "ba", from=2)
  14. [1] TRUE FALSE TRUE
  15. > stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba')
  16. [1] FALSE FALSE TRUE
  17. > stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba', to = 3)
  18. [1] TRUE FALSE TRUE

7.stri_extract_all() 提取字符串的内容,返回值为列表

These functions extract all substrings matching a given pattern.

  • stri_extract_all_* extracts all the matches.
  • stri_extract_first_* and stri_extract_last_* yield the first or the last matches, respectively.
  1. > tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_B0_2004',
  2. + 'EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05')
  3. > #Generate a strings composed by several sequence names.
  4. > stri_extract_all(tEmp_text, regex = '[0-9]{2,4}\\b')
  5. [[1]]
  6. [1] "2008"
  7. [[2]]
  8. [1] "2004"
  9. [[3]]
  10. [1] "2007"
  11. [[4]]
  12. [1] "2005"
  13. [[5]]
  14. [1] "05"
  15. > stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE, overlap=TRUE)
  16. [[1]]
  17. [1] "aba" "aBA" "Aba"
  18. #case_insensitive=TRUE 不区分大小写
  19. > stri_extract_all_boundaries("stringi: THE string processing package 123.48...")
  20. [[1]]
  21. [1] "stringi: " "THE " "string " "processing " "package "
  22. [6] "123.48..."
  23. > stri_extract_all_words("stringi: THE string processing package 123.48...")
  24. [[1]]
  25. [1] "stringi" "THE" "string" "processing" "package"
  26. [6] "123.48"
  27. > stri_count_boundaries("stringi: THE string processing package 123.48...", type="word")
  28. [1] 15

7.stri_isempty()判断字符串或向量是否为空

  1. > stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104',' ',''))
  2. [1] FALSE TRUE FALSE FALSE FALSE FALSE TRUE

8.stri_locate_all(a,fixed = b) 返回字符串b在字符串a中的全部位置

  1. > stri_locate_all('I want to learn R to promote my statistical skills', fixed='to')
  2. [[1]]
  3. start end
  4. [1,] 8 9
  5. [2,] 19 20

str_to_* 系列 转换字母大小写

  1. > library(stringr)
  2. > a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
  3. > str_to_title(a)
  4. [1] "Ptprc" "Epcam" "Mme" "Cd3g" "Cd3e" "Cd68" "Cd79a"
  5. > dog <- "The quick brown dog"
  6. > str_to_upper(dog) #全部大写
  7. [1] "THE QUICK BROWN DOG"
  8. > str_to_lower(dog) #全部小写
  9. [1] "the quick brown dog"
  10. > str_to_title(dog) #首字母大写
  11. [1] "The Quick Brown Dog"
  12. > str_to_sentence("the quick brown dog") #句子第一个词的第一个字母大写
  13. [1] "The quick brown dog"
  14. #另一个包里的函数capitalize
  15. > library(Hmisc)
  16. > a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
  17. > capitalize(tolower(a))
  18. [1] "Ptprc" "Epcam" "Mme" "Cd3g" "Cd3e" "Cd68" "Cd79a"