基础

  1. x = c('tiantiankaixin','haode')
  2. nchar(x) # 14,5 每个元素的长度
  3. length(x) # 2,元素的个数
  4. toupper(x) #"TIANTIANKAIXIN" "HAODE"
  5. tolower(x) #"tiantiankaixin" "haode"

paste() paste0()

  1. a = LETTERS[1:5]
  2. b = 1:5
  3. paste(a,b,sep = '-') ##"A-1" "B-2" "C-3" "D-4" "E-5"
  4. paste(a,b,collapse = "-") ##"A 1-B 2-C 3-D 4-E 5"
  5. paste(a,b) ##"A 1" "B 2" "C 3" "D 4" "E 5"
  6. paste0(a,b) ##"A1" "B2" "C3" "D4" "E5"
  7. paste0(a,b,sep = '-') ##"A1-" "B2-" "C3-" "D4-" "E5-"
  8. paste0(a,b,collapse = "-") ##"A1-B2-C3-D4-E5"

strsplit

  1. c = paste(a,b,sep = "/");c ##"A/1" "B/2" "C/3" "D/4" "E/5"
  2. d = strsplit(c,split = "/");d ##[[1]][1] "A" "1" [[2]][1] "B" "2" 返回列表
  3. d[[1]][2] # "1"

substr():提取子字符串,也可以通过赋值直接修改字符串(下面的 sub_str 只是保存结果的变量名)

  1. e = c("python","perl","ruby","php","nihaoa")
  2. sub_str = substr(e,start = 2,stop=4);sub_str ##"yth" "erl" "uby" "hp" "iha"
  3. substr(e,start = 2,stop = 4) = "AAA";e ## "pAAAon" "pAAA" "rAAA" "pAA" "nAAAoa" 直接修改了e

grep(pattern = "",x = data) 返回位置 and grepl(pattern = "",x = data) 返回逻辑值

  1. seq1<-c("CEUFRA2_C1-S2008","AF_COM12B828-04","AF_COM17F05-2008",
  2. "AS_CHN11_C3_2884","EU-FRA-C3-S2807"," NAUSA2E82-s85",
  3. "AS_CHN12N_85","NA_USA83_C252907" ,"NA USAC4A3-2004",
  4. "EU UKO1_A8_2009","eu fra_a2_s98","SA/BRA88/B/1996")
  5. grep(pattern = "FRA|fra",x = seq1) ##[1] 1 5 11
  6. grep(pattern = "FRA|fra",x = seq1,value = T) ## "CEUFRA2_C1-S2008" "EU-FRA-C3-S2807" "eu fra_a2_s98"
  7. grepl(pattern = "FRA|fra",x = seq1) ## TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
  8. grepl(pattern = "FRA",x = seq1,ignore.case = T) ##忽略大小写

取没有s|S开头的年份

  1. grepl(pattern = "[sS][0-9]{2,4}\\b",seq1) ##\\b表示单词边界(R字符串中反斜杠要写成\\,单独的"\b"是退格符),此为s或S加2-4位数字结尾的
  2. seq2 = seq1[!grepl(pattern = "[sS][0-9]{2,4}\\b",seq1)];seq2
  3. f = c("above","about","abortion","cab")
  4. grep("\\bab",f,value = T) ##以ab开头的 "above" "about" "abortion"
  5. grep("ab\\b",f,value = T) ##以ab结尾的 "cab"

gsub()替换所有,sub,只替换第一个

  1. money = c("$1888","$2888","$3888") ##转为数字
  2. as.numeric(gsub("\\$",replacement = "",money)) ##\\表示转义 1888 2888 3888
  3. sub("\\$",replacement = "",money) ##"1888" "2888" "3888" sub对向量的每个元素只替换第一个匹配;这里每个元素只有一个$,所以结果看起来和gsub一样,两者的区别见下面?sub中的例子
  4. ?sub
  5. > txt <- "a test of capitalizing"
  6. > gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)
  7. [1] "A Test Of Capitalizing"
  8. > gsub("\\b(\\w)", "\\U\\1", txt, perl=TRUE)
  9. [1] "A Test Of Capitalizing"
  10. > txt2 <- "useRs may fly into JFK or laGuardia"
  11. > gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
  12. [1] "UseRS MaY FlY IntO JFK OR LaGuardiA"
  13. > sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
  14. [1] "UseRS may fly into JFK or laGuardia"

gsub替换所有匹配,sub只替换每个元素的第一个匹配。正则中用()分组;\w匹配一个单词字符[a-zA-Z0-9_],(\w*)表示匹配0个或多个单词字符。perl=TRUE时,替换串中\U表示其后的内容转为大写,\L表示其后的内容转为小写,\E表示结束大小写转换;\1表示第一个分组,\2表示第二个分组

regexpr gregexpr regexec agrep

regexpr gregexpr regexec 都返回匹配的位置和长度,但是数据类型不一样:regexpr返回向量(每个元素的第一个匹配),gregexpr和regexec返回list(gregexpr包含每个元素的所有匹配,regexec包含第一个匹配及其分组)

  1. g = c("happy","apple","application","apolitic")
  2. > regexpr('pp',g) ##返回位置向量,match.length为匹配的长度,不存在则为-1
  3. [1] 3 2 2 -1
  4. attr(,"match.length")
  5. [1] 2 2 2 -1
  6. attr(,"index.type")
  7. [1] "chars"
  8. attr(,"useBytes")
  9. [1] TRUE
  10. > g[regexpr('pp',g)>0] ##取出含pp的元素
  11. [1] "happy" "apple" "application"
  12. > regexec('pp',g) ##返回位置列表,match.length为匹配的长度,不存在则为-1
  13. [[1]]
  14. [1] 3
  15. attr(,"match.length")
  16. [1] 2
  17. attr(,"index.type")
  18. [1] "chars"
  19. attr(,"useBytes")
  20. [1] TRUE
  21. [[2]]
  22. [1] 2
  23. attr(,"match.length")
  24. [1] 2
  25. attr(,"index.type")
  26. [1] "chars"
  27. attr(,"useBytes")
  28. [1] TRUE
  29. [[3]]
  30. [1] 2
  31. attr(,"match.length")
  32. [1] 2
  33. attr(,"index.type")
  34. [1] "chars"
  35. attr(,"useBytes")
  36. [1] TRUE
  37. [[4]]
  38. [1] -1
  39. attr(,"match.length")
  40. [1] -1
  41. attr(,"index.type")
  42. [1] "chars"
  43. attr(,"useBytes")
  44. [1] TRUE
  45. > gregexpr('pp',g) ##返回位置列表(每个元素的所有匹配),match.length为匹配的长度,不存在则为-1
  46. [[1]]
  47. [1] 3
  48. attr(,"match.length")
  49. [1] 2
  50. attr(,"index.type")
  51. [1] "chars"
  52. attr(,"useBytes")
  53. [1] TRUE
  54. [[2]]
  55. [1] 2
  56. attr(,"match.length")
  57. [1] 2
  58. attr(,"index.type")
  59. [1] "chars"
  60. attr(,"useBytes")
  61. [1] TRUE
  62. [[3]]
  63. [1] 2
  64. attr(,"match.length")
  65. [1] 2
  66. attr(,"index.type")
  67. [1] "chars"
  68. attr(,"useBytes")
  69. [1] TRUE
  70. [[4]]
  71. [1] -1
  72. attr(,"match.length")
  73. [1] -1
  74. attr(,"index.type")
  75. [1] "chars"
  76. attr(,"useBytes")
  77. [1] TRUE

agrep

  1. > m = c("I need a favour","my favorite sport","you made an favor error")
  2. > agrep("favor",m) #返回匹配成功的位置
  3. [1] 1 2 3
  4. > ?agrep
  5. > agrep("lasy", "1 lazy 2")
  6. [1] 1
  7. > agrep("lasy", c(" 1 lazy 2", "1 lasy 2"), max = list(sub = 0))
  8. [1] 2
  9. > agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2)
  10. [1] 1
  11. > agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, value = TRUE)
  12. [1] "1 lazy"
  13. > agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, ignore.case = TRUE)
  14. [1] 1 3