基础
# Two sample strings used by the basic string-inspection examples below.
x <- c("tiantiankaixin", "haode")
nchar(x)    # 14 5 -- number of characters in each element
length(x)   # 2    -- number of elements in the vector
toupper(x)  # "TIANTIANKAIXIN" "HAODE"
tolower(x)  # "tiantiankaixin" "haode"
paste() paste0()
# Inputs for the paste()/paste0() comparison: five letters and five integers.
a <- LETTERS[seq_len(5)]
b <- seq_len(5)

# paste() joins element-wise with `sep` (default " "); `collapse` then glues
# the resulting vector into a single string.
paste(a, b, sep = "-")       # "A-1" "B-2" "C-3" "D-4" "E-5"
paste(a, b, collapse = "-")  # "A 1-B 2-C 3-D 4-E 5"
paste(a, b)                  # "A 1" "B 2" "C 3" "D 4" "E 5"

# paste0() always joins with an empty separator.
paste0(a, b)                  # "A1" "B2" "C3" "D4" "E5"
# paste0() has no `sep` parameter, so `sep = "-"` is treated as one more
# vector to paste and "-" ends up appended to every element.
paste0(a, b, sep = "-")       # "A1-" "B2-" "C3-" "D4-" "E5-"
paste0(a, b, collapse = "-")  # "A1-B2-C3-D4-E5"
strsplit
# Join the letters and numbers with "/" and then split them apart again.
# Note: the variable is named `c`; calls such as c(...) still reach the
# base function because R skips non-function objects when looking up a call.
c <- paste(a, b, sep = "/")
c  # "A/1" "B/2" "C/3" "D/4" "E/5"

# strsplit() returns a list with one character vector per input element.
d <- strsplit(c, split = "/")
d  # [[1]] "A" "1"   [[2]] "B" "2"   ...

# Second piece of the first element.
d[[1]][2]  # "1"
substr:提取子字符串,也可以通过赋值直接修改原字符串(下面的 sub_str 只是保存结果的变量名)
# Sample words for the substr() extraction / replacement examples.
e <- c("python", "perl", "ruby", "php", "nihaoa")

# Extract characters 2 through 4 of every element (shorter strings yield
# whatever characters exist in that range).
sub_str <- substr(e, start = 2, stop = 4)
sub_str  # "yth" "erl" "uby" "hp" "iha"

# The replacement form overwrites characters 2 through 4 in place, so `e`
# itself is modified.
substr(e, start = 2, stop = 4) <- "AAA"
e  # "pAAAon" "pAAA" "rAAA" "pAA" "nAAAoa"
grep(pattern = "", x = data) 返回匹配元素的位置;grepl(pattern = "", x = data) 返回逻辑值
# Sample sequence identifiers used by the grep()/grepl() examples.
seq1 <- c(
  "CEUFRA2_C1-S2008", "AF_COM12B828-04", "AF_COM17F05-2008",
  "AS_CHN11_C3_2884", "EU-FRA-C3-S2807", " NAUSA2E82-s85",
  "AS_CHN12N_85", "NA_USA83_C252907", "NA USAC4A3-2004",
  "EU UKO1_A8_2009", "eu fra_a2_s98", "SA/BRA88/B/1996"
)

# grep() returns the indices of matching elements (or the elements
# themselves with value = TRUE); grepl() returns one logical per element.
grep(pattern = "FRA|fra", x = seq1)
# 1 5 11
grep(pattern = "FRA|fra", x = seq1, value = TRUE)
# "CEUFRA2_C1-S2008" "EU-FRA-C3-S2807" "eu fra_a2_s98"
grepl(pattern = "FRA|fra", x = seq1)
# TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
grepl(pattern = "FRA", x = seq1, ignore.case = TRUE)  # case-insensitive

# Keep only the entries whose year is NOT prefixed by s/S.
# The pattern is: s or S, then 2-4 digits, then a word boundary.
# - The boundary must be written "\\b": inside an R string literal a single
#   "\b" is the backspace character, so the regex would never match.
# - The class is [sS]; "[s|S]" would also accept a literal "|".
grepl(pattern = "[sS][0-9]{2,4}\\b", seq1)
# TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
seq2 <- seq1[!grepl(pattern = "[sS][0-9]{2,4}\\b", seq1)]
seq2
# Word-boundary examples. "\\b" (escaped backslash) is the regex word
# boundary; a single "\b" in an R string is a backspace character and
# would make both calls return character(0).
f <- c("above", "about", "abortion", "cab")
grep("\\bab", f, value = TRUE)  # starts with "ab": "above" "about" "abortion"
grep("ab\\b", f, value = TRUE)  # ends with "ab":   "cab"
gsub() 替换所有匹配,sub() 只替换每个元素中的第一个匹配
# Strip the currency sign and convert the amounts to numbers.
money <- c("$1888", "$2888", "$3888")

# "$" is the regex end-of-line anchor, so it has to be escaped; in an R
# string literal the backslash itself must be doubled ("\\$"). A single
# "\$" is an unrecognized escape and does not even parse.
as.numeric(gsub("\\$", replacement = "", money))  # 1888 2888 3888

# sub() replaces only the FIRST match within each element. Every element
# here contains exactly one "$", so the result looks the same as gsub().
sub("\\$", replacement = "", money)  # "1888" "2888" "3888"
?sub
> txt <- "a test of capitalizing"
> gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)
[1] "A Test Of Capitalizing"
> gsub("\\b(\\w)", "\\U\\1", txt, perl=TRUE)
[1] "A Test Of Capitalizing"
> txt2 <- "useRs may fly into JFK or laGuardia"
> gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
[1] "UseRS MaY FlY IntO JFK OR LaGuardiA"
> sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
[1] "UseRS may fly into JFK or laGuardia"
gsub 替换所有匹配,sub 只替换第一个匹配。正则中用 () 进行分组;\w 匹配一个单词字符 [a-zA-Z0-9_],(\w*) 匹配 0 个或多个单词字符。在 perl = TRUE 的替换串中,\U 表示把后面的内容转为大写,\L 表示转为小写,\E 表示结束大小写转换;\1 表示第一个分组,\2 表示第二个分组。注意在 R 字符串里反斜杠要写两个,例如 "\\w"、"\\U\\1"。
regexpr gregexpr regexec agrep
regexpr、gregexpr、regexec 都返回匹配的起始位置和长度,但数据类型不同:regexpr 返回向量(每个元素只给出第一处匹配),gregexpr 和 regexec 返回 list(gregexpr 给出每个元素的所有匹配,regexec 给出第一处匹配及各括号分组的位置)
# Sample words searched for "pp" in the regexpr()/regexec()/gregexpr() runs.
g <- c("happy", "apple", "application", "apolitic")
> regexpr('pp',g) ##返回位置向量;match.length 为匹配的长度,未匹配则为 -1
[1] 3 2 2 -1
attr(,"match.length")
[1] 2 2 2 -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
> g[regexpr('pp',g)>0] ##取出含pp的元素
[1] "happy" "apple" "application"
> regexec('pp',g) ##返回位置列表;match.length 为匹配的长度,未匹配则为 -1
[[1]]
[1] 3
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[2]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[3]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[4]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
> gregexpr('pp',g) ##返回位置列表(包含每个元素的所有匹配);match.length 为匹配的长度,未匹配则为 -1
[[1]]
[1] 3
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[2]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[3]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[4]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
agrep
> m = c("I need a favour","my favorite sport","you made an favor error")
> agrep("favor",m) #返回匹配成功的位置
[1] 1 2 3
> ?agrep
> agrep("lasy", "1 lazy 2")
[1] 1
> agrep("lasy", c(" 1 lazy 2", "1 lasy 2"), max = list(sub = 0))
[1] 2
> agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2)
[1] 1
> agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, value = TRUE)
[1] "1 lazy"
> agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, ignore.case = TRUE)
[1] 1 3