基础
# Basic helpers on a character vector.
x <- c("tiantiankaixin", "haode")
nchar(x)    # 14 5: number of characters in each element
length(x)   # 2: number of elements in the vector
toupper(x)  # "TIANTIANKAIXIN" "HAODE"
tolower(x)  # "tiantiankaixin" "haode"
paste() paste0()
# paste() joins vectors element-wise with `sep` (default: one space);
# `collapse` then joins the resulting elements into a single string.
a <- LETTERS[1:5]
b <- 1:5
paste(a, b, sep = "-")        # "A-1" "B-2" "C-3" "D-4" "E-5"
paste(a, b, collapse = "-")   # "A 1-B 2-C 3-D 4-E 5"
paste(a, b)                   # "A 1" "B 2" "C 3" "D 4" "E 5"

# paste0() is paste() with an empty separator.
paste0(a, b)                  # "A1" "B2" "C3" "D4" "E5"
# paste0() has no `sep` argument, so `sep = "-"` is treated as one more
# string to concatenate onto every element.
paste0(a, b, sep = "-")       # "A1-" "B2-" "C3-" "D4-" "E5-"
paste0(a, b, collapse = "-")  # "A1-B2-C3-D4-E5"
strsplit
# strsplit() splits every element on the separator and returns a list
# holding one character vector per input element.
# Uses `a` and `b` from the paste() example earlier in these notes.
c <- paste(a, b, sep = "/")
c          # "A/1" "B/2" "C/3" "D/4" "E/5"
d <- strsplit(c, split = "/")
d          # list: [[1]] "A" "1", [[2]] "B" "2", ...
d[[1]][2]  # "1": second piece of the first element
substr:提取子字符串;也可以通过 substr(x, start, stop) <- value 直接修改字符串(下面的 sub_str 只是示例中的变量名)
# substr() extracts characters start..stop from each element.
e <- c("python", "perl", "ruby", "php", "nihaoa")
sub_str <- substr(e, start = 2, stop = 4)
sub_str  # "yth" "erl" "uby" "hp" "iha"

# The replacement form overwrites characters start..stop in place, so `e`
# itself is modified; elements shorter than `stop` are only overwritten up
# to their own length.
substr(e, start = 2, stop = 4) <- "AAA"
e        # "pAAAon" "pAAA" "rAAA" "pAA" "nAAAoa"
grep(pattern = "", x = data) 返回匹配元素的位置;grepl(pattern = "", x = data) 返回逻辑值
# Sample sequence identifiers used by the grep()/grepl() examples.
seq1 <- c(
  "CEUFRA2_C1-S2008", "AF_COM12B828-04", "AF_COM17F05-2008",
  "AS_CHN11_C3_2884", "EU-FRA-C3-S2807", " NAUSA2E82-s85",
  "AS_CHN12N_85", "NA_USA83_C252907", "NA USAC4A3-2004",
  "EU UKO1_A8_2009", "eu fra_a2_s98", "SA/BRA88/B/1996"
)

# grep() returns the indices of the matching elements ...
grep(pattern = "FRA|fra", x = seq1)
# [1]  1  5 11

# ... or the matching elements themselves with value = TRUE.
grep(pattern = "FRA|fra", x = seq1, value = TRUE)
# "CEUFRA2_C1-S2008" "EU-FRA-C3-S2807" "eu fra_a2_s98"

# grepl() returns one logical per element.
grepl(pattern = "FRA|fra", x = seq1)
# TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE

# ignore.case = TRUE makes the match case-insensitive.
grepl(pattern = "FRA", x = seq1, ignore.case = TRUE)
取出年份前面没有 s 或 S 前缀的序列
# Word boundaries. Inside an R string literal the regex escape \b must be
# written "\\b"; a single "\b" is the backspace character and never matches.
# A character class lists characters directly, so "s or S" is [sS]
# ([s|S] would also accept a literal "|").
# Uses `seq1` from the grep()/grepl() example earlier in these notes.
year_pattern <- "[sS][0-9]{2,4}\\b"

# TRUE where an s/S is followed by 2-4 digits that end at a word boundary.
grepl(pattern = year_pattern, seq1)

# Keep only the elements that do NOT carry such an s/S-prefixed year.
seq2 <- seq1[!grepl(pattern = year_pattern, seq1)]
seq2

f <- c("above", "about", "abortion", "cab")
grep("\\bab", f, value = TRUE)  # words starting with "ab": "above" "about" "abortion"
grep("ab\\b", f, value = TRUE)  # words ending with "ab": "cab"
gsub() 替换所有匹配;sub() 只替换每个元素中的第一个匹配
# Strip the currency sign and convert to numbers.
# "$" is a regex metacharacter, so it is escaped; inside an R string the
# backslash itself must be doubled ("\\$"). A single "\$" is a parse error.
money <- c("$1888", "$2888", "$3888")
as.numeric(gsub("\\$", replacement = "", money))  # 1888 2888 3888

# sub() replaces only the FIRST match within each element. Every element
# here contains exactly one "$", so the result looks the same as gsub().
sub("\\$", replacement = "", money)  # "1888" "2888" "3888"

# The difference shows up when an element contains several matches.
# With perl = TRUE the replacement may use \\U (upper-case what follows),
# \\L (lower-case what follows) and \\E (end case conversion); \\1, \\2, ...
# refer to the parenthesised groups of the pattern.
txt <- "a test of capitalizing"
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl = TRUE)
# [1] "A Test Of Capitalizing"
gsub("\\b(\\w)", "\\U\\1", txt, perl = TRUE)
# [1] "A Test Of Capitalizing"

txt2 <- "useRs may fly into JFK or laGuardia"
# gsub() rewrites every word ...
gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl = TRUE)
# [1] "UseRS MaY FlY IntO JFK OR LaGuardiA"
# ... while sub() rewrites only the first one.
sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl = TRUE)
# [1] "UseRS may fly into JFK or laGuardia"
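gsub 替换全部匹配,sub 只替换每个元素中的第一个匹配。正则表达式中用 () 进行分组捕获;\w 匹配一个单词字符 [A-Za-z0-9_],(\w*) 表示匹配 0 个或多个单词字符。在 perl = TRUE 的替换字符串中,\U 表示把其后的内容转为大写,\L 表示转为小写,\E 表示结束大小写转换;\1 表示第一个捕获组,\2 表示第二个捕获组。注意在 R 的字符串里反斜杠要写成两个(例如 "\\w"、"\\U\\1")。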
regexpr gregexpr regexec agrep
regexpr、gregexpr、regexec 都返回匹配的起始位置和长度,但返回的数据类型不同:regexpr 返回向量(每个元素的第一个匹配);gregexpr 和 regexec 返回 list,其中 gregexpr 给出每个元素的全部匹配,regexec 还会给出括号子表达式的匹配位置
# Locating matches with regexpr(), regexec() and gregexpr().
g <- c("happy", "apple", "application", "apolitic")

# regexpr(): integer vector with the start of the first match in each
# element (-1 when there is no match); the "match.length" attribute holds
# the length of each match.
regexpr("pp", g)
# [1]  3  2  2 -1
# attr(,"match.length")
# [1]  2  2  2 -1

# Keep only the elements that contain "pp".
g[regexpr("pp", g) > 0]
# [1] "happy"       "apple"       "application"

# regexec(): the same information as a list with one entry per element.
regexec("pp", g)
# [[1]] 3   [[2]] 2   [[3]] 2   [[4]] -1   (match.length 2, 2, 2, -1)

# gregexpr(): a list as well, but each entry holds ALL matches of the
# element; every string here has at most one "pp", so the positions agree.
gregexpr("pp", g)
# [[1]] 3   [[2]] 2   [[3]] 2   [[4]] -1   (match.length 2, 2, 2, -1)
agrep
# agrep(): approximate (fuzzy) matching; returns the indices of elements
# that match the pattern within the allowed edit distance.
# See ?agrep for the full description of `max.distance`.
m <- c("I need a favour", "my favorite sport", "you made an favor error")
agrep("favor", m)
# [1] 1 2 3

agrep("lasy", "1 lazy 2")
# [1] 1

# Forbidding substitutions leaves only the exact spelling.
agrep("lasy", c(" 1 lazy 2", "1 lasy 2"), max.distance = list(sub = 0))
# [1] 2

# An integer max.distance is the maximum number of edits allowed.
agrep("laysy", c("1 lazy", "1", "1 LAZY"), max.distance = 2)
# [1] 1
agrep("laysy", c("1 lazy", "1", "1 LAZY"), max.distance = 2, value = TRUE)
# [1] "1 lazy"
agrep("laysy", c("1 lazy", "1", "1 LAZY"), max.distance = 2, ignore.case = TRUE)
# [1] 1 3
