R基础 - R基础_字符串处理 - 《C020101_R》

基础
paste() paste0()
strsplit
sub_str or substr ,提取字符串，直接修改字符串
grep(pattern = "", x = data) 返回匹配元素的位置；grepl(pattern = "", x = data) 返回逻辑值
- 取没有s|S开头的年份
gsub()替换所有，sub,只替换第一个
regexpr gregexpr regexec agrep
- agrep

基础

# A character vector with two elements, used by the basic examples below.
x <- c("tiantiankaixin", "haode")
nchar(x)    # 14 5 -- number of characters in each element
length(x)   # 2    -- number of elements in the vector
toupper(x)  # "TIANTIANKAIXIN" "HAODE"
tolower(x)  # "tiantiankaixin" "haode"

paste() paste0()

# Two parallel vectors of length 5 used by the paste() / paste0() examples.
a <- LETTERS[1:5]
b <- 1:5
# `sep` is inserted between the arguments of each element-wise pair.
paste(a, b, sep = "-")        # "A-1" "B-2" "C-3" "D-4" "E-5"
# `collapse` joins the pasted elements into one single string.
paste(a, b, collapse = "-")   # "A 1-B 2-C 3-D 4-E 5"
# The default separator of paste() is a single space.
paste(a, b)                   # "A 1" "B 2" "C 3" "D 4" "E 5"
# paste0() concatenates without any separator.
paste0(a, b)                  # "A1" "B2" "C3" "D4" "E5"
# paste0() has no `sep` argument, so sep = "-" is pasted as one more string.
paste0(a, b, sep = "-")       # "A1-" "B2-" "C3-" "D4-" "E5-"
paste0(a, b, collapse = "-")  # "A1-B2-C3-D4-E5"

strsplit

# Join each pair with "/" and then split the strings apart again.
# NOTE(review): the variable is named `c`, like the base function c();
# calls to c() still work, but the name is easy to misread.
c <- paste(a, b, sep = "/")
c  # "A/1" "B/2" "C/3" "D/4" "E/5"
# strsplit() returns a list with one character vector per input element.
d <- strsplit(c, split = "/")
d  # [[1]] "A" "1"   [[2]] "B" "2"   ...
d[[1]][2]  # "1" -- second piece of the first element

sub_str or substr ,提取字符串，直接修改字符串

e <- c("python", "perl", "ruby", "php", "nihaoa")
# Extract characters 2..4 of every element (shorter strings give fewer characters).
sub_str <- substr(e, start = 2, stop = 4)
sub_str  # "yth" "erl" "uby" "hp" "iha"
# The replacement form overwrites characters 2..4 in place, modifying `e` itself;
# the string length never changes, so "php" only receives two of the three A's.
substr(e, start = 2, stop = 4) <- "AAA"
e  # "pAAAon" "pAAA" "rAAA" "pAA" "nAAAoa"

grep(pattern = "", x = data) 返回匹配元素的位置；grepl(pattern = "", x = data) 返回逻辑值

# Sample sequence identifiers used by the grep() / grepl() examples.
seq1 <- c(
  "CEUFRA2_C1-S2008", "AF_COM12B828-04", "AF_COM17F05-2008",
  "AS_CHN11_C3_2884", "EU-FRA-C3-S2807", " NAUSA2E82-s85",
  "AS_CHN12N_85", "NA_USA83_C252907", "NA USAC4A3-2004",
  "EU UKO1_A8_2009", "eu fra_a2_s98", "SA/BRA88/B/1996"
)
# grep() returns the indices of the elements that match the pattern.
grep(pattern = "FRA|fra", x = seq1)  # 1 5 11
# value = TRUE returns the matching elements themselves instead of indices.
# (TRUE is spelled out because T is an ordinary variable that can be reassigned.)
grep(pattern = "FRA|fra", x = seq1, value = TRUE)
# "CEUFRA2_C1-S2008" "EU-FRA-C3-S2807" "eu fra_a2_s98"
# grepl() returns one logical value per element.
grepl(pattern = "FRA|fra", x = seq1)
# TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
# ignore.case = TRUE makes the match case-insensitive.
grepl(pattern = "FRA", x = seq1, ignore.case = TRUE)

取没有s|S开头的年份

# In an R string literal "\b" is the backspace character, so the regex word
# boundary must be written "\\b". Inside a character class "|" is a literal
# character, so "s or S" is written [sS], not [s|S].
# Pattern: s or S, followed by 2 to 4 digits, ending at a word boundary.
grepl(pattern = "[sS][0-9]{2,4}\\b", seq1)
# Keep only the elements whose trailing year is NOT prefixed by s/S.
seq2 <- seq1[!grepl(pattern = "[sS][0-9]{2,4}\\b", seq1)]
seq2
f <- c("above", "about", "abortion", "cab")
grep("\\bab", f, value = TRUE)  # words starting with "ab": "above" "about" "abortion"
grep("ab\\b", f, value = TRUE)  # words ending with "ab": "cab"

gsub()替换所有，sub,只替换第一个

# Convert currency strings to numbers by stripping the dollar sign.
money <- c("$1888", "$2888", "$3888")
# "$" is a regex metacharacter and must be escaped as \$; in an R string the
# backslash itself must be doubled ("\$" alone is a parse error).
as.numeric(gsub("\\$", replacement = "", money))  # 1888 2888 3888
# sub() replaces only the FIRST match within each element. Every element here
# contains a single "$", so the result looks the same as with gsub().
sub("\\$", replacement = "", money)  # "1888" "2888" "3888"
# The difference shows up when one element contains several matches:
sub("\\$", replacement = "", "$1$2")   # "1$2"
gsub("\\$", replacement = "", "$1$2")  # "12"
?sub
> txt <- "a test of capitalizing"
> gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)
[1] "A Test Of Capitalizing"
> gsub("\\b(\\w)",    "\\U\\1",       txt, perl=TRUE)
[1] "A Test Of Capitalizing"
> txt2 <- "useRs may fly into JFK or laGuardia"
> gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
[1] "UseRS MaY FlY IntO JFK OR LaGuardiA"
> sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
[1] "UseRS may fly into JFK or laGuardia"

gsub 替换所有匹配，sub 只替换每个元素中的第一个匹配。正则表达式中用 () 进行分组捕获；\w 匹配一个单词字符 [a-zA-Z0-9_]，(\w*) 匹配 0 个或多个单词字符。在 perl=TRUE 的替换字符串中，\U 表示其后的内容转为大写，\L 表示转为小写，\E 表示结束大小写转换；\1 引用第一个分组，\2 引用第二个分组。

regexpr gregexpr regexec agrep

regexpr、gregexpr、regexec 都返回匹配的起始位置和 match.length，但返回结构不同：regexpr 返回整数向量（每个元素的第一个匹配）；gregexpr 返回 list（每个元素的所有匹配）；regexec 返回 list（每个元素的第一个匹配及其括号分组的位置）。

g = c("happy","apple","application","apolitic")
> regexpr('pp',g)  ##返回每个元素第一个匹配的起始位置向量；match.length 为匹配的长度，不存在匹配则为-1
[1]  3  2  2 -1
attr(,"match.length")
[1]  2  2  2 -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
> g[regexpr('pp',g)>0] ##取出含pp的元素
[1] "happy"       "apple"       "application"
> regexec('pp',g) ##返回位置列表，第二行为具体个数
[[1]]
[1] 3
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[2]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[3]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[4]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
> gregexpr('pp',g) ##返回位置列表，第二行为具体个数
[[1]]
[1] 3
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[2]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[3]]
[1] 2
attr(,"match.length")
[1] 2
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[[4]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

agrep

> m = c("I need a favour","my favorite sport","you made an favor error")
> agrep("favor",m)  #返回匹配成功的位置
[1] 1 2 3
> ?agrep
> agrep("lasy", "1 lazy 2")
[1] 1
> agrep("lasy", c(" 1 lazy 2", "1 lasy 2"), max = list(sub = 0))
[1] 2
> agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2)
[1] 1
> agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, value = TRUE)
[1] "1 lazy"
> agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, ignore.case = TRUE)
[1] 1 3