构建字符串

安装stringr包，构建示例字符串。

> rm(list = ls())
> if(!require(stringr))install.packages('stringr')
> library(stringr)
> x <- "The birch canoe slid on the smooth planks."
> x
[1] "The birch canoe slid on the smooth planks."

检测长度

主要是str_length()函数。

> str_length(x) #空格和符号也计算在内
[1] 42
> length(x)
[1] 1

字符串拆分与组合

主要是str_split()函数，拆分后可以取子集。

> x
[1] "The birch canoe slid on the smooth planks."
> str_split(x," ")
[[1]]
[1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"  "planks."
> x2 = str_split(x," ")[[1]]
> x2
[1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"  "planks."

注意simplify参数的用法，可以形成一个矩阵。

> y = c("jimmy 150","nicker 140","tony 152")
> str_split(y," ")
[[1]]
[1] "jimmy" "150"  
[[2]]
[1] "nicker" "140"   
[[3]]
[1] "tony" "152" 
> str_split(y," ",simplify = T)
     [,1]     [,2] 
[1,] "jimmy"  "150"
[2,] "nicker" "140"
[3,] "tony"   "152"

str_c()函数可以实现字符串的连接。
collapse参数可以作为分隔符，sep是把向量中的每一个元素都与另一字符串相连。

> x2
[1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"  "planks."
> str_c(x2,collapse = " ")
[1] "The birch canoe slid on the smooth planks."
> str_c(x2,1234,sep = "+")
[1] "The+1234"     "birch+1234"   "canoe+1234"   "slid+1234"    "on+1234"     
[6] "the+1234"     "smooth+1234"  "planks.+1234"

提取字符串的一部分

str_sub()函数可以从起始位置到终止位置取字符。

> x
[1] "The birch canoe slid on the smooth planks."
> str_sub(x,5,9)
[1] "birch"

字符定位

str_locate()在向量中定位需要检测的字符。

> x2
[1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"  "planks."
> str_locate(x2,"th")
     start end
[1,]    NA  NA
[2,]    NA  NA
[3,]    NA  NA
[4,]    NA  NA
[5,]    NA  NA
[6,]     1   2
[7,]     5   6
[8,]    NA  NA
> str_locate(x2,"h")
     start end
[1,]     2   2
[2,]     5   5
[3,]    NA  NA
[4,]    NA  NA
[5,]    NA  NA
[6,]     2   2
[7,]     6   6
[8,]    NA  NA

字符检测

str_detect()检测向量中每个字符串是否含有待检测的字符。

> x2
[1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"  "planks."
> str_detect(x2,"h")
[1]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
> ###与sum和mean连用，可以统计匹配的个数和比例
> sum(str_detect(x2,"h"))
[1] 4
> mean(str_detect(x2,"h"))
[1] 0.5

字符替换

> x2
[1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"  "planks."
> str_replace(x2,"o","A")
[1] "The"     "birch"   "canAe"   "slid"    "An"      "the"     "smAoth"  "planks."
> str_replace_all(x2,"o","A")
[1] "The"     "birch"   "canAe"   "slid"    "An"      "the"     "smAAth"  "planks."

提取匹配到的字符

> x2
[1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"  "planks."
> str_extract(x2,"o|e")
[1] "e" NA  "o" NA  "o" "e" "o" NA 
> str_extract_all(x2,"o|e")
[[1]]
[1] "e"
[[2]]
character(0)
[[3]]
[1] "o" "e"
[[4]]
character(0)
[[5]]
[1] "o"
[[6]]
[1] "e"
[[7]]
[1] "o" "o"
[[8]]
character(0)
> str_extract_all(x2,"o|e",simplify = T)
     [,1] [,2]
[1,] "e"  ""  
[2,] ""   ""  
[3,] "o"  "e" 
[4,] ""   ""  
[5,] "o"  ""  
[6,] "e"  ""  
[7,] "o"  "o" 
[8,] ""   ""

字符删除

> x
[1] "The birch canoe slid on the smooth planks."
> str_remove(x," ")
[1] "Thebirch canoe slid on the smooth planks."
> str_remove_all(x," ")
[1] "Thebirchcanoeslidonthesmoothplanks."

生信基础入门

0110_字符串函数