stringr
R版本与运行环境信息
Date: 2021-5-18;R version 4.0.3 (2020-10-10);Platform: x86_64-w64-mingw32/x64 (64-bit);Running under: Windows 10 x64 (build 18363)
载入相关包
library("tidyverse")
stringr函数简要列表
R语言正则表达式
正则表达式
| 表达式 | 含义 |
|---|---|
| \n | 换行符(unix) |
| \r | 回车符 |
| \t | 制表符 |
| \f | 换页符 |
| [\b] | 退格符 |
| \r\n | 换行符(windows) |
| . | 可以匹配任何单个的字符字母数字甚至.字符本身。同一个正则表达式允许使用多个.字符。但不能匹配换行符 |
| * | 表示*前的字符出现至少0次(即零次或多次),.*组合可匹配任意长度的任意字符 |
| + | 表示+前的字符至少出现一次 |
| \ | 转义符 |
| ^ | 表示开头,如 ^a |
| $ | 表示结尾, 如a$ |
| () | 表示匹配括号内的字符串/表达式 |
| [] | 表示匹配[]中的任意一个字符,如[a-z] [0-9] [AaCcSs] |
| {} | {}前的字符出现次数,{5}表示5次,{1,3}表示出现1到3次,{2,} 出现两次以上 |
| ? | 表示?前的字符出现零次或一次 |
| | | 逻辑运算符“或” |
| [^] | 表示括号里的匹配内容取反 |
| \d | 匹配任意一个数字;在R字符串中使用时应该再加一个"\"进行转义(即写作"\\d"),下同 |
| \D | 匹配任意一个非数字 |
| \w | 匹配任意一个字母、数字或下划线(等价于[a-zA-Z0-9_]) |
| \W | 匹配任意一个非字母、数字、下划线的字符(等价于[^a-zA-Z0-9_]) |
| \s | 任何一个空白字符(等价于[\f\n\r\t\v]) |
| \S | 任何一个非空白字符(等价于[^\f\n\r\t\v]) |
POSIX字符类
| 字符类 | 含义 |
|---|---|
| [:alnum:] | 任何一个字母或数字(等价于[a-zA-Z0-9]) |
| [:alpha:] | 任何一个字母(等价于[a-zA-Z]) |
| [:blank:] | 空格或制表符(等价于[\t ]) 注:t后面有一个空格 |
| [:cntrl:] | ASCII控制字符(ASCII 0到31,再加上ASCII 127) |
| [:digit:] | 任何一个数字(等价于[0-9]) |
| [:graph:] | 和[:print:]一样,但不包括空格 |
| [:lower:] | 任何一个小写字母(等价于[a-z]) |
| [:print:] | 任何一个可打印字符 |
| [:punct:] | 既不属于[:alnum:],也不属于[:cntrl:]的任何一个字符 |
| [:space:] | 任何一个空白字符,包括空格(等价于[\f\n\r\t\v ]) 注:v后面有一个空格 |
| [:upper:] | 任何一个大写字母(等价于[A-Z]) |
| [:xdigit:] | 任何一个十六进制数字(等价于[a-fA-F0-9]) |
stringr函数使用方法
**str_c()**: 用于将多个字符串连接成一个字符串向量(类似paste/paste0),要连接的字符串写在开头则连接在被连接字符串的前面,反之则在后方
选项:sep="": 指定连接的分隔符;collapse="": 将多个字符串合并为单个字符串时使用的连接符
#例如生成一系列处理str_c("T",c(1:8),sep = "_")#输出结果[1] "T_1" "T_2" "T_3" "T_4" "T_5" "T_6" "T_7" "T_8"#另一种连接方式> str_c("T_",c(1:8))[1] "T_1" "T_2" "T_3" "T_4" "T_5" "T_6" "T_7" "T_8"#同样可以指定多个字符串同时连接,例如生成一系列带有重复的处理> str_c("T",c(1:8),"_1")[1] "T1_1" "T2_1" "T3_1" "T4_1" "T5_1" "T6_1" "T7_1" "T8_1"#例如生成一组处理名> a <- str_c("T",c(1:3),"_1")> b <- str_c("T",c(1:3),"_2")> c <- str_c("T",c(1:3),"_3")> data.frame(Treatment = c(a,b,c))Treatment1 T1_12 T2_13 T3_14 T1_25 T2_26 T3_27 T1_38 T2_39 T3_3
使用collapse =""将向量中的多个字符串合并成一个字符串
> str_c(letters, collapse = "::")[1] "a::b::c::d::e::f::g::h::i::j::k::l::m::n::o::p::q::r::s::t::u::v::w::x::y::z"
使用负索引(如letters[-26]去掉最后一个元素,letters[-1]去掉第一个元素)可以将相邻元素错位后逐对连接
> str_c(letters[-26], " comes before ", letters[-1])[1] "a comes before b" "b comes before c" "c comes before d" "d comes before e" "e comes before f"[6] "f comes before g" "g comes before h" "h comes before i" "i comes before j" "j comes before k"[11] "k comes before l" "l comes before m" "m comes before n" "n comes before o" "o comes before p"......
**str_conv()**: 修改字符串编码格式
str_conv(string, encoding)
**str_count()**: 计算某一字符出现的频次,支持正则表达式
fruit <- c("apple", "banana", "pear", "pineapple")fruit2 <- c("aaa","abb")#计算"a"在fruit中出现的频次> str_count(fruit, "a")[1] 1 3 1 1#计算"a"在fruit和fruit2中出现的频率> str_count(c(fruit,fruit2), "a")[1] 1 3 1 1 3 1#统计指定字符在指定元素中的出现频次> str_count(fruit, c("a", "b", "p", "p"))[1] 1 1 1 3#统计向量中每个元素的长度> str_count(c("abc", "bac", "cba1a"), ".")[1] 3 3 5###正则表达式简单举例#统计"b"在开头的频次> str_count(fruit, pattern = "^b")[1] 0 1 0 0#统计"a"在结尾的频次> str_count(fruit, pattern = "a$")[1] 0 1 0 0#统计每个元素中,"a"或"b"或"e"出现的频次> str_count(fruit, pattern = "[a,b,e]")[1] 2 4 2 3
**str_detect()**: 判断字符串中是否含有某元素,支持正则,与grep()类似,但是后者默认显示index,可以用于筛选,**negate = TRUE**代表取反
#查看"p"是在每个元素中是否存在> str_detect(fruit,"p")[1] TRUE FALSE TRUE TRUE#与grep做个比较> grep("^p",fruit )[1] 3 4#对结果取反,即判断p是不是不在各个元素中> str_detect(fruit,"p",negate = T)[1] FALSE TRUE FALSE FALSE
**str_dup()**: 将指定字符串重复,与rep()函数类似,str_dup(string, times)
fruit <- c("apple", "pear", "banana")#将每个元素重复3次> str_dup(fruit,3)[1] "appleappleapple" "pearpearpear" "bananabananabanana"#每个元素分别重复2次、4次、6次> str_dup(fruit,c(2,4,6))[1] "appleapple" "pearpearpearpear"[3] "bananabananabananabananabananabanana"#某字符串分别重复0,1,2,3,4,5次> str_dup("na", 0:5)[1] "" "na" "nana" "nanana" "nananana" "nanananana"
**str_starts()/str_ends()**: 判断某字符是否在字符串的开头/结尾,类似于str_detect(string, pattern = "^pattern")/str_detect(string, pattern = "pattern$"),negate = TRUE代表取反
fruit <- c("apple", "pear", "banana")#判断是每个元素是否是以“a”开头> str_starts(fruit,"a")[1] TRUE FALSE FALSE#判断是否每个元素是以“a“结尾> str_ends(fruit,"a")[1] FALSE FALSE TRUE#取反> str_starts(fruit,"a",negate = T)[1] FALSE TRUE TRUE#简单正则判断> str_starts(fruit,"[a,b]",negate = T)[1] FALSE TRUE FALSE
**str_flatten()**: 将字符串连接,collapse =""可以指定连接符
> str_flatten(letters,collapse = "_")[1] "a_b_c_d_e_f_g_h_i_j_k_l_m_n_o_p_q_r_s_t_u_v_w_x_y_z"
**str_glue()**/**str_glue_data()**: 字符串的参数传递,类似于函数的形参和实参的传递
#字符串的参数传递> name <- "bill"> age <- "50"> anniversary <- as.Date("1991-10-12")> str_glue(+ "my name is {name} " ,+ "my age next {age}",+ "my data is {format(anniversary, '%A, %B %d, %Y')}"+ )my name is bill my age next 50my data is 星期六, 十月 12, 1991#从数据框中进行参数传递> mtcars %>% str_glue_data("{rownames(.)} has {hp} hp") %>% head()Mazda RX4 has 110 hpMazda RX4 Wag has 110 hpDatsun 710 has 93 hpHornet 4 Drive has 110 hpHornet Sportabout has 175 hpValiant has 105 hp
**str_length()**: 统计字符串的长度,或向量中每个元素的长度
#统计单一字符串的长度> leter <- "abcdaaa"> str_length(leter)[1] 7#统计向量中每个元素的字符串长度,如 统计mtcars中每种车的名字长度> str_length(rownames(mtcars))[1] 9 13 10 14 17 7 10 9 8 8 9 10 10 11 18 19 17 8 11 14 13 16 11 10 16 9 13 12 14 12 13 10
**str_locate()**/**str_locate_all()**: 匹配指定字符串第一次/全部出现的位置,支持正则
fruit <- c("apple", "banana", "pear", "pineapple")#匹配字母“ap”第一次出现的位置> str_locate(fruit,"ap")start end[1,] 1 2[2,] NA NA[3,] NA NA[4,] 5 6#匹配特殊字符,如结尾$, end列为每个元素结尾的位置 If the match is of length 0, (e.g. from a special match like $) end will be one character less than start.> str_locate(fruit,"$")start end[1,] 6 5[2,] 7 6[3,] 5 4[4,] 10 9#统计“a”出现的全部位置,返回对象为list> str_locate_all(fruit,"a")[[1]]start end[1,] 1 1[[2]]start end[1,] 2 2[2,] 4 4[3,] 6 6[[3]]start end[1,] 3 3[[4]]start end[1,] 5 5
**str_order()**/**str_sort()**: 对字符串排序,前者返回值是排序后的index值,后者返回实际值, 二者参数一致;decreasing=: 默认FALSE,即从小到大(升序),TRUE则从大到小(降序);na_last=: 默认TRUE, 即NA排在最后,FALSE时NA排在开头,NA时丢弃NA;locale=: 指定语言习惯, 默认en(英语)即可;numeric=: 是否按照字符串中的数字排序字符串,默认FALSE
#以排列处理为例子> a <- str_c("T",c(1:3),"_1")> b <- str_c("T",c(1:3),"_2")> c <- str_c("T",c(1:3),"_3")> d <- c(a,b,c)> d[1] "T1_1" "T2_1" "T3_1" "T1_2" "T2_2" "T3_2" "T1_3" "T2_3" "T3_3"> str_sort(d)[1] "T1_1" "T1_2" "T1_3" "T2_1" "T2_2" "T2_3" "T3_1" "T3_2" "T3_3"#按照字符串中的数字排序字符串> x <- c("100a10", "100a5", "2b", "2a")> str_sort(x, numeric = TRUE)[1] "2a" "2b" "100a5" "100a10"
**str_pad()**: 字符串补齐功能,选项:width=: 指定补齐后的长度;side=c("left", "right", "both"): 指定补齐的字符位于哪里(左侧、右侧、两侧);pad="": 指定补齐字符串时使用的字符
> str_pad(c("a", "abc", "abcdef"), width = 11,side = "both",pad = "+")[1] "+++++a+++++" "++++abc++++" "++abcdef+++"
**str_trunc()**: 字符串截齐;width=: 指定截齐后的长度;side=c("right", "left", "center"): 指定截去的位置(右侧、左侧、中间);ellipsis=: 指定被截去字符的替代符,默认"..."
> x <- "This string is moderately long"> rbind(+ str_trunc(x, 20, "right"),+ str_trunc(x, 20, "left"),+ str_trunc(x, 20, "center",ellipsis = "++++")+ )[,1][1,] "This string is mo..."[2,] "...s moderately long"[3,] "This str++++ely long"
**str_remove()**/**str_remove_all()**: 删除字符串中的指定字符(第一次出现/全部),支持正则表达式
> fruits <- c("one apple", "two pears", "three bananas")#删除元音字母(仅仅第一次出现)> str_remove(fruits, "[aeiou]")[1] "ne apple" "tw pears" "thre bananas"#删除所有处理结尾表示重复的数字与下划线> d[1] "T1_1" "T2_1" "T3_1" "T1_2" "T2_2" "T3_2" "T1_3" "T2_3" "T3_3"> str_remove_all(d,"_[1-3]$")[1] "T1" "T2" "T3" "T1" "T2" "T3" "T1" "T2" "T3"
**str_replace()/str_replace_all()/str_replace_na()**: 第一次/全部出现的指定字符串的替换,支持正则,最后一个指将缺失值NA转换为普通字符串"NA"
#将第一次出现的元音字母替换为___> str_replace(fruits,pattern = "[aeiou]",replacement = "___")[1] "___ne apple" "tw___ pears" "thr___e bananas"#将重复编号全部转换为a,b,c> str_replace_all(d,c("1$" = "a","2$" = "b" ,"3$" = "c"))[1] "T1_a" "T2_a" "T3_a" "T1_b" "T2_b" "T3_b" "T1_c" "T2_c" "T3_c"#指定字符转换为大/小写> str_replace_all(fruits, "[aeiou]", toupper)[1] "OnE ApplE" "twO pEArs" "thrEE bAnAnAs"> str_replace_all(fruits, "[AEIOU]", tolower)[1] "one apple" "two pears" "three bananas"#NA转换为普通的字符串"NA"> str_replace_na(c(NA, "abc", "def"))[1] "NA" "abc" "def"
**str_to_lower()/str_to_upper()/str_to_title()/str_to_sentence()**: 字符串大小写转换
#大小写转换> str_to_upper("i")[1] "I"> str_to_lower("I")[1] "i"#所有单词首字母大写> str_to_title("the quick brown dog")[1] "The Quick Brown Dog"#句首字母大写> str_to_sentence("the quick brown dog")[1] "The quick brown dog"
**str_split()/str_split_fixed()**: 分割字符串,默认返回值类型分别为列表/矩阵;simplify=: 默认FALSE,返回值为列表(适用于str_split()),TRUE返回值为矩阵;n: 将字符串分割成几部分,当n大于最大可以分割的数量时,会自动以空字符串补齐
> fruits[1] "apples and oranges and pears and bananas"#将处理以“_”分割后可直接接索引> str_split(d, "_",simplify = T,n = 2)[,1] [,2][1,] "T1" "1"[2,] "T2" "1"[3,] "T3" "1"[4,] "T1" "2"[5,] "T2" "2"......#simplify = F 返回列表> str_split(fruits,"and",simplify = F)[[1]][1] "apples " " oranges " " pears " " bananas"#str_split_fixed()可以直接返回矩阵> str_split_fixed(fruits,"and",n = 3)[,1] [,2] [,3][1,] "apples " " oranges " " pears and bananas"
**str_trim()/str_squish()**: 删除字符串指定位置(开头、结尾、开头和结尾)的多余空格/删除所有多余的空格(包括字符串内部重复的空格);side =c("left","right","both"): 分别指定删除开头、结尾、开头和结尾的多余空格
#指定位置删除空格> str_trim(" String with trailing and leading white space asda",side = "left")[1] "String with trailing and leading white space asda"> str_trim(" String with trailing and leading white space asda ",side = "both")[1] "String with trailing and leading white space asda"> str_trim(" String with trailing and leading white space asda ",side = "right")[1] " String with trailing and leading white space asda"#删除全部多余空格> str_squish(c(" String with trailing and leading white space asda "," sa adad asda adad ada "))[1] "String with trailing and leading white space asda"[2] "sa adad asda adad ada"
**str_sub()/str_sub() <-**: 根据索引提取/替换字符串
start= end=: 指定要提取的字符串开头和结尾的index
omit_na=: 默认FALSE, 是否忽略NA
hw <- "Hadley Wickham"#提取前6个字符> str_sub(hw,1,6)[1] "Hadley"#仅指定start/end提取##从第二个字符开始取> str_sub(hw,start = 2)[1] "adley Wickham"##取到第二个字符> str_sub(hw,end = 2)[1] "Ha"#使用负索引进行提取##提取倒数第四到最后一个字符> str_sub(hw,-4,-1)[1] "kham"#如果输入字符串为向量,则对每个元素均提取> a[1] "a" "ab" "abc" "abcd"> str_sub(a,1,2)[1] "a" "ab" "ab" "ab"#可以直接进行赋值替换> str_sub(hw,1,6) <- "REPLACE"> hw[1] "REPLACE Wickham"
**str_subset()/str_which()**: 分别包含指定字符的字符串/index, 支持正则表达式以及negate选项
> fruit[1] "apple" "banana" "pear" "pineapple"> str_subset(fruit,"p")[1] "apple" "pear" "pineapple"> str_which(fruit,"p")[1] 1 3 4#显示除了NA外所有的字符/index> str_subset(c("a", NA, "b"), ".")[1] "a" "b"> str_which(c("a", NA, "b"), ".")[1] 1 3
**str_extract()/str_extract_all()**: 用于提取字符串中符合条件的部分(第一次/全部)
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")#返回含有a或p的部分,仅仅第一次出现> str_extract(shopping_list, "[ap]")[1] "a" "a" "a" NA#返回全部符合条件的部分> str_extract_all(shopping_list, "[ap]",simplify = T)[,1] [,2] [,3][1,] "a" "p" "p"[2,] "a" "" ""[3,] "a" "a" ""[4,] "" "" ""
**str_wrap()**: 用于控制字符串的输出格式;width: 每行输出的字符数;indent: 控制每段首行缩进字符数;exdent: 悬挂缩进字符数。**str_view()/str_view_all()**: 用于查看正则表达式的匹配情况
