- stringr
- 1. str_c粘贴,相当于paste0
- 2. str_length() 相当于nchar() 返回字符串长度
- 3.str_sub and substr() (in base package)
- 4.str_dup duplicate strings 重复字符串
- 5.去掉空白字符str_trim
- 6.str_extract()以向量的形式返回提取值
- 7.str_replace_all(x,a,b)把x里的a全部替换为b
- stringi
- 1.连接 连接两个字符串,注意与paste paste0的区别
- 2.compare字符串比较
- 3.stri_count(a,fixed/regex=b)在a固定或正则匹配计数b
- 4.stri_count_boundaries计算边界
- 5.stri_dup(a,b)是将a分别重复b次,stri_duplicated是否为重复值
- 6.stri_detect(a,b) 字符串a中是否含有b,返回逻辑值
- 7.stri_extract_all() 提取字符串的内容,返回值列表
- 7.stri_isempty()判断字符串或向量是否为空
- 8 stri_locate_all(a,fixed = b) 返回字符串b在字符串a中的全部位置
- strto系列 转换字母大小写
[toc]
stringr
1. str_c粘贴,相当于paste0
```r
> library(stringr)
> str_c('a','b',sep = '-')
[1] "a-b"
> paste0('a','b',sep = '-')
[1] "ab-"
> paste('a','b',sep = '-')
[1] "a-b"
```
2. str_length() 相当于nchar() 返回字符串长度
```r
> str_length("strinng")
[1] 7
> nchar("strinng")
[1] 7
```
3.str_sub and substr() (in base package)
```r
> yxf <- 'yi xue fang'
> str_sub(yxf, c(1,4,8), c(2,6,11)) # return value and change value directly
[1] "yi" "xue" "fang"
> substr(yxf, c(1,4,8), c(2,6,11))
[1] "yi"
> str_sub(yxf, 1,1) <- 'Y';yxf # change value
[1] "Yi xue fang"
```
4.str_dup duplicate strings 重复字符串
```r
> fruit <- c("apple", "pear", "banana")
> str_dup(fruit, 2)
[1] "appleapple" "pearpear" "bananabanana"
```
5.去掉空白字符str_trim
```r
string <- ' Eternal love for YanQ '
str_trim(string, side = 'both')
```
6.str_extract()以向量的形式返回提取值
str_extract_all()以列表的形式返回所有的值,
stri_locate_all() 以列表的形式返回所有位置
```r
> phones <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
+ "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
+ "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
+ "Home: 543.355.3679")
> str_extract(phones, "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})");phones
[1] "219 733 8965" "329-293-8753" NA "595 794 7569"
[5] "387 287 6718" NA "233.398.9187" "482 952 3315"
[9] "239 923 8115" "579-499-7527" NA "543.355.3679"
[1] " 219 733 8965" "329-293-8753 "
[3] "banana" "595 794 7569"
[5] "387 287 6718" "apple"
[7] "233.398.9187 " "482 952 3315"
[9] "239 923 8115 and 842 566 4692" "Work: 579-499-7527"
[11] "$1000" "Home: 543.355.3679"
> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d")
[1] "4" NA NA "2"
> str_extract(shopping_list, "[a-z]+")
[1] "apples" "bag" "bag" "milk"
> str_extract(shopping_list, "[a-z]{1,4}")
[1] "appl" "bag" "bag" "milk"
> str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
[1] NA "bag" "bag" "milk"
> # Extract all matches
> str_extract_all(shopping_list, "[a-z]+")
[[1]]
[1] "apples" "x"
[[2]]
[1] "bag" "of" "flour"
[[3]]
[1] "bag" "of" "sugar"
[[4]]
[1] "milk" "x"
> str_extract_all(shopping_list, "\\b[a-z]+\\b")
[[1]]
[1] "apples"
[[2]]
[1] "bag" "of" "flour"
[[3]]
[1] "bag" "of" "sugar"
[[4]]
[1] "milk"
> str_extract_all(shopping_list, "\\d")
[[1]]
[1] "4"
[[2]]
character(0)
[[3]]
character(0)
[[4]]
[1] "2"
> # Simplify results into character matrix
> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
     [,1]     [,2] [,3]
[1,] "apples" ""   ""
[2,] "bag"    "of" "flour"
[3,] "bag"    "of" "sugar"
[4,] "milk"   ""   ""
> str_extract_all(shopping_list, "\\d", simplify = TRUE)
     [,1]
[1,] "4"
[2,] ""
[3,] ""
[4,] "2"
> # Extract all words
> str_extract_all("This is, suprisingly, a sentence.", boundary("word"))
[[1]]
[1] "This" "is" "suprisingly" "a" "sentence"
```
7.str_replace_all(x,a,b)把x里的a全部替换为b
```r
> fruits <- c("one apple", "two pears", "three bananas")
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
```
stringi
1.连接 连接两个字符串,注意与paste paste0的区别
```r
> library(stringi)
> stri_join(1:7, letters[1:7], sep='-')
[1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"
> stri_join(1:7, letters[1:7], collapse='-')
[1] "1a-2b-3c-4d-5e-6f-7g"
```
2.compare字符串比较
```r
> stri_cmp_eq('AB','aB')
[1] FALSE
> stri_cmp_neq('AB','aB')
[1] TRUE
> stri_cmp_gt('a121','b221')
[1] FALSE
> stri_cmp_lt('a121','b221')
[1] TRUE
```
3.stri_count(a,fixed/regex=b)在a固定或正则匹配计数b
```r
> language <- c('Python','R', 'PHP', 'Ruby', 'Java',
+ 'JavaScript', 'C', 'Oracle', 'C++', 'C#', 'Spark', 'Go',
+ 'Room', 'Good', 'Pathon', 'ScriptJava', 'R2R', 'C+','C*',"r")
> stri_count(language, fixed = 'R')
[1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0
> stri_count(language, regex = '^J')
[1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
```
4.stri_count_boundaries计算边界
```r
> test <- 'The above-mentioned features are very useful.
+ Warm thanks to their developers.'
> stri_count_boundaries(test, type="word")
[1] 29
> stri_count_boundaries(test, type="sentence")
[1] 2
> stri_count_boundaries(test, type="character")
[1] 82
> stri_count_words(test)
[1] 12
```
5.stri_dup(a,b)是将a分别重复b次,stri_duplicated是否为重复值
```r
> stri_dup(c("abc", "pqrst"), c(4, 2))
[1] "abcabcabcabc" "pqrstpqrst"
> stri_duplicated(c("a", "b", "a", NA, "a", NA))
[1] FALSE FALSE TRUE FALSE TRUE TRUE
> stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
[1] TRUE FALSE TRUE TRUE FALSE FALSE
> stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
[1] 3
```
6.stri_detect(a,b) 字符串a中是否含有b,返回逻辑值
fixed 是完全匹配,regex是正则匹配
```r
> stri_detect_fixed(c("stringi R", "REXAMINE", "123"), c('i', 'R', '0'))
[1] TRUE TRUE FALSE
> stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), '^ab')
[1] TRUE TRUE TRUE TRUE TRUE
> stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), 't\\b')
[1] FALSE TRUE TRUE FALSE FALSE
> stri_detect_regex(c('ABOUT','abort','AboVE'), '^ab', case_insensitive = TRUE)
[1] TRUE TRUE TRUE
> stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a")
[1] TRUE TRUE FALSE TRUE FALSE
> stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a1")
[1] TRUE FALSE FALSE FALSE FALSE
> stri_startswith_fixed(c("abaDc", "aabadc",'ababa'), "ba", from=2)
[1] TRUE FALSE TRUE
> stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba')
[1] FALSE FALSE TRUE
> stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba', to = 3)
[1] TRUE FALSE TRUE
```
7.stri_extract_all() 提取字符串的内容,返回值列表
These functions extract all substrings matching a given pattern.
`stri_extract_all_*` extracts all the matches. `stri_extract_first_*` and `stri_extract_last_*` yield the first or the last matches, respectively.
```r
> tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_B0_2004',
+ 'EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05')
> #Generate a strings composed by several sequence names.
> stri_extract_all(tEmp_text, regex = '[0-9]{2,4}\\b')
[[1]]
[1] "2008"
[[2]]
[1] "2004"
[[3]]
[1] "2007"
[[4]]
[1] "2005"
[[5]]
[1] "05"
> stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE, overlap=TRUE)
[[1]]
[1] "aba" "aBA" "Aba"
#case_insensitive=TRUE 不区分大小写
> stri_extract_all_boundaries("stringi: THE string processing package 123.48...")
[[1]]
[1] "stringi: " "THE " "string " "processing " "package "
[6] "123.48..."
> stri_extract_all_words("stringi: THE string processing package 123.48...")
[[1]]
[1] "stringi" "THE" "string" "processing" "package"
[6] "123.48"
> stri_count_boundaries("stringi: THE string processing package 123.48...", type="word")
[1] 15
```
7.stri_isempty()判断字符串或向量是否为空
```r
> stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104',' ',''))
[1] FALSE TRUE FALSE FALSE FALSE FALSE TRUE
```
8 stri_locate_all(a,fixed = b) 返回字符串b在字符串a中的全部位置
```r
> stri_locate_all('I want to learn R to promote my statistical skills', fixed='to')
[[1]]
     start end
[1,]     8   9
[2,]    19  20
```
strto系列 转换字母大小写
```r
> library(stringr)
> a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
> str_to_title(a)
[1] "Ptprc" "Epcam" "Mme" "Cd3g" "Cd3e" "Cd68" "Cd79a"
> dog <- "The quick brown dog"
> str_to_upper(dog) #全部大写
[1] "THE QUICK BROWN DOG"
> str_to_lower(dog) #全部小写
[1] "the quick brown dog"
> str_to_title(dog) #首字母大写
[1] "The Quick Brown Dog"
> str_to_sentence("the quick brown dog") #句子第一个词的第一个字母大写
[1] "The quick brown dog"
#另一个包里的函数capitalize
> library(Hmisc)
> a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
> capitalize(tolower(a))
[1] "Ptprc" "Epcam" "Mme" "Cd3g" "Cd3e" "Cd68" "Cd79a"
```
