- stringr
- 1. str_c粘贴,相当于paste0
- 2. str_length() 相当于nchar() 返回字符串长度
- 3.str_sub and substr() (in base package)
- 4.str_dup duplicate strings 重复字符串
- 5.去掉空白字符str_trim
- 6.str_extract()以向量的形式返回提取值
- 7.str_replace_all(x,a,b)把x里的a全部替换为b
- stringi
- 1.连接 连接两个字符串,注意与paste paste0的区别
- 2.compare字符串比较
- 3.stri_count(a,fixed/regex=b)在a固定或正则匹配计数b
- 4.stri_count_boundaries计算边界
- 5.stri_dup(a,b)是将a分别重复b次,stri_duplicated是否为重复值
- 6.stri_detect(a,b) 字符串a中是否含有b,返回逻辑值
- 7.stri_extract_all() 提取字符串的内容,返回值为列表
- 7.stri_isempty()判断字符串或向量是否为空
- 8 stri_locate_all(a,fixed = b) 返回字符串b在字符串a中的全部位置
- str_to_*系列 转换字母大小写
[toc]
stringr
1. str_c粘贴,相当于paste0
> library(stringr)
> str_c('a','b',sep = '-')
[1] "a-b"
> paste0('a','b',sep = '-')
[1] "ab-"
> paste('a','b',sep = '-')
[1] "a-b"
2. str_length() 相当于nchar() 返回字符串长度
> str_length("strinng")
[1] 7
> nchar("strinng")
[1] 7
3.str_sub and substr() (in base package)
> yxf <- 'yi xue fang'
> str_sub(yxf, c(1,4,8), c(2,6,11)) # return value and change value directly
[1] "yi" "xue" "fang"
> substr(yxf, c(1,4,8), c(2,6,11))
[1] "yi"
> str_sub(yxf, 1,1) <- 'Y';yxf # change value
[1] "Yi xue fang"
4.str_dup duplicate strings 重复字符串
> fruit <- c("apple", "pear", "banana")
> str_dup(fruit, 2)
[1] "appleapple" "pearpear" "bananabanana"
5.去掉空白字符str_trim
string <- ' Eternal love for YanQ '
str_trim(string, side = 'both')
6.str_extract()以向量的形式返回提取值
str_extract_all()以列表的形式返回所有的值,
str_locate_all() 以列表的形式返回所有位置
> phones <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
+ "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
+ "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
+ "Home: 543.355.3679")
> str_extract(phones, "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})");phones
[1] "219 733 8965" "329-293-8753" NA "595 794 7569"
[5] "387 287 6718" NA "233.398.9187" "482 952 3315"
[9] "239 923 8115" "579-499-7527" NA "543.355.3679"
[1] " 219 733 8965" "329-293-8753 "
[3] "banana" "595 794 7569"
[5] "387 287 6718" "apple"
[7] "233.398.9187 " "482 952 3315"
[9] "239 923 8115 and 842 566 4692" "Work: 579-499-7527"
[11] "$1000" "Home: 543.355.3679"
> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d")
[1] "4" NA NA "2"
> str_extract(shopping_list, "[a-z]+")
[1] "apples" "bag" "bag" "milk"
> str_extract(shopping_list, "[a-z]{1,4}")
[1] "appl" "bag" "bag" "milk"
> str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
[1] NA "bag" "bag" "milk"
> # Extract all matches
> str_extract_all(shopping_list, "[a-z]+")
[[1]]
[1] "apples" "x"
[[2]]
[1] "bag" "of" "flour"
[[3]]
[1] "bag" "of" "sugar"
[[4]]
[1] "milk" "x"
> str_extract_all(shopping_list, "\\b[a-z]+\\b")
[[1]]
[1] "apples"
[[2]]
[1] "bag" "of" "flour"
[[3]]
[1] "bag" "of" "sugar"
[[4]]
[1] "milk"
> str_extract_all(shopping_list, "\\d")
[[1]]
[1] "4"
[[2]]
character(0)
[[3]]
character(0)
[[4]]
[1] "2"
> # Simplify results into character matrix
> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
[,1] [,2] [,3]
[1,] "apples" "" ""
[2,] "bag" "of" "flour"
[3,] "bag" "of" "sugar"
[4,] "milk" "" ""
> str_extract_all(shopping_list, "\\d", simplify = TRUE)
[,1]
[1,] "4"
[2,] ""
[3,] ""
[4,] "2"
> # Extract all words
> str_extract_all("This is, suprisingly, a sentence.", boundary("word"))
[[1]]
[1] "This" "is" "suprisingly" "a" "sentence"
7.str_replace_all(x,a,b)把x里的a全部替换为b
> fruits <- c("one apple", "two pears", "three bananas")
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
stringi
1.连接 连接两个字符串,注意与paste paste0的区别
> library(stringi)
> stri_join(1:7, letters[1:7], sep='-')
[1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"
> stri_join(1:7, letters[1:7], collapse='-')
[1] "1a-2b-3c-4d-5e-6f-7g"
2.compare字符串比较
> stri_cmp_eq('AB','aB')
[1] FALSE
> stri_cmp_neq('AB','aB')
[1] TRUE
> stri_cmp_gt('a121','b221')
[1] FALSE
> stri_cmp_lt('a121','b221')
[1] TRUE
3.stri_count(a,fixed/regex=b)在a固定或正则匹配计数b
> language <- c('Python','R', 'PHP', 'Ruby', 'Java',
+ 'JavaScript', 'C', 'Oracle', 'C++', 'C#', 'Spark', 'Go',
+ 'Room', 'Good', 'Pathon', 'ScriptJava', 'R2R', 'C+','C*',"r")
> stri_count(language, fixed = 'R')
[1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0
> stri_count(language, regex = '^J')
[1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4.stri_count_boundaries计算边界
> test <- 'The above-mentioned features are very useful.
+ Warm thanks to their developers.'
> stri_count_boundaries(test, type="word")
[1] 29
> stri_count_boundaries(test, type="sentence")
[1] 2
> stri_count_boundaries(test, type="character")
[1] 82
> stri_count_words(test)
[1] 12
5.stri_dup(a,b)是将a分别重复b次,stri_duplicated是否为重复值
> stri_dup(c("abc", "pqrst"), c(4, 2))
[1] "abcabcabcabc" "pqrstpqrst"
> stri_duplicated(c("a", "b", "a", NA, "a", NA))
[1] FALSE FALSE TRUE FALSE TRUE TRUE
> stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
[1] TRUE FALSE TRUE TRUE FALSE FALSE
> stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
[1] 3
6.stri_detect(a,b) 字符串a中是否含有b,返回逻辑值
fixed 是完全匹配,regex是正则匹配
> stri_detect_fixed(c("stringi R", "REXAMINE", "123"), c('i', 'R', '0'))
[1] TRUE TRUE FALSE
> stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), '^ab')
[1] TRUE TRUE TRUE TRUE TRUE
> stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), 't\\b')
[1] FALSE TRUE TRUE FALSE FALSE
> stri_detect_regex(c('ABOUT','abort','AboVE'), '^ab', case_insensitive = TRUE)
[1] TRUE TRUE TRUE
> stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a")
[1] TRUE TRUE FALSE TRUE FALSE
> stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a1")
[1] TRUE FALSE FALSE FALSE FALSE
> stri_startswith_fixed(c("abaDc", "aabadc",'ababa'), "ba", from=2)
[1] TRUE FALSE TRUE
> stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba')
[1] FALSE FALSE TRUE
> stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba', to = 3)
[1] TRUE FALSE TRUE
7.stri_extract_all() 提取字符串的内容,返回值为列表
These functions extract all substrings matching a given pattern.
`stri_extract_all_*` extracts all the matches. `stri_extract_first_*`
and `stri_extract_last_*` yield the first or the last matches, respectively.
> tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_B0_2004',
+ 'EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05')
> # Generate strings composed of several sequence names.
> stri_extract_all(tEmp_text, regex = '[0-9]{2,4}\\b')
[[1]]
[1] "2008"
[[2]]
[1] "2004"
[[3]]
[1] "2007"
[[4]]
[1] "2005"
[[5]]
[1] "05"
> stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE, overlap=TRUE)
[[1]]
[1] "aba" "aBA" "Aba"
#case_insensitive=TRUE 不区分大小写
> stri_extract_all_boundaries("stringi: THE string processing package 123.48...")
[[1]]
[1] "stringi: " "THE " "string " "processing " "package "
[6] "123.48..."
> stri_extract_all_words("stringi: THE string processing package 123.48...")
[[1]]
[1] "stringi" "THE" "string" "processing" "package"
[6] "123.48"
> stri_count_boundaries("stringi: THE string processing package 123.48...", type="word")
[1] 15
7.stri_isempty()判断字符串或向量是否为空
> stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104',' ',''))
[1] FALSE TRUE FALSE FALSE FALSE FALSE TRUE
8 stri_locate_all(a,fixed = b) 返回字符串b在字符串a中的全部位置
> stri_locate_all('I want to learn R to promote my statistical skills', fixed='to')
[[1]]
start end
[1,] 8 9
[2,] 19 20
str_to_*系列 转换字母大小写
> library(stringr)
> a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
> str_to_title(a)
[1] "Ptprc" "Epcam" "Mme" "Cd3g" "Cd3e" "Cd68" "Cd79a"
> dog <- "The quick brown dog"
> str_to_upper(dog) #全部大写
[1] "THE QUICK BROWN DOG"
> str_to_lower(dog) #全部小写
[1] "the quick brown dog"
> str_to_title(dog) #首字母大写
[1] "The Quick Brown Dog"
> str_to_sentence("the quick brown dog") #句子第一个词的第一个字母大写
[1] "The quick brown dog"
#另一个包里的函数capitalize
> library(Hmisc)
> a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
> capitalize(tolower(a))
[1] "Ptprc" "Epcam" "Mme" "Cd3g" "Cd3e" "Cd68" "Cd79a"