R包 - R包_stringr_stringi - 《C020101_R》

stringr

[toc]

stringr

1. str_c粘贴，相当于paste0

> library(stringr)
> str_c('a','b',sep = '-')
[1] "a-b"
> paste0('a','b',sep = '-')
[1] "ab-"
> paste('a','b',sep = '-')
[1] "a-b"

2. str_length() 相当于nchar() 返回字符串长度

> str_length("strinng") 
[1] 7
> nchar("strinng")
[1] 7

3.str_sub and substr() (in base package)

> yxf <- 'yi xue fang'
> str_sub(yxf, c(1,4,8), c(2,6,11)) # return value and change value directly
[1] "yi"   "xue"  "fang"
> substr(yxf, c(1,4,8), c(2,6,11))
[1] "yi"
> str_sub(yxf, 1,1) <-  'Y';yxf  # change value
[1] "Yi xue fang"

4.str_dup duplicate strings 重复字符串

> fruit <- c("apple", "pear", "banana")
> str_dup(fruit, 2)
[1] "appleapple"   "pearpear"     "bananabanana"

5.去掉空白字符str_trim

string <- ' Eternal love for YanQ '
str_trim(string, side = 'both')

6.str_extract（）以向量的形式返回提取值

str_extract_all（）以列表的形式返回所有的值，

str_locate_all()（stringi中对应stri_locate_all()）以列表的形式返回所有位置

> phones <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
+             "387 287 6718", "apple", "233.398.9187  ", "482 952 3315",
+             "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
+             "Home: 543.355.3679")
> str_extract(phones, "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})");phones
 [1] "219 733 8965" "329-293-8753" NA             "595 794 7569"
 [5] "387 287 6718" NA             "233.398.9187" "482 952 3315"
 [9] "239 923 8115" "579-499-7527" NA             "543.355.3679"
 [1] " 219 733 8965"                 "329-293-8753 "                
 [3] "banana"                        "595 794 7569"                 
 [5] "387 287 6718"                  "apple"                        
 [7] "233.398.9187  "                "482 952 3315"                 
 [9] "239 923 8115 and 842 566 4692" "Work: 579-499-7527"           
[11] "$1000"                         "Home: 543.355.3679" 
> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d")
[1] "4" NA  NA  "2"
> str_extract(shopping_list, "[a-z]+")
[1] "apples" "bag"    "bag"    "milk"  
> str_extract(shopping_list, "[a-z]{1,4}")
[1] "appl" "bag"  "bag"  "milk"
> str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
[1] NA     "bag"  "bag"  "milk"
> # Extract all matches
> str_extract_all(shopping_list, "[a-z]+")
[[1]]
[1] "apples" "x"     
[[2]]
[1] "bag"   "of"    "flour"
[[3]]
[1] "bag"   "of"    "sugar"
[[4]]
[1] "milk" "x"   
> str_extract_all(shopping_list, "\\b[a-z]+\\b")
[[1]]
[1] "apples"
[[2]]
[1] "bag"   "of"    "flour"
[[3]]
[1] "bag"   "of"    "sugar"
[[4]]
[1] "milk"
> str_extract_all(shopping_list, "\\d")
[[1]]
[1] "4"
[[2]]
character(0)
[[3]]
character(0)
[[4]]
[1] "2"
> # Simplify results into character matrix
> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
     [,1]     [,2] [,3]   
[1,] "apples" ""   ""     
[2,] "bag"    "of" "flour"
[3,] "bag"    "of" "sugar"
[4,] "milk"   ""   ""     
> str_extract_all(shopping_list, "\\d", simplify = TRUE)
     [,1]
[1,] "4" 
[2,] ""  
[3,] ""  
[4,] "2" 
> # Extract all words
> str_extract_all("This is, suprisingly, a sentence.", boundary("word"))
[[1]]
[1] "This"        "is"          "suprisingly" "a"           "sentence"

7.str_replace_all（x，a，b）把x里的a全部替换为b

> fruits <- c("one apple", "two pears", "three bananas")
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

stringi

1.连接：stri_join连接两个字符串，注意与paste、paste0的区别

> library(stringi)
> stri_join(1:7, letters[1:7], sep='-')
[1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"
> stri_join(1:7, letters[1:7], collapse='-')
[1] "1a-2b-3c-4d-5e-6f-7g"

2.compare字符串比较

> stri_cmp_eq('AB','aB')
[1] FALSE
> stri_cmp_neq('AB','aB')
[1] TRUE
> stri_cmp_gt('a121','b221')
[1] FALSE
> stri_cmp_lt('a121','b221')
[1] TRUE

3.stri_count（a，fixed/regex=b）在a中按固定字符串或正则表达式匹配b并计数

> language <- c('Python','R', 'PHP', 'Ruby', 'Java', 
+               'JavaScript', 'C', 'Oracle', 'C++', 'C#', 'Spark', 'Go',
+               'Room', 'Good', 'Pathon', 'ScriptJava', 'R2R', 'C+','C*',"r")
> stri_count(language, fixed = 'R')
 [1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0
> stri_count(language, regex = '^J')
 [1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

4.stri_count_boundaries计算边界

> test <- 'The above-mentioned    features are very useful. 
+ Warm thanks to their developers.'
> stri_count_boundaries(test, type="word")
[1] 29
> stri_count_boundaries(test, type="sentence")
[1] 2
> stri_count_boundaries(test, type="character")
[1] 82
> stri_count_words(test)
[1] 12

5.stri_dup（a,b）是将a分别重复b次，stri_duplicated判断是否为重复值

> stri_dup(c("abc", "pqrst"), c(4, 2))
[1] "abcabcabcabc" "pqrstpqrst"  
> stri_duplicated(c("a", "b", "a", NA, "a", NA))
[1] FALSE FALSE  TRUE FALSE  TRUE  TRUE
> stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
[1]  TRUE FALSE  TRUE  TRUE FALSE FALSE
> stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
[1] 3

6.stri_detect（a，b）字符串a中是否含有b，返回逻辑值

fixed 是按字面固定字符串匹配（不解释正则元字符），regex是正则匹配

> stri_detect_fixed(c("stringi R", "REXAMINE", "123"), c('i', 'R', '0'))
[1]  TRUE  TRUE FALSE
> stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), '^ab')
[1] TRUE TRUE TRUE TRUE TRUE
> stri_detect_regex(c("above", "abort", "about", "abnormal", 'abandon'), 't\\b')
[1] FALSE  TRUE  TRUE FALSE FALSE
> stri_detect_regex(c('ABOUT','abort','AboVE'), '^ab', case_insensitive = TRUE)
[1] TRUE TRUE TRUE
> stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a")
[1]  TRUE  TRUE FALSE  TRUE FALSE
> stri_startswith_fixed(c("a1", "a2", "b3", "a4", "c5"), "a1")
[1]  TRUE FALSE FALSE FALSE FALSE
> stri_startswith_fixed(c("abaDc", "aabadc",'ababa'), "ba", from=2)
[1]  TRUE FALSE  TRUE
> stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba')
[1] FALSE FALSE  TRUE
> stri_endswith_fixed(c("abaDc", "aabadc",'ababa'),'ba', to = 3)
[1]  TRUE FALSE  TRUE

7.stri_extract_all() 提取字符串的内容，返回值为列表

These functions extract all substrings matching a given pattern.

stri_extract_all_* extracts all the matches.
stri_extract_first_* and stri_extract_last_* yield the first or the last matches, respectively.

> tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_B0_2004',
+               'EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05')
> #Generate a strings composed by several sequence names.
> stri_extract_all(tEmp_text, regex = '[0-9]{2,4}\\b')
[[1]]
[1] "2008"
[[2]]
[1] "2004"
[[3]]
[1] "2007"
[[4]]
[1] "2005"
[[5]]
[1] "05"
> stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE, overlap=TRUE)
[[1]]
[1] "aba" "aBA" "Aba"
#case_insensitive=TRUE 不区分大小写
> stri_extract_all_boundaries("stringi: THE string processing package 123.48...")
[[1]]
[1] "stringi: "   "THE "        "string "     "processing " "package "   
[6] "123.48..."  
> stri_extract_all_words("stringi: THE string processing package 123.48...")
[[1]]
[1] "stringi"    "THE"        "string"     "processing" "package"   
[6] "123.48"    
> stri_count_boundaries("stringi: THE string processing package 123.48...", type="word")
[1] 15

7.stri_isempty（）判断字符串（或向量中的各个元素）是否为空字符串

> stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104',' ',''))
[1] FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE

8.stri_locate_all（a，fixed = b）返回字符串b在字符串a中的全部位置

> stri_locate_all('I want to learn R to promote my statistical skills', fixed='to')
[[1]]
     start end
[1,]     8   9
[2,]    19  20

str_to_*系列：转换字母大小写

> library(stringr)
> a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
> str_to_title(a)
[1] "Ptprc" "Epcam" "Mme"   "Cd3g"  "Cd3e"  "Cd68"  "Cd79a"
> dog <- "The quick brown dog"
> str_to_upper(dog)  #全部大写
[1] "THE QUICK BROWN DOG"
> str_to_lower(dog) #全部小写
[1] "the quick brown dog"
> str_to_title(dog) #首字母大写
[1] "The Quick Brown Dog"
> str_to_sentence("the quick brown dog") #句子第一个词的第一个字母大写
[1] "The quick brown dog"
#另一个包里的函数capitalize
> library(Hmisc)
> a = c("PTPRC","EPCAM", 'MME',"CD3G","CD3E", "CD68", "CD79A")
> capitalize(tolower(a))
[1] "Ptprc" "Epcam" "Mme"   "Cd3g"  "Cd3e"  "Cd68"  "Cd79a"

R包_stringr_stringi

stringr

1. str_c粘贴，相当于paste0

2. str_length() 相当于nchar() 返回字符串长度

3.str_sub and substr() (in base package)

4.str_dup duplicate strings 重复字符串

5.去掉空白字符str_trim

6.str_extract（）以向量的形式返回提取值

7.str_replace_all（x，a，b）把x里的a全部替换为b

stringi

1.连接：stri_join连接两个字符串，注意与paste、paste0的区别

2.compare字符串比较

3.stri_count（a，fixed/regex=b）在a中按固定字符串或正则表达式匹配b并计数

4.stri_count_boundaries计算边界

5.stri_dup（a,b）是将a分别重复b次，stri_duplicated判断是否为重复值

6.stri_detect（a，b） 字符串a中是否含有b，返回逻辑值

7.stri_extract_all() 提取字符串的内容，返回值为列表

7.stri_isempty（）判断字符串（或向量中的各个元素）是否为空字符串

8.stri_locate_all（a，fixed = b） 返回字符串b在字符串a中的全部位置

str_to_*系列：转换字母大小写

1.连接：stri_join连接两个字符串，注意与paste、paste0的区别

6.stri_detect（a，b）字符串a中是否含有b，返回逻辑值

8.stri_locate_all（a，fixed = b）返回字符串b在字符串a中的全部位置

str_to_*系列：转换字母大小写