6 min read

Notes about string handling in R

These are my notes on R functions for string handling that I either did not know or tend to forget.

Reading tables/text

read.table(): main function to read file in table format
read.csv(): reads csv files separated by a comma “,”
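read.csv2(): reads csv files separated by a semicolon “;” (with a comma as the decimal mark)
read.delim(): reads files separated by tabs “\t”
read.delim2(): like read.delim(), but with a comma as the decimal mark
read.fwf(): read fixed width format files
readLines(): read a file line by line into a character vector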

Printing

print(): generic printing
noquote(): print with no quotes
cat(): concatenation, it also has sep= argument as well as fill=.
format(): special formats
toString(): convert to string
sprintf(): C-style formatted strings (returns the formatted string; it does not print by itself)

string manipulations

nchar(): number of characters
tolower(): convert to lower case
toupper(): convert to upper case
casefold(): case folding
chartr(): character translation
abbreviate(): abbreviation
substring(): substrings of a character vector
substr(): substrings of a character vector

set operations

union(): set union
intersect(): intersection
setdiff(): set difference
setequal(): equal sets
identical(): exact equality
is.element(): is element
%in%: value matching (is each element on the left contained in the set on the right?)
sort(): sorting
paste(rep()): repetition

It is better to know regular expressions

\\d: match a digit character
\\D: match a non-digit character
\\s: match a space character
\\S: match a non-space character
\\w: match a word character
\\W: match a non-word character
\\b: match a word boundary
\\B: match a non-(word boundary)
\\h: match a horizontal space
\\H: match a non-horizontal space
\\v: match a vertical space
\\V: match a non-vertical space

grep() gsub()

Escaping special characters in R

.: the period or dot. Escape in R by \\.
$: the dollar sign. Escape in R by \\$
*: the asterisk or star. Escape in R by \\*
+: the plus sign. Escape in R by \\+
?: the question mark. Escape in R by \\?
|: the vertical bar. Escape in R by \\|
\: the backslash. Escape in R by \\\\
^: the caret. Escape in R by \\^
[: the opening square bracket. Escape in R by \\[
]: the closing square bracket. Escape in R by \\]
{: the opening curly bracket. Escape in R by \\{
}: the closing curly bracket. Escape in R by \\}
(: the opening round bracket. Escape in R by \\(
): the closing round bracket. Escape in R by \\)

The stringr package is a must-have

Examples

# Load the built-in USArrests data set and look at its first rows.
data(USArrests)
head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
# The row names are the state names; keep them as the example character vector.
states <- rownames(USArrests)
states
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"
noquote(states) # noquote(): print the character vector without the surrounding quotes
##  [1] Alabama        Alaska         Arizona        Arkansas      
##  [5] California     Colorado       Connecticut    Delaware      
##  [9] Florida        Georgia        Hawaii         Idaho         
## [13] Illinois       Indiana        Iowa           Kansas        
## [17] Kentucky       Louisiana      Maine          Maryland      
## [21] Massachusetts  Michigan       Minnesota      Mississippi   
## [25] Missouri       Montana        Nebraska       Nevada        
## [29] New Hampshire  New Jersey     New Mexico     New York      
## [33] North Carolina North Dakota   Ohio           Oklahoma      
## [37] Oregon         Pennsylvania   Rhode Island   South Carolina
## [41] South Dakota   Tennessee      Texas          Utah          
## [45] Vermont        Virginia       Washington     West Virginia 
## [49] Wisconsin      Wyoming
# cat() prints the elements without quotes or [n] index markers, joined by sep;
# fill=40 breaks the output into lines of at most 40 characters.
cat(states, sep="-", fill=40) 
## Alabama-Alaska-Arizona-Arkansas-
## California-Colorado-Connecticut-
## Delaware-Florida-Georgia-Hawaii-Idaho-
## Illinois-Indiana-Iowa-Kansas-Kentucky-
## Louisiana-Maine-Maryland-Massachusetts-
## Michigan-Minnesota-Mississippi-Missouri-
## Montana-Nebraska-Nevada-New Hampshire-
## New Jersey-New Mexico-New York-
## North Carolina-North Dakota-Ohio-
## Oklahoma-Oregon-Pennsylvania-
## Rhode Island-South Carolina-
## South Dakota-Tennessee-Texas-Utah-
## Vermont-Virginia-Washington-
## West Virginia-Wisconsin-Wyoming
abbreviate(states,minlength=5) # abbreviate(): shorten each name; minlength is the minimum length of an abbreviation
##        Alabama         Alaska        Arizona       Arkansas     California 
##        "Alabm"        "Alask"        "Arizn"        "Arkns"        "Clfrn" 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##        "Colrd"        "Cnnct"        "Delwr"        "Flord"        "Georg" 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##        "Hawai"        "Idaho"        "Illns"        "Indin"         "Iowa" 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##        "Kanss"        "Kntck"        "Lousn"        "Maine"        "Mryln" 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##        "Mssch"        "Mchgn"        "Mnnst"        "Mssss"        "Missr" 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##        "Montn"        "Nbrsk"        "Nevad"        "NwHmp"        "NwJrs" 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##        "NwMxc"        "NwYrk"        "NrthC"        "NrthD"         "Ohio" 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##        "Oklhm"        "Oregn"        "Pnnsy"        "RhdIs"        "SthCr" 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##        "SthDk"        "Tnnss"        "Texas"         "Utah"        "Vrmnt" 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##        "Virgn"        "Wshng"        "WstVr"        "Wscns"        "Wymng"
month.name # built-in constant: character vector of the English month names
##  [1] "January"   "February"  "March"     "April"     "May"      
##  [6] "June"      "July"      "August"    "September" "October"  
## [11] "November"  "December"
library(stringr)
# str_detect(): TRUE for every state name that contains a lower-case "k".
str_detect(string = states, pattern = "k")
##  [1] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [34]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE
# gregexpr(): positions of all matches of "a" in each name, ignoring case.
# The result is a list with one element per state.
positions_a <- gregexpr(pattern = "a", text = states, ignore.case = TRUE)
states[1]
## [1] "Alabama"
positions_a[[1]]
## [1] 1 3 5 7
## attr(,"match.length")
## [1] 1 1 1 1
## attr(,"useBytes")
## [1] TRUE
# str_locate_all(): the same matches, returned as a start/end matrix per name.
str_locate_all(string = states, pattern = "[Aa]")[[1]]
##      start end
## [1,]     1   1
## [2,]     3   3
## [3,]     5   5
## [4,]     7   7
# str_count(): how many times "a" or "A" occurs in each name.
str_count(states, "[Aa]")
##  [1] 4 3 2 3 2 1 0 2 1 1 2 1 0 2 1 2 0 2 1 2 2 1 1 0 0 2 2 2 1 0 0 0 2 2 0
## [36] 2 0 2 1 2 2 0 1 1 0 1 1 1 0 0
# A single-quoted literal may contain double quotes; they are shown escaped when printed.
'The "R" project for statistical computing'
## [1] "The \"R\" project for statistical computing"
length("")  # an empty string is still a character vector of length 1
## [1] 1
length(character(0)) # an empty character vector has length 0
## [1] 0
format(13.7, nsmall=3) # nsmall: minimum number of digits after the decimal point
## [1] "13.700"
format(c(6, 13.1), digits = 2) # digits: significant digits; the results share a common width
## [1] " 6" "13"
format(c(6, 13.1), digits = 2, nsmall = 1) # nsmall = 1 keeps one decimal place despite digits = 2
## [1] " 6.0" "13.1"
format(c("A", "BB", "CCC"), width = 5, justify = "centre") # justify applies to character vectors only
## [1] "  A  " " BB  " " CCC "
format(c("A", "BB", "CCC"), width = 5, justify = "left") # pad on the right up to width 5
## [1] "A    " "BB   " "CCC  "
format(c("A", "BB", "CCC"), width = 5, justify = "none") # "none": the strings are returned unpadded
## [1] "A"   "BB"  "CCC"
format(123456789, big.mark = ",") # big.mark: separator inserted between groups of three digits
## [1] "123,456,789"
# chartr(old, new, x): translate characters one-for-one (a -> #, e -> !, i -> ?).
crazy <- c("Here's to the crazy ones", "The misfits", "The rebels")
chartr("aei", "#!?", crazy)
## [1] "H!r!'s to th! cr#zy on!s" "Th! m?sf?ts"             
## [3] "Th! r!b!ls"
# substr<-: overwrite characters 2-3 of every element in place.
# Only characters that exist are replaced, so the two-letter "be" becomes "b:".
y <- c("may", "the", "force", "be", "with", "you")
substr(y, 2, 3) <- ":)"
y
## [1] "m:)"   "t:)"   "f:)ce" "b:"    "w:)h"  "y:)"
# str_sub<- (stringr): replace characters 2-3 of every element of y.
# NOTE(review): the recorded output below shows everything after position 1
# replaced (e.g. "f-(" rather than "f-(ce"); re-run to confirm it is current.
str_sub(y, start = 2, end = 3) <- "-("
y
## [1] "m-(" "t-(" "f-(" "b-(" "w-(" "y-("
paste(rep("x", 4), collapse = "") # repeat "x" four times and join the copies into one string
## [1] "xxxx"
str_pad("hashtag", width = 9, side = "both", pad = "-") # pad with "-" on both sides up to width 9
## [1] "-hashtag-"
# word() (stringr): extract words from each sentence by position.
change <- c("Be the change", "you want to be")
word(change, 1) # first word of each element
## [1] "Be"  "you"
word(change, 2, -1) # from the second word through the last one
## [1] "the change" "want to be"
toString(17.04) # a single number becomes its character representation
## [1] "17.04"
toString(c(17.04, 1978)) # several elements are joined with ", "
## [1] "17.04, 1978"
toString(c("Bonjour", 123, TRUE, NA, log(exp(1)))) # mixed values all end up as text in one string
## [1] "Bonjour, 123, TRUE, NA, 1"
toString(c("one", "two", "3333333333"), width = 8) # width caps the result length; the cut is marked with dots
## [1] "one,...."