These are my notes on R functions for string handling that I do not know or tend to forget.
Reading tables/text
read.table()
: main function to read file in table format
read.csv()
: reads csv files separated by a comma “,”
read.csv2()
: reads csv files separated by a semicolon “;” (with a comma “,” as the decimal point)
read.delim()
: reads files separated by tabs “\t”
read.delim2()
: similar to read.delim(), but uses a comma “,” as the decimal point
read.fwf()
: read fixed width format files
readLines()
: reads text lines from a file or connection into a character vector
Printing
print()
: generic printing
noquote()
: print with no quotes
cat()
: concatenates and prints its arguments; it also takes the sep= and fill= arguments.
format()
: special formats
toString()
: convert to string
sprintf()
: C-style formatting; returns the formatted string (it does not print by itself)
string manipulations
nchar()
: number of characters
tolower()
: convert to lower case
toupper()
: convert to upper case
casefold()
: case folding
chartr()
: character translation
abbreviate()
: abbreviation
substring()
: substrings of a character vector
substr()
: substrings of a character vector
set operations
union()
: set union
intersect()
: intersection
setdiff()
: set difference
setequal()
: equal sets
identical()
: exact equality
is.element()
: is element
%in%
: contains (infix operator, used as x %in% table)
sort()
: sorting
paste(rep())
: repetition
Regular expressions (worth knowing)
\\d
: match a digit character
\\D
: match a non-digit character
\\s
: match a whitespace character (space, tab, newline, ...)
\\S
: match a non-whitespace character
\\w
: match a word character
\\W
: match a non-word character
\\b
: match a word boundary
\\B
: match a non-(word boundary)
\\h
: match a horizontal space
\\H
: match a non-horizontal space
\\v
: match a vertical space
\\V
: match a non-vertical space
grep()
gsub()
Escaping special characters in R
.
: the period or dot. Escape in R by \\.
$
: the dollar sign. Escape in R by \\$
*
: the asterisk or star. Escape in R by \\*
+
: the plus sign. Escape in R by \\+
?
: the question mark. Escape in R by \\?
|
: the vertical bar. Escape in R by \\|
\
: the backslash. Escape in R by \\\\
^
: the caret. Escape in R by \\^
[
: the opening square bracket. Escape in R by \\[
]
: the closing square bracket. Escape in R by \\]
{
: the opening curly bracket. Escape in R by \\{
}
: the closing curly bracket. Escape in R by \\}
(
: the opening round bracket. Escape in R by \\(
)
: the closing round bracket. Escape in R by \\)
stringr
package is a must-have
Examples
data(USArrests)
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
states=rownames(USArrests)
states
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
noquote(states) # noquote()
## [1] Alabama Alaska Arizona Arkansas
## [5] California Colorado Connecticut Delaware
## [9] Florida Georgia Hawaii Idaho
## [13] Illinois Indiana Iowa Kansas
## [17] Kentucky Louisiana Maine Maryland
## [21] Massachusetts Michigan Minnesota Mississippi
## [25] Missouri Montana Nebraska Nevada
## [29] New Hampshire New Jersey New Mexico New York
## [33] North Carolina North Dakota Ohio Oklahoma
## [37] Oregon Pennsylvania Rhode Island South Carolina
## [41] South Dakota Tennessee Texas Utah
## [45] Vermont Virginia Washington West Virginia
## [49] Wisconsin Wyoming
# cat() prints without quotes and without line indices; fill= sets the output line width.
cat(states, sep="-", fill=40)
## Alabama-Alaska-Arizona-Arkansas-
## California-Colorado-Connecticut-
## Delaware-Florida-Georgia-Hawaii-Idaho-
## Illinois-Indiana-Iowa-Kansas-Kentucky-
## Louisiana-Maine-Maryland-Massachusetts-
## Michigan-Minnesota-Mississippi-Missouri-
## Montana-Nebraska-Nevada-New Hampshire-
## New Jersey-New Mexico-New York-
## North Carolina-North Dakota-Ohio-
## Oklahoma-Oregon-Pennsylvania-
## Rhode Island-South Carolina-
## South Dakota-Tennessee-Texas-Utah-
## Vermont-Virginia-Washington-
## West Virginia-Wisconsin-Wyoming
abbreviate(states,minlength=5) # abbreviate()
## Alabama Alaska Arizona Arkansas California
## "Alabm" "Alask" "Arizn" "Arkns" "Clfrn"
## Colorado Connecticut Delaware Florida Georgia
## "Colrd" "Cnnct" "Delwr" "Flord" "Georg"
## Hawaii Idaho Illinois Indiana Iowa
## "Hawai" "Idaho" "Illns" "Indin" "Iowa"
## Kansas Kentucky Louisiana Maine Maryland
## "Kanss" "Kntck" "Lousn" "Maine" "Mryln"
## Massachusetts Michigan Minnesota Mississippi Missouri
## "Mssch" "Mchgn" "Mnnst" "Mssss" "Missr"
## Montana Nebraska Nevada New Hampshire New Jersey
## "Montn" "Nbrsk" "Nevad" "NwHmp" "NwJrs"
## New Mexico New York North Carolina North Dakota Ohio
## "NwMxc" "NwYrk" "NrthC" "NrthD" "Ohio"
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## "Oklhm" "Oregn" "Pnnsy" "RhdIs" "SthCr"
## South Dakota Tennessee Texas Utah Vermont
## "SthDk" "Tnnss" "Texas" "Utah" "Vrmnt"
## Virginia Washington West Virginia Wisconsin Wyoming
## "Virgn" "Wshng" "WstVr" "Wscns" "Wymng"
month.name # month names
## [1] "January" "February" "March" "April" "May"
## [6] "June" "July" "August" "September" "October"
## [11] "November" "December"
library(stringr)
str_detect(string=states, pattern="k")
## [1] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [34] TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE
positions_a = gregexpr(pattern = "a", text = states, ignore.case = TRUE) # gregexpr()
states[1]
## [1] "Alabama"
positions_a[[1]]
## [1] 1 3 5 7
## attr(,"match.length")
## [1] 1 1 1 1
## attr(,"useBytes")
## [1] TRUE
str_locate_all(string = states, pattern = "[Aa]")[[1]]
## start end
## [1,] 1 1
## [2,] 3 3
## [3,] 5 5
## [4,] 7 7
str_count(states, "[Aa]")
## [1] 4 3 2 3 2 1 0 2 1 1 2 1 0 2 1 2 0 2 1 2 2 1 1 0 0 2 2 2 1 0 0 0 2 2 0
## [36] 2 0 2 1 2 2 0 1 1 0 1 1 1 0 0
'The "R" project for statistical computing'
## [1] "The \"R\" project for statistical computing"
length("") # length 1
## [1] 1
length(character(0)) # length 0
## [1] 0
format(13.7, nsmall=3) # format for pretty print
## [1] "13.700"
format(c(6, 13.1), digits = 2)
## [1] " 6" "13"
format(c(6, 13.1), digits = 2, nsmall = 1)
## [1] " 6.0" "13.1"
format(c("A", "BB", "CCC"), width = 5, justify = "centre") # justify only for char.
## [1] "  A  " " BB  " " CCC "
format(c("A", "BB", "CCC"), width = 5, justify = "left")
## [1] "A    " "BB   " "CCC  "
format(c("A", "BB", "CCC"), width = 5, justify = "none")
## [1] "A" "BB" "CCC"
format(123456789, big.mark = ",") # big.mark
## [1] "123,456,789"
crazy = c("Here's to the crazy ones", "The misfits", "The rebels")
chartr("aei", "#!?", crazy)
## [1] "H!r!'s to th! cr#zy on!s" "Th! m?sf?ts"
## [3] "Th! r!b!ls"
y = c("may", "the", "force", "be", "with", "you")
substr(y, 2, 3) <- ":)"
y
## [1] "m:)" "t:)" "f:)ce" "b:" "w:)h" "y:)"
str_sub(string = y, start = 2, 3) <- "-("
y
## [1] "m-(" "t-(" "f-(" "b-(" "w-(" "y-("
paste(rep("x", 4), collapse = "")
## [1] "xxxx"
str_pad("hashtag", width = 9, side = "both", pad = "-")
## [1] "-hashtag-"
change = c("Be the change", "you want to be")
word(change, 1)
## [1] "Be" "you"
word(change, 2, -1)
## [1] "the change" "want to be"
toString(17.04)
## [1] "17.04"
toString(c(17.04, 1978))
## [1] "17.04, 1978"
toString(c("Bonjour", 123, TRUE, NA, log(exp(1))))
## [1] "Bonjour, 123, TRUE, NA, 1"
toString(c("one", "two", "3333333333"), width = 8)
## [1] "one,...."