# Template for installing a package from a specific CRAN mirror; replace
# "package-name" with the package that is actually needed.
install.packages("package-name", repos = "http://cran.us.r-project.org")
# factors in r
# dslabs is attached first because the datasets inspected below
# (movielens, murders) come from that package.
library(dslabs)
# number of factor levels in the genres column of movielens
nlevels(movielens$genres)
# library() with no arguments lists the packages that are installed
library()
# compact overview of the structure of the murders data frame
str(murders)
# murder rate per 100,000 people, one value per row of murders
murder_rate <- with(murders, total / population * 100000)
# logical vector: TRUE where the rate is at most 0.71
index <- murder_rate <= 0.71
# names of the states flagged TRUE above
murders$state[index]
# TRUE counts as 1, so the sum is the number of such states
sum(index)
# two separate conditions, each stored as a logical vector
safe <- murder_rate <= 1
west <- murders$region == "West"
# element-wise AND keeps only the states that satisfy both conditions
index <- west & safe
murders$state[index]
# which() turns a logical vector into the positions of its TRUE entries;
# here that is the position of Massachusetts, used to look up its rate
ind <- which(murders$state == "Massachusetts")
murder_rate[ind]
# match() returns, for each name in the first vector, its position in the
# second, so several states can be looked up at once
ind <- match(c("New York", "Florida", "Texas"), murders$state)
ind
murder_rate[ind]
# %in% tests whether each element on the left occurs on the right:
# are Boston, Dakota, and Washington states?
c("Boston", "Dakota", "Washington") %in% murders$state
# installing dplyr only when it is missing, then loading it;
# an unconditional install.packages() would reinstall on every run
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
# adding a column with mutate: rate is murders per 100,000 people
library(dslabs)
data("murders")
murders <- mutate(murders, rate = total / population * 100000)
# subsetting rows with filter: keep rows whose rate is at most 0.71
filter(murders, rate <= 0.71)
# selecting columns with select
new_table <- select(murders, state, region, rate)
# using the pipe: the result of each step is passed on as the first
# argument of the next one
murders %>%
  select(state, region, rate) %>%
  filter(rate <= 0.71)
# building a data frame; stringsAsFactors = FALSE keeps the names column
# as character instead of converting it to a factor
grades <- data.frame(
  names = c("John", "Juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90),
  stringsAsFactors = FALSE
)
# scatterplot of total murders (y) against population in millions (x)
y <- murders$total
x <- murders$population / 1e6
plot(x, y)
# histogram of the rate column
hist(murders$rate)
# one boxplot of rate for each region
boxplot(rate ~ region, data = murders)
# general shape of an if-else statement: the reciprocal of a is printed
# unless a is 0, in which case a message is printed instead
a <- 0
if (a == 0) {
  print("No reciprocal for 0.")
} else {
  print(1 / a)
}
# report the state with the lowest murder rate, but only if that rate is
# below 0.5
library(dslabs)
data(murders)
murder_rate <- murders$total / murders$population * 100000
ind <- which.min(murder_rate) # position of the smallest rate
if (murder_rate[ind] >= 0.5) {
  print("No state has murder rate that low")
} else {
  print(murders$state[ind])
}
# the same check with a threshold of 0.25, which changes the result
if (murder_rate[ind] >= 0.25) {
  print("No state has a murder rate that low.")
} else {
  print(murders$state[ind])
}
# ifelse(test, yes, no) returns yes where test is TRUE and no elsewhere,
# much like an if-else conditional
a <- 0
ifelse(test = a > 0, yes = 1 / a, no = NA)
# on a vector, ifelse() makes that choice element by element
a <- c(0, 1, 2, -4, 5)
result <- ifelse(test = a > 0, yes = 1 / a, no = NA)
# ifelse() can also fill in missing values: every NA in na_example is
# replaced by 0, all other entries are kept
data(na_example)
no_nas <- ifelse(test = is.na(na_example), yes = 0, no = na_example)
# number of NAs left after the replacement
sum(is.na(no_nas))
# any() is TRUE when at least one element of a logical vector is TRUE;
# all() is TRUE only when every element is TRUE
z <- c(TRUE, TRUE, FALSE)
any(z) # one TRUE is enough
all(z) # the single FALSE rules this out
# avg(x): average of the vector x, i.e. the sum of its elements divided
# by the number of elements
avg <- function(x) {
  s <- sum(x) # s exists only inside the function
  s / length(x)
}
# avg() and the built-in mean() give exactly the same value here
x <- seq_len(100)
identical(avg(x), mean(x))
# a variable assigned inside a function stays inside it: the workspace s
# still holds 3 after avg() has been called
s <- 3
avg(1:10)
s
# the general form of a function: it takes an argument, computes a value
# from it, and returns the last expression evaluated.
#   VARIABLE_NAME - the input the function works on
# Returns VALUE. In this template VALUE is simply the input, so that the
# skeleton parses and runs; replace the placeholder with real operations.
my_function <- function(VARIABLE_NAME) {
  # perform operations on VARIABLE_NAME and calculate VALUE
  VALUE <- VARIABLE_NAME # placeholder computation
  VALUE
}
# functions can have multiple arguments as well as default values
# avg(x, arithmetic = TRUE): average of the vector x.
#   x          - vector to average
#   arithmetic - single TRUE/FALSE flag; TRUE (the default) returns the
#                arithmetic mean sum(x) / n, FALSE returns the geometric
#                mean prod(x)^(1 / n)
# The flag is a scalar, so a plain if/else is used; ifelse() is meant for
# element-wise choices on vectors.
avg <- function(x, arithmetic = TRUE) {
  n <- length(x)
  if (arithmetic) {
    sum(x) / n
  } else {
    prod(x)^(1 / n)
  }
}
# compute_s_n(n): sum of the integers 1, 2, ..., n.
#   n - non-negative whole number
# Returns 0 for n = 0 (the empty sum). seq_len() is used instead of 1:n
# because 1:0 counts down to c(1, 0) and would give 1; a negative n now
# raises an error from seq_len() instead of returning a meaningless sum.
compute_s_n <- function(n) {
  x <- seq_len(n)
  sum(x)
}
# a very simple for-loop: prints each of the values 1 to 5 in turn
for (i in 1:5) {
  print(i)
}
# fill s_n with the sums 1 + ... + n for n = 1, ..., m using a for-loop
m <- 25
s_n <- vector(mode = "logical", length = m) # placeholder of length m
for (n in seq_len(m)) {
  s_n[n] <- compute_s_n(n)
}
# plot the computed sums against n
n <- seq_len(m)
plot(n, s_n)
# first rows of a table placing the computed sums next to the closed-form
# formula n(n + 1)/2
head(data.frame(s_n = s_n, formula = n * (n + 1) / 2))
# same plot with the formula drawn over the points as a line
plot(n, s_n)
lines(n, n * (n + 1) / 2)