# Template for installing a package from a specific CRAN mirror.
# "package-name" is a placeholder (hyphens are not valid in package names),
# so the call is kept as a comment; substitute a real name before running.
# install.packages('package-name',repos='http://cran.us.r-project.org')

# factors in r

# dslabs is attached first: `murders` is loaded from it later in this script,
# and `movielens` is presumably shipped with it as well -- TODO confirm.
library(dslabs)

# number of levels of the genres factor
nlevels(movielens$genres)

# list the packages available in the library
library()

# compact overview of the structure of the murders data
str(murders)

# murder rate: total murders per 100,000 inhabitants, one value per row
murder_rate <- with(murders, total / population * 1e5)

# logical flag per row: TRUE where the rate is at most 0.71
index <- murder_rate <= 0.71

# names of the states whose flag is TRUE
murders$state[index]

# TRUE counts as 1, so the sum is the number of flagged states
sum(index)

# two independent conditions, each stored as a logical vector
safe <- murder_rate <= 1
west <- murders$region == "West"

# element-wise AND: TRUE only where both conditions hold
index <- west & safe

# states satisfying both conditions
murders$state[index]

# which() turns the logical comparison into the position of Massachusetts,
# which is then used to look up its murder rate
ind <- which(murders$state == "Massachusetts")
murder_rate[ind]

# match() returns the positions of several states at once, in the order
# they were requested; those positions index the murder rates
ind <- match(x = c("New York", "Florida", "Texas"), table = murders$state)
ind
murder_rate[ind]

# %in% reports, element by element, whether each name appears among the states
c("Boston", "Dakota", "Washington") %in% murders$state

# installing (only when it is not already available) and loading dplyr;
# the guard avoids re-downloading the package every time the script runs
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

library(dplyr)

# mutate(): append a `rate` column (murders per 100,000) to the table
library(dslabs)
data("murders")
murders <- murders %>%
  mutate(rate = total / population * 100000)

# filter(): keep only the rows whose rate is at most 0.71
filter(murders, rate <= 0.71)

# select(): keep only the named columns
new_table <- select(murders, state, region, rate)

# the pipe chains both steps without an intermediate table
murders %>%
  select(state, region, rate) %>%
  filter(rate <= 0.71)

# building a data frame with stringsAsFactors = FALSE so that the `names`
# column stays a character vector instead of being converted to a factor
grades <- data.frame(
  names = c("John", "Juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90),
  stringsAsFactors = FALSE
)

# scatterplot: population (in millions) on the x axis, total murders on y
x <- murders$population / 1e6
y <- murders$total
plot(x, y)

# histogram of the murder rates
hist(murders$rate)

# one boxplot of the murder rate per region
boxplot(rate ~ region, data = murders)

# general shape of an if-else statement: print the reciprocal of `a`,
# unless `a` is zero, in which case print a message instead
a <- 0

if (a == 0) {
  print("No reciprocal for 0.")
} else {
  print(1 / a)
}

# report the state with the lowest murder rate when that rate is below 0.5
library(dslabs)
data(murders)
murder_rate <- with(murders, total / population * 100000)

# position of the smallest murder rate
ind <- which.min(murder_rate)

if (murder_rate[ind] >= 0.5) {
  print("No state has murder rate that low")
} else {
  print(murders$state[ind])
}

# the same check with a stricter threshold of 0.25 changes the result
if (murder_rate[ind] >= 0.25) {
  print("No state has a murder rate that low.")
} else {
  print(murders$state[ind])
}

# ifelse() behaves like an if-else conditional: it returns the `yes` value
# where the test is TRUE and the `no` value where it is FALSE
a <- 0
ifelse(test = a > 0, yes = 1 / a, no = NA)

# applied to a vector it works element by element: reciprocal for the
# positive entries, NA for the others
a <- c(0, 1, 2, -4, 5)
result <- ifelse(test = a > 0, yes = 1 / a, no = NA)

# ifelse() can also replace missing values: NA entries become 0,
# every other entry is kept unchanged
data(na_example)
no_nas <- ifelse(test = is.na(na_example), yes = 0, no = na_example)

# number of missing values left after the replacement
sum(is.na(no_nas))

# any() is TRUE when at least one element is TRUE;
# all() is TRUE only when every element is TRUE
z <- c(TRUE, TRUE, FALSE)
any(z)
all(z)

# avg(): arithmetic average of the vector x, computed as sum over length.
# `s` and `n` exist only inside the function.
avg <- function(x) {
  n <- length(x)
  s <- sum(x)
  s / n
}

# avg() and the built-in mean() return exactly the same value for 1..100
x <- seq_len(100)
identical(mean(x), avg(x))

# avg() assigns to its own local `s`; the workspace `s` defined here is
# still 3 after the call
s <- 3
avg(1:10)
s

# the general form of a function (a template, not a working function):
# the operations step is pseudo-code and therefore written as a comment so
# that the file parses; calling my_function() as-is fails unless an object
# named VALUE exists.
my_function <- function(VARIABLE_NAME) {
  # perform operations on VARIABLE_NAME and calculate VALUE
  VALUE
}

# functions can have multiple arguments as well as default values
#
# avg(): average of the vector x.
#   x          numeric vector
#   arithmetic single TRUE/FALSE flag; TRUE (the default) returns the
#              arithmetic mean sum(x) / n, FALSE returns the geometric mean
#              prod(x)^(1 / n)
# The flag is a scalar, so a plain if/else is used instead of the vectorised
# ifelse(); only the selected branch is evaluated.
avg <- function(x, arithmetic = TRUE) {
  n <- length(x)
  if (arithmetic) {
    sum(x) / n
  } else {
    prod(x)^(1 / n)
  }
}

# compute_s_n(): sum of the integers 1 through n.
#   n  non-negative whole number
# seq_len(n) is used instead of 1:n because 1:0 counts down (1, 0) and
# would give a sum of 1 for n = 0; seq_len(0) is empty, so the sum is 0.
compute_s_n <- function(n) {
  x <- seq_len(n)
  sum(x)
}

# a very simple for-loop: print each value of i from 1 to 5
# (the closing brace ends the loop body here, so the statements that follow
# are not part of it)
for (i in 1:5) {
  print(i)
}

# a for-loop for our summation: store compute_s_n(n) for n = 1, ..., m
m <- 25

# preallocate a vector of length m that the loop fills in
s_n <- logical(m)

for (n in seq_len(m)) {
  s_n[n] <- compute_s_n(n)
}

# plot the computed sums against n
n <- 1:m
plot(n, s_n)

# first rows of a table putting the computed sums next to the values of
# the formula n(n + 1)/2
head(data.frame(s_n = s_n, formula = n * (n + 1) / 2))

# redraw the points and overlay the formula as a line
plot(n, s_n)
lines(n, n * (n + 1) / 2)