#Jennifer Young
#Movie Recommendation Project
#Recommendation systems are used more and more, as consumers expect suggestions based
#on their known likes so that they can discover new likes in products, movies,
mu
sic
#and other interests. They assist users in finding what they might be interested in
#based on their preferences and previous interactions. In this report, a movie
#recommendation system using the MovieLens dataset from HarvardX’s Data Science
#Professional Certificate3 program will be covered. GroupLens Research is the
#organization that collected the data sets for this project from their site:
#(https://movielens.org
)
.
## First specify the packages of interest
packages = c(“tidyverse”, “caret”,
“ggplot2”)
## Now load or install&load all
package.check <- lapply(
packages,
FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
}
)
library(tidyverse)
library(caret)
library(ggplot2)
dl <- tempfile()
download.file(“http://files.grouplens.org/datasets/movielens/ml-10m.zip”, dl)
ratings <- read.table(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c(“userId”, “movieId”, “rating”, “timestamp”))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
movies <- as.data.frame(movies) %>% mutate(movieId = as.integer(movieId),
title = as.character(title),
genres = as.character(genres))
movielens <- left_join(ratings, movies, by = "movieId")
#Methods and Analysis
#There are five steps in the data analysis process that must be completed.
#In this case, the data must be prepared. The dataset from was downloaded from
#the MovieLens website and split into two subsets used for training and validation.
#In this case, we named the training set “edx” and the validation set “validation”.
#For training and testing, the edx set was split again into two subsets.
#The edx set is trained with the model when it reaches the RMSE goal and the
#validation set is used for final validation. During data exploration and
#visualization, charts are crated to understand the data and how it affects
#the outcome. We observe the mean of observed values, the distribution of ratings,#
#mean movie ratings, movie effect, user effect and number of ratings per movie.
#We improve the RMSE by including the user and movie effects and applying the
#regularization parameter for samples that have few ratings.
# The Validation subset will be 10% of the MovieLens data.
set.seed(1)
test_index <- createDataPartition(y = movielens$rating, times = 1, p = 0.1, list = FALSE)
edx <- movielens[-test_index,]
temp <- movielens[test_index,]
#Make sure userId and movieId in validation set are also in edx subset:
validation <- temp %>%
semi_join(edx, by = “movieId”) %>%
semi_join(edx, by = “userId”)
# Add rows removed from validation set back into edx set
removed <- anti_join(temp, validation)
edx <- rbind(edx, removed)
rm(dl, ratings, movies, test_index, temp, movielens, removed)
# lists six variables “userID”, “movieID”, “rating”, “timestamp”, “title”, and “genres” in data frame
head(edx) %>%
print.data.frame()
#Looking for missing values
summary(edx)
#unique movies and users in the edx subset
edx %>%
summarize(n_users = n_distinct(userId),
n_movies = n_distinct(movieId))
#distribution of ratings (histogram)
edx %>%
ggplot(aes(rating)) +
geom_histogram(binwidth = 0.25, color = “red”) +
scale_x_discrete(limits = c(seq(0.5,5,0.5))) +
scale_y_continuous(breaks = c(seq(0, 3000000, 500000))) +
ggtitle(“Rating distribution”)
#ratingspermovie (Histogram)
edx %>%
count(movieId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 25, color = “yellow”) +
scale_x_log10() +
xlab(“Number of ratings”) +
ylab(“Number of movies”) +
ggtitle(“Number of ratings per movie”)
#movies rated once (chart)
edx %>%
group_by(movieId) %>%
summarize(count = n()) %>%
filter(count == 1) %>%
left_join(edx, by = “movieId”) %>%
group_by(title) %>%
summarize(rating = rating, n_rating = count) %>%
slice(1:20) %>%
knitr::kable()
#User ratings (Histogram)
edx %>%
count(userId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 25, color = “green”) +
scale_x_log10() +
xlab(“Number of ratings”) +
ylab(“Number of users”) +
ggtitle(“Number of ratings given by users”)
#Mean user ratings
edx %>%
group_by(userId) %>%
filter(n() >= 100) %>%
summarize(b_u = mean(rating)) %>%
ggplot(aes(b_u)) +
geom_histogram(bins = 25, color = “white”) +
xlab(“Mean rating”) +
ylab(“Number of users”) +
ggtitle(“Mean movie ratings given by users”) +
scale_x_discrete(limits = c(seq(0.5,5,0.5))) +
theme_light()
#compute the RMSE
RMSE <- function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings – predicted_ratings)^2))
}
#Average movie rating model
mu <- mean(edx$rating)
mu
naive_rmse
<- RMSE(validation$rating, mu)
naive_rmse
rmse_results <- tibble(method = "Average movie rating model", RMSE = naive_rmse)
rmse_results %>% knitr::kable()
#Movie effect model
movie_avgs <- edx %>%
group_by(movieId) %>%
summarize(b_i = mean(rating – mu))
movie_avgs %>% qplot(b_i, geom =”histogram”, bins = 10, data = ., color = I(“red”),
ylab = “Number of movies”, main = “Number of movies with the computed b_i”)
predicted_ratings <- mu + validation %>%
left_join(movie_avgs, by=’movieId’) %>%
pull(b_i)
model_1_rmse <- RMSE(predicted_ratings, validation$rating)
rmse_results <- bind_rows(rmse_results,
tibble(method=”Movie effect model”,
RMSE = model_1_rmse ))
rmse_results %>% knitr::kable()
#Movie and user effect model
user_avgs<- edx %>%
left_join(movie_avgs, by=’movieId’) %>%
group_by(userId) %>%
filter(n() >= 100) %>%
summarize(b_u = mean(rating – mu – b_i))
user_avgs%>% qplot(b_u, geom =”histogram”, bins = 25, data = ., color = I(“magenta”))
user_avgs <- edx %>%
left_join(movie_avgs, by=’movieId’) %>%
group_by(userId) %>%
summarize(b_u = mean(rating – mu – b_i))
predicted_ratings <- validation%>%
left_join(movie_avgs, by=’movieId’) %>%
left_join(user_avgs, by=’userId’) %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
model_2_rmse <- RMSE(predicted_ratings, validation$rating)
rmse_results <- bind_rows(rmse_results,
tibble(method=”Movie and user effect model”,
RMSE = model_2_rmse))
rmse_results %>% knitr::kable()
#Regularized movie and user effect model
lambda
s <- seq(0, 10, 0.25)
rmses <- sapply(lambdas, function(l){
mu <- mean(edx$rating)
b_i <- edx %>%
group_by(movieId) %>%
summarize(b_i = sum(rating – mu)/(n()+l))
b_u <- edx %>%
left_join(b_i, by=”movieId”) %>%
group_by(userId) %>%
summarize(b_u = sum(rating – b_i – mu)/(n()+l))
predicted_ratings <-
validation %>%
left_join(b_i, by = “movieId”) %>%
left_join(b_u, by = “userId”) %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
return(RMSE(predicted_ratings, validation$rating))
})
qplot(lambdas, rmses)
lambda <- lambdas[which.min(rmses)]
lambdarmse_results <- bind_rows(rmse_results,
tibble(method=”Regularized movie and user effect model”,
RMSE = min(rmses)))
rmse_results %>% knitr::kable()
#Results
#For the average movie rating model that we generated first, the result was 1.0606506.
#After accounting for movie effects, we lowered the average to .9437046. In order to lower
#the RMSE even more, we added both the movie and user effects with the result of .8655329.
#Finally, we used regularization to penalize samples with few ratings and got the
#final result of .8649857.
#Conclusion
#In conclusion, we downloaded the data set and prepared it for analysis.
#We looked for various insights and created a simple model from the mean of the
#observed ratings. After that, we added the movie and user effects in an attempt
#to model user behavior. Finally, we conducted regularization that added a
#penalty for the movies and users with the least number of ratings. We achieved
#a model with an RMSE of 0.8649857.
print(“Operating System:”)
version
MOViELENSSCRIPT.R
jenniferyoung
2021-04-17
#Jennifer Young
#Movie Recommendation Project
#Recommendation systems are used more and more, as consumers expect suggestions based
#on their known likes so that they can discover new likes in products, movies, music
#and other interests. They assist users in finding what they might be interested in
#based on their preferences and previous interactions. In this report, a movie
#recommendation system using the MovieLens dataset from HarvardX’s Data Science
#Professional Certificate3 program will be covered. GroupLens Research is the
#organization that collected the data sets for this project from their site:
#(https://movielens.org).
## First specify the packages of interest
packages = c(“tidyverse”, “caret”,
“ggplot2”)
## Now load or install&load all
package.check %
group_by(title) %>%
summarize(rating = rating, n_rating = count) %>%
slice(1:20) %>%
knitr::kable()
title
100 Feet (2008)
4 (2005)
5 Centimeters per Second (Byôsoku 5 senchimêtoru) (2007)
Accused (Anklaget) (2005)
Ace of Hearts (2008)
Ace of Hearts, The (1921)
Adios, Sabata (Indio Black, sai che ti dico: Sei un gran figlio di. . . ) (1971)
Africa addio (1966)
Archangel (1990)
Bad Blood (Mauvais sang) (1986)
Battle of Russia, The (Why We Fight, 5) (1943)
Bell Boy, The (1918)
Black Tights (1-2-3-4 ou Les Collants noirs) (1960)
Blind Shaft (Mang jing) (2003)
5
rating
n_rating
2.0
2.5
3.5
0.5
2.0
3.5
1.5
3.0
2.5
4.5
3.5
4.0
3.0
2.5
1
1
1
1
1
1
1
1
1
1
1
1
1
1
title
Blue Light, The (Das Blaue Licht) (1932)
Borderline (1950)
Boys Life 4: Four Play (2003)
Brothers of the Head (2005)
Caótica Ana (2007)
Chapayev (1934)
rating
n_rating
5.0
3.0
3.0
2.5
4.5
1.5
1
1
1
1
1
1
#User ratings (Histogram)
edx %>%
count(userId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 25, color = “green”) +
scale_x_log10() +
xlab(“Number of ratings”) +
ylab(“Number of users”) +
ggtitle(“Number of ratings given by users”)
Number of ratings given by users
8000
Number of users
6000
4000
2000
0
10
100
1000
Number of ratings
#Mean user ratings
edx %>%
group_by(userId) %>%
filter(n() >= 100) %>%
summarize(b_u = mean(rating)) %>%
ggplot(aes(b_u)) +
geom_histogram(bins = 25, color = “white”) +
6
10000
xlab(“Mean rating”) +
ylab(“Number of users”) +
ggtitle(“Mean movie ratings given by users”) +
scale_x_discrete(limits = c(seq(0.5,5,0.5))) +
theme_light()
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(…)` or `scale_*_continuous()`?
Mean movie ratings given by users
4000
Number of users
3000
2000
1000
0
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5
5.0
Mean rating
#compute the RMSE
RMSE %
pull(b_i)
model_1_rmse %
group_by(userId) %>%
filter(n() >= 100) %>%
summarize(b_u = mean(rating – mu – b_i))
user_avgs%>% qplot(b_u, geom =”histogram”, bins = 25, data = ., color = I(“magenta”))
4000
3000
2000
1000
0
−2
−1
0
1
b_u
user_avgs %
left_join(movie_avgs, by=’movieId’) %>%
group_by(userId) %>%
summarize(b_u = mean(rating – mu – b_i))
predicted_ratings %
left_join(movie_avgs, by=’movieId’) %>%
left_join(user_avgs, by=’userId’) %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
model_2_rmse %
left_join(b_u, by = “userId”) %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
})
return(RMSE(predicted_ratings, validation$rating))
qplot(lambdas, rmses)
10
0.8655329
rmses
0.8654
0.8652
0.8650
0.0
2.5
5.0
7.5
10.0
lambdas
lambda