Harvard University GroupLens Research and Rating Distribution RStudio Project

#Jennifer Young

#Movie Recommendation Project

#Recommendation systems are used more and more, as consumers expect suggestions based

#on their known likes so that they can discover new likes in products, movies,

sic

#and other interests. They assist users in finding what they might be interested in

#based on their preferences and previous interactions. In this report, a movie

#recommendation system using the MovieLens dataset from HarvardX’s Data Science

#Professional Certificate3 program will be covered. GroupLens Research is the

#organization that collected the data sets for this project from their site:

#(https://movielens.org

)

## First specify the packages of interest

packages = c(“tidyverse”, “caret”,

“ggplot2”)

## Now load or install&load all

package.check <- lapply(

packages,

FUN = function(x) {

if (!require(x, character.only = TRUE)) {

install.packages(x, dependencies = TRUE)

library(x, character.only = TRUE)

}

)

library(tidyverse)

library(caret)

library(ggplot2)

dl <- tempfile()

download.file(“http://files.grouplens.org/datasets/movielens/ml-10m.zip”, dl)

ratings <- read.table(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),

col.names = c(“userId”, “movieId”, “rating”, “timestamp”))

movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)

colnames(movies) <- c("movieId", "title", "genres")

movies <- as.data.frame(movies) %>% mutate(movieId = as.integer(movieId),

title = as.character(title),

genres = as.character(genres))

movielens <- left_join(ratings, movies, by = "movieId")

#Methods and Analysis

#There are five steps in the data analysis process that must be completed.

#In this case, the data must be prepared. The dataset from was downloaded from

#the MovieLens website and split into two subsets used for training and validation.

#In this case, we named the training set “edx” and the validation set “validation”.

#For training and testing, the edx set was split again into two subsets.

#The edx set is trained with the model when it reaches the RMSE goal and the

#validation set is used for final validation. During data exploration and

#visualization, charts are crated to understand the data and how it affects

#the outcome. We observe the mean of observed values, the distribution of ratings,#

#mean movie ratings, movie effect, user effect and number of ratings per movie.

#We improve the RMSE by including the user and movie effects and applying the

#regularization parameter for samples that have few ratings.

# The Validation subset will be 10% of the MovieLens data.

set.seed(1)

test_index <- createDataPartition(y = movielens$rating, times = 1, p = 0.1, list = FALSE)

edx <- movielens[-test_index,]

temp <- movielens[test_index,]

#Make sure userId and movieId in validation set are also in edx subset:

validation <- temp %>%

semi_join(edx, by = “movieId”) %>%

semi_join(edx, by = “userId”)

# Add rows removed from validation set back into edx set

removed <- anti_join(temp, validation)

edx <- rbind(edx, removed)

rm(dl, ratings, movies, test_index, temp, movielens, removed)

# lists six variables “userID”, “movieID”, “rating”, “timestamp”, “title”, and “genres” in data frame

head(edx) %>%

print.data.frame()

#Looking for missing values

summary(edx)

#unique movies and users in the edx subset

edx %>%

summarize(n_users = n_distinct(userId),

n_movies = n_distinct(movieId))

#distribution of ratings (histogram)

edx %>%

ggplot(aes(rating)) +

geom_histogram(binwidth = 0.25, color = “red”) +

scale_x_discrete(limits = c(seq(0.5,5,0.5))) +

scale_y_continuous(breaks = c(seq(0, 3000000, 500000))) +

ggtitle(“Rating distribution”)

#ratingspermovie (Histogram)

edx %>%

count(movieId) %>%

ggplot(aes(n)) +

geom_histogram(bins = 25, color = “yellow”) +

scale_x_log10() +

xlab(“Number of ratings”) +

ylab(“Number of movies”) +

ggtitle(“Number of ratings per movie”)

#movies rated once (chart)

edx %>%

group_by(movieId) %>%

summarize(count = n()) %>%

filter(count == 1) %>%

left_join(edx, by = “movieId”) %>%

group_by(title) %>%

summarize(rating = rating, n_rating = count) %>%

slice(1:20) %>%

knitr::kable()

#User ratings (Histogram)

edx %>%

count(userId) %>%

ggplot(aes(n)) +

geom_histogram(bins = 25, color = “green”) +

scale_x_log10() +

xlab(“Number of ratings”) +

ylab(“Number of users”) +

ggtitle(“Number of ratings given by users”)

#Mean user ratings

edx %>%

group_by(userId) %>%

filter(n() >= 100) %>%

summarize(b_u = mean(rating)) %>%

ggplot(aes(b_u)) +

geom_histogram(bins = 25, color = “white”) +

xlab(“Mean rating”) +

ylab(“Number of users”) +

ggtitle(“Mean movie ratings given by users”) +

scale_x_discrete(limits = c(seq(0.5,5,0.5))) +

theme_light()

#compute the RMSE

RMSE <- function(true_ratings, predicted_ratings){

sqrt(mean((true_ratings – predicted_ratings)^2))

}

#Average movie rating model

mu <- mean(edx$rating)

naive_rmse

<- RMSE(validation$rating, mu)

naive_rmse

rmse_results <- tibble(method = "Average movie rating model", RMSE = naive_rmse)

rmse_results %>% knitr::kable()

#Movie effect model

movie_avgs <- edx %>%

group_by(movieId) %>%

summarize(b_i = mean(rating – mu))

movie_avgs %>% qplot(b_i, geom =”histogram”, bins = 10, data = ., color = I(“red”),

ylab = “Number of movies”, main = “Number of movies with the computed b_i”)

predicted_ratings <- mu + validation %>%

left_join(movie_avgs, by=’movieId’) %>%

pull(b_i)

model_1_rmse <- RMSE(predicted_ratings, validation$rating)

rmse_results <- bind_rows(rmse_results,

tibble(method=”Movie effect model”,

RMSE = model_1_rmse ))

rmse_results %>% knitr::kable()

#Movie and user effect model

user_avgs<- edx %>%

left_join(movie_avgs, by=’movieId’) %>%

group_by(userId) %>%

filter(n() >= 100) %>%

summarize(b_u = mean(rating – mu – b_i))

user_avgs%>% qplot(b_u, geom =”histogram”, bins = 25, data = ., color = I(“magenta”))

user_avgs <- edx %>%

left_join(movie_avgs, by=’movieId’) %>%

group_by(userId) %>%

summarize(b_u = mean(rating – mu – b_i))

predicted_ratings <- validation%>%

left_join(movie_avgs, by=’movieId’) %>%

left_join(user_avgs, by=’userId’) %>%

mutate(pred = mu + b_i + b_u) %>%

pull(pred)

model_2_rmse <- RMSE(predicted_ratings, validation$rating)

rmse_results <- bind_rows(rmse_results,

tibble(method=”Movie and user effect model”,

RMSE = model_2_rmse))

rmse_results %>% knitr::kable()

#Regularized movie and user effect model

lambda

s <- seq(0, 10, 0.25)

rmses <- sapply(lambdas, function(l){

mu <- mean(edx$rating)

b_i <- edx %>%

group_by(movieId) %>%

summarize(b_i = sum(rating – mu)/(n()+l))

b_u <- edx %>%

left_join(b_i, by=”movieId”) %>%

group_by(userId) %>%

summarize(b_u = sum(rating – b_i – mu)/(n()+l))

predicted_ratings <-

validation %>%

left_join(b_i, by = “movieId”) %>%

left_join(b_u, by = “userId”) %>%

mutate(pred = mu + b_i + b_u) %>%

pull(pred)

return(RMSE(predicted_ratings, validation$rating))

})

qplot(lambdas, rmses)

lambda <- lambdas[which.min(rmses)]

lambdarmse_results <- bind_rows(rmse_results,

tibble(method=”Regularized movie and user effect model”,

RMSE = min(rmses)))

rmse_results %>% knitr::kable()

#Results

#For the average movie rating model that we generated first, the result was 1.0606506.

#After accounting for movie effects, we lowered the average to .9437046. In order to lower

#the RMSE even more, we added both the movie and user effects with the result of .8655329.

#Finally, we used regularization to penalize samples with few ratings and got the

#final result of .8649857.

#Conclusion

#In conclusion, we downloaded the data set and prepared it for analysis.

#We looked for various insights and created a simple model from the mean of the

#observed ratings. After that, we added the movie and user effects in an attempt

#to model user behavior. Finally, we conducted regularization that added a

#penalty for the movies and users with the least number of ratings. We achieved

#a model with an RMSE of 0.8649857.

print(“Operating System:”)

version

MOViELENSSCRIPT.R
jenniferyoung
2021-04-17
#Jennifer Young
#Movie Recommendation Project
#Recommendation systems are used more and more, as consumers expect suggestions based
#on their known likes so that they can discover new likes in products, movies, music
#and other interests. They assist users in finding what they might be interested in
#based on their preferences and previous interactions. In this report, a movie
#recommendation system using the MovieLens dataset from HarvardX’s Data Science
#Professional Certificate3 program will be covered. GroupLens Research is the
#organization that collected the data sets for this project from their site:
#(https://movielens.org).
## First specify the packages of interest
packages = c(“tidyverse”, “caret”,
“ggplot2”)
## Now load or install&load all
package.check %
group_by(title) %>%
summarize(rating = rating, n_rating = count) %>%
slice(1:20) %>%
knitr::kable()
title
100 Feet (2008)
4 (2005)
5 Centimeters per Second (Byôsoku 5 senchimêtoru) (2007)
Accused (Anklaget) (2005)
Ace of Hearts (2008)
Ace of Hearts, The (1921)
Adios, Sabata (Indio Black, sai che ti dico: Sei un gran figlio di. . . ) (1971)
Africa addio (1966)
Archangel (1990)
Bad Blood (Mauvais sang) (1986)
Battle of Russia, The (Why We Fight, 5) (1943)
Bell Boy, The (1918)
Black Tights (1-2-3-4 ou Les Collants noirs) (1960)
Blind Shaft (Mang jing) (2003)
5
rating
n_rating
2.0
2.5
3.5
0.5
2.0
3.5
1.5
3.0
2.5
4.5
3.5
4.0
3.0
2.5
1
1
1
1
1
1
1
1
1
1
1
1
1
1
title
Blue Light, The (Das Blaue Licht) (1932)
Borderline (1950)
Boys Life 4: Four Play (2003)
Brothers of the Head (2005)
Caótica Ana (2007)
Chapayev (1934)
rating
n_rating
5.0
3.0
3.0
2.5
4.5
1.5
1
1
1
1
1
1
#User ratings (Histogram)
edx %>%
count(userId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 25, color = “green”) +
scale_x_log10() +
xlab(“Number of ratings”) +
ylab(“Number of users”) +
ggtitle(“Number of ratings given by users”)
Number of ratings given by users
8000
Number of users
6000
4000
2000
0
10
100
1000
Number of ratings
#Mean user ratings
edx %>%
group_by(userId) %>%
filter(n() >= 100) %>%
summarize(b_u = mean(rating)) %>%
ggplot(aes(b_u)) +
geom_histogram(bins = 25, color = “white”) +
6
10000
xlab(“Mean rating”) +
ylab(“Number of users”) +
ggtitle(“Mean movie ratings given by users”) +
scale_x_discrete(limits = c(seq(0.5,5,0.5))) +
theme_light()
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(…)` or `scale_*_continuous()`?
Mean movie ratings given by users
4000
Number of users
3000
2000
1000
0
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5
5.0
Mean rating
#compute the RMSE
RMSE %
pull(b_i)
model_1_rmse %
group_by(userId) %>%
filter(n() >= 100) %>%
summarize(b_u = mean(rating – mu – b_i))
user_avgs%>% qplot(b_u, geom =”histogram”, bins = 25, data = ., color = I(“magenta”))
4000
3000
2000
1000
0
−2
−1
0
1
b_u
user_avgs %
left_join(movie_avgs, by=’movieId’) %>%
group_by(userId) %>%
summarize(b_u = mean(rating – mu – b_i))
predicted_ratings %
left_join(movie_avgs, by=’movieId’) %>%
left_join(user_avgs, by=’userId’) %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
model_2_rmse %
left_join(b_u, by = “userId”) %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
})
return(RMSE(predicted_ratings, validation$rating))
qplot(lambdas, rmses)
10
0.8655329
rmses
0.8654
0.8652
0.8650
0.0
2.5
5.0
7.5
10.0
lambdas
lambda

Turn in your highest-quality paper
Get a qualified writer to help you with

“ Harvard University GroupLens Research and Rating Distribution RStudio Project ”

Get high-quality paper

Guarantee! All work is written by expert writers!

Still stressed from student homework?

Get quality assistance from academic writers!

Order now