# R Code for Introduction to Research in Marketing
# R Programming
# Week 1
# Import boxoffice data:
# Install each package only when it is missing, so re-running the script does
# not download and reinstall it every time.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(data.table)
library(readxl)
# NOTE(review): machine-specific absolute path. The relative file names used
# for reading and writing in this script are resolved against this directory.
setwd("/Users/rafaelhoutepen/Downloads/IRM")
boxofficemojo_com <- read_excel("boxofficemojo.com.xlsx")
# Convert to a data.table by reference (no copy is made).
setDT(boxofficemojo_com)
View(boxofficemojo_com)
summary(boxofficemojo_com)
# Import the imdb data:
# Install readr only when it is missing instead of on every run.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
library(readr)
imdb_com <- read_csv("imdb.com.csv")
# Convert to a data.table by reference so := can be used below.
setDT(imdb_com)
View(imdb_com)
summary(imdb_com)
# Add a numeric copy of the budget column; as.numeric() yields NA for any
# value it cannot interpret as a number.
imdb_com[, budget_num := as.numeric(imdb.com_budget)]
# Merge the two data sets:
# Left join: all.x = TRUE keeps every box-office row, and the IMDb columns are
# attached wherever the two id columns match.
movies <- merge(
  x = boxofficemojo_com,
  y = imdb_com,
  by.x = "boxofficemojo.com_imdb.com_id",
  by.y = "imdb.com_id",
  all.x = TRUE
)
View(movies)
# Save the workspace and the newly created data set:
# Snapshot of every object currently in the global environment.
save.image("Data.RData")
# The merged data set is written out both as CSV and as an Excel workbook.
write_csv(movies, "movies.csv")
# Install writexl only when it is missing instead of on every run.
if (!requireNamespace("writexl", quietly = TRUE)) {
  install.packages("writexl")
}
library(writexl)
write_xlsx(movies, "movies.xlsx")
# Visualization:
# Distribution of the opening gross.
boxplot(movies$boxofficemojo.com_openinggross)
# Frequency table of MPAA ratings, computed once and reused for both bar plots.
rating_counts <- table(movies$boxofficemojo.com_MPAArating)
rating_counts
barplot(rating_counts)
# prop.table() divides each count by the total; times 100 gives percentages.
barplot(prop.table(rating_counts) * 100)
# Install ggplot2 only when it is missing instead of on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# The same two bar charts with ggplot2: raw counts, then percentages.
ggplot(movies, aes(boxofficemojo.com_MPAArating)) +
  geom_bar()
ggplot(movies, aes(boxofficemojo.com_MPAArating)) +
  geom_bar(aes(y = after_stat(count) / sum(after_stat(count)) * 100)) +
  ylab("percentage")
# Bivariate visualization:
# Dummy variable: 1 when the movie is rated R, 0 otherwise. %in% never returns
# NA, so movies with a missing rating get 0 directly and no separate NA
# back-fill step is needed.
movies[, boxofficemojo.com_MPAArating_R :=
  as.numeric(boxofficemojo.com_MPAArating %in% "R")]
# Opening gross for R-rated versus all other movies.
ggplot(
  movies,
  aes(
    x = as.factor(boxofficemojo.com_MPAArating_R),
    y = boxofficemojo.com_openinggross
  )
) +
  geom_boxplot()
# Opening gross by "based on a book"; movies with a missing flag are included.
ggplot(
  movies,
  aes(
    x = as.factor(imdb.com_basedonbook),
    y = boxofficemojo.com_openinggross
  )
) +
  geom_boxplot()
# The same plot after dropping the movies whose flag is missing.
ggplot(
  movies[!is.na(imdb.com_basedonbook), ],
  aes(
    x = as.factor(imdb.com_basedonbook),
    y = boxofficemojo.com_openinggross
  )
) +
  geom_boxplot()
# Budget against opening gross.
ggplot(movies, aes(x = budget_num, y = boxofficemojo.com_openinggross)) +
  geom_point()
# Aggregate and then plot:
# Mean opening gross per year, built up in three attempts; each assignment
# replaces the previous `temp`.
# Attempt 1: plain mean, without na.rm.
temp <- movies[
  ,
  .(boxofficemojo.com_openinggross_mean = mean(boxofficemojo.com_openinggross)),
  by = "imdb.com_year"
]
# Attempt 2: missing opening-gross values are ignored within each year.
temp <- movies[
  ,
  .(boxofficemojo.com_openinggross_mean =
    mean(boxofficemojo.com_openinggross, na.rm = TRUE)),
  by = "imdb.com_year"
]
# Attempt 3: movies with an unknown year are dropped as well.
temp <- movies[
  !is.na(imdb.com_year),
  .(boxofficemojo.com_openinggross_mean =
    mean(boxofficemojo.com_openinggross, na.rm = TRUE)),
  by = "imdb.com_year"
]
# Sort by year (ascending, by reference) before plotting.
setorder(temp, imdb.com_year)
ggplot(
  temp,
  aes(x = imdb.com_year, y = boxofficemojo.com_openinggross_mean)
) +
  geom_line()
# Hypothesis testing:
# Mean opening gross per group, for movies with a known "based on a book" flag.
movies[
  !is.na(imdb.com_basedonbook),
  .(boxofficemojo.com_openinggross_mean =
    mean(boxofficemojo.com_openinggross, na.rm = TRUE)),
  by = "imdb.com_basedonbook"
]
# Install car only when it is missing instead of on every run.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
# Levene's test for equal variances across the two groups, centred on the mean.
leveneTest(
  boxofficemojo.com_openinggross ~ as.factor(imdb.com_basedonbook),
  movies,
  center = mean
)
# Two-sample t-test that assumes equal variances (var.equal = TRUE).
t.test(
  boxofficemojo.com_openinggross ~ imdb.com_basedonbook,
  movies,
  var.equal = TRUE
)
# Question 1:
# Movies that opened in at least 500 theaters: stored, printed, then opened in
# the data viewer.
wide_release_movies <- movies[boxofficemojo.com_openingtheaters >= 500, ]
wide_release_movies
View(wide_release_movies)
# Question 2:
# Create a new dataset without NAs in 'imdb.com_genres'.
# The rows are filtered instead of overwriting the column with na.omit()
# output: na.omit() returns a shorter vector whenever NAs are present, and a
# column cannot be replaced by a vector of a different length.
wide_release_movies_no_na <- wide_release_movies[!is.na(imdb.com_genres)]
library(dplyr)