#
# Predicting total movie grosses after one week
#
# Load the 2013 movie data. attach() puts its columns on the search path, so
# the rest of the script can refer to them by bare name.
movie13 <- read.csv(file = "c:/class/ascii/movie13.csv")
attach(movie13)
#
# Raw-scale look at the two gross variables: a histogram of each, then a
# scatter plot of total gross against opening gross.
#
hist(x = Opening.Gross)
hist(x = Total.Gross)
plot(x = Opening.Gross, y = Total.Gross)
#
# Base-10 logs of both variables, and the same scatter plot on the log scale.
#
Log.domestic <- log10(Total.Gross)
Log.1st.weekend <- log10(Opening.Gross)
plot(x = Log.1st.weekend, y = Log.domestic)
#
# Simple regression of logged total gross on logged opening gross.
#
movie1.lm <- lm(formula = Log.domestic ~ Log.1st.weekend)
summary(movie1.lm)
#
# Again, I do NOT recommend using the plot() command to get residual plots, as the ones that
# are produced are nonstandard in various ways. In particular, it does NOT provide a plot of standardized
# residuals versus fitted values. You should use direct commands to plot the appropriate plots, or (even
# more conveniently) the fourinone function that I gave earlier (if you do this, BE SURE to ask for the
# residuals in the plots to be standardized residuals!).
#
# This is a way to get the standardized residuals, which can of course be useful for the purpose of
# examining them more carefully later
#
stdres <- rstandard(model = movie1.lm)
#
# One diagnostic plot at a time: standardized residuals against fitted values,
# then a normal Q-Q plot and a histogram of the standardized residuals.
#
plot(
  x = fitted(movie1.lm),
  y = stdres,
  xlab = "Fitted values",
  ylab = "Standardized residuals"
)
qqnorm(y = stdres)
hist(x = stdres)
#
# The same diagnostics in a single four-in-one display. fourinone() is defined
# outside this section; stdres = TRUE asks it for standardized residuals.
#
fourinone(movie1.lm, stdres = TRUE)
#
# Leverage values and Cook's distances are obtained as follows
#
# Compute the leverage values and Cook's distances once; they are printed here
# and reused for the index plots below.
lev1 <- hatvalues(movie1.lm)
cook1 <- cooks.distance(movie1.lm)
lev1
cook1
#
# Index plots of the diagnostics, with dashed lines marking the rough guidelines.
#
# n is kept because later code in this script refers to it.
n <- nrow(movie13)
# The plotting index is taken from each diagnostic vector itself rather than
# from nrow(movie13): if lm() dropped any incomplete rows, the diagnostics are
# shorter than the data frame and plot(1:n, ...) would fail.
plot(seq_along(stdres), stdres, type = "b", xlab = "Index", ylab = "Standardized residuals")
abline(h = c(-2.5, 2.5), lty = 2)
plot(seq_along(lev1), lev1, type = "b", xlab = "Index", ylab = "Leverage values")
# Leverage guideline 2.5 * (p + 1) / n, where p + 1 is the number of estimated
# coefficients (intercept included) and n is the number of cases in the fit.
# For this model it is approximately the 0.034 that used to be hard-coded.
abline(h = 2.5 * length(coef(movie1.lm)) / length(lev1), lty = 2)
plot(seq_along(cook1), cook1, type = "b", xlab = "Index", ylab = "Cook's D")
abline(h = 1, lty = 2)
#
# Second model: add the Rotten Tomatoes audience score as a predictor, then
# repeat the same diagnostics as for the one-predictor model.
#
movie2.lm <- lm(Log.domestic ~ Log.1st.weekend + Rotten.Tomatoes.Audience.Score)
summary(movie2.lm)
fourinone(movie2.lm, stdres = TRUE)
# stdres is deliberately overwritten: from here on it refers to movie2.lm.
stdres <- rstandard(movie2.lm)
lev2 <- hatvalues(movie2.lm)
cook2 <- cooks.distance(movie2.lm)
# The plotting index is taken from each diagnostic vector itself rather than
# from the row count of the data frame: if lm() dropped any incomplete rows
# (for example a missing audience score), the lengths would no longer agree.
plot(seq_along(stdres), stdres, type = "b", xlab = "Index", ylab = "Standardized residuals")
abline(h = c(-2.5, 2.5), lty = 2)
plot(seq_along(lev2), lev2, type = "b", xlab = "Index", ylab = "Leverage values")
# Leverage guideline 2.5 * (p + 1) / n, where p + 1 is the number of estimated
# coefficients (intercept included) and n is the number of cases in the fit.
# For this model it is approximately the 0.051 that used to be hard-coded.
abline(h = 2.5 * length(coef(movie2.lm)) / length(lev2), lty = 2)
plot(seq_along(cook2), cook2, type = "b", xlab = "Index", ylab = "Cook's D")
abline(h = 1, lty = 2)
#
# The package ForwardSearch implements outlier identification methods designed to avoid masking and swamping effects. They are based on those
# described in Hadi and Simonoff (2013), although they are implemented in a different way. Don't forget that you have to install the
# package before you can run it. I'm not recommending that you use this, but in case you are interested, here is an example of its application.
#
library(ForwardSearch)
#
# You have to create a matrix of all of the predictors used in the regression
#
# Bind the two predictors column-wise into a matrix X, pass X and the logged
# total gross to ForwardSearch.fit(), and print the returned object.
X <- cbind(Log.1st.weekend, Rotten.Tomatoes.Audience.Score)
outlierFit <- ForwardSearch.fit(X, Log.domestic)
outlierFit
#
# The following plot suggests stopping at 133 points, as then the residual goes outside the guidance lines.
#
ForwardSearch.plot(outlierFit)
#
# Here are the estimated least squares coefficients after trimming 14 points (that is, stopping at 133 points):
#
ForwardSearch.stopped(outlierFit, 133)
#
# They're not that different from the estimates on all of the observations, but the following rows are flagged:
# 12, 119, 73, 141, 1, 69, 19, 11, 93, 26, 82, 34, 130, 86; these do include the four movies with largest residuals from the fit
# on all of the data, but I don't know that I buy that they are actually outliers.
#
#
# Use the one-predictor model to predict total gross for the 2014 movies from
# their opening gross, with prediction intervals.
#
movie14 <- read.csv("c:/class/ascii/movie14.csv")
# The column name must match the predictor name used when movie1.lm was fitted.
newmovie <- data.frame(Log.1st.weekend = log10(movie14$Opening.Gross))
pred14 <- predict(movie1.lm, newmovie, interval = "prediction")
# predict() returns columns "fit", "lwr" and "upr" on the log10 scale; the
# interval limits are back-transformed to the original scale. Selecting the
# columns by name avoids relying on their position.
lower14 <- 10^pred14[, "lwr"]
upper14 <- 10^pred14[, "upr"]
# Lower limit, actual total gross, and upper limit for each movie.
data.frame(
  Name = movie14$Name,
  Lower = lower14,
  Actual = movie14$Total.Gross,
  Upper = upper14
)
# Index over however many movies the file contains, not a fixed count of 22.
idx14 <- seq_len(nrow(pred14))
plot(idx14, lower14, type = "b", pch = 1, lty = 1, ylim = c(0, 500000000),
     xlab = "Index", ylab = "Total Gross")
lines(idx14, movie14$Total.Gross, type = "b", pch = 2, lty = 2)
lines(idx14, upper14, type = "b", pch = 3, lty = 3)