#
# Predicting bankruptcy in the telecommunications industry
#
bankruptcy <- read.csv("c:/class/ascii/bankruptcy.csv")
attach(bankruptcy)
bankruptcy
boxplot(WC.TA ~ Bankrupt,xlab="Bankrupt",ylab="WC.TA")
boxplot(RE.TA ~ Bankrupt,xlab="Bankrupt",ylab="RE.TA")
boxplot(EBIT.TA ~ Bankrupt,xlab="Bankrupt",ylab="EBIT.TA")
boxplot(S.TA ~ Bankrupt,xlab="Bankrupt",ylab="S.TA")
boxplot(BVE.BVL ~ Bankrupt,xlab="Bankrupt",ylab="BVE.BVL")
#
# Logistic regression is fit using the glm() function, telling R that the family is binomial. Note
# that R does not give likelihood ratio tests, but rather Wald tests; likelihood ratio
# tests are determined using the drop1() command.
#
bank1 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial, maxit=500)
summary(bank1)
#
# This is the likelihood ratio test for whether all slopes equal 0
#
gstat <- bank1$null.deviance - deviance(bank1)
cbind(gstat, 1-pchisq(gstat,length(coef(bank1))-1))
#
# Odds ratios
#
exp(coef(bank1)[-1])
#
# You can get a 95% confidence interval for the odds ratio by exponentiating the ends of a confidence
# interval for the slope
#
exp(confint.default(bank1))
#
# The drop1() command gives the likelihood ratio tests for each slope, along with AIC values for the
# model that omits that variable
#
drop1(bank1, test="LRT")
#
# The car package gives the (approximate) VIF values
#
library(car)
vif(bank1)
#
# The AIC value given here is not the same as that given by Minitab, but comparisons between AIC values
# for different models will be the same (which is all that matters).
#
AIC(bank1)
#
# The ResourceSelection package gives the Hosmer-Lemeshow test. It will not give the same results as that in Minitab
# when there are ties in the data (that is, multiple observations with the same estimated response). You can also print out the
# underlying counts that determine the HL test, as is optional in Minitab.
#
library(ResourceSelection)
hoslem.test(Bankrupt, fitted(bank1))
#
# The rms library includes a different test that is appropriate for 0/1 response data with no replications. To use it,
# first use the lrm() command in the rms library to fit the model, and then use the residuals() command to get the test.
# This function also gives all of the summary measures of association in the stats component, with Somers D
# being represented by Dxy
#
library(rms)
bank1.lrm <- lrm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, x=T, y=T, data=bankruptcy)
residuals(bank1.lrm, type="gof")
bank1.lrm$stats
library(leaps)
leaps(cbind(WC.TA, RE.TA, EBIT.TA, S.TA, BVE.BVL),Bankrupt,nbest=2)
#
# The bestglm library will perform best subsets for generalized linear models (including logistic regression), and also
# allows factor (categorical) variables, which leaps() does not. The default criterion used is BIC, but AIC can be
# requested as an option (from which AICc can be calculated if wished). There really is no reason to use the "quick and
# dirty" OLS-based leaps() in R, since bestglm gives the actual best subsets; you should use bestglm. Don't forget
# the "family=binomial" specification! Note also that bestglm counts parameters slightly differently in its calculation of
# AIC, but once again all comparisons between models will be the same.
#
library(bestglm)
logitbest <- bestglm(data.frame(cbind(WC.TA, RE.TA, EBIT.TA, S.TA, BVE.BVL),Bankrupt), IC="AIC", family=binomial)
logitbest$Subsets
bank2.3 <- glm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500, x=T)
summary(bank2.3)
gstat <- bank2.3$null.deviance - deviance(bank2.3)
cbind(gstat, 1-pchisq(gstat,length(coef(bank2.3))-1))
exp(coef(bank2.3)[-1])
drop1(bank2.3, test="LRT")
vif(bank2.3)
hoslem.test(Bankrupt, fitted(bank2.3))
bank2.3.lrm <- lrm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, x=T, y=T, data=bankruptcy)
residuals(bank2.3.lrm, type="gof")
bank2.3.lrm$stats
#
# The glm.diag() function from the boot package gives diagnostics for generalized linear models, while the
# glm.diag.plots() function gives diagnostic plots. The rp, h, and cook attributes refer to standardized
# Pearson residuals, leverage values, and Cook's distances, respectively. The built-in functions for
# hatvalues() and cooks.distance() will work for glm objects as well, but the standardized residuals
# produced by rstandard() are not the standardized Pearson residuals.
#
library(boot)
bankdiag2 <- glm.diag(bank2.3)
spearson2 <- residuals(bank2.3, type="pearson")/sqrt(1-bankdiag2$h)
cbind(spearson2,bankdiag2$cook,bankdiag2$h)
plot(fitted(bank2.3), rstandard(bank2.3, type="pearson"), xlab="Estimated probabilities", ylab="Standardized Pearson resids")
bank2.4 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500, x=T)
summary(bank2.4)
gstat <- bank2.4$null.deviance - deviance(bank2.4)
cbind(gstat, 1-pchisq(gstat,length(coef(bank2.4))-1))
exp(coef(bank2.4)[-1])
drop1(bank2.4, test="LRT")
vif(bank2.4)
hoslem.test(Bankrupt, fitted(bank2.4))
bank2.4.lrm <- lrm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, x=T, y=T, data=bankruptcy)
residuals(bank2.4.lrm, type="gof")
bank2.4.lrm$stats
bankdiag2 <- glm.diag(bank2.4)
spearson2 <- residuals(bank2.4, type="pearson")/sqrt(1-bankdiag2$h)
cbind(spearson2,bankdiag2$cook,bankdiag2$h)
plot(fitted(bank2.4), rstandard(bank2.4, type="pearson"), xlab="Estimated probabilities", ylab="Standardized Pearson resids")
bankrupt2 <- bankruptcy[-1,]
attach(bankrupt2)
#
# The results aren't exactly the same when there is separation, but the symptoms of the problem
# are similar
#
bank3 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial, maxit=500)
summary(bank3)
logitbest2 <- bestglm(data.frame(cbind(WC.TA, RE.TA, EBIT.TA, S.TA, BVE.BVL),Bankrupt), IC="AIC", family=binomial)
logitbest2$Subsets
#
# AIC definitely prefers the four-predictor model, which fits perfectly. A reasonable argument is that since this is
# unrealistic as a statement about the true population, the simpler three-predictor model should be used instead.
#
bank4 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500, x=T)
summary(bank4)
gstat <- bank4$null.deviance - deviance(bank4)
cbind(gstat, 1-pchisq(gstat,length(coef(bank4))-1))
exp(coef(bank4)[-1])
drop1(bank4, test="LRT")
vif(bank4)
hoslem.test(Bankrupt, fitted(bank4))
bank4.lrm <- lrm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, x=T, y=T, data=bankrupt2)
residuals(bank4.lrm, type="gof")
bank4.lrm$stats
bankdiag4 <- glm.diag(bank4)
spearson4 <- residuals(bank4, type="pearson")/sqrt(1-bankdiag4$h)
bankfit4 <- predict(bank4,type="response")
cbind(spearson4,bankdiag4$cook,bankdiag4$h,bankfit4)
plot(fitted(bank4), rstandard(bank4, type="pearson"), xlab="Estimated probabilities", ylab="Standardized Pearson resids")
#
# Classification table; the model classifies perfectly on these values, but don't forget that it got the outlier wrong
#
bankrupt.predict <- as.numeric(fitted(bank4) > .5)
table(Bankrupt, bankrupt.predict)
#
# If we had a set of new (validation) data we could check the ability of the model to make future classifications.
# Say newdata is a data set (data frame) based on other companies, with all of the same variable names as were
# used in the original model fit. Probabilities of bankruptcy for the new data are obtained using the predict()
# command:
#
# pred.new <- predict(bank4, newdata, type="response")
#
# A classification table would be constructed the same way as when it is based on the original data:
#
# newclass.predict <- as.numeric(pred.new > .5)
# table(newdata$Bankrupt, newclass.predict)
#
# This is also how you would get estimated probabilities (and hence classifications) for any leverage points that you
# omitted in your original fitting, so that you can produce a correct reported correct classification rate.
#
# To get prospective probability estimates, adjust the retrospective logits and the transform back.
# The retrospective logits are obtained from the predict() function.
#
# VERY IMPORTANT NOTE HERE: since the model fits perfectly on these data, these will just be 0 or 1 anyway. In most
# situations, of course, that would not be the case, and the calculation below is a viable one.
#
prosplogit <- predict(bank4) + log((.1*25)/(.9*24))
prospprob <- exp(prosplogit)/(1 + exp(prosplogit))
prospprob