# # Predicting bankruptcy in the telecommunications industry # bankruptcy <- read.csv("c:/class/ascii/bankruptcy.csv") attach(bankruptcy) bankruptcy boxplot(WC.TA ~ Bankrupt,xlab="Bankrupt",ylab="WC.TA") boxplot(RE.TA ~ Bankrupt,xlab="Bankrupt",ylab="RE.TA") boxplot(EBIT.TA ~ Bankrupt,xlab="Bankrupt",ylab="EBIT.TA") boxplot(S.TA ~ Bankrupt,xlab="Bankrupt",ylab="S.TA") boxplot(BVE.BVL ~ Bankrupt,xlab="Bankrupt",ylab="BVE.BVL") # # Logistic regression is fit using the glm() function, telling R that the family is binomial. Note # that R does not give likelihood ratio tests, but rather Wald tests; likelihood ratio # tests are determined using the drop1() command. # bank1 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial, maxit=500) summary(bank1) # # This is the likelihood ratio test for whether all slopes equal 0 # gstat <- bank1\$null.deviance - deviance(bank1) cbind(gstat, 1-pchisq(gstat,length(coef(bank1))-1)) # # Odds ratios # exp(coef(bank1)[-1]) # # You can get a 95% confidence interval for the odds ratio by exponentiating the ends of a confidence # interval for the slope # exp(confint.default(bank1)) # # The drop1() command gives the likelihood ratio tests for each slope, along with AIC values for the # model that omits that variable # drop1(bank1, test="LRT") # # The car package gives the (approximate) VIF values # library(car) vif(bank1) # # The AIC value given here is not the same as that given by Minitab, but comparisons between AIC values # for different models will be the same (which is all that matters). # AIC(bank1) # # The ResourceSelection package gives the Hosmer-Lemeshow test. It will not give the same results as that in Minitab # when there are ties in the data (that is, multiple observations with the same estimated response). You can also print out the # underlying counts that determine the HL test, as is optional in Minitab. # library(ResourceSelection) hoslem.test(Bankrupt, fitted(bank1)) # # The rms library includes a different test that is appropriate for 0/1 response data with no replications. To use it, # first use the lrm() command in the rms library to fit the model, and then use the residuals() command to get the test. # This function also gives all of the summary measures of association in the stats component, with Somers D # being represented by Dxy # library(rms) bank1.lrm <- lrm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, x=T, y=T, data=bankruptcy) residuals(bank1.lrm, type="gof") bank1.lrm\$stats library(leaps) leaps(cbind(WC.TA, RE.TA, EBIT.TA, S.TA, BVE.BVL),Bankrupt,nbest=2) # # The bestglm library will perform best subsets for generalized linear models (including logistic regression), and also # allows factor (categorical) variables, which leaps() does not. The default criterion used is BIC, but AIC can be # requested as an option (from which AICc can be calculated if wished). There really is no reason to use the "quick and # dirty" OLS-based leaps() in R, since bestglm gives the actual best subsets; you should use bestglm. Don't forget # the "family=binomial" specification! Note also that bestglm counts parameters slightly differently in its calculation of # AIC, but once again all comparisons between models will be the same. # library(bestglm) logitbest <- bestglm(data.frame(cbind(WC.TA, RE.TA, EBIT.TA, S.TA, BVE.BVL),Bankrupt), IC="AIC", family=binomial) logitbest\$Subsets bank2.3 <- glm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500, x=T) summary(bank2.3) gstat <- bank2.3\$null.deviance - deviance(bank2.3) cbind(gstat, 1-pchisq(gstat,length(coef(bank2.3))-1)) exp(coef(bank2.3)[-1]) drop1(bank2.3, test="LRT") vif(bank2.3) hoslem.test(Bankrupt, fitted(bank2.3)) bank2.3.lrm <- lrm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, x=T, y=T, data=bankruptcy) residuals(bank2.3.lrm, type="gof") bank2.3.lrm\$stats # # The glm.diag() function from the boot package gives diagnostics for generalized linear models, while the # glm.diag.plots() function gives diagnostic plots. The rp, h, and cook attributes refer to standardized # Pearson residuals, leverage values, and Cook's distances, respectively. The built-in functions for # hatvalues() and cooks.distance() will work for glm objects as well, but the standardized residuals # produced by rstandard() are not the standardized Pearson residuals. # library(boot) bankdiag2 <- glm.diag(bank2.3) spearson2 <- residuals(bank2.3, type="pearson")/sqrt(1-bankdiag2\$h) cbind(spearson2,bankdiag2\$cook,bankdiag2\$h) plot(fitted(bank2.3), rstandard(bank2.3, type="pearson"), xlab="Estimated probabilities", ylab="Standardized Pearson resids") bank2.4 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500, x=T) summary(bank2.4) gstat <- bank2.4\$null.deviance - deviance(bank2.4) cbind(gstat, 1-pchisq(gstat,length(coef(bank2.4))-1)) exp(coef(bank2.4)[-1]) drop1(bank2.4, test="LRT") vif(bank2.4) hoslem.test(Bankrupt, fitted(bank2.4)) bank2.4.lrm <- lrm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, x=T, y=T, data=bankruptcy) residuals(bank2.4.lrm, type="gof") bank2.4.lrm\$stats bankdiag2 <- glm.diag(bank2.4) spearson2 <- residuals(bank2.4, type="pearson")/sqrt(1-bankdiag2\$h) cbind(spearson2,bankdiag2\$cook,bankdiag2\$h) plot(fitted(bank2.4), rstandard(bank2.4, type="pearson"), xlab="Estimated probabilities", ylab="Standardized Pearson resids") bankrupt2 <- bankruptcy[-1,] attach(bankrupt2) # # The results aren't exactly the same when there is separation, but the symptoms of the problem # are similar # bank3 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial, maxit=500) summary(bank3) logitbest2 <- bestglm(data.frame(cbind(WC.TA, RE.TA, EBIT.TA, S.TA, BVE.BVL),Bankrupt), IC="AIC", family=binomial) logitbest2\$Subsets # # AIC definitely prefers the four-predictor model, which fits perfectly. A reasonable argument is that since this is # unrealistic as a statement about the true population, the simpler three-predictor model should be used instead. # bank4 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500, x=T) summary(bank4) gstat <- bank4\$null.deviance - deviance(bank4) cbind(gstat, 1-pchisq(gstat,length(coef(bank4))-1)) exp(coef(bank4)[-1]) drop1(bank4, test="LRT") vif(bank4) hoslem.test(Bankrupt, fitted(bank4)) bank4.lrm <- lrm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, x=T, y=T, data=bankrupt2) residuals(bank4.lrm, type="gof") bank4.lrm\$stats bankdiag4 <- glm.diag(bank4) spearson4 <- residuals(bank4, type="pearson")/sqrt(1-bankdiag4\$h) bankfit4 <- predict(bank4,type="response") cbind(spearson4,bankdiag4\$cook,bankdiag4\$h,bankfit4) plot(fitted(bank4), rstandard(bank4, type="pearson"), xlab="Estimated probabilities", ylab="Standardized Pearson resids") # # Classification table; the model classifies perfectly on these values, but don't forget that it got the outlier wrong # bankrupt.predict <- as.numeric(fitted(bank4) > .5) table(Bankrupt, bankrupt.predict) # # If we had a set of new (validation) data we could check the ability of the model to make future classifications. # Say newdata is a data set (data frame) based on other companies, with all of the same variable names as were # used in the original model fit. Probabilities of bankruptcy for the new data are obtained using the predict() # command: # # pred.new <- predict(bank4, newdata, type="response") # # A classification table would be constructed the same way as when it is based on the original data: # # newclass.predict <- as.numeric(pred.new > .5) # table(newdata\$Bankrupt, newclass.predict) # # This is also how you would get estimated probabilities (and hence classifications) for any leverage points that you # omitted in your original fitting, so that you can produce a correct reported correct classification rate. # # To get prospective probability estimates, adjust the retrospective logits and the transform back. # The retrospective logits are obtained from the predict() function. # # VERY IMPORTANT NOTE HERE: since the model fits perfectly on these data, these will just be 0 or 1 anyway. In most # situations, of course, that would not be the case, and the calculation below is a viable one. # prosplogit <- predict(bank4) + log((.1*25)/(.9*24)) prospprob <- exp(prosplogit)/(1 + exp(prosplogit)) prospprob