"hoslem.test"<- function(glm.obj, deciles = T, cut.values = NULL, print.table = F) { # # Computes the Hosmer-Lemeshow statistic for testing the # fit of a logistic regression model (by comparing estimated # probabilities across levels of the response variable) # # Required arguments: # glm.obj # an glm object with Binomial family and logit link # # Optional arguments: # deciles # logical flag indicated whether to use the deciles of risk # as the categories of risk # cut.values # a vector of values (in [0,1]) specifying the categories of risk # note that cut.values will over-ride deciles if deciles=T # and cut.values is specified; useful for things like # quantiles(glm.obj$fitted,c(0,.25,.5,.75,1)) # # print.table # a logical flag, when T the table of risk categories and observed # and expected values is printed # # Value: # an object of class "htest" with the Hosmer-Lemeshow statistic, the degrees of freedom, # and the p-value of the test # # Side Effects: # if print.table=T, a the table of observed and expected frequencies is printed # # Details: # The Hosmer-Lemeshow test is a chi-squared, goodness-of-fit # test on the fitted values (predicted probabilities) against # the response (0/1) in a logistic regression. The predicted # probabilites are discretized into n.group classes for the # analysis. Using the deciles of the predicted probabilities # is an attractive alternative and is used here as the default. # # Reference: # Hosmer, David W. and Lemeshow, Stanley (1989), Applied Logistic # Regression. New York: John Wiley & Sons # # Author: # Brad Biggerstaff, Ph.D. # Centers for Disease Control and Prevention # National Center for Infectious Diseases # Division of Vector-Borne Infectous Diseases # P.O. Box 2087, Fort Collins, CO 80522-2087 # (970) 221-6473 ... bkb5@cdc.gov # # Note: this function has not been vigorously tested # please let the author know of any problems # # Auxiliary functions: # sort.data.frame, attached # and the following # numericfactor.to.numeric <- function(x) as.numeric(as.character(x)) # if(class(glm.obj)[1] != "glm") stop("You must use an object of class glm") if(glm.obj$family[1] != "Binomial" | glm.obj$family[2] != "Logit: log(mu/(1 - mu))") stop( "Only works for logistic regression") # # # Get the predicted values (pi.hat) # pi.hat <- predict(glm.obj, type = "response") # could use glm.obj$fitted # # fac.var = "factor variable" = just for a name # fac.var <- as.factor(glm.obj$y) if(length(levels(fac.var)) != 2) stop( "fac.var must have exactly 2 levels") # really not needed fac.var.levels <- levels(fac.var) # # # compute the quantiles/deciles of risk if(deciles) { n.groups <- 10 resp.groupings <- quantile(pi.hat, seq(0, 1, length = n.groups + 1)) } if(!is.null(cut.values)) { resp.groupings <- cut.values n.groups <- length(cut.values) - 1 } if(is.null(cut.values) & !deciles) { warning("deciles=F, yet no cut.values were provided...using deciles=T" ) n.groups <- 10 resp.groupings <- quantile(pi.hat, seq(0, 1, length = n.groups + 1)) } risk.groups <- cut(pi.hat, resp.groupings, labels = 1:n.groups, include.lowest = T) # # # Put everything into a data frame to make it easier to track compute.data <- data.frame(Y = glm.obj$y, YHat = pi.hat, RiskGroup = risk.groups, Group = fac.var) # # # Compute the Observed values within the two groups ObsY0 <- table(risk.groups[fac.var == fac.var.levels[1]]) ObsY0 <- data.frame(RiskGroup = dimnames(ObsY0)$.Names, ObsY0 = as.vector(ObsY0)) ObsY1 <- table(risk.groups[fac.var == fac.var.levels[2]]) ObsY1 <- data.frame(RiskGroup = dimnames(ObsY1)$.Names, ObsY1 = as.vector(ObsY1)) # # # Use table() and by() to get the number of p.hats in each # risk group and the expected values (sums of those p.hats) # by.result <- by(compute.data$YHat, compute.data$RiskGroup, FUN = sum) null.to.zero <- function(x) ifelse(is.null(x), 0, x) by.result <- unlist(lapply(by.result, null.to.zero)) data0 <- data.frame(cbind(1:n.groups, table(compute.data$RiskGroup), by.result)) names(data0) <- c("RiskGroup", "N", "Expected1") # # # Put all this together finaldata <- merge(merge(data0, ObsY1, c("RiskGroup")), ObsY0, c( "RiskGroup")) ## # # change RiskGroup so it's a numeric rather than factor variable -- # so it sort-orders correctly...this is really just for printing # finaldata$RiskGroup <- numericfactor.to.numeric(finaldata$RiskGroup) finaldata <- sort.data.frame(finaldata, "RiskGroup") # # Compute the expected value for the other group finaldata$Expected0 <- finaldata$N - finaldata$Expected1 ## # # the intervals that form the risk groups, for printing # interval.labels <- paste("[", format(resp.groupings[1:n.groups], digits = 4), ",", format(resp.groupings[2:(n.groups + 1)], digits = 4 ), ")", sep = "") finaldata$RiskInterval <- interval.labels[finaldata$RiskGroup] row.names(finaldata) <- as.character(finaldata$RiskGroup) if(print.table) { cat("\n") # print(finaldata[, c(1, 7, 2, 4, 3, 5, 6)]) # print in nice order cat("\n") } # # Finally compute the Hosmer-Lemeshow statistic (Pearson chi-squared statistic # on the observed/expected table in finaldata) # asked.for.df <- n.groups - 2 finaldata <- finaldata[finaldata$Expected0 > 0 & finaldata$Expected1 > 0,] statistic <- sum(c((finaldata$ObsY0 - finaldata$Expected0)^2/finaldata$ Expected0, (finaldata$ObsY1 - finaldata$Expected1)^2/finaldata$ Expected1)) n.groups <- dim(finaldata)[1] df <- n.groups - 2 if(df == 0) { cat(" Number of groups must be at least 3. If at least 3 were specified,\n" ) cat(" there are not enough distinct risk groups with expected values positive.\n" ) cat(" To see full table, use print.table=T\n\n") return(invisible()) } if(asked.for.df != df) { cat("Warning:\n") cat("\n Some expected values were 0, so degrees of freedom had to be adjusted\n" ) cat(" to match the correct table...see full table by specifying print.table=T\n\n" ) } ret<-list(method="Hosmer-Lemeshow Statistic",statistic = statistic, parameter=df, p.value = 1 - pchisq(statistic, df),data.name=glm.obj$call$data) names(ret$parameter)<-"df" names(ret$statistic) <- "Statistic" ret$alternative <- "fit is NOT adequate" class(ret)<-"htest" ret } "sort.data.frame" <- function(data, by = 1:dim(data)[2], asc = rep(T, length(by)), na.last = T) { # # this function works like the SAS SORT procedure # # data: a dataframe or a matrix, "dimnames" are used as the variable names # # by: a vector or a list of variable names or column indices specifying # the "sort by" variables, the default is to sort by all variables # in the order they appear in the data set # # asc: a vector with the same length as "by" indicating whether the sorting # of each "by" variable is in ascending order, the default order is # ascending # # na.last: a flag indicating whether missing values are placed as the last # elements in the data set, the default is T # # example: # # 1. sort "mydata" by "id" (ascending) and "group" (descending) # # newdata <- lsort(mydata, by=c("id","group"), asc=c(T,F)) # # 2. sort "mydata" by the third and the first variable (ascending) # # newdata <- lsort(mydata, by=c(3,1)) # # 3. sort "mydata" by "id" and the 7th variable (ascending) # # newdata <- lsort(mydata, by=list("id",7)) # m <- dim(data)[1] keys <- 1:m rotate <- m:1 for(i in length(by):1) { if(asc[i]) keys[] <- keys[sort.list(data[, by[[i]]][keys], na.last = na.last)] else keys[] <- keys[order(data[, by[[i]]][keys], rotate, na.last = na.last)[rotate]] } data[keys, ] }