Assessing model assumptions using transformed hazard plots

Background

Prior to fitting a model based on an assumed parametric form for the hazard function, a preliminary study of the validity of this assumption should be carried-out.

Let us compare the survivor function for the data with that from a chosen model. To do this we will transform the survivor function to produce a plot that should give a straight line if the assumed model is appropriate.

For the Weibull, twice taking logs of the survivor function with scale parameter \(\lambda\) and shape parameter \(\gamma\)

\[ log(-log S(t)) = log \lambda + \gamma log t \]

A plot of \(log(-log S(t))\) against \(log(t)\) would give an approximately straight line if the Weibull assumption is reasonable. The plot could also be used to give a rough estimate of the parameters.

Similarly, for the log-logistic distribution

\[ log S(t)/(1 - S(t)) = \theta - \kappa log t \]

For the log-normal distribution

\[ \Phi^{-1} (1 - S(t)) = (log t - \mu) / \sigma \] The slope and intercept of this line provide estimates of \(\sigma^{-1}\) and \(-\mu/\sigma\), respectively.

We can also check the assumption made with using the Cox regression model of proportional hazards by inspecting the log-cumulative hazard plot.

\[ log H_i(t) = \beta x_i + log H_0(t) \]

The transformed curves for different values of the explanatory variables will be parallel if PH holds.

See Collett (2013) for more details.

R examples

The package commonly used for survival analyses in R is the survival package (Therneau T 2021). We will begin by repeating an example from the survival help documentation.

This uses their reliability data. Firstly a little data manipulation is done before we plot the cumulative hazard plot against time using the in-built survival package plotting method with the cumhaz=TRUE argument.

library(survival)

data("reliability", package = "survival")

vdata <- with(valveSeat, data.frame(id = id, time2 = time, status = status))
first <- !duplicated(vdata$id)
vdata$time1 <- ifelse(first, 0, c(0, vdata$time[-nrow(vdata)]))
double <- which(vdata$time1 == vdata$time2)
vdata$time1[double] <- vdata$time1[double] - 0.01
vdata$time2[double - 1] <- vdata$time1[double]
vdata[1:7, c("id", "time1", "time2", "status")]
   id  time1  time2 status
1 251   0.00 761.00      0
2 252   0.00 759.00      0
3 327   0.00  98.00      1
4 327  98.00 667.00      0
5 328   0.00 326.00      1
6 328 326.00 652.99      1
7 328 652.99 653.00      1
fit <- survfit(Surv(time1, time2, status) ~ 1, data = vdata, id = id)
plot(fit, cumhaz = TRUE, xlab = "Days", ylab = "Cumulative hazard")

We can plot the log-cumulative hazard against log-time by simply plotting the survfit output values directly by specifying the x and y data explicitly.

plot(log(fit$time), log(fit$cumhaz), xlab = "log-Days", ylab = "Log-cumulative hazard", type = "l")

For the following, the latest development version fo the survHE package (Baio 2020) contains all of the functions that we will need. We can obtain this from GitHub with the following.

devtools::install_github("giabaio/survHE", ref = "devel")

In particular, we will need the plot_transformed_km function.

plot_transformed_km() function
plot_transformed_km <- function(fit, mod = 1, add_legend = FALSE,
                                graph = "base", ...) {
  
  dots <- list(...)
  
  graph <- match.arg(graph, c("base", "ggplot2"))
  
  if (length(mod) != 1)
    stop("Please provide at most one model index.")
  
  if (is.numeric(mod) && !mod <= length(fit$models))
    stop("More model names provided than available in list of model fits provided.")
  
  if (is.character(mod) && !mod %in% names(fit$models))
    stop("Model name not available in list of model fits provided.")
  
  dist <- get_distribution(fit, mod)
  
  distn_names <- list(
    "exp" = c("exp", "exponential"),
    "weibull" = c("weibull", "weibull.quiet", "weibullaf", "weibullph"),
    "loglogistic" = c("llogis", "loglogistic"),
    "lognormal" = c("lognormal", "lnorm"),
    "gompertz" = "gompertz")
  
  if (!dist %in% unname(unlist(distn_names)))
    stop("Distribution not available.")
  
  fit_km <- fit$misc$km
  
  n_strata <- length(fit_km$strata)
  
  if (n_strata == 0 || n_strata == 1) {
    fit_km$strata <- c("group" = length(fit_km$time))
  }
  
  model_strata <- rep(x = names(fit_km$strata),
                      times = fit_km$strata)
  
  times <- split(fit_km$time, model_strata)
  survs <- split(fit_km$surv, model_strata)
  
  params <- list()
  
  if (dist %in% distn_names[["exp"]]) {
    params <- list(
      FUN = "lines",
      xlab = "time",
      ylab = "-log(S(t))",
      main = "Exponential distributional assumption",
      x = times,
      y = lapply(survs, function(x) -log(x)),
      lty = 1:n_strata,
      col = 1:n_strata,
      type = "l")
  }
  
  if (dist %in% distn_names[["weibull"]]) {
    params <- list(
      FUN = "lines",
      xlab = "log(time)",
      ylab = "log(-log(S(t))) i.e. log cumulative hazard",
      main = "Weibull distributional assumption",
      x = lapply(times, log),
      y = lapply(survs, function(x) log(-log(x))),
      lty = 1:n_strata,
      col = 1:n_strata,
      type = "l")
  }
  
  if (dist %in% distn_names[["loglogistic"]]) {
    params <- list(
      FUN = "lines",
      xlab = "time",
      ylab = "log(S(t)/(1-S(t)))",
      main = "log-Logistic distributional assumption",
      x = lapply(times, log),
      y = lapply(survs, function(x) log(x/(1 - x))),
      lty = 1:n_strata,
      col = 1:n_strata,
      type = "l")
  }
  
  if (dist %in% distn_names[["lognormal"]]) {
    params <- list(
      FUN = "lines",
      xlab = "log(time)",
      ylab = expression(Phi^-1 ~ (1 - S(t))),
      main = "Log-normal distributional assumption",
      x = lapply(times, log),
      y = lapply(survs, function(x) qnorm(1 - x)),
      lty = 1:n_strata,
      col = 1:n_strata,
      type = "l")
  }
  
  default_pars <- list(
    x = NULL,
    type = "n",
    axes = FALSE,
    xlab = params$xlab,
    ylab = params$ylab,
    main = params$main,
    xlim = range(pretty(unlist(params$x))),
    ylim = range(pretty(unlist(params$y))))
  
  setup_pars <- modifyList(
    default_pars, dots[names(default_pars)])
  
  if (graph == "base") {
    
    # empty plot
    do.call(plot, setup_pars)
    
    axis(1); axis(2)
    
    # plot lines
    do.call(mapply, modifyList(params, dots))
    
    if (isTRUE(add_legend)) {
      legend("topright", names(survs), col = params$col,
             lty = params$lty, bty = "n")
    }
  }
  
  if (graph == "ggplot2") {
    
    if (!add_legend) {
      pos.legend <- "none"
    } else {
      pos.legend <- "right"}
    
    ggdata <- 
      data.frame(time = unlist(params$x),
                 y = unlist(params$y)) |>
      tibble::rownames_to_column("Group") |> 
      mutate(Group = gsub("\\d+", "", Group))
    
    p <- 
      ggplot(ggdata, aes(x = .data$time, y = .data$y,
                         group = .data$Group, col = .data$Group)) +
      geom_line() +
      do.call(labs,
              list(title = setup_pars$main,
                   x = setup_pars$xlab,
                   y = setup_pars$ylab)) +
      theme_bw() +
      theme(legend.position = pos.legend)
    
    print(p)
  }
  
  invisible(params)
}

get_distribution <- function(fit, mod) {
    m <- fit$models[[mod]]
    tolower(ifelse(fit$method == "hmc", m@model_name, m$dlist$name))
}

Now we can repeat the above analysis using the plot_transformed_km function. By setting distr = "exp" the cumulative hazard plot is returned.

library(survHE)

fit_exp <- survHE::fit.models(Surv(time1, time2, status) ~ 1,
                              data = vdata, distr = "exp", method = "mle")
plot_transformed_km(fit_exp)

Setting distr = "weibull" then we get the log-cumulative hazard against log-time plot.

fit_wei <- survHE::fit.models(Surv(time1, time2, status) ~ 1,
                              data = vdata, distr = "weibull", method = "mle")
plot_transformed_km(fit_wei)

The plot_transformed_km also provides plots for log-normal and log-logistic distribution assumptions with the corresponding transformation to the survival data.

Using flexsurv

Further, we could use the flexsurv package (Jackson 2016). This package contains lots of functions for a range of survival distributions.

The cumulative hazard can be plotted with the flexsurv plotting method with argument type = "cumhaz". The Kaplan-Meier is also overlaid by the model fit.

library("flexsurv")

fs1 <- flexsurvreg(Surv(time1, time2, status) ~ 1, data = vdata, dist = "exp")
plot(fs1, type = "cumhaz")

fs2 <- flexsurvreg(Surv(time1, time2, status) ~ 1, data = vdata, dist = "weibull")
plot(fs2, type = "cumhaz")

Using survHE

library("survHE")

fs1 <- fit.models(Surv(time1, time2, status) ~ 1, data = vdata, dist = "exp")
plot(fs1, type = "cumhaz")

fs2 <- fit.models(Surv(time1, time2, status) ~ 1, data = vdata, dist = "weibull")
plot(fs2, type = "cumhaz")

References

Baio, Gianluca. 2020. survHE: Survival Analysis for Health Economic Evaluation and Cost-Effectiveness Modeling.” J. Stat. Softw. https://doi.org/10.18637/jss.v000.i00.
Collett, Dave. 2013. Modelling Survival Data in Medical Research. 3rd ed. Chapman; Hall/CRC. https://doi.org/https://doi.org/10.1201/b18041.
Jackson, Christopher H. 2016. flexsurv: A Platform for Parametric Survival Modeling in R.” J. Stat. Softw. 70 (1): 1–33. https://doi.org/10.18637/jss.v070.i08.
Therneau T. 2021. A Package for Survival Analysis in R; Version 3.2-11.”, https://CRAN.R–project.org/package=survival. https://cran.r-project.org/package=survival.