Survival Analysis
CODE
Kaplan-Meier fitter
Kaplan-Meier fitter Based on Different Groups
Nelson-Aalen fitter Theory
Log-Rank-Test
Cox-Regression
AIC and model selection for parametric models
Medical
Operation
Business
Other
Survival time refers to as an amount of time until when a subject is alive or actively participates in a sutdy period.
Censoring is a form of missing data problem in which time to event is not observed for reasons.
Survival and hazard functions
Survival Function(S): The probability that an subject survives from the time origin (diagnosis of a disease) to a specified future time t. For example, S(100)=0.8 means that after 100 days, a subject’s survival probability is 0.8.
Hazard Function (H): The probability that an subject who is under observation at a time t has an event (death) at that time. For example, If h(100) = 0.8 means that after 100 days, the probability of being dead is 0.8.
!pip install lifelines
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib
from urllib import request
import lifelines
from lifelines import KaplanMeierFitter
print('lifelines version: ', lifelines.__version__)
# Download data
csv_src = "http://www-eio.upc.edu/~pau/cms/rdata/csv/survival/lung.csv"
csv_path = 'lung.csv'
urllib.request.urlretrieve(csv_src, csv_path)
# Create dataframe
df = pd.read_csv(csv_path)
# Precessing
df.loc[df.status == 1, 'dead'] = 0
df.loc[df.status == 2, 'dead'] = 1
df = df.drop(['Unnamed: 0', 'status'], axis=1)
df.head()
# Get statistical info
df.describe()
# Kaplan-Meier-Fitter
kmf = KaplanMeierFitter()
kmf.fit(durations = df.time, event_observed = df.dead)
# Create event table
# event_at: Timeline of dataset
# removed: The values of subject that are no longer part of our experiment
# observed: The value of the number of subjects that died during the experiment
# censored: The number of subject does not have an event during the observation time
# entrance: The value of new subject in a given timeline
# at_risk: The number of current subject under observation
kmf.event_table
S(t) = (Number of subjects at risk at the start - Number of subjects that died) / Number of subjects at risk at the start
# Calculate survival probability for given time
print("Survival probablity for t=0: ", kmf.predict(0))
print("Survival probablity for t=5: ", kmf.predict(5))
print("Survival probablity for t=11: ", kmf.predict(11))
print("Survival probablity for t=840: ", kmf.predict(840))
print(kmf.predict([0,5,11,840]))
# Get survial full list
print(kmf.survival_function_)
# Plot the survival graph
kmf.plot()
plt.title("Kaplan-Meier")
plt.xlabel("Number o days")
plt.ylabel("Probablity of survival")
plt.show()
# Get survial time median
survival_time_median = kmf.median_survival_time_
print(" The median survial time: ", survival_time_median)
# Survival probability wiht confidence interval
survival_confidence_interval = kmf.confidence_interval_survival_function_
survival_confidence_interval
# Plot survival function with confidence interval
plt.plot(survival_confidence_interval['KM_estimate_lower_0.95'], label="Lower")
plt.plot(survival_confidence_interval['KM_estimate_upper_0.95'], label="Upper")
plt.title("Survival Function with confidence interval")
plt.xlabel("Number of days")
plt.ylabel("Survial Probablity")
plt.legend()
plt.show()
# Cumulative survival
cumu_surv = kmf.cumulative_density_
cumu_surv
# Plot cumulative density
kmf.plot_cumulative_density()
plt.title("Cumulative density")
plt.xlabel("Number of days")
plt.ylabel("Probablity of death")
plt.show()
# Median time left for event
median_time_to_event = kmf.conditional_time_to_event_
plt.plot(median_time_to_event, label = "Median time left")
plt.title("Median time to event")
plt.xlabel("Total days")
plt.ylabel("Conditional median time to event")
plt.legend()
plt.show()
# KM for Male and Female
kmf_m = KaplanMeierFitter()
kmf_f = KaplanMeierFitter()
Male = df.query("sex == 1")
Female = df.query("sex == 2")
kmf_m.fit(durations=Male.time, event_observed=Male.dead, label="Male")
kmf_f.fit(durations=Female.time, event_observed=Female.dead, label="Female")
# Plot survival function
kmf_m.plot()
kmf_f.plot()
plt.title("KM Survival Probaility: Male vs Female")
plt.xlabel("Days")
plt.ylabel("Survival Probability")
plt.show()
# Plot cumulative density
kmf_m.plot_cumulative_density()
kmf_f.plot_cumulative_density()
plt.title("Cumulative density")
plt.xlabel("Number of days")
plt.ylabel("Probablity of death")
plt.show()
from lifelines import NelsonAalenFitter
naf = NelsonAalenFitter()
naf.fit(durations = df.time, event_observed = df.dead)
H(t) = Number of subjects that died / Number of subjects at risk
# Cumulative hazard
naf.cumulative_hazard_
# Plot cumulative hazard
naf.plot_cumulative_hazard()
plt.title("Cumulative Hazard")
plt.xlabel("Number of days")
plt.ylabel("Cumulative Probability of death")
plt.show()
# Predict
print("t = 500 days: ", naf.predict(500))
# cumulative hazard with confidence interval
hazard_confidence_interval = naf.confidence_interval_
plt.plot(hazard_confidence_interval['NA_estimate_lower_0.95'], label="Lower")
plt.plot(hazard_confidence_interval['NA_estimate_upper_0.95'], label="Upper")
plt.title("Cumulative harzard Function with confidence interval")
plt.xlabel("Number of days")
plt.ylabel("Cumulative hazard")
plt.legend()
plt.show()
# Cumulative hazard vs cumulative density
kmf.plot_cumulative_density(label="Cumulative Hazard")
naf.plot_cumulative_hazard(label="Cumulative Density")
plt.xlabel("Number of days")
plt.show()
Male vs Female
naf_m = NelsonAalenFitter()
naf_f = NelsonAalenFitter()
naf_m.fit(durations = Male.time, event_observed = Male.dead)
naf_f.fit(durations = Female.time, event_observed = Female.dead)
# Plot cumulative hazard Male vs Female
naf_m.plot_cumulative_hazard(label="Male")
naf_f.plot_cumulative_hazard(label="Female")
plt.title("Cumulative Hazard")
plt.xlabel("Number of days")
plt.ylabel("Cumulative Probability of death")
plt.show()
Log-rank test is a hypothesis test survival distribution of two samples.
A p-value between 0 and 1 denotes the statistical significance. The smaller the p-value, the more significant the statistical difference between groups being studied is.
Less than (5% = 0.05) P-value means there is a significant difference between the groups we compared.
from lifelines.statistics import logrank_test
time_m = Male.time
event_m = Male.dead
time_f = Female.time
event_f = Female.dead
results = logrank_test(time_m, time_f, event_m, event_f)
results.print_summary()
print("P-value: ", results.p_value)
survival function for both groups is significantly different.
The cox-proportional hazard model is a regression model generally used by medical researchers to determine the relationship between the survival time of a subject and one or more predictor variables.
h(t) = h0(t) * exp(b1X1 + b2X2 + ... + bnXn)
Where,
t = survivaltime
h(t) = hazard function
X1, X2, ..., Xn = covariates
b1, b2, ..., bn = measures the impact of covariates
from lifelines import CoxPHFitter
df2 = pd.read_csv("lung.csv")
df2 = df2.drop(['Unnamed: 0'], axis=1)
df2 = df2.dropna(subset=['inst', 'time', 'status', 'age', 'sex', 'ph.ecog', 'ph.karno', 'pat.karno', 'meal.cal', 'wt.loss'])
df2.loc[df2.status == 1, 'dead'] = 0
df2.loc[df2.status == 2, 'dead'] = 1
df2 = df2.drop(['status'], axis=1)
df2.head()
df2.describe()
cph = CoxPHFitter()
cph.fit(df2, duration_col='time', event_col='dead')
cph.print_summary()
# All covariates forest plot
cph.plot()
plt.show()
# Plotting the effect of varying a covariate
cph.plot_partial_effects_on_outcome(covariates='sex', values=[1, 2], cmap='coolwarm')
plt.title("Survival curves comparison: sex")
plt.xlabel("Number of days")
plt.ylabel("Survival")
plt.legend()
plt.show()
# Plotting the effect of varying a covariate
cph.plot_partial_effects_on_outcome(covariates='ph.ecog', values=[0, 1, 2, 3], cmap='coolwarm')
plt.title("Survival curves comparison: ph.ecog")
plt.xlabel("Number of days")
plt.ylabel("Survival")
plt.legend()
plt.show()
# Plotting the effect of varying a covariate
cph.plot_partial_effects_on_outcome(covariates='inst', values=[1, 3, 11, 15, 32], cmap='coolwarm')
plt.title("Survival curves comparison: inst")
plt.xlabel("Number of days")
plt.ylabel("Survival")
plt.legend()
plt.show()
# Plotting the effect of varying a covariate
cph.plot_partial_effects_on_outcome(covariates='ph.karno', values=[50, 70, 80, 90, 100], cmap='coolwarm')
plt.title("Survival curves comparison: ph.karno")
plt.xlabel("Number of days")
plt.ylabel("Survival")
plt.legend()
plt.show()
Cox Proportional Hazards regression can still fit survival models without knowing (or assuming) the distribution.
a log-normal, or a Weibull, or other parametric distribution make that assumption the maximum likelihood parametric approach
from lifelines import LogLogisticAFTFitter, WeibullAFTFitter, LogNormalAFTFitter
llf = LogLogisticAFTFitter().fit(df2, 'time', 'dead')
lnf = LogNormalAFTFitter().fit(df2, 'time', 'dead')
wf = WeibullAFTFitter().fit(df2, 'time', 'dead')
print(llf.AIC_)
print(lnf.AIC_)
print(wf.AIC_)