Because of a lapse in government funding, the information on this website may not be up to date, transactions submitted via the website may not be processed, and the agency may not be able to respond to inquiries until appropriations are enacted.

The NIH Clinical Center (the research hospital of NIH) is open. For more details about its operating status, please visit cc.nih.gov.

Updates regarding government operating status and resumption of normal operations can be found at OPM.gov.

Joint analysis of longitudinal and survival data measured on nested time-scales using shared parameter models: an application to fecundity data

Oxford_prediction.R


library(MASS)
library(nlme)
library(splines)
library(mvtnorm)

day_level <- read.csv(file="Oxford_test_day_level.csv")
women_level <- read.csv(file="Oxford_test_women_level.csv")

RE_ind <- day_level[,16] #Indicator that day should be used for RE estimation

#Get Data to be used for estimating the random effects and covariates
Y <- day_level[RE_ind==1,1] #Intercourse indicators
ind <- day_level[RE_ind==1,2] #Subject numbers
X <-as.matrix(day_level[RE_ind==1,3:11]) #Day-level covariates for intercourse probability
X_P <- as.matrix(day_level[RE_ind==1,12:13]) #Day-level covariates for peak
day <- day_level[RE_ind==1,14] #Day relative to ovulation (if known)
day_miss <- day_level[RE_ind==1,15] #Indicator that the day is known

X_T <- as.matrix(women_level[,1:2]) #TTP covariates
TTP <- women_level[,3]*0+1 #TTP is 1 since were using 1 cycle of data.
CEN <- 1-1*I(women_level[,3]==1 & women_level[,4]==0) #Indicator they didn't get pregnant in the first cycle.
nij <- matrix(women_level[,5],length(TTP),1) #Length of the follicular phase.
ind_po <- 1*I(nij[,1]!=0) #Indicator the length of the follicular is known.

Dat = list(Y=Y,ind=ind,X=X,X_P=X_P,X_T=X_T,T=TTP,CEN=CEN,nij=nij,day=day,day_miss=day_miss,ind_po=ind_po)

knots<- c(-13,-10,-7,-4,-2,1,4,7,10,13)
BK <- c(-20,50)
day[day>BK[2]] <- BK[2]
day[day<BK[1]] <- BK[1]
basis <- bs(day,knots=knots,Boundary.knots = BK)
X_B <- cbind(basis[(1:dim(basis)[1]),1:dim(basis)[2]])
nodes <- 10

source("R_programs.R")

par <- c(-1.019623017, -0.022231701, -0.222435068, 0.183268558, -2.199957766, 0.074197907, -0.148694702, -0.066871265, -0.213375205, -0.176945759, -0.478504015, 0.911746709, 0.236140303, 0.300086114, -1.891660637, -1.420562322, -1.627560036, -2.053313444, -2.52622992, -2.538376681, -0.261931586, 0.569964381, -0.205466003, 0.011981404, 0.172473525, 0.093544916, 0.608545158, 0.450351538, 1.590301719, -0.395095029, -0.339227847, -0.276937273, -1.407143639, 1.453161218, -1.282294669, 0.091406746, -0.014844835, -0.640734346, 15.64263015,-1.560648012)
par_var <- c(0.035825178, 0.000226101, 0.015432345, 0.002584707, 0.011242709, 0.001502623, 0.010793378, 0.000503257, 0.002061199, 0.009917989, 0.00831891, 0.005718286, 0.010713138, 0.015801399, 0.036025078, 0.031809002, 0.051345602, 0.089138962, 0.205026884, 0.243448575, 0.007686414, 0.031959542, 0.179537367, 0.068544324, 0.038251997, 0.030581415, 0.031546658, 0.029525749, 0.048995353, 0.02899384, 0.034356965, 0.028434758, 0.213223587, 0.915985517, 0.406959903, 0.004495922, 0.000228671, 0.0104776, 0.044865411, 0.0011484)
#Running the empirical Bayes procedure.
EB_proc <- mjm.EB(par,Dat,nodes,knots,BK)

#Outputing the empirical Bayes estimates and their covariance matricies.
EB_est <- EB_proc$Fin_EB.mat
EB_var <- EB_proc$Fin_Var.mat

#Estimating covariate values for cigarettes, alcohol, and period length
cov_est <- NULL
for(j in unique(ind)){
t_X <- X[ind==j,]
t_row <- c(max(t_X[,3]),mean(t_X[,4]),length(t_X[t_X[,5]==1,5]))
cov_est <- rbind(cov_est,t_row)
}

 

#Get the rest of the data to be for predicting.
Y <- day_level[RE_ind==0,1] #Intercourse indicators
ind <- day_level[RE_ind==0,2] #Subject numbers
X <-as.matrix(day_level[RE_ind==0,3:11]) #Day-level covariates for intercourse probability
X_P <- as.matrix(day_level[RE_ind==0,12:13]) #Day-level covariates for peak

X_T <- as.matrix(women_level[,1:2]) #TTP covariates
TTP <- women_level[,3] #TTP
CEN <- women_level[,4] #Censoring indicator

 

#putting in estimated values for covariates (lag will be done later)
for(j in unique(ind)){
#Estimated Covariates
t_est_covs <- cov_est[j,]
t_X <- X[ind==j,]
t_X[,3] <- t_est_covs[1]
t_X[,4] <- t_est_covs[2]
t_cyc <- X_P[ind==j,1]
for(k in unique(t_cyc)){
tt_X <- t_X[t_cyc==k,]
cyc_len <- length(tt_X)/9
tt_X[,5] <- c(rep(1,min(c(t_est_covs[3],cyc_len))),rep(0,max(c(cyc_len-t_est_covs[3],0))))
t_X[t_cyc==k,] <- tt_X
}
X[ind==j,] <- t_X
}

 

 

#Implementing the MC procedure for the predictions
MC <- 2000
set.seed(4)
int_matrix <- NULL
surv_array<- array(0,dim=c(length(unique(ind)),6,MC))
unix.time(for(u in 1:MC){

tpar <- rnorm(length(par),par,sqrt(par_var))
t_re_mat<- matrix(0,length(EB_est[,1]),3)
for(j in unique(ind)){
v.eb <- matrix(EB_var[j,],3,3)
teb <- rmvnorm(1,EB_est[j,],v.eb)
t_re_mat[j,] <- teb
}

Dat = list(ind=ind,X=X,X_P=X_P,X_T=X_T,re_mat= t_re_mat)

Prob_est <- mjm.pred.probs(tpar,Dat,knots,BK)
int_matrix <- rbind(int_matrix,Prob_est$int.prob.mat[,2])
surv_array[,,u] <- Prob_est$surv.prob.mat[,-1]
})

#Summarizing results for day/cycle
pred.probs <- apply(int_matrix,2,mean)
Y.a <- Y

#Calculating the Calibration ROC Curves
div.0.1 <- seq(0,1,0.05)
avg.pred<- NULL
avg.obs <- NULL
for(i in 1:20){
if(length(pred.probs[pred.probs>=div.0.1[i] & pred.probs<div.0.1[i+1]])>0 & length(Y.a[pred.probs>=div.0.1[i] & pred.probs<div.0.1[i+1]])>0){
avg.pred<- c(avg.pred,mean(pred.probs[pred.probs>=div.0.1[i] & pred.probs<div.0.1[i+1]]))
avg.obs <- c(avg.obs,mean(Y.a[pred.probs>=div.0.1[i] & pred.probs<div.0.1[i+1]]))
}
}

 

par(mfrow = c(1,2),mar = c(5,5, 3, 3),mgp=c(3,1,0))
plot(avg.pred,avg.obs,type="l",lwd=3,xlab=c("Predicted Probability"),ylab= c("Observed Probability"),ylim=c(0,0.9),xlim=c(0,0.9),cex.axis=1.3,cex.lab=1.3)
lines(c(0,1),c(0,1),lwd=2,lty=2)

c = sort(unique(pred.probs))

TPR = NULL
FPR = NULL
for(k in 1:length(c)){

TP = sum(1*I(Y.a==1 & pred.probs>c[k]))
FN = sum(1*I(Y.a==1 & pred.probs<=c[k]))
TN = sum(1*I(Y.a==0 & pred.probs<=c[k]))
FP = sum(1*I(Y.a==0 & pred.probs>c[k]))

TPR[k] = TP/(TP+FN)
FPR[k] = FP/(FP+TN)

}

TPR = c(1,TPR,0)
FPR = c(1,FPR,0)
AUC = 0
for(j in 2:length(TPR)){AUC = AUC+(TPR[j-1])*(FPR[j-1] - FPR[j])}
AUC

 

 

plot(FPR,TPR,type="l",axes=FALSE, ann=FALSE,lab=c(6,5,3),lwd=3)

axis(2,las=1, at=seq(0,1,.2), cex.axis=1.5)
axis(1,las=1, at=seq(0,1,.2), cex.axis=1.5)
lines(c(0,1),c(0,1))
box()
title(ylab = "Sensitivity",cex.lab=2)
title(xlab = "1 - Specificity",cex.lab=2)

 

#########################TTP ROC##################

T.pred <- 6
surv.est <- apply(surv_array[,T.pred,],1,mean)
t_TTP <- TTP[unique(ind)]
t_CEN <- CEN[unique(ind)]
Tg6 <- 1*I(t_TTP == T.pred | t_TTP > T.pred)

l = 1
surv.est <- surv.est[!(t_TTP < T.pred & t_CEN ==1)]
Tg6 <- Tg6[!(t_TTP < T.pred & t_CEN ==1)]
T_unq <-unique(ind)[!(t_TTP < T.pred & t_CEN ==1)]

True = Tg6
Prob = surv.est

c = sort(unique(Prob))

TPR = NULL
FPR = NULL
TP = NULL
FN = NULL
TN = NULL
FP = NULL
for(k in 1:length(c)){
TP[k] = sum(1*I(True==1 & Prob>c[k]))
FN[k] = sum(1*I(True==1 & Prob<=c[k]))
TN[k] = sum(1*I(True==0 & Prob<=c[k]))
FP[k] = sum(1*I(True==0 & Prob>c[k]))

TPR[k] = TP[k]/(TP[k]+FN[k])
FPR[k] = FP[k]/(FP[k]+TN[k])

}

TPR = c(1,TPR,0)
FPR = c(1,FPR,0)
AUC = 0
for(j in 2:length(TPR)){AUC = AUC+(TPR[j-1])*(FPR[j-1] - FPR[j])}
AUC

par(mfrow = c(1,1),mar = c(5,5, 2, 2),mgp=c(3,1,0))
plot(FPR,TPR,type="l",axes=FALSE, ann=FALSE,lab=c(6,5,3),lwd=3)

axis(2,las=1, at=seq(0,1,.2), cex.axis=1.5)
axis(1,las=1, at=seq(0,1,.2), cex.axis=1.5)
lines(c(0,1),c(0,1))
box()
title(ylab = "Sensitivity",cex.lab=2)
title(xlab = "1 - Specificity",cex.lab=2)

Back to Joint analysis of longitudinal and survival data measured on nested time-scales using shared parameter models: an application to fecundity data