Marginal analysis of measurement agreement among multiple raters with non-ignorable missing

R_function_file.R

#This file contains the deleting data by both MCAR and Non-Ignorable 1

#function FLSkp computes Fleiss Kappa of given data set
#this function works for binary data only
FLSkp = function(B){
     #B is the matrix containing binary ratings (0,1) with row standing for subjects and column standing for raters
     A=B+1 #Matrix A change binary labelling from (0,1) to (1,2)
     N=nrow(A) #number of subjects
     n=ncol(A) #number of ratings per subject
     R=matrix(0,nrow=N, ncol=2)
     for (i in 1:N){
          for (j in 1:2){
               R[i,j]=sum(A[i,]==j)
          }
     }
     p=colSums(R)/(N*n)
     kp=(sum(R^2)-N*n*(1+(n-1)*sum(p^2)))/(N*n*(n-1)*(1-sum(p^2)))
     FLSvar=2*(sum(p^2)-(2*n-3)*(sum(p^2))^2+2*(n-2)*sum(p^3))/(N*n*(n-1)*((1-sum(p^2))^2))

     r=rowSums(A-1)
     pi1=mean(r/n) # R-1 changes binary rating from (1,2) back to (0,1)
     pi0=mean((n-r)/n )
     pe_pi=pi0^2+pi1^2
     pa=mean(r*(r-1)/(n*(n-1)) + (n-r)*(n-r-1)/(n*(n-1)))
     gamma_pi=(pa-pe_pi)/(1-pe_pi)
     #return(c(pe_pi,pa))

#the following method is based on Gwet (2008)
     GM=matrix(NA,nrow=N, ncol=2)
     for (i in 1:N){
          pa_i=r[i]*(r[i]-1)/(n*(n-1)) + (n-r[i])*(n-r[i]-1)/(n*(n-1))
          pe_pi_i=pi1*r[i]/n + pi0*(n-r[i])/n
          gamma_pi_i=(pa_i-pe_pi)/(1-pe_pi)
          gamma_pi_i_star=gamma_pi_i-2*(1-gamma_pi)*((pe_pi_i-pe_pi)/(1-pe_pi))
          GM[i,1]=gamma_pi_i_star
          GM[i,2]=(gamma_pi_i_star-gamma_pi)^2
     }

     VGM=sum(GM[,2])/(N*(N-1))
     return(c(kp,gamma_pi,FLSvar,VGM))
}

##################################################################
############################ 8 raters ############################
##################################################################

#function sim generates data
sim=function(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1){
     #We require r=4.
     #p is the marginal probability
     #N is the sample size
     #p2_1=P(Y2=1|Y1=1); p3_1=P(Y3=1|Y1=1)
     #r is the number of raters
     X=matrix(NA, nrow=N, ncol=r)
     for (i in 1:N){
          X[i,1]=rbinom(1,1,p)
          if (X[i,1]==1){X[i,2]=rbinom(1,1,p2_1)}
          if (X[i,1]==0){X[i,2]=rbinom(1,1,((p-p*p2_1)/(1-p)))}

          if (X[i,1]==1){X[i,3]=rbinom(1,1,p3_1)}
          if (X[i,1]==0){X[i,3]=rbinom(1,1,((p-p*p3_1)/(1-p)))}

          if (X[i,1]==1){X[i,4]=rbinom(1,1,p4_1)}
          if (X[i,1]==0){X[i,4]=rbinom(1,1,((p-p*p4_1)/(1-p)))}

          if (X[i,1]==1){X[i,5]=rbinom(1,1,p5_1)}
          if (X[i,1]==0){X[i,5]=rbinom(1,1,((p-p*p5_1)/(1-p)))}

          if (X[i,1]==1){X[i,6]=rbinom(1,1,p6_1)}
          if (X[i,1]==0){X[i,6]=rbinom(1,1,((p-p*p6_1)/(1-p)))}

          if (X[i,1]==1){X[i,7]=rbinom(1,1,p7_1)}
          if (X[i,1]==0){X[i,7]=rbinom(1,1,((p-p*p7_1)/(1-p)))}

          if (X[i,1]==1){X[i,8]=rbinom(1,1,p8_1)}
          if (X[i,1]==0){X[i,8]=rbinom(1,1,((p-p*p8_1)/(1-p)))}

     }
     #JNT stands for joint probability
     JNT12=p*p2_1
     JNT13=p*p3_1
     JNT14=p*p4_1
     JNT15=p*p5_1
     JNT16=p*p6_1
     JNT17=p*p7_1
     JNT18=p*p8_1

     #p2_10=P(Y2=1|Y1=0), p3_10=P(Y3=1|Y1=0), p4_10=P(Y4=1|Y1=0)
     p2_10=(p/(1-p))*(1-JNT12/p)
     p3_10=(p/(1-p))*(1-JNT13/p)
     p4_10=(p/(1-p))*(1-JNT14/p)
     p5_10=(p/(1-p))*(1-JNT15/p)
     p6_10=(p/(1-p))*(1-JNT16/p)
     p7_10=(p/(1-p))*(1-JNT17/p)
     p8_10=(p/(1-p))*(1-JNT18/p)

     JNT23=p2_1*p3_1*p+p2_10*p3_10*(1-p)
     JNT24=p2_1*p4_1*p+p2_10*p4_10*(1-p)
     JNT25=p2_1*p5_1*p+p2_10*p5_10*(1-p)
     JNT26=p2_1*p6_1*p+p2_10*p6_10*(1-p)
     JNT27=p2_1*p7_1*p+p2_10*p7_10*(1-p)
     JNT28=p2_1*p8_1*p+p2_10*p8_10*(1-p)

     JNT34=p3_1*p4_1*p+p3_10*p4_10*(1-p)
     JNT35=p3_1*p5_1*p+p3_10*p5_10*(1-p)
     JNT36=p3_1*p6_1*p+p3_10*p6_10*(1-p)
     JNT37=p3_1*p7_1*p+p3_10*p7_10*(1-p)
     JNT38=p3_1*p8_1*p+p3_10*p8_10*(1-p)

     JNT45=p4_1*p5_1*p+p4_10*p5_10*(1-p)
     JNT46=p4_1*p6_1*p+p4_10*p6_10*(1-p)
     JNT47=p4_1*p7_1*p+p4_10*p7_10*(1-p)
     JNT48=p4_1*p8_1*p+p4_10*p8_10*(1-p)

     JNT56=p5_1*p6_1*p+p5_10*p6_10*(1-p)
     JNT57=p5_1*p7_1*p+p5_10*p7_10*(1-p)
     JNT58=p5_1*p8_1*p+p5_10*p8_10*(1-p)

     JNT67=p6_1*p7_1*p+p6_10*p7_10*(1-p)
     JNT68=p6_1*p8_1*p+p6_10*p8_10*(1-p)

     JNT78=p7_1*p8_1*p+p7_10*p8_10*(1-p)

     cor12=(JNT12-p*p)/(p*(1-p))
     cor13=(JNT13-p*p)/(p*(1-p))
     cor23=(JNT23-p*p)/(p*(1-p))

     JNT0_12=(1-p2_10)*(1-p)
     JNT0_13=(1-p3_10)*(1-p)
     JNT0_14=(1-p4_10)*(1-p)
     JNT0_15=(1-p5_10)*(1-p)
     JNT0_16=(1-p6_10)*(1-p)
     JNT0_17=(1-p7_10)*(1-p)
     JNT0_18=(1-p8_10)*(1-p)

     JNT0_23=(1-p2_1)*(1-p3_1)*p+(1-p2_10)*(1-p3_10)*(1-p)
     JNT0_24=(1-p2_1)*(1-p4_1)*p+(1-p2_10)*(1-p4_10)*(1-p)
     JNT0_25=(1-p2_1)*(1-p5_1)*p+(1-p2_10)*(1-p5_10)*(1-p)
     JNT0_26=(1-p2_1)*(1-p6_1)*p+(1-p2_10)*(1-p6_10)*(1-p)
     JNT0_27=(1-p2_1)*(1-p7_1)*p+(1-p2_10)*(1-p7_10)*(1-p)
     JNT0_28=(1-p2_1)*(1-p8_1)*p+(1-p2_10)*(1-p8_10)*(1-p)

     JNT0_34=(1-p3_1)*(1-p4_1)*p+(1-p3_10)*(1-p4_10)*(1-p)
     JNT0_35=(1-p3_1)*(1-p5_1)*p+(1-p3_10)*(1-p5_10)*(1-p)
     JNT0_36=(1-p3_1)*(1-p6_1)*p+(1-p3_10)*(1-p6_10)*(1-p)
     JNT0_37=(1-p3_1)*(1-p7_1)*p+(1-p3_10)*(1-p7_10)*(1-p)
     JNT0_38=(1-p3_1)*(1-p8_1)*p+(1-p3_10)*(1-p8_10)*(1-p)

     JNT0_45=(1-p4_1)*(1-p5_1)*p+(1-p4_10)*(1-p5_10)*(1-p)
     JNT0_46=(1-p4_1)*(1-p6_1)*p+(1-p4_10)*(1-p6_10)*(1-p)
     JNT0_47=(1-p4_1)*(1-p7_1)*p+(1-p4_10)*(1-p7_10)*(1-p)
     JNT0_48=(1-p4_1)*(1-p8_1)*p+(1-p4_10)*(1-p8_10)*(1-p)

     JNT0_56=(1-p5_1)*(1-p6_1)*p+(1-p5_10)*(1-p6_10)*(1-p)
     JNT0_57=(1-p5_1)*(1-p7_1)*p+(1-p5_10)*(1-p7_10)*(1-p)
     JNT0_58=(1-p5_1)*(1-p8_1)*p+(1-p5_10)*(1-p8_10)*(1-p)

     JNT0_67=(1-p6_1)*(1-p7_1)*p+(1-p6_10)*(1-p7_10)*(1-p)
     JNT0_68=(1-p6_1)*(1-p8_1)*p+(1-p6_10)*(1-p8_10)*(1-p)

JNT0_78=(1-p7_1)*(1-p8_1)*p+(1-p7_10)*(1-p8_10)*(1-p)

     FLSa= (JNT12+JNT13+JNT14+JNT15+JNT16+JNT17+JNT18+
                               JNT23+JNT24+JNT25+JNT26+JNT27+JNT28+
                                            JNT34+JNT35+JNT36+JNT37+JNT38+
                                                         JNT45+JNT46+JNT47+JNT48+
                                                                      JNT56+JNT57+JNT58+
                                                                                   JNT67+JNT68+
                                                                                                JNT78+
                  JNT0_12+JNT0_13+JNT0_14+JNT0_15+JNT0_16+JNT0_17+JNT0_18+
                                   JNT0_23+JNT0_24+JNT0_25+JNT0_26+JNT0_27+JNT0_28+
                                                    JNT0_34+JNT0_35+JNT0_36+JNT0_37+JNT0_38+
                                                                     JNT0_45+JNT0_46+JNT0_47+JNT0_48+
                                                                                      JNT0_56+JNT0_57+JNT0_58+
                                                                                                       JNT0_67+JNT0_68+
                                                                                                                         JNT0_78)

     FLSe=28*(p^2+(1-p)^2)

     FLS=(FLSa-FLSe)/((r*(r-1)/2)-FLSe)

#return(list(Data=X, EMcor12=cor(X[,1],X[,2]), EMcor13=cor(X[,1],X[,3]),EMcor23=cor(X[,2],X[,3]), cor12=cor12,cor13=cor13, cor23=cor23))

return(list(Data=X, FLS=FLS))
}

#A1=sim(p=0.7,r=8, N=100, p2_1=0.75,p3_1=0.85,p4_1=0.8,p5_1=0.9,p6_1=0.75,p7_1=0.85,p8_1=0.8) #kappa=0.2
#A1=sim(p=0.65,r=8, N=100, p2_1=0.95,p3_1=0.85,p4_1=0.8,p5_1=0.9,p6_1=0.85,p7_1=0.95,p8_1=0.89) #kappa=0.5
#A1=sim(p=0.35,r=8, N=100, p2_1=0.9,p3_1=0.95,p4_1=0.9,p5_1=0.9,p6_1=0.95,p7_1=0.95,p8_1=0.9) #kappa=0.8

#set.seed(1)
#A1=sim(0.7,4, 10000,0.8,0.9)

#A1$cor12
#A1$EMcor12
#A1$cor13
#A1$EMcor13
#A1$cor23
#A1$EMcor23

#colSums(A1$Data)/nrow(A1$Data)

#function resample does resampling from data and compute kappa
resample = function(A){
     N=nrow(A)
     R = matrix (NA, nrow=N, ncol=2)
     for (n in 1:N){
          #do the resampling on the subjects with two or more ratings
          if (ncol(A)-sum(is.na(A[n,]))>2){
               index=which(is.na(A[n,])==FALSE)
               R[n,]=A[n,sample(index,2,replace = FALSE)]
                    }
          else{R[n,]=A[n,which(is.na(A[n,])==FALSE)]}
          }

     kp=FLSkp(R) #when applying to two raters only, Fleiss Kappa simplifies to scott's pi'
     kphat=kp[1]
     kpvar=kp[4]

# tb=table(R[,1],R[,2])
# Pa=(tb[1,1]+tb[2,2])/nrow(R)
# Pe=(mean(sum(tb[1,]),sum(tb[,1]))/nrow(R))^2+(mean(sum(tb[2,]),sum(tb[,2]))/nrow(R))^2
# scott=(Pa-Pe)/(1-Pe)

return(list(R=R,kphat=kphat,kpvar=kpvar))
}

resampleMN=function(A, J){
#J is the number of times of resampling
B=matrix(NA, nrow=J,ncol=2)
for (j in 1:J){
     stat=resample(A)
     B[j,]=cbind(stat$kphat,stat$kpvar)
     }

     #the formula is for var(sqrt(N)(betahat-beta)). Thus to compute var(betahat), we should NOT include sample size N.
# Sigma= mean(B[,2])
# Sigma=var(B[,1])*(J-1)/J
     Sigma=(mean(B[,2])-var(B[,1])*(J-1)/J)

return(list(B=B,kpbar=mean(B[,1]),EMvar=var(B[,1]),THvar=Sigma))
}

#A=rbind(
#cbind( 1 , 0 , 1 , NA ),
#cbind( 1 , 1 , 0 , 0 ),
#cbind( NA , 1 , 1 , NA ),
#cbind( 0 , 1 , NA , NA ),
#cbind( 0 , 0 , 0 , 0 ),
#cbind( NA , 1 , 1 , 1 ),
#cbind( 1 , NA , 0 , 0 ),
#cbind( NA , 0 , 0 , NA ),
#cbind( 1 , 1 , 1 , 1 ),
#cbind( 0 , 0 , 0 , NA ))

#function WGEE computes estimates based on weighted GEE ()
WGEE=function(A){
     N=nrow(A)
     R=ncol(A)
     CB=choose(R,2)
     S = matrix(0, nrow=N, ncol=CB)
     Y = matrix(0, nrow=N, ncol=R) #Y restores all the non missing value of A
     for (i in 1:N){
     #nonmissing is the collection of rater indicies of that subject
     nonmissing=which(is.na(A[i,])==FALSE)
     Y[i,1:length(nonmissing)]=A[i,nonmissing]/length(nonmissing)
     #index is the collection of all available rating pairs
     index=combn(nonmissing, 2)
     #for different i, j varies
     for (j in 1:ncol(index)){
          s=A[i,index[1,j]]+A[i,index[2,j]]
          S[i,j]=s*(2-s)/ncol(index)
     }

}
     po=1-mean(rowSums(S))
     pi=mean(rowSums(Y))
     pc=pi^2+(1-pi)^2
     kp=(po-pc)/(1-pc)

     #start computing the variance of kappa by CLT and delta method
     V1=rowSums(S)
     V2=rowSums(Y)
     V1bar=mean(V1)
     V2bar=mean(V2)
     Sigmahat=cov(cbind(V1,V2))
     dKdV1=-1/(2*V2bar-2*V2bar^2)
     dKdV2=V1bar*(1-2*V2bar)/(2*(V2bar^2)*((1-V2bar)^2) )
     dKdV=rbind(dKdV1, dKdV2)

     Vkp=t(dKdV)%*%Sigmahat%*%dKdV/N

     return(cbind(kp,Vkp) )
}

# #function EMkp computes the empirical kappa
# EMkp=function(p,r, N, p2_1,p3_1,p4_1,R,J){
# #p is the marginal probability
# #N is the sample size
# #p2_1=P(Y2=1|Y1=1); p3_1=P(Y3=1|Y1=1)
# #r is the number of raters
# #R is the number of replicates of the simulated data generated in order to compute the true Fleiss Kappa
# M=matrix(NA, nrow=R, ncol=2)
# for (m in 1:R){
# S=sim(p,r, N, p2_1,p3_1,p4_1)
# S1=S$Data
# M[m,]=cbind(kappam.fleiss(S1)$value, resampleMN(S1, J)$kphat)
# }
# return(M)
# }

#completeKP=EMkp(0.7,4, 100,0.8,0.9,0.85,10, 10)

#kpMN=mean(EMkp(0.7,4, 100,0.8,0.9,0.85,100000,1)[,1]) #check the mean of ONLY the first column of this output, it is can be considered true

#completeKP=EMkp(0.7,4, 100,0.8,0.9,0.85,1000, 1000)

#Delete1 is MCAR
Delete1 <- function(X,Pmiss){
     #p is the probability of assigning missing
     I=nrow(X)
     J=ncol(X)
     Y=X
     for (i in 1:I){
          #for each subject, randomly select half of the time points and delete by 0.5 probability
          for (j in sample(1:J,(J-2),replace=F)){if(rbinom(n=1,size=1,prob=Pmiss)==1){Y[i,j]=NA}}
     }
return(Y)
}

#Delete2 is non-ignorable 1st, depending on P(Y=1)
Delete2 <- function(X,Pmiss){
     #p is the probability of assigning missing
     I=nrow(X)
     J=ncol(X)
     Y=X
     for (i in 1:I){
          #for each subject, randomly select half of the time points and delete by 0.5 probability
          for (j in sample(1:J,(J-2),replace=F)){
               if(Y[i,j]==1){if(rbinom(n=1,size=1,prob=Pmiss)==1){Y[i,j]=NA}}
               }
     }
return(Y)
}

#Delete3 is non-ignorable 2nd, depending on the variablity of rating per subject
Delete3 <- function(X,a,b){
     #p is the probability of assigning missing
     I=nrow(X)
     J=ncol(X)
     Y=X
     for (i in 1:I){
          Pmiss=pnorm(a+b*var(Y[i,]))
        #for each subject, randomly select half of the time points and delete by 0.5 probability
        for (j in sample(1:J,(J-2),replace=F)){
               if(rbinom(n=1,size=1,prob=Pmiss)==1){Y[i,j]=NA}
               }
     }
return(Y)
}

#function EMkp computes the empirical kappa
EMmissingKP=function(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1,R,J,Pmiss,a,b){
     #p is the marginal probability
     #N is the sample size
     #p2_1=P(Y2=1|Y1=1); p3_1=P(Y3=1|Y1=1); p4_1=P(Y4=1|Y1=1)
     #r is the number of raters
     #R is the number of replicates of the simulated data generated in order to compute the true Fleiss Kappa
     M=matrix(NA, nrow=R, ncol=18)
     for (m in 1:R){
          S=sim(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1)
#delete MCAR
     S1=Delete1(S$Data,Pmiss)
     complete1=S1[apply(is.na(S1),1,sum)==0,] #complete extracts the rows without "NA"
     naive1=FLSkp(complete1)
     propose1=resampleMN(S1, J)
     WGEE1=WGEE(S1)

#delete depends on P(Y=1)
     S2=Delete2(S$Data,Pmiss)
     complete2=S2[apply(is.na(S2),1,sum)==0,] #complete extracts the rows without "NA"
     naive2=FLSkp(complete2)
     propose2=resampleMN(S2, J)
     WGEE2=WGEE(S2)

#Delete depends on the variablity of rating per subject
     S3=Delete3(S$Data,a,b)
     complete3=S3[apply(is.na(S3),1,sum)==0,] #complete extracts the rows without "NA"
     naive3=FLSkp(complete3)
     propose3=resampleMN(S3, J)
     WGEE3=WGEE(S3)

     M[m,]=cbind(naive1[1], naive1[4], propose1$kpbar, propose1$THvar,WGEE1[1],WGEE1[2],
                         naive2[1], naive2[4], propose2$kpbar, propose2$THvar,WGEE2[1],WGEE2[2],
                         naive3[1], naive3[4], propose3$kpbar, propose3$THvar,WGEE3[1],WGEE3[2])
     cat("[", m ,"] ",M[m,],"\n")
          }
     return(M)
}

p=as.double(commandArgs()[3])#marginal probability
r=as.double(commandArgs()[4])#number of raters
N=as.double(commandArgs()[5])#sample size
p2_1=as.double(commandArgs()[6])#P(Y2=1|Y1=1)
p3_1=as.double(commandArgs()[7])#P(Y3=1|Y1=1)
p4_1=as.double(commandArgs()[8])#P(Y4=1|Y1=1)
p5_1=as.double(commandArgs()[9])#P(Y5=1|Y1=1)
p6_1=as.double(commandArgs()[10])#P(Y6=1|Y1=1)
p7_1=as.double(commandArgs()[11])#P(Y7=1|Y1=1)
p8_1=as.double(commandArgs()[12])#P(Y8=1|Y1=1)

R=as.double(commandArgs()[13])#the number of replicates of the simulated data generated
J=as.double(commandArgs()[14])#the number of times of resampling
Pmiss=as.double(commandArgs()[15])#probability of assigning missingness
b=as.double(commandArgs()[16])#parameter determining probability of assigning missingness
a=-4

set.seed(1)

SimResult=EMmissingKP(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1,R,J,Pmiss,a,b)
FLS=round(sim(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1)$FLS,2)

filename = paste("/home/xiey5/Method2/result/8raterFLS",FLS,"Pmiss",Pmiss,"N",N,"b",b, ".txt",sep = "")

write.table(SimResult, filename,row.names=F,col.names=F)

#missingKP=EMmissingKP(0.7,4, 100,0.8,0.9,0.85,10, 10,0.3)

#missingKP=EMmissingKP(0.7,4, 100,0.8,0.9,0.85,1000, 1000, 0.3)

Back to Marginal analysis of measurement agreement among multiple raters with non-ignorable missing