# Marginal analysis of measurement agreement among multiple raters with non-ignorable missing

## R_function_file.R

#This file contains the deleting data by both MCAR and Non-Ignorable 1

#function FLSkp computes Fleiss Kappa of given data set
#this function works for binary data only
FLSkp = function(B){
#B is the matrix containing binary ratings (0,1) with row standing for subjects and column standing for raters
A=B+1 #Matrix A change binary labelling from (0,1) to (1,2)
N=nrow(A) #number of subjects
n=ncol(A) #number of ratings per subject
R=matrix(0,nrow=N, ncol=2)
for (i in 1:N){
for (j in 1:2){
R[i,j]=sum(A[i,]==j)
}
}
p=colSums(R)/(N*n)
kp=(sum(R^2)-N*n*(1+(n-1)*sum(p^2)))/(N*n*(n-1)*(1-sum(p^2)))
FLSvar=2*(sum(p^2)-(2*n-3)*(sum(p^2))^2+2*(n-2)*sum(p^3))/(N*n*(n-1)*((1-sum(p^2))^2))

r=rowSums(A-1)
pi1=mean(r/n) # R-1 changes binary rating from (1,2) back to (0,1)
pi0=mean((n-r)/n )
pe_pi=pi0^2+pi1^2
pa=mean(r*(r-1)/(n*(n-1)) + (n-r)*(n-r-1)/(n*(n-1)))
gamma_pi=(pa-pe_pi)/(1-pe_pi)
#return(c(pe_pi,pa))

#the following method is based on Gwet (2008)
GM=matrix(NA,nrow=N, ncol=2)
for (i in 1:N){
pa_i=r[i]*(r[i]-1)/(n*(n-1)) + (n-r[i])*(n-r[i]-1)/(n*(n-1))
pe_pi_i=pi1*r[i]/n + pi0*(n-r[i])/n
gamma_pi_i=(pa_i-pe_pi)/(1-pe_pi)
gamma_pi_i_star=gamma_pi_i-2*(1-gamma_pi)*((pe_pi_i-pe_pi)/(1-pe_pi))
GM[i,1]=gamma_pi_i_star
GM[i,2]=(gamma_pi_i_star-gamma_pi)^2
}

VGM=sum(GM[,2])/(N*(N-1))
return(c(kp,gamma_pi,FLSvar,VGM))
}

##################################################################
############################ 8 raters ############################
##################################################################

#function sim generates data
sim=function(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1){
#We require r=4.
#p is the marginal probability
#N is the sample size
#p2_1=P(Y2=1|Y1=1); p3_1=P(Y3=1|Y1=1)
#r is the number of raters
X=matrix(NA, nrow=N, ncol=r)
for (i in 1:N){
X[i,1]=rbinom(1,1,p)
if (X[i,1]==1){X[i,2]=rbinom(1,1,p2_1)}
if (X[i,1]==0){X[i,2]=rbinom(1,1,((p-p*p2_1)/(1-p)))}

if (X[i,1]==1){X[i,3]=rbinom(1,1,p3_1)}
if (X[i,1]==0){X[i,3]=rbinom(1,1,((p-p*p3_1)/(1-p)))}

if (X[i,1]==1){X[i,4]=rbinom(1,1,p4_1)}
if (X[i,1]==0){X[i,4]=rbinom(1,1,((p-p*p4_1)/(1-p)))}

if (X[i,1]==1){X[i,5]=rbinom(1,1,p5_1)}
if (X[i,1]==0){X[i,5]=rbinom(1,1,((p-p*p5_1)/(1-p)))}

if (X[i,1]==1){X[i,6]=rbinom(1,1,p6_1)}
if (X[i,1]==0){X[i,6]=rbinom(1,1,((p-p*p6_1)/(1-p)))}

if (X[i,1]==1){X[i,7]=rbinom(1,1,p7_1)}
if (X[i,1]==0){X[i,7]=rbinom(1,1,((p-p*p7_1)/(1-p)))}

if (X[i,1]==1){X[i,8]=rbinom(1,1,p8_1)}
if (X[i,1]==0){X[i,8]=rbinom(1,1,((p-p*p8_1)/(1-p)))}

}
#JNT stands for joint probability
JNT12=p*p2_1
JNT13=p*p3_1
JNT14=p*p4_1
JNT15=p*p5_1
JNT16=p*p6_1
JNT17=p*p7_1
JNT18=p*p8_1

#p2_10=P(Y2=1|Y1=0), p3_10=P(Y3=1|Y1=0), p4_10=P(Y4=1|Y1=0)
p2_10=(p/(1-p))*(1-JNT12/p)
p3_10=(p/(1-p))*(1-JNT13/p)
p4_10=(p/(1-p))*(1-JNT14/p)
p5_10=(p/(1-p))*(1-JNT15/p)
p6_10=(p/(1-p))*(1-JNT16/p)
p7_10=(p/(1-p))*(1-JNT17/p)
p8_10=(p/(1-p))*(1-JNT18/p)

JNT23=p2_1*p3_1*p+p2_10*p3_10*(1-p)
JNT24=p2_1*p4_1*p+p2_10*p4_10*(1-p)
JNT25=p2_1*p5_1*p+p2_10*p5_10*(1-p)
JNT26=p2_1*p6_1*p+p2_10*p6_10*(1-p)
JNT27=p2_1*p7_1*p+p2_10*p7_10*(1-p)
JNT28=p2_1*p8_1*p+p2_10*p8_10*(1-p)

JNT34=p3_1*p4_1*p+p3_10*p4_10*(1-p)
JNT35=p3_1*p5_1*p+p3_10*p5_10*(1-p)
JNT36=p3_1*p6_1*p+p3_10*p6_10*(1-p)
JNT37=p3_1*p7_1*p+p3_10*p7_10*(1-p)
JNT38=p3_1*p8_1*p+p3_10*p8_10*(1-p)

JNT45=p4_1*p5_1*p+p4_10*p5_10*(1-p)
JNT46=p4_1*p6_1*p+p4_10*p6_10*(1-p)
JNT47=p4_1*p7_1*p+p4_10*p7_10*(1-p)
JNT48=p4_1*p8_1*p+p4_10*p8_10*(1-p)

JNT56=p5_1*p6_1*p+p5_10*p6_10*(1-p)
JNT57=p5_1*p7_1*p+p5_10*p7_10*(1-p)
JNT58=p5_1*p8_1*p+p5_10*p8_10*(1-p)

JNT67=p6_1*p7_1*p+p6_10*p7_10*(1-p)
JNT68=p6_1*p8_1*p+p6_10*p8_10*(1-p)

JNT78=p7_1*p8_1*p+p7_10*p8_10*(1-p)

cor12=(JNT12-p*p)/(p*(1-p))
cor13=(JNT13-p*p)/(p*(1-p))
cor23=(JNT23-p*p)/(p*(1-p))

JNT0_12=(1-p2_10)*(1-p)
JNT0_13=(1-p3_10)*(1-p)
JNT0_14=(1-p4_10)*(1-p)
JNT0_15=(1-p5_10)*(1-p)
JNT0_16=(1-p6_10)*(1-p)
JNT0_17=(1-p7_10)*(1-p)
JNT0_18=(1-p8_10)*(1-p)

JNT0_23=(1-p2_1)*(1-p3_1)*p+(1-p2_10)*(1-p3_10)*(1-p)
JNT0_24=(1-p2_1)*(1-p4_1)*p+(1-p2_10)*(1-p4_10)*(1-p)
JNT0_25=(1-p2_1)*(1-p5_1)*p+(1-p2_10)*(1-p5_10)*(1-p)
JNT0_26=(1-p2_1)*(1-p6_1)*p+(1-p2_10)*(1-p6_10)*(1-p)
JNT0_27=(1-p2_1)*(1-p7_1)*p+(1-p2_10)*(1-p7_10)*(1-p)
JNT0_28=(1-p2_1)*(1-p8_1)*p+(1-p2_10)*(1-p8_10)*(1-p)

JNT0_34=(1-p3_1)*(1-p4_1)*p+(1-p3_10)*(1-p4_10)*(1-p)
JNT0_35=(1-p3_1)*(1-p5_1)*p+(1-p3_10)*(1-p5_10)*(1-p)
JNT0_36=(1-p3_1)*(1-p6_1)*p+(1-p3_10)*(1-p6_10)*(1-p)
JNT0_37=(1-p3_1)*(1-p7_1)*p+(1-p3_10)*(1-p7_10)*(1-p)
JNT0_38=(1-p3_1)*(1-p8_1)*p+(1-p3_10)*(1-p8_10)*(1-p)

JNT0_45=(1-p4_1)*(1-p5_1)*p+(1-p4_10)*(1-p5_10)*(1-p)
JNT0_46=(1-p4_1)*(1-p6_1)*p+(1-p4_10)*(1-p6_10)*(1-p)
JNT0_47=(1-p4_1)*(1-p7_1)*p+(1-p4_10)*(1-p7_10)*(1-p)
JNT0_48=(1-p4_1)*(1-p8_1)*p+(1-p4_10)*(1-p8_10)*(1-p)

JNT0_56=(1-p5_1)*(1-p6_1)*p+(1-p5_10)*(1-p6_10)*(1-p)
JNT0_57=(1-p5_1)*(1-p7_1)*p+(1-p5_10)*(1-p7_10)*(1-p)
JNT0_58=(1-p5_1)*(1-p8_1)*p+(1-p5_10)*(1-p8_10)*(1-p)

JNT0_67=(1-p6_1)*(1-p7_1)*p+(1-p6_10)*(1-p7_10)*(1-p)
JNT0_68=(1-p6_1)*(1-p8_1)*p+(1-p6_10)*(1-p8_10)*(1-p)

JNT0_78=(1-p7_1)*(1-p8_1)*p+(1-p7_10)*(1-p8_10)*(1-p)

FLSa= (JNT12+JNT13+JNT14+JNT15+JNT16+JNT17+JNT18+
JNT23+JNT24+JNT25+JNT26+JNT27+JNT28+
JNT34+JNT35+JNT36+JNT37+JNT38+
JNT45+JNT46+JNT47+JNT48+
JNT56+JNT57+JNT58+
JNT67+JNT68+
JNT78+
JNT0_12+JNT0_13+JNT0_14+JNT0_15+JNT0_16+JNT0_17+JNT0_18+
JNT0_23+JNT0_24+JNT0_25+JNT0_26+JNT0_27+JNT0_28+
JNT0_34+JNT0_35+JNT0_36+JNT0_37+JNT0_38+
JNT0_45+JNT0_46+JNT0_47+JNT0_48+
JNT0_56+JNT0_57+JNT0_58+
JNT0_67+JNT0_68+
JNT0_78)

FLSe=28*(p^2+(1-p)^2)

FLS=(FLSa-FLSe)/((r*(r-1)/2)-FLSe)

#return(list(Data=X, EMcor12=cor(X[,1],X[,2]), EMcor13=cor(X[,1],X[,3]),EMcor23=cor(X[,2],X[,3]), cor12=cor12,cor13=cor13, cor23=cor23))

return(list(Data=X, FLS=FLS))
}

#A1=sim(p=0.7,r=8, N=100, p2_1=0.75,p3_1=0.85,p4_1=0.8,p5_1=0.9,p6_1=0.75,p7_1=0.85,p8_1=0.8) #kappa=0.2
#A1=sim(p=0.65,r=8, N=100, p2_1=0.95,p3_1=0.85,p4_1=0.8,p5_1=0.9,p6_1=0.85,p7_1=0.95,p8_1=0.89) #kappa=0.5
#A1=sim(p=0.35,r=8, N=100, p2_1=0.9,p3_1=0.95,p4_1=0.9,p5_1=0.9,p6_1=0.95,p7_1=0.95,p8_1=0.9) #kappa=0.8

#set.seed(1)
#A1=sim(0.7,4, 10000,0.8,0.9)

#A1\$cor12
#A1\$EMcor12
#A1\$cor13
#A1\$EMcor13
#A1\$cor23
#A1\$EMcor23

#colSums(A1\$Data)/nrow(A1\$Data)

#function resample does resampling from data and compute kappa
resample = function(A){
N=nrow(A)
R = matrix (NA, nrow=N, ncol=2)
for (n in 1:N){
#do the resampling on the subjects with two or more ratings
if (ncol(A)-sum(is.na(A[n,]))>2){
index=which(is.na(A[n,])==FALSE)
R[n,]=A[n,sample(index,2,replace = FALSE)]
}
else{R[n,]=A[n,which(is.na(A[n,])==FALSE)]}
}

kp=FLSkp(R) #when applying to two raters only, Fleiss Kappa simplifies to scott's pi'
kphat=kp[1]
kpvar=kp[4]

# tb=table(R[,1],R[,2])
# Pa=(tb[1,1]+tb[2,2])/nrow(R)
# Pe=(mean(sum(tb[1,]),sum(tb[,1]))/nrow(R))^2+(mean(sum(tb[2,]),sum(tb[,2]))/nrow(R))^2
# scott=(Pa-Pe)/(1-Pe)

return(list(R=R,kphat=kphat,kpvar=kpvar))
}

resampleMN=function(A, J){
#J is the number of times of resampling
B=matrix(NA, nrow=J,ncol=2)
for (j in 1:J){
stat=resample(A)
B[j,]=cbind(stat\$kphat,stat\$kpvar)
}

#the formula is for var(sqrt(N)(betahat-beta)). Thus to compute var(betahat), we should NOT include sample size N.
# Sigma= mean(B[,2])
# Sigma=var(B[,1])*(J-1)/J
Sigma=(mean(B[,2])-var(B[,1])*(J-1)/J)

return(list(B=B,kpbar=mean(B[,1]),EMvar=var(B[,1]),THvar=Sigma))
}

#A=rbind(
#cbind( 1 , 0 , 1 , NA ),
#cbind( 1 , 1 , 0 , 0 ),
#cbind( NA , 1 , 1 , NA ),
#cbind( 0 , 1 , NA , NA ),
#cbind( 0 , 0 , 0 , 0 ),
#cbind( NA , 1 , 1 , 1 ),
#cbind( 1 , NA , 0 , 0 ),
#cbind( NA , 0 , 0 , NA ),
#cbind( 1 , 1 , 1 , 1 ),
#cbind( 0 , 0 , 0 , NA ))

#function WGEE computes estimates based on weighted GEE ()
WGEE=function(A){
N=nrow(A)
R=ncol(A)
CB=choose(R,2)
S = matrix(0, nrow=N, ncol=CB)
Y = matrix(0, nrow=N, ncol=R) #Y restores all the non missing value of A
for (i in 1:N){
#nonmissing is the collection of rater indicies of that subject
nonmissing=which(is.na(A[i,])==FALSE)
Y[i,1:length(nonmissing)]=A[i,nonmissing]/length(nonmissing)
#index is the collection of all available rating pairs
index=combn(nonmissing, 2)
#for different i, j varies
for (j in 1:ncol(index)){
s=A[i,index[1,j]]+A[i,index[2,j]]
S[i,j]=s*(2-s)/ncol(index)
}

}
po=1-mean(rowSums(S))
pi=mean(rowSums(Y))
pc=pi^2+(1-pi)^2
kp=(po-pc)/(1-pc)

#start computing the variance of kappa by CLT and delta method
V1=rowSums(S)
V2=rowSums(Y)
V1bar=mean(V1)
V2bar=mean(V2)
Sigmahat=cov(cbind(V1,V2))
dKdV1=-1/(2*V2bar-2*V2bar^2)
dKdV2=V1bar*(1-2*V2bar)/(2*(V2bar^2)*((1-V2bar)^2) )
dKdV=rbind(dKdV1, dKdV2)

Vkp=t(dKdV)%*%Sigmahat%*%dKdV/N

return(cbind(kp,Vkp) )
}

# #function EMkp computes the empirical kappa
# EMkp=function(p,r, N, p2_1,p3_1,p4_1,R,J){
# #p is the marginal probability
# #N is the sample size
# #p2_1=P(Y2=1|Y1=1); p3_1=P(Y3=1|Y1=1)
# #r is the number of raters
# #R is the number of replicates of the simulated data generated in order to compute the true Fleiss Kappa
# M=matrix(NA, nrow=R, ncol=2)
# for (m in 1:R){
# S=sim(p,r, N, p2_1,p3_1,p4_1)
# S1=S\$Data
# M[m,]=cbind(kappam.fleiss(S1)\$value, resampleMN(S1, J)\$kphat)
# }
# return(M)
# }

#completeKP=EMkp(0.7,4, 100,0.8,0.9,0.85,10, 10)

#kpMN=mean(EMkp(0.7,4, 100,0.8,0.9,0.85,100000,1)[,1]) #check the mean of ONLY the first column of this output, it is can be considered true

#completeKP=EMkp(0.7,4, 100,0.8,0.9,0.85,1000, 1000)

#Delete1 is MCAR
Delete1 <- function(X,Pmiss){
#p is the probability of assigning missing
I=nrow(X)
J=ncol(X)
Y=X
for (i in 1:I){
#for each subject, randomly select half of the time points and delete by 0.5 probability
for (j in sample(1:J,(J-2),replace=F)){if(rbinom(n=1,size=1,prob=Pmiss)==1){Y[i,j]=NA}}
}
return(Y)
}

#Delete2 is non-ignorable 1st, depending on P(Y=1)
Delete2 <- function(X,Pmiss){
#p is the probability of assigning missing
I=nrow(X)
J=ncol(X)
Y=X
for (i in 1:I){
#for each subject, randomly select half of the time points and delete by 0.5 probability
for (j in sample(1:J,(J-2),replace=F)){
if(Y[i,j]==1){if(rbinom(n=1,size=1,prob=Pmiss)==1){Y[i,j]=NA}}
}
}
return(Y)
}

#Delete3 is non-ignorable 2nd, depending on the variablity of rating per subject
Delete3 <- function(X,a,b){
#p is the probability of assigning missing
I=nrow(X)
J=ncol(X)
Y=X
for (i in 1:I){
Pmiss=pnorm(a+b*var(Y[i,]))
#for each subject, randomly select half of the time points and delete by 0.5 probability
for (j in sample(1:J,(J-2),replace=F)){
if(rbinom(n=1,size=1,prob=Pmiss)==1){Y[i,j]=NA}
}
}
return(Y)
}

#function EMkp computes the empirical kappa
EMmissingKP=function(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1,R,J,Pmiss,a,b){
#p is the marginal probability
#N is the sample size
#p2_1=P(Y2=1|Y1=1); p3_1=P(Y3=1|Y1=1); p4_1=P(Y4=1|Y1=1)
#r is the number of raters
#R is the number of replicates of the simulated data generated in order to compute the true Fleiss Kappa
M=matrix(NA, nrow=R, ncol=18)
for (m in 1:R){
S=sim(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1)
#delete MCAR
S1=Delete1(S\$Data,Pmiss)
complete1=S1[apply(is.na(S1),1,sum)==0,] #complete extracts the rows without "NA"
naive1=FLSkp(complete1)
propose1=resampleMN(S1, J)
WGEE1=WGEE(S1)

#delete depends on P(Y=1)
S2=Delete2(S\$Data,Pmiss)
complete2=S2[apply(is.na(S2),1,sum)==0,] #complete extracts the rows without "NA"
naive2=FLSkp(complete2)
propose2=resampleMN(S2, J)
WGEE2=WGEE(S2)

#Delete depends on the variablity of rating per subject
S3=Delete3(S\$Data,a,b)
complete3=S3[apply(is.na(S3),1,sum)==0,] #complete extracts the rows without "NA"
naive3=FLSkp(complete3)
propose3=resampleMN(S3, J)
WGEE3=WGEE(S3)

M[m,]=cbind(naive1[1], naive1[4], propose1\$kpbar, propose1\$THvar,WGEE1[1],WGEE1[2],
naive2[1], naive2[4], propose2\$kpbar, propose2\$THvar,WGEE2[1],WGEE2[2],
naive3[1], naive3[4], propose3\$kpbar, propose3\$THvar,WGEE3[1],WGEE3[2])
cat("[", m ,"] ",M[m,],"\n")
}
return(M)
}

p=as.double(commandArgs()[3])#marginal probability
r=as.double(commandArgs()[4])#number of raters
N=as.double(commandArgs()[5])#sample size
p2_1=as.double(commandArgs()[6])#P(Y2=1|Y1=1)
p3_1=as.double(commandArgs()[7])#P(Y3=1|Y1=1)
p4_1=as.double(commandArgs()[8])#P(Y4=1|Y1=1)
p5_1=as.double(commandArgs()[9])#P(Y5=1|Y1=1)
p6_1=as.double(commandArgs()[10])#P(Y6=1|Y1=1)
p7_1=as.double(commandArgs()[11])#P(Y7=1|Y1=1)
p8_1=as.double(commandArgs()[12])#P(Y8=1|Y1=1)

R=as.double(commandArgs()[13])#the number of replicates of the simulated data generated
J=as.double(commandArgs()[14])#the number of times of resampling
Pmiss=as.double(commandArgs()[15])#probability of assigning missingness
b=as.double(commandArgs()[16])#parameter determining probability of assigning missingness
a=-4

set.seed(1)

SimResult=EMmissingKP(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1,R,J,Pmiss,a,b)
FLS=round(sim(p,r, N, p2_1,p3_1,p4_1,p5_1,p6_1,p7_1,p8_1)\$FLS,2)

filename = paste("/home/xiey5/Method2/result/8raterFLS",FLS,"Pmiss",Pmiss,"N",N,"b",b, ".txt",sep = "")

write.table(SimResult, filename,row.names=F,col.names=F)

#missingKP=EMmissingKP(0.7,4, 100,0.8,0.9,0.85,10, 10,0.3)

#missingKP=EMmissingKP(0.7,4, 100,0.8,0.9,0.85,1000, 1000, 0.3)