# Marginal analysis of measurement agreement among multiple raters with non-ignorable missing

## AnalysisProj2.R

setwd("P:/projects/R_function/Chen/")

source("P:/projects/R_function/Chen/visualization.r")
#column names "STUDYID, SITEUSER, PhysicianQ5, SurgeonQ5, RspGrp, ReviewerGroup"
dim(visualization)

visualization[1:25,]

#By #1 Population, delete the subjects with NO digital image: 071320961, 071321131, 071321541, 071321751, 071322031, 071322401, 071323231, 071323581
missing=c(
which(visualization[,1]==071320961),
which(visualization[,1]==071321131),
which(visualization[,1]==071321541),
which(visualization[,1]==071321751),
which(visualization[,1]==071322031),
which(visualization[,1]==071322401),
which(visualization[,1]==071323231),
which(visualization[,1]==071323581))

#delete subjects without photo
complete = visualization[-missing,]

#extract all the records in round 101
round1 = complete[complete[,7]==101,]

#reorder all the records in round 101
round1OR = round1[order(round1[,1],round1[,2]),]

# Residents (1,2,3,4)
# 16017 16018 16019 16020

# Local experts (6,7,9,10)
# 16022 16024 16026 16027

# International Experts (5,8,11,12)
# 16021 16025 16028 16029

#surgeon (13)

rating = matrix(NA,nrow=148,ncol=13)
for (i in 1:148){rating[i,]=c(round1OR[(12*(i-1)+1):(12*i),3],round1OR[(12*i),4])}

#overall missing
sum(rating==9)
sum(rating==9)/(148*13)

sum(rating[,c(1,2,3,4,6,7,9,10)]==9)
#[1] 210
sum(rating[,c(1,2,3,4,6,7,9,10)]==9)/(148*8)
#[1] 0.1773649

N=nrow(rating)
#dichotomizing
R1=rating[,1]
R2=rating[,2]
R3=rating[,3]
R4=rating[,4]

LE1=rating[,6]
LE2=rating[,7]
LE3=rating[,9]
LE4=rating[,10]

#compute the missing rate for each rater
sum(R1==9)/148
sum(R2==9)/148
sum(R3==9)/148
sum(R4==9)/148

sum(LE1==9)/148
sum(LE2==9)/148
sum(LE3==9)/148
sum(LE4==9)/148
#finish computing the missing rate for each rater

IE1=rating[,5]
IE2=rating[,8]
IE3=rating[,11]
IE4=rating[,12]

R1[R1==9]=NA
R1[R1>0]=1
R2[R2==9]=NA
R2[R2>0]=1
R3[R3==9]=NA
R3[R3>0]=1
R4[R4==9]=NA
R4[R4>0]=1

LE1[LE1==9]=NA
LE1[LE1>0]=1
LE2[LE2==9]=NA
LE2[LE2>0]=1
LE3[LE3==9]=NA
LE3[LE3>0]=1
LE4[LE4==9]=NA
LE4[LE4>0]=1

IE1[IE1==9]=NA
IE1[IE1>0]=1
IE2[IE2==9]=NA
IE2[IE2>0]=1
IE3[IE3==9]=NA
IE3[IE3>0]=1
IE4[IE4==9]=NA
IE4[IE4>0]=1

R=cbind(R1,R2,R3,R4)
Rsdt=R[rowSums(is.na(R))<3,]
LE=cbind(LE1,LE2,LE3,LE4)
LCLEX=LE[rowSums(is.na(LE))<3,]
IE=cbind(IE1,IE2,IE3,IE4)
INTLEX=IE[rowSums(is.na(IE))<3,]

#naive
completeINTLEX=INTLEX[apply(is.na(INTLEX),1,sum)==0,]
FLSkp(completeINTLEX)

completeLCLEX=LCLEX[apply(is.na(LCLEX),1,sum)==0,]
FLSkp(completeLCLEX)

completeRsdt=Rsdt[apply(is.na(Rsdt),1,sum)==0,]
FLSkp(completeRsdt)

#resampling
resampleMN(INTLEX,200)\$kpbar
resampleMN(Rsdt,200)\$kpbar
resampleMN(LCLEX,200)\$kpbar

#weighted GEE
WGEE(INTLEX)
WGEE(Rsdt)
WGEE(LCLEX)

#missing counting
sum(rowSums(is.na(R))==0)
sum(rowSums(is.na(R))==1)
sum(rowSums(is.na(R))==2)
sum(rowSums(is.na(R))==3)
sum(rowSums(is.na(R))==4)

sum(rowSums(is.na(IE))==0)
sum(rowSums(is.na(IE))==1)
sum(rowSums(is.na(IE))==2)
sum(rowSums(is.na(IE))==3)
sum(rowSums(is.na(IE))==4)

sum(rowSums(is.na(LE))==0)
sum(rowSums(is.na(LE))==1)
sum(rowSums(is.na(LE))==2)
sum(rowSums(is.na(LE))==3)
sum(rowSums(is.na(LE))==4)

R_LE=cbind(R,LE)
rowSums(is.na(R_LE))
R_LE_available=R_LE[rowSums(is.na(R_LE))<7,]

#missing counting
sum(rowSums(is.na(R_LE))==0)
sum(rowSums(is.na(R_LE))==1)
sum(rowSums(is.na(R_LE))==2)
sum(rowSums(is.na(R_LE))==3)
sum(rowSums(is.na(R_LE))==4)
sum(rowSums(is.na(R_LE))==5)
sum(rowSums(is.na(R_LE))==6)
sum(rowSums(is.na(R_LE))==7)
sum(rowSums(is.na(R_LE))==8)

#informative cluster size
n=rowSums(is.na(R_LE_available)==F)
na=is.na(R_LE_available)==F
v=matrix(0,nrow=nrow(R_LE_available), ncol=1)
for (i in 1:nrow(R_LE_available))
{v[i,1]=var(R_LE_available[i,is.na(R_LE_available[i,])==F])}
#A=cbind(v,n)
#mean(A[A[,2]<=4,1])
#mean(A[A[,2]>4,1])
B=cbind(sqrt(v),n)
#mean(B[B[,2]>4,1])
#mean(B[B[,2]<=4,1])
#boxplot(B[,1]~B[,2])

#lm(sqrt(v)~n)

#Call:
# lm(formula = sqrt(v) ~ n)

#Coefficients:
# (Intercept) n
#0.37765 -0.02366
#plot(sqrt(v) ~ n)
#abline(0.37765, -0.02366)

mean(B[B[,2]==3,1])
mean(B[B[,2]==4,1])
mean(B[B[,2]==5,1])
mean(B[B[,2]==6,1])
mean(B[B[,2]==7,1])
mean(B[B[,2]==8,1])

#average variability (standard deviation)
mean(c(mean(B[B[,2]==3,1]), mean(B[B[,2]==4,1])) ) #4 or less ratings
mean(c(mean(B[B[,2]==5,1]), mean(B[B[,2]==6,1]), mean(B[B[,2]==7,1]), mean(B[B[,2]==8,1])) ) #5 or more ratings

E=cbind(rbind(mean(B[B[,2]==3,1]),
mean(B[B[,2]==4,1]),
mean(B[B[,2]==5,1]),
mean(B[B[,2]==6,1]),
mean(B[B[,2]==7,1]),
mean(B[B[,2]==8,1])),rbind(3,4,5,6,7,8))

postscript("P:/projects/R_function/Chen/AvgSDrating.ps",width=7,height=7,horizontal=FALSE)

barplot(E[,1],ylim=c(0,0.5),xlab="Number of ratings",ylab="Average standard deviation",names.arg=c("3", "4", "5", "6", "7", "8"))
#title("Relationship between averaged standard deviation among the available ratings and the number of available ratings")

graphics.off()

#naive
completeR_LE=R_LE_available[apply(is.na(R_LE_available),1,sum)==0,]
naive=FLSkp(completeR_LE)
naive[1]-1.96*sqrt(naive[4])
naive[1]+1.96*sqrt(naive[4])

set.seed(3)
#resampling
RSMP=resampleMN(R_LE_available,10000)
RSMP\$kpbar-1.96*sqrt(RSMP\$THvar)
RSMP\$kpbar+1.96*sqrt(RSMP\$THvar)

#weighted GEE
WGEE=WGEE(R_LE_available)
WGEE[1]-1.96*sqrt(WGEE[2])
WGEE[1]+1.96*sqrt(WGEE[2])