Esta es una derivación de esta pregunta: ¿Cómo comparar dos grupos con múltiples mediciones para cada individuo con R?
En las respuestas allí (si entendí correctamente) aprendí que la varianza dentro del sujeto no afecta las inferencias hechas sobre las medias grupales y que está bien simplemente tomar los promedios de las medias para calcular la media grupal, luego calcular la varianza dentro del grupo y usar eso para realizar pruebas de significación. Me gustaría usar un método en el que cuanto mayor sea la varianza dentro del sujeto, menos seguro estoy de lo que significa el grupo o entender por qué no tiene sentido desear eso.
Aquí hay una gráfica de los datos originales junto con algunos datos simulados que usaron las mismas medias de sujeto, pero muestrearon las mediciones individuales para cada sujeto de una distribución normal usando esas medias y una pequeña varianza dentro del sujeto (sd = .1). Como se puede ver, los intervalos de confianza a nivel de grupo (fila inferior) no se ven afectados por esto (al menos la forma en que los calculé).
También usé rjags para estimar las medias grupales de tres maneras. 1) Use los datos originales sin procesar 2) Use solo los medios de Sujeto 3) Use los datos simulados con pequeños SD dentro del sujeto
Los resultados están abajo. Usando este método, vemos que los intervalos creíbles del 95% son más estrechos en los casos # 2 y # 3. Esto cumple con mi intuición de lo que me gustaría que ocurriera al hacer inferencias sobre los medios grupales, pero no estoy seguro de si esto es solo un artefacto de mi modelo o una propiedad de intervalos creíbles.
Nota. Para usar rjags, primero debe instalar JAGS desde aquí: http://sourceforge.net/projects/mcmc-jags/files/
Los diversos códigos están abajo.
Los datos originales:
structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3,
3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6,
6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10,
10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12,
12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15,
15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18,
18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8,
22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20,
6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16,
2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8,
12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8,
10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26,
20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list(
NULL, c("Group", "Subject", "Value")))
Obtener medias del sujeto y simular los datos con una pequeña variación dentro del sujeto:
#Get Subject Means
means<-aggregate(Value~Group+Subject, data=dat, FUN=mean)
#Initialize "dat2" dataframe
dat2<-dat
#Sample individual measurements for each subject
temp=NULL
for(i in 1:nrow(means)){
temp<-c(temp,rnorm(6,means[i,3], .1))
}
#Set Simulated values
dat2[,3]<-temp
La función para adaptarse al modelo JAGS:
require(rjags)
#Jags fit function
jags.fit<-function(dat2){
#Create JAGS model
modelstring = "
model{
for(n in 1:Ndata){
y[n]~dnorm(mu[subj[n]],tau[subj[n]]) T(0, )
}
for(s in 1:Nsubj){
mu[s]~dnorm(muG,tauG) T(0, )
tau[s] ~ dgamma(5,5)
}
muG~dnorm(10,.01) T(0, )
tauG~dgamma(1,1)
}
"
writeLines(modelstring,con="model.txt")
#############
#Format Data
Ndata = nrow(dat2)
subj = as.integer( factor( dat2$Subject ,
levels=unique(dat2$Subject ) ) )
Nsubj = length(unique(subj))
y = as.numeric(dat2$Value)
dataList = list(
Ndata = Ndata ,
Nsubj = Nsubj ,
subj = subj ,
y = y
)
#Nodes to monitor
parameters=c("muG","tauG","mu","tau")
#MCMC Settings
adaptSteps = 1000
burnInSteps = 1000
nChains = 1
numSavedSteps= nChains*10000
thinSteps=20
nPerChain = ceiling( ( numSavedSteps * thinSteps ) / nChains )
#Create Model
jagsModel = jags.model( "model.txt" , data=dataList,
n.chains=nChains , n.adapt=adaptSteps , quiet=FALSE )
# Burn-in:
cat( "Burning in the MCMC chain...\n" )
update( jagsModel , n.iter=burnInSteps )
# Getting DIC data:
load.module("dic")
# The saved MCMC chain:
cat( "Sampling final MCMC chain...\n" )
codaSamples = coda.samples( jagsModel , variable.names=parameters ,
n.iter=nPerChain , thin=thinSteps )
mcmcChain = as.matrix( codaSamples )
result = list(codaSamples=codaSamples, mcmcChain=mcmcChain)
}
Ajuste el modelo a cada grupo de cada conjunto de datos:
#Fit to raw data
groupA<-jags.fit(dat[which(dat[,1]==1),])
groupB<-jags.fit(dat[which(dat[,1]==2),])
groupC<-jags.fit(dat[which(dat[,1]==3),])
#Fit to subject mean data
groupA2<-jags.fit(means[which(means[,1]==1),])
groupB2<-jags.fit(means[which(means[,1]==2),])
groupC2<-jags.fit(means[which(means[,1]==3),])
#Fit to simulated raw data (within-subject sd=.1)
groupA3<-jags.fit(dat2[which(dat2[,1]==1),])
groupB3<-jags.fit(dat2[which(dat2[,1]==2),])
groupC3<-jags.fit(dat2[which(dat2[,1]==3),])
Intervalo creíble / función de intervalo de mayor densidad:
#HDI Function
get.HDI<-function(sampleVec,credMass){
sortedPts = sort( sampleVec )
ciIdxInc = floor( credMass * length( sortedPts ) )
nCIs = length( sortedPts ) - ciIdxInc
ciWidth = rep( 0 , nCIs )
for ( i in 1:nCIs ) {
ciWidth[ i ] = sortedPts[ i + ciIdxInc ] - sortedPts[ i ]
}
HDImin = sortedPts[ which.min( ciWidth ) ]
HDImax = sortedPts[ which.min( ciWidth ) + ciIdxInc ]
HDIlim = c( HDImin , HDImax, credMass )
return( HDIlim )
}
Primera trama:
layout(matrix(c(1,1,2,2,3,4),nrow=3,ncol=2, byrow=T))
boxplot(dat[,3]~dat[,2],
xlab="Subject", ylab="Value", ylim=c(0, 1.2*max(dat[,3])),
col=c(rep("Red",length(which(dat[,1]==unique(dat[,1])[1]))/6),
rep("Green",length(which(dat[,1]==unique(dat[,1])[2]))/6),
rep("Blue",length(which(dat[,1]==unique(dat[,1])[3]))/6)
),
main="Original Data"
)
stripchart(dat[,3]~dat[,2], vert=T, add=T, pch=16)
legend("topleft", legend=c("Group A", "Group B", "Group C", "Individual Means +/- 95% CI"),
col=c("Red","Green","Blue", "Grey"), lwd=3, bty="n", pch=c(15),
pt.cex=c(rep(0.1,3),1),
ncol=3)
for(i in 1:length(unique(dat[,2]))){
m<-mean(examp[which(dat[,2]==unique(dat[,2])[i]),3])
ci<-t.test(dat[which(dat[,2]==unique(dat[,2])[i]),3])$conf.int[1:2]
points(i-.3,m, pch=15,cex=1.5, col="Grey")
segments(i-.3,
ci[1],i-.3,
ci[2], lwd=4, col="Grey"
)
}
boxplot(dat2[,3]~dat2[,2],
xlab="Subject", ylab="Value", ylim=c(0, 1.2*max(dat2[,3])),
col=c(rep("Red",length(which(dat2[,1]==unique(dat2[,1])[1]))/6),
rep("Green",length(which(dat2[,1]==unique(dat2[,1])[2]))/6),
rep("Blue",length(which(dat2[,1]==unique(dat2[,1])[3]))/6)
),
main=c("Simulated Data", "Same Subject Means but Within-Subject SD=.1")
)
stripchart(dat2[,3]~dat2[,2], vert=T, add=T, pch=16)
legend("topleft", legend=c("Group A", "Group B", "Group C", "Individual Means +/- 95% CI"),
col=c("Red","Green","Blue", "Grey"), lwd=3, bty="n", pch=c(15),
pt.cex=c(rep(0.1,3),1),
ncol=3)
for(i in 1:length(unique(dat2[,2]))){
m<-mean(examp[which(dat2[,2]==unique(dat2[,2])[i]),3])
ci<-t.test(dat2[which(dat2[,2]==unique(dat2[,2])[i]),3])$conf.int[1:2]
points(i-.3,m, pch=15,cex=1.5, col="Grey")
segments(i-.3,
ci[1],i-.3,
ci[2], lwd=4, col="Grey"
)
}
means<-aggregate(Value~Group+Subject, data=dat, FUN=mean)
boxplot(means[,3]~means[,1], col=c("Red","Green","Blue"),
ylim=c(0,1.2*max(means[,3])), ylab="Value", xlab="Group",
main="Original Data"
)
stripchart(means[,3]~means[,1], pch=16, vert=T, add=T)
for(i in 1:length(unique(means[,1]))){
m<-mean(means[which(means[,1]==unique(means[,1])[i]),3])
ci<-t.test(means[which(means[,1]==unique(means[,1])[i]),3])$conf.int[1:2]
points(i-.3,m, pch=15,cex=1.5, col="Grey")
segments(i-.3,
ci[1],i-.3,
ci[2], lwd=4, col="Grey"
)
}
legend("topleft", legend=c("Group Means +/- 95% CI"), bty="n", pch=15, lwd=3, col="Grey")
means2<-aggregate(Value~Group+Subject, data=dat2, FUN=mean)
boxplot(means2[,3]~means2[,1], col=c("Red","Green","Blue"),
ylim=c(0,1.2*max(means2[,3])), ylab="Value", xlab="Group",
main="Simulated Data Group Averages"
)
stripchart(means2[,3]~means2[,1], pch=16, vert=T, add=T)
for(i in 1:length(unique(means2[,1]))){
m<-mean(means[which(means2[,1]==unique(means2[,1])[i]),3])
ci<-t.test(means[which(means2[,1]==unique(means2[,1])[i]),3])$conf.int[1:2]
points(i-.3,m, pch=15,cex=1.5, col="Grey")
segments(i-.3,
ci[1],i-.3,
ci[2], lwd=4, col="Grey"
)
}
legend("topleft", legend=c("Group Means +/- 95% CI"), bty="n", pch=15, lwd=3, col="Grey")
Segunda trama:
layout(matrix(c(1,2,3,4,4,4,5,5,5,6,6,6),nrow=4,ncol=3, byrow=T))
#Plot priors
plot(seq(0,10,by=.01),dgamma(seq(0,10,by=.01),5,5), type="l", lwd=4,
xlab="Value", ylab="Density",
main="Prior on Within-Subject Precision"
)
plot(seq(0,10,by=.01),dgamma(seq(0,10,by=.01),1,1), type="l", lwd=4,
xlab="Value", ylab="Density",
main="Prior on Within-Group Precision"
)
plot(seq(0,300,by=.01),dnorm(seq(0,300,by=.01),10,100), type="l", lwd=4,
xlab="Value", ylab="Density",
main="Prior on Group Means"
)
#Set overall xmax value
x.max<-1.1*max(groupA$mcmcChain[,"muG"],groupB$mcmcChain[,"muG"],groupC$mcmcChain[,"muG"],
groupA2$mcmcChain[,"muG"],groupB2$mcmcChain[,"muG"],groupC2$mcmcChain[,"muG"],
groupA3$mcmcChain[,"muG"],groupB3$mcmcChain[,"muG"],groupC3$mcmcChain[,"muG"]
)
#Plot result for raw data
#Set ymax
y.max<-1.1*max(density(groupA$mcmcChain[,"muG"])$y,density(groupB$mcmcChain[,"muG"])$y,density(groupC$mcmcChain[,"muG"])$y)
plot(density(groupA$mcmcChain[,"muG"]),xlim=c(0,x.max),
ylim=c(-.1*y.max,y.max), lwd=3, col="Red",
main="Group Mean Estimates: Fit to Raw Data", xlab="Value"
)
lines(density(groupB$mcmcChain[,"muG"]), lwd=3, col="Green")
lines(density(groupC$mcmcChain[,"muG"]), lwd=3, col="Blue")
hdi<-get.HDI(groupA$mcmcChain[,"muG"], .95)
segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col="Red")
hdi<-get.HDI(groupB$mcmcChain[,"muG"], .95)
segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col="Green")
hdi<-get.HDI(groupC$mcmcChain[,"muG"], .95)
segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col="Blue")
####
#Plot result for mean data
#x.max<-1.1*max(groupA2$mcmcChain[,"muG"],groupB2$mcmcChain[,"muG"],groupC2$mcmcChain[,"muG"])
y.max<-1.1*max(density(groupA2$mcmcChain[,"muG"])$y,density(groupB2$mcmcChain[,"muG"])$y,density(groupC2$mcmcChain[,"muG"])$y)
plot(density(groupA2$mcmcChain[,"muG"]),xlim=c(0,x.max),
ylim=c(-.1*y.max,y.max), lwd=3, col="Red",
main="Group Mean Estimates: Fit to Subject Means", xlab="Value"
)
lines(density(groupB2$mcmcChain[,"muG"]), lwd=3, col="Green")
lines(density(groupC2$mcmcChain[,"muG"]), lwd=3, col="Blue")
hdi<-get.HDI(groupA2$mcmcChain[,"muG"], .95)
segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col="Red")
hdi<-get.HDI(groupB2$mcmcChain[,"muG"], .95)
segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col="Green")
hdi<-get.HDI(groupC2$mcmcChain[,"muG"], .95)
segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col="Blue")
####
#Plot result for simulated data
#Set ymax
#x.max<-1.1*max(groupA3$mcmcChain[,"muG"],groupB3$mcmcChain[,"muG"],groupC3$mcmcChain[,"muG"])
y.max<-1.1*max(density(groupA3$mcmcChain[,"muG"])$y,density(groupB3$mcmcChain[,"muG"])$y,density(groupC3$mcmcChain[,"muG"])$y)
plot(density(groupA3$mcmcChain[,"muG"]),xlim=c(0,x.max),
ylim=c(-.1*y.max,y.max), lwd=3, col="Red",
main=c("Group Mean Estimates: Fit to Simulated data", "(Within-Subject SD=0.1)"), xlab="Value"
)
lines(density(groupB3$mcmcChain[,"muG"]), lwd=3, col="Green")
lines(density(groupC3$mcmcChain[,"muG"]), lwd=3, col="Blue")
hdi<-get.HDI(groupA3$mcmcChain[,"muG"], .95)
segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col="Red")
hdi<-get.HDI(groupB3$mcmcChain[,"muG"], .95)
segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col="Green")
hdi<-get.HDI(groupC3$mcmcChain[,"muG"], .95)
segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col="Blue")
EDITAR con mi versión personal de la respuesta de @ StéphaneLaurent
Usé el modelo que describió para tomar muestras de una distribución normal con media = 0, entre la varianza del sujeto = 1 y dentro del error / varianza del sujeto = 0.1,1,10,100. Un subconjunto de los intervalos de confianza se muestra en los paneles izquierdos, mientras que la distribución de sus anchos se muestra en los paneles derechos correspondientes. Esto me ha convencido de que él es 100% correcto. Sin embargo, todavía estoy confundido con mi ejemplo anterior, pero seguiré con una nueva pregunta más centrada.
El código para la simulación y los gráficos anteriores:
dev.new()
par(mfrow=c(4,2))
num.sims<-10000
sigmaWvals<-c(.1,1,10,100)
muG<-0 #Grand Mean
sigma.between<-1 #Between Experiment sd
for(sigma.w in sigmaWvals){
sigma.within<-sigma.w #Within Experiment sd
out=matrix(nrow=num.sims,ncol=2)
for(i in 1:num.sims){
#Sample the three experiment means (mui, i=1:3)
mui<-rnorm(3,muG,sigma.between)
#Sample the three obersvations for each experiment (muij, i=1:3, j=1:3)
y1j<-rnorm(3,mui[1],sigma.within)
y2j<-rnorm(3,mui[2],sigma.within)
y3j<-rnorm(3,mui[3],sigma.within)
#Put results in data frame
d<-as.data.frame(cbind(
c(rep(1,3),rep(2,3),rep(3,3)),
c(y1j, y2j, y3j )
))
d[,1]<-as.factor(d[,1])
#Calculate means for each experiment
dmean<-aggregate(d[,2]~d[,1], data=d, FUN=mean)
#Add new confidence interval data to output
out[i,]<-t.test(dmean[,2])$conf.int[1:2]
}
#Calculate % of intervals that contained muG
cover<-matrix(nrow=nrow(out),ncol=1)
for(i in 1:nrow(out)){
cover[i]<-out[i,1]<muG & out[i,2]>muG
}
sub<-floor(seq(1,nrow(out),length=100))
plot(out[sub,1], ylim=c(min(out[sub,1]),max(out[sub,2])),
xlab="Simulation #", ylab="Value", xaxt="n",
main=c(paste("# of Sims=",num.sims),
paste("% CIs Including muG=",100*round(length(which(cover==T))/nrow(cover),3)))
)
axis(side=1, at=1:100, labels=sub)
points(out[sub,2])
cnt<-1
for(i in sub){
segments(cnt, out[i,1],cnt,out[i,2])
cnt<-cnt+1
}
abline(h=0, col="Red", lwd=3)
hist(out[,2]-out[,1], freq=F, xlab="Width of 95% CI",
main=c(paste("muG=", muG),
paste("Sigma Between=",sigma.between),
paste("Sigma Within=",sigma.within))
)
}