# Reading Data
rawdata <- read.table("wiki.txt", sep=";", skip=1)

# keep the questionnaire items used in the analysis and drop incomplete respondents
helpme <- data.matrix(rawdata[, c(11:13, 19:21, 27:42)]) - 1
helpme[helpme == 0] <- NaN
begdata <- helpme[complete.cases(helpme), ]

# item blocks of the questionnaire
quality  <- begdata[, c(1:6)]
image    <- begdata[, c(7:9, 21, 22)]
perschar <- begdata[, c(10:12, 18:20)]
behavior <- begdata[, c(13:17)]
percuse  <- begdata[, 1:3]
qual     <- begdata[, 4:6]
socim    <- begdata[, 7:9]
jobrel   <- begdata[, c(21, 22)]
shaatt   <- begdata[, 10:12]
prof     <- begdata[, 18:20]

# one aggregated (rounded-up average) score per construct
data <- cbind(matrix(ceiling(rowSums(percuse)/3), ncol=1),
              matrix(ceiling(rowSums(qual)/3), ncol=1),
              matrix(ceiling(rowSums(socim)/3), ncol=1),
              matrix(ceiling(rowSums(jobrel)/2), ncol=1),
              matrix(ceiling(rowSums(shaatt)/3), ncol=1),
              matrix(ceiling(rowSums(prof)/3), ncol=1),
              matrix(ceiling(rowSums(behavior)/5), ncol=1))
Q  <- ncol(data)   # number of aggregated items
TP <- nrow(data)   # number of respondents

# Mixture Model, Collapsed Gibbs Sampler
K     <- 4       # number of latent classes
alpha <- 1       # symmetric Dirichlet prior on the class proportions
eta   <- 1       # symmetric Dirichlet prior on the item-response probabilities
s     <- 5       # number of answer categories per item
NS    <- 500     # maximum number of Gibbs sweeps
tol   <- 1e-6    # convergence tolerance on the joint log-probability (assumed value)

z         <- matrix(sample(1:K, TP, replace=TRUE), ncol=1)   # initial class assignments
probz     <- matrix(0, nrow=TP, ncol=K)
jointprob <- -Inf
maxjointprob <- jointprob
zstar     <- z
probzstar <- probz
convergence <- 0
ni <- 1

# constant part of the collapsed joint log-probability p(data, z)
const <- lgamma(K*alpha) - K*lgamma(alpha) - lgamma(K*alpha + TP) +
         K*Q*lgamma(s*eta) - K*Q*s*lgamma(eta)

# Gibbs sweeps; the single-site step below is the standard collapsed update for
# the Dirichlet-categorical mixture (the body of the sweep is a reconstruction)
while (ni < NS & convergence == 0){
  for (i in 1:TP){
    zminus <- z[-i]
    for (k in 1:K){
      nk <- sum(zminus == k)                  # class size without respondent i
      lp <- log(nk + alpha)
      for (q in 1:Q){
        nkq <- sum(data[-i, ][zminus == k, q] == data[i, q])
        lp  <- lp + log(nkq + eta) - log(nk + s*eta)
      }
      probz[i, k] <- lp
    }
    probz[i, ] <- exp(probz[i, ] - max(probz[i, ]))
    probz[i, ] <- probz[i, ]/sum(probz[i, ])
    z[i] <- sample(1:K, 1, prob=probz[i, ])
  }
  # collapsed joint log-probability of the current assignment
  logprob <- const
  for (k in 1:K){
    nk <- sum(z == k)
    logprob <- logprob + lgamma(nk + alpha)
    for (q in 1:Q){
      logprob <- logprob - lgamma(nk + s*eta)
      for (v in 1:s){
        logprob <- logprob + lgamma(sum(data[z == k, q] == v) + eta)
      }
    }
  }
  if (logprob > maxjointprob){
    maxjointprob <- logprob
    zstar        <- z
    probzstar    <- probz
  }
  jointprob <- c(jointprob, logprob)
  ni <- ni + 1
  if (abs(jointprob[length(jointprob)] - jointprob[length(jointprob)-1]) < tol){
    convergence <- 1
  }
}

# Cross Validation over the number of classes K
# (the fold construction and the candidate values of K below are assumptions:
#  a random 5-fold split and K = 2,...,6)
Kvalues <- 2:6
nfold   <- 5
combcrvl <- matrix(sample(1:TP, nfold*floor(TP/nfold)), ncol=nfold)   # test indices, one fold per column

jointprobtraining <- matrix(0, nrow=ncol(combcrvl), ncol=length(Kvalues))
jointprobtesting  <- matrix(0, nrow=ncol(combcrvl), ncol=length(Kvalues))
averagejointprobtraining <- matrix(0, nrow=1, ncol=length(Kvalues))
averagejointprobtesting  <- matrix(0, nrow=1, ncol=length(Kvalues))
sdjointprobtraining <- matrix(0, nrow=1, ncol=length(Kvalues))
sdjointprobtesting  <- matrix(0, nrow=1, ncol=length(Kvalues))
maxtest <- -Inf

for (pop in 1:length(Kvalues)){
  K <- Kvalues[pop]
  distbeta  <- array(0, dim=c(s, Q, K, ncol(combcrvl)))
  disttheta <- matrix(0, nrow=K, ncol=ncol(combcrvl))

  for (fold in 1:ncol(combcrvl)){
    testdata  <- data[combcrvl[, fold], ]
    traindata <- data[-combcrvl[, fold], ]
    NT <- nrow(traindata)

    # rerun the collapsed Gibbs sampler of the previous section on the training fold
    z         <- matrix(sample(1:K, NT, replace=TRUE), ncol=1)
    probz     <- matrix(0, nrow=NT, ncol=K)
    jointprob <- -Inf
    maxjointprob <- jointprob
    zstar     <- z
    probzstar <- probz
    convergence <- 0
    ni <- 1
    const <- lgamma(K*alpha) - K*lgamma(alpha) - lgamma(K*alpha + NT) +
             K*Q*lgamma(s*eta) - K*Q*s*lgamma(eta)
    while (ni < NS & convergence == 0){
      for (i in 1:NT){
        zminus <- z[-i]
        for (k in 1:K){
          nk <- sum(zminus == k)
          lp <- log(nk + alpha)
          for (q in 1:Q){
            nkq <- sum(traindata[-i, ][zminus == k, q] == traindata[i, q])
            lp  <- lp + log(nkq + eta) - log(nk + s*eta)
          }
          probz[i, k] <- lp
        }
        probz[i, ] <- exp(probz[i, ] - max(probz[i, ]))
        probz[i, ] <- probz[i, ]/sum(probz[i, ])
        z[i] <- sample(1:K, 1, prob=probz[i, ])
      }
      logprob <- const
      for (k in 1:K){
        nk <- sum(z == k)
        logprob <- logprob + lgamma(nk + alpha)
        for (q in 1:Q){
          logprob <- logprob - lgamma(nk + s*eta)
          for (v in 1:s){
            logprob <- logprob + lgamma(sum(traindata[z == k, q] == v) + eta)
          }
        }
      }
      if (logprob > maxjointprob){
        maxjointprob <- logprob
        zstar        <- z
        probzstar    <- probz
      }
      jointprob <- c(jointprob, logprob)
      ni <- ni + 1
      if (abs(jointprob[length(jointprob)] - jointprob[length(jointprob)-1]) < tol){
        convergence <- 1
      }
    }

    # parameter estimates from the best training assignment zstar
    # (posterior means under the Dirichlet priors; the theta formula and the
    #  membership matrix help1 are assumed reconstructions)
    theta <- matrix(0, nrow=K, ncol=1)
    beta  <- array(0, dim=c(s, Q, K))
    help1 <- matrix(rep(zstar, times=K), ncol=K) ==
             matrix(rep(1:K, times=NT), ncol=K, byrow=TRUE)   # class membership indicators
    for (i in 1:K){
      classi   <- sum(help1[, i])
      theta[i] <- (classi + alpha)/(NT + K*alpha)
      if (classi > 0){
        for (j in 1:Q){
          beta[, j, i] <- t((colSums(matrix(rep(traindata[help1[, i], j], times=s), ncol=s) ==
                                     matrix(rep(1:s, times=classi), ncol=s, byrow=TRUE)) + eta)/
                            (classi + s*eta))
        }
      } else {
        beta[, , i] <- 1/s   # empty class: uniform response probabilities
      }
    }

    # held-out log-likelihood on the test fold
    testjointprob <- 0
    for (i in 1:nrow(testdata)){
      personprob <- 0
      for (j in 1:K){
        classprob <- theta[j]
        for (q in 1:Q){
          classprob <- classprob*beta[testdata[i, q], q, j]
        }
        personprob <- personprob + classprob
      }
      testjointprob <- testjointprob + log(personprob)
    }

    distbeta[, , , fold] <- beta
    disttheta[, fold]    <- theta
    jointprobtraining[fold, pop] <- maxjointprob
    jointprobtesting[fold, pop]  <- testjointprob
  }

  averagejointprobtraining[1, pop] <- sum(jointprobtraining[, pop])/ncol(combcrvl)
  averagejointprobtesting[1, pop]  <- sum(jointprobtesting[, pop])/ncol(combcrvl)
  sdjointprobtraining[1, pop] <- sqrt(sum((jointprobtraining[, pop] - averagejointprobtraining[1, pop])^2)/ncol(combcrvl))
  sdjointprobtesting[1, pop]  <- sqrt(sum((jointprobtesting[, pop] - averagejointprobtesting[1, pop])^2)/ncol(combcrvl))

  # keep the parameters of the K with the best average held-out log-likelihood
  if (averagejointprobtesting[1, pop] > maxtest){
    maxtest  <- averagejointprobtesting[1, pop]
    betaopt  <- array(0, dim=c(s, Q, K))
    thetaopt <- matrix(0, nrow=K, ncol=1)
    for (a in 1:ncol(combcrvl)){
      betaopt <- betaopt + distbeta[, , , a]
    }
    betaopt  <- betaopt/ncol(combcrvl)
    thetaopt <- rowMeans(disttheta)
    Kopt     <- K
  }
}

# Spectral Clustering
A <- matrix(0, nrow=TP, ncol=TP)
D <- matrix(0, nrow=TP, ncol=TP)
sigma2 <- 0.06
K <- 4
for (i in 1:TP){
  # similarity of every respondent to respondent i (Gaussian kernel on L1 distances)
  d <- rowSums(abs(data - matrix(rep(data[i, ], times=TP), ncol=Q, nrow=TP, byrow=TRUE)))
  A[, i] <- exp(-d/(2*sigma2))
  A[i, i] <- 0
}
D <- diag(rowSums(A))
Dinverse <- solve(D)
Dinverse.eig  <- eigen(Dinverse)
Dinverse.sqrt <- Dinverse.eig$vectors %*% diag(sqrt(Dinverse.eig$values)) %*% solve(Dinverse.eig$vectors)
L <- Dinverse.sqrt %*% A %*% Dinverse.sqrt       # normalized affinity matrix D^(-1/2) A D^(-1/2)
L.eig <- eigen(L)
X <- L.eig$vectors[, 1:K]                        # leading K eigenvectors
Y <- X / matrix(rep(sqrt(rowSums(X*X)), times=K), ncol=K, nrow=TP)   # row-normalized eigenvectors
Ycluster <- kmeans(Y, K, iter.max=100, nstart=100, algorithm="Hartigan-Wong", trace=FALSE)

# class membership indicators and per-class response profiles
help1 <- matrix(rep(1:K, times=TP), ncol=K, byrow=TRUE)
help2 <- matrix(rep(Ycluster$cluster, times=K), ncol=K)
help  <- (help2 == help1)
classresponse <- matrix(0, ncol=Q, nrow=2*K)     # rows 1..K: means, rows K+1..2K: standard deviations
for (i in 1:K){
  response <- data[help[, i], ]
  classresponse[i, ]   <- colSums(response)/nrow(response)
  average <- matrix(rep(classresponse[i, ], times=nrow(response)), ncol=Q, byrow=TRUE)
  classresponse[i+K, ] <- sqrt(colSums((response - average)*(response - average))/nrow(response))
}
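# The cross-validated estimates can also be used to read off a soft class
# membership for every respondent. The block below is a minimal sketch, not
# part of the original script: it only assumes betaopt, thetaopt and Kopt from
# the cross-validation step above; postclass is an illustrative name.
postclass <- matrix(0, nrow=TP, ncol=Kopt)
for (i in 1:TP){
  for (j in 1:Kopt){
    classprob <- thetaopt[j]
    for (q in 1:Q){
      classprob <- classprob*betaopt[data[i, q], q, j]
    }
    postclass[i, j] <- classprob
  }
  postclass[i, ] <- postclass[i, ]/sum(postclass[i, ])   # posterior p(class | responses)
}
table(apply(postclass, 1, which.max))                    # class sizes under the MAP assignment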