Minggu, 15 September 2013

Matlab Code of PCA-Part for K-Means initialization

function idx=PCAPartKmeans(data,k,method)
% input:
% k= number of clusters
% method:
% 1=HHQL (Householder-QL), is modified function taken from myEig in LAPACK (http://eriesadewo.blogspot.com/2014/05/matlab-code-of-householder-ql-method.html)
% 2=simplified truncated power method (http://eriesadewo.blogspot.com/2013/08/simplified-code-of-truncated-power.html),
% output:
% data indexed on cluster number
% Source:
% Su, T., Dy, J., 2004. A Deterministic Method for Initializing K-means Clustering. In: Proc. 16th IEEE International Conference on Tools with Tools with Artificial Intelligence (ICTAI2004), pp.784-786.

% Last Modified: 1 Februari 2014

% Erie Sadewo & A. Syahrul Choir
% erie@bps.go.id

% The user of this function is requested to cite:
% Sadewo, E., Mashuri, M., Barakbah, A.R., 2014. Pengaruh penggunaan metode power dan truncated power pada PCA-Part untuk inisialisasi K-Means. Prosiding Seminar Nasional Sains & Teknologi V, pp. 1326-1335.

centroid=PCAInitCentroid(data,k,method);
idx=kmeans(data,k,'Start',centroid);

function centroid=PCAInitCentroid(data,k,method)
newdata=data;
imaxSSE=1;
SSE=1;
for i=1:(k-1)
clust{imaxSSE}=[];
SSE(imaxSSE)=[];
[C1 C2 SSE1 SSE2]=pca_part(newdata,k,method);
clust{imaxSSE}=C1;
clust{i+1}=C2;
SSE(imaxSSE)=SSE1;
SSE(i+1)=SSE2;
imaxSSE=find(SSE==max(SSE));
newdata=clust{imaxSSE};
end

for i=1:k
m=size(clust{i},1);
if m==1;
centroid{i}=clust{i};
else
centroid{i}=mean(clust{i});
end
end
centroid=cell2mat((centroid)');

function [C1 C2 SSE1 SSE2]=pca_part(data,k,method)
S=cov(data);
if method==1
[V ~]=HHQL(S,k);
elseif method==2
[V ~]=tpower(S);
end
[C1 C2 SSE1 SSE2]=binary_part(data,method,V);

function [C1 C2 SSE1 SSE2]=binary_part(data,method,V)
mu=mean(data);
PC=data*V;
PCmu=mu*V;

if method==1
ind1=find(PC>PCmu);
ind2=find(PCPCmu);
end
ind3=find(PC==PCmu);

C1=data(ind1,:);
C2=data(ind2,:);
if size(C1,1)==1;
mu1=C1;
else
mu1=mean(C1);
end
if size(C2,1)==1;
mu2=C2;
else
mu2=mean(C2);
end
PCmu1=mu1*V;
PCmu2=mu2*V;

if isempty(ind3)==0;
jarak1= sqrt((PCmu1-PCmu).^2);
jarak2= sqrt((PCmu2-PCmu).^2);
if jarak1 C1=[C1;data(ind3,:)];
ind1=[ind1 ind3];
C1=data(ind1,:);
mu1=mean(C1);
else
C2=[C2;data(ind3,:)];
ind2=[ind2 ind3];
C2=data(ind2,:);
mu2=mean(C2);
end
end

SSE1= sum(sqrt(sum((C1-repmat(mu1,(size(C1,1)),1)).^2,2)).^2);
SSE2= sum(sqrt(sum((C2-repmat(mu2,(size(C2,1)),1)).^2,2)).^2);