File: <base>/common/prtools/stumpc.m

%STUMPC Decision stump classifier
% 
%   W = STUMPC(A,CRIT,N)
%   W = A*STUMPC([],CRIT,N)
%   W = A*STUMPC(CRIT,N)
% 
% Computation of a decision stump classifier (a decision tree with a
% small, fixed number of nodes) from the dataset A, using the binary
% splitting criterion CRIT:
%   INFCRIT  -  information gain
%   MAXCRIT  -  purity (default)
%   FISHCRIT -  Fisher criterion
% Only N nodes (default N = 1) are computed.
% 
% SEE ALSO DATASETS, MAPPINGS, TREEC, TREE_MAP
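%
% EXAMPLE (a minimal sketch, not part of the original file; it assumes
% PRTools is in the path and uses the PRTools generator GENDATB):
%   A = gendatb([50 50]);        % two-class banana set
%   W = stumpc(A,'infcrit');     % train a one-node decision stump
%   D = A*W;                     % classify the training data
%   e = D*testc;                 % apparent classification error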

% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands

% $Id: stumpc.m,v 1.2 2009/07/10 11:19:20 duin Exp $

function w = stumpc(varargin)
  
	mapname = 'Decision Stump';
  argin = shiftargin(varargin,{'char','integer'});
  argin = shiftargin(argin,'integer',2);
  argin = setdefaults(argin,[],'maxcrit',1);
  
  if mapping_task(argin,'definition')
    w = define_mapping(argin,'untrained',mapname);
    
  elseif mapping_task(argin,'training')			% Train a mapping.
  
    [a,crit,n] = deal(argin{:});
    islabtype(a,'crisp');
    isvaldset(a,1,2); % at least 1 object per class, 2 classes

    % First get some useful parameters:
    [m,k,c] = getsize(a);
    nlab = getnlab(a);
    tree = maketree(+a,nlab,c,crit,n);

    % Store the results:
    w = prmapping('tree_map','trained',{tree,1},getlablist(a),k,c);
    w = setname(w,mapname);
    w = setcost(w,a);
    
  end

	return

%MAKETREE General tree building algorithm
% 
% 	tree = maketree(A,nlab,c,crit,n)
% 
% Constructs a binary decision tree for the set of objects A with
% numeric labels nlab, using the criterion function specified in the
% string crit ('maxcrit', 'fishcrit' or 'infcrit'). n is a counter for
% the number of splits that are still allowed below the present node.
% 
% Definition of the resulting tree:
% 
% 	tree(n,1) - feature number to be used in node n
% 	tree(n,2) - threshold t to be used
% 	tree(n,3) - node to be processed if value <= t
% 	tree(n,4) - node to be processed if value > t
% 	tree(n,5:4+c) - a posteriori probabilities for all classes in
% 			node n
% 
% If tree(n,3) == 0, node n is an end-node and tree(n,1) holds its class.
% 
% This is a low-level routine called by the training part of stumpc.
% 
% See also infstop, infcrit, maxcrit, fishcrit and mapt.
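%
% For illustration (the numbers are made up, not part of the original
% file): a single split on feature 3 at threshold 0.5 for a two-class
% problem could be encoded as
%
%   tree = [3  0.5  2  3  0.55 0.45    % node 1: split node
%           1  0    0  0  0.90 0.10    % node 2: end-node, class 1
%           2  0    0  0  0.08 0.92];  % node 3: end-node, class 2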

% Authors: Guido te Brake, TWI/SSOR, Delft University of Technology
%     R.P.W. Duin, TN/PH, Delft University of Technology
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function tree = maketree(a,nlab,c,crit,n) 
		[m,k] = size(a); 

	% Construct the tree:

	% When all objects have the same label, create an end-node:
	if all([nlab == nlab(1)]) 
		% Avoid giving 0-1 probabilities, but 'regularize' them a bit using
		% a 'uniform' Bayesian prior:
		p = ones(1,c)/(m+c); p(nlab(1)) = (m+1)/(m+c);
		tree = [nlab(1),0,0,0,p];
	else
		% now the tree is recursively constructed further:
		[f,j,t] = feval(crit,+a,nlab); % use desired split criterion
		
		p = sum(expandd(nlab),1);
		if length(p) < c, p = [p,zeros(1,c-length(p))]; end
		% When the stop criterion is not reached yet, we recursively split
		% further:
        
		if n >= 1
			% Make the left branch:
			J = find(a(:,j) <= t);
			tl = maketree(+a(J,:),nlab(J),c,crit,n-1);

			% Make the right branch:
			K = find(a(:,j) > t);
			tr = maketree(+a(K,:),nlab(K),c,crit,n-1);

			% Fix the node labelings before the branches can be 'glued'
			% together into one big tree:
			[t1,t2] = size(tl);
			tl = tl + [zeros(t1,2) tl(:,[3 4])>0 zeros(t1,c)];
			[t3,t4] = size(tr);
			tr = tr + (t1+1)*[zeros(t3,2) tr(:,[3 4])>0 zeros(t3,c)];

			% Make the complete tree: the split-node and the branches:
			tree = [[j,t,2,t1+2,(p+1)/(m+c)]; tl; tr];
		else
			% We reached the stop criterion, so make an end-node:
			[mt,cmax] = max(p);
			tree = [cmax,0,0,0,(p+1)/(m+c)];
		end
		
	end
	return

%MAXCRIT Purity criterion for the best feature split.
% 
% 	[f,j,t] = maxcrit(A,nlabels)
% 
% Computes the value of the maximum purity f over all features of the
% dataset A, given its numeric labels. j is the optimum feature and t
% its threshold. This is a low-level routine called when constructing
% decision trees.
% 
% [1] L. Breiman, J.H. Friedman, R.A. Olshen, and C.J. Stone, 
% Classification and regression trees, Wadsworth, California, 1984. 
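%
% Example (a minimal sketch, not part of the original file; assumes a
% labeled PRTools dataset A):
%   [f,j,t] = maxcrit(+A,getnlab(A));
%   % For every feature, candidate splits are placed at each class
%   % minimum and maximum; with R objects outside such an extreme the
%   % purity index is R*(m-R). j is the feature with the largest index
%   % and t lies halfway between the winning extreme and the nearest
%   % object on the other side of it.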

% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl 
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function [f,j,t] = maxcrit(a,nlab)
		[m,k] = size(a);
	c = max(nlab);
	% -variable T is a (2c) x k matrix containing:
	%      minimum feature values of class 1
	%      maximum feature values of class 1
	%      minimum feature values of class 2
	%      maximum feature values of class 2
	%            etc.
	% -variable R (same size) contains the corresponding counts:
	%      number of objects < the minimum of class 1
	%      number of objects > the maximum of class 1
	%      number of objects < the minimum of class 2
	%      number of objects > the maximum of class 2
	%            etc.
	% These values are collected and computed in the next loop:
	T = zeros(2*c,k); R = zeros(2*c,k);
	for j = 1:c
		L = (nlab == j);
		if sum(L) == 0
			T([2*j-1:2*j],:) = zeros(2,k);
			R([2*j-1:2*j],:) = zeros(2,k);
		else
			T(2*j-1,:) = min(a(L,:),[],1);
			R(2*j-1,:) = sum(a < ones(m,1)*T(2*j-1,:),1);
			T(2*j,:) = max(a(L,:),[],1);
			R(2*j,:) = sum(a > ones(m,1)*T(2*j,:),1);
		end
	end
	% From R the purity index for all features is computed:
	G = R .* (m-R);
	% and the best feature is found:
	[gmax,tmax] = max(G,[],1);
	[f,j] = max(gmax);
	Tmax = tmax(j);
	if Tmax ~= 2*floor(Tmax/2)
		t = (T(Tmax,j) + max(a(find(a(:,j) < T(Tmax,j)),j)))/2;
	else
		t = (T(Tmax,j) + min(a(find(a(:,j) > T(Tmax,j)),j)))/2;
	end
	return

%INFCRIT Information gain and its best feature split.
% 
% 	[f,j,t] = infcrit(A,nlabels)
% 
% Computes for every feature the information gain f at its best
% threshold, given the dataset A and its numeric labels. f=1 means
% perfect discrimination, f=0 a complete mixture. j is the optimum
% feature and t its threshold. This is a low-level routine called when
% constructing decision trees.
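%
% Example (a minimal sketch, not part of the original file; assumes a
% labeled PRTools dataset A):
%   [g,j,t] = infcrit(+A,getnlab(A));
%   % g is a row vector with one gain value per feature; g(j) close to 1
%   % means the classes are (almost) perfectly separated by thresholding
%   % feature j at t, g(j) close to 0 means they remain fully mixed.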

% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function [g,j,t] = infcrit(a,nlab)
		[m,k] = size(a);
	c = max(nlab);
	mininfo = ones(k,2);
	% determine feature domains of interest
	[sn,ln] = min(a,[],1); 
	[sx,lx] = max(a,[],1);
	JN = (nlab(:,ones(1,k)) == ones(m,1)*nlab(ln)') * realmax;
	JX = -(nlab(:,ones(1,k)) == ones(m,1)*nlab(lx)') * realmax;
	S = sort([sn; min(a+JN,[],1); max(a+JX,[],1); sx]);
	% S(2,:) to S(3,:) are interesting feature domains
	P = sort(a);
	Q = (P >= ones(m,1)*S(2,:)) & (P <= ones(m,1)*S(3,:));
	% these are the feature values in those domains
	for f=1:k,		% repeat for all features
		af = a(:,f);
		JQ = find(Q(:,f));
		SET = P(JQ,f)';
		if JQ(1) ~= 1
			SET = [P(JQ(1)-1,f), SET];
		end
		n = length(JQ);
		if JQ(n) ~= m
			SET = [SET, P(JQ(n)+1,f)];
		end
		n = length(SET) -1;
		T = (SET(1:n) + SET(2:n+1))/2; % all possible thresholds
		L = zeros(c,n); R = L;     % left and right node object counts per class
		for j = 1:c
			J = find(nlab==j); mj = length(J);
			if mj == 0
				L(j,:) = realmin*ones(1,n); R(j,:) = L(j,:);
			else
				L(j,:) = sum(repmat(af(J),1,n) <= repmat(T,mj,1)) + realmin;
				R(j,:) = sum(repmat(af(J),1,n) > repmat(T,mj,1)) + realmin;
			end
		end
		infomeas =  - (sum(L .* log10(L./(ones(c,1)*sum(L)))) ...
			       + sum(R .* log10(R./(ones(c,1)*sum(R))))) ...
		    ./ (log10(2)*(sum(L)+sum(R))); % criterion value for all thresholds
		[mininfo(f,1),j] = min(infomeas);     % finds the best
		mininfo(f,2) = T(j);     % and its threshold
	end   
	g = 1-mininfo(:,1)';
	[finfo,j] = min(mininfo(:,1));		% best over all features
	t = mininfo(j,2);			% and its threshold
	return

%FISHCRIT Fisher's Criterion and its best feature split 
% 
% 	[f,j,t] = fishcrit(A,nlabels)
% 
% Computes the value of the Fisher criterion f for all features of the
% dataset A with given numeric labels. Two classes only. j is the
% optimum feature and t its threshold. This is a low-level routine
% called when constructing decision trees.
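%
% Example (a minimal sketch, not part of the original file; assumes a
% two-class PRTools dataset A):
%   [f,j,t] = fishcrit(+A,getnlab(A));
%   % For feature i the criterion is
%   %   f(i) = (mean1(i)-mean2(i))^2 / (var1(i)+var2(i))
%   % and for the winning feature j the threshold t reduces to the
%   % midpoint (mean1(j)+mean2(j))/2 of the two class means.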

% Copyright R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function [f,j,t] = fishcrit(a,nlab)
		[m,k] = size(a);
	c = max(nlab);
	if c > 2
		error('Not more than 2 classes allowed for Fisher Criterion')
	end
	% Get the mean and variances of both the classes:
	J1 = find(nlab==1);
	J2 = find(nlab==2);
	u = (mean(a(J1,:),1) - mean(a(J2,:),1)).^2;
	s = std(a(J1,:),0,1).^2 + std(a(J2,:),0,1).^2 + realmin;
	% The Fisher ratio becomes:
	f = u ./ s;
	% Find then the best feature:
	[ff,j] = max(f);
	% Given the feature, compute the threshold:
	m1 = mean(a(J1,j),1);
	m2 = mean(a(J2,j),1);
	w1 = m1 - m2; w2 = (m1*m1-m2*m2)/2;
	if abs(w1) < eps
		% The class means are equal, so the Fisher criterion (should)
		% become 0. Set the threshold halfway between the maximum of
		% class 1 and the minimum of class 2:
		t = (max(a(J1,j),[],1) + min(a(J2,j),[],1)) / 2;
	else
		t = w2/w1;
	end
	return

%INFSTOP Quinlan's Chi-square test for early stopping
% 
% 	crt = infstop(A,nlabels,j,t)
% 
% Computes the Chi-square test described by Quinlan [1] to be used 
% in maketree for forward pruning (early stopping) using dataset A 
% and its numeric labels. j is the feature used for splitting and t 
% the threshold. 
%
% [1] J.R. Quinlan, Simplifying Decision Trees, 
% Int. J. Man - Machine Studies, vol. 27, 1987, pp. 221-234.
% 
% See maketree, treec, classt, prune 
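%
% Example (a minimal sketch, not part of the original file; assumes a
% labeled PRTools dataset A):
%   nlab = getnlab(A);
%   [f,j,t] = infcrit(+A,nlab);      % candidate split
%   crt = infstop(+A,nlab,j,t);      % Chi-square statistic of the split
%   % A small value of crt indicates that the class distributions left
%   % and right of the threshold hardly differ, so the split may be
%   % skipped (early stopping).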

% Guido te Brake, TWI/SSOR, TU Delft.
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function crt = infstop(a,nlab,j,t)
		[m,k] = size(a);
	c = max(nlab);
	aj = a(:,j);
	ELAB = expandd(nlab); 
	L = sum(ELAB(aj <= t,:),1) + 0.001;
	R = sum(ELAB(aj > t,:),1) + 0.001;
	LL = (L+R) * sum(L) / m;
	RR = (L+R) * sum(R) / m;
	crt = sum(((L-LL).^2)./LL + ((R-RR).^2)./RR);
	return

%PRUNEP Pessimistic pruning of a decision tree
% 
% 	tree = prunep(tree,a,nlab,num)
% 
% Call it with a tree and the training set a. num is the node at which
% pruning starts; if omitted, pruning starts at the root. Pessimistic
% pruning is defined by Quinlan.
% 
% See also maketree, treec, mapt 
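%
% Example (a minimal sketch, not part of the original file; assumes a
% labeled PRTools dataset A and the maketree routine above):
%   nlab = getnlab(A);
%   tree = maketree(+A,nlab,getsize(A,3),'infcrit',3);
%   tree = prunep(tree,+A,nlab);     % pessimistic pruning from the root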

% Guido te Brake, TWI/SSOR, TU Delft.
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function tree = prunep(tree,a,nlab,num)
		if nargin < 4, num = 1; end;
	[N,k] = size(a);
	c = size(tree,2)-4;
	if tree(num,3) == 0, return, end;
	w = prmapping('treec','trained',{tree,num},[1:c]',k,c);
	ttt=tree_map(prdataset(a,nlab),w);
	J = testc(ttt)*N;
	EA = J + nleaves(tree,num)./2;   % expected number of errors in tree
	P = sum(expandd(nlab,c),1);     % distribution of classes
	%disp([length(P) c])
	[pm,cm] = max(P);               % most frequent class
	E = N - pm;                     % errors if substituted by a leaf
	SD = sqrt((EA * (N - EA))/N);
	if (E + 0.5) < (EA + SD)        % clean tree while removing nodes
		[mt,kt] = size(tree);
		nodes = zeros(mt,1); nodes(num) = 1; n = 0;
		while sum(nodes) > n        % find all nodes to be removed
			n = sum(nodes);
			J = find(tree(:,3)>0 & nodes==1);
			nodes(tree(J,3)) = ones(length(J),1);
			nodes(tree(J,4)) = ones(length(J),1);
		end
		tree(num,:) = [cm 0 0 0 P/N];
		nodes(num) = 0; nc = cumsum(nodes);
		J = find(tree(:,3)>0);      % update internal references
		tree(J,[3 4]) = tree(J,[3 4]) - reshape(nc(tree(J,[3 4])),length(J),2);
		tree = tree(~nodes,:);      % remove obsolete nodes
	else
		K1 = find(a(:,tree(num,1)) <= tree(num,2));
		K2 = find(a(:,tree(num,1)) >  tree(num,2));
		tree = prunep(tree,a(K1,:),nlab(K1),tree(num,3));
		tree = prunep(tree,a(K2,:),nlab(K2),tree(num,4));
	end
	return

%PRUNET Prune tree by testset
% 
% 	tree = prunet(tree,a)
% 
% The test set a is used to prune a decision tree. 
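%
% Example (a minimal sketch, not part of the original file; assumes a
% trained tree and a labeled PRTools test dataset B):
%   tree = prunet(tree,B);   % subtrees that do not reduce the error on B
%                            % are replaced by end-nodes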

% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function tree = prunet(tree,a)
		[m,k] = size(a);
	[n,s] = size(tree);
	c = s-4;
	erre = zeros(1,n);
	deln = zeros(1,n);
	w = prmapping('treec','trained',{tree,1},[1:c]',k,c);
	[f,lab,nn] = tree_map(a,w);  % bug, this works only if a is dataset, labels ???
	[fmax,cmax] = max(tree(:,[5:4+c]),[],2);
	nngood = nn([1:n]'+(cmax-1)*n);
	errn = sum(nn,2) - nngood;% errors in each node
	sd = 1;
	while sd > 0
		erre = zeros(n,1);
		deln = zeros(1,n);
		endn = find(tree(:,3) == 0)';	% endnodes
		pendl = max(tree(:,3*ones(1,length(endn)))' == endn(ones(n,1),:)');
		pendr = max(tree(:,4*ones(1,length(endn)))' == endn(ones(n,1),:)');
		pend = find(pendl & pendr);		% parents of two endnodes
		erre(pend) = errn(tree(pend,3)) + errn(tree(pend,4));
		deln = pend(find(erre(pend) >= errn(pend))); % nodes to be turned into leaves
		sd = length(deln);
		if sd > 0
			tree(tree(deln,3),:) = -1*ones(sd,s);
			tree(tree(deln,4),:) = -1*ones(sd,s);
			tree(deln,[1,2,3,4]) = [cmax(deln),zeros(sd,3)];
		end
	end
	return

%NLEAVES Computes the number of leaves in a decision tree
% 
% 	number = nleaves(tree,num)
% 
% This procedure counts the number of leaves in the subtree of tree
% rooted at node num. If num is omitted, the root is taken (num = 1).
% 
% This is a utility used by prunep.
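%
% Example (a minimal sketch, not part of the original file; assumes a
% tree in the format described in maketree):
%   n  = nleaves(tree);      % number of leaves in the whole tree
%   n2 = nleaves(tree,2);    % number of leaves in the subtree at node 2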

% Guido te Brake, TWI/SSOR, TU Delft
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands

function number = nleaves(tree,num)
		if nargin < 2, num = 1; end
	if tree(num,3) == 0
		number = 1 ;
	else
		number = nleaves(tree,tree(num,3)) + nleaves(tree,tree(num,4));
	end
	return