Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0

File: <base>/sources/alistairewj_at_gmail.com/entry6/apply_tree.m (3,203 bytes)
function pred = apply_tree( tree , xtrain , NaNNbre, xtrain_normalized )
%   APPLY_TREE calculates a single tree's contribution to the prediction
%
%   pred = apply_tree( tree, xtrain, NaNNbre) calculates the contributions
%       from tree on data in xtrain. NaNNbre is the number of non-NaN
%       entries in each column of xtrain. The split thresholds are applied
%       as tree(4)*NaNNbre(...) and tree(5)*NaNNbre(...), i.e. as fractional
%       ranks, so xtrain is expected to hold ranks rather than raw values
%       (per the original author's notes). This form calls NORMINV.
%
%   pred = apply_tree( tree, xtrain, NaNNbre, xtrain_normalized) allows the
%       user to input xtrain_normalized, a matrix the same size as xtrain
%       holding the ranks already mapped to a normal variable with zero
%       mean and unit standard deviation. This avoids recomputing the
%       mapping on every call and will speed up the function.
%
%   Output:
%       pred - N-by-1 vector (N = number of rows of xtrain). Rows that do
%              not reach the final leaf receive 0; NaN predictions are also
%              replaced by 0.
%
% tree is a vector with the following parameters
% - 1-3 : variable (column) indices for the three nodes
% - 4-5 : thresholds for nodes 1 and 2 (fractional ranks)
% - 6   : slope
% - 7-9 : missing value parameters for nodes 1 to 3
% - 10  : architecture type (1,2,3,4), i.e. location of the final node in the tree
% - 11  : intercept
%
% Behaviour notes:
% - Missing values at nodes 1 and 2 are detected on the xtrain column
%   itself. Earlier revisions tested isnan() on the logical split vector,
%   which is never NaN, so tree(7) and tree(8) had no effect.
% - The index used to pick the synthetic value for observations missing
%   the third variable is clamped to the valid range, so tree(9) == 1 no
%   longer indexes past the end of the sorted values.

%	$LastChangedBy: alistair $
%	$LastChangedDate: 2012-05-16 13:48:37 +0100 (Wed, 16 May 2012) $
%	$Revision: 10 $
%	Originally written on GLNXA64 by Alistair Johnson, 09-May-2012 16:26:13
%	Contact: alistairewj@gmail.com

N = size(xtrain,1);
pred = zeros(N,1);

%% First node - split on tree(4), a fractional rank
% (e.g. ranks run 1-10 and tree(4) = 0.4, then the split is at rank 4)
x1 = xtrain(:,tree(1));
obs1 = x1 > tree(4)*NaNNbre(tree(1));   % NaN compares false, so missing rows start on the "not greater" side
% tree(7) decides whether missing values join the "greater than" side:
% they do when NaNNbre*tree(7) is below the number of rows already there.
if NaNNbre(tree(1))*tree(7) < sum(obs1)
    obs1(isnan(x1)) = true;
end
% obs1 now flags the rows on the right of the split; architectures 1 and 2
% follow the other branch instead.
if tree(10) < 3
    obs1 = ~obs1;
end

%% Second node - same logic with tree(2), tree(5) and tree(8)
x2 = xtrain(:,tree(2));
obs2 = x2 > tree(5)*NaNNbre(tree(2));
if NaNNbre(tree(2))*tree(8) < sum(obs2)
    obs2(isnan(x2)) = true;
end
% even architecture types follow the other branch of the second split
if mod(tree(10),2) == 0
    obs2 = ~obs2;
end

%% Third node (final leaf) - only use observations that satisfy both splits
inLeaf = obs1 & obs2;                    % take the intersection

% split the leaf into rows with / without a value for the third variable
x3missing = isnan(xtrain(:,tree(3)));
obs1nans  = inLeaf &  x3missing;         % in leaf, third variable missing
obsTot    = inLeaf & ~x3missing;         % in leaf, third variable observed
nTot      = sum(obsTot);

% take the ranks, assume a normal distribution and synthesize the data;
% the actual values are irrelevant.
% THIS STEP SHOULD BE DONE BEFORE THE MCMC LOOP (pass xtrain_normalized).
if nargin < 4
    xtrain_normalized = norminv(xtrain(obsTot,tree(3))/NaNNbre(tree(3)));
else
    xtrain_normalized = xtrain_normalized(obsTot,tree(3));
end

% %=== Center the normalization on the patients in the regression subset
% xtrain_normalized = xtrain_normalized - mean(xtrain_normalized);

if nTot > 0 % at least one non-missing value is needed to synthesize from
    % Missing-value score: intercept + slope * synthetic value, where the
    % synthetic value is the observed one at fractional position tree(9)
    % of the sorted leaf values.
    temp2 = tree(6)*sort(xtrain_normalized);
    idx = round(0.5 + tree(9)*nTot);
    idx = min(max(idx,1),nTot);          % keep the index inside 1..nTot
    pred(obs1nans) = tree(11) + temp2(idx);
end

% non-missing: linear model on the normalized third variable
pred(obsTot) = tree(11) + tree(6)*xtrain_normalized;
pred(isnan(pred)) = 0;