Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0

File: (4,551 bytes)
function pred = apply_tree_quick( forest, NaNNbre, xtest_rk, xtrain_rk, xtest_rk_normalized, xtrain_rk_normalized )
%APPLY_TREE_QUICK calculates the summed contribution of all trees in a forest
%
%   pred = apply_tree_quick( forest, NaNNbre, xtest_rk, xtrain_rk, ...
%       xtest_rk_normalized, xtrain_rk_normalized ) evaluates every tree
%   stored in forest on the rows of xtest_rk in one vectorised pass and
%   returns an N-by-1 vector (N = size(xtest_rk,1)) containing, for each
%   row, the sum of the individual tree contributions.
%
%   Inputs:
%   - forest: matrix with one tree per column (rows described below).
%       When it has more than one column, the last column is treated as
%       the intercept column and is NOT evaluated as a tree here.
%   - NaNNbre: number of non-NaN entries in each column of the training
%       data (row or column vector, indexed by feature).
%   - xtest_rk: data to be scored (compared against rank thresholds).
%       NaN marks a missing value.
%   - xtrain_rk: training data, used only to decide in which direction
%       missing values are sent at nodes 1 and 2.
%   - xtest_rk_normalized: values of xtest_rk mapped to a normal variable
%       with zero mean and unit standard deviation; used at the leaf.
%   - xtrain_rk_normalized: same mapping for the training data. Each
%       column is assumed to be already sorted in ascending order; it
%       supplies the surrogate value for missing leaf features.
%
%   Each column of forest (a tree) has the following parameters:
%   - 1-3: variable indices for the three nodes
%   - 4-5: thresholds for nodes 1 and 2 (fraction in [0,1], multiplied by
%          the number of non-NaN values of the node's variable)
%   - 6: slope
%   - 7-9: missing value parameters for nodes 1 to 3
%   - 10: architecture type (1,2,3,4), i.e. location of the final node in
%         the tree
%   - 11: intercept

%	$LastChangedBy: alistair $
%	$LastChangedDate: 2012-05-16 13:48:37 +0100 (Wed, 16 May 2012) $
%	$Revision: 10 $
%	Originally written on GLNXA64 by Alistair Johnson, 09-May-2012 16:26:13
%	Contact: alistairewj@gmail.com

%% Dimensions
Ntrees = size(forest,2);
if Ntrees > 1
    Ntrees = Ntrees-1; % do not include intercept column in forest calcs
end
cols = 1:Ntrees;
N = size(xtest_rk,1);

% Force a row vector: NaNNbre(idx) inherits the orientation of NaNNbre, and
% a column here would make the element-wise products with the 1-by-Ntrees
% forest rows fail (or silently expand to Ntrees-by-Ntrees).
NaNNbre = reshape(NaNNbre,1,[]);

%% First node
i1 = forest(1,cols);
n1 = NaNNbre(i1);
% split value [0,1] * number of non-NaN values gives a rank threshold
thr1 = forest(4,cols) .* n1;
obs1 = bsxfun(@gt, xtest_rk(:,i1), thr1); % NaN compares false

% Missing values follow the "greater than" side only for trees where
% (non-NaN count * missingness parameter) < size of that side in training.
sendMissing1 = (n1 .* forest(7,cols)) < sum(bsxfun(@gt, xtrain_rk(:,i1), thr1), 1);
obs1 = obs1 | bsxfun(@and, isnan(xtest_rk(:,i1)), sendMissing1);

% architectures 1 and 2 use the complementary side of node 1
obs1 = bsxfun(@xor, obs1, forest(10,cols) < 3);

%% Second node
i2 = forest(2,cols);
n2 = NaNNbre(i2);
thr2 = forest(5,cols) .* n2;
obs2 = bsxfun(@gt, xtest_rk(:,i2), thr2);

sendMissing2 = (n2 .* forest(8,cols)) < sum(bsxfun(@gt, xtrain_rk(:,i2), thr2), 1);
obs2 = obs2 | bsxfun(@and, isnan(xtest_rk(:,i2)), sendMissing2);

% even architectures (2 and 4) use the complementary side of node 2
obs2 = bsxfun(@xor, obs2, mod(forest(10,cols),2) == 0);

%% Third node (final leaf)
% only observations passing both nodes are evaluated by the tree
inLeaf = obs1 & obs2;

i3 = forest(3,cols);
leafMissing = isnan(xtest_rk(:,i3));
obs1nans = inLeaf & leafMissing;  % reach the leaf, leaf feature missing
obsTot   = inLeaf & ~leafMissing; % reach the leaf, leaf feature present

i6  = forest(6,cols);  % slope
i11 = forest(11,cols); % intercept

%=== Surrogate for missing leaf values: one training value per tree, picked
% by rank from the (assumed sorted) normalised training column.
% Clamp to 1 so that a missing-value parameter of 0, or a feature with a
% single non-NaN value, selects the lowest-ranked value instead of
% producing an invalid zero index.
idxMV = max(1, ceil(forest(9,cols) .* (NaNNbre(i3) - 1)));
% index straight into the training matrix: no Ntrain-by-Ntrees temporary
linIdx = sub2ind(size(xtrain_rk_normalized), idxMV, i3);
surrogate = i6 .* reshape(xtrain_rk_normalized(linIdx), 1, []);

%% Assemble predictions
pred = zeros(N,Ntrees);

predMissing = repmat(i11 + surrogate, N, 1);
pred(obs1nans) = predMissing(obs1nans);

predTot = bsxfun(@plus, i11, bsxfun(@times, i6, xtest_rk_normalized(:,i3)));
pred(obsTot) = predTot(obsTot);

pred(isnan(pred)) = 0; % a NaN contribution counts as no contribution
pred = sum(pred,2);

end