% Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0
% (4,551 bytes)
function pred = apply_tree_quick( forest, NaNNbre, xtest_rk, xtrain_rk, xtest_rk_normalized, xtrain_rk_normalized )
% APPLY_TREE_QUICK evaluates every tree of a forest in one vectorised pass
%
%	pred = apply_tree_quick( forest, NaNNbre, xtest_rk, xtrain_rk, ...
%	                         xtest_rk_normalized, xtrain_rk_normalized )
%	computes the contribution of each tree in forest for each of the N rows
%	of xtest_rk and returns their sum over trees as an N-by-1 vector.
%
%	Inputs:
%	    forest  - matrix with one tree per column. When forest has more
%	              than one column the last column is skipped (it is not
%	              treated as a tree); a single-column forest is evaluated
%	              as one tree. The rows of each column are:
%	               - 1-3: Variable indices for the three nodes
%	               - 4-5: Thresholds for nodes 1 and 2, multiplied by
%	                      NaNNbre of the node's variable before comparison
%	               - 6:   slope applied in the final node
%	               - 7-9: missing value parameters for nodes 1 to 3
%	               - 10:  architecture type (1,2,3,4), i.e. location of
%	                      the final node in the tree
%	               - 11:  intercept
%	    NaNNbre - number of non-NaN entries in each training column,
%	              indexed by variable index. Row or column vector.
%	    xtest_rk, xtrain_rk - data compared against the scaled thresholds
%	              in nodes 1 and 2 (presumably ranks - confirm with the
%	              caller). NaN marks a missing value.
%	    xtest_rk_normalized - values fed to the linear model of node 3.
%	    xtrain_rk_normalized - training values from which the surrogate
%	              for a missing node-3 value is picked by row index. This
%	              function does not sort them; each column is assumed to
%	              be already sorted (ascending - TODO confirm with caller).
%
%	Output:
%	    pred    - N-by-1 sum of the tree contributions. Observations that
%	              do not reach a tree's final node, and NaN contributions,
%	              count as 0.

%	$LastChangedBy: alistair $
%	$LastChangedDate: 2012-05-16 13:48:37 +0100 (Wed, 16 May 2012) $
%	$Revision: 10 $
%	Originally written on GLNXA64 by Alistair Johnson, 09-May-2012 16:26:13
%	Contact: alistairewj@gmail.com

Ntrees = size(forest,2);
if Ntrees > 1
    Ntrees = Ntrees-1; % do not include intercept in forest calcs
end

% NaNNbre is combined element-wise with 1-by-Ntrees rows of forest below.
% Force a row vector so a column input neither errors nor gets implicitly
% expanded into an Ntrees-by-Ntrees matrix.
NaNNbre = reshape(NaNNbre,1,[]);

%% APPLY TREE
N = size(xtest_rk,1);

%% First Node
% split value is the fraction in row 4 times the number of non-NaN values
i1 = forest(1,1:Ntrees);
i4 = forest(4,1:Ntrees);
i7 = forest(7,1:Ntrees);
i1_NaN = NaNNbre(i1);
obs1 = bsxfun(@gt,xtest_rk(:,i1),i4.*i1_NaN); % observations above the split

% Missing test values (NaN compares false above) are placed in the split
% when (non-NaN count * missingness parameter) is smaller than the number
% of training observations above the split.
obs1train = bsxfun(@gt, xtrain_rk(:,i1),i4.*i1_NaN);
obs1train = (i1_NaN.*i7) < sum(obs1train,1);
obs1(isnan(xtest_rk(:,i1)) & repmat(obs1train, N, 1)) = true;

% Architectures 1 and 2 take the opposite branch of node 1
i10 = repmat(forest(10,1:Ntrees) < 3,N,1);
obs1(i10) = ~obs1(i10);

%% Second Node
i2 = forest(2,1:Ntrees);
i5 = forest(5,1:Ntrees);
i8 = forest(8,1:Ntrees);
i2_NaN = NaNNbre(i2);
obs2 = bsxfun(@gt,xtest_rk(:,i2),i5.*i2_NaN);

% Same missing-value rule as for node 1
obs2train = bsxfun(@gt, xtrain_rk(:,i2),i5.*i2_NaN);
obs2train = (i2_NaN.*i8)<sum(obs2train,1);
obs2(isnan(xtest_rk(:,i2)) & repmat(obs2train,N,1)) = true;

% Even-numbered architectures take the opposite branch of node 2
i10 = repmat(mod(forest(10,1:Ntrees),2)==0,N,1);
obs2(i10) = ~obs2(i10);

%% Third node (final leaf)
obs1 = obs1 & obs2; % take the intersection
% obs1 now contains only the data to be evaluated by the tree
i3 = forest(3,1:Ntrees);

obs1nans = obs1 & isnan(xtest_rk(:,i3)); % reach the leaf, value missing
obsTot = obs1 & ~isnan(xtest_rk(:,i3)); % reach the leaf, value present

%=== Values of the leaf variable for every observation / tree pair
x_normalized = xtest_rk_normalized(:,i3);

%=== Find the value selected from the training data to be a surrogate
% for missing values
i6 = forest(6,1:Ntrees); % multiplicative factor (slope)
i9 = forest(9,1:Ntrees);
idxMV = ceil(i9.*(NaNNbre(i3)-1));
% The expression above is 0 when the row-9 parameter is 0 or when the
% variable has at most one non-NaN training value; 0 is not a valid row
% subscript for sub2ind, so fall back to the first row.
idxMV = max(idxMV,1);
idxMV = sub2ind([size(xtrain_rk_normalized,1),Ntrees],idxMV,1:Ntrees);
temp2 = xtrain_rk_normalized(:,i3);
temp2 = temp2(idxMV);
temp2 = i6 .* temp2;
temp2 = repmat(reshape(temp2,[1,Ntrees]),N,1);

pred = zeros(N,Ntrees);
i11 = repmat(forest(11,1:Ntrees),N,1); % intercept

% Missing leaf value: intercept + slope * surrogate training value
pred(obs1nans) = i11(obs1nans) + temp2(obs1nans);

% Observed leaf value: intercept + slope * normalized value
predTot = bsxfun(@times, i6, x_normalized);
pred(obsTot) = i11(obsTot) + predTot(obsTot);

pred(isnan(pred))=0; % NaN contributions are dropped from the sum
pred = sum(pred,2);
end