Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0
(9,584 bytes)
function [ forests ] = NicForest_train_parallel(xtrain,ytrain,Ntrees,width, opt)
% [ forests , ytrain_pred ] = NicForest_train_parallel(xtrain,ytrain,NTrees,width, opt)
% Trains a forest using specified NTrees, Width, and MCMC parameters
% stored in opt, in parallel using parfor.
% $LastChangedBy: alistair $
% $LastChangedDate: 2012-05-16 13:48:37 +0100 (Wed, 16 May 2012) $
% $Revision: 10 $
% Originally written on GLNXA64 by Alistair Johnson, 09-May-2012 16:26:13
% Contact: alistairewj@gmail.com
%% Initialise parameters
[N,P]=size(xtrain);
if nargin>4
MCCite=opt.MCCite; % number of Iterations for each MCMC repetition
MCCsave=opt.MCCsave; % Save forest every MCCsave
MCCres=opt.MCCres; % Number of reset during MCCres iterations
MCCnbr = MCCite*MCCres;
% next two are chosen on therotical grounds based on the data
% Get refereence and try to implement automated parameters here
% width^2*Ntrees/2 ~ var(logit(Pi)) where Pi is the pred from reasonable model.
% Ntrees=500;
NtreesUpdate = opt.NtreesUpdate;
else
%=== defaults
MCCite=100000; % number of Iterations for each MCMC repetition
MCCsave=2e3; % Save forest every MCCsave
MCCres=2; % Number of reset during MCCres iterations
MCCnbr = MCCite*MCCres;
% next two are chosen on therotical grounds based on the data
% Get refereence and try to implement automated parameters here
% width^2*Ntrees/2 ~ var(logit(Pi)) where Pi is the pred from reasonable model.
% Ntrees=500;
NtreesUpdate = 2;
end
%=== Rank and replace training data
xtrain = tiedrankrelative(xtrain);
NaNNbre = sum( ~isnan(xtrain) , 1 ) + 1; %+1 is to symetrize the ratio number of observation/NaNNbre
%=== Determine method: regression or classification
num_tar = numel(unique(ytrain));
if num_tar==1
error('Only one class provided');
elseif num_tar==2
likelihoodFcn = 'll';
%=== set default intercept+sigma
intercept = log(1/(-1+1/mean(ytrain)));
sigma = 0;
else
likelihoodFcn = 'normll';
intercept = mean(ytrain);
sigma = sqrt(var(ytrain));
end
%% Normalize data
% squish data by assuming it is drawn from a normal distribution
%=== Scale ranks between 0->1
xtrain_normalized = bsxfun(@rdivide,xtrain,NaNNbre);
%=== Squish into normal distribution!
xtrain_normalized=norminv(xtrain_normalized,0,1);
%=== Initialize forest
%=== Create cell - each cell contains independent MCMC runs
forests_cell = cellfun(@(x) zeros(floor(MCCite/MCCsave*4/5), 11, Ntrees+1), cell(MCCres,1), 'UniformOutput',false);
priors_cell = cellfun(@(x) zeros(floor(MCCite/MCCsave*4/5), 2), cell(MCCres,1), 'UniformOutput',false);
% goes, number of forest
% 11 paramers in each tree
% trees+1 is number of trees in the forest + 1 is intercept
% (b0 in regression)
%% Monte carlo
count=0;
for r=1:MCCres
%=========================%
%=== RESET MONTE CARLO ===%
%=========================%
%=== Initialize forest
[forest adds sums] = initialize_forest( Ntrees , P , ytrain );
BurnInFlag = false;
%=== Initialize priors + add to adds/sums
prior = zeros(2,MCCite+1); % 2xMCCite - [intercept; sigma]
prior(1,1) = intercept; % initial sigma
prior(2,1) = sigma; % initial sigma
%=== for efficiency, generate all the random variables used to update
%priors before the MCMC loop
prior(2,2:end) = normrnd(0,width,1,size(prior,2)-1);
prior(2,2:end) = gamrnd(1000000,1/1000000,1,size(prior,2)-1);
% prior(2,2:end) = 0.02*rand(1,size(prior,2)-1) - 0.01; % uniform distribution
adds(:,Ntrees+1)=prior(1,1);
sums=sums+prior(1,1);
%=== Keep track of the variables used in vector
varUsed = zeros(3,P);
for k=1:3
varUsed(k,:) = hist(forest(k,:),1:P); % count number of variables used in forest
end
%=========================%
%=== BEGIN MONTE CARLO ===%
%=========================%
for j=1:MCCite % for each MCMC iteration
% Create the next forest candidate
cForest = forest;
prior(1,j+1)=prior(1,j)+prior(1,j+1); % randomize intercept
prior(2,j+1)=abs(prior(2,j)*prior(2,j+1)); % randomize sigma
v=zeros(N,NtreesUpdate);
tempsums=sums+prior(1,j+1)-prior(1,j); % current tree intercept contribution
UpdatedTreesNbr= randsample(Ntrees,NtreesUpdate,0); % randomly selected 2 trees
% each iteration considers 2 random trees to update
for i=1:NtreesUpdate
% Update tree in the forest
cForest(:,UpdatedTreesNbr(i)) = update_tree( forest(:,UpdatedTreesNbr(i)) ,.5 , width , P, varUsed) ;
% Apply tree
v(:,i) = apply_tree( cForest(:,UpdatedTreesNbr(i)) , xtrain , NaNNbre, xtrain_normalized );
% temp score matrix where to update only the two changed trees
% where adds is he old contribution (overwriten 2 trees)
tempsums=tempsums+v(:,i)-adds(:,UpdatedTreesNbr(i));
end
% Compute log likelyhood for current and previous forest
e1=feval(likelihoodFcn,sums,ytrain,prior(2,j)); % log likelihood previous forest
e2=feval(likelihoodFcn,tempsums,ytrain,prior(2,j+1)); % log likelihood new forest
% Metropolis accepting step
if(rand(1)<exp(e1-e2))
varRemove = forest(1:3,UpdatedTreesNbr);
varAdd = cForest(1:3,UpdatedTreesNbr);
for i=1:length(UpdatedTreesNbr)
% accept the new forests
forest(:,UpdatedTreesNbr(i)) = cForest(:,UpdatedTreesNbr(i));
adds(:,UpdatedTreesNbr(i))=v(:,i); %update adds
% update the variable count tracking
idxUpdate = sub2ind([3,P],[1;2;3],varRemove(:,i));
varUsed(idxUpdate) = varUsed(idxUpdate) - 1;
idxUpdate = sub2ind([3,P],[1;2;3],varAdd(:,i));
varUsed(idxUpdate) = varUsed(idxUpdate) + 1;
end
sums=tempsums; %update sums
else
%=== rejected, reset prior to previous value
prior(:,j+1) = prior(:,j);
end
% save forests every MCCsave
if(mod(j,MCCsave)==0)
if ~BurnInFlag && j>= (MCCite/5) % >= 20% into sampling
% if MCCsave = 2000, MCCres = 40, then never in burn-in period
BurnInFlag=true; % no longer in burn-in period.
end
if BurnInFlag
count=count+1;
forests_cell{r}(count,:,:)=forest;
priors_cell{r}(count,:)= prior(:,j+1);
end
fprintf('%2.0f%% complete.\n',(j+(r-1)*MCCite)/MCCnbr*100);
end
end
%=== Remove extra forests which occur when parameters MCCSave + MCCIte don't match
if count < size(forests_cell{r},1)
forests_cell{r} = forests_cell{r}(1:count,:,:);
end
end
%=== Create forests/priors
forests = cell2mat(forests_cell);
priors = cell2mat(priors_cell);
%=== Add intercept prior to end of forest matrix
forests = cat(3,forests,repmat(priors(:,1),1,11));
% if num_tar>2
% forests = cat(3,forests,repmat(priors(:,2),1,11)); % sigma
% end
end
function [p] = ll(pred,tar,sigma) % log likelihood
pred = invlogit(pred);
p = sum(-tar.*log(pred)-(1-tar).*log(1-pred));
end
function [p] = normll(pred,tar,sigma) % log likelihood
p = log(sigma)*numel(pred) + 0.5*sum((tar-pred).^2)/(sigma^2);
end
function [p] = invlogit(p) % inverse logit
p = 1./(1+exp(-p));
end
function [forest adds sums] = initialize_forest( Ntrees , Nvar , ytrain )
forest=zeros(11,Ntrees); %initialize matrix of trees (current forest)
N=length(ytrain);
for i=1:3
forest(i,1:Ntrees) = randsample(1:Nvar,Ntrees,1);
end
forest(7:9,:)=0.5;
adds=zeros(N,Ntrees+1);
sums=zeros(N,1);
end
function cT = update_tree(cT,p,width,varNum,varUsed)
% tree is a vector has the following parameters
% - 1-3: Variables indices for first three nodes
% - 4-5: Threshods for nodes 1 and 2
% - 6: slope
% - 7-9 : missing value param for nodes 1 to 3
% - 10 : architechture type (1,2,3,4), i.e. location of the final node in the tree
% - 11: intercept
% ramdomly create a entirely new tree or a similar to speed up
p2 = 0.5; % Probability of propagating a variable towards bottom of the tree
leakage = 0.1; % Probability of ignoring weights for variable selection
varTemp = sum(varUsed(1:2,:),1);
if(rand(1)<p)
%=== Randomize variables to split on
for k=1:2 % For the split variables, weight them using both splits
if rand(1) < leakage
cT(k)=randsample(varNum,1,1); % sample ignoring weights
else
cT(k)=randsample(varNum,1,1,varTemp); % resample variables using weights
end
end
%=== Randomize variable to regress on
% For the regression variable, which is weighted independetly
if rand(1) < leakage
cT(3)=randsample(varNum,1,1); % sample ignoring weights
else
cT(3)=randsample(varNum,1,1,varUsed(3,:)); % resample variables using weights
end
% if(rand(1)<p2); cT(2)=cT(1); end % encourage 2nd split to be first split
% if(rand(1)<p2); cT(3)=cT(2); end % encourage 3rd split to be 2nd split
cT([4:5 7:9])=rand(5,1);
cT([6 11])=randn(2,1)*width;
cT(10)=randsample(1:4,1);
else
% tree similar to previous
cT([4:5 7:9])=mod(cT([4:5 7:9])+(rand(5,1)-0.5)/2,1);
cT([6 11])=1/2*cT([6 11])+randn(2,1)*sqrt(1-(1/2)^2)*width;
end
end