Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0

File: <base>/sources/alistairewj_at_gmail.com/entry6/stat_calc_struct.m (8,193 bytes)
function [ out, roc ] = stat_calc_struct( pred, target, lessStr )
%stat_calc_struct Unleashes many statistical tests to determine the efficacy
% of a given set of predictions, pred, on a set of dichotomous targets,
% target. Predictions should range between 0-1, and represent the
% probability estimate that the target is 1.
%   [ out ] = stat_calc_struct(pred,target) calculates the following
%   statistical tests and stores them in the corresponding structure fields
%   listed:
%
%       AUROC   - Area under the receiver operator (characteristic) curve
%       HL      - Hosmer-Lemeshow Statistic
%       HLD     - Hosmer-Lemeshow Statistic normalized to variable range
%       acc     - Mean accuracy of the prediction using a threshold of 0.5
%       R       - Shapiro's R, the geometric mean of positive outcomes
%       B       - Brier's score, the mean square error
%       SMR     - Standardized mortality ratio. Mean observed outcomes
%                   divided by mean predicted outcomes.
%       coxA    - Alpha coefficient in cox calibration.
%       coxB    - Beta coefficient in cox calibration.
%                   Cox calibration involves a linear regression of the
%                   predictions onto the targets. coxA is the constant
%                   offset, while coxB is the slope.
%       TP      - True positives.
%       FP      - False positives.
%       TN      - True negatives.
%       FN      - False negatives.
%       sens    - Sensitivity. Calculated as: TP/(TP+FN)
%       spec    - Specificity. Calculated as: TN/(TN+FP)
%       ppv     - Positive predictive value. Calculated as: TP/(TP+FP)
%       npv     - Negative predictive value. Calculated as: TN/(TN+FN)
%       ll      - Log likelihood
%
%   [ out,roc ] = stat_calc_struct(pred,target) additionally calculates the
%       values pairs which form the ROC curve.
%           roc.x - Stores 1-specificity, plotted on the x-axis
%           roc.y - Stores Sensitivity, plotted on the y-axis

%	Copyright 2011 Alistair Johnson

%	$LastChangedBy: alistair $
%	$LastChangedDate: 2012-06-01 17:44:43 +0100 (Fri, 01 Jun 2012) $
%	$Revision: 25 $
%	Originally written on GLNXA64 by Alistair Johnson
%	Contact: alistairewj@gmail.com

out=[]; roc=[];
%=== Save computation time by skipping cox
if nargin>2 && strcmp(lessStr,'less')
    lessFlag = true;
else
    %=== Default, calculate cox calibration curves
    lessFlag = false;
end

%=== Add in some error checks
if isempty(pred)
    out=defaultStruct();
    return;
elseif sum(isnan(pred))==length(pred)
    % Pred is all NaNs
    out = defaultStruct();
    return;
end

if max(pred>1) || max(pred<0)
    warning('stat_calc_struct:IllConditioned',...
        ['\nThis function will return potentially erroneous values \n' ...
        'when predictions are not in the range [0,1].\n']);
end

% c-stat
out.AUROC=cstat(pred,target);

% Shapiro's R (geometric mean of positive targetcomes)
% transform into naural logarithm, take arithmetic mean, transform back
out.R=exp(sum(log(pred(target==1)))/sum(target));

% Brier's Score, B (mean square error)
out.B=mean((target-pred).^2);

% Accuracy
out.acc=1-(sum(abs(round(target-pred)),1)/length(target));

% hosmer-lemeshow
out.HL=lemeshow(pred,target);
out.HLmine=hlCStat(pred,target);
out.HLD=lemeshowNormalized(pred,target);

% SMR
out.SMR=sum(target)/sum(pred);


idx1 = target==1;
idx0 = target==0;
out.TP=sum(pred(idx1) >= 0.5);
out.FN=sum(pred(idx1) < 0.5);
out.FP=sum(pred(idx0) >= 0.5);
out.TN=sum(pred(idx0) < 0.5);

out.sens=out.TP/(out.TP+out.FN);
out.spec=out.TN/(out.TN+out.FP);
out.ppv=out.TP/(out.TP+out.FP);
out.npv=out.TN/(out.TN+out.FN);

out.ll = sum(-target.*log(pred)-(1-target).*log(1-pred));

% cox linear regression testing
if ~lessFlag
    b=glmfit(pred,target,'binomial','link','identity');
    out.coxA=b(1); out.coxB=b(2);
end

if nargout>1 
    if any(idx1) && any(idx0)
        [roc.x,roc.y]=perfcurve(target,pred,1);
    else
        roc.x=[]; roc.y=[];
    end
end

end

function [ H ] = lemeshow( pred, outcome )

data = [outcome,pred];
D=10; % Use D-iles (i.e., deciles as in Lemeshow's original paper)
N=length(data(:,1));
M=floor(N/D); % M should always be 400 for the challenge
data=sortrows(data,2); % rearrange the matrix by estimated risk
CM=zeros(D,3)+NaN;
centroid=zeros(D-1,1)+NaN;
for i=1:D
    %Set number of deaths proportional to Y true value
    ind2=i*M;      % highest bin in decile D
    ind1=ind2-M+1; % lowest bin
    Obs=sum([data(ind1:ind2,1)]); % number of observed deaths within decile D
    centroid(i)=mean(data(ind1:ind2,2)); % mean estimated risk within decile
    Exp=M*centroid(i); % expected number of deaths within decile
    Htemp=(Obs-Exp)^2/(M*centroid(i)*(1-centroid(i)) + 0.001);
    CM(i,:)=[Obs Exp Htemp];
end

% Hosmer-Lemeshow H statistic, normalized by range of decile risk
H=sum(CM(:,3));

end


function [ H ] = lemeshowNormalized( pred, outcome )

data = [outcome,pred];
D=10; % Use D-iles (i.e., deciles as in Lemeshow's original paper)
N=length(data(:,1));
M=floor(N/D); % M should always be 400 for the challenge
data=sortrows(data,2); % rearrange the matrix by estimated risk
CM=zeros(D,3)+NaN;
centroid=zeros(D-1,1)+NaN;
for i=1:D
    %Set number of deaths proportional to Y true value
    ind2=i*M;      % highest bin in decile D
    ind1=ind2-M+1; % lowest bin
    Obs=sum([data(ind1:ind2,1)]); % number of observed deaths within decile D
    centroid(i)=mean(data(ind1:ind2,2)); % mean estimated risk within decile
    Exp=M*centroid(i); % expected number of deaths within decile
    Htemp=(Obs-Exp)^2/(M*centroid(i)*(1-centroid(i)) + 0.001);
    CM(i,:)=[Obs Exp Htemp];
end

% Hosmer-Lemeshow H statistic, normalized by range of decile risk
H=sum(CM(:,3))/(centroid(10)-centroid(1));

end

function [ G ] = hlCStat( pred, outcome )
%hltest hosmer lemeshow test
%   [ G ] = hltest(pred, outcome) calculates the hosmer-lemeshow C statistic
%   for a given set of predictions and outcomes. Steps include:
%       1) Sort data from lowest pred to highest
%       2) Split the data into 10 bins with equal size
%       3) Calculate the mean square error for each bin
%       4) Divide the each bin by the number of samples in each bin
%       5) Divide each bin by the mean prediction in that bin
%       6) Divide each bin by (1-mean prediction) in that bin
%       7) Sum across all bins
%
bins=30;

% sort predictions from lowest to highest
[pred,ind]=sort(pred,1,'ascend');
outcome=outcome(ind);
G=zeros(10,1);

bin = floor(1+length(pred)*(0:bins)/bins);
for q = 1:bins
    int = bin(q):(bin(q+1)-1); %bin indexes
    N = length(int); % number of patients in bin
    E=sum((pred(int))); % expected
    O=sum(round(outcome(int))); % observed
    Eprob = mean(pred(int)) ; % expected probability
    G(q) = (E-O)^2 / (Eprob*N*(1-Eprob));
end
G = sum(G);
end

function [ c ] = cstat( pred, target )
%cstat Calculates the c-statistic
%   [ c ] = cstat (pred, out)
%       pred        - vector containing the predicted targets
%       target      - vector containing the true targets
%
%       c is equivalent to the AUROC, a measure of model discrimination
%           Mathematically: Pr(pred|out==1 > pred|out==0)

%=== Arrange predictions
[pred,idx] = sort(pred,1,'ascend');
target=target(idx);
[N,P] = size(pred);

%=== Find location of negative targets
c=zeros(1,P); % 1xP where P is # of AUROCs to calculate
negative=false(N,P);

for n=1:P
    negative(:,n) = target(:,n)==0;
    
    %=== Count the number of negative targets below each element
    negativeCS = cumsum(negative(:,n),1);
    
    %=== Only get positive targets
    pos = negativeCS(~negative(:,n));
    
    c(n) = sum(pos);
end
count=sum(negative,1); %=== count number who are negative
count = count .* (N-count); %=== multiply by positives
c=c./count;

end

function [defStruct] = defaultStruct()
    defStruct = struct('AUROC',[],...
        'HL',[],...
        'acc',[],...
        'R',[],...
        'B',[],...
        'SMR',[],...
        'coxA',[],...
        'coxB',[],...
        'TP',[],...
        'FP',[],...
        'TN',[],...
        'FN',[],...
        'sens',[],...
        'spec',[],...
        'ppv',[],...
        'npv',[]);
end