Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0
(22,275 bytes)
function [ data ] = pnPreprocess(data)
%PNPREPROCESS Output data after preprocessing
%   [ data ] = pnPreprocess(data) walks over every field listed by
%   pnDataDescriptions, extracts it with pnExtractField, applies the
%   field-specific clean-up rules below (unit-conversion repairs, replacement
%   of sentinel values, removal of implausible values), and writes the
%   result back into DATA, which is returned.
%
%   A progress report is printed to the command window for every field.
%
%   Primary variables:
%       data      - Cell array data, rows = # of patients
%       dataDesc  - Cell array of data fields/descriptions
%       dataFixed - Cell array of demographic data fields/descriptions
%       tmp       - Data for a single parameter, e.g. 'HR'
%                   (column 4 is the column holding the values that the
%                   rules below inspect and rewrite)
%       idx       - Second output of pnExtractField; passed through
%                   unchanged to the removal/replace helpers
%       idxRem    - Index for data to be removed at end of the loop
%       idxManip  - Index for data to be manipulated in the switch block
%
%   Helper functions defined elsewhere in the project:
%       pnDataDescriptions, pnExtractField, pnPreprocessRemovalIndices,
%       pnPreprocessReplaceData, pnPreprocessDeleteData

%	Copyright 2012 Alistair Johnson

%	$LastChangedBy$
%	$LastChangedDate$
%	$Revision$
%	Originally written on PCWIN64 by Alistair Johnson, 25-Apr-2012 01:37:49
%	Contact: alistairewj@gmail.com

[dataDesc,dataFixed] = pnDataDescriptions();

% Element-wise OR of two cell arrays of removal masks
orMasks = @(a,b) cellfun(@(x,y) x | y, a, b, 'UniformOutput', false);

%=== Loop through the fixed, demographic fields
for k=1:size(dataFixed,1)
    fn = dataFixed{k,1};
    fprintf('\n%%=== %s ===%%\n', fn);
    [tmp,idx] = pnExtractField(data,fn);

    %=== Reset delete indices
    idxRem = [];

    switch fn
        case 'RecordID'
            % Identifier only: nothing to clean and nothing to write back
            continue;

        case 'Age'
            % x*0+105 (rather than a plain 105) keeps the shape of the
            % selected values
            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x>100, @(x) x*0+105);
            fprintf('Replaced %2.0f values of 200 with 105.\n',N);

            %=== Analyze residuals of data, check for bias
            % idxManip = cellfun(@(x) numel(x), tmp(:,4),'UniformOutput',false);
            % tmpDataAnalyze = cellfun(@(x,y) x(1:y-1) - x(y), tmp(:,4), idxManip, 'UniformOutput',false);
            % tmpDataAnalyze = cell2mat(tmpDataAnalyze(cellfun(@(x) ~isempty(x), tmpDataAnalyze)));
            % hist(tmpDataAnalyze,-10:1:10); xlabel('Age (1:end-1) - Age(end)');

            %=== Impute 0s for every value except the last one, then flag
            %    the zeros for removal so that only the end value survives
            idxManip = cellfun(@(x) [true(numel(x)-1,1);false], tmp(:,4),'UniformOutput',false);
            tmp(:,4) = cellfun(@(x,y) x-x.*y, tmp(:,4), idxManip, 'UniformOutput',false);
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq'); % index ==0 for removal
            fprintf('Deleted %2.0f vector values, leaving only the end value.\n',N);

        case 'Gender'
            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x==-1, @(x) NaN);
            fprintf('Replaced %2.0f values which were -1 with NaN.\n',N);

        case 'Height'
            % 1 centimetre = 0.393700787 inches
            % 1 foot = 30.48 centimetres
            % 1 inch = 2.54 centimetres
            % The rules are applied in sequence, so a value rescaled by one
            % rule is re-tested by the following ones.
            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x==-1, @(x) NaN);
            fprintf('Replaced %2.0f values which were -1 with NaN.\n',N);

            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x<10, @(x) x*100);
            fprintf('Multipled %2.0f values of <10 by 100 (1.8->180).\n',N);

            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x<25, @(x) x*10);
            fprintf('Multipled %2.0f values of <25 by 10 (18->180).\n',N);

            % Inches -> centimetres
            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x<100, @(x) x*2.54);
            fprintf('Multipled %2.0f values of <100 by 2.54 (70.9->180).\n',N);

            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x>1000, @(x) x*0.1);
            fprintf('Multipled %2.0f values of >1000 by 0.1 (1800->180).\n',N);

            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x>250, @(x) x/2.54);
            fprintf('Multipled %2.0f values of >250 by 0.3937 (400->157).\n',N);
    end

    %=== Replace data with tmp, then delete entries flagged for deletion
    data = commitField(data, tmp, idx, idxRem);
end

%=== Loop through all the fields, and process each appropriately
for k=1:size(dataDesc,1)
    fn = dataDesc{k,1};
    fprintf('\n%%=== %s ===%%\n', fn);
    [tmp,idx] = pnExtractField(data,fn);
    idxRem=[];

    switch fn
        case 'Albumin'
            fprintf('No preprocessing performed.\n');

        case 'ALP'
            %=== Possible conversion errors in ALP, ALT, AST:
            % Listed as measured in IU
            %   1 Katal = 60,000,000 IU
            %   1 micro Katal = 60 IU
            %   1 nano Katal = 0.060 IU
            %   *** micro kats are sometimes used.
            % Reference range: 30 to 120 IU/L
            fprintf('No preprocessing performed - Note: highly skewed. Dependent on age and gender.\n');

        case 'ALT'
            % Reference range: 7 to 56 IU/L
            fprintf('No preprocessing performed.\n');

        case 'AST'
            % Reference range: 5 to 40 IU/L
            % Very heavy tailed: Normal range 5 to 40 IU/L, 10% data > 1000 IU/L.
            fprintf('No preprocessing performed.\n');

        case 'Bilirubin'
            % Reference range: 0.2 to 1.2 mg/dL
            % 1 mg/dL == 17.1 µmol/L
            % It is possible to have >36 mg/dL in cases of transplant, etc.
            % Can't unilaterally fix this.
            fprintf('No preprocessing performed.\n');

        case 'BUN'
            % Reference range: 10-20 mg/dl (3.6-71 mmol/liter)
            % 1 mg/dL == 0.357 mmol/L
            % Can't really convert anything here.
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            fprintf('Deleted %2.0f values which were 0.\n',N);

        case 'Cholesterol'
            % Reference range:
            %   Desirable        <200 mg/dl     <5.17 mmol/L
            %   Borderline high  200-239 mg/dl  5.17-6.18 mmol/L
            %   High             ≥240 mg/dl     ≥6.18 mmol/L
            % 1 mg/dL == 0.0259 mmol/L
            % Nothing bad here.
            fprintf('No preprocessing needed.\n');

        case 'Creatinine'
            % Reference range: 0.6-1.3 mg/dl
            %   <1.5 mg/dl (NEJM)
            % 1 mg/dL == 88.4 µmol/L
            %=== ~6 is a reasonable maximum, 7.6 is very confident
            % Seems OK.
            fprintf('No preprocessing needed.\n');

        case 'DiasABP'
            %=== First, delete '0's since we don't know if it was badly
            %    converted or missing
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            fprintf('Deleted %2.0f values which were 0.\n',N);

            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, -1, 'eq');
            fprintf('Deleted %2.0f values which were -1.\n',N);
            idxRem = orMasks(idxRem, idxRem2);

            % Report (but keep) values in the 170-200 band:
            % count(x>170) minus count(not x<200)
            nAbove170    = sum(cellfun(@(x) sum(x>170),    tmp(:,4)));
            nNotBelow200 = sum(cellfun(@(x) sum(~(x<200)), tmp(:,4)));
            fprintf('There exist %2.0f values between 170-200, which were left as is.\n',nAbove170-nNotBelow200);

            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, 200, 'gt');
            idxRem = orMasks(idxRem, idxRem2);
            fprintf('Deleted %2.0f values above 200.\n',N);

        case 'FiO2'
            fprintf('No preprocessing performed.\n');

        case 'GCS'
            fprintf('No preprocessing performed.\n');

        case 'Glucose'
            % Reference ranges:
            %   Fasting
            %     Normal             75-115 mg/dl  4.2-6.4 mmol/L
            %     Diabetes mellitus  >125 mg/dl    >7.0 mmol/L
            %   2 Hr post-meal       <120 mg/dl    <6.7 mmol/L
            % 1 mg/dL == 0.0555 mmol/L
            fprintf('No preprocessing performed.\n');

        case 'HCO3'
            % Reference ranges: 21-30 mEq/L  21-28 mmol/L
            % 1 mEq/L == 1 mmol/L
            fprintf('No preprocessing performed.\n');

        case 'HCT'
            % Reference range: 41.0-53.0%
            fprintf('No preprocessing performed.\n');

        case 'HR'
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            fprintf('Deleted %2.0f values which were 0.\n',N);

            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, 300, 'eq');
            idxRem = orMasks(idxRem, idxRem2); % Combine removal indices
            fprintf('Deleted %2.0f values which were 300.\n',N);

        case 'K'
            % Reference ranges: 3.5-5.0 mEq/L  3.5-5.0 mmol/L
            % 1 mEq/L == 1 mmol/L
            fprintf('No preprocessing performed.\n');

        case 'Lactate' % mmol/L
            % Reference ranges: 5-15 mg/dl  0.6-1.7 mmol/liter
            % 1 mg/dL == 0.111 mmol/L
            fprintf('No preprocessing performed.\n');

        case 'Mg' % mmol/L
            % Reference ranges: 1.8-3.0 mg/dl  0.8-1.2 mmol/L
            % 1 mg/dL == 0.411 mmol/L
            % 1 mEq/L == 0.50 mmol/L
            fprintf('No preprocessing performed.\n');

        case 'MAP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            fprintf('Deleted %2.0f values which were less than 1.\n',N);

        case 'MechVent'
            fprintf('No preprocessing performed.\n');

        case 'Na' % mEq/L
            % Reference ranges: 136-145 mEq/L  136-145 mmol/L
            % 1 mEq/L == 1 mmol/L
            %=== Interesting spike at 150, possible rounding bias?
            fprintf('No preprocessing performed.\n');

        case 'NIDiasABP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            fprintf('Deleted %2.0f values which were less than 1.\n',N);

        case 'NIMAP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            fprintf('Deleted %2.0f values which were less than 1.\n',N);

        case 'NISysABP' % mmHg
            % Values below 1 are removed, as for the other pressure fields
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            fprintf('Deleted %2.0f values which less than 1.\n',N);

        case 'PaCO2' % mmHg
            % Reference range:
            %   4.7-6.0 kPa
            %   35-45 mmHg
            % 1 kPa == 7.5006 mmHg
            % Values below 1 must be wrong...
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            fprintf('Deleted %2.0f values which were less than 1.\n',N);

            % Alternative (disabled): rescale instead of deleting
            % idxManip = cellfun(@(x) x<1, tmp(:,4),'UniformOutput',false);
            % low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            % low = cellfun(@(x) x*100, low, 'UniformOutput',false);
            %
            % for m=1:size(tmp,1)
            %     tmp{m,4}(idxManip{m}) = low{m};
            % end

        case 'PaO2' % mmHg
            % Reference range:
            %   11-13 kPa
            %   75-100 mmHg
            % 1 kPa == 7.5006 mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            fprintf('Deleted %2.0f values which were 0.\n',N);

            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) x<10, @(x) x*7.5006);
            fprintf('Replaced %2.0f values which were kPa, not mmHg.\n',N);

        case 'pH'
            % Reference range:
            %   7.34-7.45 units
            %=== This is a mess
            % There are 5 values between 94-100, not sure why. Probably put
            % in the wrong field? Doesn't match with an [H+] conversion

            % Decimal-point slips: bring 0.7 / 70 / 700 back to 7
            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) (x>0.65 & x<0.8), @(x) x*10);
            fprintf('Multiplied %2.0f values by 10 (0.7->7).\n',N);

            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) (x>65 & x<80), @(x) x*0.1);
            fprintf('Multiplied %2.0f values by 0.1 (70->7).\n',N);

            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) (x>650 & x<800), @(x) x*0.01);
            fprintf('Multiplied %2.0f values by 0.01 (700->7).\n',N);

            %=== Values <6.5 and >0.8 are deleted
            % Note: (there are no values around 0.6-0.75)
            idxRem  = pnPreprocessRemovalIndices(tmp, idx, 0.8, 'gt');
            idxRem2 = pnPreprocessRemovalIndices(tmp, idx, 6.5, 'lt');
            idxRem  = cellfun(@(x,y) x & y, idxRem, idxRem2, 'UniformOutput',false);
            N = sum(cellfun(@nnz, idxRem));
            fprintf('Deleted %2.0f values which were between [0.8,6.5].\n',N);

            %=== Values >80 & <650 are deleted
            idxRem3 = pnPreprocessRemovalIndices(tmp, idx, 80, 'gt');
            idxRem4 = pnPreprocessRemovalIndices(tmp, idx, 650, 'lt');
            idxRem  = cellfun(@(x,y,z) (x & y) | z, idxRem3, idxRem4, idxRem, 'UniformOutput',false);
            N = sum(cellfun(@(x,y) nnz(x&y), idxRem3, idxRem4));
            fprintf('Deleted %2.0f values which were between [80,650].\n',N);

        case 'Platelets' % cells/nL
            % Reference range: 150-350 (10e3)/mm^3
            % 1 (10e3)/µL == 1 (10e3)/mm^3 == 1/nL
            % Errors are probably in orders of 1000
            fprintf('No preprocessing performed.\n');

        case 'RespRate' % bpm
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            fprintf('Deleted %2.0f values which were 0.\n',N);

        case 'SaO2' % %
            % Reference range: 94-100
            fprintf('No preprocessing performed.\n');

        case 'SysABP' % mmHg
            % Reference range:
            %   10-14 kPa
            %   75-105 mmHg
            % 1 kPa == 7.5006 mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            fprintf('Deleted %2.0f values which were less than 1.\n',N);

        case 'Temp'
            %=== Check for Fahrenheit measurements and incorrect conversions
            %=== There are 129 values which are too low...
            %=== First, there are 39 '0's, which should be set to deleted,
            %    since we don't know if it was badly converted or missing
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'le');
            fprintf('Deleted %2.0f values which were 0.\n',N);

            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, -17, 'lt');
            fprintf('Deleted %2.0f values which were -17.8 (i.e. incorrectly converted from 0).\n',N);
            idxRem = orMasks(idxRem, idxRem2);

            %=== Now, assume the rest of the values have been erroneously
            %    converted from F to C, so convert them back.
            idxManip = cellfun(@(x) x<20, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*9/5+32, low, 'UniformOutput',false);

            % Values still below 10 after one back-conversion are assumed to
            % have been converted twice, so undo the conversion once more
            idxManip2 = cellfun(@(x) x<10, low, 'UniformOutput',false);
            for m=1:size(low,1)
                low{m}(idxManip2{m}) = low{m}(idxManip2{m})*9/5+32;
            end

            % Write the repaired values back into their original positions
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            fprintf('Replaced %2.0f values which were incorrectly converted.\n',sum(cellfun(@nnz, idxManip)));
            fprintf('Replaced %2.0f values of these values twice.\n',sum(cellfun(@nnz, idxManip2)));

        case 'TroponinI'
            % Reference ranges: 0-0.4 µg/L
            % Could be off by orders of 10
            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) (x>30), @(x) x*0.1);
            fprintf('Multiplied %2.0f values by 0.1 (30->3).\n',N);

        case 'TroponinT'
            % Reference ranges: 0-0.1 µg/L
            % Could be off by orders of 10.
            fprintf('No preprocessing performed.\n');

        case 'Urine'
            fprintf('No preprocessing performed.\n');

        case 'WBC' % cells/nL
            % Reference range: 4.5-11.0
            % 1 (10^3)/µL == 1 (10^9)/L
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            fprintf('Deleted %2.0f values which were less than 1.\n',N);

        case 'Weight' % kg
            % Reference range:
            %   86.6 kg (190.9 lb) (males)
            %   74.4 kg (164.0 lb) (females)
            % 1 kilogram = 2.20462262 pounds, 1 pound = 0.45359237 kilograms
            % 1 kilogram = 0.15747304 stones, 1 stone = 6.35029318 kilograms

            %=== Impute NaN for the -1 placeholders
            [tmp(:,4),N] = applyWhere(tmp(:,4), @(x) (x==-1), @(x) NaN);
            fprintf('Replaced %2.0f values which were -1 with NaN.\n',N);

            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            fprintf('Deleted %2.0f values which were 0.\n',N);

            %=== The following values continually pop up:
            % 0.6 is on patient 3889 (subid 142393)
            %   Their weight is constant at 70, then becomes 0.6 at 467 min
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, 35, 'lt');
            idxRem = orMasks(idxRem, idxRem2);
            fprintf('Deleted %2.0f values which were less than 35.\n',N);
    end

    %=== Replace data with tmp, then remove deleted data from 'data'
    data = commitField(data, tmp, idx, idxRem);
end

end

function [values, nChanged] = applyWhere(values, selectFcn, mapFcn)
%APPLYWHERE Rewrite the selected elements of every cell of VALUES.
%   values    - cell array, one numeric vector per patient
%   selectFcn - handle returning a logical mask for one vector
%   mapFcn    - handle applied to the selected elements; its result is
%               written back into the selected positions
%   nChanged  - total number of selected elements over all cells
nChanged = 0;
for m = 1:numel(values)
    mask = selectFcn(values{m});
    values{m}(mask) = mapFcn(values{m}(mask));
    nChanged = nChanged + nnz(mask);
end
end

function data = commitField(data, tmp, idx, idxRem)
%COMMITFIELD Write one processed field back into DATA.
%   Columns 2:end of DATA are first updated from TMP via
%   pnPreprocessReplaceData; if IDXREM is non-empty the flagged entries are
%   then removed via pnPreprocessDeleteData. Column 1 is left untouched.
for m = 2:size(data,2)
    data(:,m) = cellfun(@pnPreprocessReplaceData, data(:,m), tmp(:,m), idx, 'UniformOutput', false);
end

if ~isempty(idxRem)
    for m = 2:size(data,2)
        data(:,m) = cellfun(@pnPreprocessDeleteData, data(:,m), idxRem, 'UniformOutput', false);
    end
end
end