# File: deid.config
# Example configuration file used by the de-identification
# software, deid.pl, to de-identify the gold standard corpus
# and output performance statistics.
# Authors: Margaret Douglass, Ishna Neamatullah, William J. Long, Li-wei Lehman
# Last modified by Li-wei Lehman (lilehman@mit.edu) Nov. 2007
#
# Description: This configuration file allows you to
# (1) set certain global variables, (2) turn certain filters on/off,
# and (3) turn certain dictionaries on/off. The general format is
#   <configuration string> = <value>
#
# The <value> is 'y'/'n' or '0'/'1' or some other value, depending
# on the configuration string.
# For example, for the configuration string "Gold standard comparison",
# set the value to either 0 or 1. See the rest of this file for more examples.
# IMPORTANT: do not change the configuration string, as this might
# cause the software to not recognize the configuration setting.
########################################################################
#################Configure Comparison or Output Mode####################
# "Gold standard comparison = 0" for output mode.
# "Gold standard comparison = 1" for performance comparison mode; a
# gold standard corpus and PHI list must be provided.
Gold standard comparison = 1
#Gold standard comparison = 0
########################################################################
########Configure Date Related Variables for De-identification##########
# Date offset should be an integer that represents the number of days
# to date shift in re-identifying dates in the medical notes.
# This date offset will be applied to all patients. To use a different
# date shift for different patients, set "PID to date offset mapping"
# to 'y', and provide the mapping in a file called "shift.txt" in the
# same directory.
# Date offset is 0 in GS comparison mode, since we are not outputting
# any de-ided text with date shift. For output mode, set Date offset
# to the number of days that will be used to date shift all
# patients.
# Note that this offset is ignored if a PID to date offset file
# is available.
Date offset = 0
# PID to date shift mappings: if set to 'y', the code will
# load the patient-specific date shift from the file "shift.txt".
PID to date offset mapping = n
# Format for the default date should be MM/DD/YYYY
Date Default = 01/01/2020
# The "Two Digit Year Threshold" is used to determine whether
# to interpret the year as a year in the 1900's or 2000's.
# Must be a 1- or 2-digit number.
# Two digit years > Threshold are interpreted as in the 1900's
# Two digit years <= Threshold are interpreted as in the 2000's
# The following threshold is set according to the re-identified dates
# that appear in our gold standard corpus.
Two Digit Year Threshold = 30
########################################################################
##################Configure De-identification Filters:##################
# De-identification filters used:
# PHI categories filtered:
#  1. Social Security Numbers (SSN)
#  2. Uniform Resource Locators (URL)
#  3. Email addresses
#  4. Telephone/fax numbers
#  5. Provider/unit/medical record numbers
#  6. Ages over 90
#  7. Locations and hospital names
#  8. Dates
#  9. Names
# 10. U.S. States
# Note:
# GS (gold standard) filters patterns (e.g. ward names) specific to
# gold std corpus (which are nursing notes). The filter for DS should
# always be set to "n" for this distribution, as it applies only to
# patterns we see in our discharge summaries, which are not included
# in this distribution.
# Use 'y' to set the filter on or 'n' to turn the filter off
SSN filter = y
URL filter = y
Email filter = y
Telephone filter = y
Unit number filter = y
Age filter = y
Location filter = y
Date filter = y
Name filter = y
State filter = n
GS filter = y
DS filter = n
#########################################################################
#########Configure Dictionary Loading for De-identification #############
# Note: there are more dictionaries than listed here.
# The ones listed
# here are the ones whose loading you are allowed to enable/disable.
# Generic first/lastname dictionaries are always loaded.
# Lists used:
# 1. PID to patient name mappings: "lists/pid_patientname.txt";
# 2. Country names: "lists/countries_unambig.txt";
# 3. Company names:
#    a) "lists/company_names_unambig.txt",
#    b) "lists/company_names_ambig.txt".
# 4. Ethnicities: "lists/ethnicities_unambig.txt";
# 5. Hospitals: "lists/stripped_hospitals.txt";
# 6. Locations:
#    a) "lists/locations_unambig.txt",
#    b) "lists/locations_ambig.txt".
# 7. LocalPlaces:
#    a) "lists/local_places_unambig.txt",
#    b) "lists/local_places_ambig.txt".
# 8. Doctor names:
#    a) "lists/doctor_first_names.txt"
#    b) "lists/doctor_last_names.txt"
# 9. US States:
#    a) "lists/us_states.txt"
#    b) "lists/us_states_abbre.txt"
#    c) "lists/more_us_state_abbreviations.txt"
# Configure lists/dictionaries:
# Use 'y' to load the dictionary or 'n' to not load the dictionary
# Note that we load the State dictionary for de-identification
# of patterns of zipcode and university/college names with
# State names in them.
PID to patient name mapping = y
Country names = n
Company names = y
Ethnicities = n
Hospital names = y
Location names = y
Doctor names = y
LocalPlaces names = y
State names = y