Cerebral Vasoregulation in Elderly with Stroke 1.0.0

File: <base>/conversion/convert-final.ipynb (38,785 bytes)
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "from multiprocessing import Pool\n",
    "import os\n",
    "import pdb\n",
    "import re\n",
    "import shutil\n",
    "\n",
    "import cxutils as cx\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import wfdb\n",
    "%matplotlib qt\n",
    "\n",
    "#os.getcwd()\n",
    "base_project_dir = '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation'\n",
    "base_write_dir = os.path.join(base_project_dir, 'output/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Subject numbers are in the form **SXXXX**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 1 - Stand sit tests\n",
    "\n",
    "## Directories\n",
    "\n",
    "- Input directory: `labview`\n",
    "- Output directory: `sit-stand`\n",
    "\n",
    "## Data Description\n",
    "\n",
    "Labview and windaq files. 7 channel 500Hz. Files in the form `S####A.dat` or `S####A.wdq`.\n",
    "\n",
    "Channels are:\n",
    "0. marker\n",
    "1. ecg\n",
    "2. abp\n",
    "3. thermst\n",
    "4. flow rate\n",
    "5. o2\n",
    "6. co2\n",
    "\n",
    "There are many more `dat` files than `wdq` files.\n",
    "\n",
    "- There is one anomalous file that ends with 'A.dat': `S0361SACA.dat`, in addition to the expected `S0361SA.dat`.\n",
    "- There are 3 files: S0214SA.dat, S0218SA.dat, S0221SA.dat, which have file lengths not divisible by 7."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = os.path.join(base_project_dir, 'labview')\n",
    "write_dir = os.path.join(base_write_dir, 'sit-stand')\n",
    "\n",
    "input_files = [f for f in cx.list_files(input_dir, extensions=['dat']) if f.endswith('A.dat')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspecting for format.\n",
    "# We know there are supposed to be 7 channels.\n",
    "\n",
    "for file in input_files[:5]:\n",
    "    file_size = os.path.getsize(file)\n",
    "    \n",
    "    # This is expected to be 0. And it is for all files except: S0214SA.dat S0218SA.dat S0221SA.dat\n",
    "    print(file_size % 7)\n",
    "    \n",
    "    if file_size % 7:\n",
    "        continue\n",
    "        \n",
    "    # Inspect duration. We know it should be about 5 minutes sitting, 5 minutes standing, for at least 10m (600s) total.\n",
    "    # Time (s) = samples per channel / 500\n",
    "    bytes_per_chan = file_size / 7\n",
    "    # 4 bytes per sample gives us range from 600 to 1000+ seconds (except anomaly 90s). Seems reasonable. \n",
    "    # If 8 bytes, it would not be long enough.\n",
    "    print(bytes_per_chan / 500 / 4)\n",
    "    \n",
    "    # Graph the waveforms. Sweep parameters: endianness, bits, signed/unsigned/float.\n",
    "    # It seems >f4 is the correct format. Only int8 (signed or unsigned) gives any other reasonable wave\n",
    "    # but due to duration, logic (no 8 bit precision), and magnitude, we decide that >f4 is correct.\n",
    "    i = 0\n",
    "    \n",
    "    for endian in ['>', '<']:\n",
    "        for fmt in ['i', 'f']:\n",
    "            for bit in ['1', '2', '4', '8']:\n",
    "                \n",
    "                if fmt == 'f' and int(bit) < 4:\n",
    "                    continue\n",
    "                    \n",
    "                dtype = endian + fmt + bit\n",
    "                a = np.fromfile(file, dtype).reshape((-1, 7))\n",
    "                plt.figure(i)\n",
    "                plt.title(dtype)\n",
    "                # Plot channel 1, which should be ecg\n",
    "                plt.plot(a[:,1])\n",
    "                i += 1\n",
    "            \n",
    "    plt.show()\n",
    "\n",
    "# Correct format is: >f4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the anomalous files.\n",
    "for file in input_files:\n",
    "    file_size = os.path.getsize(file)\n",
    "    \n",
    "    # S0214SA.dat S0218SA.dat S0221SA.dat have incorrect file sizes for 7 channels\n",
    "    if file_size % 7:\n",
    "        # Read the maximum whole number of 28-byte frames (7 channels x 4 bytes/sample) and visualize\n",
    "        readable_size = file_size - file_size % 28\n",
    "        sig = np.fromfile(file, '>f4', count=int(readable_size/4)).reshape((-1, 7))\n",
    "        sig[:, 2] = sig[:, 2] * 100\n",
    "        sig[:, 5] = sig[:, 5] * 9.09\n",
    "        sig[:, 6] = sig[:, 6] * 100\n",
    "        wfdb.plot_items(sig, title=cx.basebasename(file))\n",
    "        input()\n",
    "    \n",
    "    # S0361SACA.dat has an anomalous name in addition to the expected S0361SA.dat.\n",
    "    if file in [os.path.join(input_dir, f) for f in ['S0361SACA.dat', 'S0361SA.dat']]:\n",
    "        sig = np.fromfile(file, '>f4').reshape((-1, 7))\n",
    "        sig[:, 2] = sig[:, 2] * 100\n",
    "        sig[:, 5] = sig[:, 5] * 9.09\n",
    "        sig[:, 6] = sig[:, 6] * 100\n",
    "        \n",
    "        wfdb.plot_items(sig, title=cx.basebasename(file))\n",
    "        input()\n",
    "        \n",
    "# Comments: S0214SA.dat, S0218SA.dat, S0221SA.dat look like nothing in the 7-channel interpretation. Ranges make no sense.\n",
    "# S0361SA.dat looks correct. sig_len ~= 360000. S0361SACA.dat channels actually have same amplitude range as other files,\n",
    "# but it seems massively downsampled. No useful recognizable waveform shape.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Investigate whether the non-7 channel files may have another number of channels\n",
    "# Inspect the anomalous files. \n",
    "for file in input_files:\n",
    "    file_size = os.path.getsize(file)\n",
    "    # S0214SA.dat S0218SA.dat S0221SA.dat have incorrect file sizes for 7 channels\n",
    "    if file_size % 7:\n",
    "        # Read the maximum whole number of (4 * n_sig)-byte frames for each candidate channel count and visualize\n",
    "        for n_sig in range(1, 9):\n",
    "            readable_size = file_size - file_size % (4 * n_sig)\n",
    "            sig = np.fromfile(file, '>f4', count=int(readable_size/4)).reshape((-1, n_sig))\n",
    "            wfdb.plot_items(sig, title=cx.basebasename(file))\n",
    "            input()\n",
    "\n",
    "# All channels from 1-8 do not look like proper data. Ignore these files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the valid 7-channel labview sit-stand files to wfdb records.\n",
    "for file in input_files:\n",
    "    file_size = os.path.getsize(file)\n",
    "    # Skip the anomalous files\n",
    "    if file_size % 7 or file.endswith('S0361SACA.dat'):\n",
    "        continue\n",
    "    \n",
    "    record_name = 's' + cx.basebasename(file).replace('SA', '-sit-stand')[1:]\n",
    "    # There's a file S00231A.dat with an extra zero. The subject should be S0231.\n",
    "    if '00231' in record_name:\n",
    "        record_name = record_name.replace('00231', '0231')\n",
    "    sig = np.fromfile(file, '>f4').reshape((-1, 7))\n",
    "    \n",
    "    # Apply calibrations according to spreadsheet\n",
    "    sig[:, 2] = sig[:, 2] * 100\n",
    "    sig[:, 5] = sig[:, 5] * 9.09\n",
    "    sig[:, 6] = sig[:, 6] * 7.74\n",
    "    \n",
    "    wfdb.wrsamp(record_name, fs=500, units=['NU','mV', 'mmHg','NU', 'NU', 'mmHg', 'mmHg'],\n",
    "                sig_name=['marker', 'ecg', 'abp', 'thermst', 'flow_rate', 'o2', 'co2'],\n",
    "                p_signal=sig, fmt=['16'] * 7, write_dir=write_dir)\n",
    "\n",
    "# Overall comments: The amplitude of the CO2 channels between records seems to be in 2 groups."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the written files\n",
    "for r in cx.list_records(write_dir):\n",
    "    record = wfdb.rdrecord(r)\n",
    "    wfdb.plot_wfdb(record, title=r)\n",
    "    input()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 2 - 24h bp\n",
    "\n",
    "## Directories\n",
    "\n",
    "- Input directory: 24h-bp\n",
    "- Output directory: 24h-bp\n",
    "\n",
    "## Data description\n",
    "\n",
    "Text files of bp summaries, collected every 20-30m. `*.R` and `*.V` files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = os.path.join(base_project_dir, '24h-bp')\n",
    "write_dir = os.path.join(base_write_dir, '24h-bp')\n",
    "\n",
    "input_files = cx.list_files(input_dir, extensions=['R','V'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "# They seem to be readable. Just change names\n",
    "\n",
    "# Renaming:\n",
    "# Got rid of extra 0 in S00169SA.R and S00169SA.V. S0033A.L/R -> S0033SA.L/R\n",
    "# There is S02324SA.R and S02324SA.V. This subject number is incorrect?\n",
    "# S0205.R and S0205SA.R, and S0205.V and S0205SA.V have same content. Delete S0205.R and S0205.V.\n",
    "\n",
    "# all_names = []\n",
    "for file in input_files:\n",
    "    \n",
    "    if file.endswith('.R'):\n",
    "        file_type = 'raw'\n",
    "    else:\n",
    "        file_type = 'verified'\n",
    "    \n",
    "    base_name = cx.basebasename(file).lower()\n",
    "    \n",
    "    if base_name.endswith('sa'):\n",
    "        file_name = '-'.join([base_name[:-2], 'bp', file_type, '0']) + '.txt'\n",
    "    elif base_name.endswith('sb'):\n",
    "        file_name = '-'.join([base_name[:-2], 'bp', file_type, '1']) + '.txt'\n",
    "    else:\n",
    "        file_name = '-'.join([base_name, 'bp', file_type, '0']) + '.txt'\n",
    "    \n",
    "    # all_names.append(file_name)\n",
    "    shutil.copyfile(file, os.path.join(write_dir, file_name))\n",
    "    \n",
    "# print(len(input_files), len(all_names), len(set(all_names)))  # These should all match"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 3 - Beat to beat bp\n",
    "\n",
    "\n",
    "## Directories\n",
    "\n",
    "- Input directory: portapress, portapress-new\n",
    "- Output directory: resting-bp\n",
    "\n",
    "## Data Description\n",
    "\n",
    "Recorded with portapress/beatscope. Resting continuous bp for 1.5/2 hours in supine, reclining, or sitting position.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = os.path.join(base_project_dir, 'portapress')\n",
    "write_dir = os.path.join(base_write_dir, 'resting-bp')\n",
    "input_files = cx.list_files(input_dir, extensions=['dat'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File sizes are not all divisible by 2. Seems we are forced to use the beatscope software to figure out\n",
    "# the format.\n",
    "for file in input_files:\n",
    "    file_size = os.path.getsize(file)\n",
    "    #print(file_size % 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 4 - 24h Myography\n",
    "\n",
    "## Directories\n",
    "\n",
    "- Input directory: me6000\n",
    "- Output directory: 24h-electromyography\n",
    "\n",
    "## Data Description\n",
    "\n",
    "We have .markers, .TFF, and .txt files. But not all records have text/marker files. They all have tff, so we have to convert these. The tff files should have 7 channels. But they're not all 7.\n",
    "\n",
    "They contain ecg, eeg, and accelerometer data.\n",
    "\n",
    "\n",
    "header values od -t u2 -N 512 --endian=big 07031501.TFF\n",
    "\n",
    "sample values od -t d2 -N 512 --endian=big 07031501.TFF\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from wfdb.io import rdtff\n",
    "\n",
    "input_dir = os.path.join(base_project_dir, 'me6000/data')\n",
    "write_dir = os.path.join(base_write_dir, '24h-electromyography')\n",
    "\n",
    "# tff files are separated by subjects\n",
    "subject_dirs = [d for d in cx.list_dirs(input_dir) if os.path.basename(d).lower().startswith('s')]\n",
    "input_files = cx.list_files(input_dir, extensions=['dat'])\n",
    "\n",
    "\n",
    "tff_files = cx.list_files(subject_dirs, extensions=['TFF'])\n",
    "\n",
    "# I renamed the files. Lowercased the 's', replace underscores with hyphens, and append subject\n",
    "# numbers to tff files if missing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify all the TFF files that fail to parse.\n",
    "\n",
    "def tryread(filename):\n",
    "    \"\"\"\n",
    "    Attempt to read a tff file. Return the filename if reading fails,\n",
    "    otherwise None (implicitly).\n",
    "    \"\"\"\n",
    "    name = os.path.basename(filename)\n",
    "    try:\n",
    "        # rdtff is imported from wfdb.io in the cell above. (The previous\n",
    "        # version called an undefined name `iordtff`; the bare except\n",
    "        # swallowed the NameError, reporting every file as failed.)\n",
    "        _ = rdtff(filename)\n",
    "        print('%s succeeded' % name)\n",
    "    except Exception:\n",
    "        # Catch only ordinary errors, not KeyboardInterrupt/SystemExit.\n",
    "        print('%s failed' % name)\n",
    "        return filename\n",
    "\n",
    "# tff files are separated by subjects\n",
    "subject_dirs = [d for d in cx.list_dirs(input_dir) if os.path.basename(d).lower().startswith('s')]\n",
    "tff_files = cx.list_files(subject_dirs, extensions=['TFF'])\n",
    "\n",
    "with Pool(processes=30) as pool:\n",
    "    problem_files = pool.map(tryread, tff_files)\n",
    "problem_files = list(set([f for f in problem_files if f]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# These problem files have incorrect lengths for their specified number of channels.\n",
    "# I read up to just before the end, and plot the signals.\n",
    "problem_files = [\n",
    "    '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0355/s0355-07090605.TFF',\n",
    "    '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0164/s0164-06033006.TFF',\n",
    "    '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0371/s0371-07091105.TFF',\n",
    "    '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0376/s0376-07091805.TFF',\n",
    "    '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0343/s0343-07080705.TFF'\n",
    "]\n",
    "\n",
    "def getproblem(file):\n",
    "    signal, fields, markers, triggers = rdtff(file, cut_end=True)\n",
    "    print('Finished file %s' % file)\n",
    "    return signal, fields, markers, triggers\n",
    "\n",
    "with Pool(processes=5) as pool:\n",
    "    values = pool.map(getproblem, problem_files)\n",
    "\n",
    "for i in range(len(problem_files)):\n",
    "    signal, fields, markers, triggers = values[i]\n",
    "    wfdb.plot_items(signal, ylabel=fields['sig_name'], title=os.path.basename(problem_files[i]))\n",
    "    \n",
    "# They look fine, so we will take them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# The final conversion. Convert ALL tff files to wfdb\n",
    "def convert_tff(file):\n",
    "    \"\"\"\n",
    "    Convert one tff file into a wfdb record (plus marker/trigger\n",
    "    annotation files when present), written to `write_dir`.\n",
    "    \"\"\"\n",
    "    # NOTE(review): this list duplicates the problem_files list in the\n",
    "    # inspection cell above -- presumably kept inline so the function is\n",
    "    # self-contained for the Pool workers; confirm before deduplicating.\n",
    "    problem_files = [\n",
    "        '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0355/s0355-07090605.TFF',\n",
    "        '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0164/s0164-06033006.TFF',\n",
    "        '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0371/s0371-07091105.TFF',\n",
    "        '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0376/s0376-07091805.TFF',\n",
    "        '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation/me6000/data/s0343/s0343-07080705.TFF'\n",
    "    ]\n",
    "    # These files have incorrect lengths for their channel count, so read\n",
    "    # them with cut_end=True to drop the trailing partial block.\n",
    "    if file in problem_files:\n",
    "        cut_end = True\n",
    "    else:\n",
    "        cut_end = False\n",
    "    \n",
    "    # Read the tff file\n",
    "    signal, fields, markers, triggers = rdtff(file, cut_end=cut_end)\n",
    "    \n",
    "    # Write the wfdb record file\n",
    "    wfdb.wrsamp(record_name=cx.basebasename(file), fs=fields['fs'], d_signal=signal,\n",
    "                sig_name=fields['sig_name'], adc_gain=[1]*signal.shape[1],\n",
    "                fmt=['16']*fields['n_sig'], baseline=[0]*fields['n_sig'],\n",
    "                units=['uV']*fields['n_sig'], base_time=fields['base_time'],\n",
    "                base_date=fields['base_date'], write_dir=write_dir)\n",
    "    # Write any annotation locations\n",
    "    if markers.size:\n",
    "        wfdb.wrann(record_name=cx.basebasename(file), sample=markers, extension='marker',\n",
    "                  symbol=['\"']*markers.size, aux_note=['marker']*markers.size, write_dir=write_dir)\n",
    "    if triggers.size:\n",
    "        wfdb.wrann(record_name=cx.basebasename(file), sample=triggers, extension='trigger',\n",
    "                  symbol=['\"']*triggers.size, aux_note=['trigger']*triggers.size, write_dir=write_dir)\n",
    "    print('Converted file %s' % os.path.basename(file))\n",
    "\n",
    "# NEED MORE RAM! Wish I could use more processes.\n",
    "with Pool(processes=8) as pool:\n",
    "    _ = pool.map(convert_tff, tff_files)\n",
    "\n",
    "print('DONE!')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 5 - Pedar\n",
    "## Directories\n",
    "\n",
    "- Input directory: `pedar`\n",
    "- Output directory: `walking`\n",
    "\n",
    "## Data Description\n",
    "\n",
    "There are `.ASC`, `.9rg`, and `gtc` text files. There are also some xls duplicates which are not useful.\n",
    "\n",
    "- All 9rg files have 18 lines and only have numbers/nan. 9 channel.\n",
    "- ASC have text headers. 99 channel.\n",
    "- `gtc` files have N channels. Space delimited. What are channel names? Double space delimitation...\n",
    "\n",
    "12 minute walk test.\n",
    "\n",
    "Problems: S0378A1.9rg same name different content in 9rg and asc folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. gtc files. There is no date/time info.\n",
    "# I renamed the column headings and got rid of the extra '*' characters present in some lines\n",
    "# I renamed SO197S-1.gtc (S OH 197... why do this???) to S0197S-2.gtc and SO157S-1.gtc to S0157S-4.gtc\n",
    "input_dir = os.path.join(base_project_dir, 'pedar/gtc')\n",
    "write_dir = os.path.join(base_write_dir, 'walking')\n",
    "input_files = cx.list_files(input_dir, extensions=['gtc'])\n",
    "\n",
    "# Convert the 9rg files to header and csv text files.\n",
    "\n",
    "def read_9rg(file):\n",
    "    header_lines = []\n",
    "    with open(file, 'r') as fp:\n",
    "        for line in fp:\n",
    "            if line.startswith('step_no'):\n",
    "                break\n",
    "            header_lines.append(line.strip())\n",
    "    data = pd.read_csv(file, delim_whitespace=True, skiprows=len(header_lines))\n",
    "    return header_lines, data\n",
    "\n",
    "# Write data to csv files, and headers to txt files\n",
    "for file in input_files:\n",
    "    lines, data = read_9rg(file)\n",
    "    # First line is useless\n",
    "    with open(os.path.join(write_dir, cx.basebasename(file)+'-steps-info.txt'), 'w') as fp:\n",
    "        for line in lines[1:]:\n",
    "            fp.write(\"%s\\n\" % line)\n",
    "    data.to_csv(os.path.join(write_dir, cx.basebasename(file)+'-steps.csv'), index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. asc files. The fs is specified in the header info. There is no date/time info.\n",
    "# Get the data and convert to wfdb\n",
    "# I renamed SO157S-1.asc to S0157s-4.asc, to link to the original SO157S-1.gtc mentioned above\n",
    "\n",
    "base_project_dir = '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation'\n",
    "base_write_dir = os.path.join(base_project_dir, 'output/')\n",
    "\n",
    "input_dir = os.path.join(base_project_dir, 'pedar/asc')\n",
    "write_dir = os.path.join(base_write_dir, 'walking')\n",
    "input_files = cx.list_files(input_dir, extensions=['ASC'])\n",
    "\n",
    "def read_asc(file):\n",
    "    with open(file, 'r') as fp:\n",
    "        n_header_lines = 0\n",
    "        for line in fp:\n",
    "            line = line.strip()\n",
    "            n_header_lines += 1\n",
    "            # Try to find fs. Format: 'time per frame[secs]:  0.019'\n",
    "            if 'time per frame' in line:\n",
    "                rx = re.compile('time per frame\\[secs\\]\\:\\s+(?P<period>\\d{1}\\.\\d+)')\n",
    "                period = re.findall(rx, line)[0]\n",
    "                fs = 1/float(period)\n",
    "            if line.startswith('time'):\n",
    "                break\n",
    "        data = pd.read_csv(file, delim_whitespace=True, skipinitialspace=True, skiprows=n_header_lines, header=None)\n",
    "        return data, fs\n",
    "\n",
    "for file in input_files:\n",
    "    data, fs = read_asc(file)\n",
    "    n_sig = data.shape[1] - 1\n",
    "    wfdb.wrsamp(cx.basebasename(file)+'-pressure', fs=fs, units=n_sig*['N/cm2'],\n",
    "                sig_name=['m1_pressure_'+str(ch) for ch in range(1, int(n_sig/2)+1)]+['m2_pressure_'+str(ch) for ch in range(1, int(n_sig/2)+1)],\n",
    "                p_signal=data.iloc[:, 1:].values, fmt=n_sig*['16'], write_dir=write_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. 9rg files. Just rename them and replace whitespaces with commas\n",
    "\n",
    "input_dir = os.path.join(base_project_dir, 'pedar/9rg')\n",
    "write_dir = os.path.join(base_write_dir, 'walking')\n",
    "input_files = cx.list_files(input_dir, extensions=['9rg'])\n",
    "\n",
    "rx = re.compile(r'\\s+')\n",
    "for file in input_files:\n",
    "    lines = cx.read_lines(file)\n",
    "    lines = [rx.sub(',', line.strip()) for line in lines]\n",
    "    cx.write_lines(os.path.join(write_dir, cx.basebasename(file)+'-9rg.csv'), lines)\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 6 - Head up tilt\n",
    "## Directories\n",
    "\n",
    "- Input directory: `labview`\n",
    "- Output directory: `head-up-tilt`\n",
    "\n",
    "## Data Description\n",
    "\n",
    "`S####B.dat` 10 channel labview files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = os.path.join(base_project_dir, 'labview')\n",
    "write_dir = os.path.join(base_write_dir, 'head-up-tilt')\n",
    "\n",
    "input_files = [f for f in cx.list_files(input_dir, extensions=['dat']) if f.endswith('SB.dat')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the files\n",
    "# All file_size % 40 == 0 which is good. 10 channel >f4\n",
    "for file in input_files:\n",
    "    sig = np.fromfile(file, '>f4').reshape((-1, 10))\n",
    "    record_name = 's' + cx.basebasename(file).replace('SB', '-head-up-tilt')[1:]\n",
    "    # Apply calibrations according to spreadsheet\n",
    "    sig[:, 2] = sig[:, 2] * 100\n",
    "    sig[:, 8] = sig[:, 8] * 9.09\n",
    "    sig[:, 9] = sig[:, 9] * 7.74\n",
    "    \n",
    "    wfdb.wrsamp(record_name, fs=500, units=['NU','mV', 'mmHg','cm/s', 'cm/s', 'cm/s', 'NU', 'NU', 'mmHg', 'mmHg'],\n",
    "                sig_name=['marker', 'ecg', 'abp', 'mcar', 'mcal', 'radi', 'thermst', 'flow_rate', 'o2', 'co2'],\n",
    "                p_signal=sig, fmt=['16'] * 10, write_dir=write_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the written files\n",
    "for r in cx.list_records(write_dir):\n",
    "    record = wfdb.rdrecord(r)\n",
    "    wfdb.plot_wfdb(record, title=r)\n",
    "    input()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 7 - Sit to stand with eyes open and closed on balance platform\n",
    "\n",
    "## Directories\n",
    "\n",
    "- Input directory: `labview`\n",
    "- Output directory: `sit-stand-balance`\n",
    "\n",
    "## Data Description\n",
    "\n",
    "`S####C.dat` 15 channel labview files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = os.path.join(base_project_dir, 'labview')\n",
    "write_dir = os.path.join(base_write_dir, 'sit-stand-balance')\n",
    "\n",
    "input_files = [f for f in cx.list_files(input_dir, extensions=['dat']) if f.endswith('SC.dat')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the files\n",
    "# All file_size % 60 == 0 which is good. 15 channel >f4\n",
    "\n",
    "for file in input_files:\n",
    "    sig = np.fromfile(file, '>f4').reshape((-1, 15))\n",
    "    record_name = 's' + cx.basebasename(file).replace('SC', '-sit-stand-balance')[1:]\n",
    "    # Apply calibrations according to spreadsheet\n",
    "    sig[:, 2] = sig[:, 2] * 100\n",
    "    sig[:, 8] = sig[:, 8] * 9.09\n",
    "    sig[:, 9] = sig[:, 9] * 7.74\n",
    "    # There are inf values annoyingly\n",
    "    sig[np.where(np.isinf(sig))] = np.nan\n",
    "    wfdb.wrsamp(record_name, fs=500,\n",
    "                units=['NU','mV', 'mmHg','cm/s', 'cm/s', 'cm/s', 'NU', 'ml/s', 'mmHg', 'mmHg', 'mm', 'mm', 'mm', 'mm', 'mm'],\n",
    "                sig_name=['marker', 'ecg', 'abp', 'mcar', 'mcal', 'radi', 'thermst', 'flow_rate', 'o2', 'co2', 'fx', 'fy', 'fz', 'px', 'py'],\n",
    "                p_signal=sig, fmt=['16'] * 15, write_dir=write_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset 8 - Transcranial Doppler\n",
    "## Directories\n",
    "\n",
    "- Input directory: tcd/exported-all\n",
    "- Output directory: transcranial-doppler\n",
    "\n",
    "## Data Description\n",
    "\n",
    "`SXXXXSB.XL0` and `SXXXXSB.XL1` files. For the same subject, XL1 comes after XL0.\n",
    "\n",
    "Issues:\n",
    "- `S0185SB.XL0` has channels E2, E3, E7.... What are these?\n",
    "- Only `S0078SB.XL0` has etco2 channel.\n",
    "- `S0172SB.XL0` has two mcal channels. mcal comes before mcar. Should be ok? Drop the second mcal like we drop mcar."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = os.path.join(base_project_dir, 'tcd/exported-all')\n",
    "write_dir = os.path.join(base_write_dir, 'transcranial-doppler')\n",
    "input_files = cx.list_files(input_dir, extensions=['XL0', 'XL1'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# There are 3 anomalously named files: S0183.XL1 and S0121.XL1, S0176SC.XL0. Their contents are identical to S0183SB.XL1\n",
    "# S0121SB.XL1, S0176SB.XL1 so I deleted them. rm S0183.XL1 S0121.XL1 S0176SC.XL0\n",
    "\n",
    "# There's the file S0000SB.XL0 but there is no s0. Inside the file says: patient exam: S0172SC. Renamed the\n",
    "# file S0172SB.XL1 since the time starts just a bit after S0172SB.XL0. Renamed S0030DB.XL1 to S0030SB.XL1\n",
    "\n",
    "# Some files have extra labels in certain rows.\n",
    "# Extra annotations include: 'VALSALVA, 'BASELINE', 'T-HYPOVENT', 'STAND-EC', 'STAND-EO', 'HYPERVENT', 'HYPOVENT'\n",
    "# 'TILT'\n",
    "\n",
    "# Before reading csv content, we must extract these annotations into separate files, and clean them from the files.\n",
    "for file in input_files:\n",
    "    # Figure out header line\n",
    "    with open(file, 'r') as f:\n",
    "        n_header = 0\n",
    "        for line in f:\n",
    "            if line.startswith('TIME') and 'MCAR' in line:\n",
    "                n_header += 2\n",
    "                break\n",
    "            n_header += 1\n",
    "    dname, fname = os.path.split(file)\n",
    "\n",
    "    cx.clean_dirty_csv(file, output_csv_file=os.path.join(dname, 'clean-'+fname.lower()),\n",
    "                       output_bad_file=os.path.join(dname, 'extra-'+fname.lower()), skiprows=n_header)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the data files into wfdb\n",
    "input_files = cx.list_files(input_dir, extensions=['xl0', 'xl1'])\n",
    "# Keep only the cleaned TCD exports (basenames starting with 'clean');\n",
    "# the matching 'extra' files hold the annotations read in the next loop.\n",
    "input_files = [f for f in input_files if os.path.basename(f).startswith('clean')]\n",
    "\n",
    "def read_tcd(file):\n",
    "    \"\"\"Parse a whitespace-delimited TCD text export.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file : str\n",
    "        Path to the export. The free-form header ends at the column-title\n",
    "        row, identified as the line starting with 'TIME' and containing 'MCAR'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    header_lines : list of str\n",
    "        Stripped header lines preceding the column-title row.\n",
    "    sig_names : list of str\n",
    "        Descriptive channel names (mapped via name_map below).\n",
    "    units : list of str\n",
    "        Physical units aligned with sig_names.\n",
    "    base_time : datetime.time\n",
    "        Wall-clock time of the first sample row.\n",
    "    data : pandas.DataFrame\n",
    "        Sample values with the timestamp column removed.\n",
    "    \"\"\"\n",
    "    header_lines = []\n",
    "    with open(file, 'r') as fp:\n",
    "        for line in fp:\n",
    "            # Column-title row marks the end of the header; take the signal\n",
    "            # labels from it, dropping the leading TIME column.\n",
    "            if line.startswith('TIME') and 'MCAR' in line:\n",
    "                sig_names = [s.lower() for s in line.split()][1:]\n",
    "                break\n",
    "            header_lines.append(line.strip())\n",
    "            \n",
    "    # Skip the header plus two more rows (the title row and, presumably,\n",
    "    # one units/blank row -- TODO confirm against a raw export).\n",
    "    data = pd.read_csv(file, delim_whitespace=True, skiprows=len(header_lines)+2, header=None)\n",
    "    # Column 0 holds HH:MM:SS.fff timestamps; the first one becomes the\n",
    "    # record's base time for the WFDB header.\n",
    "    base_time = datetime.datetime.strptime(data.iloc[0,0], '%H:%M:%S.%f').time()\n",
    "    \n",
    "    # Drop the timestamp column so remaining columns align with sig_names.\n",
    "    data = data.iloc[:, 1:]\n",
    "    \n",
    "    # Remove extra mcar mcal channels.\n",
    "    # Some exports repeat the cerebral channels; keep only the first of each.\n",
    "    unwanted_inds = []\n",
    "    for cerebral_name in ['mcar', 'mcal']:\n",
    "        cerebral_inds = [i for i, e in enumerate(sig_names) if e == cerebral_name]\n",
    "        if len(cerebral_inds) > 1:\n",
    "            unwanted_inds += cerebral_inds[1:]\n",
    "            \n",
    "    if len(unwanted_inds):\n",
    "        # NOTE(review): relies on set-difference of small ints iterating in\n",
    "        # ascending order to keep column order -- a CPython implementation\n",
    "        # detail; sorted(...) would make the intent explicit.\n",
    "        wanted_inds = list(set(range(len(sig_names))) - set(unwanted_inds))\n",
    "        sig_names = [sig_names[i] for i in wanted_inds]\n",
    "        data = data.iloc[:, wanted_inds]\n",
    "    \n",
    "    # Alter signal names\n",
    "    # Map raw export labels to descriptive WFDB channel names.\n",
    "    name_map = {'mcar':'cerebral_blood_velocity_right','mcal':'cerebral_blood_velocity_left',\n",
    "                'brar':'brachial_blood_velocity', 'etco2':'et_co2', 'abp':'abp', 'co2':'co2', 'rsp':'resp',\n",
    "                'ekg':'ecg', 'e1':'e1', 'e2':'e2', 'e3':'e3', 'e7':'e7', 'e8':'e8', 'hr':'hr'}\n",
    "    sig_names = [name_map[s] for s in sig_names]\n",
    "    \n",
    "\n",
    "    \n",
    "    # NOTE(review): blood *velocity* channels are labeled 'mmHg' here --\n",
    "    # possibly should be cm/s; verify against the recording equipment specs.\n",
    "    unit_dict = {'cerebral_blood_velocity_right': 'mmHg', 'cerebral_blood_velocity_left':'mmHg',\n",
    "                 'brachial_blood_velocity':'mmHg', 'abp':'mmHg', 'co2':'mmHg','et_co2':'mmHg',\n",
    "                 'resp':'NU', 'ecg':'uV', 'e1':'mmHg', 'e2':'mmHg', 'e3':'mmHg', 'e7':'mmHg', 'e8':'mmHg', 'hr':'bpm'}\n",
    "    units = [unit_dict[s] for s in sig_names]\n",
    "    return header_lines, sig_names, units, base_time, data\n",
    "\n",
    "# Write data to wfdb files\n",
    "# I don't think there's any important info in the header lines\n",
    "for file in input_files:\n",
    "    # Read the main data\n",
    "    header_lines, sig_names, units, base_time, data = read_tcd(file)\n",
    "    \n",
    "    # Read the corresponding annotations\n",
    "    # NOTE(review): ann_data is read but never used inside this loop; it only\n",
    "    # leaks to the inspection cell below, so on completion it holds just the\n",
    "    # last file's annotations. Consider writing it out here or removing this.\n",
    "    ann_data = pd.read_csv(file.replace('clean', 'extra'), delim_whitespace=True, header=None)\n",
    "    \n",
    "    \n",
    "    n_sig = data.shape[1]\n",
    "    # Record name: subject id from the filename plus a '-tcd-' suffix and the\n",
    "    # file's final character (presumably a session/visit letter -- TODO confirm).\n",
    "    record_name = cx.basebasename(file)[6:-2] + '-tcd-' + file[-1]\n",
    "\n",
    "    # fs=50: TCD exports are assumed to be sampled at 50 Hz -- verify against\n",
    "    # the acquisition documentation.\n",
    "    wfdb.wrsamp(record_name, fs=50, units=units,\n",
    "                sig_name=sig_names,\n",
    "                p_signal=data.values, fmt=n_sig*['16'],\n",
    "                base_time=base_time,\n",
    "                write_dir=write_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>11:57:18.410</td>\n",
       "      <td>34.3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>29.2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>87.9</td>\n",
       "      <td>36.3</td>\n",
       "      <td>18.6</td>\n",
       "      <td>-1.5</td>\n",
       "      <td>-11.8</td>\n",
       "      <td>BASELINE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12:02:37.320</td>\n",
       "      <td>27.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>32.6</td>\n",
       "      <td>-1.2</td>\n",
       "      <td>77.4</td>\n",
       "      <td>38.5</td>\n",
       "      <td>17.1</td>\n",
       "      <td>-1.5</td>\n",
       "      <td>-11.1</td>\n",
       "      <td>STAND-EO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12:02:39.380</td>\n",
       "      <td>25.7</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>29.2</td>\n",
       "      <td>1.7</td>\n",
       "      <td>72.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>4.7</td>\n",
       "      <td>-1.5</td>\n",
       "      <td>-22.0</td>\n",
       "      <td>STAND-EO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12:05:52.740</td>\n",
       "      <td>36.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>96.6</td>\n",
       "      <td>39.5</td>\n",
       "      <td>40.4</td>\n",
       "      <td>-1.5</td>\n",
       "      <td>-20.5</td>\n",
       "      <td>BASELINE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12:10:59.660</td>\n",
       "      <td>32.6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>37.8</td>\n",
       "      <td>0.0</td>\n",
       "      <td>82.6</td>\n",
       "      <td>36.2</td>\n",
       "      <td>35.0</td>\n",
       "      <td>-1.5</td>\n",
       "      <td>-9.0</td>\n",
       "      <td>STAND-EC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>12:14:12.060</td>\n",
       "      <td>27.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70.8</td>\n",
       "      <td>32.7</td>\n",
       "      <td>-0.1</td>\n",
       "      <td>-1.5</td>\n",
       "      <td>-16.6</td>\n",
       "      <td>BASELINE</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             0     1    2    3     4    5     6     7     8    9     10  \\\n",
       "0  11:57:18.410  34.3  0.0  0.0  29.2  0.0  87.9  36.3  18.6 -1.5 -11.8   \n",
       "1  12:02:37.320  27.5  0.0  0.0  32.6 -1.2  77.4  38.5  17.1 -1.5 -11.1   \n",
       "2  12:02:39.380  25.7  0.0  0.0  29.2  1.7  72.0  39.0   4.7 -1.5 -22.0   \n",
       "3  12:05:52.740  36.0  0.0  0.0  36.0  4.3  96.6  39.5  40.4 -1.5 -20.5   \n",
       "4  12:10:59.660  32.6  0.0  0.0  37.8  0.0  82.6  36.2  35.0 -1.5  -9.0   \n",
       "5  12:14:12.060  27.5  0.0  0.0  27.5  0.0  70.8  32.7  -0.1 -1.5 -16.6   \n",
       "\n",
       "         11  \n",
       "0  BASELINE  \n",
       "1  STAND-EO  \n",
       "2  STAND-EO  \n",
       "3  BASELINE  \n",
       "4  STAND-EC  \n",
       "5  BASELINE  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the annotation table of the LAST file processed above (depends on\n",
    "# `ann_data` leaking from the previous cell's loop -- hidden notebook state).\n",
    "ann_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write annotation files for those lines that were removed from the XL files\n",
    "\n",
    "# NOTE(review): this cell looks unfinished -- cx.read_lines() is called with\n",
    "# no arguments, `lines` is never used afterwards, and no annotation files are\n",
    "# actually written here. Complete or remove before publication.\n",
    "lines = cx.read_lines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}