
Merge pull request #5 from christophriepe/v1_2
Add organ system identifier as well as 10 new targets
christophriepe authored Feb 28, 2024
2 parents 412376d + 02e890b commit ddaf12b
Showing 3 changed files with 139 additions and 187 deletions.
129 changes: 35 additions & 94 deletions 0_pre/0_process.ipynb
@@ -179,6 +179,17 @@
"categorial_numeric_encoding('identifier_cohort', raw_name='center')"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# Identifier: Organ System\n",
+"raw['organ_system'] = raw['organ'].replace({ 'Esophagus': 0, 'Gastric': 1, 'Colorectal': 2, 'Small_intestine': 2, 'Liver': 3, 'Pancreas': 4 })\n",
+"categorial_numeric_encoding('identifier_organ_system', raw_name='organ_system', not_supported=['Other_organ_system', 'Transplant'])"
+]
+},
{
"cell_type": "markdown",
"metadata": {},
@@ -1220,7 +1231,7 @@
"outputs": [],
"source": [
"# pH\n",
"numeric('ph', raw_name='abg_ph', min=0, max=10)"
"numeric('ph', raw_name='abg_ph', min=6, max=8)"
]
},
{
@@ -1230,7 +1241,7 @@
"outputs": [],
"source": [
"# Base Excess\n",
"numeric('base_excess', raw_name='base_excess', min=-100, max=100)"
"numeric('base_excess', raw_name='base_excess', min=-50, max=50)"
]
},
{
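The two hunks above tighten the plausibility windows used when ingesting arterial blood-gas values: a human pH reading outside roughly 6 to 8, or a base excess beyond ±50 mmol/l, is almost certainly a recording error, whereas the old 0 to 10 and ±100 bounds let such slips through. A minimal sketch of the numeric helper's assumed behaviour follows (its real definition lives earlier in the notebook and is not shown in this diff), treating out-of-range readings as missing rather than clipping them.

```python
import pandas as pd

raw = pd.DataFrame({'abg_ph': [7.41, 0.74, 6.9, None]})  # 0.74: a dropped digit
data = pd.DataFrame(index=raw.index)

# Hypothetical reconstruction of `numeric(name, raw_name, min, max)`: parse as
# float and discard values outside the plausibility window. `min`/`max` shadow
# the builtins only to match the notebook's call signature.
def numeric(name, raw_name, min=None, max=None):
    column = pd.to_numeric(raw[raw_name], errors='coerce')
    if min is not None:
        column = column.mask(column < min)  # implausibly low -> missing
    if max is not None:
        column = column.mask(column > max)  # implausibly high -> missing
    data[name] = column

numeric('ph', raw_name='abg_ph', min=6, max=8)
print(data)  # 0.74 passed the old 0-10 window but is now rejected as an entry error
```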
@@ -1316,8 +1327,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Target: Clavien Dindo 5\n",
"categorial_numeric_encoding('target_clavien_dindo_5', raw_name='clavien_dindo_v')"
"# Target: Deceased after Discharge\n",
"categorial_numeric_encoding('target_deceased_after_discharge', raw_name='deceased_after_discharge')"
]
},
{
@@ -1326,8 +1337,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Target: Deceased after Discharge\n",
"categorial_numeric_encoding('target_deceased_after_discharge', raw_name='deceased_after_discharge')"
"# Target: Clavien Dindo 5\n",
"categorial_numeric_encoding('target_clavien_dindo_5', raw_name='clavien_dindo_v')"
]
},
{
Expand All @@ -1347,7 +1358,7 @@
"outputs": [],
"source": [
"# Target: Pulmonary Embolism\n",
"categorial_numeric_encoding('target_pulmonary embolism', raw_name='pulmonary_embolism')"
"categorial_numeric_encoding('target_pulmonary_embolism', raw_name='pulmonary_embolism')"
]
},
{
@@ -1511,6 +1522,16 @@
"data"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"print(len(data[data['identifier_cohort'] == 0]))\n",
+"print(len(data[data['identifier_cohort'] == 1]))"
+]
+},
{
"cell_type": "markdown",
"metadata": {},
@@ -1540,6 +1561,8 @@
"metadata": {},
"outputs": [],
"source": [
"center0 = data[data['identifier_cohort'] == 0]\n",
"\n",
"completeness = data.count() / len(data) * 100\n",
"completeness = completeness.apply(lambda x: round(x / 10) * 10)\n",
"completeness = completeness.value_counts().sort_index()\n",
@@ -1552,7 +1575,6 @@
"metadata": {},
"outputs": [],
"source": [
"center0 = data[data['identifier_cohort'] == 0]\n",
"for column in center0.columns:\n",
" if center0[column].isna().sum() / len(center0) > 0.5:\n",
" data.drop(column, axis=1, inplace=True)\n",
Expand All @@ -1561,86 +1583,6 @@
"data"
]
},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Intermediate Save"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"data.to_csv('3_clean.csv', index = False)\n",
-"data"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"print(len(data[data['identifier_cohort'] == 0]))\n",
-"print(len(data[data['identifier_cohort'] == 1]))"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"description: pd.DataFrame = pd.DataFrame()\n",
-"\n",
-"i = 0\n",
-"for column in data.columns:\n",
-" data_total = data[column]\n",
-" data_training = data[data['identifier_cohort'] == 0][column]\n",
-" data_validation = data[data['identifier_cohort'] == 1][column]\n",
-"\n",
-" description = description.append({\n",
-" 'id': i,\n",
-" 'name': column,\n",
-" 'dimension': '',\n",
-" 'lower_limit': '',\n",
-" 'upper_limit': '',\n",
-" 'type': data_total.dtype,\n",
-" 'count': data_total.count(),\n",
-" 'count (0)': data_training.count(),\n",
-" 'count (1)': data_validation.count(),\n",
-" 'completeness': round(data_total.count() / len(data_total) * 100, 2),\n",
-" 'completeness (0)': round(data_training.count() / len(data_training) * 100, 2),\n",
-" 'completeness (1)': round(data_validation.count() / len(data_validation) * 100, 2),\n",
-" 'min': data_total.min(),\n",
-" 'min (0)': data_training.min(),\n",
-" 'min (1)': data_validation.min(),\n",
-" 'max': data_total.max(),\n",
-" 'max (0)': data_training.max(),\n",
-" 'max (1)': data_validation.max(),\n",
-" 'unique': data_total.nunique(),\n",
-" 'unique (0)': data_training.nunique(),\n",
-" 'unique (1)': data_validation.nunique(),\n",
-" }, ignore_index=True)\n",
-"\n",
-" i += 1\n",
-"\n",
-"description.to_csv('4_description.csv', index=False)\n",
-"description"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"print(description.to_markdown(index=False))"
-]
-},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -1654,7 +1596,7 @@
"metadata": {},
"outputs": [],
"source": [
"# 90 Day Mortality\n",
"# 30 Day Mortality\n",
"data = data[data['target_30_day_mortality'].notna()]\n",
"\n",
"print(len(data[data['identifier_cohort'] == 0]))\n",
@@ -1669,7 +1611,7 @@
"metadata": {},
"outputs": [],
"source": [
"# 30 Day Mortality\n",
"# 90 Day Mortality\n",
"data = data[data['target_90_day_mortality'].notna()]\n",
"\n",
"print(len(data[data['identifier_cohort'] == 0]))\n",
@@ -1684,8 +1626,8 @@
"metadata": {},
"outputs": [],
"source": [
"# 75% Completeness\n",
"data = data.dropna(thresh=len(data.columns) * 0.75)\n",
"# Completeness\n",
"data.dropna(thresh=len(data.columns) * 0.75, inplace=True)\n",
"\n",
"print(len(data[data['identifier_cohort'] == 0]))\n",
"print(len(data[data['identifier_cohort'] == 1]))\n",
@@ -1700,7 +1642,6 @@
"outputs": [],
"source": [
"data = data.loc[:, ~data.columns.str.startswith('meta_')]\n",
"data = data.loc[:, ~data.columns.str.startswith('target_') | data.columns.str.startswith('target_30_day_mortality') | data.columns.str.startswith('target_90_day_mortality')]\n",
"data"
]
},
@@ -1717,7 +1658,7 @@
"metadata": {},
"outputs": [],
"source": [
"data.to_csv('5_final.csv', index = False)\n",
"data.to_csv('3_final.csv', index = False)\n",
"data"
]
},
19 changes: 9 additions & 10 deletions 0_pre/1_analyze.ipynb
@@ -65,11 +65,11 @@
" training_column = data[data['identifier_cohort'] == 0][column]\n",
" validation_column = data[data['identifier_cohort'] == 1][column]\n",
"\n",
" print(f\"ALL: {data['asa'].count()} (N) {data['asa'].count() / len(data['asa'])} (%)\")\n",
" print(f\"ALL: {data['asa'].count()} (N); {data['asa'].count() / len(data['asa'])} (%)\")\n",
" print(data[column].value_counts()); print(data[column].value_counts(normalize=True))\n",
" print(f\"TRAINING: {training_column.count()} (N) {training_column.count() / len(training_column)} (%)\")\n",
" print(f\"TRAINING: {training_column.count()} (N); {training_column.count() / len(training_column)} (%)\")\n",
" print(training_column.value_counts()); print(training_column.value_counts(normalize=True))\n",
" print(f\"VALIDATION: {validation_column.count()} (N) {validation_column.count() / len(validation_column)} (%)\")\n",
" print(f\"VALIDATION: {validation_column.count()} (N); {validation_column.count() / len(validation_column)} (%)\")\n",
" print(validation_column.value_counts()); print(validation_column.value_counts(normalize=True))\n",
" "
]
@@ -101,14 +101,13 @@
"metadata": {},
"outputs": [],
"source": [
"training_data = data[data['identifier_cohort'] == 0]\n",
"pure_data = data.drop(columns=[col for col in data.columns if col.startswith('identifier_')]) # also drop additional targets?\n",
"pure_training_data = pure_data[data['identifier_cohort'] == 0]\n",
"pure_validation_data = pure_data[data['identifier_cohort'] == 1]\n",
"\n",
"completeness = []\n",
"for column in training_data.columns:\n",
" if column == 'identifier_cohort': continue\n",
" completeness.append(training_data[column].count() / len(training_data[column]))\n",
"\n",
"print(sum(completeness) / len(completeness))"
"print(f'[All] {pure_data.count().sum()} (N); {pure_data.count().sum() / pure_data.size} (%)')\n",
"print(f'[Training] {pure_training_data.count().sum()} (N); {pure_training_data.count().sum() / pure_training_data.size} (%)')\n",
"print(f'[Validation] {pure_validation_data.count().sum()} (N); {pure_validation_data.count().sum() / pure_validation_data.size} (%)')"
]
},
{
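The rewritten completeness check above is arithmetically equivalent to the deleted per-column loop: when every column shares the same length, the mean of per-column completeness equals total non-null cells over total cells (count().sum() / size). The new version simply extends that ratio to all three views and drops the identifier_ columns up front. A quick check of the identity on a stand-in frame:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, None, 3, 4], 'b': [None, None, 6, None]})

per_column_mean = (df.count() / len(df)).mean()  # the deleted loop, vectorised
overall_ratio = df.count().sum() / df.size       # the new cell's expression
assert per_column_mean == overall_ratio          # 0.5 == 0.5: same quantity
print(per_column_mean, overall_ratio)
```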
