
Merge pull request #5 from christophriepe/v1_2
Add organ system identifier as well as 10 new targets
christophriepe authored Feb 28, 2024
2 parents 412376d + 02e890b commit ddaf12b
Showing 3 changed files with 139 additions and 187 deletions.
129 changes: 35 additions & 94 deletions 0_pre/0_process.ipynb
@@ -179,6 +179,17 @@
"categorial_numeric_encoding('identifier_cohort', raw_name='center')"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# Identifier: Organ System\n",
+"raw['organ_system'] = raw['organ'].replace({ 'Esophagus': 0, 'Gastric': 1, 'Colorectal': 2, 'Small_intestine': 2, 'Liver': 3, 'Pancreas': 4 })\n",
+"categorial_numeric_encoding('identifier_organ_system', raw_name='organ_system', not_supported=['Other_organ_system', 'Transplant'])"
+]
+},
{
"cell_type": "markdown",
"metadata": {},
@@ -1220,7 +1231,7 @@
"outputs": [],
"source": [
"# pH\n",
"numeric('ph', raw_name='abg_ph', min=0, max=10)"
"numeric('ph', raw_name='abg_ph', min=6, max=8)"
]
},
{
@@ -1230,7 +1241,7 @@
"outputs": [],
"source": [
"# Base Excess\n",
"numeric('base_excess', raw_name='base_excess', min=-100, max=100)"
"numeric('base_excess', raw_name='base_excess', min=-50, max=50)"
]
},
{
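The two hunks above tighten the plausibility windows used when ingesting arterial blood-gas values: a human pH reading outside roughly 6 to 8, or a base excess beyond ±50 mmol/l, is almost certainly a recording error, whereas the old 0 to 10 and ±100 bounds let such slips through. A minimal sketch of the numeric helper's assumed behaviour follows (its real definition lives earlier in the notebook and is not shown in this diff), treating out-of-range readings as missing rather than clipping them.

```python
import pandas as pd

raw = pd.DataFrame({'abg_ph': [7.41, 0.74, 6.9, None]})  # 0.74: a dropped digit
data = pd.DataFrame(index=raw.index)

# Hypothetical reconstruction of `numeric(name, raw_name, min, max)`: parse as
# float and discard values outside the plausibility window. `min`/`max` shadow
# the builtins only to match the notebook's call signature.
def numeric(name, raw_name, min=None, max=None):
    column = pd.to_numeric(raw[raw_name], errors='coerce')
    if min is not None:
        column = column.mask(column < min)  # implausibly low -> missing
    if max is not None:
        column = column.mask(column > max)  # implausibly high -> missing
    data[name] = column

numeric('ph', raw_name='abg_ph', min=6, max=8)
print(data)  # 0.74 passed the old 0-10 window but is now rejected as an entry error
```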
@@ -1316,8 +1327,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Target: Clavien Dindo 5\n",
"categorial_numeric_encoding('target_clavien_dindo_5', raw_name='clavien_dindo_v')"
"# Target: Deceased after Discharge\n",
"categorial_numeric_encoding('target_deceased_after_discharge', raw_name='deceased_after_discharge')"
]
},
{
@@ -1326,8 +1337,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Target: Deceased after Discharge\n",
"categorial_numeric_encoding('target_deceased_after_discharge', raw_name='deceased_after_discharge')"
"# Target: Clavien Dindo 5\n",
"categorial_numeric_encoding('target_clavien_dindo_5', raw_name='clavien_dindo_v')"
]
},
{
Expand All @@ -1347,7 +1358,7 @@
"outputs": [],
"source": [
"# Target: Pulmonary Embolism\n",
"categorial_numeric_encoding('target_pulmonary embolism', raw_name='pulmonary_embolism')"
"categorial_numeric_encoding('target_pulmonary_embolism', raw_name='pulmonary_embolism')"
]
},
{
@@ -1511,6 +1522,16 @@
"data"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"print(len(data[data['identifier_cohort'] == 0]))\n",
+"print(len(data[data['identifier_cohort'] == 1]))"
+]
+},
{
"cell_type": "markdown",
"metadata": {},
@@ -1540,6 +1561,8 @@
"metadata": {},
"outputs": [],
"source": [
"center0 = data[data['identifier_cohort'] == 0]\n",
"\n",
"completeness = data.count() / len(data) * 100\n",
"completeness = completeness.apply(lambda x: round(x / 10) * 10)\n",
"completeness = completeness.value_counts().sort_index()\n",
@@ -1552,7 +1575,6 @@
"metadata": {},
"outputs": [],
"source": [
"center0 = data[data['identifier_cohort'] == 0]\n",
"for column in center0.columns:\n",
" if center0[column].isna().sum() / len(center0) > 0.5:\n",
" data.drop(column, axis=1, inplace=True)\n",
Expand All @@ -1561,86 +1583,6 @@
"data"
]
},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Intermediate Save"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"data.to_csv('3_clean.csv', index = False)\n",
-"data"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"print(len(data[data['identifier_cohort'] == 0]))\n",
-"print(len(data[data['identifier_cohort'] == 1]))"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"description: pd.DataFrame = pd.DataFrame()\n",
-"\n",
-"i = 0\n",
-"for column in data.columns:\n",
-" data_total = data[column]\n",
-" data_training = data[data['identifier_cohort'] == 0][column]\n",
-" data_validation = data[data['identifier_cohort'] == 1][column]\n",
-"\n",
-" description = description.append({\n",
-" 'id': i,\n",
-" 'name': column,\n",
-" 'dimension': '',\n",
-" 'lower_limit': '',\n",
-" 'upper_limit': '',\n",
-" 'type': data_total.dtype,\n",
-" 'count': data_total.count(),\n",
-" 'count (0)': data_training.count(),\n",
-" 'count (1)': data_validation.count(),\n",
-" 'completeness': round(data_total.count() / len(data_total) * 100, 2),\n",
-" 'completeness (0)': round(data_training.count() / len(data_training) * 100, 2),\n",
-" 'completeness (1)': round(data_validation.count() / len(data_validation) * 100, 2),\n",
-" 'min': data_total.min(),\n",
-" 'min (0)': data_training.min(),\n",
-" 'min (1)': data_validation.min(),\n",
-" 'max': data_total.max(),\n",
-" 'max (0)': data_training.max(),\n",
-" 'max (1)': data_validation.max(),\n",
-" 'unique': data_total.nunique(),\n",
-" 'unique (0)': data_training.nunique(),\n",
-" 'unique (1)': data_validation.nunique(),\n",
-" }, ignore_index=True)\n",
-"\n",
-" i += 1\n",
-"\n",
-"description.to_csv('4_description.csv', index=False)\n",
-"description"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"print(description.to_markdown(index=False))"
-]
-},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -1654,7 +1596,7 @@
"metadata": {},
"outputs": [],
"source": [
"# 90 Day Mortality\n",
"# 30 Day Mortality\n",
"data = data[data['target_30_day_mortality'].notna()]\n",
"\n",
"print(len(data[data['identifier_cohort'] == 0]))\n",
@@ -1669,7 +1611,7 @@
"metadata": {},
"outputs": [],
"source": [
"# 30 Day Mortality\n",
"# 90 Day Mortality\n",
"data = data[data['target_90_day_mortality'].notna()]\n",
"\n",
"print(len(data[data['identifier_cohort'] == 0]))\n",
@@ -1684,8 +1626,8 @@
"metadata": {},
"outputs": [],
"source": [
"# 75% Completeness\n",
"data = data.dropna(thresh=len(data.columns) * 0.75)\n",
"# Completeness\n",
"data.dropna(thresh=len(data.columns) * 0.75, inplace=True)\n",
"\n",
"print(len(data[data['identifier_cohort'] == 0]))\n",
"print(len(data[data['identifier_cohort'] == 1]))\n",
@@ -1700,7 +1642,6 @@
"outputs": [],
"source": [
"data = data.loc[:, ~data.columns.str.startswith('meta_')]\n",
"data = data.loc[:, ~data.columns.str.startswith('target_') | data.columns.str.startswith('target_30_day_mortality') | data.columns.str.startswith('target_90_day_mortality')]\n",
"data"
]
},
@@ -1717,7 +1658,7 @@
"metadata": {},
"outputs": [],
"source": [
"data.to_csv('5_final.csv', index = False)\n",
"data.to_csv('3_final.csv', index = False)\n",
"data"
]
},
19 changes: 9 additions & 10 deletions 0_pre/1_analyze.ipynb
@@ -65,11 +65,11 @@
" training_column = data[data['identifier_cohort'] == 0][column]\n",
" validation_column = data[data['identifier_cohort'] == 1][column]\n",
"\n",
" print(f\"ALL: {data['asa'].count()} (N) {data['asa'].count() / len(data['asa'])} (%)\")\n",
" print(f\"ALL: {data['asa'].count()} (N); {data['asa'].count() / len(data['asa'])} (%)\")\n",
" print(data[column].value_counts()); print(data[column].value_counts(normalize=True))\n",
" print(f\"TRAINING: {training_column.count()} (N) {training_column.count() / len(training_column)} (%)\")\n",
" print(f\"TRAINING: {training_column.count()} (N); {training_column.count() / len(training_column)} (%)\")\n",
" print(training_column.value_counts()); print(training_column.value_counts(normalize=True))\n",
" print(f\"VALIDATION: {validation_column.count()} (N) {validation_column.count() / len(validation_column)} (%)\")\n",
" print(f\"VALIDATION: {validation_column.count()} (N); {validation_column.count() / len(validation_column)} (%)\")\n",
" print(validation_column.value_counts()); print(validation_column.value_counts(normalize=True))\n",
" "
]
@@ -101,14 +101,13 @@
"metadata": {},
"outputs": [],
"source": [
"training_data = data[data['identifier_cohort'] == 0]\n",
"pure_data = data.drop(columns=[col for col in data.columns if col.startswith('identifier_')]) # also drop additional targets?\n",
"pure_training_data = pure_data[data['identifier_cohort'] == 0]\n",
"pure_validation_data = pure_data[data['identifier_cohort'] == 1]\n",
"\n",
"completeness = []\n",
"for column in training_data.columns:\n",
" if column == 'identifier_cohort': continue\n",
" completeness.append(training_data[column].count() / len(training_data[column]))\n",
"\n",
"print(sum(completeness) / len(completeness))"
"print(f'[All] {pure_data.count().sum()} (N); {pure_data.count().sum() / pure_data.size} (%)')\n",
"print(f'[Training] {pure_training_data.count().sum()} (N); {pure_training_data.count().sum() / pure_training_data.size} (%)')\n",
"print(f'[Validation] {pure_validation_data.count().sum()} (N); {pure_validation_data.count().sum() / pure_validation_data.size} (%)')"
]
},
{
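The rewritten completeness check above is arithmetically equivalent to the deleted per-column loop: when every column shares the same length, the mean of per-column completeness equals total non-null cells over total cells (count().sum() / size). The new version simply extends that ratio to all three views and drops the identifier_ columns up front. A quick check of the identity on a stand-in frame:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, None, 3, 4], 'b': [None, None, 6, None]})

per_column_mean = (df.count() / len(df)).mean()  # the deleted loop, vectorised
overall_ratio = df.count().sum() / df.size       # the new cell's expression
assert per_column_mean == overall_ratio          # 0.5 == 0.5: same quantity
print(per_column_mean, overall_ratio)
```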
