# Preprocessing.py

# -*- coding: utf-8 -*-
"""
Created on Wed Apr  5 10:12:16 2023

@author: premchand
"""

import re

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict


import numpy as np


import pandas as pd

import os, sys, argparse

from matplotlib import pyplot as plt

# Classifier instances that train() fits and evaluates one after another.
models = [LogisticRegression(solver='lbfgs', max_iter=1000),  GaussianNB(),  DecisionTreeClassifier(), KNeighborsClassifier()]

# Path of the CSV file handed to get_data() (a file path, despite the name).
DATA_DIR = "heart.csv"

def get_data(data_dir):
    """Load the dataset and partition its rows by the ``sex`` column.

    Args:
        data_dir: Location of the CSV file; forwarded unchanged to
            ``pandas.read_csv``.

    Returns:
        A 3-tuple ``(frame, males, females)``: the complete DataFrame, the
        rows where ``sex == 1`` and the rows where ``sex == 0``.
    """
    frame = pd.read_csv(data_dir)

    # Build each boolean mask once from the same column and select with it.
    sex = frame.sex
    males = frame.loc[sex.eq(1)]
    females = frame.loc[sex.eq(0)]

    return frame, males, females


def disease_percents(patients):
    """Return the percentage of patients with and without the disease.

    Args:
        patients: DataFrame with a ``target`` column in which ``1`` marks a
            patient counted as "with" and ``0`` one counted as "without".

    Returns:
        ``(with_pct, without_pct)`` -- each count expressed as a percentage
        of ``len(patients)``. An empty frame yields ``(0.0, 0.0)`` instead of
        raising ``ZeroDivisionError``.
    """
    total = len(patients)
    if total == 0:
        # Nothing to take a share of (e.g. a subset that matched no rows).
        return 0.0, 0.0

    target = patients.target
    with_count = int((target == 1).sum())
    without_count = int((target == 0).sum())

    with_pct = (with_count / total) * 100
    without_pct = (without_count / total) * 100

    return with_pct, without_pct

def numb_sex(males, females, total):
    """Return the male and female shares of ``total`` as percentages.

    Args:
        males: Sized collection (e.g. DataFrame) of the male rows.
        females: Sized collection of the female rows.
        total: Sized collection the two shares are measured against.

    Returns:
        ``(male_pct, female_pct)``. When ``total`` is empty both values are
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    size = len(total)
    if size == 0:
        # No population to take a share of.
        return 0.0, 0.0

    male_pct = (len(males) / size) * 100
    female_pct = (len(females) / size) * 100

    return male_pct, female_pct

def create_sets(data, test_size=0.3, random_state=None):
    """Build the hold-out split and the full arrays used for cross-validation.

    Args:
        data: DataFrame containing the feature columns plus a ``target``
            column.
        test_size: Fraction of rows placed in the test split (default 0.3).
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced; ``None`` keeps it random.

    Returns:
        ``(X_train, X_test, y_train, y_test, X, Y)`` where the first two are
        min-max scaled feature arrays, ``y_train``/``y_test`` are the matching
        ``target`` values, and ``X``/``Y`` are the unscaled feature matrix and
        target vector taken from ``data.values``.
    """
    features = data.drop('target', axis=1)
    labels = data.target

    # Split BEFORE scaling: fitting the scaler on all rows would leak the
    # test set's min/max into the training data.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)

    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = scaler.fit_transform(X_train)
    # Test rows reuse the training min/max, so values may fall outside [0, 1].
    X_test = scaler.transform(X_test)

    # Locate the target column by name rather than assuming it is column 13.
    array = data.values
    target_idx = data.columns.get_loc('target')
    Y = array[:, target_idx]
    X = np.delete(array, target_idx, axis=1)

    return X_train, X_test, y_train, y_test, X, Y


def train(x_train, x_test, y_train, y_test, X, Y, models):
    """Fit, evaluate and cross-validate every estimator in ``models``.

    For each model the function prints the model itself, then the confusion
    matrix, accuracy and classification report on the hold-out split, then
    the 10-fold cross-validated predictions and accuracy scores on ``X``/``Y``.

    Args:
        x_train: Training feature matrix.
        x_test: Hold-out feature matrix.
        y_train: Training labels.
        y_test: Hold-out labels.
        X: Full feature matrix used for cross-validation.
        Y: Full label vector used for cross-validation.
        models: Iterable of scikit-learn estimators; each one is fitted in
            place on the training split.

    Returns:
        None. All results are written to standard output.
    """
    # ``random_state`` only takes effect when shuffling; without
    # ``shuffle=True`` current scikit-learn raises ValueError here. The
    # splitter does not depend on the model, so build it once.
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)

    for model in models:
        print('{}'.format(model))
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        print('Confusion Matrix :')
        print(confusion_matrix(y_test, predictions))
        print('Accuracy Score :', accuracy_score(y_test, predictions))
        print('Report : ')
        print(classification_report(y_test, predictions))

        # With a fixed seed both calls below see the same folds.
        print(cross_val_predict(model, X, Y, cv=kfold))

        result = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
        print(result)
        print("Accuracy: %.3f%% (%.3f%%)" % (result.mean() * 100.0, result.std() * 100.0))

def _bar_by_target(data, column, title, xlabel):
    """Draw one bar chart of ``column`` values cross-tabulated with ``target``."""
    pd.crosstab(data[column], data.target).plot(kind="bar")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.xticks(rotation=0)
    # Crosstab columns are ordered target 0, then target 1.
    plt.legend(["Haven't Disease", "Have Disease"])
    plt.ylabel('Frequency of Disease or Not')


def _scatter_by_target(data, x_column, y_column, title, xlabel, ylabel):
    """Show one scatter plot of two columns, coloured by ``target``."""
    # Start a fresh figure so the points never land on a previous chart when
    # the backend's show() leaves earlier figures open.
    plt.figure()
    diseased = data.target == 1
    healthy = data.target == 0
    plt.scatter(x=data.loc[diseased, x_column], y=data.loc[diseased, y_column], c='red')
    plt.scatter(x=data.loc[healthy, x_column], y=data.loc[healthy, y_column], c='green')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.xticks(rotation=0)
    # Legend order follows the draw order above: target 1 first, then 0.
    plt.legend(["Have Disease", "Haven't Disease"])
    plt.ylabel(ylabel)
    plt.show()


def plot(data):
    """Display exploratory charts for the heart-disease dataset.

    Four bar charts (``cp``, ``fbs``, ``exang``, ``slope`` against ``target``)
    are created and shown together, followed by two scatter plots of
    ``thalach`` against ``age`` and against ``chol``.

    Args:
        data: DataFrame with the columns ``cp``, ``fbs``, ``exang``, ``slope``,
            ``age``, ``chol``, ``thalach`` and ``target``.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    bar_charts = (
        ('cp', 'Heart Disease Frequency According To CP', 'CP'),
        ('fbs', 'fbs', 'fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)'),
        ('exang', 'exercise induced angina', 'exercise induced angina (1 = yes; 0 = no)'),
        ('slope', 'slope of the peak exercise ST', 'the slope of the peak exercise ST segment'),
    )
    for column, title, xlabel in bar_charts:
        _bar_by_target(data, column, title, xlabel)
    # One show() for all four bar charts, as each crosstab plot is its own figure.
    plt.show()

    _scatter_by_target(data, 'age', 'thalach', 'thalach', 'age', 'heart rate')

    # x is cholesterol and y is max heart rate; the labels used to be swapped.
    _scatter_by_target(data, 'chol', 'thalach', 'thalach / chol',
                       'chol (mg/dl)', 'max heart rate')


# Script entry point: load the CSV, build the data sets and evaluate every
# classifier in ``models``.
if __name__ == "__main__":
    # ``male`` and ``female`` are returned by get_data() but not used below.
    data, male, female = get_data(DATA_DIR)
    

    # Hold-out split (x_*/y_*) plus the full X/Y arrays for cross-validation.
    x_train, x_test, y_train, y_test, X, Y = create_sets(data)
    train(x_train, x_test, y_train, y_test, X,Y, models)

    # Plotting is currently disabled.
    # plot(data)