-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA_iris.py
50 lines (39 loc) · 1.35 KB
/
EDA_iris.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Basic EDA and plots for the Iris dataset (loaded from a local Iris.csv file)
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Move into the folder that holds the data (replace "path" with your actual path)
os.chdir("path")

# Read the Iris data from the CSV file in the current working directory
iris = pd.read_csv('Iris.csv')

# Quick look at the data: leading rows first, then summary statistics
print(iris.head())
print(iris.describe())

# Number of rows recorded for each species
sns.countplot(data=iris, x='Species')
plt.show()

# Sepal length against sepal width, with points coloured by species
sns.scatterplot(
    data=iris,
    x='SepalLengthCm',
    y='SepalWidthCm',
    hue='Species',
)
plt.show()

# Pairwise plots of every column except 'Id', coloured by species
sns.pairplot(iris.drop(columns=['Id']), hue='Species', height=2)
plt.show()

# Leave out 'Id' and 'Species' so they do not enter the correlation matrix
numeric_columns = iris.drop(['Id', 'Species'], axis=1)

# Pearson correlation between the remaining columns
corr_matrix = numeric_columns.corr(method='pearson')
print(corr_matrix)

# Annotated heatmap of the correlation matrix
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
plt.show()

# Sepal width distribution per species, drawn on a new figure
plt.figure()
sns.boxplot(data=iris, x='Species', y='SepalWidthCm')
plt.show()
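# If seaborn is not installed, run one of the following in a terminal
# (these are shell commands, not Python -- do not uncomment them in this file)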
# cd python3 bin
# python -m pip install seaborn
# pip install seaborn
# py -m pip install seaborn