-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweek1_assi1_sol1.R
159 lines (124 loc) · 4.61 KB
/
week1_assi1_sol1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# # -*- coding: utf-8 -*-
# """Week1 Assi1 Sol1.ipynb
#
# Automatically generated by Colaboratory.
#
# Original file is located at
# https://colab.research.google.com/drive/1EhQuzPFGUbUdQu4sD6W4LRLM7iyKZknn
# """
###########################################################################
## Week-1, Homework-1, Sol-1
## Sreya Dhar
## Created: Feb 09, 2021
## Edited: Feb 13, 2021
###########################################################################
## NOTE: the global environment is deliberately NOT cleared with
## rm(list = ls()) here. Doing so silently deletes the caller's objects when
## this file is source()d, and it does not detach packages or reset options
## anyway. Run the script in a fresh R session when a clean start is needed.

## One-time installation of the libraries in the R kernel (uncomment as needed)
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("hrbrthemes")
# install.packages("rsample")
# install.packages("leaps")
# install.packages("car")
# install.packages("pls")
# install.packages("caret")
# install.packages("glmnet")
# install.packages("ISLR")
# install.packages("pcr")
# install.packages("pls")
## importing the libraries in R kernel
library(ISLR)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidyr)
library(corrplot)
library(ggplot2)
library(reshape2)
library(gplots)
library(ROCR)
library(class)
library(readr)
library(rsample)
library(leaps)
library(car)
library(pls)
library(PerformanceAnalytics)
library(funModeling)
library(caret)
library(glmnet)
library (pls)
## No working directory is set: the analysis uses the `College` data set that
## is made available by library(ISLR), and nothing in this script reads or
## writes files. A hard-coded setwd() to a personal Windows folder would stop
## the script with an error on any machine where that folder does not exist.

## First look at the packaged data: leading rows, column names, column types
## and the per-column status report.
head(College)
names(College)
glimpse(College)
status(College)

## Work on a copy so that the packaged data set itself is never modified.
College_C <- College
glimpse(College_C)
status(College_C)

## Profile of the numeric columns of the working copy.
profiling_num(College_C)
## Convert every factor column to its numeric level code so that the whole
## data frame is numeric, as needed by cor() and the correlation plots below.
## across(where(...)) replaces the superseded mutate_if().
College_n1 <- College_C %>%
  mutate(across(where(is.factor), as.numeric))

## Correlation chart matrix, combined with histograms and scatter plots of the
## different features.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
chart.Correlation(College_n1, histogram = TRUE, pch = 15)

## Pearson correlation matrix drawn as a colour map, with variables ordered by
## hierarchical clustering and text labels suppressed (tl.pos = "n").
res <- cor(College_n1, method = "pearson")
corrplot::corrplot(res, method = "color", order = "hclust", tl.pos = "n")

## Numeric profiling, per-variable plots and a description of the all-numeric
## data frame.
profiling_num(College_n1)
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
plot_num(College_n1)
describe(College_n1)
# (a) Numerical summary of every variable in the data set, via summary().
College_C %>% summary()
College_n1 %>% data.matrix() %>% head()
College_C %>% head()
# (b) Scatterplot matrix of the continuous variables, via pairs().
#     The first column is dropped before plotting.
options(repr.plot.res = 200, repr.plot.height = 10, repr.plot.width = 10)
College_C[-1] %>%
  pairs(main = "Pairwise plot on College Dataset only for continuous variables")
# Separate copy of the data that later steps extend with new columns.
College_C1 <- College_C
# (d) Create a new qualitative variable called Elite by binning the variable
# Top10perc. Universities are divided into two groups based on whether or not
# the proportion of students coming from the top 10% of their high school
# class exceeds 50%. The variable is added to the data set.
#
# Elite is built in one vectorised step as a factor with levels "No" < "Yes",
# rather than by copying the numeric column and overwriting it piecewise
# (which relied on implicit numeric-to-character coercion). Missing Top10perc
# values yield NA instead of a stray number stored as text.
College_C1$Elite <- factor(
  ifelse(College_C1$Top10perc > 50, "Yes", "No"),
  levels = c("No", "Yes")
)
head(College_C1)
# (d) Use the table function to figure out how many Elite schools there are.
tab_1 <- table(Elite = College_C1$Elite)
tab_1
## Bin Top10perc with cut() using exactly the rule "No" for values <= 50 and
## "Yes" for values > 50. Open-ended, right-closed intervals (-Inf, 50] and
## (50, Inf) express that rule directly; a cut point such as 50.01 only works
## for whole-number percentages and would misclassify e.g. 50.005.
breaks <- c(-Inf, 50, Inf)
## interval/bin labels for Elite
tags <- c("No", "Yes")
## bucket the values into the two bins
group_tags <- cut(
  College_C1$Top10perc,
  breaks = breaks,
  right = TRUE,
  labels = tags
)
## inspect the number of schools in each bin
summary(group_tags)
## ordered-factor version of the bins (not used by the plot below)
education_groups <- factor(group_tags, levels = tags, ordered = TRUE)

## Bar chart of the bin counts; each bar is annotated with its share of all
## schools. after_stat(count) replaces the deprecated ..count.. notation.
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
ggplot(data = as_tibble(group_tags), mapping = aes(x = value)) +
  geom_bar(fill = "bisque", color = "white", alpha = 0.9) +
  stat_count(
    geom = "text",
    aes(label = sprintf("%.4f", after_stat(count) / length(group_tags))),
    vjust = -0.5
  ) +
  labs(x = "Elite, %") +
  theme_bw()
# (e) Use the table function to figure out how many of the Elite schools are
# private: cross-tabulation of Elite against Private.
table(Elite = College_C1$Elite, Private = College_C1$Private)
# Do elite schools tend to have higher graduation rates? Ans == Yes
# Jittered points of Grad.Rate per Elite group with a box plot drawn on top.
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
ggplot(data = College_C1, mapping = aes(x = Elite, y = Grad.Rate)) +
  # The colour is a fixed parameter, so it is set outside aes(). Writing
  # aes(color = "blue") maps a constant string through the colour scale: the
  # points are not drawn blue and a legend is created, which then had to be
  # hidden with guides(). Neither problem arises here.
  geom_jitter(color = "blue", alpha = 0.9) +
  geom_boxplot(fill = "bisque", color = "black", alpha = 0.7) +
  labs(x = "Elite", y = "Graduation Rate, %") +
  theme_bw()
## end ##