-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRWorkshop_part1_2022.R
274 lines (179 loc) · 8.6 KB
/
RWorkshop_part1_2022.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
################################################################################
############### MRF In Action: Intro to epi analysis in R ###############
################################################################################
# The Basics --------------------------------------------------------------
## part 1: install packages (only need to run once)
install.packages("readxl") #for importing excel files
install.packages("tidyverse") #collection of R packages including ggplot2 (visualizations),
#dplyr (data manipulation), tidyr (tidying data)
install.packages("skimr") #for descriptive stats
## part 2: load packages
library(readxl)
library(tidyverse)
library(skimr)
## part 3: set working directory
setwd("C:/Users/jr3639/OneDrive - Drexel University/Documents/students/RWorkshop2022") #insert your filepath here
## part 4: import dataset
d1 <- read_excel("Rworkshop_dataset.xlsx")
## sidenote: here's how you would export an excel file
install.packages("writexl")
library("writexl")
write_xlsx(d1,"dataexample.xlsx")
# Data Exploration and Visualization ----------------------------------------------------
View(d1)
dim(d1) #gives dimensions of the table
## part 1: summary stats
summary(d1) #shows datatype and other attributes; for numeric variables it also
#displays min, 1st quartile, median, mean, 3rd quartile and max values
#something funny is going on...
class(d1$asd) #"numeric"
d1$asd=factor(d1$asd)
class(d1$asd) #"factor"
#now do this for the other categorical variables too
d1$prenatal_vitamin=factor(d1$prenatal_vitamin)
d1$income=factor(d1$income)
d1$smoking=factor(d1$smoking)
summary(d1) #looks better!
skim(d1) #displays most of the numerical attributes from summary, and also missing
#values, more quantile information and an inline histogram for each variable
## part 2: basic distributions of variables
# make a plot of the distribution of the fruit variable
ggplot(data = d1) +
geom_histogram(aes(x = fruit))
# so what you're doing here is saying:
# 1) I want to make a plot and I want it to use the d1 dataset
# 2) I want to display a histogram of the fruit variable
#notice the warning message that pops up below - this means the data is being
#divided into 30 'bins' to display in the chart and R is prompting you that you
#probably need more
# let's try using 60 bins to display the data instead of the default 30
ggplot(data = d1) +
geom_histogram(aes(x = fruit), bins = 60)
#it looks like a pretty normal distribution
ggplot(data = d1) +
geom_histogram(aes(x = srs), bins = 60)
#close to normally distributed
## part 3: more visualizations
## Barchart - one discrete/categorical variable
ggplot(data=d1) +
geom_bar(mapping=aes(x=asd))
# now let's add color
ggplot(data=d1) +
geom_bar(mapping=aes(x=asd, fill=asd))
#anything in your aes() part of your command is something that is defined by the data
#so obviously your x variables is in your data
#above, I colored the bars by ASD status, so goes inside aes()
#but I wanted to just make the bars blue, that would not go inside aes()
#because it isn't determined by the data
ggplot(data=d1) +
geom_bar(mapping=aes(x=asd), fill="blue")
## Boxplot - one continuous and one categorical variable
ggplot(data=d1) +
geom_boxplot(mapping=aes(x=income,y=srs))
#this plot shows the "minimum" (Q1-1.5*IQR), "maximum" (Q3+1.5*IQR), median, first
#quartile and third quartile
#it would show outliers if we had them!
# let's make it fancy
ggplot(data=d1) +
geom_boxplot(mapping=aes(x=income,y=srs, fill=income), outlier.colour="black", outlier.shape=16, #changing appearance of outliers (which we don't have)
outlier.size=2, notch=TRUE)
#notch = TRUE - notch displays a confidence interval around the median which is
#normally based on the median +/- 1.58*IQR/sqrt(n). Notches are used to compare
#groups; if the notches of two boxes do not overlap, this is a strong evidence
#that the medians differ
## Scatterplot - two continuous variables
ggplot(data=d1) +
geom_point(mapping = aes(x=fruit, y=srs))
# adding color
ggplot(data=d1) +
geom_point(mapping = aes(x=fruit, y=srs, color=income))
## Different Facets
# adding a facet_wrap() lets you break apart the charts by another variable in the dataset
# you can specify nrow or ncol to tell it how many rows or columns to arrange the charts in
# When you call facet_wrap(), you have to add a ~ in front of the variable you're faceting on
ggplot(data=d1) +
geom_point(mapping = aes(x=fruit, y=srs, color=income))+
facet_wrap(~ income, nrow = 2)
## Multiple geoms, one plot
# here's an example of how things can stack on top of each other
ggplot(data = d1) +
geom_point(mapping=aes(x=fruit, y=srs, color=income)) +
geom_smooth(method="lm",mapping=aes(x=fruit, y=srs)) #adding a regression line
#lets make it look better!
ggplot(data = d1) +
geom_point(mapping=aes(x=fruit, y=srs, color=income)) +
geom_smooth(method="lm",mapping=aes(x=fruit, y=srs))+
ylab("SRS Score") + #add y axis title
xlab("Fruit Intake (SPD)") + #add x axis title
scale_y_continuous(breaks = c(0,30,60,90)) + #customize y axis scale
ggtitle("SRS Score by Fruit Intake in EARLI") + #add plot title
theme_bw()+ #removes gray background
theme(plot.title = element_text(hjust = 0.5)) #centers title
# there are a lot of fun things you can do with themes if you want to do some googling
# for example, there's a theme you can use that makes your charts look like charts in The Economist
install.packages("ggthemes")
library(ggthemes)
ggplot(data = d1) +
geom_point(mapping=aes(x=fruit, y=srs, color=income)) +
geom_smooth(method="lm",mapping=aes(x=fruit, y=srs))+
ylab("SRS Score") +
xlab("Fruit Intake (SPD)") +
scale_y_continuous(breaks = c(0,30,60,90)) +
ggtitle("SRS Score by Fruit Intake in EARLI") +
theme_economist()
## Exporting plots
#you can always click "export" in the output window to save a plot, or you can
#automate it with the following code
#first save your plot as an object
p1 <- ggplot(data=d1) +
geom_point(mapping = aes(x=fruit, y=srs))+
geom_smooth(method="lm",mapping=aes(x=fruit, y=srs))
#then export as a jpeg - it will save to your working directory folder
jpeg(filename="srs_fruit_scatter.jpg")
p1
dev.off()
# Data Analysis ----------------------------------------------------
## Part 1: linear regression
#LR is a model that uses a straight line to describe the relationship between
#variables.
# crude - only one independent variable (our primary exposure)
lin_c <- lm(srs ~ fruit, data = d1)
summary(lin_c)
# adjusted - multiple indpendent variables (primary exposure and covariates)
lin_a <- lm(srs ~ fruit + prepreg_bmi + maternal_age, data = d1)
summary(lin_a)
#now adding a categorical covariate to the adjusted model
lin_a2 <- lm(srs ~ fruit + prepreg_bmi + maternal_age + income, data = d1)
summary(lin_a2)
#notice that income is a categorical variable so R created dummy variables for me
#however... R is assuming the the ref for income is 1 but actually most people in
#the sample are in income level 3 (60/154)
d1$income <- relevel(d1$income, ref = "3")
#this tells R that I want level 3 (income 100k or more) to be the referent
lin_a2 <- lm(srs ~ fruit + prepreg_bmi + maternal_age + income, data = d1)
summary(lin_a2)
#it worked!
# Part 2: logistic regression
## crude
d1$asd <- relevel(d1$asd, ref = "0") #setting my reference to 0 just in case
#this isn't the default
log_c <- glm(asd ~ fruit, family="binomial", data=d1)
summary(log_c)
#The logistic regression coefficients give the change in the log odds of the
#outcome for a one unit increase in the predictor variable
## adjusted
log_a <- glm(asd ~ fruit + prepreg_bmi + maternal_age, family="binomial", data=d1)
summary(log_a)
#adding a categorical covariate
log_a2 <- glm(asd ~ fruit + prepreg_bmi + maternal_age + income, family="binomial", data=d1)
summary(log_a2)
## export regression results table to .txt
install.packages("stargazer")
library(stargazer)
x<-stargazer(log_a2, type = "text")
write.table(x, file = "logreg_adjusted.txt", sep = ",", quote = FALSE, row.names = F)
# this creates a .txt file but you can import that into word by:
# 1. Copy and paste the content of the .txt file into Word
# 2. In Word, select the text you just pasted from the .txt file
# 3. Go to Table ??? Convert ??? Convert Text to Table.
# 3. make sure "Commas" is selected under "Separate text at", click OK