forked from britishredcrosssociety/covid-19-vulnerability
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprep deprivation - Scotland.r
132 lines (101 loc) · 5.93 KB
/
prep deprivation - Scotland.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
##
## Bespoke Index of Multiple Deprivation for Scotland
##
## According to the methodology (https://www2.gov.scot/Resource/0050/00504766.pdf):
## "Domain score is ranked to create domain rank. Each domain rank is standardised and transformed to an exponential distribution"
## ... so we'll take the published domain ranks (because they haven't published the transformed domain scores), and do the transformation ourselves
## (Note: using SIMD 2016 methodology and domain weights because they haven't published notes on SIMD 2020 yet)
##
library(tidyverse)
library(httr)
library(readxl)
library(janitor)
library(Hmisc)
# source("load lookup tables.r")
# lad_17_19 = read_csv("data/LAD 2017 to LAD 2019 codes.csv") # lookup of Local Authority codes from 2017 to 2019
##
## helper functions
##
# use the exponential transformation function listed in Welsh IMD's tech report - see Appendix A: https://gov.wales/sites/default/files/statistics-and-research/2020-02/welsh-index-multiple-deprivation-2019-technical-report.pdf
exp_transform = function(x) -23 * log(1 - x * (1 -exp(-100/23)))
scale_rank = function(x) (x - 1) / (nrow(imd_sco_scores) - 1) # normalise so rank is between 0 and 1
###############################################################################
## Create an index of multiple deprivation for income, employment, barriers and environment
##
# download "Scottish Index of Multiple Deprivation 2020: data zones", which conveniently includes domain ranks alongside ward and LA lookups
# source: https://www.gov.scot/publications/scottish-index-of-multiple-deprivation-2020-data-zone-look-up/
GET("https://www.gov.scot/binaries/content/documents/govscot/publications/statistics/2020/01/scottish-index-of-multiple-deprivation-2020-data-zone-look-up-file/documents/scottish-index-of-multiple-deprivation-data-zone-look-up/scottish-index-of-multiple-deprivation-data-zone-look-up/govscot%3Adocument/SIMD_2020_Datazone_lookup_tool.xlsx",
write_disk(tf <- tempfile(fileext = ".xlsx")))
imd_sco_scores = read_excel(tf, sheet = "SIMD 2020 DZ look-up data")
unlink(tf); rm(tf)
# create the mutiple deprivation index, using recommended weightings
# uncomment the commented lines to replicate the full IMD, to check this process works (it does)
imd_sco_covid = imd_sco_scores %>%
select(LSOA11CD = DZ
, Income = SIMD2020_Income_Domain_Rank, Employment = SIMD2020_Employment_Domain_Rank
, Barriers = SIMD2020_Access_Domain_Rank, Housing = SIMD2020_Housing_Domain_Rank
# uncomment this line if you want to recapitulate the full IMD score
# , Education = SIMD2020_Education_Domain_Rank, Health = SIMD2020_Health_Domain_Rank, Crime = SIMD2020_Crime_Domain_Rank
) %>%
# normalise the ranks so they're between 0 and 1
mutate_if(is.numeric, list(scaled = scale_rank)) %>%
# "Each domain rank is standardised and transformed to an exponential distribution" (https://www2.gov.scot/Resource/0050/00504766.pdf)
mutate_at(vars(ends_with("_scaled")), exp_transform) %>%
# use weighting on page 7 of https://www2.gov.scot/Resource/0050/00504822.pdf
mutate(IMD_score = (Income_scaled * 12) + (Employment_scaled * 12) + (Barriers_scaled * 4) + (Housing_scaled * 1)
# uncomment if you want to calculate full IMD score
# + (Education_scaled * 6) + (Health_scaled * 6) + (Crime_scaled * 2)
) %>%
# calculate IMD rank
mutate(IMD_rank = rank(IMD_score)) %>% # need to reverse the scoring of R's ranking algorithm to get the same style of ranking as in IMD; add 1 to make it not zero-based
# calculate IMD decile
mutate(IMD_decile = as.integer(cut2(IMD_rank, g = 10)),
IMD_quintile = as.integer(cut2(IMD_rank, g = 5)))
write_csv(imd_sco_covid, "output/covid-deprivation-scotland-LSOA.csv")
##
## DEBUGGING
##
# # how well does our calculated ranking (if using the full set of domains) recapitulate the original ranking? Pretty well: r = 0.9783251
# cor(imd_sco_scores$SIMD2020_Rank, imd_sco_covid$IMD_rank)
#
# # manually check individual examples
# imd_sco_covid %>%
# filter(IMD_rank == 4000) %>% # change this number
# select(LSOA11CD, IMD_rank) %>%
# left_join(imd_sco_scores %>% select(DZ, SIMD2020_Rank), by = c("LSOA11CD" = "DZ"))
###############################################################################
## Calculate proportion of the most-deprived LSOAs in each ward
##
# make Data Zone to Ward lookup table
lsoa_ward = imd_sco_scores %>%
select(LSOA11CD = DZ, MMWcode)
imd_sco_covid_ward = imd_sco_covid %>%
left_join(lsoa_ward, by = "LSOA11CD") %>%
# label LSOAs by whether they're in top 20% most-deprived then summarise by this label
mutate(IMD_top20 = ifelse(IMD_decile <= 2, "Top20", "Other")) %>%
tabyl(MMWcode, IMD_top20) %>%
# calculate proportion of most deprived LSOAs
mutate(Prop_top20 = Top20 / (Top20 + Other)) %>%
# split into quintiles
mutate(Deprivation_q = as.integer(cut2(Prop_top20, g = 5))) %>%
select(-Other, -Top20)
write_csv(imd_sco_covid_ward, "output/covid-deprivation-scotland-ward.csv")
###############################################################################
## Calculate proportion of the most-deprived LSOAs in each Local Authority
##
# make Data Zone to LA lookup table
lsoa_lad = imd_sco_scores %>%
select(LSOA11CD = DZ, LAD19CD = LAcode)
imd_sco_covid_lad = imd_sco_covid %>%
left_join(lsoa_lad, by = "LSOA11CD") %>%
# left_join(lad_17_19, by = "LAD17CD") %>%
# label LSOAs by whether they're in top 20% most-deprived then summarise by this label
mutate(IMD_top20 = ifelse(IMD_decile <= 2, "Top20", "Other")) %>%
tabyl(LAD19CD, IMD_top20) %>%
# calculate proportion of most deprived LSOAs
mutate(Prop_top20 = Top20 / (Top20 + Other)) %>%
# split into quintiles
mutate(Deprivation_q = as.integer(cut2(Prop_top20, g = 5))) %>%
select(-Other, -Top20)
write_csv(imd_sco_covid_lad, "output/covid-deprivation-scotland-LA.csv")
rm(imd_sco_scores, lsoa_ward, lsoa_lad, exp_transform, scale_rank)