-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrans_cluster.R
157 lines (120 loc) · 4.94 KB
/
trans_cluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
library(sp)
library(rgdal)
library(data.table)
library(dplyr)
# load function to calculate euclidean distance between consecutive images
source("/Users/tadzio/Documents/UQProject/scripts/eudist.R")
#split dataset per transect
sdata = split(cdata, as.factor(cdata$trans))
# max distance interval within cluster
d = 70
clust.seq = lapply(sdata, int = d, function(x, int){
# assign input of lapply to new var
xy = x
# project coordinates to UTM
coordinates(xy) = c("lon", "lat")
proj4string(xy) = CRS("+proj=longlat +datum=WGS84")
xy = spTransform(xy, CRS("+proj=utm +zone=55 +datum=WGS84"))
# sequential clustering code
#--------------------------------------------
# calculate cumulative distance
# x is transformed to UTMs within eudist function
cumdist = cumsum(eudist(x)[,2])
# cut transect with specified intervals
sc = cut(cumdist, seq(0, max(cumdist), d))
# assign first image to cluster 1
# cl only contains factors for 2nd image up to the last since distance is calculated between consequetive images
sc = c(1, sc@.Data)
# split transect according to cluster (=sc)
subtrans = split(xy, as.factor(sc))
# calcualte values per cluster
subtrans.dat = lapply(subtrans, function(y){
# calculate max distance between images and number of images + quads per cluster
dmax = max(dist(y@coords))
no.img = length(y$image)
no.quad = sum(y$no.quad)
# define output
list(image = y$image,
no.img = no.img,
no.quad = no.quad,
dmax = dmax)
})
})
clust.hrc = lapply(sdata, int =d, function(x, int){
# assign input of lapply to new var
xy = x
# project coordinates to UTM
coordinates(xy) = c("lon", "lat")
proj4string(xy) = CRS("+proj=longlat +datum=WGS84")
xy = spTransform(xy, CRS("+proj=utm +zone=55 +datum=WGS84"))
# hierarchical clustering code
# -----------------------------------------
# use complete-linkage hierarchical clustering
hc = hclust(dist(coordinates(xy)), method = "complete")
# cut tree at specified distance
hc.d = cutree(hc, h=d)
# split transect according to hierarchical clusters (=hc.d)
subtrans2 = split(xy, as.factor(hc.d))
subtrans2.dat = lapply(subtrans2, function(w){
# calculate max distance between images and number of images + quads per cluster
dmax = max(dist(w@coords))
no.img = length(w$image)
no.quad = sum(w$no.quad)
# define output
list(image = w$image,
no.img = no.img,
no.quad = no.quad,
dmax = dmax)
})
})
# get cluster-id names
n = names(unlist(clust.seq, recursive = FALSE))
n2 = names(unlist(clust.hrc, recursive = FALSE))
# unlist and create list of dataframe
# for sequential clustering
clust.seq = lapply(unlist(clust.seq, recursive = FALSE), function(z){
as.data.frame(z)
})
# for hierarchical clustering
clust.hrc = lapply(unlist(clust.hrc, recursive = FALSE), function(z){
as.data.frame(z)
})
# bind list with id (=n, n2)
clust.seq = rbindlist(Map(cbind, id = as.factor(n), clust.seq))
clust.hrc = rbindlist(Map(cbind, id = as.factor(n2), clust.hrc))
# aggregate data
ag.seq = aggregate(cbind(no.quad, no.img, dmax) ~ id, data = clust.seq, mean, na.rm = TRUE)
ag.hrc = aggregate(cbind(no.quad, no.img, dmax) ~ id, data = clust.hrc, mean, na.rm = TRUE)
# show histograms
par(mfrow=c(2,2))
hist(ag.seq$no.quad, xlim = c(0,100), ylim = c(0,700), breaks=seq(0,500,5))
hist(ag.seq$dmax, ylim = c(0,1000))
hist(ag.hrc$no.quad, xlim = c(0,100), ylim = c(0,700), breaks=seq(0,500,5))
hist(ag.hrc$dmax, ylim = c(0,1000))
# filter
# specify the maximum distance between two points in the cluster, below which clusters should be removed from analysis
dmin = 35
# specify minimum number of quadrants within cluster
qmin = 30
# select images of clusters that meet requirements
clust.sel = clust.seq[clust.seq$no.quad > qmin & clust.seq$dmax > dmin,]
# create species matrix and match with cluster id
# first create list of variables to be extracted from dataset (cdata)
group = c("ACR.BRA", "ACR.HIP", "ACR.OTH", "ACR.PE",
"ACR.TCD", "ALC.SF", "CCA", "DSUB",
"FAV.MUS", "GORG", "MALG", "OTH.HC",
"OTH.SF", "OTH.SINV", "POCI",
"POR.BRA", "POR.ENC", "POR.MASS",
"Sand", "Turf", "Turfsa")
# assign id to species matrix (obtained from cdata) by merging clust.sel and cdata df's
sp = as.data.frame(merge(clust.sel, cdata, by = "image"))[,c("id", group)]
# match and bind cluster-id to environmental vars
# same procedure as above
env.m = as.data.frame(merge(clust.sel, env, by = "image"))[,c("id", colnames(env))]
# aggregate data according to cluster ID
# using mean to summarize data within clusters
sp.clust = aggregate(. ~ id, data = sp, FUN = mean)
# need to specify action to be taken when encountering NAs in data
# na.pass passes data unchanged when encountering NAs
env.clust = aggregate(. ~ id, data = env.m, FUN = mean, na.rm = T, na.action = na.pass)
env.clust$morph = as.factor(env.clust$morph)