-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcca.py
90 lines (62 loc) · 3.33 KB
/
cca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from sklearn.cross_decomposition import CCA
from scipy import stats
import numpy as np
def find_CCA_scaling_vectors(input_array_one, input_array_two):
"""
Calculate CCA (Canonical Correlation Analysis) scaling vectors
Args:
input_array_one: the first input array [T,D]
input_array_two: the second input array [T,D]
Returns:
cca_model: CCA model with scaling vectors for the CCA transformation
"""
# Define CCA model which considers the first CCA coefficient only
cca_model = CCA(n_components=1)
# Fit CCA model to the given data
cca_model.fit(input_array_one, input_array_two)
# Encode the given arrays into 1D space using the CCA linear transform
return cca_model
def calculate_CCA_score(input_array_one, input_array_two, CCA_model = None):
"""
Calculate CCA (Canonical Correlation Analysis) coefficient
Args:
input_array_one: the first input array [T,D]
input_array_two: the second input array [T,D]
CCA_model: CCA model with scaling vectors for the CCA transformation
Returns:
r: Pearson Correlation Coefficient after CCA transformation (scalar)
"""
if CCA_model is None:
# Define CCA model which considers the first CCA coefficient only
CCA_model = CCA(n_components=1, max_iter=2000, scale=False)
# identify non-constant dimensions
std_of_array_one = np.std(input_array_one, axis = 0)
non_contant_dims_in_array_one = np.where(std_of_array_one > 1e-8)[0]
std_of_array_two = np.std(input_array_two, axis=0)
non_contant_dims_in_array_two = np.where(std_of_array_two > 1e-8)[0]
# use only non-constant dimensions
input_array_one = input_array_one[:, non_contant_dims_in_array_one] # * 1e8
input_array_two = input_array_two[:, non_contant_dims_in_array_two] # * 1e8
# Fit CCA model to the given data
CCA_model.fit(input_array_one, input_array_two)
# Encode the given arrays into 1D space using the CCA linear transform
encoding_one, encoding_two = CCA_model.transform(input_array_one, input_array_two)
# Standartize arrays shape: make it np.array of floats, remove any dummy dimensions
encoding_one = np.array(encoding_one, dtype='float64').squeeze()
encoding_two = np.array(encoding_two, dtype='float64').squeeze()
# Calculate Pearson Correlation Coefficient (and p-value, which we don't use)
r, p = stats.pearsonr(encoding_one, encoding_two)
return r
if __name__ == '__main__':
#X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]]
#Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]
#X = [[1, 2], [2, 3], [3, 4], [4, 4], [0., 1.], [1.,0.], [2.,2.], [3.,4.], [1, 2], [2, 3], [3, 4], [4, 4], [0., 1.], [1.,0.], [2.,2.], [3.,4.]]
#Y = [[2.5, 21], [2, 2.5], [2.01, 2], [2.8, 2.01], [0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3], [1, 2], [2, 3], [3, 4], [4, 4], [0., 1.], [1.,0.], [2.,2.], [3.,4.]]
#X = [[1], [2], [3], [4], [1], [2], [3], [4], [1], [2], [3], [4], [1], [2], [3], [4]]
#Y = [[2], [2], [2.01], [2.01], [2], [2], [2.01], [2.01], [2], [2], [2.01], [2.01], [2], [2], [2.01], [2.01]]
n = 10000
X = np.random.randint(1,11,(n,42))
Y = np.random.randint(0,15,(n,42)) + X*0.005 + 7
CCA_model = find_CCA_scaling_vectors(X,Y)
corr = calculate_CCA_score(X,Y)
print("CCA is : ", corr)