optimized_rnnstack_converter.py
# ==============================================================================
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
import numpy as np
import cntk as C
from .converter import *

def _from_optimized_rnnstack(cudnn_rnn):
    '''
    Converts a cuDNN optimized_rnnstack to non-cuDNN functions so it can run in a non-CUDA environment.

    Args:
        cudnn_rnn: the optimized_rnnstack function that contains the parameters to be converted

    Returns:
        Converted RNN function on a GEMM based implementation that can be used on CPU
    '''
    if cudnn_rnn.root_function.op_name != 'OptimizedRNNStack':
        raise ValueError('unexpected cudnn_rnn.root_function.op_name value "%s"' % cudnn_rnn.root_function.op_name)

    cudnn_param = cudnn_rnn.parameters[0]
    rnn_name = cudnn_rnn.name
    input_var = cudnn_rnn.inputs[0]
    hidden_size = cudnn_rnn.root_function.attributes['hiddenSize']
    num_layers = cudnn_rnn.root_function.attributes['numLayers']
    bidirectional = cudnn_rnn.root_function.attributes['bidirectional']
    recurrent_op = cudnn_rnn.root_function.attributes['recurrentOp']

    if recurrent_op not in ['lstm', 'rnnReLU', 'rnnTanh']:
        raise ValueError('unsupported recurrent_op value "%s"' % recurrent_op)
    # note that cuDNN GRU differs from the standard GRU, so there is no conversion
    # unless a new type of GRU cell is created for CPU

    def _any_inferred(shape):
        return np.any([dim < 0 for dim in shape])

    if _any_inferred(cudnn_param.shape) or _any_inferred(input_var.shape):
        raise ValueError('parameter not initialized yet')

    input_size = input_var.shape[0] if len(input_var.shape) else 1
    num_gates = 1
    rnn_lambda = None
    if recurrent_op == 'lstm':
        num_gates = 4
        if bidirectional:
            rnn_lambda = lambda x, i: C.splice(
                C.layers.Recurrence(C.layers.LSTM(hidden_size, name=rnn_name+'_fw'+i))(x),
                C.layers.Recurrence(C.layers.LSTM(hidden_size, name=rnn_name+'_bw'+i), go_backwards=True)(x))
        else:
            rnn_lambda = lambda x, i: C.layers.Recurrence(C.layers.LSTM(hidden_size, name=rnn_name+'_'+i))(x)
    elif recurrent_op == 'rnnReLU' or recurrent_op == 'rnnTanh':
        num_gates = 1
        activation = C.relu if recurrent_op == 'rnnReLU' else C.tanh
        if bidirectional:
            rnn_lambda = lambda x, i: C.splice(
                C.layers.Recurrence(C.layers.RNNStep(hidden_size, activation=activation, name=rnn_name+'_fw'+i))(x),
                C.layers.Recurrence(C.layers.RNNStep(hidden_size, activation=activation, name=rnn_name+'_bw'+i), go_backwards=True)(x))
        else:
            rnn_lambda = lambda x, i: C.layers.Recurrence(C.layers.RNNStep(hidden_size, activation=activation, name=rnn_name+'_'+i))(x)

    noncudnn_func = rnn_lambda(input_var, '0')
    param = cudnn_param.value.reshape(-1)
    offset = 0
    multiplier = 2 if bidirectional else 1

    def _adjust_gate_order(W):
        # reorder the gate blocks from the cuDNN (i, f, m, o) layout into the
        # (i, m, f, o) layout expected by the CNTK LSTM cell; non-LSTM
        # parameters are passed through unchanged
        if recurrent_op == 'lstm':
            if len(W.shape) == 2:
                i, f, m, o = np.hsplit(W, 4)
                return np.concatenate((i, m, f, o), axis=1)
            elif len(W.shape) == 1:
                i, f, m, o = np.split(W, 4)
                return np.concatenate((i, m, f, o))
            else:
                raise ValueError('LSTM parameter must have 1 or 2 dimensions')
        else:
            return W
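
    # Illustrative sketch (not part of the original module; the numbers are made up):
    # with hidden_size=1 an LSTM bias holds one value per gate, so
    #   _adjust_gate_order(np.array([1., 2., 3., 4.]))  ->  array([1., 3., 2., 4.])
    # i.e. the second and third gate blocks swap places, turning the cuDNN
    # (i, f, m, o) layout into the (i, m, f, o) layout used by the CNTK cell.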

    def _get_cudnn_rnn_weight_splitter(in_dim, h_dim):
        # split points of the flat cuDNN parameter into weight blocks:
        # for unidirectional: W, H
        # for bidirectional: fw_W, fw_H, bw_W, bw_H
        splitter = [in_dim*h_dim*num_gates, h_dim*h_dim*num_gates] * multiplier
        splitter = splitter[0:-1]
        return np.cumsum(splitter)

    def _get_cudnn_rnn_bias_splitter(h_dim):
        # split points of the flat cuDNN parameter into bias blocks:
        # for unidirectional: b1, b2
        # for bidirectional: fw_b1, fw_b2, bw_b1, bw_b2
        splitter = [h_dim*num_gates, h_dim*num_gates] * multiplier
        splitter = splitter[0:-1]
        return np.cumsum(splitter)
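
    # Worked example (illustrative only, with made-up sizes; not part of the
    # original module): for a unidirectional LSTM layer with input_size=2 and
    # hidden_size=3, num_gates=4 and multiplier=1, so
    #   layer_size = (2 + 3) * 3 * 4 * 1 = 60
    #   _get_cudnn_rnn_weight_splitter(2, 3) = cumsum([24]) = [24]
    # i.e. the first 24 values of the layer slice become W (reshaped to 12x2,
    # transposed to 2x12) and the remaining 36 become H (reshaped to 12x3,
    # transposed to 3x12).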

    offset = 0
    layer_input_size = input_size
    # first pass over the flat parameter: copy the input (W) and hidden (H)
    # weights of each layer, stacking a new recurrence on top after each layer
    for layer in range(num_layers):
        layer_size = (layer_input_size + hidden_size) * hidden_size * num_gates * multiplier
        layer_param = param[offset:offset+layer_size]
        layer_name = str(layer)
        if bidirectional:
            fw_Wt, fw_Ht, bw_Wt, bw_Ht = np.split(layer_param, _get_cudnn_rnn_weight_splitter(layer_input_size, hidden_size))
            fw_cell = noncudnn_func.find_by_name(rnn_name+'_fw'+layer_name, -1)
            bw_cell = noncudnn_func.find_by_name(rnn_name+'_bw'+layer_name, -1)
            fw_cell.W.value = np.ascontiguousarray(_adjust_gate_order(fw_Wt.reshape(num_gates*hidden_size, -1).transpose()))
            fw_cell.H.value = np.ascontiguousarray(_adjust_gate_order(fw_Ht.reshape(num_gates*hidden_size, -1).transpose()))
            bw_cell.W.value = np.ascontiguousarray(_adjust_gate_order(bw_Wt.reshape(num_gates*hidden_size, -1).transpose()))
            bw_cell.H.value = np.ascontiguousarray(_adjust_gate_order(bw_Ht.reshape(num_gates*hidden_size, -1).transpose()))
        else:
            Wt, Ht = np.split(layer_param, _get_cudnn_rnn_weight_splitter(layer_input_size, hidden_size))
            cell = noncudnn_func.find_by_name(rnn_name+'_'+layer_name, -1)
            cell.W.value = np.ascontiguousarray(_adjust_gate_order(Wt.reshape(num_gates*hidden_size, -1).transpose()))
            cell.H.value = np.ascontiguousarray(_adjust_gate_order(Ht.reshape(num_gates*hidden_size, -1).transpose()))
        offset += layer_size
        layer_input_size = hidden_size * multiplier
        if layer != num_layers - 1:
            noncudnn_func = rnn_lambda(noncudnn_func.output, str(layer+1))

    # second pass: the biases follow all the weights in the flat parameter;
    # the two bias vectors per direction are summed into the single bias of
    # the CNTK cell
    for layer in range(num_layers):
        layer_size = 2 * hidden_size * num_gates * multiplier
        layer_param = param[offset:offset+layer_size]
        layer_name = str(layer)
        if bidirectional:
            fw_b1, fw_b2, bw_b1, bw_b2 = np.split(layer_param, _get_cudnn_rnn_bias_splitter(hidden_size))
            fw_cell = noncudnn_func.find_by_name(rnn_name+'_fw'+layer_name, -1)
            bw_cell = noncudnn_func.find_by_name(rnn_name+'_bw'+layer_name, -1)
            fw_cell.b.value = _adjust_gate_order(fw_b1 + fw_b2).reshape(-1)
            bw_cell.b.value = _adjust_gate_order(bw_b1 + bw_b2).reshape(-1)
        else:
            b1, b2 = np.split(layer_param, _get_cudnn_rnn_bias_splitter(hidden_size))
            cell = noncudnn_func.find_by_name(rnn_name+'_'+layer_name, -1)
            cell.b.value = _adjust_gate_order(b1 + b2).reshape(-1)
        offset += layer_size

    return noncudnn_func

def convert_optimized_rnnstack(cudnn_model):
    '''
    Converts a model that contains cuDNN optimized_rnnstacks to use non-cuDNN functions,
    so it can be used in a non-CUDA environment.

    Args:
        cudnn_model: a model that contains optimized_rnnstacks

    Returns:
        Converted model on a GEMM based implementation of RNN that can be used on CPU
    '''
    class CuDNNOptimizedRNNConverter:
        def __init__(self, model):
            self.filter = lambda x: type(x) == C.Function and x.root_function.op_name == 'OptimizedRNNStack'
            all_cudnn_rnns = C.logging.graph.depth_first_search(model, self.filter, depth=-1)
            unique_params = set([cudnn_rnn.parameters[0] for cudnn_rnn in all_cudnn_rnns])
            self.map_param_to_func = {p: None for p in unique_params}

        def converter(self, cudnn_rnn):
            param = cudnn_rnn.parameters[0]
            if self.map_param_to_func[param]:
                # the parameter is shared with an already converted optimized_rnnstack:
                # clone that conversion so the weights stay shared
                converted = self.map_param_to_func[param][0].clone(
                    C.CloneMethod.share,
                    {self.map_param_to_func[param][1]: cudnn_rnn.inputs[0],
                     self.map_param_to_func[param][2]: C.placeholder()})
            else:
                # unique or first occurrence of the parameter: convert it
                converted = _from_optimized_rnnstack(cudnn_rnn)
                self.map_param_to_func[param] = (converted, cudnn_rnn.inputs[0], cudnn_rnn.output,)
            return converted

    optimizedRNNConverter = CuDNNOptimizedRNNConverter(cudnn_model)
    return convert(cudnn_model, optimizedRNNConverter.filter, optimizedRNNConverter.converter)
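
# Example usage (a minimal sketch, not part of the original module; the file
# names are placeholders, and the import path assumes this file is installed
# as a submodule of the cntk package, as the relative import above suggests):
#
#     import cntk as C
#     from cntk.misc.optimized_rnnstack_converter import convert_optimized_rnnstack
#
#     cudnn_model = C.Function.load('model_with_optimized_rnnstack.dnn')   # trained on GPU
#     cpu_model = convert_optimized_rnnstack(cudnn_model)                  # GEMM-based RNNs, runnable on CPU
#     cpu_model.save('model_cpu.dnn')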