equi.py
import numpy
import theano
from theano import tensor as T
from theano.sandbox import rng_mrg


class ESGD(object):
    """Equilibrated SGD with the bias correction of Adam.

    Parameters
    ----------
    parameters : list
        List of parameters of the model. Must be theano shared variables.
    gradients : list
        List of the gradients w.r.t. each parameter.
    """

    def __init__(self, parameters, gradients):
        self.parameters = parameters
        self.gradients = gradients
        # Exponential moving average of the gradient.
        self.ema_grad = [theano.shared(numpy.zeros_like(p.get_value()))
                         for p in self.parameters]
        # Exponential moving average of the equilibration matrix (preconditioner).
        self.ema_precond = [theano.shared(numpy.zeros_like(p.get_value()))
                            for p in self.parameters]
        # Step counters for the gradient and preconditioner averages.
        self.t1 = theano.shared(numpy.asarray(0, "float32"))
        self.t2 = theano.shared(numpy.asarray(0, "float32"))
        # Saved damping term, reused by the fast update.
        self.damping = theano.shared(numpy.asarray(0, "float32"))
        self.rng = rng_mrg.MRG_RandomStreams(numpy.random.randint(2**30))

    def updates(self, learning_rate, beta1, beta2, epsilon):
        """Return two update lists.

        The slow list refreshes the equilibration preconditioner and should be
        applied once every X updates. The fast list is cheaper because it
        reuses the saved estimate of the equilibration preconditioner.
        """
        # Update the moving average of the gradient.
        grad = [beta1 * old_g + (1 - beta1) * g
                for old_g, g in zip(self.ema_grad, self.gradients)]
        new_t1 = self.t1 + 1

        # Update the preconditioner: equilibration estimates diag(H**2) from
        # Hessian-vector products H*v with Gaussian v (Lop of the gradients
        # gives v**T * H, i.e. H*v for a symmetric Hessian).
        samples = [self.rng.normal(size=p.shape, avg=0, std=1,
                                   dtype=theano.config.floatX)
                   for p in self.parameters]
        product = theano.gradient.Lop(self.gradients, self.parameters, samples)
        precond = [beta2 * old_precond + (1 - beta2) * (p ** 2)
                   for old_precond, p in zip(self.ema_precond, product)]
        new_t2 = self.t2 + 1
        damping_new = (T.max([d.max() for d in precond])
                       / (1 - beta1 ** new_t1) * epsilon)

        slow_updates = list(zip(self.ema_precond, precond))
        slow_updates.append((self.t2, new_t2))
        slow_updates += list(zip(self.ema_grad, grad))
        slow_updates.append((self.t1, new_t1))
        slow_updates.append((self.damping, damping_new))
        for param, g, precon in zip(self.parameters, self.gradients, precond):
            g_bias_corrected = g / (1 - beta1 ** new_t1)
            precon_bias_corrected = precon / (1 - beta2 ** new_t2)
            update = (-learning_rate * g_bias_corrected
                      / (T.sqrt(precon_bias_corrected) + damping_new))
            slow_updates.append((param, param + update))

        fast_updates = list(zip(self.ema_grad, grad))
        fast_updates.append((self.t1, new_t1))
        for param, g, precon in zip(self.parameters, grad, self.ema_precond):
            g_bias_corrected = g / (1 - beta1 ** new_t1)
            precon_bias_corrected = precon / (1 - beta2 ** new_t2)
            update = (-learning_rate * g_bias_corrected
                      / (T.sqrt(precon_bias_corrected) + self.damping))
            fast_updates.append((param, param + update))

        return slow_updates, fast_updates
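

# Minimal usage sketch for ESGD, assuming a model that exposes a scalar `cost`
# built purely from theano shared variables and a list `params` of shared
# parameters (hypothetical names; adapt the inputs list if the cost depends on
# symbolic inputs). The slow step refreshes the equilibration preconditioner
# and is meant to be called only once in a while; the fast step reuses the
# stored estimate in between.
def compile_esgd_steps(cost, params, learning_rate=1e-3,
                       beta1=0.9, beta2=0.999, epsilon=1e-4):
    grads = T.grad(cost, params)
    optimizer = ESGD(params, grads)
    slow_updates, fast_updates = optimizer.updates(learning_rate, beta1,
                                                   beta2, epsilon)
    slow_step = theano.function([], cost, updates=slow_updates)
    fast_step = theano.function([], cost, updates=fast_updates)
    return slow_step, fast_step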


class Adam(object):
    """Adam optimizer with bias correction.

    Parameters
    ----------
    parameters : list
        List of parameters of the model. Must be theano shared variables.
    gradients : list
        List of the gradients w.r.t. each parameter.
    """

    def __init__(self, parameters, gradients):
        self.parameters = parameters
        self.gradients = gradients
        # Exponential moving average of the gradient (first moment).
        self.ema_grad = [theano.shared(numpy.zeros_like(p.get_value()))
                         for p in self.parameters]
        # Exponential moving average of the squared gradient (second moment,
        # used as the preconditioner).
        self.ema_precond = [theano.shared(numpy.zeros_like(p.get_value()))
                            for p in self.parameters]
        # Step counters for the two moving averages.
        self.t1 = theano.shared(numpy.asarray(0, "float32"))
        self.t2 = theano.shared(numpy.asarray(0, "float32"))
        self.rng = rng_mrg.MRG_RandomStreams(numpy.random.randint(2**30))

    def updates(self, learning_rate, beta1, beta2, epsilon):
        """Return two update lists.

        The slow list refreshes the second-moment preconditioner and should be
        applied once every X updates. The fast list is cheaper because it
        reuses the saved estimate of the preconditioner.
        """
        # Update the moving average of the gradient.
        grad = [beta1 * old_g + (1 - beta1) * g
                for old_g, g in zip(self.ema_grad, self.gradients)]
        new_t1 = self.t1 + 1

        # Update the preconditioner (moving average of the squared gradient).
        precond = [beta2 * old_precond + (1 - beta2) * (p ** 2)
                   for old_precond, p in zip(self.ema_precond, self.gradients)]
        new_t2 = self.t2 + 1

        slow_updates = list(zip(self.ema_precond, precond))
        slow_updates.append((self.t2, new_t2))
        slow_updates += list(zip(self.ema_grad, grad))
        slow_updates.append((self.t1, new_t1))
        for param, g, precon in zip(self.parameters, self.gradients, precond):
            g_bias_corrected = g / (1 - beta1 ** new_t1)
            precon_bias_corrected = precon / (1 - beta2 ** new_t2)
            update = (-learning_rate * g_bias_corrected
                      / (T.sqrt(precon_bias_corrected) + epsilon))
            slow_updates.append((param, param + update))

        fast_updates = list(zip(self.ema_grad, grad))
        fast_updates.append((self.t1, new_t1))
        for param, g, precon in zip(self.parameters, grad, self.ema_precond):
            g_bias_corrected = g / (1 - beta1 ** new_t1)
            precon_bias_corrected = precon / (1 - beta2 ** new_t2)
            update = (-learning_rate * g_bias_corrected
                      / (T.sqrt(precon_bias_corrected) + epsilon))
            fast_updates.append((param, param + update))

        return slow_updates, fast_updates
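

# The same compilation pattern applies to Adam; a short sketch under the same
# assumptions as above (hypothetical `cost` and `params`, with the cost built
# from shared variables so the compiled functions take no symbolic inputs).
def compile_adam_steps(cost, params, learning_rate=1e-3,
                       beta1=0.9, beta2=0.999, epsilon=1e-8):
    grads = T.grad(cost, params)
    optimizer = Adam(params, grads)
    slow_updates, fast_updates = optimizer.updates(learning_rate, beta1,
                                                   beta2, epsilon)
    slow_step = theano.function([], cost, updates=slow_updates)
    fast_step = theano.function([], cost, updates=fast_updates)
    return slow_step, fast_step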