synthetic-data/np_wgan_bds.py
# -*- coding: utf-8 -*-
"""
Compute the bounds on the outputs of nodes in the critic of a WGAN, and the
bounds on the "main" gradients of the layers, the weights, and the biases.
Works with tensorflow version 1.15 and numpy version 1.18
For the gradient penalty term of a WGAN-GP, the bounds that it computes are on
the NORM of the gradient of the weights (for each layer).
It also has code to actually compute the gradients (instead of bounds), based
on specific input values.
Assumes the critic is fully connected, and that every layer has either ReLU or
Leaky ReLU activation function or no activation function.
Assumes that the output is a scalar with no activation function.
Contains the following functions:
    forwardPropLayerBound : uses forward prop to compute the bounds on the
        output of the next layer of the critic.
    backPropLayerGradBound : uses backprop to compute the bounds on the grad
        of a layer of the critic.
    backPropBiasGradBound : uses backprop to compute the bounds on the grad
        of the bias of a layer of the critic.
    backPropWtsGradBound : uses backprop to compute the bounds on the grad
        of the weights of a layer of the critic.
    computeBounds : uses the four functions above to compute all of the bounds
        for all of the layers of the critic.
    forwardPropLayer : uses forward prop to compute the values of the
        output of the next layer of the critic.
    backPropLayerGrad : uses backprop to compute the gradient
        of a layer of the critic.
    backPropBiasGrad : uses backprop to compute the gradient
        of the bias of a layer of the critic.
    backPropWtsGrad : uses backprop to compute the gradient
        of the weights of a layer of the critic.
    computeGrads : uses the four functions above to compute the gradients
        for a WGAN-GP (optionally a traditional WGAN)
    practice_data : Generates practice data to test the computed gradients and
        bounds.
    setup : Setup for unit test. Calls other functions to compute the gradients
        and bounds on the practice data.
    test : Tests the gradients and bounds in the dictionary returned by setup()
        against the gradients computed by TensorFlow.
@author: Joseph Pedersen
         1         2         3         4         5         6         7
1234567890123456789012345678901234567890123456789012345678901234567890123456789
"""
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

print('tf', tf.__version__)

printon = False

#######################################
# Compute the bounds on the gradients
#######################################
def forwardPropLayerBound(inbound, weights, bias, alpha=0):
    """Computes the upper and lower bounds on the nodes of a layer of a network
    based on: the bounds of the previous layer, the weights and bias, and
    the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    inbound : (N,2) ndarray; N is the number of nodes in the input layer
        Each row represents the min & max of that node
    weights : (N,M) ndarray; M is the number of nodes in the next layer
    bias : (M,) ndarray
    alpha : A float. 0 <= alpha <= 1. The default is 0, which represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    outbound : (M,2) ndarray, each row represents the min & max of that node
    """
    outMin = (
        np.sum(
            weights*
            np.where(weights>=0,
                     inbound[:,0,None],
                     inbound[:,1,None]),
            axis=0
            )
        ) + bias
    outMax = (
        np.sum(
            weights*
            np.where(weights<=0,
                     inbound[:,0,None],
                     inbound[:,1,None]),
            axis=0
            )
        ) + bias
    # First column are mins, second column are maxes
    outbound = np.concatenate((outMin[:,None], outMax[:,None]), axis=1)
    # If applicable, apply (potentially Leaky) ReLU
    if alpha != 1:
        outbound = np.where(outbound > 0,
                            outbound,
                            alpha*outbound)
    return outbound
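
# Hedged usage sketch (invented toy numbers, not part of the author's test
# suite below): propagate interval bounds through one ReLU layer with two
# inputs and two outputs.
def _example_forwardPropLayerBound():
    demo_in = np.array([[0.0, 1.0],
                        [-1.0, 1.0]])          # per-node [min, max]
    demo_w = np.array([[1.0, -1.0],
                       [2.0, 0.5]])            # (N=2, M=2) weight matrix
    demo_b = np.zeros(2)
    # With alpha=0 (ReLU) this returns [[0., 3.], [0., 0.5]].
    return forwardPropLayerBound(demo_in, demo_w, demo_b, alpha=0)
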
def backPropLayerGradBound(biasGradBound, weights):
    """Computes the upper and lower bounds on the gradient of a layer of a
    network based on: the bounds on the gradient of the bias of the next
    layer, and the weights connecting the layer to the next layer
    Requires: numpy imported as np
    Parameters
    ----------
    biasGradBound : (M,2) ndarray; Each row represents the min & max of the
        gradient for the bias of the next layer.
    weights : (N,M) ndarray; weight matrix connecting this layer to the next.
    Returns
    -------
    layerGradBound : (N,2) ndarray; each row represents the min & max of the
        gradient of this layer.
    """
    lbound = biasGradBound[None,:,0]
    ubound = biasGradBound[None,:,1]
    layerGradMin = (
        np.sum(
            weights*
            np.where(weights>=0,
                     lbound,
                     ubound
                     ),
            axis=1
            )
        )
    layerGradMax = (
        np.sum(
            weights*
            np.where(weights<=0,
                     lbound,
                     ubound
                     ),
            axis=1
            )
        )
    # First column are mins, second column are maxes
    layerGradBound = (
        np.concatenate((layerGradMin[:,None],layerGradMax[:,None]), axis=1)
        )
    return layerGradBound
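
# Hedged sketch (invented numbers): bound the gradient of a 2-node layer given
# the bias-gradient bounds of a 1-node next layer and the connecting weights.
def _example_backPropLayerGradBound():
    demo_bias_g = np.array([[-1.1, -0.9]])     # (M=1, 2) bias-grad bounds
    demo_w = np.array([[2.0],
                       [-3.0]])                # (N=2, M=1) weights
    # Returns approximately [[-2.2, -1.8], [2.7, 3.3]].
    return backPropLayerGradBound(demo_bias_g, demo_w)
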
def backPropBiasGradBound(Lbound, Lgradbound, alpha):
    """Computes the upper and lower bounds on the gradient of the bias of a
    layer of a network based on: the bounds on the output of the layer and
    its gradient, and the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    Lbound : (M,2) ndarray; M is the number of nodes in the layer
        Each row represents the min & max of that node.
    Lgradbound : (M,2) ndarray; Each row represents the min & max of the
        gradient for the layer.
    alpha : A float. 0 <= alpha <= 1. If alpha = 0, that represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    biasGradBound : (M,2) ndarray; each row represents the min & max of the
        gradient of the bias.
    reluPrimeBounds : (M,2) ndarray; the bounds on the derivative of the
        activation function at each node.
    """
    if alpha==1:  # if no activation function, the bound is just Lgradbound
        return Lgradbound, np.ones(shape=Lbound.shape)
    reluPrimeBounds = np.where(Lbound > 0, 1, alpha)
    biasGradBounds = Lgradbound[:,:,None]*(reluPrimeBounds[:,None,:])
    biasGradMin = np.min(biasGradBounds, axis=(1,2))
    biasGradMax = np.max(biasGradBounds, axis=(1,2))
    # First column are mins, second column are maxes
    biasGradBound = (
        np.concatenate((biasGradMin[:,None],biasGradMax[:,None]), axis=1)
        )
    return biasGradBound, reluPrimeBounds
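
# Hedged sketch (invented numbers): bound the bias gradient of a Leaky-ReLU
# layer whose pre-activation sign is uncertain for the first node.
def _example_backPropBiasGradBound():
    demo_layer_bd = np.array([[-0.5, 0.5],
                              [1.0, 2.0]])     # node bounds (first spans zero)
    demo_layer_g = np.array([[-1.1, -0.9],
                             [-1.1, -0.9]])    # layer-grad bounds
    # With alpha=0.2: first node bound is about [-1.1, -0.18],
    # second node bound is [-1.1, -0.9].
    bd, relu_prime = backPropBiasGradBound(demo_layer_bd, demo_layer_g, 0.2)
    return bd, relu_prime
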
def backPropWtsGradBound(Lbound, biasGradbound):
    """Computes the upper and lower bounds on the gradient of the weights of a
    layer of a network based on: the bounds on the output of the nodes of
    the previous layer, and the bounds on the gradient of the bias for the
    same layer
    Requires: numpy imported as np
    Parameters
    ----------
    Lbound : (N,2) ndarray; N is the number of nodes in the previous layer
        Each row represents the min & max of that node.
    biasGradbound : (M,2) ndarray; Each row represents the min & max of the
        gradient for the bias of this layer.
    Returns
    -------
    wtsGradBound : (N,M,2) ndarray; each row represents the min & max of the
        gradient of the weights.
    """
    wBounds = Lbound[:,None,:,None]*biasGradbound[None,:,None,:]
    wtsGradMin = (
        np.min(
            wBounds,
            axis=(2,3)
            )
        )
    wtsGradMax = (
        np.max(
            wBounds,
            axis=(2,3)
            )
        )
    # First column are mins, second column are maxes
    wtsGradBound = (
        np.concatenate((wtsGradMin[:,:,None],wtsGradMax[:,:,None]), axis=2)
        )
    return wtsGradBound
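
# Hedged sketch (invented numbers): bound the weight gradients from the bounds
# on the previous layer's outputs and on this layer's bias gradient.
def _example_backPropWtsGradBound():
    demo_prev_bd = np.array([[0.0, 2.0],
                             [-1.0, 1.0]])     # previous-layer node bounds
    demo_bias_g = np.array([[-1.1, -0.9]])     # bias-grad bounds for 1 node
    # Returns a (2, 1, 2) array, approximately [[[-2.2, 0.]], [[-1.1, 1.1]]].
    return backPropWtsGradBound(demo_prev_bd, demo_bias_g)
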
def computeBounds(inputBound, weights, bias, alphas,
                  outputGrad=None, compGP_g=True):
    """Computes the bounds on the values of the nodes of the discriminator of a
    GAN, and the bounds on the gradients of: the layers, the weights, and the
    biases. To compute this, it needs the bounds on the input, and the weights,
    biases, and activation function (alpha) of each layer of the discriminator.
    Parameters
    ----------
    inputBound : (N,2) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row represents the min & max of that node.
    weights : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a tuple of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alphas : a tuple of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    outputGrad : (optional) A tuple of bounds on the gradient of the loss with
        respect to the output, e.g. (-1.1, -0.9). The gradient should be -1 for
        the loss function of the improved WGAN without the gradient penalty
        term, but the bounds need to account for floating point arithmetic.
        If None, the default, then the bounds are set to (-1.0001, -0.9999).
    compGP_g : bool. If True, the default, then compute the bounds on the norm
        of the gradient of the gradient penalty term.
    Requires:
        1) all tuples should have the same length, L+1, where L is the number
           of hidden layers in the discriminator. Layer (L+1) is the output
        2) Array dimensions should match appropriately, as described above.
        3) the output layer should be a scalar, with no activation function,
           i.e. weights[L].shape[1]=1, bias[L].shape[0]=1, and alphas[L]=1,
           where index [L] refers to layer L+1, due to zero-indexing
    Returns
    -------
    bd_layer_out : a tuple of ndarrays, each of shape (N_{i+1}, 2)
        item i is the bounds on the (i+1)th layer of the network, which has
        N_{i+1} nodes. The [r,0] entry of array i is the min of node r in layer
        (i+1). The [r,1] entry is the corresponding max.
    bd_layer_g : a tuple of L+2 ndarrays, each of shape (N_i, 2)
        item i is the bounds on the gradient with respect to layer i
        (item 0 corresponds to the input layer).
        The [j,0] entry of array i is the min of the gradient of node j of
        layer i. The [j,1] entry is the corresponding max.
    bd_bias_g : a tuple of ndarrays, each of shape (N_{i+1}, 2)
        item i is the bounds on the gradient of the bias for layer (i+1).
        The [j,0] entry of array i is the min of the gradient of the bias for
        node j of layer (i+1). The [j,1] entry is the corresponding max.
    bd_wts_g : a tuple of ndarrays, each of shape (N_i, N_{i+1}, 2)
        item i is the bounds on the gradient of the weight matrix connecting
        layer i to layer (i+1). The [j,k,0] entry of array i is the min of the
        gradient of the weight connecting node j of layer i to node k of layer
        (i+1). The [j,k,1] entry of array i is the corresponding max.
    bd_wts_GP_gNORM : a tuple of floats. The bound on the norm of the gradient
        of the gradient penalty term, for each layer of the critic.
        Only returned if compGP_g is True.
    """
    numL = len(weights)  # number of layers, counting output, but not input
    if len(bias) != numL:
        raise ValueError("Numbers of weight matrices and biases do not match")
    if len(alphas) != numL:
        raise ValueError("Numbers of weight matrices and alphas do not match")
    if outputGrad is not None:
        if len(outputGrad) != 2:
            raise ValueError("outputGrad should have a length of 2")
        if (type(outputGrad[0])!=int and type(outputGrad[0])!=float):
            raise ValueError("outputGrad[0] should be int or float")
        if (type(outputGrad[1])!=int and type(outputGrad[1])!=float):
            raise ValueError("outputGrad[1] should be int or float")
    # Dimensions of weight matrices and biases
    wtDims = [w.shape for w in weights]
    biasDims = [b.shape for b in bias]
    if wtDims[0][0] != inputBound.shape[0]:
        msg = (f"First weight matrix dimension {wtDims[0][0]} "
               + f"does not match input shape {inputBound.shape[0]} ")
        raise ValueError(msg)
    for i in range(1, numL):
        if wtDims[i][0] != wtDims[i-1][1]:
            msg = f"weight matrix inner dimension mismatch at i={i}"
            raise ValueError(msg)
    for i in range(numL):
        if wtDims[i][1] != biasDims[i][0]:
            msg = f"bias dimension mismatch at i={i}"
            raise ValueError(msg)
    # Make a list containing the bounds on the nodes in the input layer
    bd_layer_out = [inputBound]
    # Forward prop to compute the bounds on the nodes in all other layers
    for i in range(0, numL):
        bd_layer_out.append(
            forwardPropLayerBound(inbound=bd_layer_out[i],
                                  weights=weights[i],
                                  bias=bias[i],
                                  alpha=alphas[i])
            )
    # Convert to tuple
    bd_layer_out = tuple(bd_layer_out)
    # Initialize lists of length numL, for bounds on gradients
    bd_layer_g = [None] * (numL+1)
    bd_bias_g = [None] * numL
    bd_wts_g = [None] * numL
    bd_relu_g = [None] * numL
    # Bound on grad for output layer: assumes (partial L / partial Y) = -1
    if outputGrad is None:
        bd_layer_g[numL] = np.array([[-1.0001,-0.9999]])
    else:
        bd_layer_g[numL] = np.array([[outputGrad[0],outputGrad[1]]])
    # Conduct backprop to compute the bounds on the gradients
    for i in range(numL, 0, -1):
        # grad of the bias for layer i
        bd_bias_g[i-1], bd_relu_g[i-1] = backPropBiasGradBound(bd_layer_out[i],
                                                               bd_layer_g[i],
                                                               alphas[i-1])
        # grad of weights connecting layer (i-1) to layer i
        bd_wts_g[i-1] = backPropWtsGradBound(bd_layer_out[i-1], bd_bias_g[i-1])
        # gradient of layer (i-1)
        bd_layer_g[i-1] = backPropLayerGradBound(bd_bias_g[i-1], weights[i-1])
    # Make them tuples
    bd_layer_g = tuple(bd_layer_g)
    bd_bias_g = tuple(bd_bias_g)
    bd_relu_g = tuple(bd_relu_g)
    bd_wts_g = tuple(bd_wts_g)
    if compGP_g:
        # Compute bound on NORM of gradient of gradient penalty ||grad of GP||
        weightMatrixNorms = [None] * (numL)
        reluPrimeNorms = [None] * (numL-1)
        bd_wts_GP_gNORM = [None] * (numL)
        gradDnorm_max = np.linalg.norm(np.max(np.abs(bd_layer_g[0]), axis=1))
        multiplier = 2.0 * np.maximum(gradDnorm_max - 1, 1)  # without lamda
        for i, mat in enumerate(weights):
            weightMatrixNorms[i] = np.linalg.norm(mat)  # Frobenius norm
        for i in range(numL-1):
            reluPrimeNorms[i] = np.linalg.norm(bd_relu_g[i][:,1])  # Euclidean norm
        productBound = (np.product(weightMatrixNorms)
                        * np.product(reluPrimeNorms)
                        * multiplier)
        for i in range(numL):
            bd_wts_GP_gNORM[i] = productBound/weightMatrixNorms[i]
        bd_wts_GP_gNORM = tuple(bd_wts_GP_gNORM)
        return bd_layer_out, bd_layer_g, bd_bias_g, bd_wts_g, bd_wts_GP_gNORM
    else:
        return bd_layer_out, bd_layer_g, bd_bias_g, bd_wts_g
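
# Hedged end-to-end sketch (invented toy network, not the practice data used
# below): bound every quantity for a critic with one Leaky-ReLU hidden layer
# (2 inputs -> 2 hidden -> 1 linear output).
def _example_computeBounds():
    demo_ib = np.array([[0.9, 1.1],
                        [-0.1, 0.1]])          # input bounds
    demo_wts = (np.array([[1.0, -1.0],
                          [0.5, 2.0]]),
                np.array([[1.0],
                          [-1.0]]))
    demo_bias = (np.zeros(2), np.zeros(1))
    demo_alphas = (0.2, 1)
    return computeBounds(demo_ib, demo_wts, demo_bias, demo_alphas,
                         compGP_g=True)
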
###############################################################################
#
# The next section has the functions that compute the actual gradients
# (instead of bounds) for specific values of inpt
#
###############################################################################

#######################################
# Compute the gradients
#######################################

def forwardPropLayer(inpt, weights, bias, alpha=0):
    """Computes the values of the nodes of a layer of a network
    based on: the previous layer, the weights and bias, and
    the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    inpt : (N,1) ndarray; N is the number of nodes in the input layer
        Each row represents the value of that node
    weights : (N,M) ndarray; M is the number of nodes in the next layer
    bias : (M,) ndarray
    alpha : A float. 0 <= alpha <= 1. The default is 0, which represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    output : (M,1) ndarray, each row represents the value of that node
    """
    output = weights.transpose() @ inpt + bias[:,None]
    # If applicable, apply (potentially Leaky) ReLU
    if alpha != 1:
        output = np.where(output > 0,
                          output,
                          alpha*output)
    return output
def backPropLayerGrad(biasGrad, weights):
    """Computes the gradient of a layer of a
    network based on: the gradient of the bias of the next
    layer, and the weights connecting the layer to the next layer
    Requires: numpy imported as np
    Parameters
    ----------
    biasGrad : (M,1) ndarray; Each row represents the value of the
        gradient for the bias of the next layer.
    weights : (N,M) ndarray; weight matrix connecting this layer to the next.
    Returns
    -------
    layerGrad : (N,1) ndarray; each row represents the value of the
        gradient of this layer.
    """
    layerGrad = weights @ biasGrad
    return layerGrad

def backPropBiasGrad(Lvals, Lgrad, alpha):
    """Computes the gradient of the bias of a
    layer of a network based on: the output of the layer and
    its gradient, and the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    Lvals : (M,1) ndarray; M is the number of nodes in the layer
        Each row represents the value of that node.
    Lgrad : (M,1) ndarray; Each row represents the value of the
        gradient for the layer.
    alpha : A float. 0 <= alpha <= 1. If alpha = 0, that represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    biasGrad : (M,1) ndarray; each row represents the value of the
        gradient of the bias.
    reluPrime : (M,1) ndarray; the derivative of the activation function at
        each node.
    """
    if alpha==1:  # if no activation function, the gradient is just Lgrad
        return Lgrad, np.ones(Lvals.shape)
    reluPrime = np.where(Lvals > 0, 1, alpha)
    biasGrad = Lgrad * reluPrime
    return biasGrad, reluPrime

def backPropWtsGrad(Lvals, biasGrad):
    """Computes the gradient of the weights of a layer of a network based on:
    the output of the nodes of the previous layer, and the
    gradient of the bias for the same layer
    Requires: numpy imported as np
    Parameters
    ----------
    Lvals : (N,1) ndarray; N is the number of nodes in the previous layer
        Each row represents the value of that node.
    biasGrad : (M,1) ndarray; the gradient for the bias of this layer.
    Returns
    -------
    wtsGrad : (N,M) ndarray; the gradient of the weights.
    """
    wtsGrad = Lvals @ biasGrad.transpose()
    return wtsGrad
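
# Hedged sketch (invented numbers): for a single linear output layer, the
# weight gradient of the loss term -D(x) is the outer product of the inputs
# with the scalar bias gradient, i.e. -x, which backPropWtsGrad reproduces.
def _example_backPropWtsGrad():
    demo_x = np.array([[2.0],
                       [-3.0]])                # previous-layer values (N=2, 1)
    demo_bias_g = np.array([[-1.0]])           # bias grad of the output (M=1)
    # Returns [[-2.], [3.]], i.e. -demo_x.
    return backPropWtsGrad(demo_x, demo_bias_g)
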
def computeGrads(inpt, weights, bias, alphas, compGP_g=True):
    """Computes the values of the nodes of the discriminator of a WGAN, and
    the gradients of: the layers, the weights, and the biases. To compute this,
    it needs the input, the weights, the biases, and the activation functions
    (alpha) of each layer of the discriminator.
    Parameters
    ----------
    inpt : (N,1) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row represents the value of that node.
    weights : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a tuple of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alphas : a tuple of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    compGP_g : bool. If True, the default, then compute the gradient of the
        gradient penalty term.
    Requires:
        1) all tuples should have the same length, L+1, where L is the number
           of hidden layers in the discriminator. Layer (L+1) is the output
        2) Array dimensions should match appropriately, as described above.
        3) the output layer should be a scalar, with no activation function,
           i.e. weights[L].shape[1]=1, bias[L].shape[0]=1, and alphas[L]=1,
           where [L] represents layer L+1, due to zero-indexing
    Returns
    -------
    layer_out : a tuple of ndarrays, each of shape (N_{i+1}, 1)
        item i is the values of the (i+1)th layer of the network, which has
        N_{i+1} nodes. The [r,0] entry of array i is the value of node r.
    layer_g : a tuple of L+2 ndarrays, each of shape (N_i, 1)
        item i is the gradient with respect to layer i
        (item 0 corresponds to the input layer).
    bias_g : a tuple of ndarrays, each of shape (N_{i+1}, 1)
        item i is the gradient of the bias for layer (i+1).
    wts_g : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the gradient of the weight matrix connecting
        layer i to layer (i+1), for the loss function without the GP term.
    wts_GP_g : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the gradient of the weight matrix connecting
        layer i to layer (i+1), for the GP term of the loss function.
        Only returned if compGP_g is True.
    """
    numL = len(weights)  # number of layers, counting output, but not input
                         # = L + 1, where L is the number of hidden layers
    if len(bias) != numL:
        raise ValueError("Numbers of weight matrices and biases do not match")
    if len(alphas) != numL:
        raise ValueError("Numbers of weight matrices and alphas do not match")
    # Dimensions of weight matrices and biases
    wtDims = [w.shape for w in weights]
    biasDims = [b.shape for b in bias]
    if wtDims[0][0] != inpt.shape[0]:
        raise ValueError(f"First weight matrix dimension {wtDims[0][0]} "
                         + f"does not match input {inpt.shape[0]}")
    for i in range(1, numL):
        if wtDims[i][0] != wtDims[i-1][1]:
            msg = f"weight matrix inner dimension mismatch at i={i}"
            raise ValueError(msg)
    for i in range(numL):
        if wtDims[i][1] != biasDims[i][0]:
            msg = f"bias dimension mismatch at i={i}"
            raise ValueError(msg)
    # Make a list containing the values of input nodes
    layer_out = [inpt]
    # Forward prop to compute the values of the nodes in all other layers
    for i in range(0, numL):
        layer_out.append(
            forwardPropLayer(inpt=layer_out[i],
                             weights=weights[i],
                             bias=bias[i],
                             alpha=alphas[i])
            )
    # Convert to tuple
    layer_out = tuple(layer_out)
    # Initialize lists of length numL, for values of gradients
    layer_g = [None] * (numL+1)
    bias_g = [None] * numL
    relu_g = [None] * numL
    wts_g = [None] * numL
    # value of grad (of real term) for the output layer:
    layer_g[numL] = np.array([[-1.00000]], dtype=np.float32)
    # Conduct backprop to compute the gradients
    for i in range(numL, 0, -1):
        # grad of the bias for layer i
        bias_g[i-1], relu_g[i-1] = backPropBiasGrad(layer_out[i],
                                                    layer_g[i],
                                                    alphas[i-1])
        # grad of weights connecting layer (i-1) to layer i
        wts_g[i-1] = backPropWtsGrad(layer_out[i-1], bias_g[i-1])
        # gradient of layer (i-1)
        layer_g[i-1] = backPropLayerGrad(bias_g[i-1], weights[i-1])
    # Make them tuples
    layer_g = tuple(layer_g)
    bias_g = tuple(bias_g)
    relu_g = tuple(relu_g)
    wts_g = tuple(wts_g)
    if compGP_g:
        # Compute gradient of gradient penalty (grad of GP)
        gradDnorm = np.linalg.norm(layer_g[0][:,0])
        Dlist = [None] * (numL)  # left factor of GP, with the gradD term
        Rlist = [None] * (numL)  # R for "right", since it gets multiplied on right
        wts_GP_g = [None] * (numL)
        Dlist[0] = -2.0*(gradDnorm - 1.0)*layer_g[0]/gradDnorm  # not * by lamda
        Rlist[0] = np.ones((1,1), dtype=np.float32)
        for i in range(1, numL):
            Dlist[i] = (weights[i-1].transpose() @ Dlist[i-1]) * relu_g[i-1]
            Rlist[i] = relu_g[numL-1-i] * (weights[numL-i] @ Rlist[i-1])
        wts_GP_g[numL - 1] = Dlist[numL - 1]
        for i in range(numL - 1):
            wts_GP_g[i] = Dlist[i] @ np.transpose(Rlist[numL-1-i])
        return layer_out, layer_g, bias_g, wts_g, wts_GP_g
    else:
        return layer_out, layer_g, bias_g, wts_g
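
# Hedged sketch (invented toy input): run computeGrads on the same toy critic
# used in _example_computeBounds, at a point inside its input box. By the
# module's construction, the gradients returned here are expected to fall
# inside the corresponding bounds from computeBounds.
def _example_computeGrads():
    demo_x = np.array([[1.0],
                       [0.0]])                 # a point inside the input bounds
    demo_wts = (np.array([[1.0, -1.0],
                          [0.5, 2.0]]),
                np.array([[1.0],
                          [-1.0]]))
    demo_bias = (np.zeros(2), np.zeros(1))
    demo_alphas = (0.2, 1)
    layer_out, layer_g, bias_g, wts_g, wts_GP_g = computeGrads(
        demo_x, demo_wts, demo_bias, demo_alphas, compGP_g=True)
    return wts_g, wts_GP_g
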
###############################################################################
#
# The next section has practice data used to check functionality
#
###############################################################################

#######################################
# Practice data
#######################################

def practice_data(eps=0.01, layers=1, random=False, max_nodes=10):
    """Generate practice data to test the computed gradients and bounds
    Requires: numpy imported as np
    Parameters
    ----------
    eps : float. Half of the width of the allowed input (for each node)
    layers : int. The number of layers for the unit test
    random : bool. Uses fixed values if False, and random values if True
    max_nodes : int. The maximum number of nodes to use for each layer
    Returns : sample data for unit test
    -------
    ib : (N,2) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row represents the min & max of that node.
    wts : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a tuple of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alpha : a tuple of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    """
    if type(layers) != int or layers < 1:
        raise ValueError("layers should be an int >= 1")
    if random:
        sigma = np.random.choice([1e-3,1e-2,1e-1,1])
        #sigma = np.random.choice([1e-3,1e-2,1e-1])
        sizes = tuple(np.random.randint(2, max_nodes+1, size=layers))
        ib_size = (np.random.choice(max_nodes)+1,)
        ib_center = np.random.uniform(low=eps, high=1-eps, size=ib_size)
        ib = np.transpose([ib_center-eps, ib_center+eps])
        bias_sizes = sizes+(1,)
        wts_dim0 = ib_size+sizes
        alpha = tuple(np.random.choice([0.1,0.2,0.3,0.5], layers))+(1,)
        #alpha = tuple(np.random.choice([0.2], layers))+(1,)
        bias = [None]*(layers+1)
        wts = [None]*(layers+1)
        for i in range(layers+1):
            bias[i] = np.random.normal(0, sigma, bias_sizes[i])
            wts[i] = np.random.normal(0, sigma, (wts_dim0[i], bias_sizes[i]))
    else:
        if layers == 1:  # practice data for one hidden layer
            # input bounds
            ib = np.array([[-1-eps,-1+eps],[2-eps,2+eps],
                           [-2-eps,-2+eps],[-3-eps,-3+eps]],
                          dtype=np.float32)
            # weights for the network
            wts = (np.array([[-1, 2, -2],[ -7, 5, 1],
                             [ 2, -2, 3],[-3, 4, 10]],
                            dtype=np.float32),
                   np.array([[-1], [2], [-2]],
                            dtype=np.float32)
                   )
            # bias for the network
            bias = (np.array([-100,0,100], dtype=np.float32),
                    np.array([0], dtype=np.float32))
            # alphas for the network
            alpha = (0.2,1)
        elif layers == 2:  # practice data for two hidden layers
            # input bounds
            ib = np.array([[-1-eps,-1+eps],[2-eps,2+eps],
                           [-2-eps,-2+eps],[-3-eps,-3+eps]],
                          dtype=np.float32)
            # weights for the network
            wts = (np.array([[-1, 2, -2],[ -7, 5, 1],
                             [ 2, -2, 3],[-3, 4, 10]],
                            dtype=np.float32),
                   np.array([[-1,3], [2,-4], [-2,5]],
                            dtype=np.float32),
                   np.array([[8], [-9]],
                            dtype=np.float32)
                   )
            # bias for the network
            bias = (np.array([-97,0,101], dtype=np.float32),
                    np.array([-11,13], dtype=np.float32),
                    np.array([0], dtype=np.float32))
            # alphas for the network
            alpha = (0.2,0.4,1.)
        else:  # practice data for three hidden layers
            # input bounds
            ib = np.array([[-1-eps,-1+eps],[2-eps,2+eps],
                           [-2-eps,-2+eps],[-3-eps,-3+eps]],
                          dtype=np.float32)
            # weights for the network
            wts = (np.array([[-1, 2, -2],[ -7, 5, 1],
                             [ 2, -2, 3],[-3, 4, 10]],
                            dtype=np.float32),
                   np.array([[-1,3], [2,-4], [-2,5]],
                            dtype=np.float32),
                   np.array([[8,1,-3,6,-2], [-9,0,3,-4,5]],
                            dtype=np.float32),
                   np.array([[-7],[4],[-5],[2],[-3]],
                            dtype=np.float32)
                   )
            # bias for the network
            bias = (np.array([-97,0,101], dtype=np.float32),
                    np.array([-11,13], dtype=np.float32),
                    np.array([-2,3,-4,5,-6], dtype=np.float32),
                    np.array([0], dtype=np.float32))
            # alphas for the network
            alpha = (0.2,0.4,0.25,1.)
    return ib, wts, bias, alpha
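
# Hedged sketch: build the fixed two-hidden-layer practice network and pass it
# straight to computeBounds (no TensorFlow is needed for this path).
def _example_practice_data():
    ib, wts, bias, alpha = practice_data(eps=0.01, layers=2, random=False)
    return computeBounds(ib, wts, bias, alpha, compGP_g=True)
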
def setup(eps=0.01, layers=1, random=False, max_nodes=10):
    """Setup for unit test. Uses data from practice_data() to create a
    TF graph of a WGAN-GP, compute the gradients using TensorFlow, and compute
    the bounds on the grads using computeBounds() and the actual grads using
    computeGrads(). Outputs everything as a dictionary
    Requires:
        numpy imported as np
        tensorflow imported as tf
    Parameters
    ----------
    eps : float. Half of the width of the allowed input (for each node)
    layers : int. The number of layers for the unit test
    random : bool. Uses fixed values if False, and random values if True
    max_nodes : int. The maximum number of nodes to use for each layer
    Returns
    -------
    dump_dict : a dictionary containing the gradients and bounds
    """
    #######################################
    # Generate data to test bounds
    #######################################
    ib, wts, bias, alpha = practice_data(eps=eps,
                                         layers=layers,
                                         random=random,
                                         max_nodes=max_nodes)
    #######################################
    # Tensorflow graph
    #######################################
    batch_size = 1
    train_data = ((ib[:,1:]+ib[:,:1])/2).transpose()
    # start with a fresh graph
    tf.reset_default_graph()
    # create the placeholder for real data
    real_data = tf.placeholder(
        tf.float32,
        shape=[batch_size, ib.shape[0]],
        name="RealData")
    # create a noise data set
    fake_data = tf.random_uniform([batch_size, ib.shape[0]], 0., 1.)
    def tfcritic(x):
        layer = 1
        for w, b, a in zip(wts, bias, alpha):
            x = tf.contrib.layers.fully_connected(
                x,
                w.shape[1],
                activation_fn=lambda z: tf.nn.leaky_relu(z, alpha=a),
                scope=f'critic.{layer}',
                reuse=tf.AUTO_REUSE,
                weights_initializer=tf.constant_initializer(w),
                biases_initializer=tf.constant_initializer(b)
                )
            layer += 1
        return x
    crit_real = tfcritic(real_data)
    crit_fake = tfcritic(fake_data)
    # points between real and fake (line 6 of the WGAN-GP algorithm)
    u = tf.random_uniform(shape=[batch_size, 1], minval=0, maxval=1)
    interpolates = (u * real_data) + (1 - u)*fake_data
    # compute gradients of the discriminator at the interpolates
    crit_interp = tfcritic(interpolates)
    gradients = tf.gradients(crit_interp, [interpolates])[0]
    # calculate the 2-norm of the gradients
    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients),
                                   reduction_indices=[1]))
    # subtract 1, square
    crit_gp = (slopes - 1.)**2
    lamda = 10
    crit_gp_loss = lamda*tf.reduce_mean(crit_gp)
    crit_main_loss = tf.reduce_mean(crit_fake) - tf.reduce_mean(crit_real)
    crit_real_loss = - tf.reduce_mean(crit_real)
    crit_loss = crit_main_loss + crit_gp_loss
    # get TF trainable params that have 'critic' in name:
    crit_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                   if 'critic' in v.name]
    # compute the gradients for those params:
    critOpt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
    # for crit_real_loss
    crit_main_grads = critOpt.compute_gradients(crit_main_loss,
                                                var_list=crit_params)
    crit_real_grads = critOpt.compute_gradients(crit_real_loss,
                                                var_list=crit_params)
    crit_gp_grads = critOpt.compute_gradients(crit_gp_loss,
                                              var_list=crit_params)
    crit_grads = critOpt.compute_gradients(crit_loss, var_list=crit_params)
    crit_train_op = critOpt.apply_gradients(crit_grads)
    critic_wts = [v for v in crit_params if 'weights' in v.name]
    critic_bias = [v for v in crit_params if 'biases' in v.name]
    ib_tensor = tf.constant(ib)
    b1 = computeBounds(inputBound=ib,
                       weights=wts,
                       bias=bias,
                       alphas=alpha,
                       compGP_g=True)
    # the real grads
    g1 = computeGrads(inpt=train_data.transpose(),
                      weights=wts,
                      bias=bias,
                      alphas=alpha,
                      compGP_g=False)
    # evaluate grads in session
    with tf.Session() as session:
        # initialize variables
        session.run(tf.global_variables_initializer())
        if printon:
            print('Starting tensorflow session')
            myvars = tf.trainable_variables()
            print("\nTensorFlow trainable parameters:")
            for myvar in myvars:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
            print("\ncritic_wts:")
            for myvar in critic_wts:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
            print("\ncritic_bias:")
            for myvar in critic_bias:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
        (f, interps, g, cmg, crg, cg) = session.run(
            [fake_data,
             interpolates,
             gradients,
             crit_main_grads,
             crit_real_grads,
             crit_grads],
            feed_dict={real_data: train_data})
    # the grads at the interpolates (at which the grad of GP is computed)
    gp1 = computeGrads(inpt=interps.transpose(),
                       weights=wts,
                       bias=bias,
                       alphas=alpha,
                       compGP_g=True)
    dump_dict = {
        'ib' : ib,
        'wts' : wts,
        'bias' : bias,
        'alpha' : alpha,
        'fake_data' : f,
        'interpolates' : interps,
        'gradients' : g,
        'crit_main_grads' : cmg,
        'crit_real_grads' : crg,
        'crit_grads' : cg,
        'bds' : b1,
        'comp_grads' : g1,
        'comp_gp' : gp1
        }
    return dump_dict
def test(d, lamda=10):
    """Tests the gradients and bounds in the dictionary returned by setup()
    against the gradients computed by TensorFlow
    Requires:
        numpy imported as np
        tensorflow imported as tf
    Parameters
    ----------
    d : dictionary. The return from setup()
    lamda : int or float. The GP multiplier in the WGAN-GP loss function
    Returns
    -------
    check_dict : a dictionary containing the results of the test. In particular
        check_dict['overallQ'] is True if all gradients were in the computed
        bounds and check_dict['overallD'] shows the maximum relative error
        between the grads computed by this module and those computed by TF
    """
    layers = len(d['bds'][3])
    if len(d['crit_real_grads']) != 2*layers:
        raise ValueError("Numbers of layers do not match")
    wtQ = []
    biasQ = []
    wtgpQ = []
    biasgpQ = []
    allQ = []
    layerQ = []
    wtD = []
    biasD = []
    wtgpD = []
    biasgpD = []
    maxD = []
    # 0=bd_layer_out, 1=bd_layer_g, 2=bd_bias_g, 3=bd_wts_g, 4=bd_wts_GP_gNORM
    for i in range(layers):
        wtQ.append(
            (d['bds'][3][i][:,:,0] <= d['crit_real_grads'][2*i][0],
             d['bds'][3][i][:,:,1] >= d['crit_real_grads'][2*i][0]))
        biasQ.append(
            (d['bds'][2][i][:,0] <= d['crit_real_grads'][2*i+1][0],
             d['bds'][2][i][:,1] >= d['crit_real_grads'][2*i+1][0]))
        wtgpQ.append(
            np.linalg.norm(
                d['crit_grads'][2*i][0]-d['crit_main_grads'][2*i][0]
                ) <= lamda * d['bds'][4][i])
        biasgpQ.append(
            np.linalg.norm(
                d['crit_grads'][2*i+1][0]-d['crit_main_grads'][2*i+1][0]
                ) <= 1e-9)  # grad of gp wrt bias should be zero
        allQ.append(
            (wtQ[-1][0].all() & wtQ[-1][1].all(),
             biasQ[-1][0].all() & biasQ[-1][1].all(),
             wtgpQ[-1],
             biasgpQ[-1]))
        layerQ.append(all(allQ[-1]))
        wtD.append(
            np.linalg.norm(
                d['comp_grads'][3][i] - d['crit_real_grads'][2*i][0]) /
            np.linalg.norm(d['crit_real_grads'][2*i][0]))
        biasD.append(
            np.linalg.norm(
                d['comp_grads'][2][i][:,0] - d['crit_real_grads'][2*i+1][0]) /
            np.linalg.norm(d['crit_real_grads'][2*i+1][0]))
        wtgpD.append(
            np.linalg.norm(
                d['crit_grads'][2*i][0] - d['crit_main_grads'][2*i][0]
                - lamda * d['comp_gp'][4][i]) /
            np.linalg.norm(
                d['crit_grads'][2*i][0] - d['crit_main_grads'][2*i][0]))
        biasgpD.append(
            np.linalg.norm(
                d['crit_grads'][2*i+1][0]-d['crit_main_grads'][2*i+1][0]))
        maxD.append(np.max([wtD[-1], biasD[-1], wtgpD[-1], biasgpD[-1]]))
    check_dict = {
        'wtQ' : wtQ,
        'biasQ' : biasQ,
        'wtgpQ' : wtgpQ,
        'biasgpQ' : biasgpQ,
        'allQ' : allQ,
        'layerQ' : layerQ,
        'overallQ' : all(layerQ),
        'wtD' : wtD,
        'biasD' : biasD,
        'wtgpD' : wtgpD,
        'biasgpD' : biasgpD,
        'maxD' : maxD,
        'overallD' : max(maxD)
        }
    return check_dict
print('np_wgan_bds loaded')
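
# Hedged driver sketch (added for illustration; the original module stops at
# the print above). Running the unit test end-to-end builds a TF graph, so it
# requires a working tensorflow 1.15 install.
if __name__ == '__main__':
    result = test(setup(eps=0.01, layers=2, random=False))
    print('all gradients inside bounds:', result['overallQ'])
    print('max relative error vs TensorFlow:', result['overallD'])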