Skip to content
Permalink
3b379e68a7
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1127 lines (893 sloc) 40.1 KB
# -*- coding: utf-8 -*-
"""
Compute the bounds on the outputs of nodes in the critic of a WGAN, and the
bounds on the "main" gradients of the layers, the weights, and the biases.
Works with tensorflow version 1.15 and numpy version 1.18
For the gradient penalty term of a WGAN-GP, the bounds that it computes are on
the NORM of gradient of the weights (for each layer).
It also has code to actually compute the gradients (instead of bounds), based
on specific input values.
Assumes the critic is fully connected, and that every layer has either ReLU or
Leaky ReLU activation function or no activation function.
Assumes that the output is a scalar with no activation function.
Contains the following functions:
forwardPropLayerBound : uses forward prop to compute the bounds on the
output of the next layer of the critic.
backPropLayerGradBound : uses backprop to compute the bounds on the grad
of a layer of the critic.
backPropBiasGradBound : uses backprop to compute the bounds on the grad
of the bias of a layer of the critic.
backPropWtsGradBound : uses backprop to compute the bounds on the grad
of the weights of a layer of the critic.
computeBounds : uses the four functions above to compute all of the bounds
for all of the layers of the critic.
forwardPropLayer : uses forward prop to compute the values of the
output of the next layer of the critic.
backPropLayerGrad : uses backprop to compute the gradient
of a layer of the critic.
backPropBiasGrad : uses backprop to compute the gradient
of the bias of a layer of the critic.
backPropWtsGrad : uses backprop to compute the gradient
of the weights of a layer of the critic.
computeGrads : uses the four functions above to compute the gradients
for a WGAN-GP (optionally a traditional WGAN)
practice_data : Generates practice data to test the computed gradients and
bounds.
setup : Setup for unit test. Calls other functions to compute the gradients
and bounds on the practice data.
test : Tests the gradients and bounds in the dictionary returned by setup()
against the gradients computed by TensorFlow.
@author: Joseph Pedersen
1 2 3 4 5 6 7
1234567890123456789012345678901234567890123456789012345678901234567890123456789
"""
import numpy as np
import os
# TF_CPP_MIN_LOG_LEVEL is read by TensorFlow's native logging; '3' is the
# quietest standard setting. It is assigned before the tensorflow import
# below so that it is already in the environment when TensorFlow loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# Report the TensorFlow version at import time (the module docstring states
# the code targets TensorFlow 1.15).
print('tf',tf.__version__)
# Debug switch: when True, setup() prints the names and shapes of the
# critic's TensorFlow variables.
printon = False
#######################################
# Compute the bounds on the gradients
#######################################
def forwardPropLayerBound(inbound, weights, bias, alpha=0):
    """Propagate interval bounds through one fully connected layer.

    Given a [min, max] interval for every input node, computes the interval
    that every output node can reach after the affine map and the (Leaky)
    ReLU activation.

    Requires: numpy imported as np

    Parameters
    ----------
    inbound : (N,2) ndarray; N is the number of nodes feeding the layer.
        Column 0 holds each node's min, column 1 its max.
    weights : (N,M) ndarray; M is the number of nodes in the layer.
    bias : (M,) ndarray
    alpha : float with 0 <= alpha <= 1. The default 0 means ReLU,
        0 < alpha < 1 is the Leaky ReLU slope, and 1 means the layer has
        no activation function.

    Returns
    -------
    outbound : (M,2) ndarray; column 0 is the min and column 1 the max of
        each output node.
    """
    nodeMin = inbound[:, 0, None]  # (N,1), broadcasts against (N,M) weights
    nodeMax = inbound[:, 1, None]

    # A non-negative weight pushes the output down when its input is at its
    # minimum; a negative weight does so when its input is at its maximum.
    lowestInput = np.where(weights >= 0, nodeMin, nodeMax)
    # Mirror image for the largest reachable pre-activation.
    highestInput = np.where(weights <= 0, nodeMin, nodeMax)

    preMin = np.sum(weights * lowestInput, axis=0) + bias
    preMax = np.sum(weights * highestInput, axis=0) + bias

    # Column 0 = mins, column 1 = maxes
    outbound = np.stack((preMin, preMax), axis=1)

    if alpha == 1:
        # No activation function: the affine bounds are the answer.
        return outbound
    # (Leaky) ReLU is monotone, so it can be applied to each bound directly.
    return np.where(outbound > 0, outbound, alpha * outbound)
def backPropLayerGradBound(biasGradBound, weights):
    """Back-propagate gradient bounds from a layer's bias to its inputs.

    Uses the bounds on the gradient of the next layer's bias and the weight
    matrix into that layer to bound the gradient of each node of the current
    layer.

    Requires: numpy imported as np

    Parameters
    ----------
    biasGradBound : (M,2) ndarray; column 0 is the min and column 1 the max
        of the gradient of each bias of the next layer.
    weights : (N,M) ndarray; weight matrix connecting this layer to the next.

    Returns
    -------
    layerGradBound : (N,2) ndarray; column 0 is the min and column 1 the max
        of the gradient of each node of this layer.
    """
    gradLow = biasGradBound[None, :, 0]   # (1,M), broadcasts over the N rows
    gradHigh = biasGradBound[None, :, 1]

    # For the minimum, pair non-negative weights with the lower gradient
    # bound and negative weights with the upper one; swap for the maximum.
    pickForMin = np.where(weights >= 0, gradLow, gradHigh)
    pickForMax = np.where(weights <= 0, gradLow, gradHigh)

    layerGradMin = np.sum(weights * pickForMin, axis=1)
    layerGradMax = np.sum(weights * pickForMax, axis=1)

    # Column 0 = mins, column 1 = maxes
    return np.stack((layerGradMin, layerGradMax), axis=1)
def backPropBiasGradBound(Lbound, Lgradbound, alpha):
    """Bound the gradient of a layer's bias.

    Combines the bounds on the layer's output with the bounds on the
    gradient of that output, taking the activation function (ReLU, Leaky
    ReLU, or none) into account.

    Requires: numpy imported as np

    Parameters
    ----------
    Lbound : (M,2) ndarray; M is the number of nodes in the layer.
        Column 0 is the min and column 1 the max of each node.
    Lgradbound : (M,2) ndarray; column 0 is the min and column 1 the max of
        the gradient of each node of the layer.
    alpha : float with 0 <= alpha <= 1. 0 means ReLU, 0 < alpha < 1 is the
        Leaky ReLU slope, and 1 means the layer has no activation function.

    Returns
    -------
    biasGradBound : (M,2) ndarray; column 0 is the min and column 1 the max
        of the gradient of each bias. When alpha is 1 this is Lgradbound
        itself (the same object, not a copy).
    reluPrimeBounds : (M,2) ndarray; the activation slope (1 or alpha) at
        each node's min and max. All ones when alpha is 1.
    """
    if alpha == 1:
        # Identity activation: the bias gradient equals the layer gradient.
        return Lgradbound, np.ones(shape=Lbound.shape)

    # Slope of the activation at the lower and upper output bound.
    reluPrimeBounds = np.where(Lbound > 0, 1, alpha)

    # All four (gradient bound) x (slope bound) products per node: (M,2,2)
    cornerProducts = Lgradbound[:, :, None] * reluPrimeBounds[:, None, :]
    lowest = np.min(cornerProducts, axis=(1, 2))
    highest = np.max(cornerProducts, axis=(1, 2))

    # Column 0 = mins, column 1 = maxes
    biasGradBound = np.stack((lowest, highest), axis=1)
    return biasGradBound, reluPrimeBounds
def backPropWtsGradBound(Lbound, biasGradbound):
    """Bound the gradient of every weight of a layer.

    The bound for the weight from node j of the previous layer to node k of
    this layer comes from the four products of an output bound of node j
    with a bias-gradient bound of node k.

    Requires: numpy imported as np

    Parameters
    ----------
    Lbound : (N,2) ndarray; N is the number of nodes in the previous layer.
        Column 0 is the min and column 1 the max of each node.
    biasGradbound : (M,2) ndarray; column 0 is the min and column 1 the max
        of the gradient of each bias of this layer.

    Returns
    -------
    wtsGradBound : (N,M,2) ndarray; entry [j,k,0] is the min and [j,k,1] the
        max of the gradient of weight (j,k).
    """
    # (N,1,2,1) * (1,M,1,2) -> (N,M,2,2): every corner product per weight
    cornerProducts = (Lbound[:, None, :, None]
                      * biasGradbound[None, :, None, :])
    cornerAxes = (2, 3)
    lowest = np.min(cornerProducts, axis=cornerAxes)
    highest = np.max(cornerProducts, axis=cornerAxes)
    # Last axis: index 0 = min, index 1 = max
    return np.stack((lowest, highest), axis=2)
def computeBounds(inputBound, weights, bias, alphas,
                  outputGrad=None, compGP_g=True):
    """Compute bounds on node values and gradients of a WGAN critic.

    Forward-propagates the input bounds to bound every node, then
    back-propagates bounds on the gradients of the layers, the biases and
    the weights. Optionally also bounds the norm of the gradient of the
    gradient penalty term with respect to each weight matrix.

    Parameters
    ----------
    inputBound : (N,2) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row holds the min & max of that node.
    weights : a sequence of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a sequence of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alphas : a sequence of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    outputGrad : (optional) a pair (min, max) of real numbers bounding the
        gradient of the loss with respect to the critic output, e.g.
        (-1.1, -0.9). Python ints/floats and NumPy integer/floating scalars
        are accepted; bools are not. If None, the default, the bounds
        (-1.0001, -0.9999) are used: the gradient is -1 for the WGAN loss
        without the gradient penalty term, with slack for floating point
        arithmetic.
    compGP_g : bool. If True, the default, also compute the bounds on the
        norm of the gradient of the gradient penalty term.

    Requires:
        1) weights, bias and alphas all have the same length, L+1, where L
           is the number of hidden layers. The last entry is the output.
        2) Array dimensions should match appropriately, as described above.
        3) The output layer should be a scalar with no activation function,
           i.e. weights[-1].shape[1]=1, bias[-1].shape[0]=1, alphas[-1]=1

    Returns
    -------
    bd_layer_out : a tuple of L+2 ndarrays. Item 0 is inputBound itself;
        item i (i >= 1) has shape (N_i, 2) and bounds the nodes of layer i
        ([r,0] is the min of node r, [r,1] the max).
    bd_layer_g : a tuple of L+2 ndarrays, item i of shape (N_i, 2), bounding
        the gradient of each node of layer i. The last item is the bound on
        the output gradient.
    bd_bias_g : a tuple of L+1 ndarrays, each of shape (N_{i+1}, 2)
        item i bounds the gradient of the bias for layer (i+1).
    bd_wts_g : a tuple of L+1 ndarrays, each of shape (N_i, N_{i+1}, 2)
        item i bounds the gradient of the weight matrix connecting layer i
        to layer (i+1). The [j,k,0] entry is the min of the gradient of the
        weight connecting node j of layer i to node k of layer (i+1); the
        [j,k,1] entry is the corresponding max.
    bd_wts_GP_gNORM : a tuple of L+1 floats, only returned when compGP_g is
        True. Item i bounds the norm of the gradient of the gradient penalty
        term (without the lambda multiplier) for weight matrix i.

    Raises
    ------
    ValueError : if the sequences are empty or differ in length, if
        outputGrad is malformed, or if array dimensions do not chain.
    """
    numL = len(weights)  # number of layers, counting output, but not input
    if len(bias) != numL:
        raise ValueError("Numbers of weight matrices and biases do not match")
    if len(alphas) != numL:
        raise ValueError("Numbers of weight matrices and alphas do not match")
    if numL == 0:
        # Without this guard the shape checks below fail with an IndexError.
        raise ValueError("weights must contain at least one weight matrix")
    if outputGrad is not None:
        if len(outputGrad) != 2:
            raise ValueError("outputGrad should have a length of 2")
        realTypes = (int, float, np.integer, np.floating)
        for k in (0, 1):
            entry = outputGrad[k]
            # bool is a subclass of int but is not a meaningful bound.
            if isinstance(entry, bool) or not isinstance(entry, realTypes):
                raise ValueError(f"outputGrad[{k}] should be int or float")

    # Dimensions of weight matrices and biases
    wtDims = [w.shape for w in weights]
    biasDims = [b.shape for b in bias]
    if wtDims[0][0] != inputBound.shape[0]:
        msg = (f"First weight matrix dimension {wtDims[0][0]} "
               + f"does not match input shape {inputBound.shape[0]} ")
        raise ValueError(msg)
    for i in range(1, numL):
        if wtDims[i][0] != wtDims[i-1][1]:
            msg = f"weight matrix inner dimension mismatch at i={i}"
            raise ValueError(msg)
    for i in range(numL):
        if wtDims[i][1] != biasDims[i][0]:
            msg = f"bias dimension mismatch at i={i}"
            raise ValueError(msg)

    # Start from the bounds on the input layer, then forward prop to get
    # the bounds on the nodes of every subsequent layer.
    bd_layer_out = [inputBound]
    for w, b, a in zip(weights, bias, alphas):
        bd_layer_out.append(
            forwardPropLayerBound(inbound=bd_layer_out[-1],
                                  weights=w,
                                  bias=b,
                                  alpha=a)
        )
    bd_layer_out = tuple(bd_layer_out)

    # Containers for the bounds on the gradients
    bd_layer_g = [None] * (numL+1)
    bd_bias_g = [None] * numL
    bd_wts_g = [None] * numL
    bd_relu_g = [None] * numL

    # Bound on grad for output layer: assumes (partial L / partial Y) = -1
    if outputGrad is None:
        bd_layer_g[numL] = np.array([[-1.0001, -0.9999]])
    else:
        bd_layer_g[numL] = np.array([[outputGrad[0], outputGrad[1]]])

    # Conduct backprop to compute the bounds on the gradients
    for i in range(numL, 0, -1):
        # grad of the bias for layer i (and the activation slope bounds)
        bd_bias_g[i-1], bd_relu_g[i-1] = backPropBiasGradBound(
            bd_layer_out[i], bd_layer_g[i], alphas[i-1])
        # grad of weights connecting layer (i-1) to layer i
        bd_wts_g[i-1] = backPropWtsGradBound(bd_layer_out[i-1],
                                             bd_bias_g[i-1])
        # gradient of layer (i-1)
        bd_layer_g[i-1] = backPropLayerGradBound(bd_bias_g[i-1],
                                                 weights[i-1])

    bd_layer_g = tuple(bd_layer_g)
    bd_bias_g = tuple(bd_bias_g)
    bd_relu_g = tuple(bd_relu_g)
    bd_wts_g = tuple(bd_wts_g)

    if not compGP_g:
        return bd_layer_out, bd_layer_g, bd_bias_g, bd_wts_g

    # Bound on the NORM of the gradient of the gradient penalty term.
    # Largest possible norm of the critic's input gradient:
    gradDnorm_max = np.linalg.norm(np.max(np.abs(bd_layer_g[0]), axis=1))
    # |2 * (||grad D|| - 1)| is at most 2 * max(gradDnorm_max - 1, 1),
    # because ||grad D|| lies in [0, gradDnorm_max].  (Without lamda.)
    multiplier = 2.0 * np.maximum(gradDnorm_max - 1, 1)
    # Frobenius norm of every weight matrix
    weightMatrixNorms = [np.linalg.norm(mat) for mat in weights]
    # Euclidean norm of the activation slopes at the upper output bounds,
    # for the hidden layers only
    reluPrimeNorms = [np.linalg.norm(bd_relu_g[i][:, 1])
                      for i in range(numL-1)]
    # np.prod instead of the alias np.product, which NumPy 2.0 removed.
    productBound = (np.prod(weightMatrixNorms)
                    * np.prod(reluPrimeNorms)
                    * multiplier)
    # The bound for layer i leaves out that layer's own weight norm.
    bd_wts_GP_gNORM = tuple(productBound / wNorm
                            for wNorm in weightMatrixNorms)
    return bd_layer_out, bd_layer_g, bd_bias_g, bd_wts_g, bd_wts_GP_gNORM
###############################################################################
#
# The next section has the functions that compute the actual gradients
# (instead of bounds) for specific values of inpt
#
###############################################################################
#######################################
# Compute the gradients
#######################################
def forwardPropLayer(inpt, weights, bias, alpha=0):
    """Evaluate one fully connected layer for concrete input values.

    Applies the affine map and then the activation function (ReLU, Leaky
    ReLU, or none).

    Requires: numpy imported as np

    Parameters
    ----------
    inpt : (N,1) ndarray; N is the number of nodes feeding the layer.
        Each row is the value of that node.
    weights : (N,M) ndarray; M is the number of nodes in the layer.
    bias : (M,) ndarray
    alpha : float with 0 <= alpha <= 1. The default 0 means ReLU,
        0 < alpha < 1 is the Leaky ReLU slope, and 1 means the layer has
        no activation function.

    Returns
    -------
    output : (M,1) ndarray; each row is the value of that node.
    """
    # bias is made a column so it lines up with the (M,1) product
    preActivation = weights.transpose() @ inpt + bias[:, None]
    if alpha == 1:
        # No activation function in this layer
        return preActivation
    # (Leaky) ReLU: keep positive values, scale the rest by alpha
    return np.where(preActivation > 0, preActivation, alpha * preActivation)
def backPropLayerGrad(biasGrad, weights):
    """Back-propagate a bias gradient to the nodes feeding that layer.

    Requires: numpy imported as np

    Parameters
    ----------
    biasGrad : (M,1) ndarray; each row is the gradient of one bias of the
        next layer.
    weights : (N,M) ndarray; weight matrix connecting this layer to the next.

    Returns
    -------
    layerGrad : (N,1) ndarray; each row is the gradient of one node of this
        layer.
    """
    # Chain rule through the affine map: one matrix-vector product
    return weights @ biasGrad
def backPropBiasGrad(Lvals, Lgrad, alpha):
    """Compute the gradient of a layer's bias for concrete values.

    Multiplies the gradient of the layer's output by the slope of the
    activation function (ReLU, Leaky ReLU, or none) at each node.

    Requires: numpy imported as np

    Parameters
    ----------
    Lvals : (M,1) ndarray; M is the number of nodes in the layer.
        Each row is the value of that node.
    Lgrad : (M,1) ndarray; each row is the gradient of that node.
    alpha : float with 0 <= alpha <= 1. 0 means ReLU, 0 < alpha < 1 is the
        Leaky ReLU slope, and 1 means the layer has no activation function.

    Returns
    -------
    biasGrad : (M,1) ndarray; each row is the gradient of that bias. When
        alpha is 1 this is Lgrad itself (the same object, not a copy).
    reluPrime : (M,1) ndarray; the activation slope (1 or alpha) at each
        node. All ones when alpha is 1.
    """
    if alpha == 1:
        # Identity activation: the bias gradient equals the layer gradient.
        return Lgrad, np.ones(Lvals.shape)
    # Slope is 1 where the node is active, alpha elsewhere
    activationSlope = np.where(Lvals > 0, 1, alpha)
    return Lgrad * activationSlope, activationSlope
def backPropWtsGrad(Lvals, biasGrad):
    """Compute the gradient of a layer's weight matrix for concrete values.

    The gradient is the outer product of the previous layer's node values
    with the gradient of this layer's bias.

    Requires: numpy imported as np

    Parameters
    ----------
    Lvals : (N,1) ndarray; N is the number of nodes in the previous layer.
        Each row is the value of that node.
    biasGrad : (M,1) ndarray; the gradient of the bias of this layer.

    Returns
    -------
    wtsGrad : (N,M) ndarray; the gradient of the weights.
    """
    # (N,1) @ (1,M) -> (N,M)
    return Lvals @ biasGrad.transpose()
def computeGrads(inpt, weights, bias, alphas, compGP_g=True):
    """Compute node values and gradients of a WGAN critic for one input.

    Forward-propagates the input through the critic, then back-propagates
    the gradient of the "real" loss term (gradient -1 at the output) to the
    layers, the biases and the weights. Optionally also computes the
    gradient of the gradient penalty term with respect to each weight
    matrix, evaluated at the same input.

    Parameters
    ----------
    inpt : (N,1) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row is the value of that node.
    weights : a sequence of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a sequence of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alphas : a sequence of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    compGP_g : bool. If True, the default, also compute the gradient of the
        gradient penalty term.

    Requires:
        1) weights, bias and alphas all have the same length, L+1, where L
           is the number of hidden layers. The last entry is the output.
        2) Array dimensions should match appropriately, as described above.
        3) The output layer should be a scalar with no activation function,
           i.e. weights[L].shape[1]=1, bias[L].shape[0]=1, and alphas[L]=1
           where [L] represents layer L+1, due to zero-indexing

    Returns
    -------
    layer_out : a tuple of L+2 ndarrays. Item 0 is inpt itself; item i
        (i >= 1) has shape (N_i, 1) and holds the values of layer i.
    layer_g : a tuple of L+2 ndarrays, item i of shape (N_i, 1), holding the
        gradient of layer i. The last item is the float32 array [[-1.]].
    bias_g : a tuple of L+1 ndarrays, each of shape (N_{i+1}, 1)
        item i is the gradient of the bias for layer (i+1).
    wts_g : a tuple of L+1 ndarrays, each of shape (N_i, N_{i+1})
        item i is the gradient of the weight matrix connecting layer i to
        layer (i+1), for the loss function without the GP term.
    wts_GP_g : a tuple of L+1 ndarrays, each of shape (N_i, N_{i+1}), only
        returned when compGP_g is True. Item i is the gradient of the GP
        term (without the lambda multiplier) for weight matrix i.

    Raises
    ------
    ValueError : if the sequences are empty or differ in length, or if
        array dimensions do not chain.
    """
    numL = len(weights)  # number of layers, counting output, but not input
    #                      = L + 1, where L is the number of hidden layers
    if len(bias) != numL:
        raise ValueError("Numbers of weight matrices and biases do not match")
    if len(alphas) != numL:
        raise ValueError("Numbers of weight matrices and alphas do not match")
    if numL == 0:
        # Without this guard the shape checks below fail with an IndexError.
        raise ValueError("weights must contain at least one weight matrix")

    # Dimensions of weight matrices and biases
    wtDims = [w.shape for w in weights]
    biasDims = [b.shape for b in bias]
    if wtDims[0][0] != inpt.shape[0]:
        raise ValueError(f"First weight matrix dimension {wtDims[0][0]} "
                         + f"does not match input {inpt.shape[0]}")
    for i in range(1, numL):
        if wtDims[i][0] != wtDims[i-1][1]:
            msg = f"weight matrix inner dimension mismatch at i={i}"
            raise ValueError(msg)
    for i in range(numL):
        if wtDims[i][1] != biasDims[i][0]:
            msg = f"bias dimension mismatch at i={i}"
            raise ValueError(msg)

    # Start from the input values, then forward prop through every layer
    layer_out = [inpt]
    for w, b, a in zip(weights, bias, alphas):
        layer_out.append(
            forwardPropLayer(inpt=layer_out[-1],
                             weights=w,
                             bias=b,
                             alpha=a)
        )
    layer_out = tuple(layer_out)

    # Containers for the values of the gradients
    layer_g = [None] * (numL+1)
    bias_g = [None] * numL
    relu_g = [None] * numL
    wts_g = [None] * numL

    # Gradient of the "real" loss term with respect to the critic output
    layer_g[numL] = np.array([[-1.00000]], dtype=np.float32)

    # Conduct backprop to compute the gradients
    for i in range(numL, 0, -1):
        # grad of the bias for layer i (and the activation slopes)
        bias_g[i-1], relu_g[i-1] = backPropBiasGrad(layer_out[i],
                                                    layer_g[i],
                                                    alphas[i-1])
        # grad of weights connecting layer (i-1) to layer i
        wts_g[i-1] = backPropWtsGrad(layer_out[i-1], bias_g[i-1])
        # gradient of layer (i-1)
        layer_g[i-1] = backPropLayerGrad(bias_g[i-1], weights[i-1])

    layer_g = tuple(layer_g)
    bias_g = tuple(bias_g)
    relu_g = tuple(relu_g)
    wts_g = tuple(wts_g)

    if not compGP_g:
        return layer_out, layer_g, bias_g, wts_g

    # Gradient of the gradient penalty (GP) term for every weight matrix.
    # layer_g[0] is the gradient of the loss w.r.t. the input; its norm is
    # the norm of the critic's input gradient.
    # NOTE: if that norm is zero the division below yields nan/inf.
    gradDnorm = np.linalg.norm(layer_g[0][:, 0])
    Dlist = [None] * numL  # left factor of GP, with the gradD term
    Rlist = [None] * numL  # R for "right": it gets multiplied on the right
    Dlist[0] = -2.0*(gradDnorm - 1.0)*layer_g[0]/gradDnorm  # not * by lamda
    Rlist[0] = np.ones((1, 1), dtype=np.float32)
    for i in range(1, numL):
        # Dlist walks forward from the input, Rlist backward from the output
        Dlist[i] = (weights[i-1].transpose() @ Dlist[i-1]) * relu_g[i-1]
        Rlist[i] = relu_g[numL-1-i] * (weights[numL-i] @ Rlist[i-1])

    wts_GP_g = [None] * numL
    # The last weight matrix has a single output column, so its right factor
    # is the scalar 1 and the left factor is already the gradient.
    wts_GP_g[numL - 1] = Dlist[numL - 1]
    for i in range(numL - 1):
        wts_GP_g[i] = Dlist[i] @ np.transpose(Rlist[numL-1-i])
    # Return a tuple, like every other gradient collection in this function
    wts_GP_g = tuple(wts_GP_g)
    return layer_out, layer_g, bias_g, wts_g, wts_GP_g
###############################################################################
#
# The next section has practice data used to check functionality
#
###############################################################################
#######################################
# Practice data
#######################################
def practice_data(eps = 0.01, layers = 1, random=False, max_nodes=10):
    """Generate practice data to test the computed gradients and bounds.

    Requires: numpy imported as np

    Parameters
    ----------
    eps : float. Half of the width of the allowed input (for each node)
    layers : int (Python or NumPy integer) >= 1. The number of hidden layers
        for the unit test. With random=False only 1, 2 and 3 hidden layers
        are available: any value of 3 or more yields the three-hidden-layer
        network.
    random : bool. Uses fixed values if False, and random values if True
    max_nodes : int. The maximum number of nodes to use for each layer
        (only used when random is True)

    Returns : sample data for unit test
    -------
    ib : (N,2) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row represents the min & max of that node.
    wts : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a tuple of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alpha : a tuple of floats, each satisfying 0 <= alpha <=1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.

    Raises
    ------
    ValueError : if layers is not an integer >= 1.
    """
    # isinstance accepts NumPy integers too; bool is excluded explicitly
    # because it is a subclass of int.
    if (isinstance(layers, bool)
            or not isinstance(layers, (int, np.integer))
            or layers < 1):
        raise ValueError("layers should be an int >= 1")

    if random:
        # NOTE: the order of the np.random calls is kept fixed so that a
        # seeded run keeps producing the same network.
        sigma = np.random.choice([1e-3,1e-2,1e-1,1])
        sizes = tuple(np.random.randint(2,max_nodes+1,size=layers))
        ib_size = (np.random.choice(max_nodes)+1,)
        ib_center = np.random.uniform(low=eps,high=1-eps,size=ib_size)
        ib = np.transpose([ib_center-eps, ib_center+eps])
        bias_sizes = sizes+(1,)      # output width of every layer
        wts_dim0 = ib_size+sizes     # input width of every layer
        # Leaky ReLU on the hidden layers, no activation on the output
        alpha = tuple(np.random.choice([0.1,0.2,0.3,0.5], layers))+(1,)
        bias = []
        wts = []
        for n_in, n_out in zip(wts_dim0, bias_sizes):
            bias.append(np.random.normal(0,sigma,n_out))
            wts.append(np.random.normal(0,sigma,(n_in,n_out)))
        # Tuples, matching the fixed data below and the documented return
        return ib, tuple(wts), tuple(bias), alpha

    # Fixed data. All three fixed networks share the input bounds and the
    # first weight matrix.
    ib = np.array([[-1-eps,-1+eps],[2-eps,2+eps],
                   [-2-eps,-2+eps],[-3-eps,-3+eps]],
                  dtype=np.float32)
    w_first = np.array([[-1, 2, -2],[ -7, 5, 1],
                        [ 2, -2, 3],[-3, 4, 10]],
                       dtype=np.float32)

    if layers == 1:  # practice data for one hidden layer
        wts = (w_first,
               np.array([[-1], [2], [-2]], dtype=np.float32))
        bias = (np.array([-100,0,100], dtype=np.float32),
                np.array([0], dtype=np.float32))
        alpha = (0.2,1)
        return ib, wts, bias, alpha

    # The two- and three-hidden-layer networks also share the second layer.
    w_second = np.array([[-1,3], [2,-4], [-2,5]], dtype=np.float32)
    b_first = np.array([-97,0,101], dtype=np.float32)
    b_second = np.array([-11,13], dtype=np.float32)

    if layers == 2:  # practice data for two hidden layers
        wts = (w_first,
               w_second,
               np.array([[8], [-9]], dtype=np.float32))
        bias = (b_first,
                b_second,
                np.array([0], dtype=np.float32))
        alpha = (0.2,0.4,1.)
    else:  # practice data for three hidden layers
        wts = (w_first,
               w_second,
               np.array([[8,1,-3,6,-2], [-9,0,3,-4,5]], dtype=np.float32),
               np.array([[-7],[4],[-5],[2],[-3]], dtype=np.float32))
        bias = (b_first,
                b_second,
                np.array([-2,3,-4,5,-6], dtype=np.float32),
                np.array([0], dtype=np.float32))
        alpha = (0.2,0.4,0.25,1.)
    return ib, wts, bias, alpha
def setup(eps = 0.01, layers = 1, random=False, max_nodes=10):
    """Build everything needed for the unit test and return it as a dict.

    Takes a network from practice_data(), builds the TensorFlow graph of a
    WGAN-GP critic initialised with those weights, evaluates TensorFlow's
    gradients, and computes the bounds with computeBounds() and the actual
    gradients with computeGrads().

    Requires:
        numpy imported as np
        tensorflow imported as tf

    Parameters
    ----------
    eps : float. Half of the width of the allowed input (for each node)
    layers : int. The number of layers for the unit test
    random : bool. Uses fixed values if False, and random values if True
    max_nodes : int. The maximum number of nodes to use for each layer

    Returns
    -------
    dump_dict : a dictionary containing the practice data ('ib', 'wts',
        'bias', 'alpha'), the TensorFlow results ('fake_data',
        'interpolates', 'gradients', 'crit_main_grads', 'crit_real_grads',
        'crit_grads'), the bounds ('bds'), the gradients at the real data
        point ('comp_grads') and the gradients at the interpolated point
        including the GP gradients ('comp_gp').
    """
    #######################################
    # Generate data to test bounds
    #######################################
    ib, wts, bias, alpha = practice_data(eps=eps, layers=layers,
                                         random=random, max_nodes=max_nodes)

    #######################################
    # Tensorflow graph
    #######################################
    batch_size = 1
    n_inputs = ib.shape[0]
    # The "real" sample is the midpoint of every input interval, as one row
    train_data = ((ib[:,1:]+ib[:,:1])/2).transpose()

    # start with a fresh graph
    tf.reset_default_graph()

    # placeholder for real data
    real_data = tf.placeholder(tf.float32,
                               shape=[batch_size, n_inputs],
                               name="RealData")
    # noise standing in for generated data
    fake_data = tf.random_uniform([batch_size, n_inputs],0.,1.)

    def tfcritic(x):
        # One fully connected layer per weight matrix. AUTO_REUSE makes the
        # three calls below share the same variables. alpha = 1 turns
        # leaky_relu into the identity, i.e. no activation.
        for layer, (w, b, a) in enumerate(zip(wts, bias, alpha), start=1):
            x = tf.contrib.layers.fully_connected(
                x,
                w.shape[1],
                activation_fn = lambda z: tf.nn.leaky_relu(z, alpha=a),
                scope=f'critic.{layer}',
                reuse=tf.AUTO_REUSE,
                weights_initializer=tf.constant_initializer(w),
                biases_initializer=tf.constant_initializer(b)
                )
        return x

    crit_real = tfcritic(real_data)
    crit_fake = tfcritic(fake_data)

    # points between real and fake (line 6 of WGAN-GP algorithm)
    u = tf.random_uniform(shape=[batch_size, 1], minval=0, maxval=1)
    interpolates = (u * real_data) + (1 - u)*fake_data

    # gradient of the critic at the interpolates, and its 2-norm
    crit_interp = tfcritic(interpolates)
    gradients = tf.gradients(crit_interp, [interpolates])[0]
    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients),
                                   reduction_indices=[1]))

    # gradient penalty: (norm - 1)^2, averaged and scaled by lamda
    crit_gp = (slopes - 1.)**2
    lamda = 10
    crit_gp_loss = lamda*tf.reduce_mean(crit_gp)

    crit_main_loss = tf.reduce_mean(crit_fake) - tf.reduce_mean(crit_real)
    crit_real_loss = - tf.reduce_mean(crit_real)
    crit_loss = crit_main_loss + crit_gp_loss

    # TF variables that have 'critic' in their name
    crit_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                   if 'critic' in v.name]

    # gradients of each loss with respect to those variables
    critOpt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
    crit_main_grads = critOpt.compute_gradients(crit_main_loss,
                                                var_list=crit_params)
    crit_real_grads = critOpt.compute_gradients(crit_real_loss,
                                                var_list=crit_params)
    # The next three results are not read again in this function; the calls
    # are kept so the graph that gets built is unchanged.
    crit_gp_grads = critOpt.compute_gradients(crit_gp_loss,
                                              var_list=crit_params)
    crit_grads = critOpt.compute_gradients(crit_loss, var_list=crit_params)
    crit_train_op = critOpt.apply_gradients(crit_grads)

    critic_wts = [v for v in crit_params if 'weights' in v.name]
    critic_bias = [v for v in crit_params if 'biases' in v.name]
    ib_tensor = tf.constant(ib)

    #######################################
    # NumPy bounds and gradients
    #######################################
    b1 = computeBounds(inputBound=ib, weights=wts, bias=bias, alphas=alpha,
                       compGP_g=True)
    # the real grads (at the real data point, without the GP term)
    g1 = computeGrads(inpt=train_data.transpose(), weights=wts, bias=bias,
                      alphas=alpha, compGP_g=False)

    # evaluate grads in session
    with tf.Session() as session:
        # initialize variables
        session.run(tf.global_variables_initializer())
        if printon:
            print('Starting tensorflow session')
            print("\nTensorFlow trainable parameters:")
            for myvar in tf.trainable_variables():
                print(myvar.name)
                print(' shape =', myvar.get_shape())
            print("\ncritic_wts:")
            for myvar in critic_wts:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
            print("\ncritic_bias:")
            for myvar in critic_bias:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
        # A single run, so every fetched value is based on the same random
        # fake sample and interpolation point.
        fetches = [fake_data,
                   interpolates,
                   gradients,
                   crit_main_grads,
                   crit_real_grads,
                   crit_grads]
        (f, interps, g, cmg, crg, cg) = session.run(
            fetches, feed_dict={real_data: train_data})

    # the grads at the interpolates (where the grad of GP is computed)
    gp1 = computeGrads(inpt=interps.transpose(), weights=wts, bias=bias,
                       alphas=alpha, compGP_g=True)

    dump_dict = {
        'ib' : ib,
        'wts' : wts,
        'bias' : bias,
        'alpha' : alpha,
        'fake_data' : f,
        'interpolates' : interps,
        'gradients' : g,
        'crit_main_grads' : cmg,
        'crit_real_grads' : crg,
        'crit_grads' : cg,
        'bds' : b1,
        'comp_grads' : g1,
        'comp_gp' : gp1
        }
    return dump_dict
def test(d, lamda=10):
"""Tests the gradients and bounds in the dictionary returned by setup()
against the gradients computed by TensorFlow
Requires:
numpy imported as np
tensorflow imported as tf
Parameters
----------
d : dictionary. The return from setup()
lamda : int or float. The GP multiplier in the WGAN-GP loss function
Returns
-------
check_dict : a dictionary containing the results of the test. In particular
check_dict['overallQ'] is True if all gradients were in the computed
bounds and check_dict['overallD'] shows the maximum relative error
between the grads computed by this module and those computed by TF
"""
layers = len(d['bds'][3])
if len(d['crit_real_grads']) != 2*layers:
raise ValueError("Number of layers do not match")
wtQ = []
biasQ = []
wtgpQ = []
biasgpQ = []
allQ = []
layerQ = []
wtD = []
biasD = []
wtgpD = []
biasgpD = []
maxD = []
#0=bd_layer_out, 1=bd_layer_g, 2=bd_bias_g, 3=bd_wts_g, 4=bd_wts_GP_gNORM
for i in range(layers):
wtQ.append(
(d['bds'][3][i][:,:,0] <= d['crit_real_grads'][2*i][0],
d['bds'][3][i][:,:,1] >= d['crit_real_grads'][2*i][0]))
biasQ.append(
(d['bds'][2][i][:,0] <= d['crit_real_grads'][2*i+1][0],
d['bds'][2][i][:,1] >= d['crit_real_grads'][2*i+1][0]))
wtgpQ.append(
np.linalg.norm(
d['crit_grads'][2*i][0]-d['crit_main_grads'][2*i][0]
) <= lamda * d['bds'][4][i])
biasgpQ.append(
np.linalg.norm(
d['crit_grads'][2*i+1][0]-d['crit_main_grads'][2*i+1][0]
) <= 1e-9) # grad of gp wrt bias should be zero
allQ.append(
(wtQ[-1][0].all() & wtQ[-1][1].all(),
biasQ[-1][0].all() & biasQ[-1][1].all(),
wtgpQ[-1],
biasgpQ[-1]))
layerQ.append(all(allQ[-1]))
wtD.append(
np.linalg.norm(
d['comp_grads'][3][i] - d['crit_real_grads'][2*i][0]) /
np.linalg.norm(d['crit_real_grads'][2*i][0]))
biasD.append(
np.linalg.norm(
d['comp_grads'][2][i][:,0] - d['crit_real_grads'][2*i+1][0]) /
np.linalg.norm(d['crit_real_grads'][2*i+1][0]))
wtgpD.append(
np.linalg.norm(
d['crit_grads'][2*i][0] - d['crit_main_grads'][2*i][0]
- lamda * d['comp_gp'][4][i]) /
np.linalg.norm(
d['crit_grads'][2*i][0] - d['crit_main_grads'][2*i][0]))
biasgpD.append(
np.linalg.norm(
d['crit_grads'][2*i+1][0]-d['crit_main_grads'][2*i+1][0]))
maxD.append(np.max([wtD[-1],biasD[-1],wtgpD[-1],biasgpD[-1]]))
check_dict = {
'wtQ' : wtQ,
'biasQ' : biasQ,
'wtgpQ' : wtgpQ,
'biasgpQ' : biasgpQ,
'allQ' : allQ,
'layerQ' : layerQ,
'overallQ' : all(layerQ),
'wtD' : wtD,
'biasD' : biasD,
'wtgpD' : wtgpD,
'biasgpD' : biasgpD,
'maxD' : maxD,
'overallD' : max(maxD)
}
return check_dict
# Import-time message, printed once the definitions above have been executed.
print('np_wgan_bds loaded')