synthetic-data/np_wgan_bds.py
# -*- coding: utf-8 -*-
"""
Compute the bounds on the outputs of nodes in the critic of a WGAN, and the
bounds on the "main" gradients of the layers, the weights, and the biases.
Works with tensorflow version 1.15 and numpy version 1.18
For the gradient penalty term of a WGAN-GP, the bounds that it computes are on
the NORM of the gradient of the weights (for each layer).
It also has code to actually compute the gradients (instead of bounds), based
on specific input values.
Assumes the critic is fully connected, and that every layer has either ReLU or
Leaky ReLU activation function or no activation function.
Assumes that the output is a scalar with no activation function.
Contains the following functions:
    forwardPropLayerBound : uses forward prop to compute the bounds on the
        output of the next layer of the critic.
    backPropLayerGradBound : uses backprop to compute the bounds on the grad
        of a layer of the critic.
    backPropBiasGradBound : uses backprop to compute the bounds on the grad
        of the bias of a layer of the critic.
    backPropWtsGradBound : uses backprop to compute the bounds on the grad
        of the weights of a layer of the critic.
    computeBounds : uses the four functions above to compute all of the bounds
        for all of the layers of the critic.
    forwardPropLayer : uses forward prop to compute the values of the
        output of the next layer of the critic.
    backPropLayerGrad : uses backprop to compute the gradient
        of a layer of the critic.
    backPropBiasGrad : uses backprop to compute the gradient
        of the bias of a layer of the critic.
    backPropWtsGrad : uses backprop to compute the gradient
        of the weights of a layer of the critic.
    computeGrads : uses the four functions above to compute the gradients
        for a WGAN-GP (optionally a traditional WGAN)
    practice_data : Generates practice data to test the computed gradients and
        bounds.
    setup : Setup for unit test. Calls other functions to compute the gradients
        and bounds on the practice data.
    test : Tests the gradients and bounds in the dictionary returned by setup()
        against the gradients computed by TensorFlow.
@author: Joseph Pedersen
         1         2         3         4         5         6         7
1234567890123456789012345678901234567890123456789012345678901234567890123456789
"""
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

print('tf', tf.__version__)

printon = False

#######################################
# Compute the bounds on the gradients
#######################################
def forwardPropLayerBound(inbound, weights, bias, alpha=0):
    """Computes the upper and lower bounds on the nodes of a layer of a network
    based on: the bounds of the previous layer, the weights and bias, and
    the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    inbound : (N,2) ndarray; N is the number of nodes in the input layer
        Each row represents the min & max of that node
    weights : (N,M) ndarray; M is the number of nodes in the next layer
    bias : (M,) ndarray
    alpha : A float. 0 <= alpha <= 1. The default is 0, which represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    outbound : (M,2) ndarray, each row represents the min & max of that node
    """
    outMin = (
        np.sum(
            weights*
            np.where(weights>=0,
                     inbound[:,0,None],
                     inbound[:,1,None]),
            axis=0
            )
        ) + bias
    outMax = (
        np.sum(
            weights*
            np.where(weights<=0,
                     inbound[:,0,None],
                     inbound[:,1,None]),
            axis=0
            )
        ) + bias
    # First column are mins, second column are maxes
    outbound = np.concatenate((outMin[:,None], outMax[:,None]), axis=1)
    # If applicable, apply (potentially Leaky) ReLU
    if alpha != 1:
        outbound = np.where(outbound > 0,
                            outbound,
                            alpha*outbound)
    return outbound
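
# Hedged usage sketch (invented toy numbers, not part of the author's test
# suite below): propagate interval bounds through one ReLU layer with two
# inputs and two outputs.
def _example_forwardPropLayerBound():
    demo_in = np.array([[0.0, 1.0],
                        [-1.0, 1.0]])          # per-node [min, max]
    demo_w = np.array([[1.0, -1.0],
                       [2.0, 0.5]])            # (N=2, M=2) weight matrix
    demo_b = np.zeros(2)
    # With alpha=0 (ReLU) this returns [[0., 3.], [0., 0.5]].
    return forwardPropLayerBound(demo_in, demo_w, demo_b, alpha=0)
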
def backPropLayerGradBound(biasGradBound, weights):
    """Computes the upper and lower bounds on the gradient of a layer of a
    network based on: the bounds on the gradient of the bias of the next
    layer, and the weights connecting the layer to the next layer
    Requires: numpy imported as np
    Parameters
    ----------
    biasGradBound : (M,2) ndarray; Each row represents the min & max of the
        gradient for the bias of the next layer.
    weights : (N,M) ndarray; weight matrix connecting this layer to the next.
    Returns
    -------
    layerGradBound : (N,2) ndarray; each row represents the min & max of the
        gradient of this layer.
    """
    lbound = biasGradBound[None,:,0]
    ubound = biasGradBound[None,:,1]
    layerGradMin = (
        np.sum(
            weights*
            np.where(weights>=0,
                     lbound,
                     ubound
                     ),
            axis=1
            )
        )
    layerGradMax = (
        np.sum(
            weights*
            np.where(weights<=0,
                     lbound,
                     ubound
                     ),
            axis=1
            )
        )
    # First column are mins, second column are maxes
    layerGradBound = (
        np.concatenate((layerGradMin[:,None],layerGradMax[:,None]), axis=1)
        )
    return layerGradBound
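
# Hedged sketch (invented numbers): bound the gradient of a 2-node layer given
# the bias-gradient bounds of a 1-node next layer and the connecting weights.
def _example_backPropLayerGradBound():
    demo_bias_g = np.array([[-1.1, -0.9]])     # (M=1, 2) bias-grad bounds
    demo_w = np.array([[2.0],
                       [-3.0]])                # (N=2, M=1) weights
    # Returns approximately [[-2.2, -1.8], [2.7, 3.3]].
    return backPropLayerGradBound(demo_bias_g, demo_w)
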
def backPropBiasGradBound(Lbound, Lgradbound, alpha):
    """Computes the upper and lower bounds on the gradient of the bias of a
    layer of a network based on: the bounds on the output of the layer and
    its gradient, and the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    Lbound : (M,2) ndarray; M is the number of nodes in the layer
        Each row represents the min & max of that node.
    Lgradbound : (M,2) ndarray; Each row represents the min & max of the
        gradient for the layer.
    alpha : A float. 0 <= alpha <= 1. If alpha = 0, that represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    biasGradBound : (M,2) ndarray; each row represents the min & max of the
        gradient of the bias.
    reluPrimeBounds : (M,2) ndarray; the bounds on the derivative of the
        activation function at each node.
    """
    if alpha==1:  # if no activation function, the bound is just Lgradbound
        return Lgradbound, np.ones(shape=Lbound.shape)
    reluPrimeBounds = np.where(Lbound > 0, 1, alpha)
    biasGradBounds = Lgradbound[:,:,None]*(reluPrimeBounds[:,None,:])
    biasGradMin = np.min(biasGradBounds, axis=(1,2))
    biasGradMax = np.max(biasGradBounds, axis=(1,2))
    # First column are mins, second column are maxes
    biasGradBound = (
        np.concatenate((biasGradMin[:,None],biasGradMax[:,None]), axis=1)
        )
    return biasGradBound, reluPrimeBounds
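
# Hedged sketch (invented numbers): bound the bias gradient of a Leaky-ReLU
# layer whose pre-activation sign is uncertain for the first node.
def _example_backPropBiasGradBound():
    demo_layer_bd = np.array([[-0.5, 0.5],
                              [1.0, 2.0]])     # node bounds (first spans zero)
    demo_layer_g = np.array([[-1.1, -0.9],
                             [-1.1, -0.9]])    # layer-grad bounds
    # With alpha=0.2: first node bound is about [-1.1, -0.18],
    # second node bound is [-1.1, -0.9].
    bd, relu_prime = backPropBiasGradBound(demo_layer_bd, demo_layer_g, 0.2)
    return bd, relu_prime
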
def backPropWtsGradBound(Lbound, biasGradbound):
    """Computes the upper and lower bounds on the gradient of the weights of a
    layer of a network based on: the bounds on the output of the nodes of
    the previous layer, and the bounds on the gradient of the bias for the
    same layer
    Requires: numpy imported as np
    Parameters
    ----------
    Lbound : (N,2) ndarray; N is the number of nodes in the previous layer
        Each row represents the min & max of that node.
    biasGradbound : (M,2) ndarray; Each row represents the min & max of the
        gradient for the bias of this layer.
    Returns
    -------
    wtsGradBound : (N,M,2) ndarray; each row represents the min & max of the
        gradient of the weights.
    """
    wBounds = Lbound[:,None,:,None]*biasGradbound[None,:,None,:]
    wtsGradMin = (
        np.min(
            wBounds,
            axis=(2,3)
            )
        )
    wtsGradMax = (
        np.max(
            wBounds,
            axis=(2,3)
            )
        )
    # First column are mins, second column are maxes
    wtsGradBound = (
        np.concatenate((wtsGradMin[:,:,None],wtsGradMax[:,:,None]), axis=2)
        )
    return wtsGradBound
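
# Hedged sketch (invented numbers): bound the weight gradients from the bounds
# on the previous layer's outputs and on this layer's bias gradient.
def _example_backPropWtsGradBound():
    demo_prev_bd = np.array([[0.0, 2.0],
                             [-1.0, 1.0]])     # previous-layer node bounds
    demo_bias_g = np.array([[-1.1, -0.9]])     # bias-grad bounds for 1 node
    # Returns a (2, 1, 2) array, approximately [[[-2.2, 0.]], [[-1.1, 1.1]]].
    return backPropWtsGradBound(demo_prev_bd, demo_bias_g)
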
def computeBounds(inputBound, weights, bias, alphas,
                  outputGrad=None, compGP_g=True):
    """Computes the bounds on the values of the nodes of the discriminator of a
    GAN, and the bounds on the gradients of: the layers, the weights, and the
    biases. To compute this, it needs the bounds on the input, and the weights,
    biases, and activation function (alpha) of each layer of the discriminator.
    Parameters
    ----------
    inputBound : (N,2) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row represents the min & max of that node.
    weights : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a tuple of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alphas : a tuple of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    outputGrad : (optional) A tuple of bounds on the gradient of the loss with
        respect to the output, e.g. (-1.1, -0.9). The gradient should be -1 for
        the loss function of the improved WGAN without the gradient penalty
        term, but the bounds need to account for floating point arithmetic.
        If None, the default, then the bounds are set to (-1.0001, -0.9999).
    compGP_g : bool. If True, the default, then compute the bounds on the norm
        of the gradient of the gradient penalty term.
    Requires:
        1) all tuples should have the same length, L+1, where L is the number
           of hidden layers in the discriminator. Layer (L+1) is the output
        2) Array dimensions should match appropriately, as described above.
        3) the output layer should be a scalar, with no activation function,
           i.e. weights[L].shape[1]=1, bias[L].shape[0]=1, and alphas[L]=1,
           where index [L] refers to layer L+1, due to zero-indexing
    Returns
    -------
    bd_layer_out : a tuple of ndarrays, each of shape (N_{i+1}, 2)
        item i is the bounds on the (i+1)th layer of the network, which has
        N_{i+1} nodes. The [r,0] entry of array i is the min of node r in layer
        (i+1). The [r,1] entry is the corresponding max.
    bd_layer_g : a tuple of L+2 ndarrays, each of shape (N_i, 2)
        item i is the bounds on the gradient with respect to layer i
        (item 0 corresponds to the input layer).
        The [j,0] entry of array i is the min of the gradient of node j of
        layer i. The [j,1] entry is the corresponding max.
    bd_bias_g : a tuple of ndarrays, each of shape (N_{i+1}, 2)
        item i is the bounds on the gradient of the bias for layer (i+1).
        The [j,0] entry of array i is the min of the gradient of the bias for
        node j of layer (i+1). The [j,1] entry is the corresponding max.
    bd_wts_g : a tuple of ndarrays, each of shape (N_i, N_{i+1}, 2)
        item i is the bounds on the gradient of the weight matrix connecting
        layer i to layer (i+1). The [j,k,0] entry of array i is the min of the
        gradient of the weight connecting node j of layer i to node k of layer
        (i+1). The [j,k,1] entry of array i is the corresponding max.
    bd_wts_GP_gNORM : a tuple of floats. The bound on the norm of the gradient
        of the gradient penalty term, for each layer of the critic.
        Only returned if compGP_g is True.
    """
    numL = len(weights)  # number of layers, counting output, but not input
    if len(bias) != numL:
        raise ValueError("Numbers of weight matrices and biases do not match")
    if len(alphas) != numL:
        raise ValueError("Numbers of weight matrices and alphas do not match")
    if outputGrad is not None:
        if len(outputGrad) != 2:
            raise ValueError("outputGrad should have a length of 2")
        if (type(outputGrad[0])!=int and type(outputGrad[0])!=float):
            raise ValueError("outputGrad[0] should be int or float")
        if (type(outputGrad[1])!=int and type(outputGrad[1])!=float):
            raise ValueError("outputGrad[1] should be int or float")
    # Dimensions of weight matrices and biases
    wtDims = [w.shape for w in weights]
    biasDims = [b.shape for b in bias]
    if wtDims[0][0] != inputBound.shape[0]:
        msg = (f"First weight matrix dimension {wtDims[0][0]} "
               + f"does not match input shape {inputBound.shape[0]} ")
        raise ValueError(msg)
    for i in range(1, numL):
        if wtDims[i][0] != wtDims[i-1][1]:
            msg = f"weight matrix inner dimension mismatch at i={i}"
            raise ValueError(msg)
    for i in range(numL):
        if wtDims[i][1] != biasDims[i][0]:
            msg = f"bias dimension mismatch at i={i}"
            raise ValueError(msg)
    # Make a list containing the bounds on the nodes in the input layer
    bd_layer_out = [inputBound]
    # Forward prop to compute the bounds on the nodes in all other layers
    for i in range(0, numL):
        bd_layer_out.append(
            forwardPropLayerBound(inbound=bd_layer_out[i],
                                  weights=weights[i],
                                  bias=bias[i],
                                  alpha=alphas[i])
            )
    # Convert to tuple
    bd_layer_out = tuple(bd_layer_out)
    # Initialize lists of length numL, for bounds on gradients
    bd_layer_g = [None] * (numL+1)
    bd_bias_g = [None] * numL
    bd_wts_g = [None] * numL
    bd_relu_g = [None] * numL
    # Bound on grad for output layer: assumes (partial L / partial Y) = -1
    if outputGrad is None:
        bd_layer_g[numL] = np.array([[-1.0001,-0.9999]])
    else:
        bd_layer_g[numL] = np.array([[outputGrad[0],outputGrad[1]]])
    # Conduct backprop to compute the bounds on the gradients
    for i in range(numL, 0, -1):
        # grad of the bias for layer i
        bd_bias_g[i-1], bd_relu_g[i-1] = backPropBiasGradBound(bd_layer_out[i],
                                                               bd_layer_g[i],
                                                               alphas[i-1])
        # grad of weights connecting layer (i-1) to layer i
        bd_wts_g[i-1] = backPropWtsGradBound(bd_layer_out[i-1], bd_bias_g[i-1])
        # gradient of layer (i-1)
        bd_layer_g[i-1] = backPropLayerGradBound(bd_bias_g[i-1], weights[i-1])
    # Make them tuples
    bd_layer_g = tuple(bd_layer_g)
    bd_bias_g = tuple(bd_bias_g)
    bd_relu_g = tuple(bd_relu_g)
    bd_wts_g = tuple(bd_wts_g)
    if compGP_g:
        # Compute bound on NORM of gradient of gradient penalty ||grad of GP||
        weightMatrixNorms = [None] * (numL)
        reluPrimeNorms = [None] * (numL-1)
        bd_wts_GP_gNORM = [None] * (numL)
        gradDnorm_max = np.linalg.norm(np.max(np.abs(bd_layer_g[0]), axis=1))
        multiplier = 2.0 * np.maximum(gradDnorm_max - 1, 1)  # without lamda
        for i, mat in enumerate(weights):
            weightMatrixNorms[i] = np.linalg.norm(mat)  # Frobenius norm
        for i in range(numL-1):
            reluPrimeNorms[i] = np.linalg.norm(bd_relu_g[i][:,1])  # Euclidean norm
        productBound = (np.product(weightMatrixNorms)
                        * np.product(reluPrimeNorms)
                        * multiplier)
        for i in range(numL):
            bd_wts_GP_gNORM[i] = productBound/weightMatrixNorms[i]
        bd_wts_GP_gNORM = tuple(bd_wts_GP_gNORM)
        return bd_layer_out, bd_layer_g, bd_bias_g, bd_wts_g, bd_wts_GP_gNORM
    else:
        return bd_layer_out, bd_layer_g, bd_bias_g, bd_wts_g
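
# Hedged end-to-end sketch (invented toy network, not the practice data used
# below): bound every quantity for a critic with one Leaky-ReLU hidden layer
# (2 inputs -> 2 hidden -> 1 linear output).
def _example_computeBounds():
    demo_ib = np.array([[0.9, 1.1],
                        [-0.1, 0.1]])          # input bounds
    demo_wts = (np.array([[1.0, -1.0],
                          [0.5, 2.0]]),
                np.array([[1.0],
                          [-1.0]]))
    demo_bias = (np.zeros(2), np.zeros(1))
    demo_alphas = (0.2, 1)
    return computeBounds(demo_ib, demo_wts, demo_bias, demo_alphas,
                         compGP_g=True)
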
###############################################################################
#
# The next section has the functions that compute the actual gradients
# (instead of bounds) for specific values of inpt
#
###############################################################################

#######################################
# Compute the gradients
#######################################

def forwardPropLayer(inpt, weights, bias, alpha=0):
    """Computes the values of the nodes of a layer of a network
    based on: the previous layer, the weights and bias, and
    the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    inpt : (N,1) ndarray; N is the number of nodes in the input layer
        Each row represents the value of that node
    weights : (N,M) ndarray; M is the number of nodes in the next layer
    bias : (M,) ndarray
    alpha : A float. 0 <= alpha <= 1. The default is 0, which represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    output : (M,1) ndarray, each row represents the value of that node
    """
    output = weights.transpose() @ inpt + bias[:,None]
    # If applicable, apply (potentially Leaky) ReLU
    if alpha != 1:
        output = np.where(output > 0,
                          output,
                          alpha*output)
    return output
def backPropLayerGrad(biasGrad, weights):
    """Computes the gradient of a layer of a
    network based on: the gradient of the bias of the next
    layer, and the weights connecting the layer to the next layer
    Requires: numpy imported as np
    Parameters
    ----------
    biasGrad : (M,1) ndarray; Each row represents the value of the
        gradient for the bias of the next layer.
    weights : (N,M) ndarray; weight matrix connecting this layer to the next.
    Returns
    -------
    layerGrad : (N,1) ndarray; each row represents the value of the
        gradient of this layer.
    """
    layerGrad = weights @ biasGrad
    return layerGrad

def backPropBiasGrad(Lvals, Lgrad, alpha):
    """Computes the gradient of the bias of a
    layer of a network based on: the output of the layer and
    its gradient, and the activation function (ReLU, Leaky ReLU, or None)
    Requires: numpy imported as np
    Parameters
    ----------
    Lvals : (M,1) ndarray; M is the number of nodes in the layer
        Each row represents the value of that node.
    Lgrad : (M,1) ndarray; Each row represents the value of the
        gradient for the layer.
    alpha : A float. 0 <= alpha <= 1. If alpha = 0, that represents ReLU.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    Returns
    -------
    biasGrad : (M,1) ndarray; each row represents the value of the
        gradient of the bias.
    reluPrime : (M,1) ndarray; the derivative of the activation function at
        each node.
    """
    if alpha==1:  # if no activation function, the gradient is just Lgrad
        return Lgrad, np.ones(Lvals.shape)
    reluPrime = np.where(Lvals > 0, 1, alpha)
    biasGrad = Lgrad * reluPrime
    return biasGrad, reluPrime

def backPropWtsGrad(Lvals, biasGrad):
    """Computes the gradient of the weights of a layer of a network based on:
    the output of the nodes of the previous layer, and the
    gradient of the bias for the same layer
    Requires: numpy imported as np
    Parameters
    ----------
    Lvals : (N,1) ndarray; N is the number of nodes in the previous layer
        Each row represents the value of that node.
    biasGrad : (M,1) ndarray; the gradient for the bias of this layer.
    Returns
    -------
    wtsGrad : (N,M) ndarray; the gradient of the weights.
    """
    wtsGrad = Lvals @ biasGrad.transpose()
    return wtsGrad
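
# Hedged sketch (invented numbers): for a single linear output layer, the
# weight gradient of the loss term -D(x) is the outer product of the inputs
# with the scalar bias gradient, i.e. -x, which backPropWtsGrad reproduces.
def _example_backPropWtsGrad():
    demo_x = np.array([[2.0],
                       [-3.0]])                # previous-layer values (N=2, 1)
    demo_bias_g = np.array([[-1.0]])           # bias grad of the output (M=1)
    # Returns [[-2.], [3.]], i.e. -demo_x.
    return backPropWtsGrad(demo_x, demo_bias_g)
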
def computeGrads(inpt, weights, bias, alphas, compGP_g=True):
    """Computes the values of the nodes of the discriminator of a WGAN, and
    the gradients of: the layers, the weights, and the biases. To compute this,
    it needs the input, the weights, the biases, and the activation functions
    (alpha) of each layer of the discriminator.
    Parameters
    ----------
    inpt : (N,1) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row represents the value of that node.
    weights : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a tuple of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alphas : a tuple of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    compGP_g : bool. If True, the default, then compute the gradient of the
        gradient penalty term.
    Requires:
        1) all tuples should have the same length, L+1, where L is the number
           of hidden layers in the discriminator. Layer (L+1) is the output
        2) Array dimensions should match appropriately, as described above.
        3) the output layer should be a scalar, with no activation function,
           i.e. weights[L].shape[1]=1, bias[L].shape[0]=1, and alphas[L]=1,
           where [L] represents layer L+1, due to zero-indexing
    Returns
    -------
    layer_out : a tuple of ndarrays, each of shape (N_{i+1}, 1)
        item i is the values of the (i+1)th layer of the network, which has
        N_{i+1} nodes. The [r,0] entry of array i is the value of node r.
    layer_g : a tuple of L+2 ndarrays, each of shape (N_i, 1)
        item i is the gradient with respect to layer i
        (item 0 corresponds to the input layer).
    bias_g : a tuple of ndarrays, each of shape (N_{i+1}, 1)
        item i is the gradient of the bias for layer (i+1).
    wts_g : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the gradient of the weight matrix connecting
        layer i to layer (i+1), for the loss function without the GP term.
    wts_GP_g : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the gradient of the weight matrix connecting
        layer i to layer (i+1), for the GP term of the loss function.
        Only returned if compGP_g is True.
    """
    numL = len(weights)  # number of layers, counting output, but not input
                         # = L + 1, where L is the number of hidden layers
    if len(bias) != numL:
        raise ValueError("Numbers of weight matrices and biases do not match")
    if len(alphas) != numL:
        raise ValueError("Numbers of weight matrices and alphas do not match")
    # Dimensions of weight matrices and biases
    wtDims = [w.shape for w in weights]
    biasDims = [b.shape for b in bias]
    if wtDims[0][0] != inpt.shape[0]:
        raise ValueError(f"First weight matrix dimension {wtDims[0][0]} "
                         + f"does not match input {inpt.shape[0]}")
    for i in range(1, numL):
        if wtDims[i][0] != wtDims[i-1][1]:
            msg = f"weight matrix inner dimension mismatch at i={i}"
            raise ValueError(msg)
    for i in range(numL):
        if wtDims[i][1] != biasDims[i][0]:
            msg = f"bias dimension mismatch at i={i}"
            raise ValueError(msg)
    # Make a list containing the values of input nodes
    layer_out = [inpt]
    # Forward prop to compute the values of the nodes in all other layers
    for i in range(0, numL):
        layer_out.append(
            forwardPropLayer(inpt=layer_out[i],
                             weights=weights[i],
                             bias=bias[i],
                             alpha=alphas[i])
            )
    # Convert to tuple
    layer_out = tuple(layer_out)
    # Initialize lists of length numL, for values of gradients
    layer_g = [None] * (numL+1)
    bias_g = [None] * numL
    relu_g = [None] * numL
    wts_g = [None] * numL
    # value of grad (of real term) for the output layer:
    layer_g[numL] = np.array([[-1.00000]], dtype=np.float32)
    # Conduct backprop to compute the gradients
    for i in range(numL, 0, -1):
        # grad of the bias for layer i
        bias_g[i-1], relu_g[i-1] = backPropBiasGrad(layer_out[i],
                                                    layer_g[i],
                                                    alphas[i-1])
        # grad of weights connecting layer (i-1) to layer i
        wts_g[i-1] = backPropWtsGrad(layer_out[i-1], bias_g[i-1])
        # gradient of layer (i-1)
        layer_g[i-1] = backPropLayerGrad(bias_g[i-1], weights[i-1])
    # Make them tuples
    layer_g = tuple(layer_g)
    bias_g = tuple(bias_g)
    relu_g = tuple(relu_g)
    wts_g = tuple(wts_g)
    if compGP_g:
        # Compute gradient of gradient penalty (grad of GP)
        gradDnorm = np.linalg.norm(layer_g[0][:,0])
        Dlist = [None] * (numL)  # left factor of GP, with the gradD term
        Rlist = [None] * (numL)  # R for "right", since it gets multiplied on right
        wts_GP_g = [None] * (numL)
        Dlist[0] = -2.0*(gradDnorm - 1.0)*layer_g[0]/gradDnorm  # not * by lamda
        Rlist[0] = np.ones((1,1), dtype=np.float32)
        for i in range(1, numL):
            Dlist[i] = (weights[i-1].transpose() @ Dlist[i-1]) * relu_g[i-1]
            Rlist[i] = relu_g[numL-1-i] * (weights[numL-i] @ Rlist[i-1])
        wts_GP_g[numL - 1] = Dlist[numL - 1]
        for i in range(numL - 1):
            wts_GP_g[i] = Dlist[i] @ np.transpose(Rlist[numL-1-i])
        return layer_out, layer_g, bias_g, wts_g, wts_GP_g
    else:
        return layer_out, layer_g, bias_g, wts_g
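
# Hedged sketch (invented toy input): run computeGrads on the same toy critic
# used in _example_computeBounds, at a point inside its input box. By the
# module's construction, the gradients returned here are expected to fall
# inside the corresponding bounds from computeBounds.
def _example_computeGrads():
    demo_x = np.array([[1.0],
                       [0.0]])                 # a point inside the input bounds
    demo_wts = (np.array([[1.0, -1.0],
                          [0.5, 2.0]]),
                np.array([[1.0],
                          [-1.0]]))
    demo_bias = (np.zeros(2), np.zeros(1))
    demo_alphas = (0.2, 1)
    layer_out, layer_g, bias_g, wts_g, wts_GP_g = computeGrads(
        demo_x, demo_wts, demo_bias, demo_alphas, compGP_g=True)
    return wts_g, wts_GP_g
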
###############################################################################
#
# The next section has practice data used to check functionality
#
###############################################################################

#######################################
# Practice data
#######################################

def practice_data(eps=0.01, layers=1, random=False, max_nodes=10):
    """Generate practice data to test the computed gradients and bounds
    Requires: numpy imported as np
    Parameters
    ----------
    eps : float. Half of the width of the allowed input (for each node)
    layers : int. The number of layers for the unit test
    random : bool. Uses fixed values if False, and random values if True
    max_nodes : int. The maximum number of nodes to use for each layer
    Returns : sample data for unit test
    -------
    ib : (N,2) ndarray; N is the number of nodes in the input layer,
        which is layer 0. Each row represents the min & max of that node.
    wts : a tuple of ndarrays, each of shape (N_i, N_{i+1})
        item i is the weight matrix connecting layer i to layer (i+1)
    bias : a tuple of ndarrays, each of shape (N_{i+1},)
        item i is the bias for layer (i+1)
    alpha : a tuple of floats, each satisfying 0 <= alpha <= 1
        item i indicates the activation function for layer (i+1)
        If alpha = 0, that represents ReLU for that layer.
        If 0 < alpha < 1, that represents the parameter for Leaky ReLU.
        If alpha = 1, that represents no activation function in that layer.
    """
    if type(layers) != int or layers < 1:
        raise ValueError("layers should be an int >= 1")
    if random:
        sigma = np.random.choice([1e-3,1e-2,1e-1,1])
        #sigma = np.random.choice([1e-3,1e-2,1e-1])
        sizes = tuple(np.random.randint(2, max_nodes+1, size=layers))
        ib_size = (np.random.choice(max_nodes)+1,)
        ib_center = np.random.uniform(low=eps, high=1-eps, size=ib_size)
        ib = np.transpose([ib_center-eps, ib_center+eps])
        bias_sizes = sizes+(1,)
        wts_dim0 = ib_size+sizes
        alpha = tuple(np.random.choice([0.1,0.2,0.3,0.5], layers))+(1,)
        #alpha = tuple(np.random.choice([0.2], layers))+(1,)
        bias = [None]*(layers+1)
        wts = [None]*(layers+1)
        for i in range(layers+1):
            bias[i] = np.random.normal(0, sigma, bias_sizes[i])
            wts[i] = np.random.normal(0, sigma, (wts_dim0[i], bias_sizes[i]))
    else:
        if layers == 1:  # practice data for one hidden layer
            # input bounds
            ib = np.array([[-1-eps,-1+eps],[2-eps,2+eps],
                           [-2-eps,-2+eps],[-3-eps,-3+eps]],
                          dtype=np.float32)
            # weights for the network
            wts = (np.array([[-1, 2, -2],[ -7, 5, 1],
                             [ 2, -2, 3],[-3, 4, 10]],
                            dtype=np.float32),
                   np.array([[-1], [2], [-2]],
                            dtype=np.float32)
                   )
            # bias for the network
            bias = (np.array([-100,0,100], dtype=np.float32),
                    np.array([0], dtype=np.float32))
            # alphas for the network
            alpha = (0.2,1)
        elif layers == 2:  # practice data for two hidden layers
            # input bounds
            ib = np.array([[-1-eps,-1+eps],[2-eps,2+eps],
                           [-2-eps,-2+eps],[-3-eps,-3+eps]],
                          dtype=np.float32)
            # weights for the network
            wts = (np.array([[-1, 2, -2],[ -7, 5, 1],
                             [ 2, -2, 3],[-3, 4, 10]],
                            dtype=np.float32),
                   np.array([[-1,3], [2,-4], [-2,5]],
                            dtype=np.float32),
                   np.array([[8], [-9]],
                            dtype=np.float32)
                   )
            # bias for the network
            bias = (np.array([-97,0,101], dtype=np.float32),
                    np.array([-11,13], dtype=np.float32),
                    np.array([0], dtype=np.float32))
            # alphas for the network
            alpha = (0.2,0.4,1.)
        else:  # practice data for three hidden layers
            # input bounds
            ib = np.array([[-1-eps,-1+eps],[2-eps,2+eps],
                           [-2-eps,-2+eps],[-3-eps,-3+eps]],
                          dtype=np.float32)
            # weights for the network
            wts = (np.array([[-1, 2, -2],[ -7, 5, 1],
                             [ 2, -2, 3],[-3, 4, 10]],
                            dtype=np.float32),
                   np.array([[-1,3], [2,-4], [-2,5]],
                            dtype=np.float32),
                   np.array([[8,1,-3,6,-2], [-9,0,3,-4,5]],
                            dtype=np.float32),
                   np.array([[-7],[4],[-5],[2],[-3]],
                            dtype=np.float32)
                   )
            # bias for the network
            bias = (np.array([-97,0,101], dtype=np.float32),
                    np.array([-11,13], dtype=np.float32),
                    np.array([-2,3,-4,5,-6], dtype=np.float32),
                    np.array([0], dtype=np.float32))
            # alphas for the network
            alpha = (0.2,0.4,0.25,1.)
    return ib, wts, bias, alpha
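
# Hedged sketch: build the fixed two-hidden-layer practice network and pass it
# straight to computeBounds (no TensorFlow is needed for this path).
def _example_practice_data():
    ib, wts, bias, alpha = practice_data(eps=0.01, layers=2, random=False)
    return computeBounds(ib, wts, bias, alpha, compGP_g=True)
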
def setup(eps=0.01, layers=1, random=False, max_nodes=10):
    """Setup for unit test. Uses data from practice_data() to create a
    TF graph of a WGAN-GP, compute the gradients using TensorFlow, and compute
    the bounds on the grads using computeBounds() and the actual grads using
    computeGrads(). Outputs everything as a dictionary
    Requires:
        numpy imported as np
        tensorflow imported as tf
    Parameters
    ----------
    eps : float. Half of the width of the allowed input (for each node)
    layers : int. The number of layers for the unit test
    random : bool. Uses fixed values if False, and random values if True
    max_nodes : int. The maximum number of nodes to use for each layer
    Returns
    -------
    dump_dict : a dictionary containing the gradients and bounds
    """
    #######################################
    # Generate data to test bounds
    #######################################
    ib, wts, bias, alpha = practice_data(eps=eps,
                                         layers=layers,
                                         random=random,
                                         max_nodes=max_nodes)
    #######################################
    # Tensorflow graph
    #######################################
    batch_size = 1
    train_data = ((ib[:,1:]+ib[:,:1])/2).transpose()
    # start with a fresh graph
    tf.reset_default_graph()
    # create the placeholder for real data
    real_data = tf.placeholder(
        tf.float32,
        shape=[batch_size, ib.shape[0]],
        name="RealData")
    # create a noise data set
    fake_data = tf.random_uniform([batch_size, ib.shape[0]], 0., 1.)
    def tfcritic(x):
        layer = 1
        for w, b, a in zip(wts, bias, alpha):
            x = tf.contrib.layers.fully_connected(
                x,
                w.shape[1],
                activation_fn=lambda z: tf.nn.leaky_relu(z, alpha=a),
                scope=f'critic.{layer}',
                reuse=tf.AUTO_REUSE,
                weights_initializer=tf.constant_initializer(w),
                biases_initializer=tf.constant_initializer(b)
                )
            layer += 1
        return x
    crit_real = tfcritic(real_data)
    crit_fake = tfcritic(fake_data)
    # points between real and fake (line 6 of the WGAN-GP algorithm)
    u = tf.random_uniform(shape=[batch_size, 1], minval=0, maxval=1)
    interpolates = (u * real_data) + (1 - u)*fake_data
    # compute gradients of the discriminator at the interpolates
    crit_interp = tfcritic(interpolates)
    gradients = tf.gradients(crit_interp, [interpolates])[0]
    # calculate the 2-norm of the gradients
    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients),
                                   reduction_indices=[1]))
    # subtract 1, square
    crit_gp = (slopes - 1.)**2
    lamda = 10
    crit_gp_loss = lamda*tf.reduce_mean(crit_gp)
    crit_main_loss = tf.reduce_mean(crit_fake) - tf.reduce_mean(crit_real)
    crit_real_loss = - tf.reduce_mean(crit_real)
    crit_loss = crit_main_loss + crit_gp_loss
    # get TF trainable params that have 'critic' in name:
    crit_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                   if 'critic' in v.name]
    # compute the gradients for those params:
    critOpt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
    # for crit_real_loss
    crit_main_grads = critOpt.compute_gradients(crit_main_loss,
                                                var_list=crit_params)
    crit_real_grads = critOpt.compute_gradients(crit_real_loss,
                                                var_list=crit_params)
    crit_gp_grads = critOpt.compute_gradients(crit_gp_loss,
                                              var_list=crit_params)
    crit_grads = critOpt.compute_gradients(crit_loss, var_list=crit_params)
    crit_train_op = critOpt.apply_gradients(crit_grads)
    critic_wts = [v for v in crit_params if 'weights' in v.name]
    critic_bias = [v for v in crit_params if 'biases' in v.name]
    ib_tensor = tf.constant(ib)
    b1 = computeBounds(inputBound=ib,
                       weights=wts,
                       bias=bias,
                       alphas=alpha,
                       compGP_g=True)
    # the real grads
    g1 = computeGrads(inpt=train_data.transpose(),
                      weights=wts,
                      bias=bias,
                      alphas=alpha,
                      compGP_g=False)
    # evaluate grads in session
    with tf.Session() as session:
        # initialize variables
        session.run(tf.global_variables_initializer())
        if printon:
            print('Starting tensorflow session')
            myvars = tf.trainable_variables()
            print("\nTensorFlow trainable parameters:")
            for myvar in myvars:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
            print("\ncritic_wts:")
            for myvar in critic_wts:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
            print("\ncritic_bias:")
            for myvar in critic_bias:
                print(myvar.name)
                print(' shape =', myvar.get_shape())
        (f, interps, g, cmg, crg, cg) = session.run(
            [fake_data,
             interpolates,
             gradients,
             crit_main_grads,
             crit_real_grads,
             crit_grads],
            feed_dict={real_data: train_data})
    # the grads at the interpolates (at which the grad of GP is computed)
    gp1 = computeGrads(inpt=interps.transpose(),
                       weights=wts,
                       bias=bias,
                       alphas=alpha,
                       compGP_g=True)
    dump_dict = {
        'ib' : ib,
        'wts' : wts,
        'bias' : bias,
        'alpha' : alpha,
        'fake_data' : f,
        'interpolates' : interps,
        'gradients' : g,
        'crit_main_grads' : cmg,
        'crit_real_grads' : crg,
        'crit_grads' : cg,
        'bds' : b1,
        'comp_grads' : g1,
        'comp_gp' : gp1
        }
    return dump_dict
def test(d, lamda=10):
    """Tests the gradients and bounds in the dictionary returned by setup()
    against the gradients computed by TensorFlow
    Requires:
        numpy imported as np
        tensorflow imported as tf
    Parameters
    ----------
    d : dictionary. The return from setup()
    lamda : int or float. The GP multiplier in the WGAN-GP loss function
    Returns
    -------
    check_dict : a dictionary containing the results of the test. In particular
        check_dict['overallQ'] is True if all gradients were in the computed
        bounds and check_dict['overallD'] shows the maximum relative error
        between the grads computed by this module and those computed by TF
    """
    layers = len(d['bds'][3])
    if len(d['crit_real_grads']) != 2*layers:
        raise ValueError("Numbers of layers do not match")
    wtQ = []
    biasQ = []
    wtgpQ = []
    biasgpQ = []
    allQ = []
    layerQ = []
    wtD = []
    biasD = []
    wtgpD = []
    biasgpD = []
    maxD = []
    # 0=bd_layer_out, 1=bd_layer_g, 2=bd_bias_g, 3=bd_wts_g, 4=bd_wts_GP_gNORM
    for i in range(layers):
        wtQ.append(
            (d['bds'][3][i][:,:,0] <= d['crit_real_grads'][2*i][0],
             d['bds'][3][i][:,:,1] >= d['crit_real_grads'][2*i][0]))
        biasQ.append(
            (d['bds'][2][i][:,0] <= d['crit_real_grads'][2*i+1][0],
             d['bds'][2][i][:,1] >= d['crit_real_grads'][2*i+1][0]))
        wtgpQ.append(
            np.linalg.norm(
                d['crit_grads'][2*i][0]-d['crit_main_grads'][2*i][0]
                ) <= lamda * d['bds'][4][i])
        biasgpQ.append(
            np.linalg.norm(
                d['crit_grads'][2*i+1][0]-d['crit_main_grads'][2*i+1][0]
                ) <= 1e-9)  # grad of gp wrt bias should be zero
        allQ.append(
            (wtQ[-1][0].all() & wtQ[-1][1].all(),
             biasQ[-1][0].all() & biasQ[-1][1].all(),
             wtgpQ[-1],
             biasgpQ[-1]))
        layerQ.append(all(allQ[-1]))
        wtD.append(
            np.linalg.norm(
                d['comp_grads'][3][i] - d['crit_real_grads'][2*i][0]) /
            np.linalg.norm(d['crit_real_grads'][2*i][0]))
        biasD.append(
            np.linalg.norm(
                d['comp_grads'][2][i][:,0] - d['crit_real_grads'][2*i+1][0]) /
            np.linalg.norm(d['crit_real_grads'][2*i+1][0]))
        wtgpD.append(
            np.linalg.norm(
                d['crit_grads'][2*i][0] - d['crit_main_grads'][2*i][0]
                - lamda * d['comp_gp'][4][i]) /
            np.linalg.norm(
                d['crit_grads'][2*i][0] - d['crit_main_grads'][2*i][0]))
        biasgpD.append(
            np.linalg.norm(
                d['crit_grads'][2*i+1][0]-d['crit_main_grads'][2*i+1][0]))
        maxD.append(np.max([wtD[-1], biasD[-1], wtgpD[-1], biasgpD[-1]]))
    check_dict = {
        'wtQ' : wtQ,
        'biasQ' : biasQ,
        'wtgpQ' : wtgpQ,
        'biasgpQ' : biasgpQ,
        'allQ' : allQ,
        'layerQ' : layerQ,
        'overallQ' : all(layerQ),
        'wtD' : wtD,
        'biasD' : biasD,
        'wtgpD' : wtgpD,
        'biasgpD' : biasgpD,
        'maxD' : maxD,
        'overallD' : max(maxD)
        }
    return check_dict
print('np_wgan_bds loaded')
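
# Hedged driver sketch (added for illustration; the original module stops at
# the print above). Running the unit test end-to-end builds a TF graph, so it
# requires a working tensorflow 1.15 install.
if __name__ == '__main__':
    result = test(setup(eps=0.01, layers=2, random=False))
    print('all gradients inside bounds:', result['overallQ'])
    print('max relative error vs TensorFlow:', result['overallD'])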