# nnAA.py

# -*- coding: utf-8 -*-
"""
Implementation of a modification of the 'nearest neighbor adversarial accuracy'
introduced by Yale et al. (2019) in "Privacy Preserving Synthetic Health Data"

nnAA() computes a deterministic version of the nearest neighbor adversarial
accuracy between two data sets, modified to be unbiased so that if the data sets
were drawn from the same distribution, the expected value of nnAA would be 0.5.

written by Joseph Pedersen
version 20210314
"""

import numpy as np
from scipy.spatial import distance_matrix as dmat
from numpy.linalg import norm as norm

# -------------------------------------------------
# definition of nearest neighbor adversarial accuracy between data sets T & S

def nnAA(T, S, p=2, threshold=1000000, dTTnn=None, dSSnn=None):
    """
    Compute the nearest neighbor adversarial accuracy between two data sets.

    The deterministic score is evaluated on both sets in their entirety.
    For every record, the distances to its two nearest neighbors in the
    *other* set are compared with the distance to its leave-one-out nearest
    neighbor in its *own* set. A record scores 1 when both cross-set
    neighbors are farther away, 1/n when exactly one is, and 0 otherwise.
    Ties are handled by averaging the score obtained with ``>=`` and the
    score obtained with ``>``.

    Parameters
    ----------
    T : (n,d) ndarray of 'true' data
    S : (n,d) ndarray of 'synthetic' data. T and S must have the same shape.
    p : int in 1..10, the string 'inf', or a float infinity. optional.
        The default is 2. The p-norm to use for determining nearest neighbors.
        Other numeric values are truncated with int().
    threshold : int > 0, optional. The default is 1000000.
        If (n**2)*d > threshold, scipy.spatial.distance_matrix uses a Python
        loop instead of large temporary arrays.
    dTTnn : optional. (n,1) ndarray of the distances from each record of T to
        its leave-one-out nearest neighbor in T. Computed when omitted.
    dSSnn : optional. (n,1) ndarray of the distances from each record of S to
        its leave-one-out nearest neighbor in S. Computed when omitted.

    Returns
    -------
    np.array([T_isotration, S_isotration])
    dTTnn (n,1) ndarray of the distances to leave-one-out NN
    dSSnn (n,1) ndarray of the distances to leave-one-out NN
    dTSnn (n,2) ndarray of the distances from T to 2 NN in S
    dSTnn (n,2) ndarray of the distances from S to 2 NN in T

    The nearest neighbor adversarial accuracy is the mean of the isotrations,
    but both are returned for additional information.

    Raises
    ------
    ValueError
        If T, S, dTTnn or dSSnn has no usable ``shape``, if p or threshold
        cannot be converted to int, or if the sets have fewer than 2 records.
    AssertionError
        If a shape requirement or a range requirement on p / threshold fails.
    """

    # Anything without an indexable .shape is rejected up front.
    for name, data in (("T", T), ("S", S)):
        try:
            n = data.shape[0]  # number of records
        except (AttributeError, IndexError, TypeError) as err:
            raise ValueError(f"{name} must be a NumPy ndarray") from err

    assert len(T.shape) == 2, "T must be a 2d ndarray"
    assert len(S.shape) == 2, "S must be a 2d ndarray"
    assert T.shape ==S.shape, "Dimensions of T and S must match"

    # Precomputed leave-one-out distances are optional. They must be tested
    # with "is None": the truth value of a multi-element ndarray is ambiguous
    # and raises ValueError.
    for name, column, owner in (("dTTnn", dTTnn, "T"), ("dSSnn", dSSnn, "S")):
        if column is None:
            continue
        try:
            rows = column.shape[0]
        except (AttributeError, IndexError, TypeError) as err:
            raise ValueError(f"{name} must be a NumPy ndarray") from err
        is_column = len(column.shape) == 2 and column.shape[1] == 1
        assert is_column, f"{name} should be a column"
        assert n == rows, f"Dimensions of {owner} and {name} must match"

    wants_inf = (
        (isinstance(p, str) and p.lower() == 'inf')
        or (isinstance(p, float) and p == np.inf)
    )
    if wants_inf:
        p = np.inf
    else:
        try:
            p = int(p)
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError("p should be an integer or 'inf'") from err

        assert p > 0, "p must be a positive integer or 'inf'."
        assert p < 11, "Using p > 10 could cause overflow. Use p < 11 or 'inf'"

    try:
        threshold = int(threshold)
    except (TypeError, ValueError, OverflowError) as err:
        raise ValueError("threshold must be a positive integer") from err

    assert threshold > 0, "threshold must be a positive integer"

    # Two nearest neighbors are needed per record, so n must be at least 2.
    if n < 2:
        raise ValueError("T and S must each contain at least 2 records")

    # Distances within T
    if dTTnn is None:
        dTT = dmat(T, T, p, threshold)  # dTT_ij = || T_i - T_j ||
        # After partitioning, column 0 holds the smallest entry of each row
        # (the zero self-distance) and column 1 the leave-one-out NN distance.
        dTTnn = np.partition(dTT, kth=1)[:, 1, None]

    # Distances within S
    if dSSnn is None:
        dSS = dmat(S, S, p, threshold)  # dSS_ij = || S_i - S_j ||
        dSSnn = np.partition(dSS, kth=1)[:, 1, None]

    # Distances between T and S
    dTS = dmat(T, S, p, threshold)  # dTS_ij = || T_i - S_j ||

    # distances from each t_i to its two nearest neighbors in S
    dTSnn = np.partition(dTS, kth=1)[:, :2]

    # distances from each s_j to its two nearest neighbors in T
    dSTnn = np.partition(dTS, kth=1, axis=0).transpose()[:, :2]

    # Score for 0, 1 or 2 cross-set neighbors being farther than the
    # leave-one-out neighbor.
    weights = (0, 1/n, 1)

    def _fit(d_cross, d_own):
        # d_own is an (n,1) column and broadcasts against the (n,2) array.
        with_ties = np.choose(np.sum(d_cross >= d_own, axis=1), weights)
        strict = np.choose(np.sum(d_cross > d_own, axis=1), weights)
        return (with_ties + strict)/2

    nnAA_real = _fit(dTSnn, dTTnn)    # fit at each real data point t_i
    nnAA_synth = _fit(dSTnn, dSSnn)   # fit at each synthetic data point s_i

    nnAA_arr = np.column_stack((nnAA_real, nnAA_synth))

    return np.mean(nnAA_arr, axis=0), dTTnn, dSSnn, dTSnn, dSTnn
	# -- coding: utf-8 --
	"""
	Implementation of a modification of the 'nearest neighbor adversarial accuracy'
	introduced by Yale et al. (2019) in "Privacy Preserving Synthetic Health Data"

	nnAA() computes a deterministic version of the nearest neighbor adversarial
	accuracy between two data sets, modified to be unbiased so that if the data sets
	were drawn from the same distribution, the expected value of nnAA would be 0.5.

	written by Joseph Pedersen
	version 20210314
	"""

	import numpy as np
	from scipy.spatial import distance_matrix as dmat
	from numpy.linalg import norm as norm

	# -------------------------------------------------
	# definition of nearest neighbor adversarial accuracy between data sets T & S

	def nnAA(T, S, p=2, threshold=1000000, dTTnn=None, dSSnn=None):
		"""
		Compute the nearest neighbor adversarial accuracy between two data sets.

		The deterministic score is evaluated on both sets in their entirety.
		For every record, the distances to its two nearest neighbors in the
		*other* set are compared with the distance to its leave-one-out nearest
		neighbor in its *own* set. A record scores 1 when both cross-set
		neighbors are farther away, 1/n when exactly one is, and 0 otherwise.
		Ties are handled by averaging the score obtained with ``>=`` and the
		score obtained with ``>``.

		Parameters
		----------
		T : (n,d) ndarray of 'true' data
		S : (n,d) ndarray of 'synthetic' data. T and S must have the same shape.
		p : int in 1..10, the string 'inf', or a float infinity. optional.
			The default is 2. The p-norm to use for determining nearest neighbors.
			Other numeric values are truncated with int().
		threshold : int > 0, optional. The default is 1000000.
			If (n**2)*d > threshold, scipy.spatial.distance_matrix uses a Python
			loop instead of large temporary arrays.
		dTTnn : optional. (n,1) ndarray of the distances from each record of T to
			its leave-one-out nearest neighbor in T. Computed when omitted.
		dSSnn : optional. (n,1) ndarray of the distances from each record of S to
			its leave-one-out nearest neighbor in S. Computed when omitted.

		Returns
		-------
		np.array([T_isotration, S_isotration])
		dTTnn (n,1) ndarray of the distances to leave-one-out NN
		dSSnn (n,1) ndarray of the distances to leave-one-out NN
		dTSnn (n,2) ndarray of the distances from T to 2 NN in S
		dSTnn (n,2) ndarray of the distances from S to 2 NN in T

		The nearest neighbor adversarial accuracy is the mean of the isotrations,
		but both are returned for additional information.

		Raises
		------
		ValueError
			If T, S, dTTnn or dSSnn has no usable ``shape``, if p or threshold
			cannot be converted to int, or if the sets have fewer than 2 records.
		AssertionError
			If a shape requirement or a range requirement on p / threshold fails.
		"""

		# Anything without an indexable .shape is rejected up front.
		for name, data in (("T", T), ("S", S)):
			try:
				n = data.shape[0]  # number of records
			except (AttributeError, IndexError, TypeError) as err:
				raise ValueError(f"{name} must be a NumPy ndarray") from err

		assert len(T.shape) == 2, "T must be a 2d ndarray"
		assert len(S.shape) == 2, "S must be a 2d ndarray"
		assert T.shape ==S.shape, "Dimensions of T and S must match"

		# Precomputed leave-one-out distances are optional. They must be tested
		# with "is None": the truth value of a multi-element ndarray is ambiguous
		# and raises ValueError.
		for name, column, owner in (("dTTnn", dTTnn, "T"), ("dSSnn", dSSnn, "S")):
			if column is None:
				continue
			try:
				rows = column.shape[0]
			except (AttributeError, IndexError, TypeError) as err:
				raise ValueError(f"{name} must be a NumPy ndarray") from err
			is_column = len(column.shape) == 2 and column.shape[1] == 1
			assert is_column, f"{name} should be a column"
			assert n == rows, f"Dimensions of {owner} and {name} must match"

		wants_inf = (
			(isinstance(p, str) and p.lower() == 'inf')
			or (isinstance(p, float) and p == np.inf)
		)
		if wants_inf:
			p = np.inf
		else:
			try:
				p = int(p)
			except (TypeError, ValueError, OverflowError) as err:
				raise ValueError("p should be an integer or 'inf'") from err

			assert p > 0, "p must be a positive integer or 'inf'."
			assert p < 11, "Using p > 10 could cause overflow. Use p < 11 or 'inf'"

		try:
			threshold = int(threshold)
		except (TypeError, ValueError, OverflowError) as err:
			raise ValueError("threshold must be a positive integer") from err

		assert threshold > 0, "threshold must be a positive integer"

		# Two nearest neighbors are needed per record, so n must be at least 2.
		if n < 2:
			raise ValueError("T and S must each contain at least 2 records")

		# Distances within T
		if dTTnn is None:
			dTT = dmat(T, T, p, threshold)  # dTT_ij = || T_i - T_j ||
			# After partitioning, column 0 holds the smallest entry of each row
			# (the zero self-distance) and column 1 the leave-one-out NN distance.
			dTTnn = np.partition(dTT, kth=1)[:, 1, None]

		# Distances within S
		if dSSnn is None:
			dSS = dmat(S, S, p, threshold)  # dSS_ij = || S_i - S_j ||
			dSSnn = np.partition(dSS, kth=1)[:, 1, None]

		# Distances between T and S
		dTS = dmat(T, S, p, threshold)  # dTS_ij = || T_i - S_j ||

		# distances from each t_i to its two nearest neighbors in S
		dTSnn = np.partition(dTS, kth=1)[:, :2]

		# distances from each s_j to its two nearest neighbors in T
		dSTnn = np.partition(dTS, kth=1, axis=0).transpose()[:, :2]

		# Score for 0, 1 or 2 cross-set neighbors being farther than the
		# leave-one-out neighbor.
		weights = (0, 1/n, 1)

		def _fit(d_cross, d_own):
			# d_own is an (n,1) column and broadcasts against the (n,2) array.
			with_ties = np.choose(np.sum(d_cross >= d_own, axis=1), weights)
			strict = np.choose(np.sum(d_cross > d_own, axis=1), weights)
			return (with_ties + strict)/2

		nnAA_real = _fit(dTSnn, dTTnn)    # fit at each real data point t_i
		nnAA_synth = _fit(dSTnn, dSSnn)   # fit at each synthetic data point s_i

		nnAA_arr = np.column_stack((nnAA_real, nnAA_synth))

		return np.mean(nnAA_arr, axis=0), dTTnn, dSSnn, dTSnn, dSTnn