Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# -*- coding: utf-8 -*-
"""
Implementation of a modification of the 'nearest neighbor adversarial accuracy'
introduced by Yale et al. (2019) in "Privacy Preserving Synthetic Health Data".
nnAA() computes a deterministic version of the nearest neighbor adversarial
accuracy between two data sets, modified to be unbiased: if the two data sets
were drawn from the same distribution, the expected value of nnAA would be 0.5.
written by Joseph Pedersen
version 20210314
"""
import numpy as np
from scipy.spatial import distance_matrix as dmat
from numpy.linalg import norm as norm
# -------------------------------------------------
# definition of nearest neighbor adversarial accuracy between data sets T & S
def nnAA(T, S, p=2, threshold=1000000, dTTnn=None, dSSnn=None):
    """
    Computes the nearest neighbor adversarial accuracy between two data sets,
    by computing the deterministic function on both sets (in their entirety).

    Parameters
    ----------
    T : (n,d) ndarray of 'true' data, with n >= 2.
    S : (n,d) ndarray of 'synthetic' data. T and S must have the same shape.
    p : int in 1..10, or the string 'inf' (case-insensitive), or np.inf.
        optional. The default is 2.
        The p-norm to use for determining nearest neighbors. Values other
        than 'inf'/np.inf are coerced with int().
    threshold : int > 0, optional. The default is 1000000.
        If (n**2)*d > threshold, scipy.spatial.distance_matrix uses a Python
        loop instead of large temporary arrays.
    dTTnn : optional. (n,1) ndarray of the distances from each record of T
        to its leave-one-out nearest neighbor in T, as returned by a
        previous call. When given, the T-to-T distances are not recomputed.
    dSSnn : optional. (n,1) ndarray of the distances from each record of S
        to its leave-one-out nearest neighbor in S, as returned by a
        previous call. When given, the S-to-S distances are not recomputed.

    Returns
    -------
    A 5-tuple:
    scores : (2,) ndarray holding the mean per-record score over T and the
        mean per-record score over S, in that order.
    dTTnn : (n,1) ndarray of the distances to leave-one-out NN
    dSSnn : (n,1) ndarray of the distances to leave-one-out NN
    dTSnn : (n,2) ndarray of the distances from T to 2 NN in S
    dSTnn : (n,2) ndarray of the distances from S to 2 NN in T
    The nearest neighbor adversarial accuracy is the mean of the two scores,
    but both are returned for additional information.

    Raises
    ------
    ValueError
        If T, S, dTTnn or dSSnn does not expose an ndarray-like ``shape``,
        if p or threshold cannot be interpreted, or if n < 2.
    AssertionError
        If a shape, range or dimension check fails.
    """
    try:
        n = T.shape[0]  # number of records
    except (AttributeError, IndexError, TypeError) as err:
        raise ValueError("T must be a NumPy ndarray") from err
    try:
        n = S.shape[0]  # number of records
    except (AttributeError, IndexError, TypeError) as err:
        raise ValueError("S must be a NumPy ndarray") from err
    assert len(T.shape) == 2, "T must be a 2d ndarray"
    assert len(S.shape) == 2, "S must be a 2d ndarray"
    assert T.shape == S.shape, "Dimensions of T and S must match"
    # Two nearest neighbours are needed on every axis, so np.partition(kth=1)
    # below requires at least two records; fail early with a clear message.
    if n < 2:
        raise ValueError("T and S must each contain at least 2 records")

    # Compare against None explicitly: the truth value of an ndarray with
    # more than one element is ambiguous and raises ValueError.
    if dTTnn is not None:
        _check_nn_column(dTTnn, "dTTnn", "T", n)
    if dSSnn is not None:
        _check_nn_column(dSSnn, "dSSnn", "S", n)

    if isinstance(p, str) and p.lower() == 'inf':
        p = np.inf
    elif isinstance(p, (float, np.floating)) and p == np.inf:
        p = np.inf
    else:
        try:
            p = int(p)
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError("p should be an integer or 'inf'") from err
        assert p > 0, "p must be a positive integer or 'inf'."
        assert p < 11, "Using p > 10 could cause overflow. Use p < 11 or 'inf'"

    try:
        threshold = int(threshold)
    except (TypeError, ValueError, OverflowError) as err:
        raise ValueError("threshold must be a positive integer") from err
    assert threshold > 0, "threshold must be a positive integer"

    # Within-set leave-one-out distances, unless the caller supplied them.
    if dTTnn is None:
        dTTnn = _leave_one_out_nn(T, p, threshold)
    if dSSnn is None:
        dSSnn = _leave_one_out_nn(S, p, threshold)

    # Distances between T and S
    dTS = dmat(T, S, p, threshold)  # dTS_ij = || T_i - S_j ||
    # distances from each t_i to its two nearest neighbors in S
    dTSnn = np.partition(dTS, kth=1)[:, :2]
    # distances from each s_j to its two nearest neighbors in T
    dSTnn = np.partition(dTS, kth=1, axis=0).transpose()[:, :2]

    # measure the fit at each real data point t_i and synthetic point s_i
    nnAA_real = _point_scores(dTSnn, dTTnn, n)
    nnAA_synth = _point_scores(dSTnn, dSSnn, n)

    nnAA_arr = np.column_stack((nnAA_real, nnAA_synth))
    return np.mean(nnAA_arr, axis=0), dTTnn, dSSnn, dTSnn, dSTnn


def _check_nn_column(col, col_name, data_name, n):
    """Validate a caller-supplied leave-one-out distance array as (n,1)."""
    try:
        rows = col.shape[0]
    except (AttributeError, IndexError, TypeError) as err:
        raise ValueError(
            "{} must be a NumPy ndarray".format(col_name)) from err
    # Checking the rank first keeps a 1-d array from leaking an IndexError.
    assert len(col.shape) == 2 and col.shape[1] == 1, \
        "{} should be a column".format(col_name)
    assert n == rows, \
        "Dimensions of {} and {} must match".format(data_name, col_name)


def _leave_one_out_nn(X, p, threshold):
    """Return an (n,1) column: distance from each row of X to its nearest
    other row of X under the p-norm."""
    within = dmat(X, X, p, threshold)  # within_ij = || X_i - X_j ||
    # After partitioning at kth=1, index 0 holds the smallest entry of each
    # row (the zero self-distance) and index 1 the second smallest.
    return np.partition(within, kth=1)[:, 1, None]


def _point_scores(cross_nn, own_nn, n):
    """Score each record from its two cross-set NN distances (n,2) and its
    own-set leave-one-out NN distance (n,1)."""
    # Number of the two cross-set neighbours (0, 1 or 2) that are at least as
    # far / strictly farther than the own-set neighbour, mapped to a score of
    # 0, 1/n or 1. (n,1) broadcasts against (n,2).
    weights = (0, 1 / n, 1)
    with_ties = np.choose(np.sum(cross_nn >= own_nn, axis=1), weights)
    without_ties = np.choose(np.sum(cross_nn > own_nn, axis=1), weights)
    # Averaging the >= and > variants splits the credit for exact ties.
    return (with_ties + without_ties) / 2