Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
synthetic-data/nnAA.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
133 lines (107 sloc)
4.97 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Implementation of a modification of the 'nearest neighbor adversarial accuracy'
introduced by Yale et al. (2019) in "Privacy Preserving Synthetic Health Data".
nnAA() computes a deterministic version of the nearest neighbor adversarial
accuracy between two data sets, modified to be unbiased so that if the data sets
were drawn from the same distribution, the expected value of nnAA would be 0.5.
written by Joseph Pedersen | |
version 20210314 | |
""" | |
import numpy as np | |
from scipy.spatial import distance_matrix as dmat | |
from numpy.linalg import norm as norm | |
# ------------------------------------------------- | |
# definition of nearest neighbor adversarial accuracy between data sets T & S | |
def _require(condition, message):
    """Raise AssertionError(message) unless `condition` is true.

    Used in place of the ``assert`` statement so the input checks are not
    stripped when Python runs with ``-O``; callers still see AssertionError.
    """
    if not condition:
        raise AssertionError(message)


def _check_nn_column(column, n, column_name, data_name):
    """Validate a caller-supplied (n,1) column of leave-one-out NN distances."""
    try:
        rows = column.shape[0]
    except (AttributeError, IndexError, TypeError) as err:
        raise ValueError(
            "{} must be a NumPy ndarray".format(column_name)) from err
    _require(len(column.shape) == 2 and column.shape[1] == 1,
             "{} should be a column".format(column_name))
    _require(n == rows,
             "Dimensions of {} and {} must match".format(data_name,
                                                         column_name))


def _leave_one_out_nn(X, p, threshold):
    """Return the (n,1) column of distances from each row of X to its nearest
    other row of X."""
    dXX = dmat(X, X, p, threshold)  # dXX_ij = || X_i - X_j ||
    # Column 0 after partitioning is the smallest entry of each row (the zero
    # self-distance, barring duplicates); column 1 is the second smallest.
    return np.partition(dXX, kth=1)[:, 1, None]


def _pointwise_fit(d_cross_2nn, d_own_nn, n):
    """Score each point by comparing its two cross-set NN distances to its
    own-set leave-one-out NN distance, averaging the tie-inclusive (>=) and
    tie-exclusive (>) variants."""
    weights = (0, 1/n, 1)  # score when 0, 1 or 2 cross distances exceed own
    # (n,2) vs (n,1) broadcasts the own-set distance across both columns.
    with_ties = np.choose(np.sum(d_cross_2nn >= d_own_nn, axis=1), weights)
    without_ties = np.choose(np.sum(d_cross_2nn > d_own_nn, axis=1), weights)
    return (with_ties + without_ties)/2


def nnAA(T, S, p=2, threshold=1000000, dTTnn=None, dSSnn=None):
    """
    Computes the nearest neighbor adversarial accuracy between two data sets,
    by computing the deterministic function on both sets (in their entirety).

    Parameters
    ----------
    T : (n,d) ndarray of 'true' data
    S : (n,d) ndarray of 'synthetic' data. T and S must have the same shape.
    p : int in 1..10, the string 'inf', or float infinity. The default is 2.
        The p-norm to use for determining nearest neighbors.
    threshold : int > 0, optional. The default is 1000000.
        If (n**2)*d > threshold, scipy.spatial.distance_matrix uses a Python
        loop instead of large temporary arrays.
    dTTnn : optional. (n,1) ndarray of the distances from each row of T to its
        leave-one-out nearest neighbor in T. Computed when omitted.
    dSSnn : optional. (n,1) ndarray of the distances from each row of S to its
        leave-one-out nearest neighbor in S. Computed when omitted.

    Returns
    -------
    A 5-tuple:
    np.array([T_isotration, S_isotration])
        mean per-point score over T and over S, respectively
    dTTnn (n,1) ndarray of the distances to leave-one-out NN
    dSSnn (n,1) ndarray of the distances to leave-one-out NN
    dTSnn (n,2) ndarray of the distances from T to 2 NN in S
    dSTnn (n,2) ndarray of the distances from S to 2 NN in T

    The nearest neighbor adversarial accuracy is the mean of the isotrations,
    but both are returned for additional information.

    Raises
    ------
    ValueError
        If T, S, dTTnn or dSSnn has no usable ``shape``, if p or threshold
        cannot be converted to an integer, or if there are fewer than 2 rows.
    AssertionError
        If shapes are inconsistent, or p / threshold are out of range.
    """
    # --- validate the data sets ---
    try:
        n = T.shape[0]  # number of records
    except (AttributeError, IndexError, TypeError) as err:
        raise ValueError("T must be a NumPy ndarray") from err
    try:
        S.shape[0]
    except (AttributeError, IndexError, TypeError) as err:
        raise ValueError("S must be a NumPy ndarray") from err
    _require(len(T.shape) == 2, "T must be a 2d ndarray")
    _require(len(S.shape) == 2, "S must be a 2d ndarray")
    _require(T.shape == S.shape, "Dimensions of T and S must match")

    # --- validate optional precomputed columns ---
    # Compare against None explicitly: the truth value of a multi-element
    # ndarray is ambiguous and raises ValueError.
    if dTTnn is not None:
        _check_nn_column(dTTnn, n, "dTTnn", "T")
    if dSSnn is not None:
        _check_nn_column(dSSnn, n, "dSSnn", "S")

    # --- validate the norm order ---
    is_inf_string = isinstance(p, str) and p.lower() == 'inf'
    is_inf_float = isinstance(p, float) and p == np.inf
    if is_inf_string or is_inf_float:
        p = np.inf
    else:
        try:
            p = int(p)
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError("p should be an integer or 'inf'") from err
        _require(p > 0, "p must be a positive integer or 'inf'.")
        _require(p < 11,
                 "Using p > 10 could cause overflow. Use p < 11 or 'inf'")

    # --- validate the threshold ---
    try:
        threshold = int(threshold)
    except (TypeError, ValueError, OverflowError) as err:
        raise ValueError("threshold must be a positive integer") from err
    _require(threshold > 0, "threshold must be a positive integer")

    # A leave-one-out neighbor and two cross-set neighbors need >= 2 records.
    if n < 2:
        raise ValueError("T and S must each contain at least 2 records")

    # Distances within T and within S (skipped when supplied by the caller)
    if dTTnn is None:
        dTTnn = _leave_one_out_nn(T, p, threshold)
    if dSSnn is None:
        dSSnn = _leave_one_out_nn(S, p, threshold)

    # Distances between T and S
    dTS = dmat(T, S, p, threshold)  # dTS_ij = || T_i - S_j ||
    # distances from each t_i to its two nearest neighbors in S
    dTSnn = np.partition(dTS, kth=1)[:, :2]
    # distances from each s_j to its two nearest neighbors in T
    dSTnn = np.partition(dTS, kth=1, axis=0).transpose()[:, :2]

    # measure the fit at each real data point t_i and synthetic point s_i
    nnAA_real = _pointwise_fit(dTSnn, dTTnn, n)
    nnAA_synth = _pointwise_fit(dSTnn, dSSnn, n)

    nnAA_arr = np.column_stack((nnAA_real, nnAA_synth))
    return np.mean(nnAA_arr, axis=0), dTTnn, dSSnn, dTSnn, dSTnn