From 4bf46dbce42c9376e72bb5e63aa6277df0402255 Mon Sep 17 00:00:00 2001 From: Karan Bhanot Date: Thu, 19 Aug 2021 16:15:19 -0400 Subject: [PATCH] Add code and results for the various metrics --- notebooks/Atus Metrics and Plots.ipynb | 1214 +++++++++++++++++++++++ notebooks/Synthetic_Data_Evaluation.Rmd | 120 +++ results/ATUSEquity1.csv | 17 + results/MIMICRaceEquity1.csv | 133 +++ scripts/Equity_metrics.R | 161 +++ scripts/metrics.py | 33 + scripts/sunburst_process.R | 137 +++ scripts/table_process.R | 184 ++++ 8 files changed, 1999 insertions(+) create mode 100644 notebooks/Atus Metrics and Plots.ipynb create mode 100644 notebooks/Synthetic_Data_Evaluation.Rmd create mode 100644 results/ATUSEquity1.csv create mode 100644 results/MIMICRaceEquity1.csv create mode 100644 scripts/Equity_metrics.R create mode 100644 scripts/metrics.py create mode 100644 scripts/sunburst_process.R create mode 100644 scripts/table_process.R diff --git a/notebooks/Atus Metrics and Plots.ipynb b/notebooks/Atus Metrics and Plots.ipynb new file mode 100644 index 0000000..eeaa06e --- /dev/null +++ b/notebooks/Atus Metrics and Plots.ipynb @@ -0,0 +1,1214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3e1aa024", + "metadata": {}, + "source": [ + "# American Time Use Survey (ATUS) Metrics and Plots\n", + "\n", + "In this notebook, we use Pearson's Correlation Coefficient (PCC) and Directional Symmetry (DS) based time-series log disparity metric on the datasets. We also draw plots.\n", + "\n", + "## Background\n", + "\n", + "**ATUS dataset:**\n", + "This dataset includes details about how individuals across America spend their time. The average sleep times across several age groups across all days of the week are calculated and shown in a line plot. The real data is replicated in plots and we use synthetic data generation methods to produce line plots. 
These plots are meant to be compared against the plots generated on the original data to assess how well synthetic data generation methods work." + ] + }, + { + "cell_type": "markdown", + "id": "591fb5c2", + "metadata": {}, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2df982d1", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "%matplotlib inline\n", + "\n", + "import sys\n", + "sys.path.append('../scripts/')\n", + "from metrics import pcc, ds\n", + "\n", + "from IPython.display import display_html\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d4f8d69", + "metadata": {}, + "source": [ + "## Import dataset\n", + "\n", + "We import the real and synthetic versions of the ATUS dataset for sleep created in the paper [Synthetic Event Time Series Health Data Generation](https://arxiv.org/abs/1911.06411)." + ] + }, + { + "cell_type": "markdown", + "id": "fed02b30", + "metadata": {}, + "source": [ + "### Real data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f4abd7cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TUMONTHTUDIARYDAYTESEXTEAGEhour1_4hour2_5hour3_6hour4_7hour5_8hour6_9...hour21_0hour22_1hour23_2hour24_3hour25_4hour26_5hour27_6hour28_7hour29_8hour30_9
0711[65-74]60.030.00.00.00.00.0...60.060.060.060.060.030.00.00.00.00.0
11272[65-74]60.060.00.00.00.00.0...0.060.060.060.060.060.030.00.00.00.0
2271[35-44]60.030.00.00.00.00.0...60.060.060.060.060.030.00.00.00.00.0
31172[25-34]60.060.060.00.00.00.0...60.060.060.060.060.060.030.00.00.00.0
4822[35-44]60.060.060.00.00.00.0...60.00.060.060.060.060.045.00.00.00.0
\n", + "

5 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + " TUMONTH TUDIARYDAY TESEX TEAGE hour1_4 hour2_5 hour3_6 hour4_7 \\\n", + "0 7 1 1 [65-74] 60.0 30.0 0.0 0.0 \n", + "1 12 7 2 [65-74] 60.0 60.0 0.0 0.0 \n", + "2 2 7 1 [35-44] 60.0 30.0 0.0 0.0 \n", + "3 11 7 2 [25-34] 60.0 60.0 60.0 0.0 \n", + "4 8 2 2 [35-44] 60.0 60.0 60.0 0.0 \n", + "\n", + " hour5_8 hour6_9 ... hour21_0 hour22_1 hour23_2 hour24_3 hour25_4 \\\n", + "0 0.0 0.0 ... 60.0 60.0 60.0 60.0 60.0 \n", + "1 0.0 0.0 ... 0.0 60.0 60.0 60.0 60.0 \n", + "2 0.0 0.0 ... 60.0 60.0 60.0 60.0 60.0 \n", + "3 0.0 0.0 ... 60.0 60.0 60.0 60.0 60.0 \n", + "4 0.0 0.0 ... 60.0 0.0 60.0 60.0 60.0 \n", + "\n", + " hour26_5 hour27_6 hour28_7 hour29_8 hour30_9 \n", + "0 30.0 0.0 0.0 0.0 0.0 \n", + "1 60.0 30.0 0.0 0.0 0.0 \n", + "2 30.0 0.0 0.0 0.0 0.0 \n", + "3 60.0 30.0 0.0 0.0 0.0 \n", + "4 60.0 45.0 0.0 0.0 0.0 \n", + "\n", + "[5 rows x 34 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data\n", + "atus_data = pd.read_csv('../data/Atus/atus_train.csv')\n", + "\n", + "# Map age from numbers to actual values\n", + "age_mapper = {0: '[15-24]', 1: '[25-34]', 2: '[35-44]', 3: '[45-54]', 4: '[55-64]', 5: '[65-74]', 6: '75+'}\n", + "atus_data['TEAGE'] = atus_data['TEAGE'].apply(lambda x: age_mapper[x])\n", + "\n", + "# Show first rows\n", + "atus_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "3ce85b09", + "metadata": {}, + "source": [ + "### Synthetic data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ded4768f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TUMONTHTUDIARYDAYTESEXTEAGEhour1_4hour2_5hour3_6hour4_7hour5_8hour6_9...hour21_0hour22_1hour23_2hour24_3hour25_4hour26_5hour27_6hour28_7hour29_8hour30_9
01042[25-34]60.060.030.01.00.00.0...59.060.060.060.060.060.021.07.01.00.0
11072[35-44]60.060.060.060.015.01.0...0.059.060.060.060.060.060.011.00.00.0
2172[55-64]60.060.060.022.00.00.0...60.060.060.060.060.060.058.00.00.00.0
3312[15-24]60.060.058.00.00.00.0...60.060.060.060.060.030.00.00.00.00.0
4742[45-54]60.060.048.08.00.00.0...60.060.058.017.00.00.00.00.00.00.0
\n", + "

5 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + " TUMONTH TUDIARYDAY TESEX TEAGE hour1_4 hour2_5 hour3_6 hour4_7 \\\n", + "0 10 4 2 [25-34] 60.0 60.0 30.0 1.0 \n", + "1 10 7 2 [35-44] 60.0 60.0 60.0 60.0 \n", + "2 1 7 2 [55-64] 60.0 60.0 60.0 22.0 \n", + "3 3 1 2 [15-24] 60.0 60.0 58.0 0.0 \n", + "4 7 4 2 [45-54] 60.0 60.0 48.0 8.0 \n", + "\n", + " hour5_8 hour6_9 ... hour21_0 hour22_1 hour23_2 hour24_3 hour25_4 \\\n", + "0 0.0 0.0 ... 59.0 60.0 60.0 60.0 60.0 \n", + "1 15.0 1.0 ... 0.0 59.0 60.0 60.0 60.0 \n", + "2 0.0 0.0 ... 60.0 60.0 60.0 60.0 60.0 \n", + "3 0.0 0.0 ... 60.0 60.0 60.0 60.0 60.0 \n", + "4 0.0 0.0 ... 60.0 60.0 58.0 17.0 0.0 \n", + "\n", + " hour26_5 hour27_6 hour28_7 hour29_8 hour30_9 \n", + "0 60.0 21.0 7.0 1.0 0.0 \n", + "1 60.0 60.0 11.0 0.0 0.0 \n", + "2 60.0 58.0 0.0 0.0 0.0 \n", + "3 30.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[5 rows x 34 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data\n", + "atus_synth_data = pd.read_csv('../data/Atus/atus_train_synthetic_1.csv')\n", + "\n", + "# Map age from numbers to actual values\n", + "atus_synth_data['TEAGE'] = atus_synth_data['TEAGE'].apply(lambda x: age_mapper[x])\n", + "\n", + "# As the sleep times are not whole numbers, we change them in synthetic data\n", + "atus_synth_data.iloc[:, 4:] = round(atus_synth_data.iloc[:, 4:])\n", + "\n", + "# Show first rows\n", + "atus_synth_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "4c451e73", + "metadata": {}, + "source": [ + "## Color function\n", + "\n", + "This function colors the pandas based on the defined thresholds." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "630aceae", + "metadata": {}, + "outputs": [], + "source": [ + "def _color_for_fairness(val):\n", + " \"\"\"\n", + " Returns the style for the given value\n", + " - background color\n", + " - text color (if changed)\n", + " \"\"\"\n", + " \n", + " # If background color is darker, change text color to \"white\"\n", + " change_text = False\n", + " \n", + " # Identify background color\n", + " if val < np.log(0.8):\n", + " color = '#d58d70'\n", + " elif ((val >= np.log(0.8)) and (val < np.log(0.9))):\n", + " color = '#eabcad'\n", + " elif ((val >= np.log(0.9)) and (val <= -np.log(0.9))):\n", + " color = '#d4e6e8'\n", + " elif ((val > -np.log(0.9)) and (val <= -np.log(0.8))):\n", + " color = '#a5b0cb'\n", + " else:\n", + " color = '#00205b'\n", + " change_text = True\n", + " \n", + " # Return\n", + " if change_text:\n", + " return 'background-color: {}; color: {}'.format(color, 'white')\n", + " else:\n", + " return 'background-color: {}'.format(color)" + ] + }, + { + "cell_type": "markdown", + "id": "ae767af6", + "metadata": {}, + "source": [ + "# Time metric\n", + "\n", + "In this section we apply the time-series log disparity metric on age groups and gender." + ] + }, + { + "cell_type": "markdown", + "id": "91100996", + "metadata": {}, + "source": [ + "## Age groups metric results" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4d5db956", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PCC DS
[15-24]-0.0251070.000000
[25-34]0.003757-0.405465
[35-44]-0.014916-0.405465
[45-54]-0.019036-0.405465
[55-64]0.0062950.000000
[65-74]-0.074573-0.182322
75+-0.910678-0.693147
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_atus_age_err_df(real_data, synth_data):\n", + " \n", + " # Define required lists\n", + " all_real = []\n", + " all_other_real = []\n", + " all_synth = []\n", + " all_other_synth = []\n", + " \n", + " # Stratify by each age-group\n", + " for age_idx in [\"[15-24]\", \"[25-34]\", \"[35-44]\", \"[45-54]\", \"[55-64]\", \"[65-74]\", \"75+\"]:\n", + " for day_idx in [1, 2, 3, 4, 5, 6, 7]:\n", + " # Calculate relevant real and synthetic data\n", + " # Also, calculate all other real and synthetic data for each group\n", + " \n", + " curr_data = real_data[(real_data[\"TEAGE\"] == age_idx) & \n", + " ((real_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_real.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " other_data = real_data[(real_data[\"TEAGE\"] != age_idx) & \n", + " ((real_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_other_real.append(np.mean((other_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " curr_data = synth_data[(synth_data[\"TEAGE\"] == age_idx) & \n", + " ((synth_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_synth.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " other_data = synth_data[(synth_data[\"TEAGE\"] != age_idx) & \n", + " ((synth_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_other_synth.append(np.mean((other_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + " \n", + " # Calculate PCC and DS values\n", + " pcc_values = []\n", + " ds_values = []\n", + " for i in range(0, 49, 7):\n", + " pcc_values.append(np.log(pcc(all_real[i:i+7], all_synth[i:i+7])/\n", + " pcc(all_other_real[i:i+7], all_other_synth[i:i+7])))\n", + " ds_values.append(np.log(ds(all_real[i:i+7], all_synth[i:i+7])/\n", + " ds(all_other_real[i:i+7], all_other_synth[i:i+7])))\n", + "\n", + " # Create dataframe\n", + " res = pd.DataFrame({\"PCC\": pcc_values, 
\"DS\": ds_values})\n", + " res.index = [\"[15-24]\", \"[25-34]\", \"[35-44]\", \"[45-54]\", \"[55-64]\", \"[65-74]\", \"75+\"]\n", + " \n", + " # Return\n", + " return res\n", + "\n", + "age_fairness = get_atus_age_err_df(atus_data, atus_synth_data)\n", + "age_fairness.style.applymap(_color_for_fairness)" + ] + }, + { + "cell_type": "markdown", + "id": "449ed8cd", + "metadata": {}, + "source": [ + "## Age groups stratified by gender metric results" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c64c2587", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PCC DS
Male [15-24]-0.045852-0.182322
Male [25-34]-0.006313-0.182322
Male [35-44]-0.030661-0.405465
Male [45-54]-0.052947-0.405465
Male [55-64]-0.023886-0.405465
Male [65-74]-0.0339210.000000
Male 75+-2.175244-inf
Female [15-24]-0.028932-0.182322
Female [25-34]-0.005141-0.182322
Female [35-44]-0.008614-0.405465
Female [45-54]-0.045094-0.405465
Female [55-64]-0.045089-0.182322
Female [65-74]-0.295852-0.405465
Female 75+-0.378053-0.182322
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_atus_gender_age_err_df(real_data, synth_data):\n", + " \n", + " # Define required lists\n", + " all_real = []\n", + " all_other_real = []\n", + " all_synth = []\n", + " all_other_synth = []\n", + " \n", + " # Stratify by each gender and age-group\n", + " for gender_idx in [1, 2]:\n", + " for age_idx in [\"[15-24]\", \"[25-34]\", \"[35-44]\", \"[45-54]\", \"[55-64]\", \"[65-74]\", \"75+\"]:\n", + " for day_idx in [1, 2, 3, 4, 5, 6, 7]:\n", + " # Calculate relevant real and synthetic data\n", + " # Also, calculate all other real and synthetic data for each group\n", + " \n", + " curr_data = real_data[(real_data[\"TESEX\"] == gender_idx) & \n", + " (real_data[\"TEAGE\"] == age_idx) & \n", + " ((real_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_real.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " other_data = real_data[((real_data[\"TESEX\"] != gender_idx) |\n", + " (real_data[\"TEAGE\"] != age_idx)) & \n", + " ((real_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_other_real.append(np.mean((other_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " curr_data = synth_data[(synth_data[\"TESEX\"] == gender_idx) & \n", + " (synth_data[\"TEAGE\"] == age_idx) & \n", + " ((synth_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_synth.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " other_data = synth_data[((synth_data[\"TESEX\"] != gender_idx) |\n", + " (synth_data[\"TEAGE\"] != age_idx)) & \n", + " ((synth_data[\"TUDIARYDAY\"] == day_idx))]\n", + " all_other_synth.append(np.mean((other_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + " \n", + " # Calculate PCC and DS values\n", + " pcc_values = []\n", + " ds_values = []\n", + " for i in range(0, 98, 7):\n", + " pcc_values.append(np.log(pcc(all_real[i:i+7], all_synth[i:i+7])/\n", + " 
pcc(all_other_real[i:i+7], all_other_synth[i:i+7])))\n", + " ds_values.append(np.log(ds(all_real[i:i+7], all_synth[i:i+7])/\n", + " ds(all_other_real[i:i+7], all_other_synth[i:i+7])))\n", + "\n", + " # Create dataframe\n", + " res = pd.DataFrame({\"PCC\": pcc_values, \"DS\": ds_values})\n", + " res.index = [\"Male [15-24]\", \"Male [25-34]\", \"Male [35-44]\", \"Male [45-54]\", \n", + " \"Male [55-64]\", \"Male [65-74]\", \"Male 75+\",\n", + " \"Female [15-24]\", \"Female [25-34]\", \"Female [35-44]\", \"Female [45-54]\", \n", + " \"Female [55-64]\", \"Female [65-74]\", \"Female 75+\"]\n", + " \n", + " # Return\n", + " return res\n", + "\n", + "gender_age_fairness = get_atus_gender_age_err_df(atus_data, atus_synth_data)\n", + "gender_age_fairness.style.applymap(_color_for_fairness)" + ] + }, + { + "cell_type": "markdown", + "id": "983e4d2e", + "metadata": {}, + "source": [ + "## Overall metric results" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b20f2a5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PCC DS
[15-24]-0.0251070.000000
[25-34]0.003757-0.405465
[35-44]-0.014916-0.405465
[45-54]-0.019036-0.405465
[55-64]0.0062950.000000
[65-74]-0.074573-0.182322
75+-0.910678-0.693147
      \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PCC DS
Male [15-24]-0.045852-0.182322
Male [25-34]-0.006313-0.182322
Male [35-44]-0.030661-0.405465
Male [45-54]-0.052947-0.405465
Male [55-64]-0.023886-0.405465
Male [65-74]-0.0339210.000000
Male 75+-2.175244-inf
      \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PCC DS
Female [15-24]-0.028932-0.182322
Female [25-34]-0.005141-0.182322
Female [35-44]-0.008614-0.405465
Female [45-54]-0.045094-0.405465
Female [55-64]-0.045089-0.182322
Female [65-74]-0.295852-0.405465
Female 75+-0.378053-0.182322
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Combine all metrics\n", + "total = pd.concat([age_fairness, gender_age_fairness])\n", + "\n", + "# Arrange them side-by-side\n", + "styler = []\n", + "for i in range(0, 21, 7):\n", + " styler.append(total.iloc[i:i+7,:].style.set_table_attributes(\"style='display:inline'\").applymap(_color_for_fairness))\n", + "\n", + "# Show the results together\n", + "display_html(styler[0]._repr_html_() + \"\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\" + \n", + " styler[1]._repr_html_() + \"\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\" + \n", + " styler[2]._repr_html_(), raw = True)" + ] + }, + { + "cell_type": "markdown", + "id": "a3ffd80d", + "metadata": {}, + "source": [ + "# Covariate plots\n", + "\n", + "This section generates plots for `age` and `gender` protected attributes in the dataset." + ] + }, + { + "cell_type": "markdown", + "id": "425a9462", + "metadata": {}, + "source": [ + "## Age covariate plot for 15-24 and 75+\n", + "\n", + "We generate two plots, one for the individuals aged `[15-24]` and the other for individuals aged `75+`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a240313f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def plot_atus_age(real_data, synth_data):\n", + " \n", + " # Define style\n", + " sns.set_style('white') \n", + " days = ['Sunday', 'Sunday', 'Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']\n", + " \n", + " # Define required lists\n", + " real_avgs = []\n", + " synth_avgs = []\n", + " \n", + " # Calculate the relevant real and synthetic data\n", + " for age_idx in [\"[15-24]\", \"75+\"]:\n", + " for day_idx in [1, 2, 3, 4, 5, 6, 7]:\n", + " \n", + " curr_data = real_data[(real_data[\"TEAGE\"] == age_idx) & \n", + " ((real_data[\"TUDIARYDAY\"] == day_idx))]\n", + " real_avgs.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + " \n", + " curr_data = synth_data[(synth_data[\"TEAGE\"] == age_idx) & \n", + " ((synth_data[\"TUDIARYDAY\"] == day_idx))]\n", + " synth_avgs.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " # Create two sub-figures\n", + " fig, ax = plt.subplots(1, 2)\n", + " plt.rcParams[\"figure.figsize\"] = (14, 6)\n", + " \n", + " # Create first plot\n", + " ax[0].plot(real_avgs[:7], label = \"Real\", marker = \"o\")\n", + " ax[0].plot(synth_avgs[:7], label = \"Synthetic\", marker = \"x\")\n", + " ax[0].set_yticks([11, 11.5, 12, 12.5, 13, 13.5, 14, 14.5, 15])\n", + " ax[0].set_xticklabels(days)\n", + " ax[0].set_xlabel('Days of the Week')\n", + " ax[0].set_ylabel('Sleep Activity in Hours')\n", + " ax[0].set_title(\"Average sleep time for\\nYoungsters (15-24 year olds)\")\n", + " \n", + " # Create second plot\n", + " ax[1].plot(real_avgs[7:], label = \"Real\", marker = \"o\")\n", + " ax[1].plot(synth_avgs[7:], label = \"Synthetic\", marker = \"x\")\n", + " ax[1].set_yticks([11, 11.5, 12, 12.5, 13, 13.5, 14, 14.5, 15])\n", + " ax[1].set_xticklabels(days)\n", + " ax[1].set_xlabel('Days of the Week')\n", + " ax[1].set_ylabel('Sleep Activity in Hours')\n", + " ax[1].set_title(\"Average sleep time for\\nElderly 
(75+ year olds)\")\n", + " \n", + " # Plot the figures\n", + " ax.flatten()[-2].legend(loc = 'upper center', bbox_to_anchor = (1.1, -0.12), ncol = 12)\n", + "\n", + "plot_atus_age(atus_data, atus_synth_data)" + ] + }, + { + "cell_type": "markdown", + "id": "3ef2e3de", + "metadata": {}, + "source": [ + "## Gender covariate plot for Males and Females\n", + "\n", + "We generate the covariate plots for males and females. The difference is not as pronounced as seen in the age groups above." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a7168101", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def plot_atus_gender(real_data, synth_data):\n", + " \n", + " # Define style\n", + " sns.set_style('white') \n", + " days = ['Sunday', 'Sunday', 'Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']\n", + " \n", + " # Define required lists\n", + " real_avgs = []\n", + " synth_avgs = []\n", + " \n", + " # Calculate the relevant real and synthetic data\n", + " for gender_idx in [1, 2]:\n", + " for day_idx in [1, 2, 3, 4, 5, 6, 7]:\n", + " \n", + " curr_data = real_data[(real_data[\"TESEX\"] == gender_idx) & \n", + " ((real_data[\"TUDIARYDAY\"] == day_idx))]\n", + " real_avgs.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + " \n", + " curr_data = synth_data[(synth_data[\"TESEX\"] == gender_idx) & \n", + " ((synth_data[\"TUDIARYDAY\"] == day_idx))]\n", + " synth_avgs.append(np.mean((curr_data.iloc[:, 4:].sum(axis = 1)/60).values))\n", + "\n", + " # Create two sub-figures\n", + " fig, ax = plt.subplots(1, 2)\n", + " plt.rcParams[\"figure.figsize\"] = (14, 6)\n", + " \n", + " # Create first plot\n", + " ax[0].plot(real_avgs[:7], label = \"Real\", marker = \"o\")\n", + " ax[0].plot(synth_avgs[:7], label = \"Synthetic\", marker = \"x\")\n", + " ax[0].set_yticks([10, 10.5, 11, 11.5, 12, 12.5])\n", + " ax[0].set_xticklabels(days)\n", + " ax[0].set_xlabel('Days of the Week')\n", + " ax[0].set_ylabel('Sleep Activity in Hours')\n", + " ax[0].set_title(\"Average sleep time for Males\")\n", + " \n", + " # Create second plot\n", + " ax[1].plot(real_avgs[7:], label = \"Real\", marker = \"o\")\n", + " ax[1].plot(synth_avgs[7:], label = \"Synthetic\", marker = \"x\")\n", + " ax[1].set_yticks([10, 10.5, 11, 11.5, 12, 12.5])\n", + " ax[1].set_xticklabels(days)\n", + " ax[1].set_xlabel('Days of the Week')\n", + " ax[1].set_ylabel('Sleep Activity in Hours')\n", + " ax[1].set_title(\"Average sleep time for Females\")\n", + " \n", + " # Plot the figures\n", + " 
ax.flatten()[-2].legend(loc = 'upper center', bbox_to_anchor = (1.1, -0.12), ncol = 12)\n", + "\n", + "plot_atus_gender(atus_data, atus_synth_data)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/Synthetic_Data_Evaluation.Rmd b/notebooks/Synthetic_Data_Evaluation.Rmd new file mode 100644 index 0000000..e675d9f --- /dev/null +++ b/notebooks/Synthetic_Data_Evaluation.Rmd @@ -0,0 +1,120 @@ +--- +title: "R Notebook" +output: html_notebook +--- +```{r} +source("../scripts/Equity_metrics.R") +source("../scripts/sunburst_process.R") +source("../scripts/table_process.R") +library(plotly) +library(formattable) +``` + + +Read in the datasets +```{r} +ATUSreference<-read.csv(file = "../data/Atus/atus_train.csv") +ATUSsynthetic<-read.csv(file = "../data/Atus/atus_train_synthetic.csv") + +MIMICRacereference<-read.csv(file = "../data/Mimic/mimic_3.csv") +MIMICRacesynthetic<-read.csv(file = "../data/Mimic/mimic_3_synthetic.csv") +``` + + +Preprocess the ATUS data +```{r} +ATUSreference_processed<-ATUSreference %>% + group_by(TESEX, TEAGE) %>% + summarise(background_n = n()) +colnames(ATUSreference_processed)<-c("Gender","Age","background_n") + +ATUSsynthetic_processed<-ATUSsynthetic %>% + group_by(TESEX, TEAGE) %>% + summarise(background_n = n()) +colnames(ATUSsynthetic_processed)<-c("Gender","Age","user_n") + + +ATUSreference_processed<-ATUSreference_processed %>% mutate(Gender=recode(Gender, + `1`="Male", + `2`="Female"), + Age=recode(Age, + `0`="15-24", + `1`="25-34", + `2`="35-44", + `3`="45-54", + `4`="55-64", + `5`="65-74", + `6`="75+")) + 
+ATUSsynthetic_processed<-ATUSsynthetic_processed %>% mutate(Gender=recode(Gender, + `1`="Male", + `2`="Female"), + Age=recode(Age, + `0`="15-24", + `1`="25-34", + `2`="35-44", + `3`="45-54", + `4`="55-64", + `5`="65-74", + `6`="75+")) + +``` + +Preprocess the MIMIC data +```{r} +MIMICRacereference_processed<-MIMICRacereference %>% + group_by(GENDER,AGE,ETHNICITY,mortality_withinthirtydays) %>% + summarise(background_n = n()) +colnames(MIMICRacereference_processed)<-c("Gender","Age","Ethnicity","Mortality","background_n") + +MIMICRacesynthetic_processed<-MIMICRacesynthetic %>% + group_by(GENDER,AGE, ETHNICITY, mortality_withinthirtydays) %>% + summarise(background_n = n()) +colnames(MIMICRacesynthetic_processed)<-c("Gender","Age","Ethnicity","Mortality","user_n") + + +MIMICRacereference_processed$Mortality<-as.factor(MIMICRacereference_processed$Mortality) +MIMICRacesynthetic_processed$Mortality<-as.factor(MIMICRacesynthetic_processed$Mortality) + + +MIMICRacereference_processed<-MIMICRacereference_processed %>% mutate(Gender=recode(Gender, + `M`="Male", + `F`="Female"), + Mortality =recode(Mortality, + `0`="Alive", + `1`="Died")) + +MIMICRacesynthetic_processed<-MIMICRacesynthetic_processed %>% mutate(Gender=recode(Gender, + `M`="Male", + `F`="Female"), + Mortality =recode(Mortality, + `0`="Alive", + `1`="Died")) +``` + + +Evaluation on ATUS data +```{r} +test_sunburst2(ATUSreference_processed,c("Gender","Age"),ATUSsynthetic_processed, "ATUSEquity1.csv", sig_t = 0.05,lower_t = -log(0.9), upper_t = -log(0.8) ) +``` + +Evaluation on MIMIC data +```{r} +test_sunburst2(MIMICRacereference_processed,c("Mortality","Ethnicity","Age","Gender"),MIMICRacesynthetic_processed,"MIMICRaceEquity1.csv", sig_t = 0.05,lower_t = -log(0.9), upper_t = -log(0.8)) +``` + +Tables for ATUS and MIMIC with significance threshold =0.05, lower metric threshold = -log(0.9), upper metric threshold = -log(0.8), metric = log disparity +```{r} +generate_table(0.05,-log(0.9),-log(0.8), "LDI", 
"ATUS") +generate_table(0.05,-log(0.9),-log(0.8), "LDI", "MIMIC") +``` + + + + + + + + + + diff --git a/results/ATUSEquity1.csv b/results/ATUSEquity1.csv new file mode 100644 index 0000000..ec7e74f --- /dev/null +++ b/results/ATUSEquity1.csv @@ -0,0 +1,17 @@ +"","ids","labels","parents","Observed_Rate","Ideal_Rate","EquityColors","EquityLable","EquityValue","Observed_Number","Trial_Number" +"1","Female - 15-24","15-24","Female",0.066,0.0637,"#d4e6e8","Equitable(p)",0.0382,1980,30000 +"2","Female - 25-34","25-34","Female",0.1,0.0986,"#d4e6e8","Equitable(p)",0.0189,3010,30000 +"3","Female - 35-44","35-44","Female",0.131,0.123,"#d4e6e8","Equitable",0.0659,3916,30000 +"4","Female - 45-54","45-54","Female",0.101,0.103,"#d4e6e8","Equitable(p)",-0.0175,3029,30000 +"5","Female - 55-64","55-64","Female",0.0743,0.0766,"#d4e6e8","Equitable(p)",-0.0319,2230,30000 +"6","Female - 65-74","65-74","Female",0.0565,0.0546,"#d4e6e8","Equitable(p)",0.0348,1694,30000 +"7","Female - 75+","75+","Female",0.0445,0.0504,"#eabcad","Underrepresented",-0.13,1335,30000 +"8","Female","Female","",0.573,0.57,"#d4e6e8","Equitable(p)",0.014,17194,30000 +"9","Male - 15-24","15-24","Male",0.0514,0.0554,"#d4e6e8","Equitable(p)",-0.0784,1543,30000 +"10","Male - 25-34","25-34","Male",0.0782,0.0688,"#a5b0cb","Overrepresented",0.138,2346,30000 +"11","Male - 35-44","35-44","Male",0.0995,0.099,"#d4e6e8","Equitable(p)",0.00525,2985,30000 +"12","Male - 45-54","45-54","Male",0.0788,0.0816,"#d4e6e8","Equitable(p)",-0.0389,2363,30000 +"13","Male - 55-64","55-64","Male",0.0559,0.0584,"#d4e6e8","Equitable(p)",-0.0478,1676,30000 +"14","Male - 65-74","65-74","Male",0.0373,0.0399,"#d4e6e8","Equitable(p)",-0.0692,1119,30000 +"15","Male - 75+","75+","Male",0.0258,0.0271,"#d4e6e8","Equitable(p)",-0.0502,774,30000 +"16","Male","Male","",0.427,0.43,"#d4e6e8","Equitable(p)",-0.014,12806,30000 diff --git a/results/MIMICRaceEquity1.csv b/results/MIMICRaceEquity1.csv new file mode 100644 index 0000000..b9bfc42 --- /dev/null +++ 
b/results/MIMICRaceEquity1.csv @@ -0,0 +1,133 @@ +"","ids","labels","parents","Observed_Rate","Ideal_Rate","EquityColors","EquityLable","EquityValue","Observed_Number","Trial_Number" +"1","Alive - Asian - <=45 - Female","Female","Alive - Asian - <=45",0.00374,0.00334,"#d4e6e8","Equitable(p)",0.114,374,1e+05 +"2","Alive - Asian - <=45 - Male","Male","Alive - Asian - <=45",0.0036,0.00174,"#00205b","Highly Overrepresented",0.728,360,1e+05 +"3","Alive - Asian - <=45","<=45","Alive - Asian",0.00734,0.00508,"#00205b","Highly Overrepresented",0.37,734,1e+05 +"4","Alive - Asian - 46-65 - Female","Female","Alive - Asian - 46-65",0.00462,0.00312,"#00205b","Highly Overrepresented",0.394,462,1e+05 +"5","Alive - Asian - 46-65 - Male","Male","Alive - Asian - 46-65",0.00578,0.00479,"#d4e6e8","Equitable(p)",0.189,578,1e+05 +"6","Alive - Asian - 46-65","46-65","Alive - Asian",0.0104,0.00791,"#00205b","Highly Overrepresented",0.276,1040,1e+05 +"7","Alive - Asian - 66-80 - Female","Female","Alive - Asian - 66-80",0.00345,0.00312,"#d4e6e8","Equitable(p)",0.101,345,1e+05 +"8","Alive - Asian - 66-80 - Male","Male","Alive - Asian - 66-80",0.00351,0.00356,"#d4e6e8","Equitable(p)",-0.013,351,1e+05 +"9","Alive - Asian - 66-80","66-80","Alive - Asian",0.00696,0.00668,"#d4e6e8","Equitable(p)",0.0419,696,1e+05 +"10","Alive - Asian - 81+ - Female","Female","Alive - Asian - 81+",0.00332,0.00203,"#00205b","Highly Overrepresented",0.492,332,1e+05 +"11","Alive - Asian - 81+ - Male","Male","Alive - Asian - 81+",0.00246,0.00196,"#d4e6e8","Equitable(p)",0.228,246,1e+05 +"12","Alive - Asian - 81+","81+","Alive - Asian",0.00578,0.00399,"#00205b","Highly Overrepresented",0.372,578,1e+05 +"13","Alive - Asian","Asian","Alive",0.0305,0.0237,"#00205b","Highly Overrepresented",0.26,3048,1e+05 +"14","Alive - Black - <=45 - Female","Female","Alive - Black - <=45",0.00515,0.0132,"#d58570","Highly Underrepresented",-0.95,515,1e+05 +"15","Alive - Black - <=45 - Male","Male","Alive - Black - 
<=45",0.00489,0.00835,"#d58570","Highly Underrepresented",-0.538,489,1e+05 +"16","Alive - Black - <=45","<=45","Alive - Black",0.01,0.0216,"#d58570","Highly Underrepresented",-0.776,1004,1e+05 +"17","Alive - Black - 46-65 - Female","Female","Alive - Black - 46-65",0.00978,0.0165,"#d58570","Highly Underrepresented",-0.533,978,1e+05 +"18","Alive - Black - 46-65 - Male","Male","Alive - Black - 46-65",0.0121,0.0181,"#d58570","Highly Underrepresented",-0.405,1213,1e+05 +"19","Alive - Black - 46-65","46-65","Alive - Black",0.0219,0.0346,"#d58570","Highly Underrepresented",-0.47,2191,1e+05 +"20","Alive - Black - 66-80 - Female","Female","Alive - Black - 66-80",0.00704,0.0132,"#d58570","Highly Underrepresented",-0.635,704,1e+05 +"21","Alive - Black - 66-80 - Male","Male","Alive - Black - 66-80",0.00639,0.00951,"#d58570","Highly Underrepresented",-0.4,639,1e+05 +"22","Alive - Black - 66-80","66-80","Alive - Black",0.0134,0.0227,"#d58570","Highly Underrepresented",-0.535,1343,1e+05 +"23","Alive - Black - 81+ - Female","Female","Alive - Black - 81+",0.00589,0.00595,"#d4e6e8","Equitable(p)",-0.0103,589,1e+05 +"24","Alive - Black - 81+ - Male","Male","Alive - Black - 81+",0.00552,0.00377,"#00205b","Highly Overrepresented",0.382,552,1e+05 +"25","Alive - Black - 81+","81+","Alive - Black",0.0114,0.00972,"#d4e6e8","Equitable(p)",0.162,1141,1e+05 +"26","Alive - Black","Black","Alive",0.0568,0.0886,"#d58570","Highly Underrepresented",-0.479,5679,1e+05 +"27","Alive","Alive","",0.875,0.847,"#00205b","Highly Overrepresented",0.232,87468,1e+05 +"28","Alive - Other - <=45 - Female","Female","Alive - Other - <=45",0.00632,0.009,"#d58570","Highly Underrepresented",-0.356,632,1e+05 +"29","Alive - Other - <=45 - Male","Male","Alive - Other - <=45",0.00588,0.011,"#d58570","Highly Underrepresented",-0.628,588,1e+05 +"30","Alive - Other - <=45","<=45","Alive - Other",0.0122,0.02,"#d58570","Highly Underrepresented",-0.5,1220,1e+05 +"31","Alive - Other - 46-65 - Female","Female","Alive - Other - 
46-65",0.00946,0.01,"#d4e6e8","Equitable(p)",-0.0575,946,1e+05 +"32","Alive - Other - 46-65 - Male","Male","Alive - Other - 46-65",0.0122,0.0144,"#eabcad","Underrepresented",-0.173,1218,1e+05 +"33","Alive - Other - 46-65","46-65","Alive - Other",0.0216,0.0245,"#d4e6e8","Equitable(p)",-0.125,2164,1e+05 +"34","Alive - Other - 66-80 - Female","Female","Alive - Other - 66-80",0.00694,0.00537,"#d4e6e8","Equitable(p)",0.258,694,1e+05 +"35","Alive - Other - 66-80 - Male","Male","Alive - Other - 66-80",0.00665,0.00697,"#d4e6e8","Equitable(p)",-0.0468,665,1e+05 +"36","Alive - Other - 66-80","66-80","Alive - Other",0.0136,0.0123,"#d4e6e8","Equitable(p)",0.098,1359,1e+05 +"37","Alive - Other - 81+ - Female","Female","Alive - Other - 81+",0.00618,0.00247,"#00205b","Highly Overrepresented",0.922,618,1e+05 +"38","Alive - Other - 81+ - Male","Male","Alive - Other - 81+",0.0054,0.00174,"#00205b","Highly Overrepresented",1.14,540,1e+05 +"39","Alive - Other - 81+","81+","Alive - Other",0.0116,0.00421,"#00205b","Highly Overrepresented",1.02,1158,1e+05 +"40","Alive - Other","Other","Alive",0.059,0.061,"#d4e6e8","Equitable(p)",-0.0345,5901,1e+05 +"41","Alive - Unknown - <=45 - Female","Female","Alive - Unknown - <=45",0.00604,0.00617,"#d4e6e8","Equitable(p)",-0.0212,604,1e+05 +"42","Alive - Unknown - <=45 - Male","Male","Alive - Unknown - <=45",0.00546,0.00784,"#d58570","Highly Underrepresented",-0.364,546,1e+05 +"43","Alive - Unknown - <=45","<=45","Alive - Unknown",0.0115,0.014,"#eabcad","Underrepresented",-0.2,1150,1e+05 +"44","Alive - Unknown - 46-65 - Female","Female","Alive - Unknown - 46-65",0.0105,0.0098,"#d4e6e8","Equitable(p)",0.0681,1048,1e+05 +"45","Alive - Unknown - 46-65 - Male","Male","Alive - Unknown - 46-65",0.0136,0.012,"#d4e6e8","Equitable(p)",0.124,1362,1e+05 +"46","Alive - Unknown - 46-65","46-65","Alive - Unknown",0.0241,0.0218,"#d4e6e8","Equitable(p)",0.101,2410,1e+05 +"47","Alive - Unknown - 66-80 - Female","Female","Alive - Unknown - 
66-80",0.00724,0.00958,"#d58570","Highly Underrepresented",-0.282,724,1e+05 +"48","Alive - Unknown - 66-80 - Male","Male","Alive - Unknown - 66-80",0.00668,0.0113,"#d58570","Highly Underrepresented",-0.532,668,1e+05 +"49","Alive - Unknown - 66-80","66-80","Alive - Unknown",0.0139,0.0209,"#d58570","Highly Underrepresented",-0.414,1392,1e+05 +"50","Alive - Unknown - 81+ - Female","Female","Alive - Unknown - 81+",0.007,0.00697,"#d4e6e8","Equitable(p)",0.00481,700,1e+05 +"51","Alive - Unknown - 81+ - Male","Male","Alive - Unknown - 81+",0.00548,0.00501,"#d4e6e8","Equitable(p)",0.0907,548,1e+05 +"52","Alive - Unknown - 81+","81+","Alive - Unknown",0.0125,0.012,"#d4e6e8","Equitable(p)",0.0419,1248,1e+05 +"53","Alive - Unknown","Unknown","Alive",0.062,0.0687,"#eabcad","Underrepresented",-0.11,6200,1e+05 +"54","Alive - White - <=45 - Female","Female","Alive - White - <=45",0.0514,0.0485,"#d4e6e8","Equitable(p)",0.0609,5144,1e+05 +"55","Alive - White - <=45 - Male","Male","Alive - White - <=45",0.0494,0.0557,"#eabcad","Underrepresented",-0.126,4945,1e+05 +"56","Alive - White - <=45","<=45","Alive - White",0.101,0.104,"#d4e6e8","Equitable(p)",-0.0368,10089,1e+05 +"57","Alive - White - 46-65 - Female","Female","Alive - White - 46-65",0.104,0.0909,"#a5b0cb","Overrepresented",0.153,10440,1e+05 +"58","Alive - White - 46-65 - Male","Male","Alive - White - 46-65",0.135,0.119,"#a5b0cb","Overrepresented",0.143,13520,1e+05 +"59","Alive - White - 46-65","46-65","Alive - White",0.24,0.21,"#a5b0cb","Overrepresented",0.169,23960,1e+05 +"60","Alive - White - 66-80 - Female","Female","Alive - White - 66-80",0.0891,0.0809,"#a5b0cb","Overrepresented",0.106,8912,1e+05 +"61","Alive - White - 66-80 - Male","Male","Alive - White - 66-80",0.0849,0.0869,"#d4e6e8","Equitable(p)",-0.0258,8491,1e+05 +"62","Alive - White - 66-80","66-80","Alive - White",0.174,0.168,"#d4e6e8","Equitable(p)",0.0436,17403,1e+05 +"63","Alive - White - 81+ - Female","Female","Alive - White - 
81+",0.0814,0.0718,"#a5b0cb","Overrepresented",0.136,8140,1e+05 +"64","Alive - White - 81+ - Male","Male","Alive - White - 81+",0.0705,0.0509,"#00205b","Highly Overrepresented",0.345,7048,1e+05 +"65","Alive - White - 81+","81+","Alive - White",0.152,0.123,"#00205b","Highly Overrepresented",0.247,15188,1e+05 +"66","Alive - White","White","Alive",0.666,0.605,"#00205b","Highly Overrepresented",0.265,66640,1e+05 +"67","Died - Asian - <=45 - Female","Female","Died - Asian - <=45",0.00049,0.000145,"#00205b","Highly Overrepresented",1.22,49,1e+05 +"68","Died - Asian - <=45 - Male","Male","Died - Asian - <=45",0.00037,0.000218,"#00205b","Highly Overrepresented",0.531,37,1e+05 +"69","Died - Asian - <=45","<=45","Died - Asian",0.00086,0.000363,"#d4e6e8","Equitable(p)",0.863,86,1e+05 +"70","Died - Asian - 46-65 - Female","Female","Died - Asian - 46-65",0.00062,0.000435,"#d4e6e8","Equitable(p)",0.354,62,1e+05 +"71","Died - Asian - 46-65 - Male","Male","Died - Asian - 46-65",9e-04,0.000435,"#d4e6e8","Equitable(p)",0.727,90,1e+05 +"72","Died - Asian - 46-65","46-65","Died - Asian",0.00152,0.000871,"#d4e6e8","Equitable(p)",0.558,152,1e+05 +"73","Died - Asian - 66-80 - Female","Female","Died - Asian - 66-80",0.00056,0.000508,"#d4e6e8","Equitable(p)",0.0975,56,1e+05 +"74","Died - Asian - 66-80 - Male","Male","Died - Asian - 66-80",0.00051,0.000943,"#d4e6e8","Equitable(p)",-0.616,51,1e+05 +"75","Died - Asian - 66-80","66-80","Died - Asian",0.00107,0.00145,"#d4e6e8","Equitable(p)",-0.305,107,1e+05 +"76","Died - Asian - 81+ - Female","Female","Died - Asian - 81+",0.00069,0.000653,"#d4e6e8","Equitable(p)",0.055,69,1e+05 +"77","Died - Asian - 81+ - Male","Male","Died - Asian - 81+",0.00055,0.000943,"#d4e6e8","Equitable(p)",-0.54,55,1e+05 +"78","Died - Asian - 81+","81+","Died - Asian",0.00124,0.0016,"#d4e6e8","Equitable(p)",-0.253,124,1e+05 +"79","Died - Asian","Asian","Died",0.00469,0.00428,"#d4e6e8","Equitable(p)",0.0915,469,1e+05 +"80","Died - Black - <=45 - Female","Female","Died - 
Black - <=45",0.00044,0.000508,"#d4e6e8","Equitable(p)",-0.144,44,1e+05 +"81","Died - Black - <=45 - Male","Male","Died - Black - <=45",0.00036,0.000653,"#d4e6e8","Equitable(p)",-0.596,36,1e+05 +"82","Died - Black - <=45","<=45","Died - Black",8e-04,0.00116,"#d4e6e8","Equitable(p)",-0.373,80,1e+05 +"83","Died - Black - 46-65 - Female","Female","Died - Black - 46-65",0.00099,0.00138,"#d4e6e8","Equitable(p)",-0.332,99,1e+05 +"84","Died - Black - 46-65 - Male","Male","Died - Black - 46-65",0.00142,0.00174,"#d4e6e8","Equitable(p)",-0.205,142,1e+05 +"85","Died - Black - 46-65","46-65","Died - Black",0.00241,0.00312,"#d4e6e8","Equitable(p)",-0.259,241,1e+05 +"86","Died - Black - 66-80 - Female","Female","Died - Black - 66-80",0.0011,0.00232,"#d58570","Highly Underrepresented",-0.748,110,1e+05 +"87","Died - Black - 66-80 - Male","Male","Died - Black - 66-80",0.00116,0.000943,"#d4e6e8","Equitable(p)",0.207,116,1e+05 +"88","Died - Black - 66-80","66-80","Died - Black",0.00226,0.00327,"#d58570","Highly Underrepresented",-0.369,226,1e+05 +"89","Died - Black - 81+ - Female","Female","Died - Black - 81+",0.00122,0.00138,"#d4e6e8","Equitable(p)",-0.123,122,1e+05 +"90","Died - Black - 81+ - Male","Male","Died - Black - 81+",0.00092,0.000871,"#d4e6e8","Equitable(p)",0.055,92,1e+05 +"91","Died - Black - 81+","81+","Died - Black",0.00214,0.00225,"#d4e6e8","Equitable(p)",-0.0501,214,1e+05 +"92","Died - Black","Black","Died",0.00761,0.0098,"#d58570","Highly Underrepresented",-0.255,761,1e+05 +"93","Died","Died","",0.125,0.153,"#d58570","Highly Underrepresented",-0.232,12532,1e+05 +"94","Died - Other - <=45 - Female","Female","Died - Other - <=45",0.00068,0.000363,"#d4e6e8","Equitable(p)",0.628,68,1e+05 +"95","Died - Other - <=45 - Male","Male","Died - Other - <=45",0.00057,0.000508,"#d4e6e8","Equitable(p)",0.115,57,1e+05 +"96","Died - Other - <=45","<=45","Died - Other",0.00125,0.000871,"#d4e6e8","Equitable(p)",0.362,125,1e+05 +"97","Died - Other - 46-65 - Female","Female","Died - 
Other - 46-65",0.00128,0.000871,"#d4e6e8","Equitable(p)",0.386,128,1e+05 +"98","Died - Other - 46-65 - Male","Male","Died - Other - 46-65",0.00138,0.00181,"#d4e6e8","Equitable(p)",-0.274,138,1e+05 +"99","Died - Other - 46-65","46-65","Died - Other",0.00266,0.00269,"#d4e6e8","Equitable(p)",-0.0094,266,1e+05 +"100","Died - Other - 66-80 - Female","Female","Died - Other - 66-80",0.00144,0.00123,"#d4e6e8","Equitable(p)",0.155,144,1e+05 +"101","Died - Other - 66-80 - Male","Male","Died - Other - 66-80",0.00117,0.00123,"#d4e6e8","Equitable(p)",-0.0531,117,1e+05 +"102","Died - Other - 66-80","66-80","Died - Other",0.00261,0.00247,"#d4e6e8","Equitable(p)",0.0564,261,1e+05 +"103","Died - Other - 81+ - Female","Female","Died - Other - 81+",0.00117,0.000435,"#00205b","Highly Overrepresented",0.989,117,1e+05 +"104","Died - Other - 81+ - Male","Male","Died - Other - 81+",0.00104,0.000726,"#d4e6e8","Equitable(p)",0.36,104,1e+05 +"105","Died - Other - 81+","81+","Died - Other",0.00221,0.00116,"#00205b","Highly Overrepresented",0.645,221,1e+05 +"106","Died - Other","Other","Died",0.00873,0.00718,"#d4e6e8","Equitable(p)",0.196,873,1e+05 +"107","Died - Unknown - <=45 - Female","Female","Died - Unknown - <=45",0.00058,0.00109,"#d4e6e8","Equitable(p)",-0.63,58,1e+05 +"108","Died - Unknown - <=45 - Male","Male","Died - Unknown - <=45",0.00039,0.00116,"#d58570","Highly Underrepresented",-1.09,39,1e+05 +"109","Died - Unknown - <=45","<=45","Died - Unknown",0.00097,0.00225,"#d58570","Highly Underrepresented",-0.843,97,1e+05 +"110","Died - Unknown - 46-65 - Female","Female","Died - Unknown - 46-65",0.00126,0.00269,"#d58570","Highly Underrepresented",-0.758,126,1e+05 +"111","Died - Unknown - 46-65 - Male","Male","Died - Unknown - 46-65",0.00166,0.0045,"#d58570","Highly Underrepresented",-1,166,1e+05 +"112","Died - Unknown - 46-65","46-65","Died - Unknown",0.00292,0.00718,"#d58570","Highly Underrepresented",-0.905,292,1e+05 +"113","Died - Unknown - 66-80 - Female","Female","Died - Unknown - 
66-80",0.0014,0.00457,"#d58570","Highly Underrepresented",-1.19,140,1e+05 +"114","Died - Unknown - 66-80 - Male","Male","Died - Unknown - 66-80",0.00131,0.00435,"#d58570","Highly Underrepresented",-1.2,131,1e+05 +"115","Died - Unknown - 66-80","66-80","Died - Unknown",0.00271,0.00893,"#d58570","Highly Underrepresented",-1.2,271,1e+05 +"116","Died - Unknown - 81+ - Female","Female","Died - Unknown - 81+",0.0012,0.00327,"#d58570","Highly Underrepresented",-1,120,1e+05 +"117","Died - Unknown - 81+ - Male","Male","Died - Unknown - 81+",0.00102,0.00276,"#d58570","Highly Underrepresented",-0.996,102,1e+05 +"118","Died - Unknown - 81+","81+","Died - Unknown",0.00222,0.00602,"#d58570","Highly Underrepresented",-1,222,1e+05 +"119","Died - Unknown","Unknown","Died",0.00882,0.0244,"#d58570","Highly Underrepresented",-1.03,882,1e+05 +"120","Died - White - <=45 - Female","Female","Died - White - <=45",0.00522,0.00247,"#00205b","Highly Overrepresented",0.752,522,1e+05 +"121","Died - White - <=45 - Male","Male","Died - White - <=45",0.00522,0.00363,"#00205b","Highly Overrepresented",0.365,522,1e+05 +"122","Died - White - <=45","<=45","Died - White",0.0104,0.0061,"#00205b","Highly Overrepresented",0.542,1044,1e+05 +"123","Died - White - 46-65 - Female","Female","Died - White - 46-65",0.0111,0.0112,"#d4e6e8","Equitable(p)",-0.00596,1111,1e+05 +"124","Died - White - 46-65 - Male","Male","Died - White - 46-65",0.0152,0.0155,"#d4e6e8","Equitable(p)",-0.015,1523,1e+05 +"125","Died - White - 46-65","46-65","Died - White",0.0263,0.0266,"#d4e6e8","Equitable(p)",-0.0114,2634,1e+05 +"126","Died - White - 66-80 - Female","Female","Died - White - 66-80",0.0157,0.016,"#d4e6e8","Equitable(p)",-0.019,1567,1e+05 +"127","Died - White - 66-80 - Male","Male","Died - White - 66-80",0.0146,0.0182,"#eabcad","Underrepresented",-0.222,1464,1e+05 +"128","Died - White - 66-80","66-80","Died - White",0.0303,0.0342,"#eabcad","Underrepresented",-0.124,3031,1e+05 +"129","Died - White - 81+ - 
Female","Female","Died - White - 81+",0.0148,0.0229,"#d58570","Highly Underrepresented",-0.442,1481,1e+05 +"130","Died - White - 81+ - Male","Male","Died - White - 81+",0.0136,0.0176,"#d58570","Highly Underrepresented",-0.262,1357,1e+05 +"131","Died - White - 81+","81+","Died - White",0.0284,0.0404,"#d58570","Highly Underrepresented",-0.366,2838,1e+05 +"132","Died - White","White","Died",0.0955,0.107,"#eabcad","Underrepresented",-0.13,9547,1e+05 diff --git a/scripts/Equity_metrics.R b/scripts/Equity_metrics.R new file mode 100644 index 0000000..e549f34 --- /dev/null +++ b/scripts/Equity_metrics.R @@ -0,0 +1,161 @@ +#convert numbers to ratios +Rate_Calculation<-function(subgroup_n,total_n){ + if(total_n == 0){ + return(0) + } + return(subgroup_n/total_n) +} + +# Equity Metric derived from Disparate Impact +Log_Disparate_Impact<-function (alpha_di, beta_di, for_plot = FALSE){ + if (!for_plot){ + if (beta_di == 0 & alpha_di == 0){ + return (-9999999) + } + else if (beta_di == 0){ + return (-Inf) + } + else if (alpha_di == 0){ + return (-8888888) + } + + } + + left_di <- beta_di/(1-beta_di) + right_di <- alpha_di/(1-alpha_di) + result_di <- log(left_di)-log(right_di) + return (result_di) +} + +# Significance test (two sided) +compare_population_proportion_2<-function(Xb,Nb,Xa,Na,conf_val){ + if (Xa == 0 & Xb == 0){ + return (-9999999) + } + else if (Xa == 0){ + return (-Inf) + } + else if (Xb == 0){ + return (-8888888) + } + + Xa_not<- Na-Xa + Xb_not<- Nb-Xb + if (Xa>=5 & Xa_not>=5 & Xb>=5 & Xb_not>=5){ + p_value<-prop.test(x = c(Xa, Xb), n = c(Na, Nb), + alternative = c("two.sided"), + conf.level = conf_val, + correct = FALSE)$p.value + } + else{ + p_value<- -7777777 + #p_value<-fisher.test(dat.xtabs,alternative = c("two.sided"), + #conf.level = conf_val)$p.value + } + + return(p_value) +} + +# transform the significance number into text +whether_significant<-function (p_num, threshold){ + if (p_num == -Inf){ + return ("Absent") + } + else if(p_num == -9999999){ + 
return ("No Info") + } + else if(p_num == -8888888){ + return ("No Base Data") + } + else if (p_numsignificance_threshold){ + return("Equitable(p)") + } + + if (value == -Inf){ + return ("Absent") + } + else if(value == -9999999){ + return ("No Info") + } + else if(value == -8888888){ + return ("No Base Data") + } + else if(value == -7777777){ + return ("Insufficient Data") + } + else if (value < -threshold) + { + if (value < -neg_break){ + return ("Highly Underrepresented") + } + else{ + return ("Underrepresented") + } + } + else if(value > threshold) { + if (value > pos_break){ + return ("Highly Overrepresented") + } + else{ + return ("Overrepresented") + } + } + else{ + return("Equitable") + } + +} +# assign the colors to different equity values +add_colors<-function(value, significance_value,significance_threshold, threshold, neg_break, pos_break){ + if (significance_value>significance_threshold){ + return("#d4e6e8") + } + + if (value == -Inf){ + return ("#ab2328") + } + else if(value == -9999999){ + return ("#000000") + } + else if(value == -8888888){ + return ("#54585a") + } + else if(value == -7777777){ + return ("#000000") + } + else if (value < -threshold) + { + if (value < -neg_break){ + return ("#d58570") + } + else{ + return ("#eabcad") + } + } + else if(value > threshold) { + if (value > pos_break){ + return ("#00205b") + } + else{ + return ("#a5b0cb") + } + } + else{ + return("#d4e6e8") + } +} \ No newline at end of file diff --git a/scripts/metrics.py b/scripts/metrics.py new file mode 100644 index 0000000..1173b35 --- /dev/null +++ b/scripts/metrics.py @@ -0,0 +1,33 @@ +import numpy as np + + +def pcc(y_true, y_pred): + """ + Returns Pearson's Correlation Coefficient after + changing it to the range 0 to 1 + """ + + # Find Pearson's correlation coefficient + pearson_value = (np.cov(y_true, y_pred, bias=True)[0,1]) / (np.std(y_true) * np.std(y_pred)) + + # Convert to 0 to 1 + pearson_value = (pearson_value + 1)/2 + + # Return + return pearson_value + 
+ +def ds(y_true, y_pred): + """ + Returns Directional Symmetry + """ + + # Find Directional symmetry + n = len(y_true) + d_k = 0 + for i in range(1,n): + d_k += int(((y_true[i] - y_true[i-1])*(y_pred[i] - y_pred[i-1])) >= 0) + res = (100.0*d_k)/(n-1) + + # Return + return res \ No newline at end of file diff --git a/scripts/sunburst_process.R b/scripts/sunburst_process.R new file mode 100644 index 0000000..dc7aa27 --- /dev/null +++ b/scripts/sunburst_process.R @@ -0,0 +1,137 @@ +generate_sunburst_df<-function(background_df,variables_list,user_data_analysis,sig_t,tol_1, tol_2,equity_metric_selected, whether_NMI){ + + num_variables_df<-length(variables_list) + background_info_df<- background_df %>% + group_by(.dots = variables_list[1:num_variables_df]) %>% + summarise(background_n = sum(background_n)) + background_info_df<-na.omit(background_info_df) + total_population_background<- sum(background_info_df$background_n) + background_info_df$total_background<-rep(total_population_background, nrow(background_info_df)) + + user_info_df<- user_data_analysis %>% + group_by(.dots = variables_list[1:num_variables_df]) %>% + summarise(user_n = sum(user_n)) + user_info_df<-na.omit(user_info_df) + total_population_user<- sum(user_info_df$user_n) + user_info_df$total_user<-rep(total_population_user, nrow(user_info_df)) + + participant_rate<-total_population_user/total_population_background + + index_max<-num_variables_df-1 + + if (index_max>0){ + + for (var_index in index_max:1){ + background_var<-background_df %>% + group_by(.dots = variables_list[1:var_index]) %>% + summarise(background_n = sum(background_n)) + background_var<-na.omit(background_var) + + user_info_df_var<-user_data_analysis%>% + group_by(.dots = variables_list[1:var_index])%>% + summarise(user_n = sum(user_n)) + user_info_df_var<-na.omit(user_info_df_var) + + total_population_user_var<-sum(as.integer(user_info_df_var$user_n)) + total_population_background_var<-sum(as.integer(background_var$background_n)) + + 
background_var$total_background<-rep(total_population_background_var, nrow(background_var)) + user_info_df_var$total_user<-rep(total_population_user_var, nrow(user_info_df_var)) + + background_info_df<-plyr::rbind.fill(background_info_df, background_var) + user_info_df<-plyr::rbind.fill(user_info_df, user_info_df_var) + } + + } + + + #combine user input data with the background info + merged_df<-merge(background_info_df, user_info_df, by=1:(ncol(background_info_df)-2), all=TRUE) + merged_df$user_n<-merged_df$user_n %>% replace_na(0) + merged_df$background_n<-merged_df$background_n %>% replace_na(0) + + merged_df$total_user<-merged_df$total_user %>% replace_na(0) + merged_df$total_background<-merged_df$total_background %>% replace_na(0) + + #calculate the observed & background rates + merged_df$Observed_Rate<- mapply(Rate_Calculation,as.integer(merged_df$user_n),as.integer(merged_df$total_user)) + merged_df$Background_Rate<- mapply(Rate_Calculation,as.integer(merged_df$background_n),as.integer(merged_df$total_background)) + + merged_df$pValue<- mapply(compare_population_proportion_2,as.integer(merged_df$background_n),as.integer(merged_df$total_background),as.integer(merged_df$user_n),as.integer(merged_df$total_user),rep(1-sig_t,nrow(merged_df))) + merged_df$BH_p<- p.adjust(merged_df$pValue,method = "BH") + + if (whether_NMI){ + merged_df$EquityValue<- mapply(equity_metric_selected,as.numeric(merged_df$Background_Rate),as.numeric(merged_df$Observed_Rate),as.numeric(participant_rate), for_plot = FALSE) + merged_df$EquityLable<-mapply(whether_biased_label_NMI, as.numeric(merged_df$EquityValue),as.numeric(merged_df$BH_p),sig_t,tol_1,tol_2) + merged_df$EquityColors<-mapply(add_colors_NMI, as.numeric(merged_df$EquityValue),as.numeric(merged_df$BH_p),sig_t,tol_1,tol_2) + } + else{ + merged_df$EquityValue<- mapply(equity_metric_selected,as.numeric(merged_df$Background_Rate),as.numeric(merged_df$Observed_Rate), for_plot = FALSE) + 
merged_df$EquityLable<-mapply(whether_biased_label, as.numeric(merged_df$EquityValue),as.numeric(merged_df$BH_p),sig_t,tol_1,tol_2,tol_2) + merged_df$EquityColors<-mapply(add_colors, as.numeric(merged_df$EquityValue),as.numeric(merged_df$BH_p),sig_t,tol_1,tol_2,tol_2) + } + merged_df$na_count <- apply(merged_df[,1:num_variables_df,drop=F], 1, function(x) sum(is.na(x))) + + num_groups<-nrow(merged_df) + + new_df<- data.frame(ids = character(num_groups), + labels = character(num_groups), + parents = character(num_groups), + Observed_Rate = merged_df$Observed_Rate, + Ideal_Rate = merged_df$Background_Rate, + EquityColors=merged_df$EquityColors, + EquityLable=merged_df$EquityLable, + EquityValue=merged_df$EquityValue, + Observed_Number=merged_df$user_n, + Trial_Number = merged_df$total_user, + stringsAsFactors=FALSE) + + for (group_index in 1:num_groups){ + level<-num_variables_df-merged_df[group_index,'na_count'] + if (level>1){ + row_list <- unlist(merged_df[group_index,1:level]) + id_name<- paste(row_list,collapse=" - ") + label_name<- as.character(row_list[level]) + parent_name<-paste(row_list[1:(level-1)],collapse=" - ") + } + else{ + id_name<- as.character(merged_df[group_index,1]) + label_name<- id_name + parent_name<-"" + } + + new_df[group_index,'ids']<-id_name + new_df[group_index,'labels']<-label_name + new_df[group_index,'parents']<-parent_name + + } + + new_df$EquityValue<- mapply(signif, new_df$EquityValue,digits = 3) + new_df$Observed_Rate<- mapply(signif, new_df$Observed_Rate,digits = 3) + new_df$Ideal_Rate<- mapply(signif, new_df$Ideal_Rate,digits = 3) + + return(new_df) +} + +generate_sunburst_plotly3<-function(new_df){ + plot_ly(data= new_df)%>% add_trace( + type='sunburst', + ids=new_df$ids, + labels=new_df$labels, + parents=new_df$parents, + leaf = list(opacity = 1), + marker = list(colors = new_df$EquityColors), + #textinfo = "current path+label", + maxdepth =-1, + insidetextorientation='radial', + #hoverinfo = "current path+label+text", + 
hovertext = ~paste(new_df$EquityLable,'
','Ideal Rate:', formattable(new_df$Ideal_Rate, digits = 3, format = "f"),'
','Observed Rate:', formattable(new_df$Observed_Rate, digits = 3, format = "f"),'
','No. of Participants:', new_df$Observed_Number), + insidetextfont = 2) + +} + +test_sunburst2<-function(given_background,varList,given_rct,write_filename, sig_t, lower_t, upper_t){ + df_processed<-generate_sunburst_df(given_background,as.list(varList),given_rct,sig_t,lower_t,upper_t,Log_Disparate_Impact, FALSE) + write.csv(x= df_processed, file= write_filename) + generate_sunburst_plotly3(df_processed) +} \ No newline at end of file diff --git a/scripts/table_process.R b/scripts/table_process.R new file mode 100644 index 0000000..0a91d4d --- /dev/null +++ b/scripts/table_process.R @@ -0,0 +1,184 @@ +colorpicker <- function(z,cut1 = log(0.8),cut2=log(0.9),cut3=-log(0.9),cut4=-log(0.8)){ + if(is.na(z)){return("white")} + else if(z>cut4){return("white")} + else if(z == -Inf){return("white")} + else {return("black")} +} + +bgpicker <- function(z,cut1 = log(0.8),cut2=log(0.9),cut3=-log(0.9),cut4=-log(0.8)){ + if(is.na(z)){return("black")} + else if(z == -Inf){return("#ab2328")} + else if(z <= cut1){return("#d58570")} + else if( z > cut1 & z <= cut2){return("#eabcad")} + else if( z > cut2 & z <= cut3){return("#d4e6e8")} + else if( z > cut3 & z <= cut4){return("#a5b0cb")} + else if (z>cut4){return("#00205b")} +} + + + + +make_table_combine<-function(background_info,RCT_info,x, sig_t, cutoff1,cutoff2, selected_metric) { + user_colnames<-colnames(RCT_info) + + nonlab_factors<- user_colnames[!user_colnames%in% c("user_n")] + num_nonlab<- length(nonlab_factors) + + background_info_df<-background_info %>% + group_by(.dots = nonlab_factors[1]) %>% + summarise(background_n = sum(background_n)) + names(background_info_df)[1] <- "Group_Name" + background_info_df<-na.omit(background_info_df) + total_population_background<-sum(as.integer(background_info_df$background_n)) + background_info_df$total_background<-rep(total_population_background, nrow(background_info_df)) + + user_info_df<-RCT_info%>% + group_by(.dots = nonlab_factors[1]) %>% + summarise(user_n = sum(user_n)) + 
names(user_info_df)[1] <- "Group_Name" + user_info_df<-na.omit(user_info_df) + total_population_user<-sum(as.integer(user_info_df$user_n)) + user_info_df$total_user<-rep(total_population_user, nrow(user_info_df)) + + for (i in 2:num_nonlab){ + background_info_df_var<-background_info %>% + group_by(.dots = nonlab_factors[i]) %>% + summarise(background_n = sum(background_n)) + names(background_info_df_var)[1] <- "Group_Name" + background_info_df_var<-na.omit(background_info_df_var) + total_population_background_var<-sum(as.integer(background_info_df_var$background_n)) + background_info_df_var$total_background<-rep(total_population_background_var, nrow(background_info_df_var)) + + user_info_df_var<-RCT_info%>% + group_by(.dots = nonlab_factors[i]) %>% + summarise(user_n = sum(user_n)) + names(user_info_df_var)[1] <- "Group_Name" + user_info_df_var<-na.omit(user_info_df_var) + total_population_user_var<-sum(as.integer(user_info_df_var$user_n)) + user_info_df_var$total_user<-rep(total_population_user, nrow(user_info_df_var)) + background_info_df<-plyr::rbind.fill(background_info_df, background_info_df_var) + user_info_df<-plyr::rbind.fill(user_info_df, user_info_df_var) + } + + + merged_df<-merge(background_info_df, user_info_df, by="Group_Name", all=TRUE) + + + merged_df$user_n<-merged_df$user_n %>% replace_na(0) + merged_df$background_n<-merged_df$background_n %>% replace_na(0) + + merged_df$total_user<-merged_df$total_user %>% replace_na(0) + merged_df$total_background<-merged_df$total_background %>% replace_na(0) + + + merged_df$participant_rate<- mapply(Rate_Calculation,as.integer(merged_df$total_user),as.integer(merged_df$total_background)) + + #calculate the observed & background rates + merged_df$Observed_Rate<- mapply(Rate_Calculation,as.integer(merged_df$user_n),as.integer(merged_df$total_user)) + merged_df$Background_Rate<- mapply(Rate_Calculation,as.integer(merged_df$background_n),as.integer(merged_df$total_background)) + + merged_df$pValue<- 
mapply(compare_population_proportion_2,as.integer(merged_df$background_n),as.integer(merged_df$total_background),as.integer(merged_df$user_n),as.integer(merged_df$total_user),rep(1-sig_t,nrow(merged_df))) + merged_df$BH_p<- mapply(p.adjust,merged_df$pValue,method = "BH") + merged_df$whether_significant<- mapply(whether_significant,as.numeric(merged_df$BH_p), sig_t) + + if (selected_metric == "LDI"){ + merged_df$EquityValue<- mapply(Log_Disparate_Impact,as.numeric(merged_df$Background_Rate),as.numeric(merged_df$Observed_Rate), for_plot = FALSE) + merged_df$EquityLable<-mapply(whether_biased_label, as.numeric(merged_df$EquityValue),as.numeric(merged_df$BH_p),sig_t,cutoff1,cutoff2,cutoff2) + } + + merged_df<-merged_df %>%mutate(Group_Name = factor(Group_Name, levels = x)) %>%arrange(Group_Name) + + + new_df_univariable<- data.frame(Group_Name = merged_df$Group_Name, + BH_p = merged_df$BH_p, + Significant_Level = merged_df$whether_significant, + Equity_Value= merged_df$EquityValue, + Equity_Level= merged_df$EquityLable, + stringsAsFactors=FALSE) + + new_df_univariable<-new_df_univariable%>% mutate_if(is.numeric, round, digits=5) + + return(new_df_univariable) + +} + +preprocess_comparison_df<-function(sig_t, cut1,cut2, selected_metrics, file_type){ + ATUS_attributes<-c("Female", "Male","15-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+") + MIMIC_attributes<-c("Female", "Male","<=45", "46-65", "66-80","81+","White","Black","Unknown","Asian","Other","Alive","Died") + MIMIC_attributes2<-c("Female", "Male","\u2264 45", "46-65", "66-80","81+","White","Black","Unknown","Asian","Other","Alive","Died") + + t1<-cut1 + t2<-cut2 + + + if (file_type == "ATUS"){ + df_comparison_studies<-make_table_combine(ATUSreference_processed, ATUSsynthetic_processed, ATUS_attributes,sig_t,t1,t2,selected_metrics) + colnames(df_comparison_studies)[2:5] <- paste(colnames(df_comparison_studies)[2:5],"(1)", sep = "") + } + else if (file_type == "MIMIC"){ + 
df_comparison_studies<-make_table_combine(MIMICRacereference_processed, MIMICRacesynthetic_processed,MIMIC_attributes,sig_t,t1,t2,selected_metrics) + colnames(df_comparison_studies)[2:5] <- paste(colnames(df_comparison_studies)[2:5],"(1)", sep = "") + } + + if (file_type == "ATUS"){ + df_comparison_studies_new<-df_comparison_studies %>%mutate(Group_Name = factor(Group_Name, levels = ATUS_attributes)) %>%arrange(Group_Name) + df_comparison_studies_new$Group_Name<-ATUS_attributes + } + else if (file_type == "MIMIC"){ + df_comparison_studies_new<-df_comparison_studies %>%mutate(Group_Name = factor(Group_Name, levels = MIMIC_attributes)) %>%arrange(Group_Name) + df_comparison_studies_new$Group_Name<-MIMIC_attributes2 + } + df_comparison_studies_new<-df_comparison_studies_new[c(1,seq(4,ncol(df_comparison_studies_new),by=4))] + return(df_comparison_studies_new) +} + + +generate_table <- function(sig_t, lower_t, upper_t, metric_name,file_name){ + df_demo_comparison<-preprocess_comparison_df(sig_t,lower_t, upper_t, metric_name,file_name) + colnames(df_demo_comparison)<-c("Characteristics","Equity") + + rownames(df_demo_comparison) <- NULL + df_demo_comparison[,2:ncol(df_demo_comparison)] <- sapply(df_demo_comparison[,2:ncol(df_demo_comparison)],as.numeric) + + if (file_name == "ATUS"){ + demo_result<-format_table (df_demo_comparison, + align =c("l","c","c","c"), + lapply(2:ncol(df_demo_comparison), function(col) { + area(row=1:nrow(df_demo_comparison), col) ~ formatter("span", + style = x ~ style(display = "block", + "border-radius" = "4px", + "padding-right" = "4px", + color = sapply(x,colorpicker), + "background-color" = sapply(x,bgpicker)), + x ~ sprintf("%.3f", x)) + } + + ))%>% + kable_styling("striped", full_width = TRUE,fixed_thead = TRUE) %>% + pack_rows("Gender", 1,2) %>% + pack_rows("Age group (years)", 3,9) + } + else if (file_name == "MIMIC"){ + demo_result<-format_table (df_demo_comparison, + align =c("l","c","c","c"), + lapply(2:ncol(df_demo_comparison), 
function(col) { + area(row=1:nrow(df_demo_comparison), col) ~ formatter("span", + style = x ~ style(display = "block", + "border-radius" = "4px", + "padding-right" = "4px", + color = sapply(x,colorpicker), + "background-color" = sapply(x,bgpicker)), + x ~ sprintf("%.3f", x)) + } + + ))%>% + kable_styling("striped", full_width = TRUE,fixed_thead = TRUE) %>% + pack_rows("Gender", 1,2) %>% + pack_rows("Age group (years)",3,6)%>% + pack_rows("Race/Ethnicity",7,11) %>% + pack_rows("Mortality", 12,13) + } + + gt::html(demo_result) + +}