Analysis 3a: Real World Datasets Audio Data

Imports, Functs and Paths

from scipy.stats import pointbiserialr
from myst_nb import glue
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# plot styles
from sciplotlib import style
from leakconfound.analyses.load import gather_data, data_to_long, models_renamer
from leakconfound.plotting import custom_bar_rope_plot
from leakconfound.analyses.utils import save_paper_val

import matplotlib as mpl

import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

base_save_paper = "./paper_val/"
results_base = "../../results/"
base_dir = "../../"
audio_data_non_TaCo_folder = f"{results_base}basic_non_TaCo/realistic_non_TaCo/"
audio_data_TaCo_folder = f"{results_base}basic_TaCo/realistic/"
audio_data_robust_non_TaCo_folder = (
    f"{results_base}robust_cr_non_TaCo/realistic_non_TaCo/"
)
audio_data_shuffled_TaCo_folder = f"{results_base}shuffled_features_TaCo/realistic/"
audio_data_shuffled_non_TaCo_folder = (
    f"{results_base}" "shuffled_features_non_TaCo/realistic_non_TaCo/"
)

colors = [
    "#E64B35",
    "#4DBBD5",
    "#00A087",
    "#3C5488",
    "#F39B7F",
    "#8491B4",
    "#91D1C2FF",
    "#DC0000",
    "#7E6148",
    "#B09C85",
]
red = colors[0]
blue = colors[1]
green = colors[2]
purple = colors[5]


def mm_to_inch(val_in_inch):
    mm = 0.1 / 2.54
    return val_in_inch * mm


mpl.style.use(style.get_style("nature-reviews"))
mpl.rc("xtick", labelsize=11)
mpl.rc("ytick", labelsize=11)
mpl.rc("axes", labelsize=12, titlesize=12)
mpl.rc("figure", dpi=300)
mpl.rc("figure.subplot", wspace=mm_to_inch(8), hspace=0.7)
mpl.rc("lines", linewidth=1, markersize=2)

fig = plt.figure(
    figsize=[mm_to_inch(183), mm_to_inch(140)],
)
<Figure size 2161.42x1653.54 with 0 Axes>
def correct_fold_repeat(df):
    df.repeat = df.fold // 5
    df.fold = df.fold % 5
    return df


def convert_hat_scores(df):
    _df = df.copy()
    return (
        _df.assign(is_X_hat=lambda df: df.score_name.str.contains("X_hat"))
        .assign(
            confound=lambda df: df.apply(
                lambda row: "X hat" if row["is_X_hat"] else row["confound"], axis=1
            )
        )
        .assign(score_name=lambda df: df.score_name.str.replace("X_hat_", ""))
    )


def correct_columns(df):
    return correct_fold_repeat(convert_hat_scores(df))

Some info on the data:

data = pd.read_csv("../../data/realistic/audio_data/audio_data.csv")
bdi_median = data.BDI__continuous_confound.median()
bdi = data.BDI__continuous_confound.apply(lambda x: x > bdi_median)
adhd = data.ATT_Task__binary_target

print(pd.crosstab(adhd, bdi))

corr, p = pointbiserialr(bdi, adhd)

save_paper_val(
    base_save_paper,
    "real_world",
    "relation_BDI_ADHD",
    "p_pointbiserial",
    p,
    precision=3,
)

save_paper_val(
    base_save_paper, "real_world", "relation_BDI_ADHD", "r_pointbiserial", corr
)
print(f"pointbiserialr, r={corr:.2},p={p:.2}")
BDI__continuous_confound  False  True 
ATT_Task__binary_target               
0.0                          55      8
1.0                          17     46
pointbiserialr, r=0.61,p=3.7e-14

Performance TaCo with Original Data

use_models = ["Linear/Logistic", "Decision Tree", "Random Forest"]
arguments_TaCo = ["data", "model_name", "is_deconfounded", "random_seed"]

score_names = [
    "test_roc_auc",
    "test_X_hat_roc_auc",
]
df_long_TaCo = (
    gather_data(audio_data_TaCo_folder, arguments_TaCo)
    .assign(model_name=lambda df: df.model_name.map(models_renamer))
    .query('(data.str.startswith("audio"))')
    .pipe(data_to_long, analysis_arguments=arguments_TaCo, score_names=score_names)
    .pipe(correct_columns)
)
Mrf_rem = (
    df_long_TaCo.query('(confound == "removed") & (model_name=="Random Forest")')
    .groupby("repeat")
    .mean()
    .score
)
Mrf_no_rem = (
    df_long_TaCo.query('(confound == "not removed") & (model_name=="Random Forest")')
    .groupby("repeat")
    .mean()
    .score
)
print(
    f"Performance no removal using RF M={Mrf_no_rem.mean():.2}, sd={Mrf_no_rem.std():.2}"
)
print(
    f"Performance after TaCo removal using RF M={Mrf_rem.mean():.2}, sd={Mrf_rem.std():.2}"
)

save_paper_val(
    base_save_paper, "real_world", "raw", "mean_audio_TaCo.txt", Mrf_no_rem.mean()
)

save_paper_val(
    base_save_paper, "real_world", "raw", "std_audio_TaCo.txt", Mrf_no_rem.std()
)

save_paper_val(
    base_save_paper, "real_world", "removed", "mean_audio_TaCo.txt", Mrf_rem.mean()
)

save_paper_val(
    base_save_paper, "real_world", "removed", "std_audio_TaCo.txt", Mrf_rem.std()
)
Performance no removal using RF M=0.71, sd=0.017
Performance after TaCo removal using RF M=0.81, sd=0.017

Performance TaCo with Shuffled Data

df_long_shuffled_TaCo = (
    gather_data(audio_data_shuffled_TaCo_folder, arguments_TaCo)
    .assign(model_name=lambda df: df.model_name.map(models_renamer))
    .query('(data.str.startswith("audio"))')
    .pipe(data_to_long, analysis_arguments=arguments_TaCo, score_names=score_names)
    .pipe(correct_columns)
)
Mrf_shuffled_rem = (
    df_long_shuffled_TaCo.query(
        '(confound == "removed") & (model_name=="Random Forest")'
    )
    .groupby("repeat")
    .mean()
    .score
)
Mrf_shuffled_no_rem = (
    df_long_shuffled_TaCo.query(
        '(confound == "not removed") & (model_name=="Random Forest")'
    )
    .groupby("repeat")
    .mean()
    .score
)
print("shuffled Results:")
print(
    f"Performance no removal using RF M={Mrf_shuffled_no_rem.mean():.2},"
    f" sd={Mrf_shuffled_no_rem.std():.2}"
)
print(
    f"Performance after TaCo removal using RF M={Mrf_shuffled_rem.mean():.2},"
    f" sd={Mrf_shuffled_rem.std():.2}"
)

save_paper_val(
    base_save_paper,
    "real_world",
    "raw",
    "mean_audio_TaCo_shuffled.txt",
    Mrf_shuffled_no_rem.mean(),
)

save_paper_val(
    base_save_paper,
    "real_world",
    "raw",
    "std_audio_TaCo_shuffled.txt",
    Mrf_shuffled_no_rem.std(),
)

save_paper_val(
    base_save_paper,
    "real_world",
    "removed",
    "mean_audio_TaCo_shuffled.txt",
    Mrf_shuffled_rem.mean(),
)

save_paper_val(
    base_save_paper,
    "real_world",
    "removed",
    "std_audio_TaCo_shuffled.txt",
    Mrf_shuffled_rem.std(),
)
shuffled Results:
Performance no removal using RF M=0.48, sd=0.04
Performance after TaCo removal using RF M=0.77, sd=0.02
arguments = ["data", "model_name", "is_deconfounded", "random_seed", "confounds_used"]
confounds_used_pattern = r"confounds=\[(.+?)\]"
df_long_non_TaCo = (
    gather_data(audio_data_non_TaCo_folder, arguments)
    .assign(
        model_name=lambda df: df.model_name.map(models_renamer),
    )
    .query('(data.str.startswith("audio"))')
    .pipe(data_to_long, analysis_arguments=arguments, score_names=score_names)
    .pipe(correct_columns)
)

df_long_shuffled_non_TaCo = (
    gather_data(audio_data_shuffled_non_TaCo_folder, arguments)
    .assign(
        model_name=lambda df: df.model_name.map(models_renamer),
    )
    .query('(data.str.startswith("audio"))')
    .pipe(data_to_long, analysis_arguments=arguments, score_names=score_names)
    .pipe(correct_columns)
)

Performance for each Confound Separated

print("Performance using RF ")
print("_" * 40)
print()
for confounds_used in df_long_non_TaCo.confounds_used.unique():
    Mrf_rem = (
        df_long_non_TaCo.query(
            "(confounds_used == @confounds_used) & "
            '(model_name == "Random Forest") &'
            '(confound == "removed")'
        )
        .groupby("repeat")
        .score.mean()
    )

    Mrf_no_rem = (
        df_long_non_TaCo.query(
            "(confounds_used == @confounds_used) & "
            '(model_name == "Random Forest") &'
            '(confound == "not removed")'
        )
        .groupby("repeat")
        .score.mean()
    )

    Mrf_rem_Xhat = (
        df_long_non_TaCo.query(
            "(confounds_used == @confounds_used) & "
            '(model_name == "Random Forest") &'
            '(confound == "X hat")'
        )
        .groupby("repeat")
        .score.mean()
    )

    Mrf_no_rem_Xhat = (
        df_long_non_TaCo.query(
            "(confounds_used == @confounds_used) & "
            '(model_name == "Random Forest") &'
            '(confound == "X hat")'
        )
        .groupby("repeat")
        .score.mean()
    )

    Mrf_rem_shuff = (
        df_long_shuffled_non_TaCo.query(
            "(confounds_used == @confounds_used) & "
            '(model_name == "Random Forest") &'
            '(confound == "removed")'
        )
        .groupby("repeat")
        .score.mean()
    )

    Mrf_no_rem_shuff = (
        df_long_shuffled_non_TaCo.query(
            "(confounds_used == @confounds_used) & "
            '(model_name == "Random Forest") &'
            '(confound == "not removed")'
        )
        .groupby("repeat")
        .score.mean()
    )
    print("Confound: ", confounds_used)
    print("Raw:")
    print(f"No removal M={Mrf_no_rem.mean():.2}, sd={Mrf_no_rem.std():.2}")
    print(f"After CR M={Mrf_rem.mean():.2}, sd={Mrf_rem.std():.2}")

    print("Xhat:")
    print(f"No removal M={Mrf_no_rem_Xhat.mean():.2}, sd={Mrf_no_rem_Xhat.std():.2}")
    print(f"After CR M={Mrf_rem_Xhat.mean():.2}, sd={Mrf_rem_Xhat.std():.2}")

    print("Shuffled:")

    print(f"No removal M={Mrf_no_rem_shuff.mean():.2}, sd={Mrf_no_rem_shuff.std():.2}")
    print(f"After CR M={Mrf_rem_shuff.mean():.2}, sd={Mrf_rem_shuff.std():.2}")

    print("_" * 40)
    print()

    save_paper_val(
        base_save_paper,
        "real_world",
        "raw",
        f"mean_audio_conf_{confounds_used}.txt",
        Mrf_no_rem.mean(),
    )

    save_paper_val(
        base_save_paper,
        "real_world",
        "raw",
        f"std_audio_conf_{confounds_used}.txt",
        Mrf_no_rem.std(),
    )

    save_paper_val(
        base_save_paper,
        "real_world",
        "removed",
        f"mean_audio_conf_{confounds_used}.txt",
        Mrf_rem.mean(),
    )

    save_paper_val(
        base_save_paper,
        "real_world",
        "removed",
        f"std_audio_conf_{confounds_used}.txt",
        Mrf_rem.std(),
    )

    # X hat

    save_paper_val(
        base_save_paper,
        "real_world",
        "Xhat",
        f"mean_audio_conf_{confounds_used}.txt",
        Mrf_no_rem_Xhat.mean(),
    )

    save_paper_val(
        base_save_paper,
        "real_world",
        "Xhat",
        f"std_audio_conf_{confounds_used}.txt",
        Mrf_no_rem_Xhat.std(),
    )

    # Shuffled
    save_paper_val(
        base_save_paper,
        "real_world",
        "raw",
        f"mean_audio_conf_{confounds_used}_shuffled.txt",
        Mrf_no_rem_shuff.mean(),
    )

    save_paper_val(
        base_save_paper,
        "real_world",
        "raw",
        f"std_audio_conf_{confounds_used}_shuffled.txt",
        Mrf_no_rem_shuff.std(),
    )

    save_paper_val(
        base_save_paper,
        "real_world",
        "removed",
        f"mean_audio_conf_{confounds_used}_shuffled.txt",
        Mrf_rem_shuff.mean(),
    )

    save_paper_val(
        base_save_paper,
        "real_world",
        "removed",
        f"std_audio_conf_{confounds_used}_shuffled.txt",
        Mrf_rem_shuff.std(),
    )
Performance using RF 
________________________________________

Confound:  Gender
Raw:
No removal M=0.74, sd=0.02
After CR M=0.7, sd=0.021
Xhat:
No removal M=0.64, sd=0.0014
After CR M=0.64, sd=0.0014
Shuffled:
No removal M=0.54, sd=0.039
After CR M=0.6, sd=0.032
________________________________________

Confound:  Gender_Education_Age_BDI
Raw:
No removal M=0.74, sd=0.02
After CR M=0.86, sd=0.016
Xhat:
No removal M=0.88, sd=0.016
After CR M=0.88, sd=0.016
Shuffled:
No removal M=0.54, sd=0.039
After CR M=0.85, sd=0.019
________________________________________

Confound:  Education
Raw:
No removal M=0.74, sd=0.02
After CR M=0.74, sd=0.026
Xhat:
No removal M=0.58, sd=0.031
After CR M=0.58, sd=0.031
Shuffled:
No removal M=0.54, sd=0.039
After CR M=0.55, sd=0.042
________________________________________

Confound:  Age
Raw:
No removal M=0.74, sd=0.02
After CR M=0.73, sd=0.022
Xhat:
No removal M=0.56, sd=0.029
After CR M=0.56, sd=0.029
Shuffled:
No removal M=0.54, sd=0.039
After CR M=0.57, sd=0.034
________________________________________

Confound:  BDI
Raw:
No removal M=0.74, sd=0.02
After CR M=0.91, sd=0.011
Xhat:
No removal M=0.84, sd=0.013
After CR M=0.84, sd=0.013
Shuffled:
No removal M=0.54, sd=0.039
After CR M=0.84, sd=0.013
________________________________________

Plotting Relation of Confounds used to no removal v removal

df_bar_plot = (
    pd.concat(
        [
            df_long_non_TaCo.copy().query("model_name == 'Random Forest'"),
            (
                df_long_TaCo.assign(confounds_used="TaCo")
                .query("model_name == 'Random Forest'")
                .query("data == 'audio_data'")
                .reset_index(drop=True)
            ),
        ]
    )
    .dropna()
    .reset_index(drop=True)
)


df_bar_plot = df_bar_plot.query("confound in ['not removed', 'removed']")
fig, ax = plt.subplots()
custom_bar_rope_plot(
    x="confounds_used",
    y="score",
    hue="confound",
    ax=ax,
    hue_order=["not removed", "removed"],
    comparisons=(("not removed", "removed"),),
    comparisons_sing_y=(1.1,),
    cv_repeats="repeat",
    data=df_bar_plot,
    rope=0.05,
    order=["Gender", "Education", "Age", "BDI", "Gender_Education_Age_BDI", "TaCo"],
    palette=sns.color_palette([blue, red]),
    show_legend=False,
)
ax.set_xticklabels(
    [i.get_text().replace("_", " ") for i in ax.get_xticklabels()],
    rotation=15,
    ha="right",
)
ax.set_ylim(0, 1.1)
ax.set_ylabel("AUCROC")
ax.set_xlabel("Confounds Used")

handles = [
    mpl.patches.Patch(color=blue, label="$X$"),
    mpl.patches.Patch(color=red, label="$X_{CR}$"),
]

fig.legend(
    handles=handles,
    title="Original Features",
    loc="upper center",
    bbox_to_anchor=(0.5, 1.05),
    ncol=3,
)

fig.savefig("./saved_figures/all_confounds_audio_performance.svg")
fig.savefig("./saved_figures/all_confounds_audio_performance.png")
fig.savefig("./saved_figures/all_confounds_audio_performance.pdf")
glue("all_confounds_performance", fig)
../_images/03a_real_world_13_0.png ../_images/03a_real_world_13_1.png
df_bar_plot_shuffled = (
    pd.concat(
        [
            df_long_shuffled_non_TaCo.copy().query("model_name == 'Random Forest'"),
            (
                df_long_shuffled_TaCo.assign(confounds_used="TaCo")
                .query("model_name == 'Random Forest'")
                .query("data == 'audio_data'")
                .reset_index(drop=True)
            ),
        ]
    )
    .dropna()
    .reset_index(drop=True)
)


fig, ax = plt.subplots()
df_bar_plot_shuffled = df_bar_plot_shuffled.query(
    "confound in ['not removed', 'removed']"
)
custom_bar_rope_plot(
    x="confounds_used",
    y="score",
    hue="confound",
    ax=ax,
    hue_order=["not removed", "removed"],
    comparisons=(("not removed", "removed"),),
    comparisons_sing_y=(1.1,),
    cv_repeats="repeat",
    data=df_bar_plot_shuffled,
    rope=0.05,
    order=["Gender", "Education", "Age", "BDI", "Gender_Education_Age_BDI", "TaCo"],
    palette=sns.color_palette([blue, red]),
    show_legend=False,
)
ax.set_xticklabels(
    [i.get_text().replace("_", " ") for i in ax.get_xticklabels()],
    rotation=15,
    ha="right",
)
ax.set_ylim(0, 1.1)
ax.set_xlabel("Confounds Used")

fig.legend(
    handles=handles,
    title="Shuffled Features",
    loc="upper center",
    bbox_to_anchor=(0.5, 1.05),
    ncol=3,
)

fig.savefig("./saved_figures/all_shuffled_confounds_audio_performance.svg")
fig.savefig("./saved_figures/all_shuffled_confounds_audio_performance.png")
fig.savefig("./saved_figures/all_shuffled_confounds_audio_performance.pdf")
glue("all_shuffled_confounds_performance", fig)
../_images/03a_real_world_14_0.png ../_images/03a_real_world_14_1.png
../_images/03a_real_world_13_0.png
../_images/03a_real_world_14_0.png