Analysis 3d: Real World Datasets Audio Data Grouped CR
Analysis 3d: Real World Datasets Audio Data Grouped CR
Imports, Functions and Paths
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from leakconfound.transformers import Shuffle
import os
import pandas as pd
import matplotlib.pyplot as plt
from julearn.transformers.confounds import DataFrameConfoundRemover
# plot styles
from sciplotlib import style
import matplotlib as mpl
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# Project-relative folders: repository root, results tree, and the
# destination for paper figures.
base_dir = "../../"
results_base = "../../results/permutations"
base_save_paper = "./paper_val/"

# Colour palette as hex strings; entry 6 carries an explicit alpha channel.
colors = [
    "#E64B35", "#4DBBD5", "#00A087", "#3C5488", "#F39B7F",
    "#8491B4", "#91D1C2FF", "#DC0000", "#7E6148", "#B09C85",
]

# Named shortcuts into the palette.
red, blue, green = colors[:3]
purple = colors[5]

# Seed NumPy's global random number generator so runs are repeatable.
np.random.seed(891236740)
def mm_to_inch(val_in_inch):
    """Convert a length from millimetres to inches.

    Parameters
    ----------
    val_in_inch : float
        Length in **millimetres**. The parameter name is misleading but is
        kept unchanged so existing keyword callers keep working.

    Returns
    -------
    float
        The same length expressed in inches (1 mm = 0.1 cm, 1 in = 2.54 cm).
    """
    # Conversion factor: inches per millimetre.
    return val_in_inch * (0.1 / 2.54)
# Folder roots. Assigning these names here overrides any value they were
# given earlier in the file.
base_dir = "../../"
results_base = "../../results/"
base_save_paper = "./paper_val/"

# Model output folders for the audio data: original features and
# shuffled features.
audio_data_non_TaCo_folder = (
    results_base + "basic_non_TaCo/realistic_non_TaCo/models/"
)
audio_data_shuffled_non_TaCo_folder = (
    results_base + "shuffled_features_non_TaCo/realistic_non_TaCo/models/"
)
# Start from the sciplotlib "nature-reviews" style sheet, then override
# individual rc settings (order matters: overrides must come after the style).
mpl.style.use(style.get_style("nature-reviews"))

# Same tick label size on both axes.
mpl.rc(("xtick", "ytick"), labelsize=11)
mpl.rc("axes", labelsize=12, titlesize=12)
mpl.rc("lines", linewidth=1, markersize=2)
mpl.rc("figure", dpi=300)
# NOTE(review): matplotlib documents wspace/hspace as fractions of the
# average axes size, not as lengths -- confirm that passing 8 mm converted
# to inches as wspace is intended.
mpl.rc("figure.subplot", wspace=mm_to_inch(8), hspace=0.7)

# Empty 183 mm x 140 mm figure (matplotlib expects the size in inches).
fig = plt.figure(figsize=(mm_to_inch(183), mm_to_inch(140)))
<Figure size 2161.42x1653.54 with 0 Axes>
Some info on the data:
# List the model folder; the result is not stored, but the call fails
# early if the folder does not exist.
os.listdir(audio_data_non_TaCo_folder)

df_original = pd.read_csv(
    "../../data/realistic_non_TaCo/audio_data/audio_data_BDI.csv"
)

# Column roles: every column ending in "__continuous" is a feature.
X = list(df_original.filter(regex=".*__continuous$").columns)
y = "ATT_Task__binary_target"
confounds = ["BDI__continuous_confound"]

# Repeated stratified CV splits with a fixed seed; only random_state is
# set, so the splitter's default number of folds and repeats applies.
# The split iterator is consumed by the loop that follows.
cv = RepeatedStratifiedKFold(random_state=2738).split(
    df_original[X], df_original[y]
)

# One score per CV split is appended to each of these lists.
scores_raw, scores_rm = [], []
scores_shuffled_raw, scores_shuffled_rm = [], []
# For every CV split, fit a random forest on four variants of the features
# and append its score on the held-out fold to the matching list:
#   scores_raw           - standardised original features
#   scores_rm            - original features after confound removal
#   scores_shuffled_raw  - standardised shuffled features
#   scores_shuffled_rm   - shuffled features after confound removal
for i_train, i_test in cv:
    # Positional train/test row selection for this split (as copies).
    df_i_train = df_original.iloc[i_train, :].copy()
    # Training rows whose target equals 0.
    df_i_train_0 = df_i_train.query(f"{y} == 0").copy()
    df_i_test = df_original.iloc[i_test, :].copy()
    # "X" frames hold features only, "Xc" frames hold features + confound.
    X_i_train = df_i_train.loc[:, X]
    Xc_i_train = df_i_train.loc[:, X + confounds]
    y_i_train = df_i_train.loc[:, y]
    Xc_i_train_0 = df_i_train_0.loc[:, X + confounds]
    # Not read again in the visible part of this loop.
    y_i_train_0 = df_i_train_0.loc[:, y]
    X_i_test = df_i_test.loc[:, X]
    Xc_i_test = df_i_test.loc[:, X + confounds]
    y_i_test = df_i_test.loc[:, y]
    # --- Raw features: scaler fitted on the training fold only.
    sc = StandardScaler().fit(X_i_train)
    X_sc_train = sc.transform(X_i_train)
    X_sc_test = sc.transform(X_i_test)
    rf_X = RandomForestClassifier(n_estimators=300, random_state=78).fit(
        X_sc_train, y_i_train
    )
    scores_raw.append(rf_X.score(X_sc_test, y_i_test))
    # --- Confound removal: the scaler is fitted on class-0 training rows
    # only, then applied to all training rows and to the test rows.
    sc_0 = StandardScaler().fit(Xc_i_train_0)
    # Despite the "_0" suffix these frames contain ALL training / test rows,
    # scaled with the class-0 statistics. Wrapping in pd.DataFrame restores
    # the column names and gives the frames a fresh default index.
    Xc_sc_train_0 = pd.DataFrame(sc_0.transform(Xc_i_train), columns=Xc_i_train.columns)
    Xc_sc_test_0 = pd.DataFrame(sc_0.transform(Xc_i_test), columns=Xc_i_train.columns)
    # NOTE(review): the remover is fitted on all (class-0-scaled) training
    # rows, not on class-0 rows only -- only the scaler is class-0 specific.
    # Confirm this matches the intended "grouped" confound removal.
    remover = DataFrameConfoundRemover(confounds_match=confounds[0]).fit(Xc_sc_train_0)
    Xc_rem_train = remover.transform(Xc_sc_train_0)
    Xc_rem_test = remover.transform(Xc_sc_test_0)
    # The transformed frame must have exactly one column per feature.
    assert len(Xc_rem_train.columns) == len(X)
    rf_X = RandomForestClassifier(n_estimators=300, random_state=78).fit(
        Xc_rem_train, y_i_train
    )
    scores_rm.append(rf_X.score(Xc_rem_test, y_i_test))
    # --- Shuffled features.
    # Raw:
    shuffler = Shuffle().fit(X_i_train)
    X_i_train_shuffled = shuffler.transform(X_i_train)
    # NOTE(review): X_i_test_shuffled and Xc_i_test_shuffled are built here
    # but never read below; both shuffled models are scored on the UNSHUFFLED
    # test features (X_i_test / Xc_i_test). Confirm this is intended. Do not
    # simply delete the call: Shuffle presumably draws from the global NumPy
    # RNG, so removing it could change the results of later splits.
    X_i_test_shuffled = shuffler.transform(X_i_test)
    # Re-attach the (unshuffled) confound column to the shuffled features.
    # Presumably relies on Shuffle keeping the original row index -- verify.
    Xc_i_train_shuffled = X_i_train_shuffled.copy()
    Xc_i_train_shuffled[confounds] = Xc_i_train[confounds]
    Xc_i_test_shuffled = X_i_test_shuffled.copy()
    Xc_i_test_shuffled[confounds] = Xc_i_test[confounds]
    # Class-0 subset of the shuffled training rows (used to fit sc_0 below).
    Xc_i_train_shuffled_0 = Xc_i_train_shuffled.loc[y_i_train == 0]
    # `sc` and `X_sc_test` are re-bound here, replacing the raw-feature ones.
    sc = StandardScaler().fit(X_i_train_shuffled)
    X_sc_train_shuffled = sc.transform(X_i_train_shuffled)
    X_sc_test = sc.transform(X_i_test)
    rf_X = RandomForestClassifier(n_estimators=300, random_state=78).fit(
        X_sc_train_shuffled, y_i_train
    )
    scores_shuffled_raw.append(rf_X.score(X_sc_test, y_i_test))
    # --- Confound removal on shuffled features: scaler fitted on class-0
    # shuffled training rows only, then applied to all training rows and to
    # the (unshuffled) test rows.
    sc_0 = StandardScaler().fit(Xc_i_train_shuffled_0)
    Xc_sc_train_shuffled_0 = pd.DataFrame(
        sc_0.transform(Xc_i_train_shuffled), columns=Xc_i_train_shuffled.columns
    )
    Xc_sc_test_0 = pd.DataFrame(
        sc_0.transform(Xc_i_test), columns=Xc_i_train_shuffled.columns
    )
    # As above, the remover itself is fitted on all training rows.
    remover = DataFrameConfoundRemover(confounds_match=confounds[0]).fit(
        Xc_sc_train_shuffled_0
    )
    Xc_rem_train_shuffled = remover.transform(Xc_sc_train_shuffled_0)
    Xc_rem_test = remover.transform(Xc_sc_test_0)
    # The transformed frame must have exactly one column per feature.
    assert len(Xc_rem_train_shuffled.columns) == len(X)
    rf_X = RandomForestClassifier(n_estimators=300, random_state=78).fit(
        Xc_rem_train_shuffled, y_i_train
    )
    scores_shuffled_rm.append(rf_X.score(Xc_rem_test, y_i_test))
# Collect the per-split scores in one table. The repeat/fold labels are
# hard-coded for 50 splits laid out as 10 repeats of 5 folds, repeat-major
# (assumed to match the cross-validation scheme that produced the scores).
df_scores = pd.DataFrame(
    {
        "raw": scores_raw,
        "rem": scores_rm,
        "shuffled_raw": scores_shuffled_raw,
        "shuffled_rem": scores_shuffled_rm,
        "repeats": np.arange(10).repeat(5),
        "folds": np.tile(np.arange(5), 10),
    }
)

# Average over the folds of each repeat, then summarise across repeats.
# The "folds" column is averaged as well, so it shows up in the summary as
# a constant 2.0 and carries no information.
df_grouped = df_scores.groupby("repeats").mean()
print(df_grouped.describe())
raw rem shuffled_raw shuffled_rem folds
count 10.000000 10.000000 10.000000 10.000000 10.0
mean 0.653169 0.827385 0.510246 0.786031 2.0
std 0.019319 0.018055 0.049074 0.021047 0.0
min 0.620308 0.801538 0.452308 0.762462 2.0
25% 0.642615 0.817615 0.485385 0.769769 2.0
50% 0.658308 0.826308 0.492923 0.778769 2.0
75% 0.666846 0.841538 0.513077 0.800000 2.0
max 0.675077 0.849846 0.619385 0.825538 2.0