Note
Click here to download the full example code
G-SMOTE validation curves
In this example the impact of Geometric SMOTE's hyperparameters is examined. The validation scores of classifiers trained on Geometric SMOTE resampled data are presented for different values of Geometric SMOTE's hyperparameters.
# Author: Georgios Douzas <gdouzas@icloud.com>
# Licence: MIT
import matplotlib.pyplot as plt
import numpy as np
from imblearn.metrics import geometric_mean_score
from imblearn.pipeline import make_pipeline
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier
from imblearn_extra.gsmote import GeometricSMOTE
# Seed shared by data generation, resampling and the classifiers so the
# whole example is reproducible.
RANDOM_STATE = 10
# Scorer built from imblearn's geometric_mean_score, a metric suited to
# evaluating classifiers on imbalanced data.
SCORER = make_scorer(geometric_mean_score)
def generate_imbalanced_data(weights, n_samples, n_features, n_informative):
    """Create a synthetic binary imbalanced classification dataset.

    Args:
        weights: Per-class proportions forwarded to ``make_classification``.
        n_samples: Total number of samples to generate.
        n_features: Total number of features.
        n_informative: Number of informative features.

    Returns:
        Tuple ``(X, y)`` of the feature matrix and binary target vector.
    """
    # make_classification already returns the (X, y) pair we need.
    return make_classification(
        n_classes=2,
        class_sep=2,
        weights=weights,
        n_informative=n_informative,
        n_redundant=1,
        flip_y=0,
        n_features=n_features,
        n_clusters_per_class=2,
        n_samples=n_samples,
        random_state=RANDOM_STATE,
    )
def generate_validation_curve_info(estimator, X, y, param_range, param_name, scoring):
    """Compute mean and standard deviation of cross-validated test scores.

    Runs a 3-fold ``validation_curve`` sweep over ``param_range`` and
    aggregates the test scores per hyperparameter value.

    Args:
        estimator: Estimator or pipeline to evaluate.
        X: Feature matrix.
        y: Target vector.
        param_range: Hyperparameter values to sweep.
        param_name: Hyperparameter name (pipeline ``step__param`` syntax).
        scoring: Scorer forwarded to ``validation_curve``.

    Returns:
        Tuple ``(test_scores_mean, test_scores_std, param_range)``.
    """
    # Only the test scores matter here; the train scores are discarded.
    test_scores = validation_curve(
        estimator,
        X,
        y,
        param_name=param_name,
        param_range=param_range,
        cv=3,
        scoring=scoring,
        n_jobs=-1,
    )[1]
    # Aggregate across the CV folds (axis 1).
    return test_scores.mean(axis=1), test_scores.std(axis=1), param_range
def plot_validation_curve(validation_curve_info, scoring_name, title):
    """Plot a validation curve with a +/- one standard deviation band.

    Args:
        validation_curve_info: Tuple ``(mean, std, param_range)`` as
            returned by ``generate_validation_curve_info``.
        scoring_name: Label for the y axis.
        title: Title of the figure.
    """
    mean_scores, std_scores, param_range = validation_curve_info
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # Mean curve with a shaded band of one standard deviation on each side.
    plt.plot(param_range, mean_scores)
    ax.fill_between(
        param_range,
        mean_scores + std_scores,
        mean_scores - std_scores,
        alpha=0.2,
    )
    # Highlight the best-scoring hyperparameter value.
    best = np.argmax(mean_scores)
    plt.scatter(param_range[best], mean_scores[best])
    plt.title(title)
    plt.ylabel(scoring_name)
    # Cosmetic styling: drop the top/right spines and offset the others.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ('left', 'bottom'):
        ax.spines[side].set_position(('outward', 10))
    plt.ylim([0.9, 1.0])
Low Imbalance Ratio or high Samples to Features Ratio
When :math:`\text{IR} = \frac{\text{\# majority samples}}{\text{\# minority samples}}`
(Imbalance Ratio) is low or :math:`\text{SFR} = \frac{\text{\# samples}}{\text{\# features}}`
(Samples to Features Ratio) is high, then the
minority selection strategy and higher absolute values of the deformation factor
dominate as optimal hyperparameters.
X, y = generate_imbalanced_data([0.3, 0.7], 2000, 6, 4)
# Pipeline: oversample with Geometric SMOTE, then fit a decision tree.
gsmote_dtc = make_pipeline(
    GeometricSMOTE(random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
)
scoring_name = 'Geometric Mean Score'
# Sweep each Geometric SMOTE hyperparameter and plot its validation curve.
for param_range, param_name, curve_title in (
    (range(1, 8), 'geometricsmote__k_neighbors', 'K Neighbors'),
    (np.linspace(-1.0, 1.0, 9), 'geometricsmote__truncation_factor', 'Truncation Factor'),
    (np.linspace(0.0, 1.0, 5), 'geometricsmote__deformation_factor', 'Deformation Factor'),
    (['minority', 'majority', 'combined'], 'geometricsmote__selection_strategy', 'Selection Strategy'),
):
    validation_curve_info = generate_validation_curve_info(
        gsmote_dtc,
        X,
        y,
        param_range,
        param_name,
        SCORER,
    )
    plot_validation_curve(validation_curve_info, scoring_name, curve_title)
Out:
/home/runner/work/imbalanced-learn-extra/imbalanced-learn-extra/.nox/docs/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=2102) is multi-threaded, use of fork() may lead to deadlocks in the child.
pid = os.fork()
/home/runner/work/imbalanced-learn-extra/imbalanced-learn-extra/.nox/docs/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=2102) is multi-threaded, use of fork() may lead to deadlocks in the child.
pid = os.fork()
High Imbalance Ratio or low Samples to Features Ratio
When :math:`\text{IR}` is high or :math:`\text{SFR}` is low, then the majority
or combined selection strategies and lower absolute values of the deformation factor
dominate as optimal hyperparameters.
X, y = generate_imbalanced_data([0.1, 0.9], 2000, 400, 200)
# Pipeline: oversample with Geometric SMOTE, then fit a logistic regression.
gsmote_lr = make_pipeline(
    GeometricSMOTE(random_state=RANDOM_STATE),
    LogisticRegression(random_state=RANDOM_STATE, solver='liblinear'),
)
scoring_name = 'Geometric Mean Score'
# Sweep each Geometric SMOTE hyperparameter and plot its validation curve.
for param_range, param_name, curve_title in (
    (range(1, 8), 'geometricsmote__k_neighbors', 'K Neighbors'),
    (np.linspace(-1.0, 1.0, 9), 'geometricsmote__truncation_factor', 'Truncation Factor'),
    (np.linspace(0.0, 1.0, 5), 'geometricsmote__deformation_factor', 'Deformation Factor'),
    (['minority', 'majority', 'combined'], 'geometricsmote__selection_strategy', 'Selection Strategy'),
):
    validation_curve_info = generate_validation_curve_info(
        gsmote_lr,
        X,
        y,
        param_range,
        param_name,
        SCORER,
    )
    plot_validation_curve(validation_curve_info, scoring_name, curve_title)
Total running time of the script: ( 0 minutes 13.275 seconds)
Download Python source code: plot_gsmote_validation_curves.py
Download Jupyter notebook: plot_gsmote_validation_curves.ipynb