Source code for adaptivesplit.sklearn_interface.learning_curve

# extends base.learning_curve with scikit-learn functionality
from ..base.learning_curve import *
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.base import is_classifier, is_regressor
from .utils import check_cv, check_scoring
from .resampling import SubSampleCV

"""
def lc_keymaker(estimator, X, y, ns, cv=5, cv_stat=np.mean, dummy_estimator=None,
                shuffle=-1, replacement=False, scoring=None, verbose=True, n_jobs=None, random_state=None,
                *args, **kwargs):
    # resolve Nones:
    if dummy_estimator is None:
        dummy_estimator = 'default'
    if scoring is None:
        scoring = 'default'
    if n_jobs is None:
        n_jobs = 'default'
    if random_state is None:
        random_state = np.random.normal()

    return str(estimator), str(X), str(y), str(ns), str(cv), str(
        cv_stat), dummy_estimator, shuffle, replacement, scoring, verbose, n_jobs, random_state
"""

# factory function for sklearn
# @cached(max_size=64, custom_key_maker=lc_keymaker)
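# Caching is currently disabled. One plausible wiring, if it were re-enabled,
# is sketched below; this is an assumption based on the commented-out decorator
# and the lc_keymaker sketched in the docstring above, and it presumes the
# third-party 'memoization' package, whose cached() decorator accepts
# max_size and custom_key_maker arguments:
#
#     from memoization import cached
#
#     @cached(max_size=64, custom_key_maker=lc_keymaker)
#     def calculate_learning_curve(...):
#         ...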
def calculate_learning_curve(estimator, X, y, sample_sizes, stratify=None, cv=5, cv_stat=np.mean,
                             dummy_estimator=None, num_samples=1, power_estimator=None,
                             scoring=None, verbose=True, n_jobs=None, random_state=None,
                             *args, **kwargs):
    """Calculate learning curves on training and test data.

    Also generates a learning curve for baseline performance using dummy estimators.

    Args:
        estimator (estimator object): Estimator object. An object of that type is
            instantiated for each grid point. It is assumed to implement the
            scikit-learn estimator interface. Either the estimator must provide a
            ``score`` function, or ``scoring`` must be passed. If it is e.g. a
            GridSearchCV, nested cross-validation is performed (recommended).
        X (numpy.ndarray or pandas.DataFrame): Array-like of shape
            (n_samples, n_features). The data to fit, as in scikit-learn.
        y (numpy.ndarray or pandas.Series): Array-like of shape (n_samples,) or
            (n_samples, n_outputs). The target variable to try to predict in the
            case of supervised learning, as in scikit-learn.
        sample_sizes (int or list of int): Sample sizes at which to evaluate the
            learning curve. If an int, that many evenly spaced sample sizes are
            generated between the number of CV splits and the full sample size.
        stratify (int): For classification tasks. If not None, use stratified
            sampling to account for class label imbalance. Defaults to None.
        cv (int, cross-validation generator or an iterable): Determines the
            cross-validation splitting strategy, as in scikit-learn. Possible
            inputs for cv are:

            - None, to use the default 5-fold cross-validation,
            - int, to specify the number of folds in a (Stratified)KFold,
            - a CV splitter,
            - an iterable yielding (train, test) splits as arrays of indices.

            For int/None inputs, if the estimator is a classifier and y is either
            binary or multiclass, StratifiedKFold is used. In all other cases,
            KFold is used. These splitters are instantiated with shuffle=False, so
            the splits will be the same across calls. Defaults to 5.
        cv_stat (callable): Function for aggregating the cross-validation-wise
            scores. Defaults to numpy.mean.
        dummy_estimator (estimator object): A scikit-learn-like dummy estimator to
            evaluate baseline performance. If None, either DummyClassifier() or
            DummyRegressor() is used, depending on the estimator's type.
        num_samples (int): Number of iterations to shuffle the data before
            determining subsamples. The first iteration (index 0) is ALWAYS
            unshuffled (num_samples=1 implies no resampling at all, the default).
        power_estimator (callable): Must be a power estimator function; see the
            'create_power_estimator*' factory functions. If None, the power curve
            is not calculated. Defaults to None.
        scoring (str, callable, list, tuple or dict): Scikit-learn-like score to
            evaluate the performance of the cross-validated model on the test set.
            If scoring represents a single score, one can use:

            - a single string (see "The scoring parameter: defining model
              evaluation rules");
            - a callable (see "Defining your scoring strategy from metric
              functions") that returns a single value.

            If scoring represents multiple scores, one can use:

            - a list or tuple of unique strings;
            - a callable returning a dictionary where the keys are the metric
              names and the values are the metric scores;
            - a dictionary with metric names as keys and callables as values.

            If None, the estimator's score method is used. Defaults to None.
        verbose (bool): If not False, prints progress. Defaults to True.
        n_jobs (int): Number of jobs to run in parallel. Training the estimator
            and computing the score are parallelized over the cross-validation
            splits. None means 1 unless in a joblib.parallel_backend context;
            -1 means using all processors. Defaults to None.
        random_state (int): Controls the randomness of the resampling used when
            building subsamples (only relevant if num_samples > 1). Defaults to
            None.
        *args: Extra parameters passed to sklearn.model_selection.cross_validate.
        **kwargs: Extra keyword parameters passed to
            sklearn.model_selection.cross_validate.

    Returns:
        lc_train (adaptivesplit.base.learning_curve.LearningCurve): Learning curve
            calculated on training data.
        lc_test (adaptivesplit.base.learning_curve.LearningCurve): Learning curve
            calculated on test data.
        lc_dummy (adaptivesplit.base.learning_curve.LearningCurve): Learning curve
            calculated with the dummy estimator; estimates baseline performance.
        lc_power (adaptivesplit.base.learning_curve.LearningCurve): Power curve.
            Only returned if power_estimator is not None.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.to_numpy()
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = np.squeeze(y.to_numpy())  # squeezing avoids errors with some datasets
    else:
        y = np.squeeze(y)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    if isinstance(sample_sizes, (int, float)):
        # generate an evenly spaced grid of sample sizes, from the number of CV
        # splits up to the full sample size
        inc = (len(y) - cv.get_n_splits()) / sample_sizes
        sample_sizes = np.arange(start=cv.get_n_splits(), stop=len(y) + inc, step=inc)

    if scoring is None:
        scoring = check_scoring(estimator)

    if dummy_estimator is None:
        if is_classifier(estimator):
            dummy_estimator = DummyClassifier()  # strategy='stratified'?
        elif is_regressor(estimator):
            dummy_estimator = DummyRegressor()
        else:
            raise RuntimeError("Estimator can only be classifier or regressor.")

    subsampler = SubSampleCV(estimator=estimator,
                             dummy_estimator=dummy_estimator,
                             sample_size=sample_sizes,
                             num_samples=num_samples,
                             cv=cv,
                             cv_stat=cv_stat,
                             power_estimator=power_estimator,
                             scoring=scoring,
                             verbose=verbose,
                             n_jobs=n_jobs)

    stats = subsampler.subsample(X, y, stratify=stratify, random_seed=random_state)

    # wrap the per-curve statistics in LearningCurve objects
    lc_train = LearningCurve(data=stats[0, :, :], ns=sample_sizes, scoring=scoring,
                             description={"shuffles": num_samples}, curve_type="train")
    lc_test = LearningCurve(data=stats[1, :, :], ns=sample_sizes, scoring=scoring,
                            description={"shuffles": num_samples}, curve_type="test")
    lc_dummy = LearningCurve(data=stats[2, :, :], ns=sample_sizes, scoring=scoring,
                             description={"shuffles": num_samples}, curve_type="dummy")

    if power_estimator is not None:
        lc_power = LearningCurve(data=stats[3, :, :], ns=sample_sizes, scoring=scoring,
                                 description={"shuffles": num_samples}, curve_type="power")
        return lc_train, lc_test, lc_dummy, lc_power

    return lc_train, lc_test, lc_dummy
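

# --- Illustrative usage (a sketch, not part of the original module) ---
# A minimal example of how calculate_learning_curve might be called. The Ridge
# estimator, the diabetes toy dataset and all parameter values below are
# assumptions chosen for illustration only.
if __name__ == "__main__":
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import Ridge

    X_demo, y_demo = load_diabetes(return_X_y=True)

    lc_train, lc_test, lc_dummy = calculate_learning_curve(
        estimator=Ridge(),
        X=X_demo,
        y=y_demo,
        sample_sizes=10,  # int: generates 10 evenly spaced sample sizes
        cv=5,
        scoring="neg_mean_squared_error",
        verbose=False,
        random_state=42,
    )
    # each returned LearningCurve holds one score per (shuffle, sample size) pair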