Source code for mstc.learning.pipeline

"""Components for cross-validation and model evaluation."""
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline


def generate_cross_validation_pipeline(
    classifier, parameter_grid, folds=5, repeats=1, random_state=12345,
    number_of_jobs=1, scoring=None, refit=True
):
    """Generate a pipeline that evaluates a classifier with cross validation.

    Args:
        classifier (sklearn.base.ClassifierMixin): a classifier.
        parameter_grid (dict): grid of parameters to search.
        folds (int): number of stratified cross validation folds,
            defaults to 5.
        repeats (int): number of cross validation repeats, defaults to 1.
        random_state (int): random state, defaults to 12345.
        number_of_jobs (int): number of jobs to run in parallel,
            defaults to 1. -1 means using all processors.
        scoring (string, callable, list/tuple, dict or None): scoring
            function or functions to evaluate predictions on the test set.
            Defaults to None to use the classifier's default score method.
        refit (bool, string): whether to refit the best estimator. For
            multiple metric evaluation, this needs to be a string denoting
            the scorer used to find the best parameters for refitting the
            estimator at the end.

    Returns:
        sklearn.pipeline.Pipeline: a pipeline that drops zero-variance
        features, scales the remaining features to [0, 1] and runs a grid
        search with repeated stratified cross validation.
    """
    # ensure reproducibility in the classifier and log the seed via the grid
    parameter_grid['random_state'] = [random_state]
    # generate the pipeline
    return make_pipeline(
        VarianceThreshold(),
        MinMaxScaler(),
        GridSearchCV(
            classifier,
            param_grid=parameter_grid,
            cv=RepeatedStratifiedKFold(
                n_splits=folds, n_repeats=repeats, random_state=random_state
            ),
            refit=refit,
            n_jobs=number_of_jobs,
            scoring=scoring,
            return_train_score=True
        )
    )
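
A minimal usage sketch (not taken from the module source), assuming a classifier such as scikit-learn's LogisticRegression that accepts a random_state parameter, since the generated pipeline injects the seed into the parameter grid:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    from mstc.learning.pipeline import generate_cross_validation_pipeline

    # toy data for illustration only
    X, y = make_classification(n_samples=200, n_features=20, random_state=12345)

    pipeline = generate_cross_validation_pipeline(
        classifier=LogisticRegression(),
        parameter_grid={'C': [0.1, 1.0, 10.0]},
        folds=5,
        repeats=2,
        scoring='roc_auc',
        number_of_jobs=-1
    )
    pipeline.fit(X, y)

    # the fitted GridSearchCV is the last pipeline step; inspect its results
    grid_search = pipeline[-1]
    print(grid_search.best_params_)
    print(grid_search.best_score_)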