Source code for yellowbrick.classifier.class_prediction_error

# yellowbrick.classifier.class_prediction_error
# Shows the balance of classes and their associated predictions.
#
# Author:   Larry Gray
# Author:   Benjamin Bengfort
# Created:  Fri Jul 20 10:26:25 2018 -0400
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: class_prediction_error.py [] lwgray@gmail.com $

"""
Shows the balance of classes and their associated predictions.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from sklearn.utils.multiclass import unique_labels

from yellowbrick.draw import bar_stack
from yellowbrick.classifier.base import ClassificationScoreVisualizer
from yellowbrick.exceptions import ModelError, YellowbrickValueError, NotFitted

try:
    # See #1124: this allows compatibility for scikit-learn >= 0.20
    from sklearn.metrics._classification import _check_targets
except ImportError:
    from sklearn.metrics.classification import _check_targets


##########################################################################
## Class Prediction Error Chart
##########################################################################


[docs]class ClassPredictionError(ClassificationScoreVisualizer):
    """
    Class Prediction Error chart that shows the support for each class in the
    fitted classification model displayed as a stacked bar. Each bar is segmented
    to show the distribution of predicted classes for each class. It is initialized
    with a fitted model and generates a class prediction error chart on draw.

    Parameters
    ----------
    estimator : estimator
        A scikit-learn estimator that should be a classifier. If the model is
        not a classifier, an exception is raised. If the internal model is not
        fitted, it is fit when the visualizer is fitted, unless otherwise specified
        by ``is_fitted``.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If not specified the current axes will be
        used (or generated if required).

    classes : list of str, defult: None
        The class labels to use for the legend ordered by the index of the sorted
        classes discovered in the ``fit()`` method. Specifying classes in this
        manner is used to change the class names to a more specific format or
        to label encoded integer classes. Some visualizers may also use this
        field to filter the visualization for specific classes. For more advanced
        usage specify an encoder rather than class labels.

    encoder : dict or LabelEncoder, default: None
        A mapping of classes to human readable labels. Often there is a mismatch
        between desired class labels and those contained in the target variable
        passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch
        ensuring that classes are labeled correctly in the visualization.

    is_fitted : bool or str, default="auto"
        Specify if the wrapped estimator is already fitted. If False, the estimator
        will be fit when the visualizer is fit, otherwise, the estimator will not be
        modified. If "auto" (default), a helper method will check if the estimator
        is fitted before fitting it again.

    force_model : bool, default: False
        Do not check to ensure that the underlying estimator is a classifier. This
        will prevent an exception when the visualizer is initialized but may result
        in unexpected or unintended behavior.

    kwargs : dict
        Keyword arguments passed to the visualizer base classes.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The class labels observed while fitting.

    class_count_ : ndarray of shape (n_classes,)
        Number of samples encountered for each class during fitting.

    score_ : float
        An evaluation metric of the classifier on test data produced when
        ``score()`` is called. This metric is between 0 and 1 -- higher scores are
        generally better. For classifiers, this score is usually accuracy, but
        ensure you check the underlying model for more details about the score.

    predictions_ : ndarray
        An ndarray of predictions whose rows are the true classes and
        whose columns are the predicted classes
    """

    def __init__(
        self,
        estimator,
        ax=None,
        classes=None,
        encoder=None,
        is_fitted="auto",
        force_model=False,
        **kwargs
    ):
        super(ClassPredictionError, self).__init__(
            estimator,
            ax=ax,
            classes=classes,
            encoder=encoder,
            is_fitted=is_fitted,
            force_model=force_model,
            **kwargs
        )

[docs]    def score(self, X, y):
        """
        Generates a 2D array where each row is the count of the
        predicted classes and each column is the true class

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        score_ : float
            Global accuracy score
        """
        # Must be computed before calling super
        # We're relying on predict to raise NotFitted
        y_pred = self.predict(X)
        y_type, y_true, y_pred = _check_targets(y, y_pred)
        if y_type not in ("binary", "multiclass"):
            raise YellowbrickValueError("{} is not supported".format(y_type))

        # Get the indices of the unique labels
        indices = unique_labels(y_true, y_pred)
        labels = self._labels()

        # Call super to compute self.score_ and verify classes
        try:
            super(ClassPredictionError, self).score(X, y)
        except ModelError as e:
            # raise visualizer-specific errors
            if labels is not None and len(labels) < len(indices):
                raise NotImplementedError(
                    "filtering classes is currently not supported"
                )
            else:
                raise e

        # Ensure all labels are used
        if labels is not None and len(labels) > len(indices):
            raise ModelError(
                "y and y_pred contain zero values for one of the specified classes"
            )

        # Create a table of predictions whose rows are the true classes
        # and whose columns are the predicted classes; each element
        # is the count of predictions for that class that match the true
        # value of that class.
        self.predictions_ = np.array(
            [
                [(y_pred[y == label_t] == label_p).sum() for label_p in indices]
                for label_t in indices
            ]
        )

        self.draw()
        return self.score_

[docs]    def draw(self):
        """
        Renders the class prediction error across the axis.

        Returns
        -------
        ax : Matplotlib Axes
            The axes on which the figure is plotted
        """

        if not hasattr(self, "predictions_") or not hasattr(self, "classes_"):
            raise NotFitted.from_estimator(self, "draw")

        legend_kws = {"bbox_to_anchor": (1.04, 0.5), "loc": "center left"}
        bar_stack(
            self.predictions_,
            self.ax,
            labels=list(self.classes_),
            ticks=self.classes_,
            colors=self.class_colors_,
            legend_kws=legend_kws,
        )
        return self.ax

[docs]    def finalize(self, **kwargs):
        """
        Adds a title and axis labels to the visualizer, ensuring that the
        y limits zoom the visualization in to the area of interest. Finalize
        also calls tight layout to ensure that no parts of the figure are
        cut off.

        Notes
        -----
        Generally this method is called from show and not directly by the user.
        """

        # Set the title
        self.set_title("Class Prediction Error for {}".format(self.name))

        # Set the axes labels
        self.ax.set_xlabel("actual class")
        self.ax.set_ylabel("number of predicted class")

        # Compute the ceiling for the y limit
        cmax = max([sum(predictions) for predictions in self.predictions_])
        self.ax.set_ylim(0, cmax + cmax * 0.1)

        # Ensure the legend fits on the figure
        self.fig.tight_layout(rect=[0, 0, 0.90, 1])


##########################################################################
## Quick Method
##########################################################################


[docs]def class_prediction_error(
    estimator,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    ax=None,
    classes=None,
    encoder=None,
    is_fitted="auto",
    force_model=False,
    show=True,
    **kwargs
):
    """Class Prediction Error

    Divides the dataset X and y into train and test splits, fits the model on the train
    split, then scores the model on the test split. The visualizer displays the support
    for each class in the fitted classification model displayed as a stacked bar plot.
    Each bar is segmented to show the distribution of predicted classes for each class.

    Parameters
    ----------
    estimator : estimator
        A scikit-learn estimator that should be a classifier. If the model is
        not a classifier, an exception is raised. If the internal model is not
        fitted, it is fit when the visualizer is fitted, unless otherwise specified
        by ``is_fitted``.

    X_train : ndarray or DataFrame of shape n x m
        A feature array of n instances with m features the model is trained on.
        Used to fit the visualizer and also to score the visualizer if test splits are
        not directly specified.

    y_train : ndarray or Series of length n
        An array or series of target or class values. Used to fit the visualizer and
        also to score the visualizer if test splits are not specified.

    X_test : ndarray or DataFrame of shape n x m, default: None
        An optional feature array of n instances with m features that the model
        is scored on if specified, using X_train as the training data.

    y_test : ndarray or Series of length n, default: None
        An optional array or series of target or class values that serve as actual
        labels for X_test for scoring purposes.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If not specified the current axes will be
        used (or generated if required).

    classes : list of str, defult: None
        The class labels to use for the legend ordered by the index of the sorted
        classes discovered in the ``fit()`` method. Specifying classes in this
        manner is used to change the class names to a more specific format or
        to label encoded integer classes. Some visualizers may also use this
        field to filter the visualization for specific classes. For more advanced
        usage specify an encoder rather than class labels.

    encoder : dict or LabelEncoder, default: None
        A mapping of classes to human readable labels. Often there is a mismatch
        between desired class labels and those contained in the target variable
        passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch
        ensuring that classes are labeled correctly in the visualization.

    is_fitted : bool or str, default="auto"
        Specify if the wrapped estimator is already fitted. If False, the estimator
        will be fit when the visualizer is fit, otherwise, the estimator will not be
        modified. If "auto" (default), a helper method will check if the estimator
        is fitted before fitting it again.

    force_model : bool, default: False
        Do not check to ensure that the underlying estimator is a classifier. This
        will prevent an exception when the visualizer is initialized but may result
        in unexpected or unintended behavior.

    show: bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()`` however
        you cannot call ``plt.savefig`` from this signature, nor
        ``clear_figure``. If False, simply calls ``finalize()``

    kwargs: dict
        Keyword arguments passed to the visualizer base classes.

    Returns
    -------
    viz : ClassPredictionError
        Returns the fitted, finalized visualizer
    """
    # Instantiate the visualizer
    viz = ClassPredictionError(
        estimator=estimator,
        ax=ax,
        classes=classes,
        encoder=encoder,
        is_fitted=is_fitted,
        force_model=force_model,
        **kwargs
    )

    # Fit the visualizer (calls draw)
    viz.fit(X_train, y_train, **kwargs)

    # Score the visualizer
    if X_test is not None and y_test is not None:
        viz.score(X_test, y_test)
    elif X_test is not None or y_test is not None:
        raise YellowbrickValueError("must specify both X_test and y_test or neither")
    else:
        viz.score(X_train, y_train)

    # Draw the final visualization
    if show:
        viz.show()
    else:
        viz.finalize()

    # Return the visualizer
    return viz