Source code for yellowbrick.contrib.missing.dispersion

# yellowbrick.contrib.missing.dispersion
# Missing Values Dispersion Visualizer
#
# Author:  Nathan Danielsen
# Created: Fri Mar 29 5:17:36 2018 -0500
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: dispersion.py [1443e16] ndanielsen@users.noreply.github.com $

"""
Dispersion visualizer for locations of missing values by column against index position.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from yellowbrick.style.palettes import color_palette
from .base import MissingDataVisualizer


##########################################################################
## MissingValues Visualizer
##########################################################################


class MissingValuesDispersion(MissingDataVisualizer):
    """
    The Missing Values Dispersion visualizer shows the locations of missing (nan)
    values in the feature dataset by the order of the index.

    When y targets are supplied to fit, the output dispersion plot is color
    coded according to the target y that the element refers to.

    Parameters
    ----------
    alpha : float, default: 0.5
        A value for blending elements with the background.

    marker : matplotlib marker, default: |
        The marker used for each element coordinate in the plot

    classes : list, default: None
        A list of class names for the legend.
        If classes is None and a y value is passed to fit then the legend
        labels are the distinct values found in the target vector.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers. A ``colors`` entry,
        if present, is also used to build the palette for the target classes.

    Attributes
    ----------
    features_ : np.array
        The feature labels used for the y-axis ticks. This attribute is not
        assigned in this class; it is presumably set by the base class
        during fit -- confirm against ``MissingDataVisualizer``.

    classes_ : np.array or None
        The class labels supplied via ``classes`` (None when not supplied).

    Examples
    --------

    >>> from yellowbrick.contrib.missing import MissingValuesDispersion
    >>> visualizer = MissingValuesDispersion()
    >>> visualizer.fit(X, y=y)
    >>> visualizer.show()
    """

    def __init__(self, alpha=0.5, marker="|", classes=None, **kwargs):
        # Default the target type unless the caller chose one explicitly.
        kwargs.setdefault("target_type", "single")
        super(MissingValuesDispersion, self).__init__(**kwargs)
        self.alpha = alpha
        self.marker = marker

        # Convert to array if necessary to match estimator.classes_ and size
        # the palette to the number of named classes when they are known.
        if classes is None:
            self.classes_ = None
            n_colors = None
        else:
            self.classes_ = np.array(classes)
            n_colors = len(self.classes_)

        # NOTE: "colors" is read from kwargs only after kwargs has already
        # been forwarded to the base class constructor above.
        self.colors = color_palette(kwargs.pop("colors", None), n_colors)

    @staticmethod
    def _split_locations(locations):
        """Split (row, column) coordinate pairs into two coordinate arrays.

        An empty set of locations yields two empty arrays instead of failing,
        so data (or a single class) without missing values can be drawn.
        """
        coords = np.asarray(locations)
        if coords.size == 0:
            empty = np.empty(0, dtype=int)
            return empty, empty
        return coords[:, 0], coords[:, 1]

    def get_nan_locs(self, **kwargs):
        """Gets the locations of nans in feature data and returns
        the coordinates in the matrix

        For string-typed data an empty string is treated as the missing
        value; any other data is cast to float and checked with ``np.isnan``.

        Returns
        -------
        nan_locs : np.ndarray or list of (target_value, np.ndarray)
            When ``self.y`` is None, the ``np.argwhere`` coordinates of the
            missing cells. Otherwise one ``(target_value, coordinates)`` pair
            per distinct target value, in ``np.unique`` order.
        """
        # dtype.kind is used instead of np.string_ / np.unicode_, which were
        # removed in NumPy 2.0 ("S" = byte strings, "U" = unicode strings).
        kind = self.X.dtype.kind
        if kind in ("S", "U"):
            # Compare against a literal of the matching string type.
            empty = b"" if kind == "S" else ""
            nan_matrix = np.where(self.X == empty, np.nan, 0.0)
        else:
            nan_matrix = self.X.astype(float)

        if self.y is None:
            return np.argwhere(np.isnan(nan_matrix))

        # Coerce to an array so the equality test below is elementwise even
        # when y was supplied as a plain Python sequence.
        targets = np.asarray(self.y)

        # NOTE(review): the row coordinates returned per target value are
        # positions within that target's subset of rows, not positions in
        # the full dataset -- confirm this is the intended x-axis meaning.
        nan_locs = []
        for target_value in np.unique(targets):
            rows = np.flatnonzero(targets == target_value)
            nan_target_locs = np.argwhere(np.isnan(nan_matrix[rows]))
            nan_locs.append((target_value, nan_target_locs))

        return nan_locs

    def draw(self, X, y, **kwargs):
        """Called from the fit method, this method creates a scatter plot that
        draws each instance as a class or target colored point, whose location
        is determined by the feature data set.

        If y is not None, then it draws a scatter plot where each class is in a
        different color.
        """
        nan_locs = self.get_nan_locs()
        if y is None:
            x_, y_ = self._split_locations(nan_locs)
            self.ax.scatter(x_, y_, alpha=self.alpha, marker=self.marker, label=None)
        else:
            self.draw_multi_dispersion_chart(nan_locs)

    def draw_multi_dispersion_chart(self, nan_locs):
        """Draws a multi dimensional dispersion chart, each color corresponds
        to a different target variable.

        Parameters
        ----------
        nan_locs : list of (target_value, coordinates)
            The per-target locations as produced by ``get_nan_locs``.
        """
        for index, (label, nan_locations) in enumerate(nan_locs):
            # if class names were passed in then, label as such
            if self.classes_ is not None:
                label = self.classes_[index]

            # Cycle through the palette so that having more distinct targets
            # than palette entries does not raise an IndexError.
            color = self.colors[index % len(self.colors)]

            x_, y_ = self._split_locations(nan_locations)
            self.ax.scatter(
                x_, y_, alpha=self.alpha, marker=self.marker, color=color, label=label
            )

    def finalize(self, **kwargs):
        """
        Sets the title and x-axis label and adds a legend. Also ensures that
        the y tick labels are set to the feature names.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        Notes
        -----
        Generally this method is called from show and not directly by the user.
        """
        # Set the title
        self.set_title("Dispersion of Missing Values by Feature")
        # one y tick per feature
        tick_locations = np.arange(len(self.features_))

        self.ax.set_xlabel("Position by index")
        self.ax.set_yticks(tick_locations)
        self.ax.set_yticklabels(self.get_feature_names())
        # Anchor the legend just outside the top-right corner of the axes.
        self.ax.legend(loc="upper left", prop={"size": 5}, bbox_to_anchor=(1, 1))


##########################################################################
## Quick Method
##########################################################################


def missing_dispersion(
    X, y=None, ax=None, classes=None, alpha=0.5, marker="|", **kwargs
):
    """
    Quick method: build a MissingValuesDispersion visualizer, fit it on the
    given data, show it, and return the axes it drew on.

    The visualizer shows the locations of missing (nan) values in the feature
    dataset by the order of the index. When y targets are supplied, the
    output dispersion plot is color coded according to the target y that the
    element refers to.

    Parameters
    ----------
    X : array-like
        The feature data, handed unchanged to the visualizer's ``fit``.

    y : array-like, default: None
        Optional target values, handed unchanged to the visualizer's ``fit``.

    ax : matplotlib Axes, default: None
        Forwarded to the visualizer constructor as ``ax``.

    classes : list, default: None
        A list of class names for the legend.

    alpha : float, default: 0.5
        A value for blending elements with the background.

    marker : matplotlib marker, default: |
        The marker used for each element coordinate in the plot

    kwargs : dict
        Additional keyword arguments forwarded to the visualizer constructor.

    Returns
    -------
    ax : matplotlib Axes
        The ``ax`` attribute of the fitted visualizer.

    Examples
    --------

    >>> from yellowbrick.contrib.missing import missing_dispersion
    >>> ax = missing_dispersion(X, y=y)

    """
    # Build the visualizer from the explicit options plus any extras
    viz = MissingValuesDispersion(
        alpha=alpha, marker=marker, classes=classes, ax=ax, **kwargs
    )

    # Fitting draws the chart; show then renders it
    viz.fit(X, y)
    viz.show()

    # Hand back the axes the chart was drawn on
    return viz.ax