Plotting

Plotting functions

class sklearn_evaluation.plot.ClassificationReport(y_true, y_pred, *, target_names=None, sample_weight=None, zero_division=0, matrix=None, keys=None)

Classification report plot. Create instances with the from_raw_data class method; subtracting one report from another shows how two models differ, and adding two reports compares them (see the example below).

Examples

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

y_pred_rf = RandomForestClassifier().fit(X_train, y_train).predict(X_test)
y_pred_lr = LogisticRegression().fit(X_train, y_train).predict(X_test)

target_names = ["Not spam", "Spam"]

cr_rf = plot.ClassificationReport.from_raw_data(
    y_test, y_pred_rf, target_names=target_names
)
cr_lr = plot.ClassificationReport.from_raw_data(
    y_test, y_pred_lr, target_names=target_names
)

# display one of the classification reports
cr_rf

# how much better is the random forest?
cr_rf - cr_lr

# compare both reports
cr_rf + cr_lr
sklearn_evaluation.plot.calibration_curve(y_true, probabilities, clf_names=None, n_bins=10, cmap='nipy_spectral', ax=None)

Plots calibration curves for a set of classifier probability estimates. Calibration curves help determine whether you can interpret predicted probabilities as confidence levels. For example, if we take a well-calibrated classifier and look at the instances where the predicted score is 0.8, about 80% of those instances should belong to the positive class. This function only works for binary classifiers.

Parameters
  • y_true (array-like, shape = [n_samples] or list of array-like) – Ground truth (correct) target values. If passed a single array-like object, it assumes all the probabilities have the same shape as y_true. If passed a list, it expects y_true[i] to have the same size as probabilities[i]

  • probabilities (list of array-like, shape (n_samples, 2) or (n_samples,)) – A list containing the outputs of binary classifiers’ predict_proba() method or decision_function() method.

  • clf_names (list of str, optional) – A list of strings, where each string refers to the name of the classifier that produced the corresponding probability estimates in probabilities. If None, the names “Classifier 1”, “Classifier 2”, etc. will be used.

  • n_bins (int, optional, default=10) – Number of bins. A bigger number requires more data.

  • cmap (string or matplotlib.colors.Colormap instance, optional) – Colormap used for plotting the projection. See the Matplotlib colormap documentation for available options: https://matplotlib.org/users/colormaps.html

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn_evaluation import plot

X, y = make_classification(n_samples=20000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=0)

rf = RandomForestClassifier()
lr = LogisticRegression()
nb = GaussianNB()

rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)

probabilities = [rf_probas, lr_probas, nb_probas]

clf_names = [
    'Random Forest',
    'Logistic Regression',
    'Gaussian Naive Bayes',
]

plot.calibration_curve(y_test, probabilities, clf_names)
plt.show()
../_images/calibration_curve.png
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn_evaluation import plot


def make_dataset(n_samples):
    X, y = make_classification(n_samples=n_samples,
                               n_features=2,
                               n_informative=2,
                               n_redundant=0,
                               random_state=0)
    return train_test_split(X, y, test_size=0.33, random_state=0)


X_train, X_test, y_train, y_test1 = make_dataset(n_samples=1000)
probs1 = LogisticRegression().fit(X_train, y_train).predict_proba(X_test)

X_train, X_test, y_train, y_test2 = make_dataset(n_samples=10000)
probs2 = LogisticRegression().fit(X_train, y_train).predict_proba(X_test)

# to plot calibration curves for different sample sizes, pass
# a list with the true labels for each element in the probabilities
# argument
plot.calibration_curve([y_test1, y_test2], [probs1, probs2],
                       clf_names=['1k samples', '10k samples'])
plt.show()
../_images/calibration_curve_diff_sample_size.png
sklearn_evaluation.plot.classification_report(y_true, y_pred, *, target_names=None, sample_weight=None, zero_division=0, ax=None)

Classification report

Parameters
  • y_true (array-like, shape = [n_samples]) – Correct target values (ground truth)

  • y_pred (array-like, shape = [n_samples]) – Target predicted classes (estimator predictions)

  • target_names (list) – List containing the names of the target classes. List must be in order e.g. ['Label for class 0', 'Label for class 1']. If None, generic labels will be generated e.g. ['Class 0', 'Class 1']

  • sample_weight (array-like of shape (n_samples,), default=None) – Sample weights.

  • zero_division (0 or 1, default=0) – Sets the value to return when there is a zero division.

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_pred = est.predict(X_test)

plot.classification_report(y_test, y_pred, target_names=['Not spam', 'Spam'])
plt.show()
../_images/classification_report.png
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = datasets.make_classification(200,
                                    10,
                                    n_informative=5,
                                    class_sep=0.65,
                                    n_classes=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_pred = est.predict(X_test)

plot.classification_report(y_test, y_pred)
plt.show()
../_images/classification_report_multiclass.png
sklearn_evaluation.plot.confusion_matrix(y_true, y_pred, target_names=None, normalize=False, cmap=None, ax=None)

Plot confusion matrix.

Parameters
  • y_true (array-like, shape = [n_samples]) – Correct target values (ground truth).

  • y_pred (array-like, shape = [n_samples]) – Target predicted classes (estimator predictions).

  • target_names (list) – List containing the names of the target classes. List must be in order e.g. ['Label for class 0', 'Label for class 1']. If None, generic labels will be generated e.g. ['Class 0', 'Class 1']

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

  • normalize (bool) – Normalize the confusion matrix

  • cmap (matplotlib Colormap) – If None uses a modified version of matplotlib’s OrRd colormap.

Notes

http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_pred = est.predict(X_test)
y_true = y_test

plot.confusion_matrix(y_true, y_pred)
plt.show()
../_images/confusion_matrix.png
sklearn_evaluation.plot.elbow_curve(X, clf, n_clusters=None, n_jobs=1, show_cluster_time=True, ax=None)

Plots elbow curve of different values of K of a clustering algorithm.

Parameters
  • X (array-like, shape = [n_samples, n_features]) – Data to cluster, where n_samples is the number of samples and n_features is the number of features. Refer to https://numpy.org/doc/stable/glossary.html#term-array-like

  • clf – Clusterer instance that implements fit, fit_predict, and score methods, and an n_clusters hyperparameter, e.g. a sklearn.cluster.KMeans instance

  • n_clusters (None or list of int, optional) – List of n_clusters for which to plot the explained variances. Defaults to [1, 3, 5, 7, 9, 11].

  • n_jobs (int, optional) – Number of jobs to run in parallel. Defaults to 1.

  • show_cluster_time (bool, optional) – Include plot of time it took to cluster for a particular K.

  • ax (matplotlib.axes.Axes, optional) – The axes upon which to plot the curve. If None, the plot is drawn on the current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

from sklearn_evaluation import plot

X, _ = make_blobs(n_samples=100, centers=3, n_features=5, random_state=0)
kmeans = KMeans(random_state=1)

plot.elbow_curve(X, kmeans, n_clusters=range(1, 30))
plt.show()
../_images/elbow_curve.png
sklearn_evaluation.plot.elbow_curve_from_results(n_clusters, sum_of_squares, times, ax=None)

Same as elbow_curve, but it takes the number of clusters and sum of squares as inputs. Useful if you want to train the models yourself.
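
Examples

A minimal sketch of training the models yourself (assuming the expected sum of squares is KMeans's within-cluster sum of squares, inertia_):

from time import time

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from sklearn_evaluation import plot

X, _ = make_blobs(n_samples=100, centers=3, n_features=5, random_state=0)

n_clusters = list(range(1, 10))
sum_of_squares, times = [], []

for k in n_clusters:
    start = time()
    kmeans = KMeans(n_clusters=k, random_state=1).fit(X)
    times.append(time() - start)
    # inertia_ is the within-cluster sum of squares for the fitted model
    sum_of_squares.append(kmeans.inertia_)

plot.elbow_curve_from_results(n_clusters, sum_of_squares, times)
plt.show()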

sklearn_evaluation.plot.feature_importances(data, top_n=None, feature_names=None, orientation='horizontal', ax=None)

Get and order feature importances from a scikit-learn model or from an array-like structure. If data is a scikit-learn model with sub-estimators (e.g. RandomForest, AdaBoost) the function will compute the standard deviation of each feature.

Parameters
  • data (sklearn model or array-like structure) – Object to get the data from.

  • top_n (int) – Only get results for the top_n features.

  • feature_names (array-like) – Feature names

  • orientation (('horizontal', 'vertical')) – Bar plot orientation

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

"""
Feature importances plot
"""
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = datasets.make_classification(200, 20, n_informative=5, class_sep=0.65)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = RandomForestClassifier(n_estimators=1)
model.fit(X_train, y_train)

# plot all features
ax = plot.feature_importances(model)
plt.show()
../_images/feature_importances_00_00.png
# only top 5
plot.feature_importances(model, top_n=5)
plt.show()
../_images/feature_importances_01_00.png

sklearn_evaluation.plot.grid_search(cv_results, change, subset=None, kind='line', cmap=None, ax=None, sort=True)

Plot results from a sklearn grid search, varying at most two parameters at a time.

Parameters
  • cv_results (list of named tuples) – Results from a sklearn grid search (get them from the estimator's cv_results_ attribute)

  • change (str or iterable with len<=2) – Parameter to change

  • subset (dictionary-like) – parameter-value(s) pairs to subset the results with (e.g. {'n_estimators': [1, 10]}), if None all combinations will be used.

  • kind (['line', 'bar']) – This only applies when change is a single parameter. Changes the type of plot

  • cmap (matplotlib Colormap) – This only applies when change holds two parameters. Colormap used for the matrix. If None uses a modified version of matplotlib’s OrRd colormap.

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

  • sort (bool) – If True sorts the results in alphabetical order.

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

from sklearn_evaluation.plot import grid_search


iris = datasets.load_iris()


parameters = {
    'n_estimators': [1, 10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

est = RandomForestClassifier()
clf = GridSearchCV(est, parameters, cv=5)

X, y = datasets.make_classification(1000, 10, n_informative=5, class_sep=0.7)
clf.fit(X, y)

# changing numeric parameter without any restrictions
# in the rest of the parameter set
grid_search(clf.cv_results_, change='n_estimators')
plt.show()
../_images/grid_search_00_00.png
# you can also use bars
grid_search(clf.cv_results_, change='n_estimators', kind='bar')
plt.show()
../_images/grid_search_01_00.png
# changing a categorical variable without any constraints
grid_search(clf.cv_results_, change='criterion')
plt.show()
../_images/grid_search_02_00.png
# bar
grid_search(clf.cv_results_, change='criterion', kind='bar')
plt.show()
../_images/grid_search_03_00.png
# varying a numerical parameter but constraining
# the rest of the parameter set
grid_search(clf.cv_results_, change='n_estimators',
            subset={'max_features': 'sqrt', 'criterion': 'gini'},
            kind='bar')
plt.show()
../_images/grid_search_04_00.png
# same as above but letting max_features to have two values
grid_search(clf.cv_results_, change='n_estimators',
            subset={'max_features': ['sqrt', 'log2'], 'criterion': 'gini'},
            kind='bar')
plt.show()
../_images/grid_search_05_00.png
# varying two parameters - you can only show this as a
# matrix so the kind parameter will be ignored
grid_search(clf.cv_results_, change=('n_estimators', 'criterion'),
            subset={'max_features': 'sqrt'})
plt.show()
../_images/grid_search_06_00.png
sklearn_evaluation.plot.learning_curve(train_scores, test_scores, train_sizes, ax=None)

Plot a learning curve

Plot a metric vs number of examples for the training and test set

Parameters
  • train_scores (array-like) – Scores for the training set

  • test_scores (array-like) – Scores for the test set

  • train_sizes (array-like) – Relative or absolute numbers of training examples used to generate the learning curve

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

from sklearn.model_selection import learning_curve
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import numpy as np

from sklearn_evaluation import plot

digits = load_digits()
X, y = digits.data, digits.target

# Cross-validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a
# validation set.
cv = model_selection.ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
train_sizes = np.linspace(.1, 1.0, 5)
train_sizes, train_scores, test_scores = learning_curve(
    estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)
plot.learning_curve(train_scores, test_scores, train_sizes)
plt.show()
../_images/learning_curve_00_00.png
# SVC is more expensive, so we use a lower number of CV iterations:
cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
train_sizes = np.linspace(.1, 1.0, 5)
train_sizes, train_scores, test_scores = learning_curve(
    estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)

plot.learning_curve(train_scores, test_scores, train_sizes)
plt.show()
../_images/learning_curve_01_00.png
sklearn_evaluation.plot.metrics_at_thresholds(fn, y_true, y_score, n_thresholds=10, start=0.0, ax=None)

Plot metrics at increasing thresholds
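
Examples

A minimal sketch, assuming fn is a scikit-learn-style metric computed as fn(y_true, y_score >= threshold) at each threshold, and that y_score holds the positive-class scores:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = make_classification(200, 10, n_informative=5, class_sep=0.65)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier().fit(X_train, y_train)
# scores for the positive class
y_score = est.predict_proba(X_test)[:, 1]

plot.metrics_at_thresholds(precision_score, y_test, y_score)
plt.show()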

sklearn_evaluation.plot.precision_at_proportions(y_true, y_score, ax=None)

Plot precision values at different proportions.

Parameters
  • y_true (array-like) – Correct target values (ground truth).

  • y_score (array-like) – Target scores (estimator predictions).

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes
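
Examples

A minimal sketch following the other binary-classification examples on this page (assuming y_score holds the positive-class scores):

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = make_classification(200, 10, n_informative=5, class_sep=0.65)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier().fit(X_train, y_train)
# scores for the positive class
y_score = est.predict_proba(X_test)[:, 1]

plot.precision_at_proportions(y_test, y_score)
plt.show()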

sklearn_evaluation.plot.precision_recall(y_true, y_score, ax=None)

Plot precision-recall curve.

Parameters
  • y_true (array-like, shape = [n_samples]) – Correct target values (ground truth).

  • y_score (array-like, shape = [n_samples] or [n_samples, 2] for binary classification, or [n_samples, n_classes] for multiclass) – Target scores (estimator predictions).

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Notes

It is assumed that the y_score parameter columns are in order. For example, if y_true = [2, 2, 1, 0, 0, 1, 2], then the first column in y_score must contain the scores for class 0, second column for class 1 and so on.

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_score = est.predict_proba(X_test)
y_true = y_test

plot.precision_recall(y_true, y_score)
plt.show()
../_images/precision_recall.png
sklearn_evaluation.plot.prediction_error(y_true, y_pred, model=None, ax=None, plot_name='Prediction Error')

Plot a scatter plot of measured vs. predicted values, with an identity line and a best-fit line to show the prediction difference.

Parameters
  • y_true (array-like, shape = [n_samples]) – Measured target values (ground truth).

  • y_pred (array-like, shape = [n_samples]) – Predicted target values.

  • model (regressor instance, optional) – Regressor that implements fit, predict, and score methods and has a fit_intercept attribute, e.g. a sklearn.linear_model.LinearRegression instance. If not specified, a LinearRegression model is used.

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn_evaluation import plot
import matplotlib.pyplot as plt

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
y_true = y_test

plot.prediction_error(y_true, y_pred)
plt.show()
../_images/prediction_error.png
sklearn_evaluation.plot.residuals(y_true, y_pred, ax=None, plot_name='Residuals Plot')

Plot the residuals between measured and predicted values.

Parameters
  • y_true (array-like, shape = [n_samples]) – Measured target values (ground truth).

  • y_pred (array-like, shape = [n_samples]) – Predicted target values.

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn_evaluation import plot
import matplotlib.pyplot as plt

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
y_true = y_test

plot.residuals(y_true, y_pred)
plt.show()
../_images/residuals.png
sklearn_evaluation.plot.roc(y_true, y_score, ax=None)

Plot ROC curve.

Parameters
  • y_true (array-like, shape = [n_samples]) – Correct target values (ground truth).

  • y_score (array-like, shape = [n_samples] or [n_samples, 2] for binary classification, or [n_samples, n_classes] for multiclass) – Target scores (estimator predictions).

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Notes

It is assumed that the y_score parameter columns are in order. For example, if y_true = [2, 2, 1, 0, 0, 1, 2], then the first column in y_score must contain the scores for class 0, second column for class 1 and so on.

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

X, y = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_score = est.predict_proba(X_test)
y_true = y_test

plot.roc(y_true, y_score)
plt.show()
../_images/roc.png
sklearn_evaluation.plot.scores_distribution(y_scores, n_bins=5, ax=None)

Generate a histogram from a model's predictions

Parameters
  • y_scores (array-like, shape=(n_samples, )) – Scores produced by a trained model for a single class

  • n_bins (int, default=5) – Number of histogram bins

  • ax (matplotlib Axes, default=None) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn_evaluation import plot

X, y = make_classification(n_samples=10000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=0)

clf = LogisticRegression()

y_scores = clf.fit(X_train, y_train).predict_proba(X_test)

plot.scores_distribution(y_scores[:, 1], n_bins=10)
plt.show()
../_images/scores_distribution.png
sklearn_evaluation.plot.silhouette_analysis(X, clf, range_n_clusters=None, metric='euclidean', figsize=None, cmap='nipy_spectral', text_fontsize='medium', ax=None)

Plots silhouette analysis of the clusters provided.

Parameters
  • X (array-like, shape = [n_samples, n_features]) – Cluster data, where n_samples is the number of samples and n_features is the number of features. Refer to https://numpy.org/doc/stable/glossary.html#term-array-like

  • clf – Clusterer instance that implements fit, fit_predict, and score methods, and an n_clusters hyperparameter, e.g. a sklearn.cluster.KMeans instance

  • range_n_clusters (None or list of int, optional) – List of n_clusters for which to plot the silhouette scores. Defaults to [2, 3, 4, 5, 6].

  • metric (string or callable, optional) – The metric to use when calculating distance between instances in a feature array. If metric is a string, it must be one of the options allowed by sklearn.metrics.pairwise.pairwise_distances. If X is the distance array itself, use “precomputed” as the metric.

  • figsize (2-tuple, optional) – Tuple denoting figure size of the plot, e.g. (6, 6). Defaults to None.

  • cmap (string or matplotlib.colors.Colormap instance, optional) – Colormap used for plotting the projection. See the Matplotlib colormap documentation for available options: https://matplotlib.org/users/colormaps.html

  • text_fontsize (string or int, optional) – Matplotlib-style fontsizes. Use e.g. “small”, “medium”, “large” or integer values. Defaults to “medium”.

  • ax (matplotlib.axes.Axes, optional) – The axes upon which to plot the curve. If None, the plot is drawn on a new set of axes.

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

from sklearn_evaluation import plot

X, y = make_blobs(
    n_samples=500,
    n_features=2,
    centers=4,
    cluster_std=1,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=1,
)

kmeans = KMeans(random_state=10)
plot.silhouette_analysis(X, kmeans, range_n_clusters=[3])
plt.show()
../_images/silhouette_plot_basic.png

Notes

New in version 0.8.3.

sklearn_evaluation.plot.silhouette_analysis_from_results(X, cluster_labels, metric='euclidean', figsize=None, cmap='nipy_spectral', text_fontsize='medium', ax=None)

Same as silhouette_analysis, but takes cluster_labels as input. Useful if you want to train the model yourself.
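
Examples

A minimal sketch of training the model yourself: fit the clusterer, take the labels from fit_predict, and pass them along with the data.

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from sklearn_evaluation import plot

X, _ = make_blobs(
    n_samples=500,
    n_features=2,
    centers=4,
    cluster_std=1,
    random_state=1,
)

kmeans = KMeans(n_clusters=4, random_state=10)
cluster_labels = kmeans.fit_predict(X)

plot.silhouette_analysis_from_results(X, cluster_labels)
plt.show()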

Notes

New in version 0.8.3.

sklearn_evaluation.plot.target_analysis(y_train, y_test=None, labels=None, colors=None, ax=None)

Target analysis plot for visualising class imbalance.

There are two modes:

  1. Balance mode: if only y_train is specified

  2. Compare mode: if both train and test are specified

In balance mode, the bar chart is displayed with each class as its own color. In compare mode, a side-by-side bar chart is displayed colored by train or test respectively.

Parameters
  • y_train (array-like) – Array or list of shape (n,) that contains discrete data. Refer to https://numpy.org/doc/stable/glossary.html#term-array-like

  • y_test (array-like, optional) – Array or list of shape (m,) that contains discrete data. If specified, the bar chart will be drawn in compare mode. Refer to https://numpy.org/doc/stable/glossary.html#term-array-like

  • labels (list, optional) – A list of class names for the x-axis if the target is already encoded. Ensure that the labels are ordered lexicographically with respect to the values in the target. A common use case is to pass LabelEncoder.classes_ as this parameter. If not specified, the labels in the data will be used.

  • colors (list of strings) – Specify colors for the bar chart.

  • ax (matplotlib.axes.Axes, optional) – The axes upon which to plot the curve. If None, the plot is drawn on the current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from sklearn_evaluation import plot

import matplotlib.pyplot as plt

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
plot.target_analysis(y_train, y_test)
plt.show()
../_images/target_analysis.png

Notes

New in version 0.8.3.

sklearn_evaluation.plot.validation_curve(train_scores, test_scores, param_range, param_name=None, semilogx=False, ax=None)

Plot a validation curve

Plot a metric vs hyperparameter values for the training and test set

Parameters
  • train_scores (array-like) – Scores for the training set

  • test_scores (array-like) – Scores for the test set

  • param_range (array-like) – Hyperparameter values used to generate the curve

  • param_name (str, optional) – Hyperparameter name

  • semilogx (bool) – Sets a log scale on the x axis

  • ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes

Returns

ax – Axes containing the plot

Return type

matplotlib Axes

Examples

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve

from sklearn_evaluation import plot

digits = load_digits()
X, y = digits.data, digits.target

param_range = np.logspace(-6, -1, 5)
param_name = "gamma"
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name=param_name,
    param_range=param_range,
    cv=5,
    scoring="accuracy",
    n_jobs=1)

plot.validation_curve(train_scores, test_scores, param_range, param_name,
                      semilogx=True)
plt.show()
../_images/validation_curve_00_00.png
param_range = np.array([1, 10, 100])
param_name = "n_estimators"
train_scores, test_scores = validation_curve(
    RandomForestClassifier(), X, y,
    param_name=param_name,
    param_range=param_range,
    cv=10, scoring="accuracy", n_jobs=1)

plot.validation_curve(train_scores, test_scores, param_range, param_name,
                      semilogx=False)
plt.show()
../_images/validation_curve_01_00.png