Plotting
Plotting functions
- sklearn_evaluation.plot.confusion_matrix(y_true, y_pred, target_names=None, normalize=False, cmap=None, ax=None)
Plot confusion matrix.
- Parameters
y_true (array-like, shape = [n_samples]) – Correct target values (ground truth).
y_pred (array-like, shape = [n_samples]) – Target predicted classes (estimator predictions).
target_names (list) – List containing the names of the target classes. List must be in order, e.g. ['Label for class 0', 'Label for class 1']. If None, generic labels will be generated, e.g. ['Class 0', 'Class 1'].
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
normalize (bool) – Normalize the confusion matrix
cmap (matplotlib Colormap) – If None, uses a modified version of matplotlib’s OrRd colormap.
Notes
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
Examples
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn_evaluation import plot

data = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)
X = data[0]
y = data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
est = RandomForestClassifier()
est.fit(X_train, y_train)
y_pred = est.predict(X_test)
y_score = est.predict_proba(X_test)
y_true = y_test
plot.confusion_matrix(y_true, y_pred)
plt.show()
- sklearn_evaluation.plot.feature_importances(data, top_n=None, feature_names=None, orientation='horizontal', ax=None)
Get and order feature importances from a scikit-learn model or from an array-like structure. If data is a scikit-learn model with sub-estimators (e.g. RandomForest, AdaBoost) the function will compute the standard deviation of each feature.
- Parameters
data (sklearn model or array-like structure) – Object to get the data from.
top_n (int) – Only get results for the top_n features.
feature_names (array-like) – Feature names
orientation (('horizontal', 'vertical')) – Bar plot orientation
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
Examples
""" Feature importances plot """
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn_evaluation import plot

X, y = datasets.make_classification(200, 20, n_informative=5, class_sep=0.65)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = RandomForestClassifier(n_estimators=1)
model.fit(X_train, y_train)

# plot all features
ax = plot.feature_importances(model)
plt.show()

# only top 5
plot.feature_importances(model, top_n=5)
plt.show()
- sklearn_evaluation.plot.grid_search(cv_results_, change, subset=None, kind='line', cmap=None, ax=None)
Plot results from a sklearn grid search by changing two parameters at most.
- Parameters
cv_results_ (list of named tuples) – Results from a sklearn grid search (get them using the cv_results_ attribute of the fitted search object)
change (str or iterable with len<=2) – Parameter to change
subset (dictionary-like) – Parameter-value(s) pairs to subset from cv_results_ (e.g. {'n_estimators': [1, 10]}). If None, all combinations will be used.
kind (['line', 'bar']) – This only applies when change is a single parameter. Changes the type of plot.
cmap (matplotlib Colormap) – This only applies when change contains two parameters. Colormap used for the matrix. If None, uses a modified version of matplotlib’s OrRd colormap.
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
Examples
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn_evaluation.plot import grid_search

iris = datasets.load_iris()
parameters = {
    'n_estimators': [1, 10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}
est = RandomForestClassifier()
clf = GridSearchCV(est, parameters, cv=5)
X, y = datasets.make_classification(1000, 10, n_informative=5, class_sep=0.7)
clf.fit(X, y)

# changing numeric parameter without any restrictions
# in the rest of the parameter set
grid_search(clf.cv_results_, change='n_estimators')
plt.show()

# you can also use bars
grid_search(clf.cv_results_, change='n_estimators', kind='bar')
plt.show()

# changing a categorical variable without any constraints
grid_search(clf.cv_results_, change='criterion')
plt.show()

# bar
grid_search(clf.cv_results_, change='criterion', kind='bar')
plt.show()

# varying a numerical parameter but constraining
# the rest of the parameter set
grid_search(clf.cv_results_, change='n_estimators',
            subset={'max_features': 'sqrt', 'criterion': 'gini'},
            kind='bar')
plt.show()

# same as above but letting max_features to have two values
grid_search(clf.cv_results_, change='n_estimators',
            subset={'max_features': ['sqrt', 'log2'], 'criterion': 'gini'},
            kind='bar')
plt.show()

# varying two parameters - you can only show this as a
# matrix so the kind parameter will be ignored
grid_search(clf.cv_results_, change=('n_estimators', 'criterion'),
            subset={'max_features': 'sqrt'})
plt.show()
- sklearn_evaluation.plot.learning_curve(train_scores, test_scores, train_sizes, ax=None)
Plot a learning curve
Plot a metric vs number of examples for the training and test set
- Parameters
train_scores (array-like) – Scores for the training set
test_scores (array-like) – Scores for the test set
train_sizes (array-like) – Relative or absolute numbers of training examples used to generate the learning curve
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
Examples
from sklearn.model_selection import learning_curve
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import numpy as np
from sklearn_evaluation import plot

digits = load_digits()
X, y = digits.data, digits.target

# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = model_selection.ShuffleSplit(digits.data.shape[0], test_size=0.2, random_state=0)
cv = 5
estimator = GaussianNB()
train_sizes = np.linspace(.1, 1.0, 5)
train_sizes, train_scores, test_scores = learning_curve(
    estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)
plot.learning_curve(train_scores, test_scores, train_sizes)
plt.show()

# SVC is more expensive so we do a lower number of CV iterations:
cv = model_selection.ShuffleSplit(digits.data.shape[0], test_size=0.2, random_state=0)
cv = 5
estimator = SVC(gamma=0.001)
train_sizes = np.linspace(.1, 1.0, 5)
train_sizes, train_scores, test_scores = learning_curve(
    estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)
plot.learning_curve(train_scores, test_scores, train_sizes)
plt.show()
- sklearn_evaluation.plot.metrics_at_thresholds(fn, y_true, y_score, n_thresholds=10, start=0.0, ax=None)
Plot metrics at increasing thresholds
- sklearn_evaluation.plot.precision_at_proportions(y_true, y_score, ax=None)
Plot precision values at different proportions.
- Parameters
y_true (array-like) – Correct target values (ground truth).
y_score (array-like) – Target scores (estimator predictions).
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
- sklearn_evaluation.plot.precision_recall(y_true, y_score, ax=None)
Plot precision-recall curve.
- Parameters
y_true (array-like, shape = [n_samples]) – Correct target values (ground truth).
y_score (array-like, shape = [n_samples] or [n_samples, 2] for binary classification or [n_samples, n_classes] for multiclass) – Target scores (estimator predictions).
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
Notes
It is assumed that the y_score parameter columns are in order. For example, if y_true = [2, 2, 1, 0, 0, 1, 2], then the first column in y_score must contain the scores for class 0, the second column for class 1, and so on.
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
Examples
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn_evaluation import plot

data = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)
X = data[0]
y = data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
est = RandomForestClassifier()
est.fit(X_train, y_train)
y_pred = est.predict(X_test)
y_score = est.predict_proba(X_test)
y_true = y_test
plot.precision_recall(y_true, y_score)
plt.show()
- sklearn_evaluation.plot.roc(y_true, y_score, ax=None)
Plot ROC curve.
- Parameters
y_true (array-like, shape = [n_samples]) – Correct target values (ground truth).
y_score (array-like, shape = [n_samples] or [n_samples, 2] for binary classification or [n_samples, n_classes] for multiclass) – Target scores (estimator predictions).
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
Notes
It is assumed that the y_score parameter columns are in order. For example, if y_true = [2, 2, 1, 0, 0, 1, 2], then the first column in y_score must contain the scores for class 0, the second column for class 1, and so on.
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
Examples
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn_evaluation import plot

data = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)
X = data[0]
y = data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
est = RandomForestClassifier()
est.fit(X_train, y_train)
y_pred = est.predict(X_test)
y_score = est.predict_proba(X_test)
y_true = y_test
plot.roc(y_true, y_score)
plt.show()
- sklearn_evaluation.plot.validation_curve(train_scores, test_scores, param_range, param_name=None, semilogx=False, ax=None)
Plot a validation curve
Plot a metric vs hyperparameter values for the training and test set
- Parameters
train_scores (array-like) – Scores for the training set
test_scores (array-like) – Scores for the test set
param_range (array-like) – Hyperparameter values used to generate the curve
param_name (str) – Hyperparameter name
semilogx (bool) – Sets a log scale on the x axis
ax (matplotlib Axes) – Axes object to draw the plot onto, otherwise uses current Axes
- Returns
ax – Axes containing the plot
- Return type
matplotlib Axes
Examples
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
from sklearn_evaluation import plot

digits = load_digits()
X, y = digits.data, digits.target

param_range = np.logspace(-6, -1, 5)
param_name = "gamma"
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name=param_name, param_range=param_range,
    cv=5, scoring="accuracy", n_jobs=1)
plot.validation_curve(train_scores, test_scores, param_range, param_name,
                      semilogx=True)
plt.show()

param_range = np.array([1, 10, 100])
param_name = "n_estimators"
train_scores, test_scores = validation_curve(
    RandomForestClassifier(), X, y, param_name=param_name,
    param_range=param_range, cv=10, scoring="accuracy", n_jobs=1)
plot.validation_curve(train_scores, test_scores, param_range, param_name,
                      semilogx=False)
plt.show()