Binder

Tracking Machine Learning experiments

SQLiteTracker provides a simple yet powerful way to track ML experiments using a SQLite database.

[1]:
from sklearn_evaluation import SQLiteTracker

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
[4]:
# Load the iris dataset and hold out 33% of the rows for scoring.
iris = load_iris(as_frame=True)
X, y = iris['data'], iris['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Candidate regressors to fit and compare.
models = [RandomForestRegressor(), LinearRegression(), Lasso()]

# Create the experiment tracker used by every following cell. Without this
# line `tracker` is undefined and the first tracker.new() call raises NameError.
# The path matches the database file removed by the clean-up cell at the end.
tracker = SQLiteTracker('my_experiments.db')
[5]:
# Fit every candidate model, score it on the held-out split, and log the result.
for estimator in models:
    model = type(estimator).__name__
    print(f'Fitting {model}')

    # tracker.new() creates the db entry up front and hands back its uuid
    experiment_id = tracker.new()

    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # metric and model name first, then the estimator's hyperparameters
    record = {'mse': mse, 'model': model}
    record.update(estimator.get_params())

    # attach the collected values to the entry via .update(uuid, {...})
    tracker.update(experiment_id, record)
Fitting RandomForestRegressor
Fitting LinearRegression
Fitting Lasso

Or use .insert(uuid, params) to supply your own ID:

[6]:
# Train an SVR and score it on the held-out split.
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Same record layout as before: metric, model name, then hyperparameters.
svr_record = {'mse': mse, 'model': type(svr).__name__}
svr_record.update(svr.get_params())

# .insert() stores the record under an ID chosen by the caller
tracker.insert('my_uuid', svr_record)

Displaying the tracker shows the most recent experiments by default:

[7]:
# Leaving the tracker as the cell's last expression displays its experiments table
tracker
[7]:

SQLiteTracker

uuid created parameters comment
462b9a96dafa45f9ae900190b9116bfb 2022-07-05 02:28:05 {"mse": 0.011009999999999999, "model": "RandomForestRegressor", "bootstrap": true, "ccp_alpha": 0.0, "criterion": "squared_error", "max_depth": null, "max_features": 1.0, "max_leaf_nodes": null, "max_samples": null, "min_impurity_decrease": 0.0, "min_samples_leaf": 1, "min_samples_split": 2, "min_weight_fraction_leaf": 0.0, "n_estimators": 100, "n_jobs": null, "oob_score": false, "random_state": null, "verbose": 0, "warm_start": false}
63ad584609fe4d5ca40886192da36aaf 2022-07-05 02:28:05 {"mse": 0.042600341137617896, "model": "LinearRegression", "copy_X": true, "fit_intercept": true, "n_jobs": null, "normalize": "deprecated", "positive": false}
e415bd238f6b42df8d417ede275dd8b5 2022-07-05 02:28:05 {"mse": 0.4317655183287654, "model": "Lasso", "alpha": 1.0, "copy_X": true, "fit_intercept": true, "max_iter": 1000, "normalize": "deprecated", "positive": false, "precompute": false, "random_state": null, "selection": "cyclic", "tol": 0.0001, "warm_start": false}
my_uuid 2022-07-05 02:28:05 {"mse": 0.03041912541362143, "model": "SVR", "C": 1.0, "cache_size": 200, "coef0": 0.0, "degree": 3, "epsilon": 0.1, "gamma": "scale", "kernel": "rbf", "max_iter": -1, "shrinking": true, "tol": 0.001, "verbose": false}

(Most recent experiments)

Querying experiments

[8]:
# Rank all logged experiments by their mean squared error, lowest first.
mse_ranking_sql = """
SELECT uuid,
       json_extract(parameters, '$.model') AS model,
       json_extract(parameters, '$.mse') AS mse
FROM experiments
ORDER BY json_extract(parameters, '$.mse') ASC
"""
ordered = tracker.query(mse_ranking_sql)
ordered
[8]:
model mse
uuid
462b9a96dafa45f9ae900190b9116bfb RandomForestRegressor 0.011010
my_uuid SVR 0.030419
63ad584609fe4d5ca40886192da36aaf LinearRegression 0.042600
e415bd238f6b42df8d417ede275dd8b5 Lasso 0.431766

The query method returns a data frame with “uuid” as the index:

[9]:
# Inspect the type of the object returned by tracker.query()
type(ordered)
[9]:
pandas.core.frame.DataFrame

Adding comments

[10]:
# `ordered` is sorted by ascending MSE, so its first index entry is the best run.
best_uuid = ordered.index[0]
tracker.comment(best_uuid, 'Best performing experiment')

Use tracker[uuid] to get a single experiment:

[11]:
# Index the tracker with a uuid (here the first row of `ordered`) to fetch that experiment
tracker[ordered.index[0]]
[11]:
created parameters comment
uuid
462b9a96dafa45f9ae900190b9116bfb 2022-07-05 02:28:05 {"mse": 0.011009999999999999, "model": "Random... Best performing experiment

Getting recent experiments

The recent method also returns a data frame:

[12]:
# Fetch the most recent experiments; parameters stay in a single JSON column
df = tracker.recent()
df
[12]:
created parameters comment
uuid
462b9a96dafa45f9ae900190b9116bfb 2022-07-05 02:28:05 {"mse": 0.011009999999999999, "model": "Random... Best performing experiment
63ad584609fe4d5ca40886192da36aaf 2022-07-05 02:28:05 {"mse": 0.042600341137617896, "model": "Linear... None
e415bd238f6b42df8d417ede275dd8b5 2022-07-05 02:28:05 {"mse": 0.4317655183287654, "model": "Lasso", ... None
my_uuid 2022-07-05 02:28:05 {"mse": 0.03041912541362143, "model": "SVR", "... None

Pass normalize=True to convert the nested JSON dictionary into columns:

[13]:
# normalize=True expands the JSON parameters into one column per key
df = tracker.recent(normalize=True)
df
[13]:
created mse model bootstrap ccp_alpha criterion max_depth max_features max_leaf_nodes max_samples ... tol C cache_size coef0 degree epsilon gamma kernel shrinking comment
uuid
462b9a96dafa45f9ae900190b9116bfb 2022-07-05 02:28:05 0.011010 RandomForestRegressor True 0.0 squared_error NaN 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN Best performing experiment
63ad584609fe4d5ca40886192da36aaf 2022-07-05 02:28:05 0.042600 LinearRegression NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN None
e415bd238f6b42df8d417ede275dd8b5 2022-07-05 02:28:05 0.431766 Lasso NaN NaN NaN NaN NaN NaN NaN ... 0.0001 NaN NaN NaN NaN NaN NaN NaN NaN None
my_uuid 2022-07-05 02:28:05 0.030419 SVR NaN NaN NaN NaN NaN NaN NaN ... 0.0010 1.0 200.0 0.0 3.0 0.1 scale rbf True None

4 rows × 38 columns

[14]:
# Delete the example database so the tutorial leaves no files behind.
from pathlib import Path

# missing_ok=True (Python 3.8+) keeps this clean-up safe to re-run: once the
# file is gone, a second execution no longer raises FileNotFoundError.
Path('my_experiments.db').unlink(missing_ok=True)
[ ]: