Automate experiment documentation

This example notebook uses the modlee package to document a machine learning experiment with a user-built model. We train a convolutional classifier on the Fashion MNIST dataset. After training, we can rebuild the model from the auto-documented model class. Prerequisites for this tutorial include familiarity with PyTorch and Lightning.

# Boilerplate imports
import os, sys
import ssl
# Bypass SSL certificate verification so dataset downloads succeed
# on machines with certificate issues
ssl._create_default_https_context = ssl._create_unverified_context
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

Import modlee and initialize with an API key.

# Set the API key to an environment variable,
# to simulate setting this in your shell profile
os.environ['MODLEE_API_KEY'] = "replace-with-your-api-key"
# Modlee-specific imports
import modlee
modlee.init(api_key=os.environ['MODLEE_API_KEY'])

Load the training data; we’ll use torchvision’s Fashion MNIST dataset.

# Get Fashion MNIST, and convert from grayscale to RGB for compatibility with the model
train_dataloader, val_dataloader = modlee.utils.get_fashion_mnist(num_output_channels=3)
num_classes = len(train_dataloader.dataset.classes)
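
To confirm the grayscale-to-RGB conversion, we can peek at one batch. This optional check assumes the dataloader yields (image, label) pairs, as the training step below does:

# Optional sanity check: images should now have three channels
x_sample, y_sample = next(iter(train_dataloader))
print(x_sample.shape, y_sample.shape)  # e.g. torch.Size([64, 3, 28, 28]) torch.Size([64])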

Next, we build the model from a torchvision ResNet architecture; note that its weights are randomly initialized, since we do not load pretrained weights. To enable automatic documentation, wrap the model in the modlee.model.ModleeModel class. ModleeModel subclasses `lightning.pytorch.LightningModule <https://lightning.ai/docs/pytorch/stable/common/lightning_module.html>`__ and uses the same structure for the training_step, validation_step, and configure_optimizers functions. Under the hood, ModleeModel also contains the callbacks to document the experiment metafeatures.

# Use a torchvision ResNet-18 architecture, sized to the number of classes
classifier_model = torchvision.models.resnet18(num_classes=num_classes)

# Subclass the ModleeModel class to enable automatic documentation
class ModleeClassifier(modlee.model.ModleeModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = classifier_model
        self.loss_fn = F.cross_entropy

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y_target = batch
        y_pred = self(x)
        loss = self.loss_fn(y_pred, y_target)
        return {"loss": loss}

    def validation_step(self, val_batch, batch_idx):
        x, y_target = val_batch
        y_pred = self(x)
        val_loss = self.loss_fn(y_pred, y_target)
        return {'val_loss': val_loss}

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        return optimizer

# Create the model object
modlee_model = ModleeClassifier()
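
As an optional sanity check (not part of the original flow), we can pass a dummy batch through the wrapper to confirm that it forwards inputs to the underlying classifier:

# A dummy 3-channel batch should yield one logit per class
dummy_input = torch.randn(2, 3, 28, 28)
print(modlee_model(dummy_input).shape)  # torch.Size([2, 10])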

Run the training loop, just for one epoch.

with modlee.start_run() as run:
    trainer = modlee.Trainer(max_epochs=1)
    trainer.fit(
        model=modlee_model,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader
    )
  | Name  | Type       | Params
-------------------------------------
0 | model | Classifier | 44.4 K
-------------------------------------
44.4 K    Trainable params
0         Non-trainable params
44.4 K    Total params
0.178     Total estimated model params size (MB)
Epoch 0: 100%|██████████| 938/938 [00:16<00:00, 57.47it/s, v_num=0]

modlee, which uses mlflow under the hood, documents the experiment in an automatically generated assets folder.

last_run_path = modlee.last_run_path()
print(f"Run path: {last_run_path}")

artifacts_path = os.path.join(last_run_path, 'artifacts')
artifacts = os.listdir(artifacts_path)
print(f"Saved artifacts: {artifacts}")

os.environ['ARTIFACTS_PATH'] = artifacts_path
# Add the artifacts directory to the path,
# so we can import the model
sys.path.insert(0, artifacts_path)
Run path: /home/ubuntu/projects/modlee_pypi/examples/mlruns/0/7a47086681324d0e924f9076a1262de9
Saved artifacts: ['transforms.txt', 'model_graph.py', 'model_graph.txt', 'model_size', 'model', 'cached_vars', 'stats_rep', 'snapshot_1.npy', 'lightning_logs', 'snapshot_0.npy', 'model.py', 'loss_calls.txt', 'model_summary.txt']
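
The artifacts are ordinary files, so we can also read them with plain Python instead of shell commands. A small illustrative sketch, reading one of the text artifacts listed above:

# Read the recorded loss calls from the artifacts directory
with open(os.path.join(artifacts_path, 'loss_calls.txt')) as f:
    print(f.read())
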
# Print out the first few lines of the model
print("Model graph:")
!sed -n -e 1,15p $ARTIFACTS_PATH/model_graph.py
!echo "        ..."
!sed -n -e 58,68p $ARTIFACTS_PATH/model_graph.py
!echo "        ..."
Model graph:

import torch, onnx2torch
from torch import tensor

class Model(torch.nn.Module):

    def __init__(self):
        super().__init__()
        setattr(self,'Conv', torch.nn.modules.conv.Conv2d(**{'in_channels':3,'out_channels':64,'kernel_size':(7, 7),'stride':(2, 2),'padding':(3, 3),'dilation':(1, 1),'groups':1,'padding_mode':'zeros'}))
        setattr(self,'Relu', torch.nn.modules.activation.ReLU(**{'inplace':False}))
        setattr(self,'MaxPool', torch.nn.modules.pooling.MaxPool2d(**{'kernel_size':[3, 3],'stride':[2, 2],'padding':[1, 1],'dilation':[1, 1],'return_indices':False,'ceil_mode':False}))
        setattr(self,'Conv_1', torch.nn.modules.conv.Conv2d(**{'in_channels':64,'out_channels':64,'kernel_size':(3, 3),'stride':(1, 1),'padding':(1, 1),'dilation':(1, 1),'groups':1,'padding_mode':'zeros'}))
        setattr(self,'Relu_1', torch.nn.modules.activation.ReLU(**{'inplace':False}))
        setattr(self,'Conv_2', torch.nn.modules.conv.Conv2d(**{'in_channels':64,'out_channels':64,'kernel_size':(3, 3),'stride':(1, 1),'padding':(1, 1),'dilation':(1, 1),'groups':1,'padding_mode':'zeros'}))
        setattr(self,'Add', onnx2torch.node_converters.binary_math_operations.OnnxBinaryMathOperation(**{'operation_type':'Add','broadcast':None,'axis':None}))
        ...

    def forward(self, input_1):
        conv = self.Conv(input_1);  input_1 = None
        relu = self.Relu(conv);  conv = None
        max_pool = self.MaxPool(relu);  relu = None
        conv_1 = self.Conv_1(max_pool)
        relu_1 = self.Relu_1(conv_1);  conv_1 = None
        conv_2 = self.Conv_2(relu_1);  relu_1 = None
        add = self.Add(conv_2, max_pool);  conv_2 = max_pool = None
        relu_2 = self.Relu_2(add);  add = None
        conv_3 = self.Conv_3(relu_2)
        ...
# Print the first lines of the data metafeatures
print("Data metafeatures:")
!head -20 $ARTIFACTS_PATH/stats_rep
Data metafeatures:
{
  "dataset_size": 60032,
  "num_sample": 1000,
  "batch_element_0": {
    "raw": {
      "feature_shape": [
        960,
        3,
        28,
        28
      ],
      "stats": {
        "kmeans": {
          "2": {
            "inertia": "155588.50824155417",
            "silhouette_score": "0.19201575",
            "calinski_harabasz_score": "248.3331975601121",
            "davies_bouldin_score": "1.9090644142081366",
            "time_taken": "0.6537415981292725"
          },
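
Because stats_rep is JSON, the metafeatures can also be parsed programmatically; a minimal sketch, using the key names visible in the output above:

import json

# Parse the cached data metafeatures into a dictionary
with open(os.path.join(artifacts_path, 'stats_rep')) as f:
    stats = json.load(f)
print(stats['dataset_size'], stats['num_sample'])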

We can build the model from the cached model_graph.Model class and confirm that we can pass an input through it. Note that this model’s weights will be uninitialized.

# Rebuilding from the object
import model_graph
rebuilt_model = model_graph.Model()

# Set both models to evaluation mode for inference
modlee_model.eval(); rebuilt_model.eval()

Next, pass a batch from the train dataloader through the rebuilt network and check that its output shape matches the original model’s.

# Get a batch from the training loader
x, y = next(iter(train_dataloader))
with torch.no_grad():
    y_original = modlee_model(x)
    y_rebuilt = rebuilt_model(x)
assert y_original.shape == y_rebuilt.shape

print(f"Original input and output shapes: {x.shape}, {y_original.shape}")
print(f"Output shape from module-rebuilt model: {y_rebuilt.shape}")

Alternatively, to load the model from the last checkpoint, we can load it directly from the cached model.pth.

# Reloading from the checkpoint
reloaded_model = torch.load(os.path.join(artifacts_path, 'model', 'data', 'model.pth'))
reloaded_model.eval()
with torch.no_grad():
    y_reloaded = reloaded_model(x)
assert y_original.shape == y_reloaded.shape
print(f"Output shape from checkpoint-reloaded model: {y_reloaded.shape}")
Original input and output shapes: torch.Size([64, 3, 28, 28]), torch.Size([64, 10])
Output shape from module-rebuilt model: torch.Size([64, 10])
Output shape from checkpoint-reloaded model: torch.Size([64, 10])