{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b3f2fdb7",
   "metadata": {},
   "source": [
    "# Using MLFlow to track hyperparameter tuning\n",
    "In this notebook we will use MFflow to do some hyperparameter tuning with a our previously trained neural network in for the breast cancer dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "479e8814",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import mlflow as ml\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "from sklearn.datasets import load_breast_cancer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "from torch.utils.data import Dataset, DataLoader, random_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39f31fe3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the breast cancer dataset as a dataframe\n",
    "bc_dataset = load_breast_cancer(as_frame=True)\n",
    "\n",
    "# X is a Pandas dataframe\n",
    "# The columns are the features \n",
    "X = bc_dataset[\"data\"]\n",
    "\n",
    "# y is a Pandas series with the target class labels (0 - negative, 1 - positive)\n",
    "y = bc_dataset[\"target\"]\n",
    "\n",
    "# Using the train_test_split method we split 80% of the data into the X_train, y_train numpy arrays\n",
    "# The remaining 20% is our X_test and y_test \n",
    "X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.20, random_state=10)\n",
    "\n",
    "# Create a StandardScaler object\n",
    "sc = StandardScaler()\n",
    "\n",
    "# The StandardScaler standardizes features by removing the mean and scaling to unit variance\n",
    "# Prevents features with larger variances to dominate\n",
    "# We only need to apply this to our training/testing input data\n",
    "X_train = sc.fit_transform(X_train)\n",
    "X_test = sc.fit_transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c168e1a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataset class\n",
    "class Data(Dataset):\n",
    "  def __init__(self, X_train, y_train):\n",
    "    # need to convert float64 to float32 else \n",
    "    # will get the following error\n",
    "    # RuntimeError: expected scalar type Double but found Float\n",
    "    self.X = torch.from_numpy(X_train.astype(np.float32))\n",
    "    # need to convert float64 to Long else \n",
    "    # will get the following error\n",
    "    # RuntimeError: expected scalar type Long but found Float\n",
    "    # Unsqueeze function needed to \n",
    "    self.y = torch.from_numpy(y_train.astype(np.float32)).unsqueeze(1)\n",
    "    self.len = self.X.shape[0]\n",
    "  \n",
    "  def __getitem__(self, index):\n",
    "    return self.X[index], self.y[index]\n",
    "\n",
    "  def __len__(self):\n",
    "    return self.len\n",
    "\n",
    "# Initialize the training data \n",
    "traindata = Data(X_train, y_train)\n",
    "testdata = Data(X_test, y_test)\n",
    "\n",
    "val_size = 100\n",
    "train_size = len(traindata) - val_size\n",
    "\n",
    "generator = torch.Generator().manual_seed(42)\n",
    "train_ds, val_ds = random_split(traindata, [train_size, val_size], generator=generator)\n",
    "print(\"Length of training dataset: \", len(train_ds))\n",
    "print(\"Length of validation dataset: \", len(val_ds))\n",
    "\n",
    "# Create dataloader\n",
    "\n",
    "batch_size = 4\n",
    "trainloader = DataLoader(train_ds, batch_size=batch_size)\n",
    "validationloader = DataLoader(val_ds, batch_size=batch_size)\n",
    "testdata = Data(X_test, y_test)\n",
    "testloader = DataLoader(testdata, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78f4650b",
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.manual_seed(42)\n",
    "\n",
    "# Define the Neural Network class\n",
    "class NeuralNetwork(torch.nn.Module):\n",
    "    def __init__(self, input_dim, hidden_layer_dim, output_dim):\n",
    "        super(NeuralNetwork, self).__init__()\n",
    "        self.linear1 = nn.Linear(input_dim, hidden_layer_dim)\n",
    "        self.linear2 = nn.Linear(hidden_layer_dim, output_dim)\n",
    "        \n",
    "    def forward(self, x):\n",
    "        x = torch.relu(self.linear1(x))\n",
    "        x = torch.sigmoid(self.linear2(x))\n",
    "        return x\n",
    "    \n",
    "    def compute_accuracy(self, outputs, labels):\n",
    "        preds = torch.round(outputs)\n",
    "        return torch.tensor(torch.sum(preds == labels).item() / len(preds))\n",
    "\n",
    "    def training_step(self, inputs, labels):\n",
    "        out = self(inputs)                  # Generate predictions\n",
    "        loss = F.binary_cross_entropy(out, labels) # Calculate loss\n",
    "        return loss\n",
    "    \n",
    "    def training_epoch_end(self, outputs):\n",
    "        batch_losses = [x['train_loss'] for x in outputs]\n",
    "        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses\n",
    "        batch_accs = [x['train_acc'] for x in outputs]\n",
    "        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies\n",
    "        return {'train_loss': epoch_loss.item(), 'train_acc': epoch_acc.item()} # return a dictionary\n",
    "    \n",
    "    def validation_step(self, batch):\n",
    "        features, labels = batch \n",
    "        out = self(features)                    # Generate predictions\n",
    "        loss = F.binary_cross_entropy(out, labels)   # Calculate loss\n",
    "        acc = self.compute_accuracy(out, labels)           # Calculate accuracy\n",
    "        return {'val_loss': loss, 'val_acc': acc} # return a dictionary\n",
    "        \n",
    "    def validation_epoch_end(self, outputs):\n",
    "        batch_losses = [x['val_loss'] for x in outputs]\n",
    "        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses\n",
    "        batch_accs = [x['val_acc'] for x in outputs]\n",
    "        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies\n",
    "        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()} # return a dictionary\n",
    "    \n",
    "    def epoch_end(self, epoch, train_result, val_result):\n",
    "        print(\"Epoch [{}], train_loss: {:.4f}, train_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}\".format(\n",
    "            epoch, train_result['train_loss'], train_result['train_acc'], val_result['val_loss'], val_result['val_acc']))\n",
    "    \n",
    "    def fit(self, epochs, lr, train_loader, val_loader, opt_func):\n",
    "        history = []\n",
    "        optimizer = opt_func(self.parameters(), lr)\n",
    "        for epoch in range(epochs):\n",
    "            # Training Phase \n",
    "            train_results = []\n",
    "            for batch in train_loader:\n",
    "                inputs, labels = batch\n",
    "                loss = self.training_step(inputs, labels)\n",
    "                loss.backward()\n",
    "                optimizer.step()\n",
    "                optimizer.zero_grad()\n",
    "                out = self(inputs)\n",
    "                acc = self.compute_accuracy(out, labels)\n",
    "                train_results.append({'train_loss': loss, 'train_acc': acc})\n",
    "            train_result = self.training_epoch_end(train_results)\n",
    "            # Validation phase\n",
    "            val_results = [self.validation_step(batch) for batch in val_loader]\n",
    "            val_result = self.validation_epoch_end(val_results)\n",
    "            result = self.epoch_end(epoch, train_result, val_result)\n",
    "            history.append(train_result | val_result)\n",
    "        return history\n",
    "    \n",
    "    def score(self, testloader):\n",
    "        test_batch_accuracies = []\n",
    "        with torch.no_grad():\n",
    "            for data in testloader:\n",
    "                # Get the inputs and labels here\n",
    "                inputs, labels = data\n",
    "                # Compute the model output here\n",
    "                outputs = self(inputs)\n",
    "                # Use compute accuracy function here\n",
    "                batch_accuracy = self.compute_accuracy(outputs, labels)\n",
    "                test_batch_accuracies.append(batch_accuracy)\n",
    "        test_accuracy = torch.stack(test_batch_accuracies).mean()\n",
    "        return test_accuracy.item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1152b13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the input layer, hidden layer, and output layer dimensions\n",
    "input_dim = X_train.shape[1]\n",
    "hidden_layer_dim = 4\n",
    "output_dim = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68141257",
   "metadata": {},
   "outputs": [],
   "source": [
    "import mlflow.pytorch\n",
    "epochs = 10\n",
    "\n",
    "lrs = [0.1, 0.01, 0.001, 0.0001]\n",
    "\n",
    "opt_func = torch.optim.SGD\n",
    "\n",
    "for lr in lrs:\n",
    "    with ml.start_run():\n",
    "        clf = NeuralNetwork(input_dim, hidden_layer_dim, output_dim)\n",
    "        print(\"Training model with lr: \", lr)\n",
    "        history = clf.fit(epochs, lr, trainloader, validationloader, opt_func)\n",
    "        score = clf.score(testloader)\n",
    "        ml.log_param(\"lr\", lr)\n",
    "        ml.log_metric(\"train acc\", history[-1][\"train_acc\"])\n",
    "        ml.log_metric(\"val acc\", history[-1][\"val_acc\"])\n",
    "        ml.log_metric(\"test acc\", score)\n",
    "        mlflow.pytorch.log_model(clf, \"model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a74a949e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}