{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "923565f5",
   "metadata": {},
   "source": [
    "# Neural Networks for Breast Cancer Classification\n",
    "\n",
    "In this notebook we will see how to solve a classification problem using a neural network in Pytorch.\n",
    "\n",
    "We will use\n",
    "- The Scikit-Learn submodules `datasets` to import the breast cancer data set\n",
    "- Split the dataset using `test_train_split` into a training and testing subsets\n",
    "- The Python library Pytorch\n",
    "- We will create our own\n",
    "    - Dataset class\n",
    "    - Dataloader class\n",
    "    - Define our own nn.Module class\n",
    "- We will investigate the layers and weights of the model\n",
    "- We will see how to train the model by\n",
    "    - Setting the epochs\n",
    "    - Computing the loss\n",
    "    - Calling the .backward() method to compute the gradients of the weights\n",
    "- Compute the accuracy of our model on the test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b696469a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "\n",
    "from sklearn.datasets import load_breast_cancer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "from torch.utils.data import Dataset, DataLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55ec8818",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the breast cancer dataset as a dataframe\n",
    "bc_dataset = load_breast_cancer(as_frame=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "371db605",
   "metadata": {},
   "outputs": [],
   "source": [
    "# X is a Pandas dataframe\n",
    "# The columns are the features \n",
    "X = bc_dataset[\"data\"]\n",
    "# y is a Pandas series with the target class labels (0 - negative, 1 - positive)\n",
    "y = bc_dataset[\"target\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c48f10c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using the train_test_split method we split 80% of the data into the X_train, y_train numpy arrays\n",
    "# The remaining 20% is our X_test and y_test \n",
    "X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.20, random_state=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab3df8b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a StandardScaler object\n",
    "sc = StandardScaler()\n",
    "\n",
    "# The StandardScaler standardizes features by removing the mean and scaling to unit variance\n",
    "# Prevents features with larger variances to dominate\n",
    "# We only need to apply this to our training/testing input data since the output is binary 0/1\n",
    "X_train = sc.fit_transform(X_train)\n",
    "X_test = sc.fit_transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38721eb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataset class\n",
    "class Data(Dataset):\n",
    "  def __init__(self, X_train, y_train):\n",
    "    # need to convert float64 to float32 else \n",
    "    # will get the following error\n",
    "    # RuntimeError: expected scalar type Double but found Float\n",
    "    self.X = torch.from_numpy(X_train.astype(np.float32))\n",
    "    # need to convert float64 to Long else \n",
    "    # will get the following error\n",
    "    # RuntimeError: expected scalar type Long but found Float\n",
    "    # Unsqueeze function needed to \n",
    "    self.y = torch.from_numpy(y_train.astype(np.float32)).unsqueeze(1)\n",
    "    self.len = self.X.shape[0]\n",
    "  \n",
    "  def __getitem__(self, index):\n",
    "    return self.X[index], self.y[index]\n",
    "\n",
    "  def __len__(self):\n",
    "    return self.len\n",
    "\n",
    "# Initialize the training data \n",
    "traindata = Data(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89e4e71f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataloader\n",
    "\n",
    "batch_size = 4\n",
    "trainloader = DataLoader(traindata, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37bf9ccc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of features (len of X cols)\n",
    "input_dim = X_train.shape[1]\n",
    "# number of hidden layers\n",
    "hidden_layer_dim = 4\n",
    "# number of classes (unique of y)\n",
    "output_dim = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "234a92ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.manual_seed(42)\n",
    "\n",
    "# Define the Neural Network class\n",
    "class NeuralNetwork(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNetwork, self).__init__()\n",
    "        self.linear1 = nn.Linear(input_dim, hidden_layer_dim)\n",
    "        self.linear2 = nn.Linear(hidden_layer_dim, output_dim)\n",
    "        \n",
    "    def forward(self, x):\n",
    "        x = torch.relu(self.linear1(x))\n",
    "        x = torch.sigmoid(self.linear2(x))\n",
    "        return x\n",
    "    \n",
    "clf = NeuralNetwork()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fae9c97",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Print out the parameters of the model\n",
    "print(clf.parameters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63c6e16c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Access the layers of a model\n",
    "\n",
    "# Output the parameters of the model and store them in a list\n",
    "[theta, b] = clf.linear1.parameters()\n",
    "\n",
    "# Print the weights of layer 1, this object is a tuple with\n",
    "print(theta)\n",
    "\n",
    "# Print the weights of the biases\n",
    "print(b)\n",
    "\n",
    "\n",
    "print(\"Type of theta: \", type(theta))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f56746f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# To access the tensor data we use the .data attribute\n",
    "print(theta.data)\n",
    "\n",
    "# The linear1 layer weight is stored in\n",
    "# clf.linear1.weight\n",
    "print(clf.linear1.weight)\n",
    "# You can access and modify the weights tensor\n",
    "print(clf.linear1.weight.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56e293a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the loss function\n",
    "criterion = nn.BCELoss()\n",
    "\n",
    "# Set the optimzer as Stochastic Gradient Descent with a learning rate of 0.01\n",
    "optimizer = torch.optim.SGD(clf.parameters(), lr=0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5144186",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the number of epochs\n",
    "epochs = 10\n",
    "\n",
    "# Set initial zero values of correct and total labels\n",
    "# These are used to compute the accuracy\n",
    "correct, total = 0, 0\n",
    "# Set empty lists to stroe the losses and accuracies\n",
    "losses = []\n",
    "accuracies = []\n",
    "for epoch in range(epochs):\n",
    "    for i, data in enumerate(trainloader, 0):\n",
    "        # get input and target batch\n",
    "        inputs, targets = data\n",
    "\n",
    "        # forward propagation\n",
    "        outputs = clf(inputs)\n",
    "        \n",
    "        # compute the loss\n",
    "        loss = criterion(outputs, targets)\n",
    "        \n",
    "        # compute prediction, anything greater than 0.5 goes to 1, less than to 0\n",
    "        predicted = torch.round(outputs.data)\n",
    "        total += targets.size(0)\n",
    "        correct += (predicted == targets).sum().item()\n",
    "       \n",
    "        \n",
    "        # backprop\n",
    "        optimizer.zero_grad() # set optimizer to zero grad to remove previous epoch gradients\n",
    "        loss.backward() \n",
    "        optimizer.step() # update coefficients\n",
    "    acc = correct / total\n",
    "    \n",
    "    losses.append(loss.item())\n",
    "    accuracies.append(acc)\n",
    "    print(\"epoch {}  loss : {:.5f}  accuracy : {:.5f}\".format(epoch, loss, acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98707752",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)\n",
    "ax1.plot(losses)\n",
    "ax1.set_title('Loss vs Epochs')\n",
    "ax1.set_xlabel('Epochs')\n",
    "ax1.set_ylabel('Loss')\n",
    "ax2.plot(accuracies)\n",
    "ax2.set_title('Accuracy vs Epochs')\n",
    "ax2.set_xlabel('Epochs')\n",
    "ax1.set_ylabel('Accuracy')\n",
    "plt.subplots_adjust(left=0.1,\n",
    "                    bottom=0.1,\n",
    "                    right=0.9,\n",
    "                    top=0.9,\n",
    "                    wspace=0.4,\n",
    "                    hspace=0.4)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7746b7f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "testdata = Data(X_test, y_test)\n",
    "testloader = DataLoader(testdata, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "319c87b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "correct, total = 0, 0\n",
    "# no need to calculate gradients during inference\n",
    "with torch.no_grad():\n",
    "    for data in testloader:\n",
    "        inputs, labels = data\n",
    "        # calculate output by running through the network\n",
    "        outputs = clf(inputs)\n",
    "        # get the predictions\n",
    "        predicted = torch.round(outputs.data)\n",
    "        # update results\n",
    "        total += labels.size(0)\n",
    "        correct += (predicted == labels).sum().item()\n",
    "    print(f'Accuracy of the network on the {len(testdata)} test data: {100 * correct // total} %')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0e8efe3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}