{ "cells": [ { "cell_type": "markdown", "id": "923565f5", "metadata": {}, "source": [ "# Neural Networks for Breast Cancer Classification\n", "\n", "In this notebook we will see how to solve a classification problem using a neural network in Pytorch.\n", "\n", "We will use\n", "- The Scikit-Learn submodules `datasets` to import the breast cancer data set\n", "- Split the dataset using `test_train_split` into a training and testing subsets\n", "- The Python library Pytorch\n", "- We will create our own\n", " - Dataset class\n", " - Dataloader class\n", " - Define our own nn.Module class\n", "- We will investigate the layers and weights of the model\n", "- We will see how to train the model by\n", " - Setting the epochs\n", " - Computing the loss\n", " - Calling the .backward() method to compute the gradients of the weights\n", "- Compute the accuracy of our model on the test set" ] }, { "cell_type": "code", "execution_count": null, "id": "b696469a", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import torch\n", "import torch.nn as nn\n", "\n", "from sklearn.datasets import load_breast_cancer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "from torch.utils.data import Dataset, DataLoader" ] }, { "cell_type": "code", "execution_count": null, "id": "55ec8818", "metadata": {}, "outputs": [], "source": [ "# Load the breast cancer dataset as a dataframe\n", "bc_dataset = load_breast_cancer(as_frame=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "371db605", "metadata": {}, "outputs": [], "source": [ "# X is a Pandas dataframe\n", "# The columns are the features \n", "X = bc_dataset[\"data\"]\n", "# y is a Pandas series with the target class labels (0 - negative, 1 - positive)\n", "y = bc_dataset[\"target\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "c48f10c4", "metadata": {}, "outputs": [], "source": [ "# Using the train_test_split method we split 80% of the data into the X_train, y_train numpy arrays\n", "# The remaining 20% is our X_test and y_test \n", "X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.20, random_state=10)" ] }, { "cell_type": "code", "execution_count": null, "id": "ab3df8b5", "metadata": {}, "outputs": [], "source": [ "# Create a StandardScaler object\n", "sc = StandardScaler()\n", "\n", "# The StandardScaler standardizes features by removing the mean and scaling to unit variance\n", "# Prevents features with larger variances to dominate\n", "# We only need to apply this to our training/testing input data since the output is binary 0/1\n", "X_train = sc.fit_transform(X_train)\n", "X_test = sc.fit_transform(X_test)" ] }, { "cell_type": "code", "execution_count": null, "id": "38721eb6", "metadata": {}, "outputs": [], "source": [ "# Create dataset class\n", "class Data(Dataset):\n", " def __init__(self, X_train, y_train):\n", " # need to convert float64 to float32 else \n", " # will get the following error\n", " # RuntimeError: expected scalar type Double but found Float\n", " self.X = torch.from_numpy(X_train.astype(np.float32))\n", " # need to convert float64 to Long else \n", " # will get the following error\n", " # RuntimeError: expected scalar type Long but found Float\n", " # Unsqueeze function needed to \n", " self.y = torch.from_numpy(y_train.astype(np.float32)).unsqueeze(1)\n", " self.len = self.X.shape[0]\n", " \n", " def __getitem__(self, index):\n", " return self.X[index], self.y[index]\n", "\n", " def __len__(self):\n", " return self.len\n", "\n", "# Initialize the training data \n", "traindata = Data(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "id": "89e4e71f", "metadata": {}, "outputs": [], "source": [ "# Create dataloader\n", "\n", "batch_size = 4\n", "trainloader = DataLoader(traindata, batch_size=batch_size)" ] }, { "cell_type": "code", "execution_count": null, "id": "37bf9ccc", "metadata": {}, "outputs": [], "source": [ "# number of features (len of X cols)\n", "input_dim = X_train.shape[1]\n", "# number of hidden layers\n", "hidden_layer_dim = 4\n", "# number of classes (unique of y)\n", "output_dim = 1" ] }, { "cell_type": "code", "execution_count": null, "id": "234a92ac", "metadata": {}, "outputs": [], "source": [ "torch.manual_seed(42)\n", "\n", "# Define the Neural Network class\n", "class NeuralNetwork(torch.nn.Module):\n", " def __init__(self):\n", " super(NeuralNetwork, self).__init__()\n", " self.linear1 = nn.Linear(input_dim, hidden_layer_dim)\n", " self.linear2 = nn.Linear(hidden_layer_dim, output_dim)\n", " \n", " def forward(self, x):\n", " x = torch.relu(self.linear1(x))\n", " x = torch.sigmoid(self.linear2(x))\n", " return x\n", " \n", "clf = NeuralNetwork()" ] }, { "cell_type": "code", "execution_count": null, "id": "9fae9c97", "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Print out the parameters of the model\n", "print(clf.parameters)" ] }, { "cell_type": "code", "execution_count": null, "id": "63c6e16c", "metadata": {}, "outputs": [], "source": [ "# Access the layers of a model\n", "\n", "# Output the parameters of the model and store them in a list\n", "[theta, b] = clf.linear1.parameters()\n", "\n", "# Print the weights of layer 1, this object is a tuple with\n", "print(theta)\n", "\n", "# Print the weights of the biases\n", "print(b)\n", "\n", "\n", "print(\"Type of theta: \", type(theta))" ] }, { "cell_type": "code", "execution_count": null, "id": "f56746f1", "metadata": {}, "outputs": [], "source": [ "# To access the tensor data we use the .data attribute\n", "print(theta.data)\n", "\n", "# The linear1 layer weight is stored in\n", "# clf.linear1.weight\n", "print(clf.linear1.weight)\n", "# You can access and modify the weights tensor\n", "print(clf.linear1.weight.data)" ] }, { "cell_type": "code", "execution_count": null, "id": "56e293a8", "metadata": {}, "outputs": [], "source": [ "# Define the loss function\n", "criterion = nn.BCELoss()\n", "\n", "# Set the optimzer as Stochastic Gradient Descent with a learning rate of 0.01\n", "optimizer = torch.optim.SGD(clf.parameters(), lr=0.01)" ] }, { "cell_type": "code", "execution_count": null, "id": "a5144186", "metadata": {}, "outputs": [], "source": [ "# Set the number of epochs\n", "epochs = 10\n", "\n", "# Set initial zero values of correct and total labels\n", "# These are used to compute the accuracy\n", "correct, total = 0, 0\n", "# Set empty lists to stroe the losses and accuracies\n", "losses = []\n", "accuracies = []\n", "for epoch in range(epochs):\n", " for i, data in enumerate(trainloader, 0):\n", " # get input and target batch\n", " inputs, targets = data\n", "\n", " # forward propagation\n", " outputs = clf(inputs)\n", " \n", " # compute the loss\n", " loss = criterion(outputs, targets)\n", " \n", " # compute prediction, anything greater than 0.5 goes to 1, less than to 0\n", " predicted = torch.round(outputs.data)\n", " total += targets.size(0)\n", " correct += (predicted == targets).sum().item()\n", " \n", " \n", " # backprop\n", " optimizer.zero_grad() # set optimizer to zero grad to remove previous epoch gradients\n", " loss.backward() \n", " optimizer.step() # update coefficients\n", " acc = correct / total\n", " \n", " losses.append(loss.item())\n", " accuracies.append(acc)\n", " print(\"epoch {} loss : {:.5f} accuracy : {:.5f}\".format(epoch, loss, acc))" ] }, { "cell_type": "code", "execution_count": null, "id": "98707752", "metadata": {}, "outputs": [], "source": [ "fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)\n", "ax1.plot(losses)\n", "ax1.set_title('Loss vs Epochs')\n", "ax1.set_xlabel('Epochs')\n", "ax1.set_ylabel('Loss')\n", "ax2.plot(accuracies)\n", "ax2.set_title('Accuracy vs Epochs')\n", "ax2.set_xlabel('Epochs')\n", "ax1.set_ylabel('Accuracy')\n", "plt.subplots_adjust(left=0.1,\n", " bottom=0.1,\n", " right=0.9,\n", " top=0.9,\n", " wspace=0.4,\n", " hspace=0.4)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "7746b7f9", "metadata": {}, "outputs": [], "source": [ "testdata = Data(X_test, y_test)\n", "testloader = DataLoader(testdata, batch_size=batch_size)" ] }, { "cell_type": "code", "execution_count": null, "id": "319c87b0", "metadata": {}, "outputs": [], "source": [ "correct, total = 0, 0\n", "# no need to calculate gradients during inference\n", "with torch.no_grad():\n", " for data in testloader:\n", " inputs, labels = data\n", " # calculate output by running through the network\n", " outputs = clf(inputs)\n", " # get the predictions\n", " predicted = torch.round(outputs.data)\n", " # update results\n", " total += labels.size(0)\n", " correct += (predicted == labels).sum().item()\n", " print(f'Accuracy of the network on the {len(testdata)} test data: {100 * correct // total} %')" ] }, { "cell_type": "code", "execution_count": null, "id": "d0e8efe3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }