# Neural Networks for Breast Cancer Classification

In this notebook we will see how to solve a classification problem using a neural network in PyTorch.

We will use
- The Scikit-Learn `datasets` submodule to import the breast cancer data set
- The Scikit-Learn `train_test_split` function to split the dataset into training and testing subsets
- The Python library PyTorch
- We will create our own
 - Dataset class
 - DataLoader instance
 - nn.Module subclass
- We will investigate the layers and weights of the model
- We will see how to train the model by
 - Setting the epochs
 - Computing the loss
 - Calling the .backward() method to compute the gradients of the weights
- Compute the accuracy of our model on the test set

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the breast cancer dataset bundled with Scikit-Learn.
# as_frame=True makes the features available as a Pandas DataFrame
# and the target as a Pandas Series.
bc_dataset = load_breast_cancer(as_frame=True)

In [None]:
# X is a Pandas dataframe
# Each row is one sample and each column is one feature
X = bc_dataset["data"]
# y is a Pandas series with the target class labels (0 - malignant, 1 - benign)
y = bc_dataset["target"]

In [None]:
# Using the train_test_split function we split 80% of the data into the X_train, y_train numpy arrays
# The remaining 20% is our X_test and y_test
# random_state fixes the shuffling so that the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.20, random_state=10)

In [None]:
# Create a StandardScaler object
sc = StandardScaler()

# The StandardScaler standardizes features by removing the mean and scaling to unit variance
# This prevents features with larger variances from dominating
# We only need to apply this to our training/testing input data since the output is binary 0/1
#
# The scaler is fitted on the training data ONLY. The test data is then transformed
# with the training mean/variance, so that
#  - no information from the test set leaks into the preprocessing, and
#  - both sets are mapped through exactly the same transformation.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Create dataset class
class Data(Dataset):
    """In-memory dataset that pairs each feature row with its label.

    Parameters
    ----------
    X_train : numpy.ndarray
        Feature matrix, one sample per row.
    y_train : numpy.ndarray
        Label vector, one label per sample.

    Attributes
    ----------
    X : float32 tensor built from ``X_train``.
    y : float32 tensor built from ``y_train`` with an extra trailing
        dimension (a 1-D vector of length n becomes shape (n, 1)).
    len : number of samples (size of the first dimension of ``X``).
    """

    def __init__(self, X_train, y_train):
        # The layers of the model hold float32 weights, so the float64
        # numpy features have to be cast before being turned into a tensor.
        features = X_train.astype(np.float32)
        self.X = torch.from_numpy(features)

        # The binary cross-entropy loss used in this notebook needs float
        # targets, hence the cast of the integer labels to float32.
        labels = y_train.astype(np.float32)
        # unsqueeze(1) adds a trailing dimension so the targets have the
        # same (batch, 1) shape as the single-unit output of the model.
        self.y = torch.from_numpy(labels).unsqueeze(1)

        self.len = self.X.size(0)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        return self.X[index], self.y[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.len

# Wrap the (scaled) training features and their labels in our Dataset class
traindata = Data(X_train, y_train)

In [None]:
# Create dataloader

# Number of samples per mini-batch
batch_size = 4
# shuffle=True reshuffles the training samples at the start of every epoch, so that
# stochastic gradient descent does not see the exact same mini-batches in the exact
# same order each epoch. The shuffling draws from PyTorch's global random generator,
# so it is reproducible as long as torch.manual_seed is set before training.
trainloader = DataLoader(traindata, batch_size=batch_size, shuffle=True)

In [None]:
# number of input features (number of columns of X_train)
input_dim = X_train.shape[1]
# number of units (neurons) in the single hidden layer
hidden_layer_dim = 4
# size of the output layer: a single unit whose sigmoid output is the
# predicted probability of class 1
output_dim = 1

In [None]:
# Seed PyTorch's random generator so the randomly initialised weights are reproducible
torch.manual_seed(42)


# Define the Neural Network class
class NeuralNetwork(nn.Module):
    """Feed-forward binary classifier with one hidden layer.

    Architecture: Linear -> ReLU -> Linear -> Sigmoid.
    The layer sizes are read from the notebook-level variables
    ``input_dim``, ``hidden_layer_dim`` and ``output_dim``.
    """

    def __init__(self):
        super().__init__()
        # input features -> hidden units
        self.linear1 = nn.Linear(input_dim, hidden_layer_dim)
        # hidden units -> output unit(s)
        self.linear2 = nn.Linear(hidden_layer_dim, output_dim)

    def forward(self, x):
        """Map a batch of feature rows to values in (0, 1) via a final sigmoid."""
        hidden = torch.relu(self.linear1(x))
        return torch.sigmoid(self.linear2(hidden))


# Instantiate the classifier
clf = NeuralNetwork()

In [None]:
# Print the architecture of the model (its layers and their sizes)
print(clf)

# Print the name and shape of every learnable parameter of the model.
# Note: parameters / named_parameters are methods and have to be called;
# printing `clf.parameters` without parentheses only shows the bound method.
for name, param in clf.named_parameters():
    print(name, tuple(param.shape))

In [None]:
# Access the layers of a model

# A linear layer owns two parameters: its weight matrix and its bias vector.
# .parameters() yields them in that order, so we can unpack them directly.
theta, b = clf.linear1.parameters()

# Print the weights of layer 1
# (one row per hidden unit, one column per input feature)
print(theta)

# Print the biases of layer 1 (one value per hidden unit)
print(b)

# The weights are wrapped in a Parameter object rather than being a plain tensor
print("Type of theta: ", type(theta))

In [None]:
# To access the underlying tensor of a parameter we use the .data attribute
print(theta.data)

# The weight of the linear1 layer can also be reached directly through
# clf.linear1.weight
print(clf.linear1.weight)
# Its .data attribute gives access to the weights tensor, which can be read or modified
print(clf.linear1.weight.data)

In [None]:
# Define the loss function: binary cross-entropy.
# It compares the probabilities produced by the sigmoid output of the model
# with the 0/1 targets.
criterion = nn.BCELoss()

# Set the optimizer as Stochastic Gradient Descent with a learning rate of 0.01
optimizer = torch.optim.SGD(clf.parameters(), lr=0.01)

In [None]:
# Set the number of epochs
epochs = 10

# Empty lists to store the loss and accuracy of every epoch (used for plotting)
losses = []
accuracies = []

# Put the model in training mode. This makes no difference for the layers used
# here, but it is required as soon as layers such as dropout are added.
clf.train()

for epoch in range(epochs):
    # Reset the running statistics at the start of every epoch, so that the
    # reported loss/accuracy describe this epoch only and not all epochs so far
    correct, total = 0, 0
    running_loss = 0.0

    for inputs, targets in trainloader:
        # forward propagation
        outputs = clf(inputs)

        # compute the loss (averaged over the samples of the batch)
        loss = criterion(outputs, targets)

        # backprop
        optimizer.zero_grad()  # clear the gradients left over from the previous batch
        loss.backward()        # compute the gradients of the loss w.r.t. the weights
        optimizer.step()       # update the weights

        # bookkeeping: weight the batch loss by the batch size so that a smaller
        # final batch does not distort the epoch average
        batch_len = targets.size(0)
        running_loss += loss.item() * batch_len

        # compute prediction: probabilities are rounded to the nearest class (0 or 1)
        # detach() gives a tensor that is not tracked by autograd
        predicted = torch.round(outputs.detach())
        total += batch_len
        correct += (predicted == targets).sum().item()

    # mean loss and accuracy over all training samples seen in this epoch
    epoch_loss = running_loss / total
    acc = correct / total

    losses.append(epoch_loss)
    accuracies.append(acc)
    print("epoch {} loss : {:.5f} accuracy : {:.5f}".format(epoch, epoch_loss, acc))

In [None]:
# Plot the training history side by side: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)

# Left panel: training loss
ax1.plot(losses)
ax1.set_title('Loss vs Epochs')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')

# Right panel: training accuracy
ax2.plot(accuracies)
ax2.set_title('Accuracy vs Epochs')
ax2.set_xlabel('Epochs')
# The y-label belongs to ax2; setting it on ax1 would overwrite the 'Loss' label
ax2.set_ylabel('Accuracy')

# Leave some horizontal room between the two panels
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)
plt.show()

In [None]:
# Wrap the held-out test split in the same Dataset class used for the training data
testdata = Data(X_test, y_test)
# Serve the test data in mini-batches of the same size as used for training
testloader = DataLoader(testdata, batch_size=batch_size)

In [None]:
# Counters for the correctly classified and the total number of test samples
correct, total = 0, 0

# Put the model in evaluation mode. This makes no difference for the layers used
# here, but it is required as soon as layers such as dropout are added.
clf.eval()

# no need to calculate gradients during inference
with torch.no_grad():
    for inputs, labels in testloader:
        # calculate output by running through the network
        outputs = clf(inputs)
        # get the predictions: probabilities are rounded to the nearest class (0 or 1)
        predicted = torch.round(outputs)
        # update results
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Report the accuracy with two decimals; integer division (//) would silently
# round the percentage down to the next lower whole number
print(f'Accuracy of the network on the {len(testdata)} test data: {100 * correct / total:.2f} %')