## Principal component analysis (PCA)

PCA is an unsupervised machine learning algorithm that reduces the dimensionality of your data, i.e. the number of input features. It finds a smaller set of new features (the principal components, each a linear combination of the original input features) that accounts for the majority of the variance in the data. This means that you can work with a smaller set of features (smaller data) while keeping most of the important information contained in the full set of input features.

This can drastically reduce your computing resource requirements, speed up the computation (sometimes by an order of magnitude), and make the data easier to visualize and interpret.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


In [None]:
from sklearn.datasets import load_digits

# Step 1: Load the digits dataset.
# The returned object provides the images (digits.images), their flattened
# pixel values (digits.data) and the digit labels (digits.target), all of
# which are used in the cells below.
digits = load_digits()


In [None]:
# Show the first 64 digit images as an 8 x 8 grid of thumbnails.
n_rows, n_cols = 8, 8
n_thumbnails = n_rows * n_cols

fig = plt.figure(figsize=(6, 6))  # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

# Each image is 8x8 pixels; pair every image with its target value.
thumbnails = zip(digits.images[:n_thumbnails], digits.target[:n_thumbnails])
for position, (image, label) in enumerate(thumbnails, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position, xticks=[], yticks=[])
    ax.imshow(image, cmap=plt.cm.binary, interpolation='nearest')

    # Label the image with the target value.
    ax.text(0, 7, str(label))

In [None]:
# Feature matrix: the flattened images, one image per row.
X = digits.data
# Target vector: the digit ID that belongs to each image.
y = digits.target

In [None]:
# Each row of X is one 8 x 8 digit image flattened into 64 values
# (features/pixels/dimensions). Display the 64 values of the first image.
X[0]

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the full feature matrix and project X onto the principal
# components. n_components is not set, so scikit-learn keeps all components
# (its documented default); the cells below inspect the result.
pca = PCA()
X_pca = pca.fit_transform(X)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Cumulative share of the total variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Count components from 1 so the x-axis agrees with its label: the first
# point is the variance explained by 1 component, not by "0 components".
component_counts = np.arange(1, len(cumulative_variance) + 1)

# Plot cumulative explained variance
plt.figure(figsize=(8, 5))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("Cumulative Explained Variance Plot")
plt.grid()
plt.show()

# Print max number of components (one column of X_pca per component)
print(f"Maximum number of principal components: {X_pca.shape[1]}")


In [None]:
# Visualize the samples projected onto PCA components 1 and 2.
# Only the digits 0, 1 and 2 are drawn, one colour per digit.
plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2

for color, digit in zip(colors, [0, 1, 2]):
    mask = y == digit  # rows of X_pca that belong to this digit
    plt.scatter(
        X_pca[mask, 0],
        X_pca[mask, 1],
        color=color,
        alpha=0.8,
        lw=lw,
        # One legend entry per digit; passing the whole target array here
        # would print the array itself as every legend label.
        label=str(digit),
    )
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA of Digits dataset");

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Apply PCA for Dimensionality Reduction.
# The projection is fitted on the training set only and then reused for the
# test set, so the test data does not influence the components.
# random_state is fixed, like the train/test split and the classifier in this
# notebook, so the result stays repeatable whenever scikit-learn selects a
# randomized SVD solver for this data shape.
pca = PCA(n_components=20, random_state=42)  # Reduce to 20 principal components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (100 trees, fixed seed) trained on the PCA-transformed
# training data.
rf_pca_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_pca_model.fit(X_train_pca, y_train)


In [None]:
# Predict the digit for every PCA-transformed test sample; the predictions
# are evaluated in the cells below.
y_pred_pca = rf_pca_model.predict(X_test_pca)


In [None]:
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)

# Text summary comparing the test labels with the model's predictions.
print("\nClassification Report (With PCA):")
report_with_pca = classification_report(y_test, y_pred_pca)
print(report_with_pca)

In [None]:
# Confusion matrix of true vs. predicted digits on the test set.
print("\nConfusion Matrix (With PCA):")
cm = confusion_matrix(
    y_test,
    y_pred_pca,
    labels=rf_pca_model.classes_,
)

# Draw the matrix with the dataset's digit names on both axes.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=digits.target_names,
)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()


# Assignments

In [None]:
# Assignment: use 1 PCA component. What accuracy do you get?
# Accuracy = 

# Post your accuracy and image of the confusion matrix in the chat.

In [None]:
# Assignment: use 2 PCA components. What accuracy do you get?
# Accuracy = 

# Post your accuracy and image of the confusion matrix in the chat.

In [None]:
# Assignment: use 5 PCA components. What accuracy do you get?
# Accuracy = 

# Post your accuracy and image of the confusion matrix in the chat.