{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Digits classification using Random Forest\n",
    "\n",
    "Train a `RandomForestClassifier` on scikit-learn's handwritten-digits dataset (8x8 pixel images), evaluate it on a held-out test split, and inspect which input features the forest relies on most."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel can Run All top to bottom\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.datasets import load_digits\n",
    "from sklearn.decomposition import PCA  # not used below: the forest is trained without PCA\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: every value a reader might want to tune\n",
    "RANDOM_STATE = 42    # seeds both the train/test split and the forest\n",
    "TEST_SIZE = 0.2      # fraction of samples held out for evaluation\n",
    "N_ESTIMATORS = 100   # number of trees in the forest\n",
    "TOP_N_FEATURES = 20  # how many of the most important features to plot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Load the handwritten digits dataset\n",
    "digits = load_digits()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up the figure\n",
    "fig = plt.figure(figsize=(6, 6))  # figure size in inches\n",
    "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n",
    "\n",
    "# plot the first 64 digits on an 8x8 grid: each image is 8x8 pixels\n",
    "for i in range(64):\n",
    "    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])\n",
    "    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')\n",
    "\n",
    "    # label the image with the target value\n",
    "    ax.text(0, 7, str(digits.target[i]))\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the feature matrix and target vector for model training\n",
    "X, y = digits.data, digits.target  # X = images (flattened), y = digit labels (target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each image is represented as a NumPy array\n",
    "X[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 2: Split the data into training and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 3: Train Random Forest without PCA\n",
    "rf_model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)\n",
    "rf_model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 4: Predict and Evaluate\n",
    "y_pred = rf_model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\nClassification Report (Random Forest):\")\n",
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute the confusion matrix once, with rows/columns in the model's class order,\n",
    "# so the printed matrix and the plot below are guaranteed to agree\n",
    "cm = confusion_matrix(y_test, y_pred, labels=rf_model.classes_)\n",
    "\n",
    "# Print the model performance as a confusion matrix\n",
    "print(\"Confusion Matrix (Random Forest):\")\n",
    "print(cm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the confusion matrix\n",
    "print(\"\\nConfusion Matrix (Random Forest):\")\n",
    "\n",
    "# Tick labels come from the same class ordering that was used to build `cm`\n",
    "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_model.classes_)\n",
    "disp.plot(cmap=plt.cm.Blues)\n",
    "plt.title(\"Confusion Matrix\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explainability: feature importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get feature importances from the trained Random Forest model\n",
    "importances = rf_model.feature_importances_\n",
    "\n",
    "# Print feature importances\n",
    "print(\"Feature Importances:\")\n",
    "print(importances)\n",
    "\n",
    "# Sort feature indices by importance, in descending order\n",
    "indices = np.argsort(importances)[::-1]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the TOP_N_FEATURES most important features.\n",
    "# Bar positions are derived from the slice itself, so the plot stays consistent\n",
    "# even if TOP_N_FEATURES exceeds the number of available features.\n",
    "top_indices = indices[:TOP_N_FEATURES]\n",
    "positions = range(len(top_indices))\n",
    "\n",
    "plt.figure(figsize=(10, 5))\n",
    "plt.title(\"Feature Importance in Digits Classification (Random Forest)\")\n",
    "plt.bar(positions, importances[top_indices], align=\"center\")\n",
    "plt.xticks(positions, top_indices, rotation=90)\n",
    "plt.xlabel(\"Feature Index\")\n",
    "plt.ylabel(\"Importance Score\")\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}