-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpca_visualization.py
More file actions
151 lines (131 loc) · 6.75 KB
/
Copy pathpca_visualization.py
File metadata and controls
151 lines (131 loc) · 6.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""
Project: Visualizing a High-Dimensional Dataset
Author: Your Name Here (for professional portfolio)
Objective: Use Principal Component Analysis (PCA) to reduce the dimensions of a
complex, high-dimensional dataset and visualize its clusters. This script
is designed to be a robust and customizable example for a data science portfolio.
Key Concepts:
- Dimensionality Reduction: A technique for reducing the number of features (variables)
in a dataset while retaining most of the important information.
- Principal Component Analysis (PCA): An unsupervised learning algorithm that
transforms a high-dimensional dataset into a new coordinate system where
the greatest variance by any projection of the data comes to lie on the first
coordinate (called the first principal component), the second greatest variance
on the second coordinate, and so on.
- Explained Variance: The proportion of the dataset's total variance that is
captured by each principal component. This is a crucial metric for evaluating
how much information is retained after dimensionality reduction.
- Scree Plot: A plot of the eigenvalues (or explained variance) vs. the number of
principal components, used to determine the optimal number of components to keep.
Dataset: We will use the MNIST handwritten digits dataset, which is a classic
high-dimensional dataset for this type of visualization. Each image is a 28x28
pixel grid, meaning each data point has 784 dimensions (features). We will
reduce this to a lower dimension (2D or 3D) to visualize the inherent structure.
Required Libraries:
- numpy
- scikit-learn
- matplotlib
- seaborn
- argparse (built-in)
To run this script from the command line, use:
python pca_visualization.py --n_components 2 --sample_size 10000
"""
# Step 1: Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import argparse
from mpl_toolkits.mplot3d import Axes3D
# Step 2: Set a professional plotting style
plt.style.use('seaborn-v0_8-whitegrid')
# Step 3: Define the main function to encapsulate the project logic
def main(n_components, sample_size):
"""
Main function to execute the PCA visualization project.
Args:
n_components (int): The number of dimensions to reduce the data to (2 or 3).
sample_size (int): The number of samples to use from the dataset for faster
computation.
"""
if n_components not in [2, 3]:
print("Error: The number of components must be either 2 or 3 for visualization.")
return
# Step 4: Load and subset the dataset
print(f"Loading MNIST dataset and subsampling to {sample_size} samples...")
try:
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X_subset, _, y_subset, _ = train_test_split(X, y, train_size=sample_size, random_state=42, stratify=y)
print(f"Loaded a subset of {sample_size} samples.")
except Exception as e:
print(f"Error loading dataset: {e}")
print("Please check your internet connection and ensure the dataset can be fetched.")
return
# Step 5: Data Preprocessing
# Scaling the data is crucial for PCA as it is sensitive to the scale of features.
print("Scaling data using StandardScaler...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_subset)
print("Data scaled successfully.")
# Step 6: Perform a full PCA analysis to evaluate explained variance
print("\nPerforming a full PCA to analyze explained variance...")
pca_full = PCA()
pca_full.fit(X_scaled)
# Step 7: Plot the Scree Plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca_full.explained_variance_ratio_) + 1),
pca_full.explained_variance_ratio_.cumsum(),
marker='o', linestyle='-')
plt.title('Cumulative Explained Variance by Principal Component')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.axvline(x=n_components, color='r', linestyle='--', label=f'Target: {n_components} Components')
plt.legend()
plt.savefig('Scree_plot.png')
plt.show()
print(f"The first {n_components} components explain "
f"{pca_full.explained_variance_ratio_[:n_components].sum()*100:.2f}% of the total variance.")
# Step 8: Apply PCA to reduce dimensions for visualization
print(f"\nApplying PCA to reduce dimensions to {n_components} for visualization...")
pca_final = PCA(n_components=n_components)
X_pca = pca_final.fit_transform(X_scaled)
print(f"Data reduced from {X_scaled.shape[1]} to {n_components} dimensions.")
# Step 9: Visualize the PCA-transformed data
print("Generating the visualization...")
if n_components == 2:
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_subset.astype(int),
palette=sns.color_palette("viridis", 10), legend='full',
s=50, alpha=0.7)
plt.title('2D PCA of the MNIST Dataset')
plt.xlabel(f'Principal Component 1 ({pca_final.explained_variance_ratio_[0]*100:.2f}% Variance)')
plt.ylabel(f'Principal Component 2 ({pca_final.explained_variance_ratio_[1]*100:.2f}% Variance)')
plt.legend(title='Digit')
plt.savefig('Scatter_plot.png')
plt.show()
elif n_components == 3:
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2],
c=y_subset.astype(int), cmap='viridis', s=50, alpha=0.7)
ax.set_title('3D PCA of the MNIST Dataset')
ax.set_xlabel(f'PC 1 ({pca_final.explained_variance_ratio_[0]*100:.2f}%)')
ax.set_ylabel(f'PC 2 ({pca_final.explained_variance_ratio_[1]*100:.2f}%)')
ax.set_zlabel(f'PC 3 ({pca_final.explained_variance_ratio_[2]*100:.2f}%)')
plt.colorbar(scatter, label='Digit Label')
plt.savefig('variance.png')
plt.show()
print("\nProject complete. The plots have been displayed.")
# Step 10: Parse command line arguments and run the main function
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Visualize a high-dimensional dataset using PCA.")
parser.add_argument('--n_components', type=int, default=2,
help="Number of dimensions to reduce to (2 for 2D plot, 3 for 3D plot).")
parser.add_argument('--sample_size', type=int, default=10000,
help="Number of samples to use for visualization.")
args = parser.parse_args()
main(args.n_components, args.sample_size)