Dimensionality Reduction: PCA, t-SNE, and UMAP

Visualizing and compressing high-dimensional geographic data

2026-03-01

In 1991, geologists flying the Airborne Visible/Infrared Imaging Spectrometer (AVIRIS) over the Cuprite mining district of Nevada were able to distinguish 22 separate mineral assemblages in a single overflight pass — a mapping task that had previously required weeks of field crews on foot. The capability came from hyperspectral imaging: instead of capturing three colour channels as a camera does, AVIRIS recorded 224 spectral bands simultaneously, each 10 nm wide. Different minerals — alunite, kaolinite, calcite, muscovite — absorb infrared energy at subtly different wavelengths, and those absorption features became their spectral fingerprints.

The catch is that 224-dimensional data is not interpretable by a human eye trained on three dimensions. You cannot plot a 224-axis scatterplot. You cannot visualise which pixels are similar and which are different without first compressing the data into a form the human visual system can process. And yet most of the variance in that 224-band cube is explained by a much smaller number of underlying factors — surface mineralogy, vegetation fraction, soil moisture — so compression is possible without catastrophic loss.

Dimensionality reduction is the family of techniques that performs this compression. Principal Component Analysis (PCA) finds the axes of maximum variance via eigendecomposition of the covariance matrix and projects the data onto them; it is linear, interpretable, and computationally cheap. t-SNE and UMAP are nonlinear methods that preserve local neighbourhood structure — they pull similar pixels together even when the similarity lives in a curved manifold that PCA cannot capture. Each method reveals different structure in the same data, and choosing between them requires understanding what each one optimises and what it sacrifices.

1. The Question

A hyperspectral sensor captures 224 spectral bands from 400 nm to 2500 nm. How do we visualize this data or identify distinct land cover types?

Direct visualization is impossible:

- Humans perceive 3 color channels (RGB).
- Hyperspectral data has 224 dimensions.
- Standard scatter plots can show only 2-3 dimensions at once.

Dimensionality reduction provides the solution:

- Compress 224 bands → 3 principal components.
- Visualize them in RGB color space.
- Retain 95%+ of the variance.

Applications:

- Hyperspectral image visualization (224 bands → RGB)
- Feature compression for classification (reduce computation)
- Anomaly detection (outliers visible in reduced space)
- Clustering similar pixels (spectral signatures group together)
- Data exploration (discover patterns visually)
- Noise reduction (minor components = noise)

Why it works:

Many features are highly correlated (redundant).

Example - Landsat 8:

- Blue and Green bands: r = 0.92 (both measure visible light)
- NIR and Red bands: r = -0.78 (vegetation signature)
- SWIR1 and SWIR2: r = 0.95 (both measure soil/rock)

Principal Component Analysis (PCA) removes this redundancy by finding uncorrelated components that capture the maximum variance.


2. The Conceptual Model

Principal Component Analysis (PCA)

Core idea: Find directions of maximum variance.

First principal component (PC1):

- Direction with the largest variance
- Linear combination of the original features
- Captures the most information

Second principal component (PC2):

- Orthogonal to PC1 (perpendicular)
- Next largest variance
- Uncorrelated with PC1

Continue until all variance explained.

Mathematical framework:

Given data matrix \mathbf{X} (n samples × p features).

Find orthogonal directions \mathbf{w}_1, \mathbf{w}_2, ..., \mathbf{w}_p such that:

\text{Var}(\mathbf{X}\mathbf{w}_1) \geq \text{Var}(\mathbf{X}\mathbf{w}_2) \geq ... \geq \text{Var}(\mathbf{X}\mathbf{w}_p)

Solution via eigendecomposition:

Covariance matrix:

\mathbf{C} = \frac{1}{n-1}\mathbf{X}^T\mathbf{X}

(Assuming \mathbf{X} centered: each column has mean 0)

Eigenvalue problem:

\mathbf{C}\mathbf{w}_i = \lambda_i \mathbf{w}_i

Principal components = eigenvectors of \mathbf{C}

Variance explained = eigenvalues \lambda_i

Variance Explained

Total variance:

\text{Total} = \sum_{i=1}^p \lambda_i = \text{trace}(\mathbf{C})

Proportion explained by PC i:

\text{Proportion}_i = \frac{\lambda_i}{\sum_{j=1}^p \lambda_j}

Cumulative variance:

\text{Cumulative}_k = \frac{\sum_{i=1}^k \lambda_i}{\sum_{j=1}^p \lambda_j}

Example:

PC1: \lambda_1 = 45 → 45% variance
PC2: \lambda_2 = 30 → 30% variance
PC3: \lambda_3 = 15 → 15% variance
Remaining: 10%

First 3 PCs: 90% variance retained.
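These ratios are a one-liner once the eigenvalues are known. A minimal NumPy sketch using the toy eigenvalues above (45, 30, 15, plus 10 for the remainder):

import numpy as np

eigenvalues = np.array([45.0, 30.0, 15.0, 10.0])   # toy values from the example

proportion = eigenvalues / eigenvalues.sum()        # per-component share
cumulative = np.cumsum(proportion)                  # running total

print(proportion)   # [0.45 0.30 0.15 0.10]
print(cumulative)   # [0.45 0.75 0.90 1.00]  -> first 3 PCs retain 90%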

t-SNE (t-Distributed Stochastic Neighbor Embedding)

Nonlinear dimensionality reduction.

PCA: Linear (straight lines, planes)
t-SNE: Nonlinear (can unfold curved manifolds)

Core idea:

Preserve local neighborhoods in high dimensions when mapping to low dimensions.

Algorithm:

  1. Compute pairwise similarities in high-D space (Gaussian)
  2. Initialize random low-D embedding
  3. Optimize embedding to match high-D similarities
  4. Use t-distribution in low-D (long tails prevent crowding)

Hyperparameters:

- Perplexity (effective number of neighbors, typical: 5-50)
- Learning rate
- Number of iterations (typical: 1000-5000)

Advantages:

- Reveals clusters (nonlinear structure)
- Beautiful visualizations

Disadvantages:

- Computationally expensive: exact t-SNE is O(n^2); the Barnes-Hut approximation reaches O(n \log n) but remains slow for large n
- Stochastic (different runs give different results)
- Distances between clusters are not meaningful (only local neighborhood structure is)

UMAP (Uniform Manifold Approximation and Projection)

Modern alternative to t-SNE.

Advantages over t-SNE:

- Faster: O(n \log n) with approximate nearest neighbors
- Preserves global structure better
- Reproducible when a random seed is fixed
- Inter-point distances are somewhat more faithful, though between-cluster distances still should not be over-interpreted

Similar usage:

Clustering, visualization, preprocessing for classification.

Typical parameters:

- n_neighbors (local structure, typical: 15)
- min_dist (how tight clusters are, typical: 0.1)
- n_components (usually 2 or 3)
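A minimal sketch of how these knobs map onto the scikit-learn and umap-learn constructors (the array X below is a random stand-in for real spectra):

import numpy as np
from sklearn.manifold import TSNE
from umap import UMAP

# Stand-in data: 500 samples, 20 features
X = np.random.rand(500, 20)

# t-SNE: perplexity controls the effective neighborhood size
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X)

# UMAP: n_neighbors plays a similar role; min_dist controls cluster tightness
reducer = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
X_umap = reducer.fit_transform(X)

print(X_tsne.shape, X_umap.shape)   # (500, 2) (500, 2)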


3. Building the Mathematical Model

PCA Derivation

Objective: Find direction \mathbf{w} maximizing variance of projections.

Setup:

Data matrix \mathbf{X} (n \times p), centered (column means = 0).

Project data onto unit vector \mathbf{w}: \mathbf{z} = \mathbf{X}\mathbf{w}

Variance of projection:

\text{Var}(\mathbf{z}) = \frac{1}{n-1}\mathbf{z}^T\mathbf{z} = \frac{1}{n-1}(\mathbf{X}\mathbf{w})^T(\mathbf{X}\mathbf{w})

= \frac{1}{n-1}\mathbf{w}^T\mathbf{X}^T\mathbf{X}\mathbf{w} = \mathbf{w}^T\mathbf{C}\mathbf{w}

Where \mathbf{C} = \frac{1}{n-1}\mathbf{X}^T\mathbf{X} is covariance matrix.

Optimization problem:

\max_{\mathbf{w}} \mathbf{w}^T\mathbf{C}\mathbf{w} \quad \text{subject to} \quad \|\mathbf{w}\| = 1

Lagrangian:

L(\mathbf{w}, \lambda) = \mathbf{w}^T\mathbf{C}\mathbf{w} - \lambda(\mathbf{w}^T\mathbf{w} - 1)

Derivative with respect to \mathbf{w}:

\frac{\partial L}{\partial \mathbf{w}} = 2\mathbf{C}\mathbf{w} - 2\lambda\mathbf{w} = 0

\mathbf{C}\mathbf{w} = \lambda\mathbf{w}

Eigenvalue equation!

\mathbf{w} is eigenvector of \mathbf{C}.

\lambda is eigenvalue (variance explained).

Solution:

Sort eigenvalues: \lambda_1 \geq \lambda_2 \geq ... \geq \lambda_p

Corresponding eigenvectors: \mathbf{w}_1, \mathbf{w}_2, ..., \mathbf{w}_p

Projection onto first k components:

\mathbf{Z} = \mathbf{X}\mathbf{W}_k

Where \mathbf{W}_k = [\mathbf{w}_1 | \mathbf{w}_2 | ... | \mathbf{w}_k]

Reconstruction

Transform: \mathbf{Z} = \mathbf{X}\mathbf{W}_k (high-D → low-D)

Reconstruct: \mathbf{X}_{approx} = \mathbf{Z}\mathbf{W}_k^T (low-D → high-D)

Reconstruction error:

E = \frac{1}{n-1}\|\mathbf{X} - \mathbf{X}_{approx}\|_F^2 = \sum_{i=k+1}^p \lambda_i

The (scaled) squared reconstruction error is the sum of the discarded eigenvalues.
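A minimal NumPy check of the transform/reconstruct round trip and of this error identity, on small random data (shapes and names are illustrative):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 6))
X = X - X.mean(axis=0)                      # center columns

C = (X.T @ X) / (X.shape[0] - 1)            # covariance matrix
eigvals, eigvecs = np.linalg.eigh(C)        # eigh: C is symmetric
order = np.argsort(eigvals)[::-1]           # sort descending
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

k = 3
W_k = eigvecs[:, :k]                        # first k principal directions
Z = X @ W_k                                 # project (high-D -> low-D)
X_approx = Z @ W_k.T                        # reconstruct (low-D -> high-D)

# Scaled squared reconstruction error equals the sum of discarded eigenvalues
err = np.sum((X - X_approx) ** 2) / (X.shape[0] - 1)
print(err, eigvals[k:].sum())               # the two numbers should match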


4. Worked Example by Hand

Problem: Apply PCA to hyperspectral data (simplified).

Data: 4 pixels, 3 bands (Blue, Green, NIR)

Pixel Blue Green NIR
1 0.2 0.3 0.6
2 0.4 0.5 0.8
3 0.1 0.2 0.4
4 0.3 0.4 0.7

Goal: Reduce to 2 principal components by hand.

Step 1: Center the data

Original matrix:

\mathbf{X} = \begin{bmatrix} 0.2 & 0.3 & 0.6 \\ 0.4 & 0.5 & 0.8 \\ 0.1 & 0.2 & 0.4 \\ 0.3 & 0.4 & 0.7 \end{bmatrix}

Column means:

\bar{x}_{Blue} = \frac{0.2+0.4+0.1+0.3}{4} = 0.25

\bar{x}_{Green} = \frac{0.3+0.5+0.2+0.4}{4} = 0.35

\bar{x}_{NIR} = \frac{0.6+0.8+0.4+0.7}{4} = 0.625

Centered data:

\mathbf{X}_{centered} = \begin{bmatrix} -0.05 & -0.05 & -0.025 \\ 0.15 & 0.15 & 0.175 \\ -0.15 & -0.15 & -0.225 \\ 0.05 & 0.05 & 0.075 \end{bmatrix}

Step 2: Compute covariance matrix

\mathbf{C} = \frac{1}{n-1}\mathbf{X}_{centered}^T\mathbf{X}_{centered}

\mathbf{X}_{centered}^T\mathbf{X}_{centered} = \begin{bmatrix} 0.05 & 0.05 & 0.065 \\ 0.05 & 0.05 & 0.065 \\ 0.065 & 0.065 & 0.0875 \end{bmatrix}

\mathbf{C} = \frac{1}{3}\begin{bmatrix} 0.05 & 0.05 & 0.065 \\ 0.05 & 0.05 & 0.065 \\ 0.065 & 0.065 & 0.0875 \end{bmatrix}

= \begin{bmatrix} 0.0167 & 0.0167 & 0.0217 \\ 0.0167 & 0.0167 & 0.0217 \\ 0.0217 & 0.0217 & 0.0292 \end{bmatrix}

Step 3: Find eigenvalues

Characteristic equation: \det(\mathbf{C} - \lambda\mathbf{I}) = 0

For this matrix (highly correlated bands):

\lambda_1 \approx 0.0620, \quad \lambda_2 \approx 0.0005, \quad \lambda_3 = 0

(\lambda_3 is exactly zero because the centered Blue and Green columns are identical. In practice, use numpy.linalg.eigh - solving the characteristic polynomial by hand is tedious even for 3×3!)

Step 4: Find eigenvectors

For \lambda_1 \approx 0.0620:

\mathbf{w}_1 \approx \begin{bmatrix} 0.52 \\ 0.52 \\ 0.68 \end{bmatrix}

All bands contribute with the same sign and similar weight (they are highly correlated); NIR carries slightly more of the variance.

For \lambda_2 and \lambda_3 (at or near zero):

These directions carry almost no variance in this toy example.

Step 5: Project data

First principal component:

PC1 = \mathbf{X}_{centered} \times \mathbf{w}_1

= \begin{bmatrix} -0.05 & -0.05 & -0.025 \\ 0.15 & 0.15 & 0.175 \\ -0.15 & -0.15 & -0.225 \\ 0.05 & 0.05 & 0.075 \end{bmatrix} \begin{bmatrix} 0.52 \\ 0.52 \\ 0.68 \end{bmatrix}

\approx \begin{bmatrix} -0.069 \\ 0.274 \\ -0.309 \\ 0.103 \end{bmatrix}

Interpretation:

PC1 ≈ “overall brightness” (a weighted average of all bands)

Pixel 2: Brightest (+0.274)
Pixel 3: Darkest (-0.309)

Step 6: Variance explained

Total variance:

\text{Total} = \lambda_1 + \lambda_2 + \lambda_3 = \text{trace}(\mathbf{C}) \approx 0.0625

PC1 explains:

\frac{0.0620}{0.0625} \approx 99\%

(Because the three bands are almost perfectly correlated in this toy example.)
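A quick NumPy check of the hand computation (same 4×3 toy matrix; the signs of eigenvectors and scores may flip, which is harmless):

import numpy as np

X = np.array([[0.2, 0.3, 0.6],
              [0.4, 0.5, 0.8],
              [0.1, 0.2, 0.4],
              [0.3, 0.4, 0.7]])

Xc = X - X.mean(axis=0)                 # center columns
C = (Xc.T @ Xc) / (X.shape[0] - 1)      # 3x3 covariance matrix

eigvals, eigvecs = np.linalg.eigh(C)    # ascending order for symmetric C
order = np.argsort(eigvals)[::-1]
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

print(eigvals)             # approx [0.0620, 0.0005, 0.0000]
print(eigvecs[:, 0])       # approx [0.52, 0.52, 0.68] (up to sign)
print(Xc @ eigvecs[:, 0])  # PC1 scores, approx [-0.069, 0.274, -0.309, 0.103]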


5. Computational Implementation

Tier 1: Foundation Path (Pi/Laptop)

PCA from scratch (Python/NumPy):

import numpy as np
import matplotlib.pyplot as plt

class PCA:
    def __init__(self, n_components=2):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.explained_variance = None
        self.total_variance = None
    
    def fit(self, X):
        """Fit PCA model"""
        # Center data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean
        
        # Compute covariance matrix
        cov_matrix = np.cov(X_centered.T)
        
        # Eigendecomposition (eigh: the covariance matrix is symmetric)
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
        
        # Sort by eigenvalue (descending)
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
        
        # Store principal components, kept eigenvalues, and total variance
        self.components = eigenvectors[:, :self.n_components]
        self.explained_variance = eigenvalues[:self.n_components]
        self.total_variance = np.sum(eigenvalues)
        
        return self
    
    def transform(self, X):
        """Project data onto principal components"""
        X_centered = X - self.mean
        return X_centered @ self.components
    
    def fit_transform(self, X):
        """Fit and transform in one step"""
        self.fit(X)
        return self.transform(X)
    
    def inverse_transform(self, Z):
        """Reconstruct from reduced dimensions"""
        return Z @ self.components.T + self.mean
    
    def explained_variance_ratio(self):
        """Proportion of total variance explained by each kept component"""
        return self.explained_variance / self.total_variance


# Example: Hyperspectral image
if __name__ == "__main__":
    # Generate synthetic hyperspectral data
    # 1000 pixels, 50 bands
    np.random.seed(42)
    
    # Create correlated bands (simulating real hyperspectral)
    n_pixels = 1000
    n_bands = 50
    
    # Base signals
    signal1 = np.random.randn(n_pixels)  # Vegetation
    signal2 = np.random.randn(n_pixels)  # Soil
    signal3 = np.random.randn(n_pixels)  # Water
    
    # Mix signals across bands with noise
    X = np.zeros((n_pixels, n_bands))
    for i in range(n_bands):
        # Wavelength-dependent mixing
        w1 = np.exp(-(i-20)**2/100)  # Peak at band 20
        w2 = np.exp(-(i-35)**2/100)  # Peak at band 35
        w3 = 1 - w1 - w2
        
        X[:, i] = (w1 * signal1 + w2 * signal2 + w3 * signal3 + 
                   0.1 * np.random.randn(n_pixels))
    
    # Apply PCA
    pca = PCA(n_components=3)
    Z = pca.fit_transform(X)
    
    # Analyze results
    var_ratio = pca.explained_variance_ratio()
    print(f"Variance explained by PC1: {var_ratio[0]:.1%}")
    print(f"Variance explained by PC2: {var_ratio[1]:.1%}")
    print(f"Variance explained by PC3: {var_ratio[2]:.1%}")
    print(f"Total (3 PCs): {np.sum(var_ratio):.1%}")
    
    # Visualize
    fig = plt.figure(figsize=(12, 4))
    
    # Scree plot
    ax1 = fig.add_subplot(131)
    ax1.bar(range(1, len(var_ratio)+1), var_ratio)
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Variance Explained')
    ax1.set_title('Scree Plot')
    
    # 2D projection
    ax2 = fig.add_subplot(132)
    scatter = ax2.scatter(Z[:, 0], Z[:, 1], c=signal1, 
                         cmap='viridis', s=1, alpha=0.5)
    ax2.set_xlabel('PC1')
    ax2.set_ylabel('PC2')
    ax2.set_title('PCA Projection')
    plt.colorbar(scatter, ax=ax2, label='Vegetation signal')
    
    # Reconstruction error
    X_reconstructed = pca.inverse_transform(Z)
    reconstruction_error = np.mean((X - X_reconstructed)**2)
    print(f"Reconstruction error (3 PCs): {reconstruction_error:.6f}")
    
    plt.tight_layout()
    plt.savefig('pca_analysis.webp', dpi=150)
    print("Saved pca_analysis.webp")

Runtime: <1 second for 1000 pixels × 50 bands on laptop

Memory: ~400 KB for this dataset


Tier 2: Professional Path (Go Further)

Scikit-learn with t-SNE and UMAP:

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import numpy as np
import matplotlib.pyplot as plt

# Load a larger hyperspectral image (e.g., 100k pixels, 224 bands).
# X_large shape: (100000, 224); a random stand-in is used here so the
# script runs end to end -- replace with real data.
X_large = np.random.rand(100_000, 224)

# PCA for visualization
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_large)

print(f"PCA variance explained: {pca.explained_variance_ratio_[:3]}")
print(f"Cumulative: {np.sum(pca.explained_variance_ratio_[:3]):.1%}")

# t-SNE (subsample for speed)
n_sample = 5000
idx = np.random.choice(len(X_large), n_sample, replace=False)
X_sample = X_large[idx]

tsne = TSNE(n_components=2, perplexity=30, random_state=42, 
            n_jobs=-1)  # Use all cores
X_tsne = tsne.fit_transform(X_sample)

# UMAP (faster, scales better)
umap = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, 
            random_state=42)
X_umap = umap.fit_transform(X_sample)

# Visualize all three
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# PCA
axes[0].scatter(X_pca[idx, 0], X_pca[idx, 1], s=1, alpha=0.5)
axes[0].set_title('PCA')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')

# t-SNE
axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], s=1, alpha=0.5)
axes[1].set_title('t-SNE')
axes[1].set_xlabel('Component 1')
axes[1].set_ylabel('Component 2')

# UMAP
axes[2].scatter(X_umap[:, 0], X_umap[:, 1], s=1, alpha=0.5)
axes[2].set_title('UMAP')
axes[2].set_xlabel('Component 1')
axes[2].set_ylabel('Component 2')

plt.tight_layout()
plt.savefig('dimensionality_reduction_comparison.webp', dpi=150)

Runtime:

- PCA: 5-10 seconds (100k × 224)
- t-SNE: 30-60 seconds (5k subsample)
- UMAP: 10-20 seconds (5k subsample)

GPU acceleration: RAPIDS cuML for massive datasets


6. Interpretation

Real Application: AVIRIS Hyperspectral

AVIRIS (Airborne Visible/Infrared Imaging Spectrometer):

224 contiguous spectral bands (400-2500 nm)

Spatial resolution: 4-20m depending on altitude

Data volume:

Single flight line: 512 pixels × 10,000 lines × 224 bands = 1.15 GB

PCA compression:

First 10 PCs: Retain 99%+ variance

Reduces 224 bands → 10 features

Interpretation of components:

PC1 (50-60% variance): Overall albedo (brightness)
- High: Bright surfaces (snow, concrete, sand)
- Low: Dark surfaces (water, shadows, asphalt)

PC2 (15-25% variance): Vegetation vs soil
- High: Vegetation (high NIR, low red)
- Low: Soil (low NIR, moderate red)

PC3 (5-10% variance): Soil moisture / mineralogy
- High: Dry surfaces (high SWIR)
- Low: Wet surfaces (low SWIR absorption)

PC4-10 (1-5% each): Subtle spectral features
- Specific absorption bands
- Atmospheric effects
- Sensor noise

Applications:

Mineral mapping: PC3-PC5 reveal absorption features

Vegetation stress: Subtle shifts in PC2-PC3

Change detection: Temporal PCA differences

Feature Engineering for Classification

Curse of dimensionality:

With 224 features, need huge training set.

Rule of thumb: 10 samples per feature.

224 features → Need 2,240 samples per class!

Solution: PCA preprocessing

Reduce 224 → 20 PCs (99% variance)

Now need only 200 samples per class.

Classifier trains faster, generalizes better.

Workflow:

  1. Fit PCA on unlabeled data
  2. Transform training data
  3. Train classifier on PCs
  4. Transform test data
  5. Predict

Performance:

Full 224 bands: 87% accuracy, 45 sec training
20 PCs: 86% accuracy, 3 sec training

Minimal accuracy loss, huge speed gain.
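A minimal scikit-learn sketch of the workflow above (the hyperspectral array X and labels y are placeholders, replaced here by synthetic data, so the printed accuracy will hover near chance; the classifier choice is illustrative):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Synthetic stand-in: 2000 pixels, 224 bands, 4 classes
rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 224))
y = rng.integers(0, 4, size=2000)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# PCA (20 components) feeding a classifier; the pipeline fits PCA on the
# training split and reuses the same projection on the test split
model = make_pipeline(PCA(n_components=20),
                      RandomForestClassifier(random_state=42))
model.fit(X_train, y_train)
print("Test accuracy:", model.score(X_test, y_test))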


7. What Could Go Wrong?

Interpreting PCA Components

Problem:

PCs are linear combinations of all features.

Difficult to interpret physically.

Example:

PC1 = 0.42×Blue + 0.45×Green + 0.48×Red + 0.35×NIR + …

What does this mean physically?

Solution:

Inspect the loading vector itself: the sign and size of each band's coefficient show what the component responds to, and high-loading bands can be matched against known spectral signatures (e.g., the NIR/red contrast of vegetation). Factor rotations such as varimax can also make loadings sparser and easier to read.

Outliers Dominate

Problem:

PCA sensitive to outliers (uses variance).

Single extreme point can skew PC1.

Example:

999 forest pixels: NIR = 0.4-0.6
1 cloud pixel: NIR = 0.95

PC1 dominated by cloud direction.

Solution:

Mask obvious outliers (clouds, sensor glitches) before fitting, or use a robust variant (PCA on a trimmed sample or on a robust covariance estimate).

t-SNE Pitfalls

Problem 1: Perplexity matters

Perplexity = 5: Too local (fragmented clusters)
Perplexity = 50: Too global (merged clusters)

Must experiment.

Problem 2: Different runs, different results

Stochastic optimization → different embeddings.

Set random seed for reproducibility.

Problem 3: Distances meaningless

Cluster A and B appear close in t-SNE.

Does NOT mean actually similar!

Only within-cluster structure reliable.

Problem 4: Computational cost

Exact t-SNE is O(n^2); the Barnes-Hut approximation reduces this to O(n \log n) but remains slow for very large n.

1M pixels: impractical on a CPU without subsampling.

Solution:

Subsample (as in the Tier 2 example), reduce to a few tens of PCs with PCA before running t-SNE, use a Barnes-Hut or GPU implementation, or switch to UMAP.

Choosing Number of Components

Problem:

How many PCs to retain? No universal rule.

Options:

  1. Scree plot elbow: Where eigenvalues flatten
  2. Variance threshold: e.g., 95% cumulative
  3. Cross-validation: Best downstream performance
  4. Kaiser rule: Keep PCs with \lambda > 1 (meaningful for PCA on the correlation matrix, where the average eigenvalue is 1)

Example:

PC1-3: 85% variance (elbow visible)
PC1-10: 95% variance (threshold met)
PC1-5: Best classification accuracy (CV result)

Different criteria, different answers. Use domain knowledge.
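A minimal sketch of the variance-threshold criterion with scikit-learn (X is a random stand-in data matrix):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(1000, 50)            # stand-in data

pca = PCA().fit(X)                       # keep all components
cumulative = np.cumsum(pca.explained_variance_ratio_)

# Smallest k whose cumulative explained variance reaches 95%
k = int(np.searchsorted(cumulative, 0.95) + 1)
print(k, cumulative[k - 1])

# Equivalent shortcut: PCA(n_components=0.95) selects k automatically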


8. Extension: Kernel PCA

Linear PCA limitation:

Only finds linear patterns.

Example:

Data in concentric circles (nonlinear).

PCA fails to separate.

Kernel PCA solution:

Map data to high-dimensional space (kernel trick).

Apply PCA in that space.

Project back to 2D.

Common kernels:

RBF (Radial Basis Function):

k(\mathbf{x}_i, \mathbf{x}_j) = \exp\left(-\frac{\|\mathbf{x}_i - \mathbf{x}_j\|^2}{2\sigma^2}\right)

Polynomial:

k(\mathbf{x}_i, \mathbf{x}_j) = (\mathbf{x}_i^T\mathbf{x}_j + c)^d

Trade-off:

More expressive (captures nonlinearity)
More expensive (kernel matrix n \times n)
Harder to interpret
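A minimal scikit-learn sketch of kernel PCA on the concentric-circles example mentioned above (the gamma value is illustrative):

import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA

# Two concentric circles: not linearly separable in the original 2-D space
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=42)

linear = PCA(n_components=2).fit_transform(X)
kernel = KernelPCA(n_components=2, kernel="rbf", gamma=10).fit_transform(X)

# The two rings overlap completely along the linear PCs but separate
# along the first kernel PC (class means differ only in the kernel case)
for name, Z in [("linear", linear), ("kernel", kernel)]:
    print(name, Z[y == 0, 0].mean(), Z[y == 1, 0].mean())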


9. Math Refresher: Eigenvalues and Eigenvectors

Definition

For square matrix \mathbf{A}:

\mathbf{A}\mathbf{v} = \lambda\mathbf{v}

\mathbf{v} = eigenvector (direction)
\lambda = eigenvalue (scaling factor)

Interpretation:

Matrix multiplication just scales \mathbf{v}, doesn’t change direction.

Finding Eigenvalues

Characteristic equation:

\det(\mathbf{A} - \lambda\mathbf{I}) = 0

Example (2×2):

\mathbf{A} = \begin{bmatrix} 4 & 1 \\ 1 & 3 \end{bmatrix}

\det\begin{bmatrix} 4-\lambda & 1 \\ 1 & 3-\lambda \end{bmatrix} = (4-\lambda)(3-\lambda) - 1 = 0

\lambda^2 - 7\lambda + 11 = 0

\lambda_1 = \frac{7 + \sqrt{5}}{2} \approx 4.62, \quad \lambda_2 = \frac{7 - \sqrt{5}}{2} \approx 2.38

Properties

Symmetric matrices (like covariance):

- Real eigenvalues
- Orthogonal eigenvectors
- \mathbf{v}_i^T\mathbf{v}_j = 0 for i \neq j

Trace: \text{trace}(\mathbf{A}) = \sum \lambda_i

Determinant: \det(\mathbf{A}) = \prod \lambda_i
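A quick NumPy verification of the 2×2 example and of the trace/determinant properties:

import numpy as np

A = np.array([[4.0, 1.0],
              [1.0, 3.0]])

eigvals, eigvecs = np.linalg.eigh(A)     # symmetric matrix: use eigh
print(eigvals)                           # approx [2.38, 4.62]
print(np.trace(A), eigvals.sum())        # 7.0, 7.0
print(np.linalg.det(A), eigvals.prod())  # approx 11.0, 11.0

# Eigenvectors of a symmetric matrix are orthogonal
print(eigvecs[:, 0] @ eigvecs[:, 1])     # approx 0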


Summary