Data preprocessing (standardization / feature scaling):
Before we implement PCA, we need to do some data preprocessing. In this assessment, some of the steps will be implemented by you and others we will take care of; however, when you work on real-world problems, you will need to do all of these steps yourself!
The preprocessing steps we will do are:

1. Convert the images to floating point and scale each pixel value to the range [0, 1] (dividing by 255).
2. Subtract the mean mu from each dimension of the data.
3. Divide each dimension by its standard deviation sigma, so that every feature has unit variance. Dimensions with sigma = 0 are handled by substituting sigma = 1, as shown in the sketch below.
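As a quick illustration of the standardization step (a toy sketch, separate from the graded normalize function below), note how a zero-variance dimension is handled by substituting a standard deviation of 1:

import numpy as np

A = np.array([[1., 10., 5.],
              [2., 10., 7.],
              [3., 10., 9.]])           # the middle column is constant
mu, std = A.mean(axis=0), A.std(axis=0)
std_safe = np.where(std == 0, 1., std)  # avoid 0/0 -> NaN for constant columns
A_bar = (A - mu) / std_safe
print(A_bar.std(axis=0))                # [1., 0., 1.]: the constant column stays 0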
Now we will implement PCA. Before we do that, let's pause for a moment and think about the steps for performing PCA. Assume that we are performing PCA on some dataset X for M principal components. We then need to perform the following steps, which we break into parts:

1. Data normalization (normalize).
2. Finding the eigenvalues and corresponding eigenvectors of the data covariance matrix S, sorted in descending order of the eigenvalues (eig).

After these steps, we can then compute the projection and reconstruction of the data onto the space spanned by the top M eigenvectors.

Recall that the principal basis is the eigenvector associated with the largest eigenvalue of the covariance matrix.
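For intuition, here is a minimal sketch (not part of the graded code) that recovers the principal basis of a toy 2-D dataset as the eigenvector of its covariance matrix with the largest eigenvalue:

import numpy as np

rng = np.random.RandomState(0)
# toy dataset: points stretched along the (1, 1) direction
toy = rng.randn(500, 2) @ np.array([[2.0, 1.5],
                                    [1.5, 2.0]])
toy = toy - toy.mean(axis=0)        # centre the data

S_toy = (toy.T @ toy) / len(toy)    # covariance matrix
vals, vecs = np.linalg.eigh(S_toy)  # eigh: real eigenvalues, ascending order
principal = vecs[:, -1]             # eigenvector of the largest eigenvalue
print(principal)                    # roughly +/-[0.707, 0.707]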
Code:
# PACKAGE: DO NOT EDIT THIS CELL
import numpy as np
import timeit

# PACKAGE: DO NOT EDIT THIS CELL
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from ipywidgets import interact
from load_data import load_mnist

MNIST = load_mnist()
images, labels = MNIST['data'], MNIST['target']

# GRADED FUNCTION: DO NOT EDIT THIS LINE
def normalize(X):
    """Normalize the given dataset X.

    Args:
        X: ndarray, dataset

    Returns:
        (Xbar, mean, std): tuple of ndarray. Xbar is the normalized dataset
        with mean 0 and standard deviation 1; mean and std are the mean and
        standard deviation respectively.

    Note:
        You will encounter dimensions where the standard deviation is zero;
        naively normalizing those would produce NaN. Handle this by using
        `std = 1` for those dimensions when normalizing.
    """
    mu = np.mean(X, axis=0, keepdims=True)
    std = np.std(X, axis=0, keepdims=True)
    std_filled = std.copy()
    std_filled[std == 0] = 1.  # avoid 0/0 -> NaN for constant dimensions
    Xbar = (X - mu) / std_filled
    return Xbar, mu, std

def eig(S):
    """Compute the eigenvalues and corresponding eigenvectors of the covariance matrix S.

    Args:
        S: ndarray, covariance matrix

    Returns:
        (eigvals, eigvecs): ndarray, the eigenvalues and eigenvectors,
        sorted in descending order of the eigenvalues.
    """
    eVals, eVecs = np.linalg.eig(S)
    order = np.argsort(eVals)[::-1]  # indices that sort the eigenvalues in descending order
    eVals = eVals[order]
    eVecs = eVecs[:, order]          # reorder the eigenvector columns to match
    return (eVals, eVecs)

def projection_matrix(B):
    """Compute the projection matrix onto the space spanned by `B`.

    Args:
        B: ndarray of dimension (D, M), the basis for the subspace

    Returns:
        P: the projection matrix
    """
    P = B @ np.linalg.inv(B.T @ B) @ B.T
    return P

def PCA(X, num_components):
    """
    Args:
        X: ndarray of size (N, D), where D is the dimension of the data,
           and N is the number of datapoints
        num_components: the number of principal components to use.

    Returns:
        X_reconstruct: ndarray of the reconstruction of X from the first
        `num_components` principal components.
    """
    # The solution takes advantage of the functions implemented above.
    N, D = X.shape
    X_normalized, mu, std = normalize(X)
    # covariance matrix of the zero-mean data
    S = (X_normalized.T @ X_normalized) / N
    eig_vals, eig_vecs = eig(S)
    eig_vals = eig_vals[:num_components]
    eig_vecs = eig_vecs[:, :num_components]
    # projection matrix P has dimension (D, D)
    P = projection_matrix(eig_vecs)
    X_projection = P @ X.T
    return X_projection.T

## Some preprocessing of the data
NUM_DATAPOINTS = 1000
X = (images.reshape(-1, 28 * 28)[:NUM_DATAPOINTS]) / 255.
Xbar, mu, std = normalize(X)

for num_component in range(1, 20):
    from sklearn.decomposition import PCA as SKPCA
    # We can compare against the standard solution given by scikit-learn's implementation of PCA
    pca = SKPCA(n_components=num_component, svd_solver='full')
    sklearn_reconst = pca.inverse_transform(pca.fit_transform(Xbar))
    reconst = PCA(Xbar, num_component)
    np.testing.assert_almost_equal(reconst, sklearn_reconst)
    print(np.square(reconst - sklearn_reconst).sum())
Result:
(8.5153870005e-24+0j)
(8.09790151532e-24+0j)
(9.61487939311e-24+0j)
(6.39164394758e-24+0j)
(1.19817697147e-23+0j)
(9.18939009489e-24+0j)
(2.46356799263e-23+0j)
(2.04450491509e-23+0j)
(2.35281327024e-23+0j)
(2.33297802189e-22+0j)
(9.45193136857e-23+0j)
(9.82734807213e-23+0j)
(1.596514124e-22+0j)
(7.20916435378e-23+0j)
(2.9098190907e-23+0j)
(3.7462168164e-23+0j)
(3.22053322424e-23+0j)
(2.71427239921e-23+0j)
(1.11240190546e-22+0j)
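The differences are at the level of numerical noise, confirming that our implementation agrees with scikit-learn's. The "+0j" parts appear because np.linalg.eig does not exploit the symmetry of S and returns complex arrays (with essentially zero imaginary parts here). As an optional aside, a minimal variant of eig() using NumPy's symmetric eigensolver returns real arrays directly; np.linalg.eigh sorts eigenvalues in ascending order, so we flip them:

def eig_sym(S):
    """Variant of eig() for symmetric matrices: real output, descending order."""
    eVals, eVecs = np.linalg.eigh(S)    # real eigenvalues, ascending order
    return eVals[::-1], eVecs[:, ::-1]  # flip to descending order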
Calculate the MSE for the dataset:
def mse(predict, actual):
    """Helper function for computing the mean squared error (MSE)"""
    return np.square(predict - actual).sum(axis=1).mean()

loss = []
reconstructions = []
# iterate over different numbers of principal components, and compute the MSE
for num_component in range(1, 100):
    reconst = PCA(Xbar, num_component)
    error = mse(reconst, Xbar)
    reconstructions.append(reconst)
    print('n = {:d}, reconstruction_error = {:f}'.format(num_component, error))
    loss.append((num_component, error))

reconstructions = np.asarray(reconstructions)
reconstructions = reconstructions * std + mu  # "unnormalize" the reconstructed images
loss = np.asarray(loss)

import pandas as pd
# create a table showing the number of principal components and the MSE
pd.DataFrame(loss).head()

fig, ax = plt.subplots()
ax.plot(loss[:, 0], loss[:, 1]);
ax.axhline(100, linestyle='--', color='r', linewidth=2)
ax.xaxis.set_ticks(np.arange(1, 100, 5));
ax.set(xlabel='num_components', ylabel='MSE', title='MSE vs number of principal components');

@interact(image_idx=(0, 1000))
def show_num_components_reconst(image_idx):
    fig, ax = plt.subplots(figsize=(20., 20.))
    actual = X[image_idx]
    # concatenate the actual and reconstructed images into one large image before plotting
    x = np.concatenate([actual[np.newaxis, :], reconstructions[:, image_idx]])
    ax.imshow(np.hstack(x.reshape(-1, 28, 28)[np.arange(10)]), cmap='gray');
    ax.axvline(28, color='orange', linewidth=2)

@interact(i=(0, 10))
def show_pca_digits(i=1):
    """Show the i-th digit and its reconstruction"""
    plt.figure(figsize=(4, 4))
    actual_sample = X[i].reshape(28, 28)
    # `reconst` still holds the reconstruction from the last loop iteration above
    reconst_sample = (reconst[i, :] * std + mu).reshape(28, 28)
    plt.imshow(np.hstack([actual_sample, reconst_sample]), cmap='gray')
    plt.show()
Sometimes the dimensionality of our dataset is larger than the number of samples we have. In that case it can be inefficient to perform PCA with the implementation above. Instead, as mentioned in the lectures, we can implement PCA in a more efficient manner, which we call "PCA for high-dimensional data" (PCA_high_dim).
Below are the steps for performing PCA on a high-dimensional dataset:

1. Compute the N-by-N matrix S' = (X X^T)/N instead of the D-by-D covariance matrix.
2. Find the eigenvalues and corresponding eigenvectors of S', sorted in descending order of the eigenvalues.
3. Map the top eigenvectors V back to the original space, U = X^T V, to obtain a basis of the principal subspace.
4. Project the data onto the subspace spanned by U.
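To see why this recovers the same subspace: suppose v is an eigenvector of the N-by-N matrix S' = (X X^T)/N with eigenvalue lambda. Left-multiplying S'v = lambda v by X^T gives

    (X^T X / N)(X^T v) = lambda (X^T v),

i.e. S (X^T v) = lambda (X^T v), so X^T v is an eigenvector of the original D-by-D covariance matrix S with the same eigenvalue. We therefore only need to eigendecompose an N-by-N matrix, which is much cheaper when N << D. (X^T v is not unit length in general, but projection_matrix is invariant to the scaling of its basis vectors thanks to the (B^T B)^{-1} factor, so the reconstruction is unaffected.)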
# GRADED FUNCTION: DO NOT EDIT THIS LINE
### PCA for high-dimensional datasets
def PCA_high_dim(X, n_components):
    """Compute PCA for small sample size but high-dimensional features.

    Args:
        X: ndarray of size (N, D), where D is the dimension of the sample,
           and N is the number of samples
        n_components: the number of principal components to use.

    Returns:
        X_reconstruct: (N, D) ndarray. The reconstruction of X from the
        first `n_components` principal components.
    """
    N, D = X.shape
    # the N x N matrix, cheap to eigendecompose when N << D
    S_prime = (X @ X.T) / N
    eig_vals_prime, eig_vecs_prime = eig(S_prime)
    eig_vals_prime = eig_vals_prime[:n_components]
    eig_vecs_prime = eig_vecs_prime[:, :n_components]
    # map back to the original space to get a basis U of the principal subspace
    U = X.T @ eig_vecs_prime  # (D, N) @ (N, n_components)
    P = projection_matrix(U)
    X_projection = P @ X.T
    return X_projection.T
Test case:
np.testing.assert_almost_equal(PCA(Xbar, 2), PCA_high_dim(Xbar, 2))
Time Complexity Analysis:
Now let's compare the running times of PCA and PCA_high_dim.
Tips for running benchmarks or computationally expensive code:
When you have a computation that takes a non-negligible amount of time, try separating the code that produces the output from the code that analyzes the results (e.g. plotting the results, computing statistics of the results). That way, you don't have to recompute the expensive part when you want to produce more analysis, as shown in the sketch below.
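A minimal sketch of this pattern (the function name and filename are just placeholders):

import numpy as np
import matplotlib.pyplot as plt

def expensive_f(n):
    """Stand-in for a slow computation."""
    A = np.random.rand(200, 200)
    return np.linalg.eigvals(A).real.max() * n

# Cell 1: run the expensive computation once and persist the output
results = np.asarray([expensive_f(n) for n in range(10)])
np.save('results.npy', results)  # placeholder filename

# Cell 2: analyze / plot the saved results without recomputing
results = np.load('results.npy')
plt.plot(results)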
The next cell includes a function that records the time taken to execute a function f by repeating it repeat times. You do not need to modify the function, but you can use it to compare the running time of any functions you are interested in.
def time(f, repeat=10):
    """Return the mean and standard deviation of the running time of `f` over `repeat` runs."""
    times = []
    for _ in range(repeat):
        start = timeit.default_timer()
        f()
        stop = timeit.default_timer()
        times.append(stop - start)
    return np.mean(times), np.std(times)
times_mm0 = []
times_mm1 = []
# iterate over datasets of different sizes
for datasetsize in np.arange(4, 784, step=20):
    XX = Xbar[:datasetsize]  # select the first `datasetsize` samples in the dataset
    # record the running time for computing X.T @ X
    mu, sigma = time(lambda: XX.T @ XX)
    times_mm0.append((datasetsize, mu, sigma))
    # record the running time for computing X @ X.T
    mu, sigma = time(lambda: XX @ XX.T)
    times_mm1.append((datasetsize, mu, sigma))

times_mm0 = np.asarray(times_mm0)
times_mm1 = np.asarray(times_mm1)

fig, ax = plt.subplots()
ax.set(xlabel='size of dataset', ylabel='running time')
bar = ax.errorbar(times_mm0[:, 0], times_mm0[:, 1], times_mm0[:, 2], label="$X^T X$ (PCA)", linewidth=2)
ax.errorbar(times_mm1[:, 0], times_mm1[:, 1], times_mm1[:, 2], label="$X X^T$ (PCA_high_dim)", linewidth=2)
ax.legend()
Benchmark for PCA and PCA_high_dim:
times0 = []
times1 = []
# iterate over datasets of different sizes
for datasetsize in np.arange(4, 784, step=100):
    XX = Xbar[:datasetsize]
    npc = 2
    mu, sigma = time(lambda: PCA(XX, npc), repeat=10)
    times0.append((datasetsize, mu, sigma))
    mu, sigma = time(lambda: PCA_high_dim(XX, npc), repeat=10)
    times1.append((datasetsize, mu, sigma))

times0 = np.asarray(times0)
times1 = np.asarray(times1)

fig, ax = plt.subplots()
ax.set(xlabel='number of datapoints', ylabel='run time')
ax.errorbar(times0[:, 0], times0[:, 1], times0[:, 2], label="PCA", linewidth=2)
ax.errorbar(times1[:, 0], times1[:, 1], times1[:, 2], label="PCA_high_dim", linewidth=2)
ax.legend();
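Since the dominant cost in both implementations is the eigendecomposition (roughly O(D^3) for the D-by-D covariance matrix in PCA, versus O(N^3) for the N-by-N matrix in PCA_high_dim), we would expect PCA_high_dim to be faster as long as the number of datapoints N is well below the dimensionality D = 784, with the gap closing as N grows.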