# pca.py
'''
Source: Natural Language Processing with Classification and Vector Spaces, Week 3, Coursera Assignment
Description: Principal Component Analysis
'''
import numpy as np
def compute_pca(X, n_components=2):
    """
    Project the rows of X onto their leading principal components.

    Input:
        X: of dimension (m,n) where each row corresponds to a word vector
        n_components: Number of components you want to keep. Values larger
            than n simply keep all n components.
    Output:
        X_reduced: of dimension (m, n_components); the mean-centered data
            expressed in the basis of the covariance eigenvectors, ordered
            from largest to smallest eigenvalue. The sign of each column is
            whatever the eigen-solver returned and is not normalized.
    Raises:
        ValueError: if n_components is negative.
    """
    # A negative count would silently slice "all but the last k" eigenvectors,
    # which is never what a caller means; None is left alone (keeps everything).
    if n_components is not None and n_components < 0:
        raise ValueError(
            "n_components must be non-negative, got {!r}".format(n_components)
        )

    # mean center the data (column-wise)
    X_demeaned = X - np.mean(X, axis=0)

    # calculate the covariance matrix; np.cov squeezes its result, so a
    # single-column X yields a 0-d array that eigh would reject
    covariance_matrix = np.atleast_2d(np.cov(X_demeaned, rowvar=False))

    # calculate eigenvectors & eigenvalues of the (symmetric) covariance matrix
    eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix, UPLO='L')

    # indices of the eigenvalues from highest to lowest
    idx_sorted_decreasing = np.argsort(eigen_vals)[::-1]

    # reorder the eigenvector columns and keep the first n_components of them
    eigen_vecs_subset = eigen_vecs[:, idx_sorted_decreasing][:, :n_components]

    # (m,n) . (n,k) -> (m,k): coordinates of every row in the reduced basis
    X_reduced = np.dot(X_demeaned, eigen_vecs_subset)
    return X_reduced