% tex/merity_cook.tex
\section{Data}
The Netflix Prize was a large-scale recommendation competition held by Netflix.
Its aim was to improve the recommendations provided to its users by allowing third-party researchers to analyze its data.
At the time, the Netflix dataset was the largest real-world dataset available to researchers.
Collected over 7 years, it contained over 100 million ratings for 17,770 movies provided by over 480,000 users.
To compete, participants would send predicted ratings for a specific test set to Netflix.
Netflix would then return the root mean squared error (RMSE) for a portion of this test set.
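For concreteness, with predicted ratings $\hat{r}_i$, true ratings $r_i$, and $N$ ratings in the scored portion of the test set, the returned score is
\[
\textrm{RMSE} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(\hat{r}_i - r_i\right)^2}.
\]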
By reporting RMSE on only a portion of the test set, Netflix prevented teams from winning by overfitting: a model tuned to the public portion would score substantially worse on the hidden portion.
After the competition concluded, this dataset was released publicly for continued research.
A full description of the rules and dataset can be found at the Netflix Prize website.
%Here, we talk about the Netflix dataset. How we scrubbed it, what it consists of, etc.
The Netflix dataset consists of 17,770 text files.
Each text file represents a distinct movie.
The first line in the text file is the movie's unique ID number, which is an integer from 1 to 17,770.
All other lines have three comma-delimited entries: user ID, rating, and date.
There are 480,189 unique users in the dataset, with their IDs ranging from 1 to 2,649,429, with gaps.
In order to be able to perform SVD, we need a matrix with users on the rows and movies on the columns.
This matrix would contain $480,189\times17,770 \approx 8.5\textrm{ billion}$ entries.
In a regular dense matrix format, this would be too big to hold in memory.
One estimate is that it takes roughly 65 GB of RAM to hold the entire matrix \citep{revoR}, although the actual size would depend on the amount of space allocated for each rating.
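As a rough sanity check, if each entry were stored as an 8-byte double, the full matrix would occupy $8.5\times10^{9}\times 8\textrm{ bytes}\approx 63\textrm{ GiB}$, in line with this estimate.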
Fortunately, the matrix is extremely sparse, containing around 100 million non-zero entries.
To store the data in our project, we use SciPy's \verb!scipy.sparse.lil_matrix! which constructs sparse matrices using row-based linked lists.
We store data from the text files in this sparse matrix as we read them.
After reading in all of the text files, we output the matrix to a Matrix Market format.
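The loading pipeline described above can be sketched as follows. This is an illustrative reconstruction rather than the project's actual code: the helper name \verb!load_movie_file! is hypothetical, and for simplicity it indexes rows by raw user ID instead of remapping the gappy IDs down to the 480,189 active users.

```python
# Illustrative sketch (not the paper's code): load per-movie rating files
# into a SciPy LIL sparse matrix, then dump it in Matrix Market format.
import scipy.io
import scipy.sparse

# Raw user IDs run up to 2,649,429 (with gaps); rows are indexed by raw ID
# here for simplicity rather than remapped to the 480,189 active users.
ratings = scipy.sparse.lil_matrix((2_649_429, 17_770), dtype='int8')

def load_movie_file(path):
    """First line: movie ID (1..17,770); remaining lines: user,rating,date."""
    with open(path) as f:
        movie_id = int(f.readline().strip().rstrip(':'))
        for line in f:
            user_id, rating, _date = line.strip().split(',')
            ratings[int(user_id) - 1, movie_id - 1] = int(rating)

# After loading all 17,770 files, write the Matrix Market file:
# scipy.io.mmwrite('netflix.mtx', ratings)
```

Incremental writes are cheap in the row-based LIL format; for numerical work such as the SVD itself, the matrix would typically be converted to a compressed format first.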
The Matrix Market format starts with a line containing the dimensions of the matrix and the number of non-zero entries.
Then, each line contains $i \enskip j \enskip \textrm{rating}$.
For example, these are the first few lines of a Matrix Market file with a subset of the Netflix data: