diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index bf1e87d2452..c8c251900ec 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -39,7 +39,7 @@ namespace nnet3 { */ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, - const MatrixBase &feats, + const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, const chain::Supervision &supervision, @@ -108,21 +108,13 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1); int32 tot_input_frames = chunk.left_context + chunk.num_frames + - chunk.right_context; - - Matrix input_frames(tot_input_frames, feats.NumCols(), - kUndefined); - - int32 start_frame = chunk.first_frame - chunk.left_context; - for (int32 t = start_frame; t < start_frame + tot_input_frames; t++) { - int32 t2 = t; - if (t2 < 0) t2 = 0; - if (t2 >= num_input_frames) t2 = num_input_frames - 1; - int32 j = t - start_frame; - SubVector src(feats, t2), - dest(input_frames, j); - dest.CopyFromVec(src); - } + chunk.right_context, + start_frame = chunk.first_frame - chunk.left_context; + + GeneralMatrix input_frames; + ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames, + &input_frames); + NnetIo input_io("input", -chunk.left_context, input_frames); nnet_chain_eg.inputs[0].Swap(&input_io); @@ -193,8 +185,11 @@ int main(int argc, char *argv[]) { std::string online_ivector_rspecifier; ParseOptions po(usage); - po.Register("compress", &compress, "If true, write egs in " - "compressed format."); + po.Register("compress", &compress, "If true, write egs with input features " + "in compressed format (recommended). 
Update: this is now " "only relevant if the features being read are un-compressed; " "if already compressed, we keep the same compressed format when " "dumping-egs."); po.Register("ivectors", &online_ivector_rspecifier, "Alias for " "--online-ivectors option, for back compatibility"); po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " @@ -242,7 +237,9 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(normalization_fst.NumStates() > 0); } - SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + // Read as GeneralMatrix so we don't need to un-compress and re-compress + // when selecting parts of matrices. + SequentialGeneralMatrixReader feat_reader(feature_rspecifier); chain::RandomAccessSupervisionReader supervision_reader( supervision_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); @@ -253,7 +250,7 @@ int main(int argc, char *argv[]) { for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); - const Matrix &feats = feat_reader.Value(); + const GeneralMatrix &feats = feat_reader.Value(); if (!supervision_reader.HasKey(key)) { KALDI_WARN << "No pdf-level posterior for key " << key; num_err++; diff --git a/src/featbin/apply-cmvn.cc b/src/featbin/apply-cmvn.cc index 2020aebebc8..b4b073b7777 100644 --- a/src/featbin/apply-cmvn.cc +++ b/src/featbin/apply-cmvn.cc @@ -34,14 +34,14 @@ int main(int argc, char *argv[]) { "Usage: apply-cmvn [options] (|) \n" "e.g.: apply-cmvn --utt2spk=ark:data/train/utt2spk scp:data/train/cmvn.scp scp:data/train/feats.scp ark:-\n" "See also: modify-cmvn-stats, matrix-sum, compute-cmvn-stats\n"; - + ParseOptions po(usage); std::string utt2spk_rspecifier; bool norm_vars = false; bool norm_means = true; bool reverse = false; std::string skip_dims_str; - + po.Register("utt2spk", &utt2spk_rspecifier, "rspecifier for utterance to speaker map"); po.Register("norm-vars", &norm_vars, "If true, normalize variances."); @@ -53,7 +53,7 @@ int main(int argc, 
char *argv[]) { po.Register("reverse", &reverse, "If true, apply CMVN in a reverse sense, " "so as to transform zero-mean, unit-variance input into data " "with the given mean and variance."); - + po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -63,6 +63,26 @@ int main(int argc, char *argv[]) { if (norm_vars && !norm_means) KALDI_ERR << "You cannot normalize the variance but not the mean."; + + std::string cmvn_rspecifier_or_rxfilename = po.GetArg(1); + std::string feat_rspecifier = po.GetArg(2); + std::string feat_wspecifier = po.GetArg(3); + + if (!norm_means) { + // CMVN is a no-op, we're not doing anything. Just echo the input + // don't even uncompress, if it was a CompressedMatrix. + SequentialGeneralMatrixReader reader(feat_rspecifier); + GeneralMatrixWriter writer(feat_wspecifier); + kaldi::int32 num_done = 0; + for (;!reader.Done(); reader.Next()) { + writer.Write(reader.Key(), reader.Value()); + num_done++; + } + KALDI_LOG << "Copied " << num_done << " utterances."; + return (num_done != 0 ? 0 : 1); + } + + std::vector skip_dims; // optionally use "fake" // (zero-mean/unit-variance) stats for some // dims to disable normalization. @@ -70,24 +90,20 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of " << "integers)"; } - - + + kaldi::int32 num_done = 0, num_err = 0; - - std::string cmvn_rspecifier_or_rxfilename = po.GetArg(1); - std::string feat_rspecifier = po.GetArg(2); - std::string feat_wspecifier = po.GetArg(3); - + SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); BaseFloatMatrixWriter feat_writer(feat_wspecifier); - + if (ClassifyRspecifier(cmvn_rspecifier_or_rxfilename, NULL, NULL) != kNoRspecifier) { // reading from a Table: per-speaker or per-utt CMN/CVN. 
std::string cmvn_rspecifier = cmvn_rspecifier_or_rxfilename; RandomAccessDoubleMatrixReaderMapped cmvn_reader(cmvn_rspecifier, utt2spk_rspecifier); - + for (; !feat_reader.Done(); feat_reader.Next()) { std::string utt = feat_reader.Key(); Matrix feat(feat_reader.Value()); @@ -101,7 +117,7 @@ int main(int argc, char *argv[]) { Matrix cmvn_stats = cmvn_reader.Value(utt); if (!skip_dims.empty()) FakeStatsForSomeDims(skip_dims, &cmvn_stats); - + if (reverse) { ApplyCmvnReverse(cmvn_stats, norm_vars, &feat); } else { @@ -124,7 +140,7 @@ int main(int argc, char *argv[]) { cmvn_stats.Read(ki.Stream(), binary); if (!skip_dims.empty()) FakeStatsForSomeDims(skip_dims, &cmvn_stats); - + for (;!feat_reader.Done(); feat_reader.Next()) { std::string utt = feat_reader.Key(); Matrix feat(feat_reader.Value()); @@ -139,7 +155,7 @@ int main(int argc, char *argv[]) { num_done++; } } - if (norm_vars) + if (norm_vars) KALDI_LOG << "Applied cepstral mean and variance normalization to " << num_done << " utterances, errors on " << num_err; else @@ -151,4 +167,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/featbin/copy-feats.cc b/src/featbin/copy-feats.cc index f1f58653f2f..8f94f27d4dd 100644 --- a/src/featbin/copy-feats.cc +++ b/src/featbin/copy-feats.cc @@ -42,6 +42,7 @@ int main(int argc, char *argv[]) { bool htk_in = false; bool sphinx_in = false; bool compress = false; + int32 compression_method_in = 1; std::string num_frames_wspecifier; po.Register("htk-in", &htk_in, "Read input as HTK features"); po.Register("sphinx-in", &sphinx_in, "Read input as Sphinx features"); @@ -50,6 +51,10 @@ int main(int argc, char *argv[]) { po.Register("compress", &compress, "If true, write output in compressed form" "(only currently supported for wxfilename, i.e. archive/script," "output)"); + po.Register("compression-method", &compression_method_in, + "Only relevant if --compress=true; the method (1 through 6) to " + "compress the matrix. 
Search for CompressionMethod in " + "src/matrix/compressed-matrix.h."); po.Register("write-num-frames", &num_frames_wspecifier, "Wspecifier to write length in frames of each utterance. " "e.g. 'ark,t:utt2num_frames'. Only applicable if writing tables, " @@ -65,6 +70,9 @@ int main(int argc, char *argv[]) { int32 num_done = 0; + CompressionMethod compression_method = static_cast( + compression_method_in); + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { // Copying tables of features. std::string rspecifier = po.GetArg(1); @@ -104,7 +112,8 @@ int main(int argc, char *argv[]) { SequentialTableReader htk_reader(rspecifier); for (; !htk_reader.Done(); htk_reader.Next(), num_done++) { kaldi_writer.Write(htk_reader.Key(), - CompressedMatrix(htk_reader.Value().first)); + CompressedMatrix(htk_reader.Value().first, + compression_method)); if (!num_frames_wspecifier.empty()) num_frames_writer.Write(htk_reader.Key(), htk_reader.Value().first.NumRows()); @@ -113,7 +122,8 @@ int main(int argc, char *argv[]) { SequentialTableReader > sphinx_reader(rspecifier); for (; !sphinx_reader.Done(); sphinx_reader.Next(), num_done++) { kaldi_writer.Write(sphinx_reader.Key(), - CompressedMatrix(sphinx_reader.Value())); + CompressedMatrix(sphinx_reader.Value(), + compression_method)); if (!num_frames_wspecifier.empty()) num_frames_writer.Write(sphinx_reader.Key(), sphinx_reader.Value().NumRows()); @@ -122,7 +132,8 @@ int main(int argc, char *argv[]) { SequentialBaseFloatMatrixReader kaldi_reader(rspecifier); for (; !kaldi_reader.Done(); kaldi_reader.Next(), num_done++) { kaldi_writer.Write(kaldi_reader.Key(), - CompressedMatrix(kaldi_reader.Value())); + CompressedMatrix(kaldi_reader.Value(), + compression_method)); if (!num_frames_wspecifier.empty()) num_frames_writer.Write(kaldi_reader.Key(), kaldi_reader.Value().NumRows()); diff --git a/src/matrix/compressed-matrix.cc b/src/matrix/compressed-matrix.cc index 2ac2c544bc8..45965e87651 100644 --- 
a/src/matrix/compressed-matrix.cc +++ b/src/matrix/compressed-matrix.cc @@ -24,23 +24,94 @@ namespace kaldi { -//static +//static MatrixIndexT CompressedMatrix::DataSize(const GlobalHeader &header) { // Returns size in bytes of the data. - if (header.format == 1) { + DataFormat format = static_cast(header.format); + if (format == kOneByteWithColHeaders) { return sizeof(GlobalHeader) + header.num_cols * (sizeof(PerColHeader) + header.num_rows); - } else { - KALDI_ASSERT(header.format == 2) ; + } else if (format == kTwoByte) { return sizeof(GlobalHeader) + 2 * header.num_rows * header.num_cols; + } else { + KALDI_ASSERT(format == kOneByte); + return sizeof(GlobalHeader) + + header.num_rows * header.num_cols; } } +template // static inline +void CompressedMatrix::ComputeGlobalHeader( + const MatrixBase &mat, CompressionMethod method, + GlobalHeader *header) { + if (method == kAutomaticMethod) { + if (mat.NumRows() > 8) method = kSpeechFeature; + else method = kTwoByteAuto; + } + + switch (method) { + case kSpeechFeature: + header->format = static_cast(kOneByteWithColHeaders); // 1. + break; + case kTwoByteAuto: case kTwoByteSignedInteger: + header->format = static_cast(kTwoByte); // 2. + break; + case kOneByteAuto: case kOneByteInteger: case kOneByteZeroOne: + header->format = static_cast(kOneByte); // 3. + break; + default: + KALDI_ERR << "Invalid compression type: " + << static_cast(method); + } + + header->num_rows = mat.NumRows(); + header->num_cols = mat.NumCols(); + + // Now compute 'min_value' and 'range'. + switch (method) { + case kSpeechFeature: case kTwoByteAuto: case kOneByteAuto: { + float min_value = mat.Min(), max_value = mat.Max(); + // ensure that max_value is strictly greater than min_value, even if matrix is + // constant; this avoids crashes in ComputeColHeader when compressing speech + // features. 
+ if (max_value == min_value) + max_value = min_value + (1.0 + fabs(min_value)); + KALDI_ASSERT(min_value - min_value == 0 && + max_value - max_value == 0 && + "Cannot compress a matrix with Nan's or Inf's"); + + header->min_value = min_value; + header->range = max_value - min_value; + + // we previously checked that max_value != min_value, so their + // difference should be nonzero. + KALDI_ASSERT(header->range > 0.0); + break; + } + case kTwoByteSignedInteger: { + header->min_value = -32768.0; + header->range = 65535.0; + break; + } + case kOneByteZeroOne: { + header->min_value = 0.0; + header->range = 1.0; + break; + } + default: + KALDI_ERR << "Unknown compression method = " + << static_cast(method); + } + KALDI_COMPILE_TIME_ASSERT(sizeof(*header) == 20); // otherwise + // something weird is happening and our code probably won't work or + // won't be robust across platforms. +} + template void CompressedMatrix::CopyFromMat( - const MatrixBase &mat) { + const MatrixBase &mat, CompressionMethod method) { if (data_ != NULL) { delete [] static_cast(data_); // call delete [] because was allocated with new float[] data_ = NULL; @@ -49,55 +120,21 @@ void CompressedMatrix::CopyFromMat( GlobalHeader global_header; - KALDI_COMPILE_TIME_ASSERT(sizeof(global_header) == 20); // otherwise - // something weird is happening and our code probably won't work or - // won't be robust across platforms. - - // Below, the point of the "safety_margin" is that the minimum - // and maximum values in the matrix shouldn't coincide with - // the minimum and maximum ranges of the 16-bit range, because - // this could cause certain problems in ComputeColHeader, where - // we need to ensure that the percentile_0 through percentile_100 - // are in strictly increasing order. 
- float min_value = mat.Min(), max_value = mat.Max(); - if (max_value == min_value) - max_value = min_value + (1.0 + fabs(min_value)); // ensure it's strictly - // greater than min_value, - // even if matrix is - // constant. - - global_header.min_value = min_value; - global_header.range = max_value - min_value; - // We can't compress the matrix if there are inf's or nan's. - // The caller should check for this first. - KALDI_ASSERT(KALDI_ISFINITE(global_header.min_value) && - KALDI_ISFINITE(global_header.range)); - - // Avoid division by zero if the matrix is just a constant: - // make sure max_value > min_value. - if (global_header.range <= 0.0) - global_header.range = 1.0e-05; - global_header.num_rows = mat.NumRows(); - global_header.num_cols = mat.NumCols(); - - if (mat.NumRows() > 8) { - global_header.format = 1; // format where each row has a PerColHeader. - } else { - global_header.format = 2; // format where all data is uint16. - } - + ComputeGlobalHeader(mat, method, &global_header); + int32 data_size = DataSize(global_header); data_ = AllocateData(data_size); - + *(reinterpret_cast(data_)) = global_header; - if (global_header.format == 1) { + DataFormat format = static_cast(global_header.format); + if (format == kOneByteWithColHeaders) { PerColHeader *header_data = reinterpret_cast(static_cast(data_) + sizeof(GlobalHeader)); - unsigned char *byte_data = - reinterpret_cast(header_data + global_header.num_cols); + uint8 *byte_data = + reinterpret_cast(header_data + global_header.num_cols); const Real *matrix_data = mat.Data(); @@ -109,7 +146,7 @@ void CompressedMatrix::CopyFromMat( header_data++; byte_data += global_header.num_rows; } - } else { + } else if (format == kTwoByte) { uint16 *data = reinterpret_cast(static_cast(data_) + sizeof(GlobalHeader)); int32 num_rows = mat.NumRows(), num_cols = mat.NumCols(); @@ -119,15 +156,28 @@ void CompressedMatrix::CopyFromMat( data[c] = FloatToUint16(global_header, row_data[c]); data += num_cols; } + } else { + 
KALDI_ASSERT(format == kOneByte); + uint8 *data = reinterpret_cast(static_cast(data_) + + sizeof(GlobalHeader)); + int32 num_rows = mat.NumRows(), num_cols = mat.NumCols(); + for (int32 r = 0; r < num_rows; r++) { + const Real *row_data = mat.RowData(r); + for (int32 c = 0; c < num_cols; c++) + data[c] = FloatToUint8(global_header, row_data[c]); + data += num_cols; + } } } // Instantiate the template for float and double. template -void CompressedMatrix::CopyFromMat(const MatrixBase &mat); +void CompressedMatrix::CopyFromMat(const MatrixBase &mat, + CompressionMethod method); template -void CompressedMatrix::CopyFromMat(const MatrixBase &mat); +void CompressedMatrix::CopyFromMat(const MatrixBase &mat, + CompressionMethod method); CompressedMatrix::CompressedMatrix( @@ -135,21 +185,31 @@ CompressedMatrix::CompressedMatrix( const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, - const MatrixIndexT num_cols): data_(NULL) { + const MatrixIndexT num_cols, + bool allow_padding): data_(NULL) { int32 old_num_rows = cmat.NumRows(), old_num_cols = cmat.NumCols(); + + if (old_num_rows == 0) { + KALDI_ASSERT(num_rows == 0 && num_cols == 0); + // The empty matrix is stored as a zero pointer. + return; + } + KALDI_ASSERT(row_offset < old_num_rows); KALDI_ASSERT(col_offset < old_num_cols); - KALDI_ASSERT(row_offset >= 0); + KALDI_ASSERT(row_offset >= 0 || allow_padding); KALDI_ASSERT(col_offset >= 0); - KALDI_ASSERT(row_offset + num_rows <= old_num_rows); + KALDI_ASSERT(row_offset + num_rows <= old_num_rows || allow_padding); KALDI_ASSERT(col_offset + num_cols <= old_num_cols); - if (old_num_rows == 0) { return; } // Zero-size matrix stored as zero pointer. 
if (num_rows == 0 || num_cols == 0) { return; } - + + bool padding_is_used = (row_offset < 0 || + row_offset + num_rows > old_num_rows); + GlobalHeader new_global_header; KALDI_COMPILE_TIME_ASSERT(sizeof(new_global_header) == 20); - + GlobalHeader *old_global_header = reinterpret_cast(cmat.Data()); new_global_header = *old_global_header; @@ -159,18 +219,19 @@ CompressedMatrix::CompressedMatrix( // We don't switch format from 1 -> 2 (in case of size reduction) yet; if this // is needed, we will do this below by creating a temporary Matrix. new_global_header.format = old_global_header->format; - + data_ = AllocateData(DataSize(new_global_header)); // allocate memory *(reinterpret_cast(data_)) = new_global_header; - - if (old_global_header->format == 1) { - // Both have the format where we have a PerColHeader and then compress as - // chars... + + + + DataFormat format = static_cast(old_global_header->format); + if (format == kOneByteWithColHeaders) { PerColHeader *old_per_col_header = reinterpret_cast(old_global_header + 1); - unsigned char *old_byte_data = - reinterpret_cast(old_per_col_header + - old_global_header->num_cols); + uint8 *old_byte_data = + reinterpret_cast(old_per_col_header + + old_global_header->num_cols); PerColHeader *new_per_col_header = reinterpret_cast( reinterpret_cast(data_) + 1); @@ -178,41 +239,78 @@ CompressedMatrix::CompressedMatrix( memcpy(new_per_col_header, old_per_col_header + col_offset, sizeof(PerColHeader) * num_cols); - unsigned char *new_byte_data = - reinterpret_cast(new_per_col_header + num_cols); - unsigned char *old_start_of_subcol = - old_byte_data + row_offset + (col_offset * old_num_rows), - *new_start_of_col = new_byte_data; - for (int32 i = 0; i < num_cols; i++) { - memcpy(new_start_of_col, old_start_of_subcol, num_rows); - new_start_of_col += num_rows; - old_start_of_subcol += old_num_rows; + uint8 *new_byte_data = + reinterpret_cast(new_per_col_header + num_cols); + if (!padding_is_used) { + uint8 *old_start_of_subcol = 
+ old_byte_data + row_offset + (col_offset * old_num_rows), + *new_start_of_col = new_byte_data; + for (int32 i = 0; i < num_cols; i++) { + memcpy(new_start_of_col, old_start_of_subcol, num_rows); + new_start_of_col += num_rows; + old_start_of_subcol += old_num_rows; + } + } else { + uint8 *old_start_of_col = + old_byte_data + (col_offset * old_num_rows), + *new_start_of_col = new_byte_data; + for (int32 i = 0; i < num_cols; i++) { + + for (int32 j = 0; j < num_rows; j++) { + int32 old_j = j + row_offset; + if (old_j < 0) old_j = 0; + else if (old_j >= old_num_rows) old_j = old_num_rows - 1; + new_start_of_col[j] = old_start_of_col[old_j]; + } + new_start_of_col += num_rows; + old_start_of_col += old_num_rows; + } } - } else { - // both have the new format (2). - KALDI_ASSERT(old_global_header->format == 2); - + } else if (format == kTwoByte) { const uint16 *old_data = reinterpret_cast(old_global_header + 1); - uint16 *new_data = + uint16 *new_row_data = reinterpret_cast(reinterpret_cast(data_) + 1); - - old_data += col_offset + (old_num_cols * row_offset); for (int32 row = 0; row < num_rows; row++) { - memcpy(new_data, old_data, sizeof(uint16) * num_cols); - new_data += num_cols; - old_data += old_num_cols; + int32 old_row = row + row_offset; + // The next two lines are only relevant if padding_is_used. + if (old_row < 0) old_row = 0; + else if (old_row >= old_num_rows) old_row = old_num_rows - 1; + const uint16 *old_row_data = + old_data + col_offset + (old_num_cols * old_row); + memcpy(new_row_data, old_row_data, sizeof(uint16) * num_cols); + new_row_data += num_cols; + } + } else { + KALDI_ASSERT(format == kOneByte); + const uint8 *old_data = + reinterpret_cast(old_global_header + 1); + uint8 *new_row_data = + reinterpret_cast(reinterpret_cast(data_) + 1); + + for (int32 row = 0; row < num_rows; row++) { + int32 old_row = row + row_offset; + // The next two lines are only relevant if padding_is_used. 
+ if (old_row < 0) old_row = 0; + else if (old_row >= old_num_rows) old_row = old_num_rows - 1; + const uint8 *old_row_data = + old_data + col_offset + (old_num_cols * old_row); + memcpy(new_row_data, old_row_data, sizeof(uint8) * num_cols); + new_row_data += num_cols; } } - if (num_rows < 8 && new_global_header.format == 1) { + if (num_rows < 8 && format == kOneByteWithColHeaders) { // format was 1 but we want it to be 2 -> create a temporary // Matrix (uncompress), re-compress, and swap. + // This gives us almost exact reconstruction while saving + // memory (the elements take more space but there will be + // no per-column headers). Matrix temp(this->NumRows(), this->NumCols(), kUndefined); this->CopyToMat(&temp); - CompressedMatrix temp_cmat(temp); + CompressedMatrix temp_cmat(temp, kTwoByteAuto); this->Swap(&temp_cmat); } } @@ -242,9 +340,22 @@ inline uint16 CompressedMatrix::FloatToUint16( // round to closest int; avoids bias. } + +inline uint8 CompressedMatrix::FloatToUint8( + const GlobalHeader &global_header, + float value) { + float f = (value - global_header.min_value) / + global_header.range; + if (f > 1.0) f = 1.0; // Note: this should not happen. + if (f < 0.0) f = 0.0; // Note: this should not happen. + return static_cast(f * 255 + 0.499); // + 0.499 is to + // round to closest int; avoids bias. +} + + inline float CompressedMatrix::Uint16ToFloat( - const GlobalHeader &global_header, - uint16 value) { + const GlobalHeader &global_header, + uint16 value) { // the constant 1.52590218966964e-05 is 1/65535. return global_header.min_value + global_header.range * 1.52590218966964e-05F * value; @@ -281,7 +392,7 @@ void CompressedMatrix::ComputeColHeader( // Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() + // 3*quarter_nr, and sdata.end() - 1, contain the elements that would appear // at those positions in sorted order. 
- + header->percentile_0 = std::min(FloatToUint16(global_header, sdata[0]), 65532); header->percentile_25 = @@ -297,7 +408,7 @@ void CompressedMatrix::ComputeColHeader( header->percentile_100 = std::max( FloatToUint16(global_header, sdata[num_rows-1]), header->percentile_75 + static_cast(1)); - + } else { // handle this pathological case. std::sort(sdata.begin(), sdata.end()); // Note: we know num_rows is at least 1. @@ -328,7 +439,7 @@ void CompressedMatrix::ComputeColHeader( } // static -inline unsigned char CompressedMatrix::FloatToChar( +inline uint8 CompressedMatrix::FloatToChar( float p0, float p25, float p75, float p100, float value) { int ans; @@ -356,14 +467,14 @@ inline unsigned char CompressedMatrix::FloatToChar( if (ans < 192) ans = 192; if (ans > 255) ans = 255; } - return static_cast(ans); + return static_cast(ans); } // static inline float CompressedMatrix::CharToFloat( float p0, float p25, float p75, float p100, - unsigned char value) { + uint8 value) { if (value <= 64) { return p0 + (p25 - p0) * value * (1/64.0); } else if (value <= 192) { @@ -379,10 +490,10 @@ void CompressedMatrix::CompressColumn( const GlobalHeader &global_header, const Real *data, MatrixIndexT stride, int32 num_rows, CompressedMatrix::PerColHeader *header, - unsigned char *byte_data) { + uint8 *byte_data) { ComputeColHeader(global_header, data, stride, num_rows, header); - + float p0 = Uint16ToFloat(global_header, header->percentile_0), p25 = Uint16ToFloat(global_header, header->percentile_25), p75 = Uint16ToFloat(global_header, header->percentile_75), @@ -406,11 +517,13 @@ void CompressedMatrix::Write(std::ostream &os, bool binary) const { if (binary) { // Binary-mode write: if (data_ != NULL) { GlobalHeader &h = *reinterpret_cast(data_); - if (h.format == 1) { + DataFormat format = static_cast(h.format); + if (format == kOneByteWithColHeaders) { WriteToken(os, binary, "CM"); - } else { - KALDI_ASSERT(h.format == 2); + } else if (format == kTwoByte) { WriteToken(os, binary, 
"CM2"); + } else if (format == kOneByte) { + WriteToken(os, binary, "CM3"); } MatrixIndexT size = DataSize(h); // total size of data in data_ // We don't write out the "int32 format", hence the + 4, - 4. @@ -446,10 +559,11 @@ void CompressedMatrix::Read(std::istream &is, bool binary) { std::string tok; // Should be CM (format 1) or CM2 (format 2) ReadToken(is, binary, &tok); GlobalHeader h; - if (tok == "CM") { h.format = 1; } - else if (tok == "CM2") { h.format = 2; } + if (tok == "CM") { h.format = 1; } // kOneByteWithColHeaders + else if (tok == "CM2") { h.format = 2; } // kTwoByte + else if (tok == "CM3") { h.format = 3; } // kOneByte else { - KALDI_ERR << "Unexpected token " << tok << ", expecting CM or CM2."; + KALDI_ERR << "Unexpected token " << tok << ", expecting CM, CM2 or CM3"; } // don't read the "format" -> hence + 4, - 4. is.read(reinterpret_cast(&h) + 4, sizeof(h) - 4); @@ -467,13 +581,12 @@ void CompressedMatrix::Read(std::istream &is, bool binary) { // case if you changed your code, making a Matrix into a CompressedMatrix, // and you want back-compatibility for reading. Matrix M; - M.Read(is, binary); // This will crash if it was not a Matrix. This might happen, - // for instance, if the CompressedMatrix was written using the - // older code where we didn't write the token "CM", we just - // wrote the binary data directly. + M.Read(is, binary); // This will crash if it was not a Matrix. this->CopyFromMat(M); } - } else { // Text-mode read. + } else { // Text-mode read. In this case you don't get to + // choose the compression type. Anyway this branch would only + // be taken when debugging. 
Matrix temp; temp.Read(is, binary); this->CopyFromMat(temp); @@ -491,7 +604,7 @@ void CompressedMatrix::CopyToMat(MatrixBase *mat, mat->CopyFromMat(temp, kTrans); return; } - + if (data_ == NULL) { KALDI_ASSERT(mat->NumRows() == 0); KALDI_ASSERT(mat->NumCols() == 0); @@ -501,11 +614,12 @@ void CompressedMatrix::CopyToMat(MatrixBase *mat, int32 num_cols = h->num_cols, num_rows = h->num_rows; KALDI_ASSERT(mat->NumRows() == num_rows); KALDI_ASSERT(mat->NumCols() == num_cols); - - if (h->format == 1) { + + DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { PerColHeader *per_col_header = reinterpret_cast(h+1); - unsigned char *byte_data = reinterpret_cast(per_col_header + - h->num_cols); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); for (int32 i = 0; i < num_cols; i++, per_col_header++) { float p0 = Uint16ToFloat(*h, per_col_header->percentile_0), p25 = Uint16ToFloat(*h, per_col_header->percentile_25), @@ -516,13 +630,25 @@ void CompressedMatrix::CopyToMat(MatrixBase *mat, (*mat)(j, i) = f; } } - } else { - KALDI_ASSERT(h->format == 2); + } else if (format == kTwoByte) { const uint16 *data = reinterpret_cast(h + 1); + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); for (int32 i = 0; i < num_rows; i++) { Real *row_data = mat->RowData(i); for (int32 j = 0; j < num_cols; j++) - row_data[j] = Uint16ToFloat(*h, data[j]); + row_data[j] = min_value + data[j] * increment; + data += num_cols; + } + } else { + KALDI_ASSERT(format == kOneByte); + float min_value = h->min_value, increment = h->range * (1.0 / 255.0); + + const uint8 *data = reinterpret_cast(h + 1); + for (int32 i = 0; i < num_rows; i++) { + Real *row_data = mat->RowData(i); + for (int32 j = 0; j < num_cols; j++) + row_data[j] = min_value + data[j] * increment; data += num_cols; } } @@ -544,11 +670,11 @@ void CompressedMatrix::CopyRowToVec(MatrixIndexT row, KALDI_ASSERT(v->Dim() == this->NumCols()); GlobalHeader *h = 
reinterpret_cast(data_); - - if (h->format == 1) { // format with per-col header. + DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { PerColHeader *per_col_header = reinterpret_cast(h+1); - unsigned char *byte_data = reinterpret_cast(per_col_header + - h->num_cols); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); byte_data += row; // point to first value we are interested in for (int32 i = 0; i < h->num_cols; i++, per_col_header++, byte_data += h->num_rows) { @@ -559,15 +685,26 @@ void CompressedMatrix::CopyRowToVec(MatrixIndexT row, float f = CharToFloat(p0, p25, p75, p100, *byte_data); (*v)(i) = f; } - } else { - KALDI_ASSERT(h->format == 2); // uint16 format + } else if (format == kTwoByte) { int32 num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); const uint16 *row_data = reinterpret_cast(h + 1) + (num_cols * row); Real *v_data = v->Data(); for (int32 c = 0; c < num_cols; c++) - v_data[c] = Uint16ToFloat(*h, row_data[c]); + v_data[c] = min_value + row_data[c] * increment; + } else { + KALDI_ASSERT(format == kOneByte); + int32 num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 255.0); + const uint8 *row_data = reinterpret_cast(h + 1) + (num_cols * row); + Real *v_data = v->Data(); + for (int32 c = 0; c < num_cols; c++) + v_data[c] = min_value + row_data[c] * increment; } } + template void CompressedMatrix::CopyColToVec(MatrixIndexT col, VectorBase *v) const { @@ -577,10 +714,11 @@ void CompressedMatrix::CopyColToVec(MatrixIndexT col, GlobalHeader *h = reinterpret_cast(data_); - if (h->format == 1) { // format with per-col header. 
+ DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { PerColHeader *per_col_header = reinterpret_cast(h+1); - unsigned char *byte_data = reinterpret_cast(per_col_header + - h->num_cols); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); byte_data += col*h->num_rows; // point to first value in the column we want per_col_header += col; float p0 = Uint16ToFloat(*h, per_col_header->percentile_0), @@ -591,13 +729,23 @@ void CompressedMatrix::CopyColToVec(MatrixIndexT col, float f = CharToFloat(p0, p25, p75, p100, *byte_data); (*v)(i) = f; } - } else { - KALDI_ASSERT(h->format == 2); // uint16 format + } else if (format == kTwoByte) { int32 num_rows = h->num_rows, num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); const uint16 *col_data = reinterpret_cast(h + 1) + col; Real *v_data = v->Data(); for (int32 r = 0; r < num_rows; r++) - v_data[r] = Uint16ToFloat(*h, col_data[r * num_cols]); + v_data[r] = min_value + increment * col_data[r * num_cols]; + } else { + KALDI_ASSERT(format == kOneByte); + int32 num_rows = h->num_rows, num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 255.0); + const uint8 *col_data = reinterpret_cast(h + 1) + col; + Real *v_data = v->Data(); + for (int32 r = 0; r < num_rows; r++) + v_data[r] = min_value + increment * col_data[r * num_cols]; } } @@ -625,15 +773,14 @@ void CompressedMatrix::CopyToMat(int32 row_offset, GlobalHeader *h = reinterpret_cast(data_); int32 num_rows = h->num_rows, num_cols = h->num_cols, tgt_cols = dest->NumCols(), tgt_rows = dest->NumRows(); - - if (h->format == 1) { - // format where we have a per-column header and use one byte per - // element. 
+ + DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { PerColHeader *per_col_header = reinterpret_cast(h+1); - unsigned char *byte_data = reinterpret_cast(per_col_header + - h->num_cols); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); - unsigned char *start_of_subcol = byte_data+row_offset; // skip appropriate + uint8 *start_of_subcol = byte_data+row_offset; // skip appropriate // number of columns start_of_subcol += col_offset*num_rows; // skip appropriate number of rows @@ -652,15 +799,28 @@ void CompressedMatrix::CopyToMat(int32 row_offset, (*dest)(j, i) = f; } } - } else { - KALDI_ASSERT(h->format == 2); + } else if (format == kTwoByte) { const uint16 *data = reinterpret_cast(h+1) + col_offset + (num_cols * row_offset); + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); for (int32 row = 0; row < tgt_rows; row++) { Real *dest_row = dest->RowData(row); for (int32 col = 0; col < tgt_cols; col++) - dest_row[col] = Uint16ToFloat(*h, data[col]); + dest_row[col] = min_value + increment * data[col]; + data += num_cols; + } + } else { + KALDI_ASSERT(format == kOneByte); + const uint8 *data = reinterpret_cast(h+1) + col_offset + + (num_cols * row_offset); + float min_value = h->min_value, + increment = h->range * (1.0 / 255.0); + for (int32 row = 0; row < tgt_rows; row++) { + Real *dest_row = dest->RowData(row); + for (int32 col = 0; col < tgt_cols; col++) + dest_row[col] = min_value + increment * data[col]; data += num_cols; } } @@ -668,11 +828,11 @@ void CompressedMatrix::CopyToMat(int32 row_offset, // instantiate the templates. 
template void CompressedMatrix::CopyToMat(int32, - int32, - MatrixBase *dest) const; + int32, + MatrixBase *dest) const; template void CompressedMatrix::CopyToMat(int32, - int32, - MatrixBase *dest) const; + int32, + MatrixBase *dest) const; void CompressedMatrix::Clear() { if (data_ != NULL) { diff --git a/src/matrix/compressed-matrix.h b/src/matrix/compressed-matrix.h index 4e4238c43da..7166192b78c 100644 --- a/src/matrix/compressed-matrix.h +++ b/src/matrix/compressed-matrix.h @@ -28,19 +28,62 @@ namespace kaldi { /// \addtogroup matrix_group /// @{ -/// This class does lossy compression of a matrix. It only -/// supports copying to-from a KaldiMatrix. For large matrices, -/// each element is compressed into about one byte, but there -/// is a little overhead on top of that (globally, and also per -/// column). - -/// The basic idea is for each column (in the normal configuration) -/// we work out the values at the 0th, 25th, 50th and 100th percentiles -/// and store them as 16-bit integers; we then encode each value in -/// the column as a single byte, in 3 separate ranges with different -/// linear encodings (0-25th, 25-50th, 50th-100th). -/// If the matrix has 8 rows or fewer, we simply store all values as -/// uint16. + + +/* + The enum CompressionMethod is used when creating a CompressedMatrix (a lossily + compressed matrix) from a regular Matrix. It dictates how we choose the + compressed format and how we choose the ranges of floats that are represented + by particular integers. + + kAutomaticMethod = 1 This is the default when you don't specify the + compression method. It is a shorthand for using + kSpeechFeature if the num-rows is more than 8, and + kTwoByteAuto otherwise. + kSpeechFeature = 2 This is the most complicated of the compression methods, + and was designed for speech features which have a roughly + Gaussian distribution with different ranges for each + dimension. 
Each element is stored in one byte, but there + is an 8-byte header per column; the spacing of the + integer values is not uniform but is in 3 ranges. + kTwoByteAuto = 3 Each element is stored in two bytes as a uint16, with + the representable range of values chosen automatically + with the minimum and maximum elements of the matrix as + its edges. + kTwoByteSignedInteger = 4 + Each element is stored in two bytes as a uint16, with + the representable range of values chosen to coincide with + what you'd get if you stored signed integers, i.e. + [-32768.0, 32767.0]. Suitable for waveform data that + was previously stored as 16-bit PCM. + kOneByteAuto = 5 Each element is stored in one byte as a uint8, with the + representable range of values chosen automatically with + the minimum and maximum elements of the matrix as its + edges. + kOneByteInteger = 6 Each element is stored in + one byte as a uint8, with the representable range of + values equal to [0.0, 255.0]. + kOneByteZeroOne = 7 Each element is stored in + one byte as a uint8, with the representable range of + values equal to [0.0, 1.0]. Suitable for image data + that has previously been compressed as int8. + + // We can add new methods here as needed: if they just imply different ways + // of selecting the min_value and range, and a num-bytes = 1 or 2, they will + // be trivial to implement. +*/ +enum CompressionMethod { + kAutomaticMethod = 1, + kSpeechFeature = 2, + kTwoByteAuto = 3, + kTwoByteSignedInteger = 4, + kOneByteAuto = 5, + kOneByteInteger = 6, + kOneByteZeroOne = 7 +}; + + +/* + This class does lossy compression of a matrix. It supports various compression + methods, see enum CompressionMethod. 
+*/ class CompressedMatrix { public: @@ -49,23 +92,35 @@ class CompressedMatrix { ~CompressedMatrix() { Clear(); } template - CompressedMatrix(const MatrixBase &mat): data_(NULL) { CopyFromMat(mat); } + explicit CompressedMatrix(const MatrixBase &mat, + CompressionMethod method = kAutomaticMethod): + data_(NULL) { CopyFromMat(mat, method); } /// Initializer that can be used to select part of an existing /// CompressedMatrix without un-compressing and re-compressing (note: unlike /// similar initializers for class Matrix, it doesn't point to the same memory /// location). + /// + /// This creates a CompressedMatrix with the size (num_rows, num_cols) + /// starting at (row_offset, col_offset). + /// + /// If you specify allow_padding = true, + /// it is permitted to have row_offset < 0 and + /// row_offset + num_rows > mat.NumRows(), and the result will contain + /// repeats of the first and last rows of 'mat' as necessary. CompressedMatrix(const CompressedMatrix &mat, const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, - const MatrixIndexT num_cols); + const MatrixIndexT num_cols, + bool allow_padding = false); void *Data() const { return this->data_; } /// This will resize *this and copy the contents of mat to *this. template - void CopyFromMat(const MatrixBase &mat); + void CopyFromMat(const MatrixBase &mat, + CompressionMethod method = kAutomaticMethod); CompressedMatrix(const CompressedMatrix &mat); @@ -75,7 +130,7 @@ class CompressedMatrix { CompressedMatrix &operator = (const MatrixBase &mat); // assignment operator. /// Copies contents to matrix. Note: mat must have the correct size. - /// kNoTrans case uses a temporary. + /// The kTrans case uses a temporary. 
template void CopyToMat(MatrixBase *mat, MatrixTransposeType trans = kNoTrans) const; @@ -118,23 +173,57 @@ class CompressedMatrix { friend class Matrix; private: + // This enum describes the different compressed-data formats: these are + // distinct from the compression methods although all of the methods apart + // from kAutomaticMethod dictate a particular compressed-data format. + // + // kOneByteWithColHeaders means there is a GlobalHeader and each + // column has a PerColHeader; the actual data is stored in + // one byte per element, in column-major order (the mapping + // from integers to floats is a little complicated). + // kTwoByte means there is a global header but no PerColHeader; + // the actual data is stored in two bytes per element in + // row-major order; it's decompressed as: + // uint16 i; GlobalHeader g; + // float f = g.min_value + i * (g.range / 65535.0) + // kOneByte means there is a global header but no PerColHeader; + // the data is stored in one byte per element in row-major + // order and is decompressed as: + // uint8 i; GlobalHeader g; + // float f = g.min_value + i * (g.range / 255.0) + enum DataFormat { + kOneByteWithColHeaders = 1, + kTwoByte = 2, + kOneByte = 3 + }; + + + // allocates data using new [], ensures byte alignment // sufficient for float. static void *AllocateData(int32 num_bytes); - // the "format" will be 1 for the original format where each column has a - // PerColHeader, and 2 for the format now used for matrices with 8 or fewer - // rows, where everything is represented as 16-bit integers. struct GlobalHeader { - int32 format; - float min_value; + int32 format; // Represents the enum DataFormat. + float min_value; // min_value and range represent the ranges of the integer + // data in the kTwoByte and kOneByte formats, and the + // range of the PerColHeader uint16's in the + // kOneByteWithColHeaders format. 
float range; int32 num_rows; int32 num_cols; }; + // This function computes the global header for compressing this data. + template + static inline void ComputeGlobalHeader(const MatrixBase &mat, + CompressionMethod method, + GlobalHeader *header); + + + // The number of bytes we need to request when allocating 'data_'. static MatrixIndexT DataSize(const GlobalHeader &header); + // This struct is only used in format kOneByteWithColHeaders. struct PerColHeader { uint16 percentile_0; uint16 percentile_25; @@ -146,7 +235,7 @@ class CompressedMatrix { static void CompressColumn(const GlobalHeader &global_header, const Real *data, MatrixIndexT stride, int32 num_rows, PerColHeader *header, - unsigned char *byte_data); + uint8 *byte_data); template static void ComputeColHeader(const GlobalHeader &global_header, const Real *data, MatrixIndexT stride, @@ -155,14 +244,22 @@ class CompressedMatrix { static inline uint16 FloatToUint16(const GlobalHeader &global_header, float value); + // this is used only in the kOneByte compression format. + static inline uint8 FloatToUint8(const GlobalHeader &global_header, + float value); + static inline float Uint16ToFloat(const GlobalHeader &global_header, uint16 value); - static inline unsigned char FloatToChar(float p0, float p25, + + // this is used only in the kOneByteWithColHeaders compression format. + static inline uint8 FloatToChar(float p0, float p25, float p75, float p100, float value); + + // this is used only in the kOneByteWithColHeaders compression format. static inline float CharToFloat(float p0, float p25, float p75, float p100, - unsigned char value); + uint8 value); void *data_; // first GlobalHeader, then PerColHeader (repeated), then // the byte data for each column (repeated). 
Note: don't intersperse diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc index 687ac66ac46..bdd4fc342a4 100644 --- a/src/matrix/matrix-lib-test.cc +++ b/src/matrix/matrix-lib-test.cc @@ -51,26 +51,7 @@ void RandPosdefSpMatrix(MatrixIndexT dim, SpMatrix *matrix) { } -template static void InitRand(TpMatrix *M) { - // start: - for (MatrixIndexT i = 0;i < M->NumRows();i++) - for (MatrixIndexT j = 0;j<=i;j++) - (*M)(i, j) = RandGauss(); -} - - - -template static void InitRand(Vector *v) { - for (MatrixIndexT i = 0;i < v->Dim();i++) - (*v)(i) = RandGauss(); -} - -template static void InitRand(VectorBase *v) { - for (MatrixIndexT i = 0;i < v->Dim();i++) - (*v)(i) = RandGauss(); -} - -template static void InitRand(MatrixBase *M) { +template static void InitRandNonsingular(MatrixBase *M) { start: for (MatrixIndexT i = 0;i < M->NumRows();i++) for (MatrixIndexT j = 0;j < M->NumCols();j++) @@ -83,7 +64,7 @@ template static void InitRand(MatrixBase *M) { } -template static void InitRand(SpMatrix *M) { +template static void InitRandNonsingular(SpMatrix *M) { start: for (MatrixIndexT i = 0;i < M->NumRows();i++) for (MatrixIndexT j = 0;j<=i;j++) @@ -158,7 +139,7 @@ template static void CholeskyUnitTestTr() { for (MatrixIndexT i = 0; i < 5; i++) { MatrixIndexT dimM = 2 + Rand() % 10; Matrix M(dimM, dimM); - InitRand(&M); + InitRandNonsingular(&M); SpMatrix S(dimM); S.AddMat2(1.0, M, kNoTrans, 0.0); TpMatrix C(dimM); @@ -284,7 +265,7 @@ template static void UnitTestAddSp() { for (MatrixIndexT i = 0;i< 10;i++) { MatrixIndexT dimM = 10+Rand()%10; SpMatrix S(dimM); - InitRand(&S); + S.SetRandn(); Matrix M(S), N(S); N.AddSp(2.0, S); M.Scale(3.0); @@ -298,10 +279,10 @@ static void UnitTestSpAddDiagVec() { BaseFloat alpha = (i<5 ? 
1.0 : 0.5); MatrixIndexT dimM = 10+Rand()%10; SpMatrix S(dimM); - InitRand(&S); + S.SetRandn(); SpMatrix T(S); Vector v(dimM); - InitRand(&v); + v.SetRandn(); S.AddDiagVec(alpha, v); for (MatrixIndexT i = 0; i < dimM; i++) T(i, i) += alpha * v(i); @@ -316,12 +297,12 @@ static void UnitTestSpAddVecVec() { BaseFloat alpha = (i<5 ? 1.0 : 0.5); MatrixIndexT dimM = 10+Rand()%10; SpMatrix S(dimM); - InitRand(&S); + S.SetRandn(); Matrix T(S); Vector v(dimM), w(dimM); - InitRand(&v); - InitRand(&w); + v.SetRandn(); + w.SetRandn(); S.AddVecVec(alpha, v, w); T.AddVecVec(alpha, v, w); T.AddVecVec(alpha, w, v); @@ -340,8 +321,8 @@ template static void UnitTestCopyRowsAndCols() { // CopyRowsFromVec. Vector v(dimM); Matrix M(dimM, dimN), N(dimM, dimN); - InitRand(&v); - InitRand(&w); + v.SetRandn(); + w.SetRandn(); M.CopyColsFromVec(v); N.CopyRowsFromVec(w); for (MatrixIndexT r = 0; r < dimM; r++) { @@ -360,7 +341,7 @@ template static void UnitTestSpliceRows() { Vector V(dimM*dimN), V10(dimM*dimN); Vector Vs(std::min(dimM, dimN)), Vs10(std::min(dimM, dimN)); - InitRand(&V); + V.SetRandn(); Matrix M(dimM, dimN); M.CopyRowsFromVec(V); V10.CopyRowsFromMat(M); @@ -372,7 +353,7 @@ template static void UnitTestSpliceRows() { { Vector V2(dimM), V3(dimM); - InitRand(&V2); + V2.SetRandn(); MatrixIndexT x; M.CopyColFromVec(V2, x = (Rand() % dimN)); V3.CopyColFromMat(M, x); @@ -381,7 +362,7 @@ template static void UnitTestSpliceRows() { { Vector V2(dimN), V3(dimN); - InitRand(&V2); + V2.SetRandn(); MatrixIndexT x; M.CopyRowFromVec(V2, x = (Rand() % dimM)); V3.CopyRowFromMat(M, x); @@ -409,7 +390,7 @@ template static void UnitTestRemoveRow() { for (MatrixIndexT p = 0;p< 10;p++) { MatrixIndexT dimM = 10+Rand()%10, dimN = 10+Rand()%10; Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); MatrixIndexT i = Rand() % dimM; // Row to remove. 
Matrix N(M); N.RemoveRow(i); @@ -429,7 +410,7 @@ template static void UnitTestRemoveRow() { for (MatrixIndexT p = 0;p< 10;p++) { MatrixIndexT dimM = 10+Rand()%10; Vector V(dimM); - InitRand(&V); + V.SetRandn(); MatrixIndexT i = Rand() % dimM; // Element to remove. Vector N(V); N.RemoveElement(i); @@ -447,7 +428,7 @@ template static void UnitTestRemoveRow() { template static void UnitTestSubvector() { Vector V(100); - InitRand(&V); + V.SetRandn(); Vector V2(100); for (MatrixIndexT i = 0;i < 10;i++) { SubVector tmp(V, i*10, 10); @@ -531,11 +512,11 @@ static void UnitTestSetRandUniform() { template -static void UnitTestSimpleForVec() { // testing some simple operaters on vector +static void UnitTestSimpleForVec() { // testing some simple operators on vector for (MatrixIndexT i = 0; i < 5; i++) { Vector V(100), V1(100), V2(100), V3(100), V4(100); - InitRand(&V); + V.SetRandn(); if (i % 2 == 0) { V1.SetZero(); V1.Add(1.0); @@ -572,7 +553,7 @@ static void UnitTestSimpleForVec() { // testing some simple operaters on vector for(int i = 0; i < sizes.size(); i++) { MatrixIndexT dimM = sizes[i] + Rand() % 10, dimN = sizes[i] + Rand() % 10; Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); Vector Vr(dimN), Vc(dimM); Vr.AddRowSumMat(0.4, M); Vr.AddRowSumMat(0.3, M, 0.5); // note: 0.3 + 0.4*0.5 = 0.5. 
@@ -586,18 +567,18 @@ static void UnitTestSimpleForVec() { // testing some simple operaters on vector Vector V2r(dimN), V2c(dimM); for (MatrixIndexT k = 0; k < dimM; k++) { V2r.CopyRowFromMat(M, k); - AssertEqual(V2r.Sum(), Vc(k)); + KALDI_ASSERT(fabs(V2r.Sum() - Vc(k)) < 0.01); } for (MatrixIndexT j = 0; j < dimN; j++) { V2c.CopyColFromMat(M, j); - AssertEqual(V2c.Sum(), Vr(j)); + KALDI_ASSERT(fabs(V2c.Sum() - Vr(j)) < 0.01); } } } for (MatrixIndexT i = 0; i < 5; i++) { Vector V(100), V1(100), V2(100); - InitRand(&V); + V.SetRandn(); V1.CopyFromVec(V); V1.ApplyExp(); @@ -620,7 +601,7 @@ static void UnitTestSimpleForVec() { // testing some simple operaters on vector MatrixIndexT dimV = 10 + Rand() % 10; Real p = 0.5 + RandUniform() * 4.5; Vector V(dimV), V1(dimV), V2(dimV); - InitRand(&V); + V.SetRandn(); V1.AddVecVec(1.0, V, V, 0.0); // V1:=V.*V. V2.CopyFromVec(V1); AssertEqual(V1.Norm(p), V2.Norm(p)); @@ -638,7 +619,7 @@ static void UnitTestSimpleForVec() { // testing some simple operaters on vector // Test ApplySoftMax() for matrix. Matrix M(10, 10); - InitRand(&M); + M.SetRandn(); M.ApplySoftMax(); KALDI_ASSERT( fabs(1.0 - M.Sum()) < 0.01); @@ -696,11 +677,11 @@ static void UnitTestNorm() { // test some simple norm properties: scaling. 
als if (scalar < 0) scalar *= -1.0; MatrixIndexT dimM = 10 + Rand() % 10, dimN = 10 + Rand() % 10; Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); SpMatrix S(dimM); - InitRand(&S); + S.SetRandn(); Vector V(dimN); - InitRand(&V); + V.SetRandn(); Real Mnorm = M.FrobeniusNorm(), Snorm = S.FrobeniusNorm(), @@ -733,7 +714,7 @@ static void UnitTestCopyRows() { num_rows2 = 10 + Rand() % 10, num_cols = 10 + Rand() % 10; Matrix M(num_rows1, num_cols); - InitRand(&M); + M.SetRandn(); Matrix N1(num_rows2, num_cols), N2(num_rows2, num_cols), O(num_rows2, num_cols); @@ -766,7 +747,7 @@ static void UnitTestCopyToRows() { num_rows2 = 10 + Rand() % 10, num_cols = 10 + Rand() % 10; Matrix M(num_rows1, num_cols); - InitRand(&M); + M.SetRandn(); Matrix N(num_rows2, num_cols), O(num_rows2, num_cols); std::vector reorder_dst(num_rows1, @@ -799,7 +780,7 @@ static void UnitTestAddRows() { num_rows2 = 10 + Rand() % 10, num_cols = 10 + Rand() % 10; Matrix M(num_rows1, num_cols); - InitRand(&M); + M.SetRandn(); Matrix N1(num_rows2, num_cols), N2(num_rows2, num_cols), O(num_rows2, num_cols); @@ -838,7 +819,7 @@ static void UnitTestAddToRows() { num_rows2 = 10 + Rand() % 10, num_cols = 10 + Rand() % 10; Matrix M(num_rows1, num_cols); - InitRand(&M); + M.SetRandn(); Real alpha = static_cast((Rand() % num_rows2)) / static_cast(num_rows1); @@ -873,7 +854,7 @@ static void UnitTestCopyCols() { num_cols2 = 10 + Rand() % 10, num_rows = 10 + Rand() % 10; Matrix M(num_rows, num_cols1); - InitRand(&M); + M.SetRandn(); Matrix N(num_rows, num_cols2), O(num_rows, num_cols2); std::vector reorder(num_cols2); @@ -898,7 +879,7 @@ static void UnitTestSimpleForMat() { // test some simple operates on all kinds // for FrobeniousNorm() function MatrixIndexT dimM = 10 + Rand() % 10, dimN = 10 + Rand() % 10; Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); { Matrix N(M); Real a = M.LogSumExp(), b = N.ApplySoftMax(); @@ -961,10 +942,10 @@ static void UnitTestSimpleForMat() { // test some simple operates on 
all kinds MatrixIndexT dimM = 10 + Rand() % 10; SpMatrix S(dimM), S1(dimM); TpMatrix T(dimM), T1(dimM); - InitRand(&S); - InitRand(&S1); - InitRand(&T); - InitRand(&T1); + S.SetRandn(); + S1.SetRandn(); + T.SetRandn(); + T1.SetRandn(); Matrix M(S), M1(S1), N(T), N1(T1); S.AddSp(1.0, S1); @@ -983,9 +964,9 @@ static void UnitTestSimpleForMat() { // test some simple operates on all kinds SpMatrix M(dimM); Vector V(dimM); - InitRand(&M); + InitRandNonsingular(&M); SpMatrix Md(M); - InitRand(&V); + V.SetRandn(); SpMatrix Sorig(M); M.AddVec2(0.5, V); Md.AddVec2(static_cast(0.5), V); @@ -1003,7 +984,7 @@ template static void UnitTestRow() { for (MatrixIndexT p = 0;p< 10;p++) { MatrixIndexT dimM = 10+Rand()%10, dimN = 10+Rand()%10; Matrix M(dimM, dimN); - InitRand(&M); + InitRandNonsingular(&M); MatrixIndexT i = Rand() % dimM; // Row to get. @@ -1015,7 +996,7 @@ template static void UnitTestRow() { { SpMatrix S(dimN); - InitRand(&S); + InitRandNonsingular(&S); Vector v1(dimN), v2(dimN); Matrix M(S); MatrixIndexT dim2 = Rand() % dimN; @@ -1040,7 +1021,7 @@ template static void UnitTestAxpy() { MatrixIndexT dimM = 10+Rand()%10, dimN = 10+Rand()%10; Matrix M(dimM, dimN), N(dimM, dimN), O(dimN, dimM); - InitRand(&M); InitRand(&N); InitRand(&O); + InitRandNonsingular(&M); InitRandNonsingular(&N); InitRandNonsingular(&O); Matrix Morig(M); M.AddMat(0.5, N); for (MatrixIndexT i = 0;i < dimM;i++) @@ -1054,7 +1035,7 @@ template static void UnitTestAxpy() { { float f = 0.5 * (float) (Rand() % 3); Matrix N(dimM, dimM); - InitRand(&N); + InitRandNonsingular(&N); Matrix N2(N); Matrix N3(N); @@ -1093,7 +1074,7 @@ template static void UnitTestPower() { // this is for matrix-pow MatrixIndexT dimM = 10 + Rand() % 10; Matrix M(dimM, dimM), N(dimM, dimM); - InitRand(&M); + M.SetRandn(); N.AddMatMat(1.0, M, kNoTrans, M, kTrans, 0.0); // N:=M*M^T. SpMatrix S(dimM); S.CopyFromMat(N); // symmetric so should not crash. 
@@ -1105,7 +1086,7 @@ template static void UnitTestPower() { // this is for vector-pow MatrixIndexT dimV = 10 + Rand() % 10; Vector V(dimV), V1(dimV), V2(dimV); - InitRand(&V); + V.SetRandn(); V1.AddVecVec(1.0, V, V, 0.0); // V1:=V.*V. V2.CopyFromVec(V1); V2.ApplyPow(0.5); @@ -1118,7 +1099,7 @@ template static void UnitTestPowerAbs() { for (MatrixIndexT iter = 0;iter < 5;iter++) { MatrixIndexT dimV = 10 + Rand() % 10; Vector V(dimV), V1(dimV), V2(dimV); - InitRand(&V); + V.SetRandn(); V1.AddVecVec(1.0, V, V, 0.0); // V1:=V.*V. V2.CopyFromVec(V1); KALDI_LOG << V1; @@ -1135,7 +1116,7 @@ template static void UnitTestHeaviside() { for (MatrixIndexT iter = 0;iter < 5;iter++) { MatrixIndexT dimM = 10 + Rand() % 10, dimN = 10 + Rand() % 10; Matrix M(dimM, dimN), N(dimM, dimN); - InitRand(&M); + M.SetRandn(); N = M; N.ApplyHeaviside(); for (MatrixIndexT r = 0; r < dimM; r++) { @@ -1159,8 +1140,8 @@ template static void UnitTestAddOuterProductPlusMinus() { Vector v1(dimM), v2(dimN); for (MatrixIndexT i = 0; i < 5; i++) { - InitRand(&v1); - InitRand(&v2); + v1.SetRandn(); + v2.SetRandn(); Real alpha = 0.333 * ((Rand() % 10) - 5); M.AddVecVec(alpha, v1, v2); @@ -1180,8 +1161,8 @@ template static void UnitTestSger() { MatrixIndexT dimM = 10 + Rand() % 10; MatrixIndexT dimN = 10 + Rand() % 10; Matrix M(dimM, dimN), M2(dimM, dimN); - Vector v1(dimM); InitRand(&v1); - Vector v2(dimN); InitRand(&v2); + Vector v1(dimM); v1.SetRandn(); + Vector v2(dimN); v2.SetRandn(); Vector v1d(v1), v2d(v2); M.AddVecVec(1.0f, v1, v2); M2.AddVecVec(1.0f, v1, v2); @@ -1199,7 +1180,7 @@ template static void UnitTestDeterminant() { // also tests matri for (MatrixIndexT iter = 0;iter < 5;iter++) { // First test the 2 det routines are the same int dimM = 10 + Rand() % 10; Matrix M(dimM, dimM), N(dimM, dimM); - InitRand(&M); + InitRandNonsingular(&M); N.AddMatMat(1.0, M, kNoTrans, M, kTrans, 0.0); // N:=M*M^T. 
for (MatrixIndexT i = 0;i < (MatrixIndexT)dimM;i++) N(i, i) += 0.0001; // Make sure numerically +ve det-- can by chance be almost singular the way we initialized it (I think) SpMatrix S(dimM); @@ -1230,7 +1211,7 @@ template static void UnitTestDeterminantSign() { for (MatrixIndexT iter = 0;iter < 20;iter++) { // First test the 2 det routines are the same int dimM = 10 + Rand() % 10; Matrix M(dimM, dimM), N(dimM, dimM); - InitRand(&M); + InitRandNonsingular(&M); N.AddMatMat(1.0, M, kNoTrans, M, kTrans, 0.0); // N:=M*M^T. for (MatrixIndexT i = 0;i < (MatrixIndexT)dimM;i++) N(i, i) += 0.0001; // Make sure numerically +ve det-- can by chance be almost singular the way we initialized it (I think) SpMatrix S(dimM); @@ -1291,68 +1272,13 @@ template static void UnitTestSpVec() { } -template static void UnitTestSherman() { - for (MatrixIndexT iter = 0;iter < 1;iter++) { - MatrixIndexT dimM =10; // 20 + Rand()%10; - MatrixIndexT dimK =2; // 20 + Rand()%dimM; - Matrix A(dimM, dimM), U(dimM, dimK), V(dimM, dimK); - InitRand(&A); - InitRand(&U); - InitRand(&V); - for (MatrixIndexT i = 0;i < (MatrixIndexT)dimM;i++) A(i, i) += 0.0001; // Make sure numerically +ve det-- can by chance be almost singular the way we initialized it (I think) - - Matrix tmpL(dimM, dimM); - tmpL.AddMatMat(1.0, U, kNoTrans, V, kTrans, 0.0); // tmpL =U *V. 
- tmpL.AddMat(1.0, A); - tmpL.Invert(); - - Matrix invA(dimM, dimM); - invA.CopyFromMat(A); - invA.Invert(); - - Matrix tt(dimK, dimM), I(dimK, dimK); - tt.AddMatMat(1.0, V, kTrans, invA, kNoTrans, 0.0); // tt0 = trans(V) *inv(A) - - Matrix tt1(dimM, dimK); - tt1.AddMatMat(1.0, invA, kNoTrans, U, kNoTrans, 0.0); // tt1=INV A *U - Matrix tt2(dimK, dimK); - tt2.AddMatMat(1.0, V, kTrans, tt1, kNoTrans, 0.0); - for (MatrixIndexT i = 0;i < dimK;i++) - for (MatrixIndexT j = 0;j < dimK;j++) - { - if (i == j) - I(i, j) = 1.0; - else - I(i, j) = 0.0; - } - tt2.AddMat(1.0, I); // I = identity - tt2.Invert(); - - Matrix tt3(dimK, dimM); - tt3.AddMatMat(1.0, tt2, kNoTrans, tt, kNoTrans, 0.0); // tt = tt*tran(V) - Matrix tt4(dimM, dimM); - tt4.AddMatMat(1.0, U, kNoTrans, tt3, kNoTrans, 0.0); // tt = U*tt - Matrix tt5(dimM, dimM); - tt5.AddMatMat(1.0, invA, kNoTrans, tt4, kNoTrans, 0.0); // tt = inv(A)*tt - - Matrix tmpR(dimM, dimM); - tmpR.CopyFromMat(invA); - tmpR.AddMat(-1.0, tt5); - // printf("#################%f###############################.%f##############", tmpL, tmpR); - - AssertEqual(tmpL, tmpR); // checks whether LHS = RHS or not... 
- - } -} - - template static void UnitTestTraceProduct() { for (MatrixIndexT iter = 0;iter < 5;iter++) { // First test the 2 det routines are the same int dimM = 10 + Rand() % 10, dimN = 10 + Rand() % 10; Matrix M(dimM, dimN), N(dimM, dimN); - InitRand(&M); - InitRand(&N); + M.SetRandn(); + N.SetRandn(); Matrix Nt(N, kTrans); Real a = TraceMatMat(M, Nt), b = TraceMatMat(M, N, kTrans); printf("m = %d, n = %d\n", dimM, dimN); @@ -1368,7 +1294,7 @@ template static void UnitTestSvd() { Matrix M(dimM, dimN); Matrix U(dimM, std::min(dimM, dimN)), Vt(std::min(dimM, dimN), dimN); Vector s(std::min(dimM, dimN)); - InitRand(&M); + M.SetRandn(); if (iter < 2) KALDI_LOG << "M " << M; Matrix M2(dimM, dimN); M2.CopyFromMat(M); M.Svd(&s, &U, &Vt); @@ -1436,7 +1362,7 @@ template static void UnitTestSvdNodestroy() { MatrixIndexT minsz = std::min(dimM, dimN); Matrix M(dimM, dimN); Matrix U(dimM, minsz), Vt(minsz, dimN); Vector v(minsz); - InitRand(&M); + M.SetRandn(); if (iter < 2) KALDI_LOG << "M " << M; M.Svd(&v, &U, &Vt); if (iter < 2) { @@ -1466,26 +1392,6 @@ template static void UnitTestSvdNodestroy() { } -/* - template static void UnitTestSvdVariants() { // just make sure it doesn't crash if we call it but don't want left or right singular vectors. there are KALDI_ASSERTs inside the Svd. - #ifndef HAVE_ATLAS - MatrixIndexT Base = 10, Rand_ = 5, Iter = 25; - for (MatrixIndexT iter = 0;iter < Iter;iter++) { - MatrixIndexT dimM = Base + Rand() % Rand_, dimN = Base + Rand() % Rand_; - // if (dimM=N. 
- Matrix M(dimM, dimN); - Matrix U(dimM, dimM), Vt(dimN, dimN); Vector v(std::min(dimM, dimN)); - Matrix Utmp(dimM, 1); Matrix Vttmp(1, dimN); - InitRand(&M); - M.Svd(v, U, Vttmp, "A", "N"); - M.Svd(v, Utmp, Vt, "N", "A"); - Matrix U2(dimM, dimM), Vt2(dimN, dimN); Vector v2(std::min(dimM, dimN)); - M.Svd(v, U2, Vt2, "A", "A"); - AssertEqual(U, U2); AssertEqual(Vt, Vt2); - - } - #endif - }*/ template static void UnitTestSvdJustvec() { // Making sure gives same answer if we get just the vector, not the eigs. MatrixIndexT Base = 10, Rand_ = 5, Iter = 25; @@ -1507,7 +1413,7 @@ template static void UnitTestEigSymmetric() { for (MatrixIndexT iter = 0;iter < 5;iter++) { MatrixIndexT dimM = 20 + Rand()%10; SpMatrix S(dimM); - InitRand(&S); + S.SetRandn(); Matrix M(S); // copy to regular matrix. Matrix P(dimM, dimM); Vector real_eigs(dimM), imag_eigs(dimM); @@ -1530,7 +1436,7 @@ template static void UnitTestEig() { else dimM = 5 + Rand()%10; */ Matrix M(dimM, dimM); - InitRand(&M); + InitRandNonsingular(&M); Matrix P(dimM, dimM); Vector real_eigs(dimM), imag_eigs(dimM); M.Eig(&P, &real_eigs, &imag_eigs); @@ -1583,12 +1489,12 @@ template static void UnitTestEigSp() { case 0: // zero matrix. break; case 1: // general random symmetric matrix. - InitRand(&S); + InitRandNonsingular(&S); break; default: { // start with a random matrix; do decomposition; change the eigenvalues to // try to cover the problematic cases; reconstruct. - InitRand(&S); + InitRandNonsingular(&S); Vector s(dimM); Matrix P(dimM, dimM); S.Eig(&s, &P); // We on purpose create a problematic case where @@ -1626,8 +1532,7 @@ template static void UnitTestEigSp() { } } -// TEMP! -template +template static Real NonOrthogonality(const MatrixBase &M, MatrixTransposeType transM) { SpMatrix S(transM == kTrans ? 
M.NumCols() : M.NumRows()); S.SetUnit(); @@ -1686,7 +1591,7 @@ static void UnitTestTridiagonalize() { dim = 16; SpMatrix S(dim), S2(dim), R(dim), S3(dim); Matrix Q(dim, dim); - InitRand(&S); + InitRandNonsingular(&S); // Very small or large scaling is challenging to qr due to squares that // could go out of range. if (Rand() % 3 == 0) @@ -1746,7 +1651,7 @@ static void UnitTestTridiagonalizeAndQr() { MatrixIndexT dim = 50 + Rand() % 4; SpMatrix S(dim), S2(dim), R(dim), S3(dim), S4(dim); Matrix Q(dim, dim); - InitRand(&S); + InitRandNonsingular(&S); SpMatrix T(S); T.Tridiagonalize(&Q); KALDI_LOG << "S trace " << S.Trace() << ", T trace " << T.Trace(); @@ -1788,8 +1693,8 @@ template static void UnitTestMmul() { MatrixIndexT dimM = 20 + Rand()%10, dimN = 20 + Rand()%10, dimO = 20 + Rand()%10; // dims between 10 and 20. // MatrixIndexT dimM = 2, dimN = 3, dimO = 4; Matrix A(dimM, dimN), B(dimN, dimO), C(dimM, dimO); - InitRand(&A); - InitRand(&B); + A.SetRandn(); + B.SetRandn(); // // KALDI_LOG <<"a = " << A; // KALDI_LOG<<"B = " << B; @@ -1817,7 +1722,9 @@ template static void UnitTestMmulSym() { Matrix A(dimM, dimM), B(dimM, dimM), C(dimM, dimM), tmp(dimM, dimM), tmp2(dimM, dimM); SpMatrix sA(dimM), sB(dimM), sC(dimM), stmp(dimM); - InitRand(&A); InitRand(&B); InitRand(&C); + A.SetRandn(); + B.SetRandn(); + C.SetRandn(); // Make A, B, C symmetric. 
tmp.CopyFromMat(A); A.AddMat(1.0, tmp, kTrans); tmp.CopyFromMat(B); B.AddMat(1.0, tmp, kTrans); @@ -1865,7 +1772,7 @@ template static void UnitTestVecmul() { Real alpha = 0.333, beta = 0.5; Matrix A(dimM, dimN); if (trans == kTrans) A.Transpose(); - InitRand(&A); + A.SetRandn(); Vector x(dimM), y(dimN); //x.SetRandn(); y.SetRandn(); @@ -1894,7 +1801,7 @@ template static void UnitTestInverse() { for (MatrixIndexT iter = 0;iter < 10;iter++) { MatrixIndexT dimM = 20 + Rand()%10; Matrix A(dimM, dimM), B(dimM, dimM), C(dimM, dimM); - InitRand(&A); + InitRandNonsingular(&A); B.CopyFromMat(A); B.Invert(); @@ -1914,8 +1821,8 @@ template static void UnitTestMulElements() { for (MatrixIndexT iter = 0; iter < 5; iter++) { MatrixIndexT dimM = 20 + Rand()%10, dimN = 20 + Rand()%10; Matrix A(dimM, dimN), B(dimM, dimN), C(dimM, dimN); - InitRand(&A); - InitRand(&B); + A.SetRandn(); + B.SetRandn(); C.CopyFromMat(A); C.MulElements(B); // C = A .* B (in Matlab, for example). @@ -1931,7 +1838,8 @@ template static void UnitTestSpLogExp() { for (MatrixIndexT i = 0; i < 5; i++) { MatrixIndexT dimM = 10 + Rand() % 10; - Matrix M(dimM, dimM); InitRand(&M); + Matrix M(dimM, dimM); + InitRandNonsingular(&M); SpMatrix B(dimM); B.AddMat2(1.0, M, kNoTrans, 0.0); // B = M*M^T -> positive definite (since M nonsingular). 
@@ -1956,8 +1864,8 @@ template static void UnitTestDotprod() { MatrixIndexT dimM = 200 + Rand()%100; Vector v(dimM), w(dimM); - InitRand(&v); - InitRand(&w); + v.SetRandn(); + w.SetRandn(); Vector wd(w); Real f = VecVec(w, v), f2 = VecVec(wd, v), f3 = VecVec(v, wd); @@ -1979,13 +1887,13 @@ static void UnitTestResize() { for (MatrixIndexT j = 0; j < 3; j++) { MatrixResizeType resize_type = static_cast(j); Matrix M(dimM1, dimN1); - InitRand(&M); + M.SetRandn(); Matrix Mcopy(M); Vector v(dimM1); - InitRand(&v); + v.SetRandn(); Vector vcopy(v); SpMatrix S(dimM1); - InitRand(&S); + S.SetRandn(); SpMatrix Scopy(S); M.Resize(dimM2, dimN2, resize_type); v.Resize(dimM2, resize_type); @@ -2020,9 +1928,9 @@ static void UnitTestTp2Sp() { MatrixIndexT dimM = 10 + Rand()%3; TpMatrix T(dimM); - InitRand(&T); + T.SetRandn(); SpMatrix S(dimM); - InitRand(&S); + S.SetRandn(); Matrix M(T); for ( MatrixIndexT i = 0; i < dimM; i++) @@ -2045,7 +1953,7 @@ static void UnitTestTp2() { MatrixIndexT dimM = 10 + Rand()%3; TpMatrix T(dimM); - InitRand(&T); + T.SetRandn(); Matrix M(T); @@ -2062,11 +1970,11 @@ static void UnitTestAddDiagMat2() { MatrixIndexT dimM = 10 + Rand() % 3, dimN = 1 + Rand() % 4; Vector v(dimM); - InitRand(&v); + v.SetRandn(); Vector w(v); if (iter % 2 == 1) { Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); v.AddDiagMat2(0.5, M, kNoTrans, 0.3); Matrix M2(dimM, dimM); M2.AddMatMat(1.0, M, kNoTrans, M, kTrans, 0.0); @@ -2077,7 +1985,7 @@ static void UnitTestAddDiagMat2() { AssertEqual(w, v); } else { Matrix M(dimN, dimM); - InitRand(&M); + M.SetRandn(); v.AddDiagMat2(0.5, M, kTrans, 0.3); Matrix M2(dimM, dimM); M2.AddMatMat(1.0, M, kTrans, M, kNoTrans, 0.0); @@ -2104,7 +2012,7 @@ static void UnitTestAddDiagMatMat() { MatrixTransposeType transN = ((iter/2) % 2 == 0 ? 
kNoTrans : kTrans); Matrix M(M_orig, transM), N(N_orig, transN); - InitRand(&v); + v.SetRandn(); Vector w(v); w.AddDiagMatMat(alpha, M, transM, N, transN, beta); @@ -2219,7 +2127,7 @@ static void UnitTestRankNUpdate() { SpMatrix Ap(dimA); SpMatrix Ap2(dimA); Matrix M(dimO, dimA); - InitRand(&M); + M.SetRandn(); Matrix N(M, kTrans); Af.AddMatMat(1.0, M, kTrans, M, kNoTrans, 0.0); Ap.AddMat2(1.0, M, kTrans, 0.0); @@ -2279,7 +2187,8 @@ template static void UnitTestLimitCondInvert() { MatrixIndexT dimN = dimM + 1 + Rand()%10; SpMatrix B(dimM); - Matrix X(dimM, dimN); InitRand(&X); + Matrix X(dimM, dimN); + X.SetRandn(); B.AddMat2(1.0, X, kNoTrans, 0.0); // B = X*X^T -> positive definite (almost certainly), since N > M. @@ -2299,16 +2208,17 @@ template static void UnitTestFloorChol() { MatrixIndexT dimN = 20 + Rand()%10; - Matrix X(dimM, dimN); InitRand(&X); + Matrix X(dimM, dimN); + X.SetRandn(); SpMatrix B(dimM); B.AddMat2(1.0, X, kNoTrans, 0.0); // B = X*X^T -> positive semidefinite. float alpha = (Rand() % 10) + 0.5; Matrix M(dimM, dimM); - InitRand(&M); + M.SetRandn(); SpMatrix C(dimM); C.AddMat2(1.0, M, kNoTrans, 0.0); // C:=M*M^T - InitRand(&M); + InitRandNonsingular(&M); C.AddMat2(1.0, M, kNoTrans, 1.0); // C+=M*M^T (after making new random M) if (i%2 == 0) C.Scale(0.001); // so it's not too much bigger than B (or it's trivial) @@ -2317,7 +2227,7 @@ template static void UnitTestFloorChol() { for (MatrixIndexT j = 0;j < 10;j++) { Vector v(dimM); - InitRand(&v); + v.SetRandn(); Real ip_b = VecSpVec(v, B, v); Real ip_a = VecSpVec(v, BFloored, v); Real ip_c = alpha * VecSpVec(v, C, v); @@ -2336,11 +2246,13 @@ template static void UnitTestFloorUnit() { MatrixIndexT dimN = 20 + Rand()%10; float floor = (Rand() % 10) - 3; - Matrix M(dimM, dimN); InitRand(&M); + Matrix M(dimM, dimN); + M.SetRandn(); SpMatrix B(dimM); B.AddMat2(1.0, M, kNoTrans, 0.0); // B = M*M^T -> positive semidefinite. 
- SpMatrix BFloored(B); BFloored.ApplyFloor(floor); + SpMatrix BFloored(B); + BFloored.ApplyFloor(floor); Vector s(dimM); Matrix P(dimM, dimM); B.SymPosSemiDefEig(&s, &P); @@ -2376,7 +2288,8 @@ template static void UnitTestMat2Vec() { for (MatrixIndexT i = 0; i < 5; i++) { MatrixIndexT dimM = 10 + Rand() % 10; - Matrix M(dimM, dimM); InitRand(&M); + Matrix M(dimM, dimM); + M.SetRandn(); SpMatrix B(dimM); B.AddMat2(1.0, M, kNoTrans, 0.0); // B = M*M^T -> positive definite (since M nonsingular). @@ -2490,7 +2403,8 @@ template static void UnitTestSimple() { KALDI_ASSERT(!M.IsZero()); KALDI_ASSERT(M.IsDiagonal()); - SpMatrix S(dimM); InitRand(&S); + SpMatrix S(dimM); + S.SetRandn(); Matrix N(S); KALDI_ASSERT(!N.IsDiagonal()); // technically could be diagonal, but almost infinitely unlikely. KALDI_ASSERT(N.IsSymmetric()); @@ -2499,7 +2413,8 @@ template static void UnitTestSimple() { M.SetZero(); KALDI_ASSERT(M.IsZero()); - Vector V(dimM*dimN); InitRand(&V); + Vector V(dimM*dimN); + V.SetRandn(); Vector V2(V), V3(dimM*dimN); V2.ApplyExp(); AssertEqual(V.Sum(), V2.SumLog()); @@ -2549,14 +2464,14 @@ template static void UnitTestIo() { dimM = 0;dimN = 0; // test case when both are zero. } Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); Matrix N; Vector v1(dimM); - InitRand(&v1); + v1.SetRandn(); Vector v2(dimM); SpMatrix S(dimM); - InitRand(&S); + S.SetRandn(); KALDI_LOG << "SpMatrix IO: " << S; SpMatrix T(dimM); @@ -2614,16 +2529,16 @@ template static void UnitTestIoCross() { // across types. } Matrix M(dimM, dimN); Matrix MO; - InitRand(&M); + M.SetRandn(); Matrix N(dimM, dimN); Vector v(dimM); Vector vO; - InitRand(&v); + v.SetRandn(); Vector w(dimM); SpMatrix S(dimM); SpMatrix SO; - InitRand(&S); + S.SetRandn(); SpMatrix T(dimM); { @@ -2675,7 +2590,7 @@ template static void UnitTestHtkIo() { hdr.mSampleKind = 8; // Mel spectrum. 
Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); { std::ofstream os("tmpf", std::ios::out|std::ios::binary); @@ -2709,7 +2624,7 @@ template static void UnitTestRange() { // Testing SubMatrix clas MatrixIndexT dimN = (Rand()%10) + 10; Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); MatrixIndexT dimMStart = Rand() % 5; MatrixIndexT dimNStart = Rand() % 5; @@ -2725,7 +2640,7 @@ template static void UnitTestRange() { // Testing SubMatrix clas for (MatrixIndexT j = dimNStart;j < dimNEnd;j++) KALDI_ASSERT(M(i, j) == sub(i-dimMStart, j-dimNStart)); - InitRand(&sub); + sub.SetRandn(); KALDI_ASSERT(sub.Sum() == M.Range(dimMStart, dimMEnd-dimMStart, dimNStart, dimNEnd-dimNStart).Sum()); @@ -2739,7 +2654,7 @@ template static void UnitTestRange() { // Testing SubMatrix clas MatrixIndexT length = (Rand()%10) + 10; Vector V(length); - InitRand(&V); + V.SetRandn(); MatrixIndexT lenStart = Rand() % 5; MatrixIndexT lenEnd = lenStart + 1 + (Rand()%10); if (lenEnd > length) lenEnd = length; @@ -2751,7 +2666,7 @@ template static void UnitTestRange() { // Testing SubMatrix clas for (MatrixIndexT i = lenStart;i < lenEnd;i++) KALDI_ASSERT(V(i) == sub(i-lenStart)); - InitRand(&sub); + sub.SetRandn(); KALDI_ASSERT(sub.Sum() == V.Range(lenStart, lenEnd-lenStart).Sum()); @@ -2777,7 +2692,7 @@ template static void UnitTestScale() { // now test scale_rows M.CopyFromMat(N); // make same. Vector V(dimM); - InitRand(&V); + V.SetRandn(); M.MulRowsVec(V); for (MatrixIndexT i = 0; i < dimM;i++) for (MatrixIndexT j = 0;j < dimN;j++) @@ -2788,7 +2703,7 @@ template static void UnitTestScale() { // now test scale_cols M.CopyFromMat(N); // make same. 
Vector V(dimN); - InitRand(&V); + V.SetRandn(); M.MulColsVec(V); for (MatrixIndexT i = 0; i < dimM;i++) for (MatrixIndexT j = 0;j < dimN;j++) @@ -2806,8 +2721,10 @@ template static void UnitTestMul() { if (i%3 == 0) beta = 0.5; if (i%5 == 0) alpha = 0.7; MatrixIndexT dimM = (Rand()%10) + 10; - Vector v(dimM); InitRand(&v); - TpMatrix T(dimM); InitRand(&T); + Vector v(dimM); + v.SetRandn(); + TpMatrix T(dimM); + T.SetRandn(); Matrix M(dimM, dimM); if (i%2 == 1) M.CopyFromTp(T, trans); @@ -2832,8 +2749,10 @@ template static void UnitTestMul() { if (i%5 == 0) alpha = 0.7; MatrixIndexT dimM = (Rand()%10) + 10; - Vector v(dimM); InitRand(&v); - SpMatrix T(dimM); InitRand(&T); + Vector v(dimM); + v.SetRandn(); + SpMatrix T(dimM); + T.SetRandn(); Matrix M(T); Vector v2(dimM); Vector v3(dimM); @@ -2849,9 +2768,9 @@ template static void UnitTestInnerProd() { MatrixIndexT N = 1 + Rand() % 10; SpMatrix S(N); - InitRand(&S); + S.SetRandn(); Vector v(N); - InitRand(&v); + v.SetRandn(); Real prod = VecSpVec(v, S, v); Real f2=0.0; for (MatrixIndexT i = 0; i < N; i++) @@ -2865,7 +2784,7 @@ template static void UnitTestInnerProd() { template static void UnitTestAddToDiag() { MatrixIndexT N = 1 + Rand() % 10; SpMatrix S(N); - InitRand(&S); + S.SetRandn(); SpMatrix S2(S); Real x = 0.5; S.AddToDiag(x); @@ -2877,7 +2796,7 @@ template static void UnitTestScaleDiag() { MatrixIndexT N = 1 + Rand() % 10; SpMatrix S(N); - InitRand(&S); + S.SetRandn(); SpMatrix S2(S); S.ScaleDiag(0.5); for (MatrixIndexT i = 0; i < N; i++) S2(i, i) *= 0.5; @@ -2900,8 +2819,8 @@ template static void UnitTestTraceSpSpLower() { MatrixIndexT N = 1 + Rand() % 10; SpMatrix S(N), T(N); - InitRand(&S); - InitRand(&T); + S.SetRandn(); + T.SetRandn(); SpMatrix Sfast(S); Sfast.Scale(2.0); @@ -3055,11 +2974,12 @@ template static void UnitTestSolve() { MatrixIndexT dimN = dimM - (Rand()%3); // slightly lower-dim. 
SpMatrix H(dimM); - Matrix M(dimM, dimN); InitRand(&M); + Matrix M(dimM, dimN); + M.SetRandn(); H.AddMat2(1.0, M, kNoTrans, 0.0); // H = M M^T Vector x(dimM); - InitRand(&x); + x.SetRandn(); Vector tmp(dimM); tmp.SetRandn(); Vector g(dimM); @@ -3095,16 +3015,16 @@ template static void UnitTestSolve() { SpMatrix Q(dimM), SigmaInv(dimO); Matrix Mtmp(dimM, dimN); - InitRand(&Mtmp); + Mtmp.SetRandn(); Q.AddMat2(1.0, Mtmp, kNoTrans, 0.0); // H = M M^T Matrix Ntmp(dimO, dimN); - InitRand(&Ntmp); + Ntmp.SetRandn(); SigmaInv.AddMat2(1.0, Ntmp, kNoTrans, 0.0); // H = M M^T Matrix M(dimO, dimM), Y(dimO, dimM); - InitRand(&M); - InitRand(&Y); + M.SetRandn(); + Y.SetRandn(); Matrix M2(M); @@ -3149,8 +3069,8 @@ template static void UnitTestSolve() { Matrix M(dimO, dimM), G(dimO, dimM); M.SetRandn(); G.SetRandn(); - // InitRand(&M); - // InitRand(&G); + // InitRandNonsingular(&M); + // InitRandNonsingular(&G); Matrix M2(M); @@ -3209,7 +3129,7 @@ template static void UnitTestLbfgs() { SpMatrix S(dim); RandPosdefSpMatrix(dim, &S); Vector v(dim); - InitRand(&v); + v.SetRandn(); // Function will be f = exp(0.1 * [ x' v -0.5 x' S x ]) // This is to maximize; we negate it when minimizing. @@ -3222,7 +3142,7 @@ template static void UnitTestLbfgs() { x_opt.AddSpVec(1.0, Sinv, v, 0.0); // S^{-1} v-- the optimum. Vector init_x(dim); - InitRand(&init_x); + init_x.SetRandn(); LbfgsOptions opts; opts.minimize = minimize; // This objf has a maximum, not a minimum. 
@@ -3315,7 +3235,7 @@ template static void UnitTestMaxMin() { MatrixIndexT M = 1 + Rand() % 10, N = 1 + Rand() % 10; { Vector v(N); - InitRand(&v); + v.SetRandn(); Real min = 1.0e+10, max = -1.0e+10; for (MatrixIndexT i = 0; i< N; i++) { min = std::min(min, v(i)); @@ -3326,7 +3246,7 @@ template static void UnitTestMaxMin() { } { SpMatrix S(N); - InitRand(&S); + S.SetRandn(); Real min = 1.0e+10, max = -1.0e+10; for (MatrixIndexT i = 0; i< N; i++) { for (MatrixIndexT j = 0; j <= i; j++) { @@ -3339,7 +3259,7 @@ template static void UnitTestMaxMin() { } { Matrix mat(M, N); - InitRand(&mat); + mat.SetRandn(); Real min = 1.0e+10, max = -1.0e+10; for (MatrixIndexT i = 0; i< M; i++) { for (MatrixIndexT j = 0; j < N; j++) { @@ -3362,7 +3282,7 @@ template static void UnitTestTrace() { for (MatrixIndexT i = 0;i < 5;i++) { MatrixIndexT dimM = 20 + Rand()%10, dimN = 20 + Rand()%10, dimO = 20 + Rand()%10, dimP = dimM; Matrix A(dimM, dimN), B(dimN, dimO), C(dimO, dimP); - InitRand(&A); InitRand(&B); InitRand(&C); + A.SetRandn(); B.SetRandn(); C.SetRandn(); Matrix AT(dimN, dimM), BT(dimO, dimN), CT(dimP, dimO); AT.CopyFromMat(A, kTrans); BT.CopyFromMat(B, kTrans); CT.CopyFromMat(C, kTrans); @@ -3404,9 +3324,9 @@ template static void UnitTestTrace() { for (MatrixIndexT i = 0;i < 5;i++) { MatrixIndexT dimM = 20 + Rand()%10, dimN = 20 + Rand()%10; SpMatrix S(dimM), T(dimN); - InitRand(&S); InitRand(&T); + S.SetRandn(); T.SetRandn(); Matrix M(dimM, dimN), O(dimM, dimN); - InitRand(&M); InitRand(&O); + M.SetRandn(); O.SetRandn(); Matrix sM(S), tM(T); Real x1 = TraceMatMat(tM, tM); @@ -3431,7 +3351,7 @@ template static void UnitTestComplexFt() { for (MatrixIndexT d = 0; d < 10; d++) { MatrixIndexT N = Rand() % 100, twoN = 2*N; Vector v(twoN), w(twoN), x(twoN); - InitRand(&v); + v.SetRandn(); ComplexFt(v, &w, true); ComplexFt(w, &x, false); if (N>0) x.Scale(1.0/static_cast(N)); @@ -3465,7 +3385,7 @@ template static void UnitTestComplexFft() { MatrixIndexT twoN = 2*N; Vector v(twoN), 
w_base(twoN), w_alg(twoN), x_base(twoN), x_alg(twoN); - InitRand(&v); + v.SetRandn(); if (N< 100) ComplexFt(v, &w_base, true); w_alg.CopyFromVec(v); @@ -3496,7 +3416,7 @@ template static void UnitTestSplitRadixComplexFft() { for (MatrixIndexT p = 0; p < 3; p++) { Vector v(twoN), w_base(twoN), w_alg(twoN), x_base(twoN), x_alg(twoN); - InitRand(&v); + v.SetRandn(); if (N< 100) ComplexFt(v, &w_base, true); w_alg.CopyFromVec(v); @@ -3524,7 +3444,7 @@ template static void UnitTestSplitRadixComplexFft() { template static void UnitTestTranspose() { Matrix M(Rand() % 5 + 1, Rand() % 10 + 1); - InitRand(&M); + M.SetRandn(); Matrix N(M, kTrans); N.Transpose(); AssertEqual(M, N); @@ -3537,9 +3457,9 @@ template static void UnitTestAddVecToRows() { for (int i = 0; i < 2; i++) { MatrixIndexT dimM = sizes[i] + Rand() % 10, dimN = sizes[i] + Rand() % 10; Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); Vector v(M.NumCols()); - InitRand(&v); + v.SetRandn(); Matrix N(M); Vector ones(M.NumRows()); ones.Set(1.0); @@ -3577,9 +3497,9 @@ template static void UnitTestAddVecToCols() { for (int i = 0; i < 2; i++) { MatrixIndexT dimM = sizes[i] + Rand() % 10, dimN = sizes[i] + Rand() % 10; Matrix M(dimM, dimN); - InitRand(&M); + M.SetRandn(); Vector v(M.NumRows()); - InitRand(&v); + v.SetRandn(); Matrix N(M); Vector ones(M.NumCols()); ones.Set(1.0); @@ -3630,7 +3550,7 @@ template static void UnitTestSplitRadixComplexFft2() { SplitRadixComplexFft srfft(N); for (MatrixIndexT q = 0; q < 3; q++) { Vector v(N*2), vorig(N*2); - InitRand(&v); + v.SetRandn(); vorig.CopyFromVec(v); srfft.Compute(v.Data(), true); // forward srfft.Compute(v.Data(), false); // backward @@ -3648,7 +3568,7 @@ template static void UnitTestRealFft() { MatrixIndexT N = N_; if (N >90) N *= Rand() % 60; Vector v(N), w(N), x(N), y(N); - InitRand(&v); + v.SetRandn(); w.CopyFromVec(v); RealFftInefficient(&w, true); y.CopyFromVec(v); @@ -3672,14 +3592,14 @@ template static void UnitTestRealFft() { template static void 
UnitTestSplitRadixRealFft() { for (MatrixIndexT p = 0; p < 30; p++) { - MatrixIndexT logn = 2 + Rand() % 11, + MatrixIndexT logn = 2 + Rand() % 9, N = 1 << logn; SplitRadixRealFft srfft(N), srfft2(srfft); std::vector temp_buffer; for (MatrixIndexT q = 0; q < 3; q++) { Vector v(N), w(N), x(N), y(N); - InitRand(&v); + v.SetRandn(); w.CopyFromVec(v); RealFftInefficient(&w, true); y.CopyFromVec(v); @@ -3789,7 +3709,7 @@ void UnitTestNonsymmetricPower() { for (MatrixIndexT iter = 0; iter < 30; iter++) { MatrixIndexT dimM = 1 + Rand() % 20; Matrix M(dimM, dimM); - InitRand(&M); + M.SetRandn(); Matrix MM(dimM, dimM); MM.AddMatMat(1.0, M, kNoTrans, M, kNoTrans, 0.0); // MM = M M. @@ -3808,7 +3728,7 @@ void UnitTestNonsymmetricPower() { for (MatrixIndexT iter = 0; iter < 30; iter++) { MatrixIndexT dimM = 1 + Rand() % 20; Matrix M(dimM, dimM); - InitRand(&M); + M.SetRandn(); Matrix MM(dimM, dimM); MM.AddMatMat(1.0, M, kNoTrans, M, kNoTrans, 0.0); // MM = M M. @@ -3847,9 +3767,9 @@ void UnitTestNonsymmetricPower() { void UnitTestAddVecCross() { Vector v(5); - InitRand(&v); + v.SetRandn(); Vector w(5); - InitRand(&w); + w.SetRandn(); Vector wf(w); @@ -3888,7 +3808,7 @@ template static void UnitTestMatrixExponential() { for (MatrixIndexT p = 0; p < 10; p++) { MatrixIndexT dim = 1 + Rand() % 5; Matrix M(dim, dim); - InitRand(&M); + InitRandNonsingular(&M); { // work out largest eig. Real largest_eig = 0.0; Vector real_eigs(dim), imag_eigs(dim); @@ -3929,11 +3849,9 @@ static void UnitTestMatrixExponentialBackprop() { // f is tr(N^T exp(M)). backpropagating derivative // of this function. 
Matrix M(dim, dim), N(dim, dim), delta(dim, dim); - InitRand(&M); - // { SpMatrix tmp(dim); InitRand(&tmp); M.CopyFromSp(tmp); } - InitRand(&N); - InitRand(&delta); - // { SpMatrix tmp(dim); InitRand(&tmp); delta.CopyFromSp(tmp); } + InitRandNonsingular(&M); + InitRandNonsingular(&N); + InitRandNonsingular(&delta); delta.Scale(0.00001); @@ -3960,7 +3878,7 @@ static void UnitTestMatrixExponentialBackprop() { KALDI_LOG << "f_diff = " << f_diff; KALDI_LOG << "f_diff2 = " << f_diff2; - AssertEqual(f_diff, f_diff2); + KALDI_ASSERT(ApproxEqual(f_diff, f_diff2) || fabs(f_diff - f_diff2) < 1.0e-03); } } template @@ -4123,6 +4041,50 @@ static void UnitTestSvdSpeed() { } } +template static void UnitTestCompressedMatrix2() { + // These are some new tests added after we add the capability to + // specify the compression type. + + for (int32 i = 0; i < 10; i++) { + // test that the kTwoByteSignedInteger method works. + int32 num_rows = RandInt(1, 5), num_cols = RandInt(1, 10); + Matrix mat(num_rows, num_cols); + for (int32 j = 0; j < num_rows; j++) { + for (int32 k = 0; k < num_cols; k++) { + mat(j, k) = RandInt(-32768, 32767); + } + } + CompressedMatrix cmat(mat, kTwoByteSignedInteger); + + Matrix mat2(cmat); + + // Check that they are exactly equal. These integers should all be + // exactly representable, and exactly reconstructed after compression. + KALDI_ASSERT(ApproxEqual(mat, mat2, Real(0.0))); + } + + + for (int32 i = 0; i < 10; i++) { + // test that the kOneByteZeroOne compression method works. + int32 num_rows = RandInt(1, 5), num_cols = RandInt(1, 10); + Matrix mat(num_rows, num_cols); + for (int32 j = 0; j < num_rows; j++) { + for (int32 k = 0; k < num_cols; k++) { + mat(j, k) = RandInt(0, 255) / 255.0; + } + } + CompressedMatrix cmat(mat, kOneByteZeroOne); + + Matrix mat2(cmat); + + // Check that they are almost exactly equal. (It's not 100% exact because + // 1.0 / 255.0 is not exactly representable. 
+ KALDI_ASSERT(ApproxEqual(mat, mat2, Real(1.00001))); + } + +} + + template static void UnitTestCompressedMatrix() { // This is the basic test. @@ -4131,7 +4093,7 @@ template static void UnitTestCompressedMatrix() { KALDI_ASSERT(empty_cmat.NumCols() == 0); // could set num_tot to 10000 for more thorough testing. - MatrixIndexT num_failure = 0, num_tot = 10000, max_failure = 1; + MatrixIndexT num_failure = 0, num_tot = 1000, max_failure = 1; for (MatrixIndexT n = 0; n < num_tot; n++) { MatrixIndexT num_rows = Rand() % 20, num_cols = Rand() % 15; if (num_rows * num_cols == 0) { @@ -4143,7 +4105,7 @@ template static void UnitTestCompressedMatrix() { num_cols = 1 + Rand() % 3; } Matrix M(num_rows, num_cols); - if (Rand() % 3 != 0) InitRand(&M); + if (Rand() % 3 != 0) M.SetRandn(); else { M.Add(RandGauss()); } @@ -4161,7 +4123,16 @@ template static void UnitTestCompressedMatrix() { for (MatrixIndexT c = 0; c < num_cols; c++) if (Rand() % modulus != 0) M(r, c) = rand_val; - CompressedMatrix cmat(M); + + CompressionMethod method; + switch(RandInt(0, 3)) { + case 0: method = kAutomaticMethod; break; + case 1: method = kSpeechFeature; break; + case 2: method = kTwoByteAuto; break; + default: method = kOneByteAuto; break; + } + + CompressedMatrix cmat(M, method); KALDI_ASSERT(cmat.NumRows() == num_rows); KALDI_ASSERT(cmat.NumCols() == num_cols); @@ -4172,9 +4143,11 @@ template static void UnitTestCompressedMatrix() { diff.AddMat(-1.0, M); { // Check that when compressing a matrix that has already been compressed, - // and uncompressing, we get the same answer. - // ok, actually, we can't guarantee this, so just limit the number of failures. - CompressedMatrix cmat2(M2); + // and uncompressing, we get the same answer if using the same compression + // method. + // ok, actually, we can't guarantee this, so just limit the number of + // failures. 
+ CompressedMatrix cmat2(M2, method); Matrix M3(cmat.NumRows(), cmat.NumCols()); cmat2.CopyToMat(&M3); if (!M2.ApproxEqual(M3, 1.0e-04)) { @@ -4189,6 +4162,51 @@ template static void UnitTestCompressedMatrix() { } } + if (num_rows > 0) { // Check that the constructor accepting row and column offsets + // etc., works. + if (RandInt(0, 1) == 0) { + // test the ability of the self-constructor to do row-padding. (used in + // getting nnet3 examples without un-compressing and re-compressing the + // data_. + bool allow_row_padding = true; + int32 row_offset = RandInt(-4, num_rows - 1), + col_offset = RandInt(0, num_cols - 1), + num_rows_sub = RandInt(1, 4 + num_rows - row_offset), + num_cols_sub = RandInt(1, num_cols - col_offset); + CompressedMatrix cmat_sub(cmat, row_offset, num_rows_sub, + col_offset, num_cols_sub, allow_row_padding); + Matrix M2_sub(num_rows_sub, num_cols_sub); + for (int32 row = 0; row < num_rows_sub; row++) { + int32 old_row = row + row_offset; + if (old_row < 0) old_row = 0; + else if (old_row >= num_rows) { old_row = num_rows - 1; } + SubVector M2_sub_row(M2_sub, row), + M2_row(M2, old_row), + M2_row_part(M2_row, col_offset, num_cols_sub); + M2_sub_row.CopyFromVec(M2_row_part); + } + Matrix M3_sub(cmat_sub); + M3_sub.AddMat(-1.0, M2_sub); + KALDI_ASSERT(M3_sub.FrobeniusNorm() / (num_rows_sub * num_cols_sub) < + 1.0e-03); + } else { + int32 row_offset = RandInt(0, num_rows - 1), + col_offset = RandInt(0, num_cols - 1), + num_rows_sub = RandInt(1, num_rows - row_offset), + num_cols_sub = RandInt(1, num_cols - col_offset); + CompressedMatrix cmat_sub(cmat, row_offset, num_rows_sub, + col_offset, num_cols_sub); + SubMatrix M2_sub(M2, row_offset, num_rows_sub, + col_offset, num_cols_sub); + Matrix M3_sub(cmat_sub); + M3_sub.AddMat(-1.0, M2_sub); + KALDI_ASSERT(M3_sub.FrobeniusNorm() / (num_rows_sub * num_cols_sub) < + 1.0e-03); + } + } else { + CompressedMatrix cmat_sub(cmat, 0, 0, 0, 0); + } + // test CopyRowToVec for (MatrixIndexT i = 0; i < 
num_rows; i++) { Vector V(num_cols); @@ -4307,7 +4325,7 @@ template static void UnitTestGeneralMatrix() { KALDI_ASSERT(empty_pmat.NumCols() == 0); // could set num_tot to 10000 for more thorough testing. - MatrixIndexT num_failure = 0, num_tot = 10000, max_failure = 1; + MatrixIndexT num_failure = 0, num_tot = 1000, max_failure = 1; for (MatrixIndexT n = 0; n < num_tot; n++) { MatrixIndexT num_rows = Rand() % 20, num_cols = Rand() % 15; if (num_rows * num_cols == 0) { @@ -4319,7 +4337,8 @@ template static void UnitTestGeneralMatrix() { num_cols = 1 + Rand() % 3; } Matrix M(num_rows, num_cols); - if (Rand() % 3 != 0) InitRand(&M); + if (Rand() % 3 != 0) + M.SetRandn(); else { M.Add(RandGauss()); } @@ -4359,6 +4378,37 @@ template static void UnitTestGeneralMatrix() { Matrix diff(M2); diff.AddMat(-1.0, M); + if (pmat2.NumRows() > 0) { // Test ExtractRowRangeWithPadding. + int32 row_offset = RandInt(-3, pmat.NumRows() - 1), + num_rows = RandInt(1, 3 + pmat.NumRows() - row_offset); + GeneralMatrix pmat3; + ExtractRowRangeWithPadding(pmat2, row_offset, num_rows, &pmat3); + + Matrix mat_A(num_rows, pmat.NumCols()), + mat_B(num_rows, pmat.NumCols()); + pmat3.GetMatrix(&mat_A); + for (int32 row_out = 0; row_out < num_rows; row_out++) { + int32 row_in = row_out + row_offset; + if (row_in < 0) row_in = 0; + if (row_in >= pmat2.NumRows()) + row_in = pmat2.NumRows() - 1; + SubVector vec_out(mat_B, row_out), + vec_in(M2, row_in); + vec_out.CopyFromVec(vec_in); + } + if (mat_A.NumRows() >= 8) { + // there should be exact equality. + AssertEqual(mat_A, mat_B, 0.0); + } else { + // If it was compressed matrix and num-rows selected is < 8 and the + // format was the one with per-column headers, then we would have changed + // the format to save memory, which is not completely lossless. + mat_A.AddMat(-1.0, mat_B); + KALDI_ASSERT(mat_A.FrobeniusNorm() / + (mat_A.NumRows() * mat_A.NumCols()) < 0.001); + } + } + if (n < 5) { // test I/O. 
bool binary = (n % 2 == 1); { @@ -4582,6 +4632,7 @@ template static void MatrixUnitTest(bool full_test) { UnitTestLbfgs(); // UnitTestSvdBad(); // test bug in Jama SVD code. UnitTestCompressedMatrix(); + UnitTestCompressedMatrix2(); UnitTestExtractCompressedMatrix(); UnitTestResize(); UnitTestMatrixExponentialBackprop(); @@ -4642,7 +4693,6 @@ template static void MatrixUnitTest(bool full_test) { UnitTestTraceProduct(); UnitTestTransposeScatter(); UnitTestRankNUpdate(); - UnitTestSherman(); UnitTestSpVec(); UnitTestLimitCondInvert(); KALDI_LOG << " Point G"; @@ -4734,4 +4784,3 @@ int main() { KALDI_LOG << "Tests succeeded."; } - diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 477d36f190a..0f0158fb295 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -758,6 +758,12 @@ void GeneralMatrix::SwapSparseMatrix(SparseMatrix *smat) { smat->Swap(&smat_); } +void GeneralMatrix::SwapCompressedMatrix(CompressedMatrix *cmat) { + if (mat_.NumRows() != 0 || smat_.NumRows() != 0) + KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type."; + cmat->Swap(&cmat_); +} + const CompressedMatrix &GeneralMatrix::GetCompressedMatrix() const { if (mat_.NumRows() != 0 || smat_.NumRows() != 0) KALDI_ERR << "GetCompressedMatrix called on GeneralMatrix of wrong type."; @@ -1105,6 +1111,63 @@ void GeneralMatrix::Swap(GeneralMatrix *other) { } +void ExtractRowRangeWithPadding( + const GeneralMatrix &in, + int32 row_offset, + int32 num_rows, + GeneralMatrix *out) { + // make sure 'out' is empty to start with. + Matrix empty_mat; + *out = empty_mat; + if (num_rows == 0) return; + switch (in.Type()) { + case kFullMatrix: { + const Matrix &mat_in = in.GetFullMatrix(); + int32 num_rows_in = mat_in.NumRows(), num_cols = mat_in.NumCols(); + KALDI_ASSERT(num_rows_in > 0); // we can't extract >0 rows from an empty + // matrix. 
+ Matrix mat_out(num_rows, num_cols, kUndefined); + for (int32 row = 0; row < num_rows; row++) { + int32 row_in = row + row_offset; + if (row_in < 0) row_in = 0; + else if (row_in >= num_rows_in) row_in = num_rows_in - 1; + SubVector vec_in(mat_in, row_in), + vec_out(mat_out, row); + vec_out.CopyFromVec(vec_in); + } + out->SwapFullMatrix(&mat_out); + break; + } + case kSparseMatrix: { + const SparseMatrix &smat_in = in.GetSparseMatrix(); + int32 num_rows_in = smat_in.NumRows(), + num_cols = smat_in.NumCols(); + KALDI_ASSERT(num_rows_in > 0); // we can't extract >0 rows from an empty + // matrix. + SparseMatrix smat_out(num_rows, num_cols); + for (int32 row = 0; row < num_rows; row++) { + int32 row_in = row + row_offset; + if (row_in < 0) row_in = 0; + else if (row_in >= num_rows_in) row_in = num_rows_in - 1; + smat_out.SetRow(row, smat_in.Row(row_in)); + } + out->SwapSparseMatrix(&smat_out); + break; + } + case kCompressedMatrix: { + const CompressedMatrix &cmat_in = in.GetCompressedMatrix(); + bool allow_padding = true; + CompressedMatrix cmat_out(cmat_in, row_offset, num_rows, + 0, cmat_in.NumCols(), allow_padding); + out->SwapCompressedMatrix(&cmat_out); + break; + } + default: + KALDI_ERR << "Bad matrix type."; + } +} + + template class SparseVector; template class SparseVector; template class SparseMatrix; diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h index 9f9362542e1..427fd854fdf 100644 --- a/src/matrix/sparse-matrix.h +++ b/src/matrix/sparse-matrix.h @@ -226,6 +226,8 @@ enum GeneralMatrixType { /// targets which might be sparse or not, and might be compressed or not. class GeneralMatrix { public: + /// Returns the type of the matrix: kSparseMatrix, kCompressedMatrix or + /// kFullMatrix. If this matrix is empty, returns kFullMatrix. 
GeneralMatrixType Type() const; void Compress(); // If it was a full matrix, compresses, changing Type() to @@ -236,6 +238,7 @@ class GeneralMatrix { void Write(std::ostream &os, bool binary) const; + /// Note: if you write a compressed matrix in text form, it will be read as /// a regular full matrix. void Read(std::istream &is, bool binary); @@ -253,6 +256,10 @@ class GeneralMatrix { /// crash. const CompressedMatrix &GetCompressedMatrix() const; + /// Swaps the with the given CompressedMatrix. This will only work if + /// Type() returns kCompressedMatrix, or NumRows() == 0. + void SwapCompressedMatrix(CompressedMatrix *cmat); + /// Returns the contents as a Matrix. This will only work if /// Type() returns kFullMatrix, or NumRows() == 0; otherwise it will crash. const Matrix& GetFullMatrix() const; @@ -363,6 +370,20 @@ void FilterGeneralMatrixRows(const GeneralMatrix &in, const std::vector &keep_rows, GeneralMatrix *out); +/// This function extracts a row-range of a GeneralMatrix and writes +/// as a GeneralMatrix containing the same type of underlying +/// matrix. If the row-range is partly outside the row-range of 'in' +/// (i.e. if row_offset < 0 or row_offset + num_rows > in.NumRows()) +/// then it will pad with copies of the first and last row as +/// needed. +/// This is more efficient than un-compressing and +/// re-compressing the underlying CompressedMatrix, and causes +/// less accuracy loss due to re-compression (no loss in most cases). +void ExtractRowRangeWithPadding( + const GeneralMatrix &in, + int32 row_offset, + int32 num_rows, + GeneralMatrix *out); /// @} end of \addtogroup matrix_group diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index debd93599e9..ce7ebd1dd2a 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -125,7 +125,7 @@ struct ExampleGenerationConfig { po->Register("num-frames", &num_frames_str, "Number of frames with labels " "that each example contains (i.e. 
the left and right context " "are to be added to this). May just be an integer (e.g. " - "--num-frames=8), or an principal value followed by " + "--num-frames=8), or a principal value followed by " "alternative values to be used at most once for each utterance " "to deal with odd-sized input, e.g. --num-frames=40,25,50 means " "that most of the time the number of frames will be 40, but to " diff --git a/src/nnet3/nnet-example.cc b/src/nnet3/nnet-example.cc index c011f2a0b8a..684088b4b53 100644 --- a/src/nnet3/nnet-example.cc +++ b/src/nnet3/nnet-example.cc @@ -65,6 +65,16 @@ NnetIo::NnetIo(const std::string &name, indexes[i].t = t_begin + i; } +NnetIo::NnetIo(const std::string &name, + int32 t_begin, const GeneralMatrix &feats): + name(name), features(feats) { + int32 num_rows = feats.NumRows(); + KALDI_ASSERT(num_rows > 0); + indexes.resize(num_rows); // sets all n,t,x to zeros. + for (int32 i = 0; i < num_rows; i++) + indexes[i].t = t_begin + i; +} + void NnetIo::Swap(NnetIo *other) { name.swap(other->name); indexes.swap(other->indexes); diff --git a/src/nnet3/nnet-example.h b/src/nnet3/nnet-example.h index 347894e958c..903227e0b46 100644 --- a/src/nnet3/nnet-example.h +++ b/src/nnet3/nnet-example.h @@ -52,6 +52,13 @@ struct NnetIo { NnetIo(const std::string &name, int32 t_begin, const MatrixBase &feats); + /// This constructor creates NnetIo with name "name", indexes with n=0, x=0, + /// and t values ranging from t_begin to t_begin + feats.NumRows() - 1, and + /// the provided features. t_begin should be the frame that the first row + /// of 'feats' represents. + NnetIo(const std::string &name, + int32 t_begin, const GeneralMatrix &feats); + /// This constructor sets "name" to the provided string, sets "indexes" with /// n=0, x=0, and t from t_begin to t_begin + labels.size() - 1, and the labels /// as provided. t_begin should be the frame to which labels[0] corresponds. 
diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index 03623f02a07..efab3e89a5f 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -30,7 +30,7 @@ namespace kaldi { namespace nnet3 { -static bool ProcessFile(const MatrixBase &feats, +static bool ProcessFile(const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, const Posterior &pdf_post, @@ -66,22 +66,18 @@ static bool ProcessFile(const MatrixBase &feats, int32 tot_input_frames = chunk.left_context + chunk.num_frames + chunk.right_context; - Matrix input_frames(tot_input_frames, feats.NumCols(), - kUndefined); - int32 start_frame = chunk.first_frame - chunk.left_context; - for (int32 t = start_frame; t < start_frame + tot_input_frames; t++) { - int32 t2 = t; - if (t2 < 0) t2 = 0; - if (t2 >= num_input_frames) t2 = num_input_frames - 1; - int32 j = t - start_frame; - SubVector src(feats, t2), - dest(input_frames, j); - dest.CopyFromVec(src); - } - NnetExample eg; + GeneralMatrix input_frames; + ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames, + &input_frames); + // 'input_frames' now stores the relevant rows (maybe with padding) from the + // original Matrix or (more likely) CompressedMatrix. If a CompressedMatrix, + // it does this without un-compressing and re-compressing, so there is no loss + // of accuracy. + + NnetExample eg; // call the regular input "input". eg.io.push_back(NnetIo("input", -chunk.left_context, input_frames)); @@ -181,8 +177,11 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); - po.Register("compress", &compress, "If true, write egs in " - "compressed format (recommended)."); + po.Register("compress", &compress, "If true, write egs with input features " + "in compressed format (recommended). 
Update: this is now " + "only relevant if the features being read are un-compressed; " + "if already compressed, we keep we same compressed format when " + "dumping-egs."); po.Register("num-pdfs", &num_pdfs, "Number of pdfs in the acoustic " "model"); po.Register("ivectors", &online_ivector_rspecifier, "Alias for " @@ -213,8 +212,11 @@ int main(int argc, char *argv[]) { pdf_post_rspecifier = po.GetArg(2), examples_wspecifier = po.GetArg(3); - // Read in all the training files. - SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + // SequentialGeneralMatrixReader can read either a Matrix or + // CompressedMatrix (or SparseMatrix, but not as relevant here), + // and it retains the type. This way, we can generate parts of + // the feature matrices without uncompressing and re-compressing. + SequentialGeneralMatrixReader feat_reader(feature_rspecifier); RandomAccessPosteriorReader pdf_post_reader(pdf_post_rspecifier); NnetExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader online_ivector_reader( @@ -224,7 +226,7 @@ int main(int argc, char *argv[]) { for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); - const Matrix &feats = feat_reader.Value(); + const GeneralMatrix &feats = feat_reader.Value(); if (!pdf_post_reader.HasKey(key)) { KALDI_WARN << "No pdf-level posterior for key " << key; num_err++; diff --git a/src/util/table-types.h b/src/util/table-types.h index 819c98fdf82..efcdf1b564e 100644 --- a/src/util/table-types.h +++ b/src/util/table-types.h @@ -169,6 +169,17 @@ typedef RandomAccessTableReader RandomAccessTokenVectorReader; +typedef TableWriter > + GeneralMatrixWriter; +typedef SequentialTableReader > + SequentialGeneralMatrixReader; +typedef RandomAccessTableReader > + RandomAccessGeneralMatrixReader; +typedef RandomAccessTableReaderMapped > + RandomAccessGeneralMatrixReaderMapped; + + + /// @} // Note: for FST reader/writer, see ../fstext/fstext-utils.h