diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc
index fec75b01a3f..179a9a06443 100644
--- a/src/cudamatrix/cu-allocator.cc
+++ b/src/cudamatrix/cu-allocator.cc
@@ -78,6 +78,7 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
   num_system_allocations_++;
   void *ans;
   cudaError_t e;
+  size_t cuda_memory_marker = GetDeviceMemoryMarker();
   for (int32 i = 0; i <= 2; i++) {
     if (num_rows != 1) {
       CuTimer tim;
@@ -97,7 +98,7 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
         KALDI_WARN << "Allocation of " << row_bytes << " x " << num_rows
                    << " region failed: freeing some memory and "
                    << "trying again. ";
-        BaseFloat new_memory_factor = 1.1;
+        BaseFloat new_memory_factor = 1.03;
         if (opts_.memory_factor > new_memory_factor) {
           KALDI_LOG << "To avoid future problems like this, changing "
                     << "memory_factor from " << opts_.memory_factor << " to "
@@ -110,6 +111,8 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
             std::min(memory_cached, memory_requested));
         FreeSomeCachedMemory(memory_to_free);
+        cuda_memory_marker = GetDeviceMemoryMarker();  // refresh the memory marker
+                                                       // as we freed some (hopefully)
       } else {
         KALDI_ERR << "Cannot allocate the requested memory ("
                   << row_bytes << " x " << num_rows << " = "
@@ -117,12 +120,36 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
       }
       cudaGetLastError();  // Clear the error state.
     } else {
+      ReportAllocCallStats(row_bytes, num_rows, *pitch, cuda_memory_marker);
       break;
     }
   }
   return ans;
 }
 
+size_t CuMemoryAllocator::GetDeviceMemoryMarker() const {
+  size_t free_memory_now = 0;
+  if (GetVerboseLevel() >= 5)
+    cudaMemGetInfo(&free_memory_now, NULL);
+  return free_memory_now;
+}
+
+void CuMemoryAllocator::ReportAllocCallStats(int64 row_bytes,
+                                             int64 num_rows,
+                                             int64 pitch,
+                                             size_t marker) const {
+  size_t free_memory_now = 0;
+  if (GetVerboseLevel() >= 5) {
+    cudaMemGetInfo(&free_memory_now, NULL);
+    KALDI_VLOG(5) << "Alloc requested   : " << row_bytes << "x" << num_rows
+                  << " -> " << row_bytes * num_rows << " bytes.";
+    KALDI_VLOG(5) << "Alloc granted     : " << pitch << "x" << num_rows
+                  << " -> " << pitch * num_rows << " bytes.";
+    KALDI_VLOG(5) << "Alloc device bytes: " << (marker - free_memory_now)
+                  << " bytes (" << (marker - free_memory_now) / (1024 * 1024) << " MB)";
+  }
+}
+
 void CuMemoryAllocator::PrintMemoryUsage() const {
   KALDI_LOG << "Memory usage: " << cur_bytes_allocated_
             << " bytes currently allocated (max: "
@@ -158,6 +185,27 @@ CuMemoryAllocator::CuMemoryAllocator(CuAllocatorOptions opts):
 
 void* CuMemoryAllocator::MallocPitch(size_t row_bytes, size_t num_rows,
                                      size_t *pitch) {
+#if 0
+  t_++;
+  num_user_allocations_++;
+  void *tmp;
+  cudaError_t e;
+  size_t marker = GetDeviceMemoryMarker();
+  CuTimer tim;
+  e = cudaMallocPitch(&tmp, pitch, row_bytes, num_rows);
+  tot_time_taken_in_cuda_malloc_pitch_ += tim.Elapsed();
+  tot_time_taken_in_malloc_pitch_ += tim.Elapsed();
+  num_system_allocations_++;
+  if (e != cudaSuccess) {
+    PrintMemoryUsage();
+    cudaGetLastError();
+    KALDI_ERR << "Cannot allocate the requested memory ("
+              << row_bytes << " x " << num_rows << " = "
+              << row_bytes * num_rows << " bytes)";
+  }
+  ReportAllocCallStats(row_bytes, num_rows, *pitch, marker);
+  return tmp;
+#else
   CuTimer tim;
   t_++;
   num_user_allocations_++;
@@ -200,6 +248,7 @@ void* CuMemoryAllocator::MallocPitch(size_t row_bytes,
     tot_time_taken_in_malloc_pitch_ += tim.Elapsed();
     return ans;
   }
+#endif
 }
 
 void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
@@ -222,7 +271,7 @@ void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
   // we declare to be the time since we last used it multiplied by the size
   // of the memory in the pointer.
   std::vector<BaseFloat> size_factor(num_caches);
-  for (size_t i = 0, j = 1; i < num_caches; i++, j *= 2)
+  for (size_t i = 0, j = 1; i < num_caches; i++, j *= 1)
     size_factor[i] = j;
 
   std::priority_queue<std::pair<BaseFloat, size_t> > queue;
@@ -278,6 +327,11 @@ void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
 }
 
 void CuMemoryAllocator::Free(void *ptr) {
+#if 0
+  CuTimer tim;
+  cudaFree(ptr);
+  tot_time_taken_in_cuda_free_ += tim.Elapsed();
+#else
   t_++;
   unordered_map<void*, UsedMemoryElement>::iterator iter =
       used_map_.find(ptr);
@@ -294,6 +348,7 @@ void CuMemoryAllocator::Free(void *ptr) {
   cache.Insert(MemoryRequest(elem.row_bytes, elem.num_rows),
                CachedMemoryElement(ptr, t_, elem.pitch));
   used_map_.erase(iter);
+#endif
 }
 
 size_t CuMemoryAllocator::MruCache::LeastRecentTime() const {
diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h
index 0f96315e848..2ee76a65364 100644
--- a/src/cudamatrix/cu-allocator.h
+++ b/src/cudamatrix/cu-allocator.h
@@ -54,7 +54,7 @@ struct CuAllocatorOptions {
   // is a constant overhead proportional to the number of buckets.
   BaseFloat delete_factor;
 
-  CuAllocatorOptions(): memory_factor(1.3),
+  CuAllocatorOptions(): memory_factor(1.1),
                         delete_factor(0.001) { }
 
   void Check() {
@@ -110,6 +110,11 @@ class CuMemoryAllocator {
   CuMemoryAllocator(CuAllocatorOptions opts);
 
  private:
+  size_t GetDeviceMemoryMarker() const;
+  void ReportAllocCallStats(int64 row_bytes,
+                            int64 num_rows,
+                            int64 pitch,
+                            size_t marker) const;
   void FreeSomeCachedMemory(size_t bytes_to_free);
 
diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index 87e266e1889..b5b6d18e9ac 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -410,6 +410,7 @@ void CuDevice::AccuProfile(const char *function_name,
   }
 }
 
+
 void CuDevice::PrintMemoryUsage() const {
   if (Enabled()) {
     allocator_.PrintMemoryUsage();
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 99105355a8f..ce286ba02d4 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -74,6 +74,9 @@ class CuDevice {
                                    pitch);
     } else {
       return allocator_.MallocPitch(row_bytes, num_rows, pitch);
+      //void *tmp = allocator_.Malloc(row_bytes * num_rows);
+      //*pitch = row_bytes;
+      //return tmp;
     }
   }
   inline void Free(void *ptr) {
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 2080c60077b..dc4777307d5 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -92,6 +92,74 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
   num_minibatches_processed_++;
 }
 
+class ChainTrainerMemoryHolder {
+ public:
+  ChainTrainerMemoryHolder(const NnetChainTrainer *const trainer,
+                           NnetComputer &computer,
+                           const NnetChainExample &eg);
+ private:
+  CuMatrix<BaseFloat> nnet_output_;
+  CuMatrix<BaseFloat> nnet_output_deriv_;
+  CuMatrix<BaseFloat> beta_;
+  CuMatrix<BaseFloat> alpha_;
+
+};
+
+ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const NnetChainTrainer *const trainer,
+                                                   NnetComputer &computer,
+                                                   const NnetChainExample &eg) {
+
+  std::vector<NnetChainSupervision>::const_iterator iter = eg.outputs.begin(),
+      end = eg.outputs.end();
+
+  int max_rows = 0,
+      max_cols = 0;
+
+  size_t max_frames_per_sequence = 0,
+         max_sequence_size = 0,
+         max_alpha_matrix_size = 0;
+
+  for (; iter != end; ++iter) {
+    const NnetChainSupervision &sup = *iter;
+
+    int output_rows = sup.supervision.num_sequences * sup.supervision.frames_per_sequence;
+    int output_cols = trainer->nnet_->OutputDim("output");
+
+    size_t curr_frames_per_sequence = output_rows / sup.supervision.num_sequences + 1;
+    size_t den_graph_size = trainer->den_graph_.NumStates() + 1;
+    size_t curr_sequence_size = den_graph_size * sup.supervision.num_sequences;
+    size_t curr_alpha_matrix_size = curr_frames_per_sequence * curr_sequence_size;
+
+    if (curr_alpha_matrix_size > max_alpha_matrix_size) {
+      max_alpha_matrix_size = curr_alpha_matrix_size;
+      max_frames_per_sequence = curr_frames_per_sequence;
+      max_sequence_size = curr_sequence_size;
+    }
+
+    size_t matrix_size = static_cast<size_t>(output_rows) * output_cols;
+    if (matrix_size > static_cast<size_t>(max_rows) * max_cols) {
+      max_rows = output_rows;
+      max_cols = output_cols;
+    }
+
+  }
+
+  KALDI_VLOG(5) << "Pre-caching chain training memory";
+  // The sequence of resizes is in a specific order (bigger to smaller)
+  // so that cudaMalloc won't thrash the memory it has already
+  // allocated in the previous iterations.
+  alpha_.Resize(max_frames_per_sequence,
+                max_sequence_size,
+                kUndefined);
+
+  nnet_output_.Resize(max_rows, max_cols, kUndefined);
+  nnet_output_deriv_.Resize(max_rows, max_cols, kUndefined);
+
+  beta_.Resize(2, max_sequence_size, kUndefined);
+
+  KALDI_VLOG(5) << "Pre-caching chain training memory... done.";
+}
+
 void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
                                      const NnetComputation &computation) {
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
@@ -100,10 +168,17 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
   // store stats. This is mainly important for memory-norm.
   NnetComputer computer(nnet_config.compute_config, computation,
                         nnet_, delta_nnet_);
+
+  // Reserve the memory needed in ProcessOutputs, then release it back
+  // (so that it will get cached by the allocator).
+  ChainTrainerMemoryHolder *memory_holder = new ChainTrainerMemoryHolder(this, computer, eg);
+  delete memory_holder;
+
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.inputs);
   computer.Run();
 
+  // TODO: this could probably be merged into a single call, e.g. PreallocateChainTrainerMemory(*nnet_, eg)?
   this->ProcessOutputs(false, eg, &computer);
   computer.Run();
diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h
index 5bf6a3f6fce..dbb5e91ca63 100644
--- a/src/nnet3/nnet-chain-training.h
+++ b/src/nnet3/nnet-chain-training.h
@@ -69,6 +69,7 @@ class NnetChainTrainer {
   void PrintMaxChangeStats() const;
 
   ~NnetChainTrainer();
+  friend class ChainTrainerMemoryHolder;
 private:
   // The internal function for doing one step of conventional SGD training.
   void TrainInternal(const NnetChainExample &eg,
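
Note (illustration, not part of the patch): the GetDeviceMemoryMarker()/ReportAllocCallStats() pair above measures what an allocation actually costs on the device by sampling cudaMemGetInfo() before and after the call; the delta can exceed row_bytes * num_rows because cudaMallocPitch() pads each row up to the pitch, and the driver rounds allocations up to its own granularity. Below is a minimal standalone sketch of the same technique using only the CUDA runtime API; the file name and sizes are made up for the demo, and it assumes nothing else is allocating or freeing on the device concurrently.

// marker_demo.cu -- hypothetical demo; compile with: nvcc -o marker_demo marker_demo.cu
#include <cstdio>
#include <cuda_runtime.h>

// Sample the free device memory, like GetDeviceMemoryMarker() in the patch
// (minus the verbosity gating).
static size_t FreeDeviceBytes() {
  size_t free_bytes = 0, total_bytes = 0;
  cudaMemGetInfo(&free_bytes, &total_bytes);
  return free_bytes;
}

int main() {
  // Deliberately awkward row size so the pitch padding is visible.
  const size_t row_bytes = 1000, num_rows = 513;

  size_t marker = FreeDeviceBytes();  // the "memory marker" before allocating

  void *ptr = NULL;
  size_t pitch = 0;
  if (cudaMallocPitch(&ptr, &pitch, row_bytes, num_rows) != cudaSuccess) {
    fprintf(stderr, "cudaMallocPitch failed\n");
    return 1;
  }

  // The device delta is typically >= the granted (pitched) size because of
  // driver-side allocation granularity.
  size_t now = FreeDeviceBytes();
  printf("requested   : %zu bytes (%zu x %zu)\n",
         row_bytes * num_rows, row_bytes, num_rows);
  printf("granted     : %zu bytes (pitch %zu)\n", pitch * num_rows, pitch);
  printf("device delta: %zu bytes\n", marker - now);

  cudaFree(ptr);
  return 0;
}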