diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc
index fec75b01a3f..179a9a06443 100644
--- a/src/cudamatrix/cu-allocator.cc
+++ b/src/cudamatrix/cu-allocator.cc
@@ -78,6 +78,7 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
   num_system_allocations_++;
   void *ans;
   cudaError_t e;
+  size_t cuda_memory_marker = GetDeviceMemoryMarker();
   for (int32 i = 0; i <= 2; i++) {
     if (num_rows != 1) {
       CuTimer tim;
@@ -97,7 +98,7 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
         KALDI_WARN << "Allocation of " << row_bytes << " x " << num_rows
                    << " region failed: freeing some memory and "
                    << "trying again. ";
-        BaseFloat new_memory_factor = 1.1;
+        BaseFloat new_memory_factor = 1.03;
         if (opts_.memory_factor > new_memory_factor) {
           KALDI_LOG << "To avoid future problems like this, changing "
                     << "memory_factor from " << opts_.memory_factor << " to "
@@ -110,6 +111,8 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
             std::min(memory_cached, memory_requested));
         FreeSomeCachedMemory(memory_to_free);
+        cuda_memory_marker = GetDeviceMemoryMarker();  // refresh the memory marker
+                                                       // as we freed some (hopefully)
       } else {
         KALDI_ERR << "Cannot allocate the requested memory ("
                   << row_bytes << " x " << num_rows << " = "
@@ -117,12 +120,36 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
       }
       cudaGetLastError();  // Clear the error state.
     } else {
+      ReportAllocCallStats(row_bytes, num_rows, *pitch, cuda_memory_marker);
       break;
     }
   }
   return ans;
 }
 
+size_t CuMemoryAllocator::GetDeviceMemoryMarker() const {
+  size_t free_memory_now = 0;
+  if (GetVerboseLevel() >= 5)
+    cudaMemGetInfo(&free_memory_now, NULL);
+  return free_memory_now;
+}
+
+void CuMemoryAllocator::ReportAllocCallStats(int64 row_bytes,
+                                             int64 num_rows,
+                                             int64 pitch,
+                                             size_t marker) const {
+  size_t free_memory_now = 0;
+  if (GetVerboseLevel() >= 5) {
+    cudaMemGetInfo(&free_memory_now, NULL);
+    KALDI_VLOG(5) << "Alloc requested   : " << row_bytes << "x" << num_rows
+                  << " -> " << row_bytes * num_rows << " bytes.";
+    KALDI_VLOG(5) << "Alloc granted     : " << pitch << "x" << num_rows
+                  << " -> " << pitch * num_rows << " bytes.";
+    KALDI_VLOG(5) << "Alloc device bytes: " << (marker - free_memory_now)
+                  << " bytes (" << (marker - free_memory_now) / (1024 * 1024) << " MB)";
+  }
+}
+
 void CuMemoryAllocator::PrintMemoryUsage() const {
   KALDI_LOG << "Memory usage: " << cur_bytes_allocated_
             << " bytes currently allocated (max: "
@@ -158,6 +185,27 @@ CuMemoryAllocator::CuMemoryAllocator(CuAllocatorOptions opts):
 
 void* CuMemoryAllocator::MallocPitch(size_t row_bytes, size_t num_rows,
                                      size_t *pitch) {
+#if 0
+  t_++;
+  num_user_allocations_++;
+  void *tmp;
+  cudaError_t e;
+  size_t marker = GetDeviceMemoryMarker();
+  CuTimer tim;
+  e = cudaMallocPitch(&tmp, pitch, row_bytes, num_rows);
+  tot_time_taken_in_cuda_malloc_pitch_ += tim.Elapsed();
+  tot_time_taken_in_malloc_pitch_ += tim.Elapsed();
+  num_system_allocations_++;
+  if (e != cudaSuccess) {
+    PrintMemoryUsage();
+    cudaGetLastError();
+    KALDI_ERR << "Cannot allocate the requested memory ("
+              << row_bytes << " x " << num_rows << " = "
+              << row_bytes * num_rows << " bytes)";
+  }
+  ReportAllocCallStats(row_bytes, num_rows, *pitch, marker);
+  return tmp;
+#else
   CuTimer tim;
   t_++;
   num_user_allocations_++;
@@ -200,6 +248,7 @@ void* CuMemoryAllocator::MallocPitch(size_t row_bytes,
     tot_time_taken_in_malloc_pitch_ += tim.Elapsed();
     return ans;
   }
+#endif
 }
 
 void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
@@ -222,7 +271,7 @@ void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
   // we declare to be the time since we last used it multiplied by the size
   // of the memory in the pointer.
   std::vector<BaseFloat> size_factor(num_caches);
-  for (size_t i = 0, j = 1; i < num_caches; i++, j *= 2)
+  for (size_t i = 0, j = 1; i < num_caches; i++, j *= 1)
     size_factor[i] = j;
 
   std::priority_queue<std::pair<BaseFloat, size_t> > queue;
@@ -278,6 +327,11 @@ void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
 }
 
 void CuMemoryAllocator::Free(void *ptr) {
+#if 0
+  CuTimer tim;
+  cudaFree(ptr);
+  tot_time_taken_in_cuda_free_ += tim.Elapsed();
+#else
   t_++;
   unordered_map<void*, UsedMemoryElement>::iterator iter =
       used_map_.find(ptr);
@@ -294,6 +348,7 @@ void CuMemoryAllocator::Free(void *ptr) {
   cache.Insert(MemoryRequest(elem.row_bytes, elem.num_rows),
                CachedMemoryElement(ptr, t_, elem.pitch));
   used_map_.erase(iter);
+#endif
 }
 
 size_t CuMemoryAllocator::MruCache::LeastRecentTime() const {
diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h
index 0f96315e848..2ee76a65364 100644
--- a/src/cudamatrix/cu-allocator.h
+++ b/src/cudamatrix/cu-allocator.h
@@ -54,7 +54,7 @@ struct CuAllocatorOptions {
   // is a constant overhead proportional to the number of buckets.
   BaseFloat delete_factor;
 
-  CuAllocatorOptions(): memory_factor(1.3),
+  CuAllocatorOptions(): memory_factor(1.1),
                         delete_factor(0.001) { }
 
   void Check() {
@@ -110,6 +110,11 @@ class CuMemoryAllocator {
   CuMemoryAllocator(CuAllocatorOptions opts);
 
  private:
+  size_t GetDeviceMemoryMarker() const;
+  void ReportAllocCallStats(int64 row_bytes,
+                            int64 num_rows,
+                            int64 pitch,
+                            size_t marker) const;
   void FreeSomeCachedMemory(size_t bytes_to_free);
 
diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index 87e266e1889..b5b6d18e9ac 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -410,6 +410,7 @@ void CuDevice::AccuProfile(const char *function_name,
   }
 }
 
+
 void CuDevice::PrintMemoryUsage() const {
   if (Enabled()) {
     allocator_.PrintMemoryUsage();
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 99105355a8f..ce286ba02d4 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -74,6 +74,9 @@ class CuDevice {
                                    pitch);
     } else {
       return allocator_.MallocPitch(row_bytes, num_rows, pitch);
+      //void *tmp = allocator_.Malloc(row_bytes * num_rows);
+      //*pitch = row_bytes;
+      //return tmp;
     }
   }
   inline void Free(void *ptr) {
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 2080c60077b..dc4777307d5 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -92,6 +92,74 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
   num_minibatches_processed_++;
 }
 
+class ChainTrainerMemoryHolder {
+ public:
+  ChainTrainerMemoryHolder(const NnetChainTrainer *const trainer,
+                           NnetComputer &computer,
+                           const NnetChainExample &eg);
+ private:
+  CuMatrix<BaseFloat> nnet_output_;
+  CuMatrix<BaseFloat> nnet_output_deriv_;
+  CuMatrix<BaseFloat> beta_;
+  CuMatrix<BaseFloat> alpha_;
+
+};
+
+ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const NnetChainTrainer *const trainer,
+                                                   NnetComputer &computer,
+                                                   const NnetChainExample &eg) {
+
+  std::vector<NnetChainSupervision>::const_iterator iter = eg.outputs.begin(),
+      end = eg.outputs.end();
+
+  int max_rows = 0,
+      max_cols = 0;
+
+  size_t max_frames_per_sequence = 0,
+         max_sequence_size = 0,
+         max_alpha_matrix_size = 0;
+
+  for (; iter != end; ++iter) {
+    const NnetChainSupervision &sup = *iter;
+
+    int output_rows = sup.supervision.num_sequences * sup.supervision.frames_per_sequence;
+    int output_cols = trainer->nnet_->OutputDim("output");
+
+    size_t curr_frames_per_sequence = output_rows / sup.supervision.num_sequences + 1;
+    size_t den_graph_size = trainer->den_graph_.NumStates() + 1;
+    size_t curr_sequence_size = den_graph_size * sup.supervision.num_sequences;
+    size_t curr_alpha_matrix_size = curr_frames_per_sequence * curr_sequence_size;
+
+    if (curr_alpha_matrix_size > max_alpha_matrix_size) {
+      max_alpha_matrix_size = curr_alpha_matrix_size;
+      max_frames_per_sequence = curr_frames_per_sequence;
+      max_sequence_size = curr_sequence_size;
+    }
+
+    size_t matrix_size = static_cast<size_t>(output_rows) * output_cols;
+    if (matrix_size > static_cast<size_t>(max_rows) * max_cols) {
+      max_rows = output_rows;
+      max_cols = output_cols;
+    }
+
+  }
+
+  KALDI_VLOG(5) << "Pre-caching chain training memory";
+  // The sequence of resizes is in a specific order (bigger to smaller)
+  // so that cudaMalloc won't thrash the memory it has already
+  // allocated in the previous iterations.
+  alpha_.Resize(max_frames_per_sequence,
+                max_sequence_size,
+                kUndefined);
+
+  nnet_output_.Resize(max_rows, max_cols, kUndefined);
+  nnet_output_deriv_.Resize(max_rows, max_cols, kUndefined);
+
+  beta_.Resize(2, max_sequence_size, kUndefined);
+
+  KALDI_VLOG(5) << "Pre-caching chain training memory... done.";
+}
+
 void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
                                      const NnetComputation &computation) {
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
@@ -100,10 +168,17 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
   // store stats. This is mainly important for memory-norm.
   NnetComputer computer(nnet_config.compute_config, computation,
                         nnet_, delta_nnet_);
+
+  // Reserve the memory needed in ProcessOutputs, then release it back
+  // (so that it will get cached by the allocator).
+  ChainTrainerMemoryHolder *memory_holder = new ChainTrainerMemoryHolder(this, computer, eg);
+  delete memory_holder;
+
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.inputs);
   computer.Run();
 
+  // TODO: this could probably be merged into a single call, e.g. PreallocateChainTrainerMemory(*nnet_, eg)?
   this->ProcessOutputs(false, eg, &computer);
   computer.Run();
diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h
index 5bf6a3f6fce..dbb5e91ca63 100644
--- a/src/nnet3/nnet-chain-training.h
+++ b/src/nnet3/nnet-chain-training.h
@@ -69,6 +69,7 @@ class NnetChainTrainer {
   void PrintMaxChangeStats() const;
 
   ~NnetChainTrainer();
+  friend class ChainTrainerMemoryHolder;
 private:
   // The internal function for doing one step of conventional SGD training.
   void TrainInternal(const NnetChainExample &eg,
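
Note (illustration, not part of the patch): the GetDeviceMemoryMarker()/ReportAllocCallStats() pair above measures what an allocation actually costs on the device by sampling cudaMemGetInfo() before and after the call; the delta can exceed row_bytes * num_rows because cudaMallocPitch() pads each row up to the pitch, and the driver rounds allocations up to its own granularity. Below is a minimal standalone sketch of the same technique using only the CUDA runtime API; the file name and sizes are made up for the demo, and it assumes nothing else is allocating or freeing on the device concurrently.

// marker_demo.cu -- hypothetical demo; compile with: nvcc -o marker_demo marker_demo.cu
#include <cstdio>
#include <cuda_runtime.h>

// Sample the free device memory, like GetDeviceMemoryMarker() in the patch
// (minus the verbosity gating).
static size_t FreeDeviceBytes() {
  size_t free_bytes = 0, total_bytes = 0;
  cudaMemGetInfo(&free_bytes, &total_bytes);
  return free_bytes;
}

int main() {
  // Deliberately awkward row size so the pitch padding is visible.
  const size_t row_bytes = 1000, num_rows = 513;

  size_t marker = FreeDeviceBytes();  // the "memory marker" before allocating

  void *ptr = NULL;
  size_t pitch = 0;
  if (cudaMallocPitch(&ptr, &pitch, row_bytes, num_rows) != cudaSuccess) {
    fprintf(stderr, "cudaMallocPitch failed\n");
    return 1;
  }

  // The device delta is typically >= the granted (pitched) size because of
  // driver-side allocation granularity.
  size_t now = FreeDeviceBytes();
  printf("requested   : %zu bytes (%zu x %zu)\n",
         row_bytes * num_rows, row_bytes, num_rows);
  printf("granted     : %zu bytes (pitch %zu)\n", pitch * num_rows, pitch);
  printf("device delta: %zu bytes\n", marker - now);

  cudaFree(ptr);
  return 0;
}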