Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 57 additions & 2 deletions src/cudamatrix/cu-allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
num_system_allocations_++;
void *ans;
cudaError_t e;
size_t cuda_memory_marker = GetDeviceMemoryMarker();
for (int32 i = 0; i <= 2; i++) {
if (num_rows != 1) {
CuTimer tim;
Expand All @@ -97,7 +98,7 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
KALDI_WARN << "Allocation of " << row_bytes << " x "
<< num_rows << " region failed: freeing some memory and "
<< "trying again. ";
BaseFloat new_memory_factor = 1.1;
BaseFloat new_memory_factor = 1.03;
if (opts_.memory_factor > new_memory_factor) {
KALDI_LOG << "To avoid future problems like this, changing "
<< "memory_factor from " << opts_.memory_factor << " to "
Expand All @@ -110,19 +111,45 @@ void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes,
std::min<size_t>(memory_cached,
memory_requested));
FreeSomeCachedMemory(memory_to_free);
cuda_memory_marker = GetDeviceMemoryMarker(); // refresh the memory marker
// as we freed some (hopefully)
} else {
KALDI_ERR << "Cannot allocate the requested memory ("
<< row_bytes << " x " << num_rows << " = "
<< row_bytes * num_rows << " bytes)";
}
cudaGetLastError(); // Clear the error state.
} else {
ReportAllocCallStats(row_bytes, num_rows, *pitch, cuda_memory_marker);
break;
}
}
return ans;
}

// Returns a snapshot of the device's current free memory (in bytes), used as
// a "marker" to compare against after an allocation.  When verbose logging is
// below level 5 the cudaMemGetInfo query is skipped entirely (it is not free)
// and 0 is returned.
size_t CuMemoryAllocator::GetDeviceMemoryMarker() const {
  if (GetVerboseLevel() < 5)
    return 0;  // avoid the device query when the stats would never be printed.
  size_t free_bytes = 0;
  cudaMemGetInfo(&free_bytes, NULL);
  return free_bytes;
}

// Logs (at verbose level >= 5) statistics about one allocation call: the
// requested vs. granted geometry, and how much device memory the call
// consumed relative to 'marker' (a free-memory snapshot taken beforehand via
// GetDeviceMemoryMarker()).  Does nothing at lower verbosity.
void CuMemoryAllocator::ReportAllocCallStats(int64 row_bytes,
                                             int64 num_rows,
                                             int64 pitch,
                                             size_t marker) const {
  if (GetVerboseLevel() < 5)
    return;
  size_t free_memory_now = 0;
  cudaMemGetInfo(&free_memory_now, NULL);
  // Compute the delta in a signed type: if free memory *grew* between the
  // marker snapshot and now (e.g. something was freed in the meantime, or the
  // marker was a stale/zero value), the original size_t subtraction
  // (marker - free_memory_now) would wrap around and print a huge bogus
  // byte count.
  int64 device_bytes = static_cast<int64>(marker) -
      static_cast<int64>(free_memory_now);
  KALDI_VLOG(5) << "Alloc requested : " << row_bytes << "x" << num_rows
                << " -> " << row_bytes * num_rows << " bytes.";
  KALDI_VLOG(5) << "Alloc granted : " << pitch << "x" << num_rows
                << " -> " << pitch * num_rows << " bytes.";
  KALDI_VLOG(5) << "Alloc device bytes: " << device_bytes
                << " bytes (" << device_bytes / (1024 * 1024) << "MB)";
}

void CuMemoryAllocator::PrintMemoryUsage() const {
KALDI_LOG << "Memory usage: " << cur_bytes_allocated_
<< " bytes currently allocated (max: "
Expand Down Expand Up @@ -158,6 +185,27 @@ CuMemoryAllocator::CuMemoryAllocator(CuAllocatorOptions opts):
void* CuMemoryAllocator::MallocPitch(size_t row_bytes,
size_t num_rows,
size_t *pitch) {
#if 0
t_++;
num_user_allocations_++;
void *tmp;
cudaError_t e;
size_t marker = GetDeviceMemoryMarker();
CuTimer tim;
e = cudaMallocPitch(&tmp, pitch, row_bytes, num_rows);
tot_time_taken_in_cuda_malloc_pitch_ += tim.Elapsed();
tot_time_taken_in_malloc_pitch_ += tim.Elapsed();
num_system_allocations_++;
if (e != cudaSuccess) {
PrintMemoryUsage();
cudaGetLastError();
KALDI_ERR << "Cannot allocate the requested memory ("
<< row_bytes << " x " << num_rows << " = "
<< row_bytes * num_rows << " bytes)";
}
ReportAllocCallStats(row_bytes, num_rows, *pitch, marker);
return tmp;
#else
CuTimer tim;
t_++;
num_user_allocations_++;
Expand Down Expand Up @@ -200,6 +248,7 @@ void* CuMemoryAllocator::MallocPitch(size_t row_bytes,
tot_time_taken_in_malloc_pitch_ += tim.Elapsed();
return ans;
}
#endif
}

void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
Expand All @@ -222,7 +271,7 @@ void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
// we declare to be the time since we last used it multiplied by the size
// of the memory in the pointer.
std::vector<BaseFloat> size_factor(num_caches);
for (size_t i = 0, j=1; i < num_caches; i++, j *= 2)
for (size_t i = 0, j=1; i < num_caches; i++, j *= 1)
size_factor[i] = j;

std::priority_queue<std::pair<BaseFloat,int32> > queue;
Expand Down Expand Up @@ -278,6 +327,11 @@ void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) {
}

void CuMemoryAllocator::Free(void *ptr) {
#if 0
CuTimer tim;
cudaFree(ptr);
tot_time_taken_in_cuda_free_ += tim.Elapsed();
#else
t_++;
unordered_map<void*, UsedMemoryElement, PointerHasher>::iterator iter =
used_map_.find(ptr);
Expand All @@ -294,6 +348,7 @@ void CuMemoryAllocator::Free(void *ptr) {
cache.Insert(MemoryRequest(elem.row_bytes, elem.num_rows),
CachedMemoryElement(ptr, t_, elem.pitch));
used_map_.erase(iter);
#endif
}

size_t CuMemoryAllocator::MruCache::LeastRecentTime() const {
Expand Down
7 changes: 6 additions & 1 deletion src/cudamatrix/cu-allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ struct CuAllocatorOptions {
// is a constant overhead proportional to the number of buckets.
BaseFloat delete_factor;

CuAllocatorOptions(): memory_factor(1.3),
CuAllocatorOptions(): memory_factor(1.1),
delete_factor(0.001) { }

void Check() {
Expand Down Expand Up @@ -110,6 +110,11 @@ class CuMemoryAllocator {

CuMemoryAllocator(CuAllocatorOptions opts);
private:
size_t GetDeviceMemoryMarker() const;
void ReportAllocCallStats(int64 row_bytes,
int64 num_rows,
int64 pitch,
size_t marker) const;

void FreeSomeCachedMemory(size_t bytes_to_free);

Expand Down
1 change: 1 addition & 0 deletions src/cudamatrix/cu-device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ void CuDevice::AccuProfile(const char *function_name,
}
}


void CuDevice::PrintMemoryUsage() const {
if (Enabled()) {
allocator_.PrintMemoryUsage();
Expand Down
3 changes: 3 additions & 0 deletions src/cudamatrix/cu-device.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ class CuDevice {
pitch);
} else {
return allocator_.MallocPitch(row_bytes, num_rows, pitch);
//void *tmp = allocator_.Malloc(row_bytes * num_rows);
//*pitch = row_bytes;
//return tmp;
}
}
inline void Free(void *ptr) {
Expand Down
75 changes: 75 additions & 0 deletions src/nnet3/nnet-chain-training.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,74 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
num_minibatches_processed_++;
}

// Helper whose construction pre-allocates the largest CuMatrix buffers that
// chain training will need for an example; per the usage in TrainInternal it
// is created and immediately destroyed so the memory is released back and
// "will get cached" by the allocator for the later real allocations.
class ChainTrainerMemoryHolder {
public:
// Computes worst-case matrix sizes over eg.outputs (using the trainer's
// network output dimension and denominator-graph size) and resizes the
// members below, biggest first.
// NOTE(review): 'computer' appears unused by the constructor's visible
// definition — confirm whether it is still needed.
ChainTrainerMemoryHolder(const NnetChainTrainer *const trainer,
NnetComputer &computer,
const NnetChainExample &eg);
private:
CuMatrix<BaseFloat> nnet_output_; // sized max_rows x max_cols
CuMatrix<BaseFloat> nnet_output_deriv_; // same size as nnet_output_
CuMatrix<BaseFloat> beta_; // sized 2 x max_sequence_size
CuMatrix<BaseFloat> alpha_; // sized max_frames_per_sequence x max_sequence_size

};

// Scans all supervision outputs in 'eg' to find the worst-case matrix sizes
// chain training will need, then allocates those matrices biggest-first so
// that the allocations done later (in ProcessOutputs) can be served from the
// allocator's cache.  'trainer' supplies the output dimension and the
// denominator graph; 'computer' is currently unused here.
ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(
    const NnetChainTrainer *const trainer,
    NnetComputer &computer,
    const NnetChainExample &eg) {

  std::vector<NnetChainSupervision>::const_iterator iter = eg.outputs.begin(),
      end = eg.outputs.end();

  int max_rows = 0,
      max_cols = 0;

  size_t max_matrix_size = 0,  // largest rows*cols product seen so far.
      max_frames_per_sequence = 0,
      max_sequence_size = 0,
      max_alpha_matrix_size = 0;

  for (; iter != end; ++iter) {
    const NnetChainSupervision &sup = *iter;

    int output_rows = sup.supervision.num_sequences *
        sup.supervision.frames_per_sequence;
    int output_cols = trainer->nnet_->OutputDim("output");

    // +1 in both quantities below: presumably room for an extra time index /
    // state used by the forward-backward computation — TODO confirm.
    size_t curr_frames_per_sequence =
        output_rows / sup.supervision.num_sequences + 1;
    size_t den_graph_size = trainer->den_graph_.NumStates() + 1;
    size_t curr_sequence_size = den_graph_size * sup.supervision.num_sequences;
    size_t curr_alpha_matrix_size =
        curr_frames_per_sequence * curr_sequence_size;

    if (curr_alpha_matrix_size > max_alpha_matrix_size) {
      max_alpha_matrix_size = curr_alpha_matrix_size;
      max_frames_per_sequence = curr_frames_per_sequence;
      max_sequence_size = curr_sequence_size;
    }

    // Do the product in size_t and keep the running maximum in size_t too:
    // the original comparison (matrix_size > max_rows * max_cols) multiplied
    // two ints, which can overflow for large minibatches.
    size_t matrix_size =
        static_cast<size_t>(output_rows) * static_cast<size_t>(output_cols);
    if (matrix_size > max_matrix_size) {
      max_matrix_size = matrix_size;
      max_rows = output_rows;
      max_cols = output_cols;
    }
  }

  KALDI_VLOG(5) << "Pre-caching chain training memory";
  // the sequence of resizes is in a specific order (bigger to smaller)
  // so that the cudaMalloc won't trash the memory it has already
  // alloc'd in the previous iterations
  alpha_.Resize(max_frames_per_sequence,
                max_sequence_size,
                kUndefined);

  nnet_output_.Resize(max_rows, max_cols, kUndefined);
  nnet_output_deriv_.Resize(max_rows, max_cols, kUndefined);

  beta_.Resize(2, max_sequence_size, kUndefined);

  KALDI_VLOG(5) << "Precaching chain training memory...Done";
}

void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
const NnetComputation &computation) {
const NnetTrainerOptions &nnet_config = opts_.nnet_config;
Expand All @@ -100,10 +168,17 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
// store stats. This is mainly important for memory-norm.
NnetComputer computer(nnet_config.compute_config, computation,
nnet_, delta_nnet_);

// reserve the memory needed in ProcessOutputs and
// release it back (so that it will get cached).
ChainTrainerMemoryHolder *memory_holder = new ChainTrainerMemoryHolder(this, computer, eg);
delete memory_holder;

// give the inputs to the computer object.
computer.AcceptInputs(*nnet_, eg.inputs);
computer.Run();

// Probably could be merged in a single call PreallocateChainTrainerMemory(*nnet_, eg) ?
this->ProcessOutputs(false, eg, &computer);
computer.Run();

Expand Down
1 change: 1 addition & 0 deletions src/nnet3/nnet-chain-training.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class NnetChainTrainer {
void PrintMaxChangeStats() const;

~NnetChainTrainer();
friend class ChainTrainerMemoryHolder;
private:
// The internal function for doing one step of conventional SGD training.
void TrainInternal(const NnetChainExample &eg,
Expand Down