From 719fba13f53dd6de141200f45f39314a1ab423ee Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Wed, 5 Feb 2025 17:07:11 -0800
Subject: [PATCH 1/2] Move early_stopping to a template parameter

---
 .../cpu/transformers/generation_shared.h      |   2 +-
 .../cuda/transformers/generation_cuda_impl.cu | 215 +++++++++++-------
 .../cuda/transformers/generation_cuda_impl.h  |  49 +++-
 .../test/contrib_ops/beam_search_test.cc      |   3 -
 4 files changed, 180 insertions(+), 89 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
index 8145fbd4a4123..635c4852966f4 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
@@ -8,7 +8,7 @@
 #include 
 #include "core/framework/allocator.h"
 #include "core/framework/ort_value.h"
-#include "contrib_ops/cpu/utils/debug_macros.h"
+#include "contrib_ops/cpu/utils/console_dumper.h"
 
 namespace onnxruntime {
 
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
index eb1943b59d976..d6a2a3018f726 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
@@ -1,11 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
 // cub.cuh includes device/dispatch_radix_sort.cuh which has assignment in conditional expressions
 #if defined(_MSC_VER)
 #pragma warning(push)
-#pragma warning(disable : 4706)
+#pragma warning(disable : 4706)
 #endif
 #include <cub/cub.cuh>
 #if defined(_MSC_VER)
@@ -323,8 +322,8 @@ __device__ void BeamHypotheses::Output(
     int top_k,
     int max_length,
     int pad_token_id,
-    int32_t* sequences, // buffer of shape (num_return_sequences, max_length)
-    T* sequences_scores) // buffer of shape (num_return_sequences) or empty
+    int32_t* sequences,   // buffer of shape (num_return_sequences, max_length)
+    T* sequences_scores)  // buffer of shape (num_return_sequences) or empty
 {
   // Copy the top_k beams into the sequences
   for (int index = 0; index < top_k; index++) {
@@ -343,6 +342,7 @@ __device__ void BeamHypotheses::Output(
   }
 }
 
+template <bool early_stopping>
 __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
                                          BeamScorerState& state,
                                          const int32_t* sequences_buffer,
@@ -358,24 +358,26 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
   // Sequences shape is (batch_size * num_beams, total_sequence_length)
   // It contains word ID of whole sequence generated so far.
   // It is different from subgraph input_ids, which only need one word when past state is not empty.
-
-  int batch = threadIdx.x;
-  int batch_start = batch * state.num_beams_;
+  const int batch = threadIdx.x;
+  const int num_beams = state.num_beams_;
+  const int batch_start = batch * num_beams;
 
   cuda::BeamHypotheses& beam_hyp = beam_hyps_[batch];
+
   if (!beam_hyp.done_) {
     // Next tokens for this sentence.
     size_t beam_idx = 0;
-    size_t top_k = 2 * state.num_beams_;
+    size_t top_k = 2 * num_beams;
 
     for (size_t j = 0; j < top_k; j++) {
       int32_t next_token = next_tokens[batch * top_k + j];
       float next_score = next_scores[batch * top_k + j];
       int32_t next_index = next_indices[batch * top_k + j];
       int batch_beam_idx = batch_start + next_index;
+
       // Add to generated hypotheses if end of sentence.
       if ((state.eos_token_id_ >= 0) && (next_token == state.eos_token_id_)) {
-        bool is_beam_token_worse_than_top_num_beams = (j >= state.num_beams_);
+        bool is_beam_token_worse_than_top_num_beams = (j >= num_beams);
         if (is_beam_token_worse_than_top_num_beams) {
           continue;
         }
@@ -396,21 +398,27 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
       }
 
       // Once the beam for next step is full, don't add more tokens to it.
-      if (beam_idx == state.num_beams_)
+      if (beam_idx == num_beams)
         break;
     }
 
     // Check if we are done so that we can save a pad step if all(done)
-    if (beam_hyp.beams_used_ == state.num_beams_) {
-      if (state.early_stopping_ || !beam_hyp.CanImprove(*std::max_element(next_scores + batch_start, next_scores + batch_start + top_k), sequence_length)) {
-        beam_hyp.done_ = true;
-        if (atomicAdd(&state.not_done_count_, -1) == 1)
-          state_cpu.not_done_count_ = 0;  // Update the CPU side
+    if (beam_hyp.beams_used_ == num_beams) {
+      if constexpr (!early_stopping) {
+        float best_sum_logprobs = *std::max_element(next_scores + batch_start, next_scores + batch_start + top_k);
+        if (beam_hyp.CanImprove(best_sum_logprobs, sequence_length)) {
+          return;
+        }
+      }
+
+      beam_hyp.done_ = true;
+      if (atomicAdd(&(state.not_done_count_), -1) == 1) {
+        state_cpu.not_done_count_ = 0;  // Update the CPU side
       }
     }
   } else {
     // Pad the batch.
-    for (size_t beam_idx = 0; beam_idx < state.num_beams_; beam_idx++) {
+    for (size_t beam_idx = 0; beam_idx < num_beams; beam_idx++) {
       next_beam_scores_[batch_start + beam_idx] = 0.0f;
       next_beam_tokens_[batch_start + beam_idx] = state.pad_token_id_;
       next_beam_indices_[batch_start + beam_idx] = 0;
@@ -418,6 +426,31 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
   }
 }
 
+#ifdef DEBUG_GENERATION
+__global__ void DumpBeamScorerState(const BeamScorerState& state) {
+  state.Print(false);
+}
+
+void DumpBeamScorerStates(const BeamScorerState& state_cpu, const BeamScorerState& state, cudaStream_t stream) {
+  state_cpu.Print(true);
+  DumpBeamScorerState<<<1, 1, 0, stream>>>(state);
+  cudaDeviceSynchronize();
+}
+
+__global__ void DumpBeamSearchScorer(const BeamHypotheses& beam_hyp) {
+  beam_hyp.Print();
+}
+
+void DumpBeamHypotheses(gsl::span<BeamHypotheses> beam_hyps, cudaStream_t stream) {
+  printf("\n BeamHypotheses of size %zu: \n", beam_hyps.size());
+  for (size_t i = 0; i < beam_hyps.size(); i++) {
+    printf("\n [%zu]:\n", i);
+    DumpBeamSearchScorer<<<1, 1, 0, stream>>>(beam_hyps[i]);
+    cudaDeviceSynchronize();
+  }
+}
+#endif
+
 void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu,
                                     BeamScorerState& state,
                                     gsl::span<const int32_t> sequences,
@@ -431,18 +464,46 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu,
                                     gsl::span<const int32_t> next_tokens,
                                     gsl::span<const int32_t> next_indices,
                                     cudaStream_t stream) {
-  BeamSearchScorer_Process<<<1, state_cpu.batch_size_, 0, stream>>>(state_cpu,
-                                                                    state,
-                                                                    sequences.data(),
-                                                                    sequence_length,
-                                                                    beam_hyps.data(),
-                                                                    next_beam_scores.data(),
-                                                                    next_beam_tokens.data(),
-                                                                    next_beam_indices.data(),
-                                                                    hypothesis_buffer.data(),
-                                                                    next_scores.data(),
-                                                                    next_tokens.data(),
-                                                                    next_indices.data());
+#ifdef DEBUG_GENERATION
+  printf("\n Before BeamSearchScorer_Process: \n");
+  DumpBeamHypotheses(beam_hyps, stream);
+  DumpBeamScorerStates(state_cpu, state, stream);
+#endif
+
+  if (state_cpu.early_stopping_) {
+    BeamSearchScorer_Process<true><<<1, state_cpu.batch_size_, 0, stream>>>(state_cpu,
+                                                                            state,
+                                                                            sequences.data(),
+                                                                            sequence_length,
+                                                                            beam_hyps.data(),
+                                                                            next_beam_scores.data(),
+                                                                            next_beam_tokens.data(),
+                                                                            next_beam_indices.data(),
+                                                                            hypothesis_buffer.data(),
+                                                                            next_scores.data(),
+                                                                            next_tokens.data(),
+                                                                            next_indices.data());
+  } else {
+    BeamSearchScorer_Process<false><<<1, state_cpu.batch_size_, 0, stream>>>(state_cpu,
+                                                                             state,
+                                                                             sequences.data(),
+                                                                             sequence_length,
+                                                                             beam_hyps.data(),
+                                                                             next_beam_scores.data(),
+                                                                             next_beam_tokens.data(),
+                                                                             next_beam_indices.data(),
+                                                                             hypothesis_buffer.data(),
+                                                                             next_scores.data(),
+                                                                             next_tokens.data(),
+                                                                             next_indices.data());
+  }
+
+#ifdef DEBUG_GENERATION
+  cudaDeviceSynchronize();
+  printf("\n After BeamSearchScorer_Process: \n");
+  DumpBeamHypotheses(beam_hyps, stream);
+  DumpBeamScorerStates(state_cpu, state, stream);
+#endif
 }
 
 __global__ void BeamSearchScorer_AppendNextTokenToSequences1(BeamScorerState& state,
@@ -600,14 +661,14 @@ template <typename T>
 void LaunchBeamSearchScoreCopy(gsl::span<const float> final_scores,
                                gsl::span<T> output_scores,
                                cudaStream_t stream) {
-  ORT_ENFORCE(final_scores.size() == output_scores.size());
-  constexpr unsigned ThreadPerBlock = 256;
-  unsigned num_blocks = (unsigned)((final_scores.size() + (ThreadPerBlock - 1))/ ThreadPerBlock);
+  ORT_ENFORCE(final_scores.size() == output_scores.size());
+  constexpr unsigned ThreadPerBlock = 256;
+  unsigned num_blocks = (unsigned)((final_scores.size() + (ThreadPerBlock - 1)) / ThreadPerBlock);
 
-  typedef typename ToCudaType<T>::MappedType CudaT;
+  typedef typename ToCudaType<T>::MappedType CudaT;
 
-  FloatConvertAndCopyKernel<<<num_blocks, ThreadPerBlock, 0, stream>>>(
-      final_scores.data(), (CudaT*)output_scores.data(), final_scores.size());
+  FloatConvertAndCopyKernel<<<num_blocks, ThreadPerBlock, 0, stream>>>(
+      final_scores.data(), (CudaT*)output_scores.data(), final_scores.size());
 }
 
 template void LaunchBeamSearchScoreCopy(gsl::span<const float> final_scores,
@@ -1444,15 +1505,14 @@ void ReorderPastStatesKernelLauncher(void* out_buffer,
 
 template <typename T>
 __global__ void CopyCrossQKSingleDecodeStepKernel(
-    T* target, // shape [batchxbeam, layer_head_pair_count, max_length, frame]
+    T* target,  // shape [batchxbeam, layer_head_pair_count, max_length, frame]
     T** qk_layer_pointers,
     int token_index,
     int num_layers,
     int num_heads,
     const int* cross_qk_layer_head_pairs,
     int frames,
-    int max_length
-) {
+    int max_length) {
   const int pair = blockIdx.x;
   const int layer_head_pair_count = gridDim.x;
   const int bbm = blockIdx.y;
@@ -1464,7 +1524,7 @@ __global__ void CopyCrossQKSingleDecodeStepKernel(
   T* src = qk_layer_pointers[layer] + ((int64_t)bbm * num_heads + head) * frames;
 
   for (int tid = threadIdx.x; tid < frames; tid += blockDim.x) {
-    target[tid] = src[tid]; // use vectorized read write in future if needed
+    target[tid] = src[tid];  // use vectorized read write in future if needed
   }
 }
 
@@ -1479,8 +1539,7 @@ void LaunchCopyCrossQKSingleDecodeStep(
     int cross_qk_layer_head_pair_count,
     const int* cross_qk_layer_head_pairs,
     int frames,
-    int max_length
-) {
+    int max_length) {
   dim3 block(512);
   dim3 grid(cross_qk_layer_head_pair_count, batchxbeam);
   typedef typename ToCudaType<float>::MappedType CudaT;
@@ -1493,11 +1552,9 @@ void LaunchCopyCrossQKSingleDecodeStep(
       num_heads,
       cross_qk_layer_head_pairs,
       frames,
-      max_length
-  );
+      max_length);
 }
 
-
 template <typename T>
 __global__ void CopyDecoderCrossQKAllStepsKernel(
     int context_decoding_len,
     int num_beams,
     int num_return_sequences,
     int max_length,
     int frames_of_k,
-    const T* cross_qk_buffer_data, // [batch, num_beams, layer_head_pair_count, max_length, frames]
+    T* cross_qk_output,             // [batch, num_return_sequences, layer_head_pair_count, total_decoding_length, frames]
+    const int* cache_indir_data,    // [batch, num_beams, max_length]
+    const int32_t* beam_indices) {
   const int pair = blockIdx.y;
   const int layer_head_pair_count = gridDim.y;
   const int total_decoding_length = gridDim.x;
@@ -1522,15 +1578,15 @@ __global__ void CopyDecoderCrossQKAllStepsKernel(
   const int src_beam = beam_indices[batch * num_beams + ret_seq_id] % num_beams;
   const int64_t offset_in_cache = ((int64_t)batch * num_beams + src_beam) * max_length + token_decoding_index + context_decoding_len;
 
-  int bm_mapped = ((num_beams <= 1) ? 0: ((token_decoding_index == total_decoding_length - 1) ? ret_seq_id : cache_indir_data[offset_in_cache]));
+  int bm_mapped = ((num_beams <= 1) ? 0 : ((token_decoding_index == total_decoding_length - 1) ? ret_seq_id : cache_indir_data[offset_in_cache]));
   int bi_src = batch * num_beams + bm_mapped;
 
-  T* target = cross_qk_output +
-      (((int64_t)br * layer_head_pair_count + (int64_t)pair) * total_decoding_length + token_decoding_index) * frames_of_k;
+  T* target = cross_qk_output +
+              (((int64_t)br * layer_head_pair_count + (int64_t)pair) * total_decoding_length + token_decoding_index) * frames_of_k;
   const T* src = cross_qk_buffer_data +
-      ((int64_t)bi_src * layer_head_pair_count * max_length + (int64_t)pair * max_length + token_decoding_index) * frames_of_k;
+                 ((int64_t)bi_src * layer_head_pair_count * max_length + (int64_t)pair * max_length + token_decoding_index) * frames_of_k;
   for (int tid = threadIdx.x; tid < frames_of_k; tid += blockDim.x) {
-    target[tid] = src[tid]; // use vectorized read write in future if needed
+    target[tid] = src[tid];  // use vectorized read write in future if needed
   }
 }
 
@@ -1548,8 +1604,7 @@ void LaunchFinalizeCrossQK(
     float* cross_qk_output,
     int num_return_sequences,
    const int* cache_indir_data,
-    const int32_t* beam_indices
-) {
+    const int32_t* beam_indices) {
   int64_t br = (int64_t)batch_size * num_return_sequences;
   ORT_ENFORCE(br < 65536L && cross_qk_layer_head_pair_count < 65536);
   const int total_decoding_length = iteration_number - 1;
@@ -1558,15 +1613,15 @@ void LaunchFinalizeCrossQK(
   typedef typename ToCudaType<float>::MappedType CudaT;
 
   CopyDecoderCrossQKAllStepsKernel<<<grid, block, 0, stream>>>(
-      context_decoding_len,
-      num_beams,
-      num_return_sequences,
-      max_length,
-      frames_of_k,
-      (const CudaT*)cross_qk_buffer_data,
-      (CudaT*)cross_qk_output,
-      cache_indir_data,
-      beam_indices);
+      context_decoding_len,
+      num_beams,
+      num_return_sequences,
+      max_length,
+      frames_of_k,
+      (const CudaT*)cross_qk_buffer_data,
+      (CudaT*)cross_qk_output,
+      cache_indir_data,
+      beam_indices);
 }
 
 template <int ElementsPerThreads>
 __global__ void ForceDecodingIdsKernel(
     float* beam_scores,
     const int vocab_size,
     const int32_t* force_ids,
     int id_len,
-    int step
-) {
+    int step) {
   const int num_beams = gridDim.y;
   const int beam = blockIdx.y;
   const int batch = blockIdx.z;
-  beam_scores += (((int64_t)batch * num_beams + beam)* vocab_size); // move to (batch, beam)
+  beam_scores += (((int64_t)batch * num_beams + beam) * vocab_size);  // move to (batch, beam)
   const int32_t id_wanted = force_ids[((int64_t)batch * id_len) + step];
   if (id_wanted < 0 || id_wanted >= vocab_size)
     return;
 
   const int32_t block_start_id = blockIdx.x * elements_per_block;
 
   int32_t token_id = block_start_id + (int)threadIdx.x;
-  #pragma unroll
+#pragma unroll
   for (int elem = 0; elem < ElementsPerThreads; elem++) {
     if (token_id < vocab_size) {
       beam_scores[token_id] = ((token_id == id_wanted) ? 0.0f : cub::FpLimits<float>::Lowest());
@@ -1597,7 +1651,6 @@ __global__ void ForceDecodingIdsKernel(
   }
 }
 
-
 void LaunchForceDecodingIds(
     float* beam_scores,
     const int batch_size,
     const int num_beams,
     const int vocab_size,
     const int32_t* force_ids,
     int id_len,
     int step,
-    cudaStream_t stream
-) {
+    cudaStream_t stream) {
   dim3 blocks(512);
   constexpr int ElementsPerThreads = 4;
   unsigned gridx = static_cast<unsigned>((vocab_size + 512 * ElementsPerThreads - 1) / (512 * ElementsPerThreads));
   dim3 grids(gridx, num_beams, batch_size);
   ForceDecodingIdsKernel<ElementsPerThreads><<<grids, blocks, 0, stream>>>(
-      beam_scores, vocab_size, force_ids, id_len, step
-  );
+      beam_scores, vocab_size, force_ids, id_len, step);
 }
 
 template <typename T>
@@ -1624,8 +1675,7 @@ __global__ void SaveNoSpeechProbsKernel(
     const int batch_size,
     const int num_beams,
     const int vocab_size,
-    const int no_speech_token_id
-) {
+    const int no_speech_token_id) {
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   if (b < batch_size) {
     int64_t src_offset = b * num_beams * vocab_size + no_speech_token_id;
@@ -1635,20 +1685,19 @@
 template <typename T>
 void LaunchSaveNoSpeechProbs(
-    T* result_no_speech_probs, /* [batch]*/
-    const float* probs, /* [batch, num_beams, vocab_size]*/
+    T* result_no_speech_probs,  /* [batch]*/
+    const float* probs,         /* [batch, num_beams, vocab_size]*/
     const int batch_size,
     const int num_beams,
     const int vocab_size,
     const int no_speech_token_id,
-    cudaStream_t stream
-) {
+    cudaStream_t stream) {
   int tpb = 256;
   int bpg = (batch_size + 255) / 256;
 
   typedef typename ToCudaType<T>::MappedType CudaT;
   SaveNoSpeechProbsKernel<CudaT><<<bpg, tpb, 0, stream>>>(
-      (CudaT*)result_no_speech_probs, probs, batch_size, num_beams, vocab_size, no_speech_token_id);
+      (CudaT*)result_no_speech_probs, probs, batch_size, num_beams, vocab_size, no_speech_token_id);
 }
 
 template void LaunchSaveNoSpeechProbs<float>(
     float* result_no_speech_probs,
     const float* probs,
     const int batch_size,
     const int num_beams,
     const int vocab_size,
     const int no_speech_token_id,
-    cudaStream_t stream
-);
+    cudaStream_t stream);
 
 template void LaunchSaveNoSpeechProbs<MLFloat16>(
     MLFloat16* result_no_speech_probs,
     const float* probs,
     const int batch_size,
     const int num_beams,
     const int vocab_size,
     const int no_speech_token_id,
-    cudaStream_t stream
-);
+    cudaStream_t stream);
 
 }  // namespace cuda
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h
index 281cb6c725975..8b3c560050934 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h
@@ -6,6 +6,8 @@
 #include 
 #include 
 #include 
+#include 
+#include "contrib_ops/cpu/transformers/generation_shared.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -49,6 +51,21 @@ struct HypothesisScore {
   const int32_t* hypothesis;
   int hypothesis_length;
   float score;
+
+#ifdef DEBUG_GENERATION
+  __device__ void Print() const {
+    printf("HypothesisScore (hypothesis_length=%d, score=%f) \n", hypothesis_length, score);
+    printf("  hypothesis:");
+    if (hypothesis_length > 0 && hypothesis != NULL) {
+    for (int i = 0; i < hypothesis_length; ++i) {
+    printf("%d ", hypothesis[i]);
+    }
+    } else {
+    printf("(empty)");
+    }
+    printf("\n");
+}
+#endif
 };
 
 struct BeamHypotheses {
@@ -71,6 +88,22 @@ struct BeamHypotheses {
       int pad_token_id,      // pad token
       int32_t* sequences,    // buffer with pad token, shape (num_return_sequences, max_length)
       T* sequences_scores);  // buffer for sequence scores, with shape (num_return_sequences)
+
+#ifdef DEBUG_GENERATION
+  __device__ void Print() const {
+    printf("BeamHypotheses:\n");
+    printf("  beams_count: %d\n", beams_count_);
+    printf("  beams_used: %d\n", beams_used_);
+    printf("  length_penalty: %f\n", length_penalty_);
+    printf("  done: %s\n", done_ ? "true" : "false");
+
+    printf("  beams:\n");
+    for (int i = 0; i < beams_used_; ++i) {
+    printf("    Beam %d:\n", i + 1);
+    beams_[i].Print();
+    }
+  }
+#endif
 };
 
 struct BeamScorerState {
@@ -82,8 +115,22 @@ struct BeamScorerState {
   int eos_token_id_;
   bool early_stopping_;
   int not_done_count_;  // When zero, every batch entry is done (starts at batch_size_)
-
   int hypothesis_buffer_used_;  // Offset of available buffer, or length of used buffer.
+
+#ifdef DEBUG_GENERATION
+  __host__ __device__ void Print(bool is_cpu) const {
+    printf("BeamScorerState (cpu=%d) Dump:\n", is_cpu ? 1 : 0);
+    printf("  batch_size_: %d\n", batch_size_);
+    printf("  num_beams_: %d\n", num_beams_);
+    printf("  max_length_: %d\n", max_length_);
+    printf("  num_return_sequences_: %d\n", num_return_sequences_);
+    printf("  pad_token_id_: %d\n", pad_token_id_);
+    printf("  eos_token_id_: %d\n", eos_token_id_);
+    printf("  early_stopping_: %s\n", early_stopping_ ? "true" : "false");
+    printf("  not_done_count_: %d\n", not_done_count_);
+    printf("  hypothesis_buffer_used_: %d\n", hypothesis_buffer_used_);
+  }
+#endif
 };
 
 void LaunchInitializeBeamHypotheses(gsl::span<BeamHypotheses> beam_hyps, float length_penalty, gsl::span<HypothesisScore> beams, int num_beams, cudaStream_t stream);

diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc
index 1ae15afdf7482..9da19cb591c09 100644
--- a/onnxruntime/test/contrib_ops/beam_search_test.cc
+++ b/onnxruntime/test/contrib_ops/beam_search_test.cc
@@ -423,9 +423,6 @@ TEST(BeamSearchTest, DummyT5WithOuterScopeInitializers) {
 }
 
 TEST(BeamSearchTest, DummyT5WithSequenceInputIds) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   // dummy_t5_with_sequence_input_ids.onnx model generated using following command:
   //  python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5_with_sequence_input_ids.onnx --sequence-as-input
   ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5_with_sequence_input_ids.onnx"));

From 2f65d8d402e12274d557c248accfac5654418651 Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Wed, 5 Feb 2025 17:52:55 -0800
Subject: [PATCH 2/2] format

---
 .../cuda/transformers/generation_cuda_impl.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h
index 8b3c560050934..15bb4f469ba3d 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h
@@ -57,14 +57,14 @@ struct HypothesisScore {
     printf("HypothesisScore (hypothesis_length=%d, score=%f) \n", hypothesis_length, score);
     printf("  hypothesis:");
     if (hypothesis_length > 0 && hypothesis != NULL) {
-    for (int i = 0; i < hypothesis_length; ++i) {
-    printf("%d ", hypothesis[i]);
-    }
+      for (int i = 0; i < hypothesis_length; ++i) {
+        printf("%d ", hypothesis[i]);
+      }
     } else {
-    printf("(empty)");
+      printf("(empty)");
     }
     printf("\n");
-}
+  }
 #endif
 };
 
@@ -99,8 +99,8 @@ struct BeamHypotheses {
 
     printf("  beams:\n");
     for (int i = 0; i < beams_used_; ++i) {
-    printf("    Beam %d:\n", i + 1);
%d:\n", i + 1); - beams_[i].Print(); + printf(" Beam %d:\n", i + 1); + beams_[i].Print(); } } #endif @@ -114,7 +114,7 @@ struct BeamScorerState { int pad_token_id_; int eos_token_id_; bool early_stopping_; - int not_done_count_; // When zero, every batch entry is done (starts at batch_size_) + int not_done_count_; // When zero, every batch entry is done (starts at batch_size_) int hypothesis_buffer_used_; // Offset of available buffer, or length of used buffer. #ifdef DEBUG_GENERATION