Commit 563a410

lgeiger authored and kpe committed

internal merge of PR tensorflow#1350
PiperOrigin-RevId: 228806666
1 parent 87bc13f commit 563a410

File tree

1 file changed: +7 −79 lines changed


tensor2tensor/utils/decoding.py

Lines changed: 7 additions & 79 deletions

@@ -407,18 +407,13 @@ def input_fn(params):
       return dataset
   else:
     def input_fn():
-      if has_input:
-        input_gen = _decode_batch_input_fn(
-            num_decode_batches, sorted_inputs,
-            inputs_vocab, decode_hp.batch_size,
-            decode_hp.max_input_size, task_id=decode_hp.multiproblem_task_id)
-      else:
-        input_gen = _decode_batch_input_fn_no_padding(sorted_inputs=sorted_inputs, max_batch_size=decode_hp.batch_size,
-                        vocabulary=inputs_vocab, max_input_size=decode_hp.max_input_size,
-                        decode_hp=decode_hp)
+      input_gen = _decode_batch_input_fn(
+          num_decode_batches, sorted_inputs,
+          inputs_vocab, decode_hp.batch_size,
+          decode_hp.max_input_size, task_id=decode_hp.multiproblem_task_id)
       gen_fn = make_input_fn_from_generator(input_gen)
       example = gen_fn()
       return _decode_input_tensor_to_features_dict(example, hparams)
   decodes = []
   result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path)
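
This hunk removes the has_input branch: decoding now always batches through _decode_batch_input_fn, which sorts the inputs and pads every example in a batch up to a shared length. As a rough illustration of that padded, fixed-size batching (a minimal self-contained sketch, not the tensor2tensor API; the function name, token ids, and pad/EOS values below are all made up):

import numpy as np

def padded_batches(encoded_inputs, batch_size, eos_id=1, pad_id=0):
  # Sort by length so examples in a batch need similar amounts of padding.
  encoded_inputs = sorted(encoded_inputs, key=len)
  for start in range(0, len(encoded_inputs), batch_size):
    batch = encoded_inputs[start:start + batch_size]
    # Pad every example (plus an EOS token) up to the batch maximum.
    batch_length = max(len(ids) for ids in batch) + 1
    padded = [ids + [eos_id] + [pad_id] * (batch_length - len(ids) - 1)
              for ids in batch]
    yield {"inputs": np.array(padded, dtype=np.int32)}

# Toy usage: three "sentences" already encoded to ids.
for batch in padded_batches([[5, 6], [7], [8, 9, 10]], batch_size=2):
  print(batch["inputs"].shape)  # (2, 3), then (1, 4)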

@@ -648,73 +643,6 @@ def _decode_batch_input_fn(num_decode_batches, sorted_inputs, vocabulary,
         "inputs": np.array(final_batch_inputs).astype(np.int32),
     }
 
-def _decode_batch_input_fn_no_padding(sorted_inputs, max_batch_size, vocabulary, max_input_size, decode_hp):
-  """Generator to produce batches of same-length inputs (batch size will be variable)."""
-
-  # First reverse all the input sentences so that if you're going to get OOMs,
-  # you'll see it in the first batch.
-  sorted_inputs.reverse()
-
-  # Get variable batch sizes.
-  last_batch_length = None
-  batch_lengths, batch_indices = [], []
-  for batch_index, elm in enumerate(sorted_inputs):
-    # Batch length is the number of space-separated tokens.
-    this_batch_length = len(elm.split(' '))
-    if max_input_size > 0:
-      if this_batch_length > max_input_size:
-        this_batch_length = max_input_size
-    if this_batch_length != last_batch_length:
-      batch_lengths.append(this_batch_length)
-      batch_indices.append(batch_index)
-      last_batch_length = this_batch_length
-  batch_indices.append(len(sorted_inputs))
-
-  # Ensure no batch exceeds the maximum batch size.
-  batch_sizes = np.diff(batch_indices)
-  final_batch_sizes = []
-  final_batch_lengths = []
-  for ii, bs in enumerate(batch_sizes):
-    if bs < max_batch_size:
-      final_batch_sizes.append(bs)
-      final_batch_lengths.append(batch_lengths[ii])
-    else:
-      full_batches = bs // max_batch_size
-      partial_batch = bs % max_batch_size
-      for _ in range(full_batches):
-        final_batch_sizes.append(max_batch_size)
-        final_batch_lengths.append(batch_lengths[ii])
-      if partial_batch > 0:
-        final_batch_sizes.append(partial_batch)
-        final_batch_lengths.append(batch_lengths[ii])
-
-  # Continue with now-variable batch sizes; no need for padding.
-  last_index = 0
-  for b, batch_size in enumerate(final_batch_sizes):
-    tf.logging.info("Decoding batch %d" % b)
-    # Batch length is the same for the entire batch; add one term for the
-    # optional <EOS> token.
-    batch_length = min(max_input_size, final_batch_lengths[b]) + 1
-    batch_inputs = []
-    for inputs in sorted_inputs[last_index:last_index + batch_size]:
-      input_ids = vocabulary.encode(inputs)
-      if max_input_size > 0:
-        # For language modeling problems, more recent inputs are often more
-        # important.
-        input_ids = input_ids[-max_input_size:]
-      # Padding and <EOS> removed -- for language modeling problems.
-      batch_inputs.append(input_ids)
-    last_index += batch_size
-
-    final_batch_inputs = []
-    # Ensure a consistent batch length.
-    for in_ids in batch_inputs:
-      assert len(in_ids) == batch_length
-      final_batch_inputs.append(in_ids)
-
-    yield {
-        "inputs": np.array(final_batch_inputs).astype(np.int32),
-    }
 
 def _interactive_input_fn(hparams, decode_hp):
   """Generator that reads from the terminal and yields "interactive inputs".
