NVIDIA · Zhilin123 · May 18, 2023 · May 18, 2023 · May 18, 2023
diff --git a/tools/customization_dataset_preparation/customization_dataset_preparation.py b/tools/customization_dataset_preparation/customization_dataset_preparation.py
@@ -92,13 +92,8 @@ def recommend_hyperparameters(df, model=None):
     """
     Makes recommendations on the batch_size to use for training, based on the dataset size
 
-    All hyperparameters except batch_size, max_batch_size and max_seq_length are hardcoded based on API defaults for now
     """
     potential_batch_sizes = [2, 4, 8, 12, 16, 32, 64, 128]
-    bs = 2
-    for potential_bs in potential_batch_sizes:
-        if 0.002 * len(df) > potential_bs:
-            bs = potential_bs
 
     max_bs = 128
     if len(df) < 128:
@@ -107,6 +102,8 @@ def recommend_hyperparameters(df, model=None):
             if potential_bs < len(df) * 0.9:
                 max_bs = potential_bs
 
+    bs = min(max_bs, 32)
+
     df_char_length = df.apply(lambda x: len(x.prompt) + len(x.completion), axis=1)
     length_by_chars = sorted(list(df_char_length))
     n_samples_under_99p5_limit = math.ceil(len(df_char_length) * 0.995)

diff --git a/tools/customization_dataset_preparation/tests/test_customization_dataset_preparation.py b/tools/customization_dataset_preparation/tests/test_customization_dataset_preparation.py
@@ -39,39 +39,43 @@
 def test_recommend_hyperparameters():
     df_100 = pd.DataFrame({'prompt': ['prompt'] * 100, 'completion': ['completion'] * 100})
     assert recommend_hyperparameters(df_100) == {
-        'batch_size': 2,
+        'batch_size': 32,
         'max_batch_size': 64,
         'num_virtual_tokens': 10,
-        'lr': 0.0001,
-        'epochs': 25,
+        'encoder_hidden_size': 1024,
+        'lr': 0.005,
+        'epochs': 10,
         'max_seq_length': 104,
     }
 
     df_1000 = pd.DataFrame({'prompt': ['prompt'] * 1000, 'completion': ['completion'] * 1000})
     assert recommend_hyperparameters(df_1000) == {
-        'batch_size': 2,
+        'batch_size': 32,
         'max_batch_size': 128,
         'num_virtual_tokens': 10,
-        'lr': 0.0001,
-        'epochs': 25,
+        'encoder_hidden_size': 2048,
+        'lr': 0.001,
+        'epochs': 10,
         'max_seq_length': 104,
     }
     df_10000 = pd.DataFrame({'prompt': ['prompt'] * 10000, 'completion': ['completion'] * 10000})
     assert recommend_hyperparameters(df_10000) == {
-        'batch_size': 16,
+        'batch_size': 32,
         'max_batch_size': 128,
         'num_virtual_tokens': 10,
-        'lr': 0.0001,
-        'epochs': 25,
+        'encoder_hidden_size': 4096,
+        'lr': 0.0005,
+        'epochs': 10,
         'max_seq_length': 104,
     }
     df_100000 = pd.DataFrame({'prompt': ['prompt'] * 100000, 'completion': ['completion'] * 100000})
     assert recommend_hyperparameters(df_100000) == {
-        'batch_size': 128,
+        'batch_size': 32,
         'max_batch_size': 128,
         'num_virtual_tokens': 10,
+        'encoder_hidden_size': 4096,
         'lr': 0.0001,
-        'epochs': 25,
+        'epochs': 10,
         'max_seq_length': 104,
     }