Skip to content

Commit ad0f7ba

Browse files
committed
Update llm_lib to include untuned examples
- Allow untuned examples to be used as part of the chat prompt - Updates format goldens to generate the untuned examples subset - Minor adjustments to prompt
1 parent 361914c commit ad0f7ba

File tree

4 files changed

+141
-48
lines changed

4 files changed

+141
-48
lines changed

ai/src/ai/common/llm_lib.py

+93-32
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,15 @@
1+
import json
12
import re
23
from os import getenv
34
from typing import NamedTuple
45

6+
from dotenv import load_dotenv
57
from openai import OpenAI
68
from openai.types.chat import (
79
ChatCompletionMessageParam,
810
)
911

10-
SYSTEM_INSTRUCTION_PART_1_PATH = "src/ai/prompts/mesop_overview.txt"
11-
# SYSTEM_INSTRUCTION_PART_2_PATH = "src/ai/prompts/mini_docs.txt"
12-
13-
with open(SYSTEM_INSTRUCTION_PART_1_PATH) as f:
14-
SYSTEM_INSTRUCTION_PART_1 = f.read()
15-
16-
# with open(SYSTEM_INSTRUCTION_PART_2_PATH) as f:
17-
# SYSTEM_INSTRUCTION_PART_2 = f.read()
18-
19-
# Intentionally skip the more extensive system instruction with docs for now.
20-
SYSTEM_INSTRUCTION = SYSTEM_INSTRUCTION_PART_1 # + SYSTEM_INSTRUCTION_PART_2
21-
PROMPT_PATH = "src/ai/prompts/revise_prompt.txt"
22-
23-
with open(PROMPT_PATH) as f:
24-
REVISE_APP_BASE_PROMPT = f.read().strip()
12+
load_dotenv()
2513

2614
EDIT_HERE_MARKER = " # <--- EDIT HERE"
2715

@@ -31,6 +19,11 @@ class ApplyPatchResult(NamedTuple):
3119
result: str
3220

3321

22+
def read_file(filepath: str) -> str:
  """Read a text file and return its contents with surrounding whitespace stripped."""
  with open(filepath) as file:
    contents = file.read()
  return contents.strip()
25+
26+
3427
def apply_patch(original_code: str, patch: str) -> ApplyPatchResult:
3528
# Extract the diff content
3629
diff_pattern = r"<<<<<<< ORIGINAL(.*?)=======\n(.*?)>>>>>>> UPDATED"
@@ -64,24 +57,87 @@ def apply_patch(original_code: str, patch: str) -> ApplyPatchResult:
6457
)
6558

6659

60+
class MessageFormatter:
  """Builds the chat messages for a revise-app request.

  Holds a system instruction and a revise-app prompt template containing the
  `<APP_CODE>` and `<APP_CHANGES>` placeholders.
  """

  def __init__(self, system_instruction: str, revise_app_prompt: str):
    self.system_instruction = system_instruction
    self.revise_app_prompt = revise_app_prompt

  def format_messages(
    self, code: str, user_input: str, line_number: int | None
  ) -> list[ChatCompletionMessageParam]:
    """Returns a [system, user] message pair for the chat API.

    When `line_number` (1-indexed) is given and in range, the corresponding
    line of `code` is tagged with EDIT_HERE_MARKER so the model knows where
    the user is editing.
    """
    if line_number is not None:
      lines = code.splitlines()
      if 1 <= line_number <= len(lines):
        lines[line_number - 1] = lines[line_number - 1] + EDIT_HERE_MARKER
      code = "\n".join(lines)

    prompt = self.revise_app_prompt
    prompt = prompt.replace("<APP_CODE>", code)
    prompt = prompt.replace("<APP_CHANGES>", user_input)

    system_message: ChatCompletionMessageParam = {
      "role": "system",
      "content": self.system_instruction,
    }
    user_message: ChatCompletionMessageParam = {
      "role": "user",
      "content": prompt,
    }
    return [system_message, user_message]
83+
84+
85+
def MakeDefaultMessageFormatter():
  """Creates the default MessageFormatter.

  The full revise instructions (base + shorter prompt) are delivered in the
  user message; the system message carries only the Mesop overview.
  Prompt file paths are hard-coded relative to the ai/ working directory.
  """
  overview = read_file("src/ai/prompts/mesop_overview.txt")
  revise_prompt = "\n\n".join(
    [
      read_file("src/ai/prompts/revise_prompt_base.txt"),
      read_file("src/ai/prompts/revise_prompt_shorter.txt"),
    ]
  )
  return MessageFormatter(overview, revise_prompt)
90+
91+
92+
def MakeMessageFormatterShorterUserMsg():
  """Creates a MessageFormatter that keeps the user message short.

  We use a shorter user prompt since goldens that have not been fine-tuned
  yet will be included in the chat prompt; this lets new training data be
  tested without re-running fine-tuning each time.

  To compensate, the full revise instructions are moved out of the user
  message and bundled into the system instruction instead.
  """
  overview = read_file("src/ai/prompts/mesop_overview.txt")
  revise_instructions = read_file("src/ai/prompts/revise_prompt_base.txt")
  short_prompt = read_file("src/ai/prompts/revise_prompt_shorter.txt")
  system = overview + "\n\n" + revise_instructions
  return MessageFormatter(system, short_prompt)
108+
109+
110+
def load_unused_goldens():
  """Loads not-yet-fine-tuned golden examples to splice into chat prompts.

  Reads the JSONL dataset produced by `format_goldens.py
  --skip_fine_tuned_goldens`; each row is `{"messages": [system, user,
  assistant]}`. The user/assistant pairs are collected as few-shot examples.

  Best-effort: if the dataset file is missing, prints the error and returns
  an empty list so the caller falls back to a plain prompt.
  """
  goldens_path = "ft/gen/formatted_dataset_for_prompting.jsonl"
  new_goldens = []
  num_rows = 0
  try:
    with open(goldens_path) as f:
      for row in f:
        num_rows += 1
        messages = json.loads(row)["messages"]
        new_goldens.append(messages[1])
        new_goldens.append(messages[2])
      # Guard against an empty dataset file: pop(0) on an empty list would
      # raise IndexError, which is not caught below.
      if new_goldens:
        # NOTE(review): the comment in the original said this removes "the
        # redundant system instruction", but messages[0] (the system message)
        # is never appended — this actually drops the first *user* message,
        # leaving the examples assistant-first. Confirm against the JSONL
        # schema whether this is intended.
        new_goldens.pop(0)
      print(f"Adding {num_rows} additional examples to prompt.")
  except FileNotFoundError as e:
    print(e)

  return new_goldens
127+
128+
129+
# Opt-in via environment variable: when MESOP_AI_INCLUDE_NEW_GOLDENS is set
# (to any non-empty value), use the shorter user-message formatter and splice
# the not-yet-fine-tuned golden examples into every chat prompt.
if getenv("MESOP_AI_INCLUDE_NEW_GOLDENS"):
  message_formatter = MakeMessageFormatterShorterUserMsg()
  # NOTE(review): despite the name, `goldens_path` holds a *list of example
  # messages*, not a filesystem path — consider renaming (e.g. extra_goldens).
  # Renaming would also require updating adjust_mesop_app_stream/_blocking.
  goldens_path = load_unused_goldens()
else:
  message_formatter = MakeDefaultMessageFormatter()
  goldens_path = []
135+
136+
67137
def format_messages(
  code: str, user_input: str, line_number: int | None
) -> list[ChatCompletionMessageParam]:
  """Module-level convenience wrapper around the configured formatter.

  Delegates to the module-global `message_formatter` selected at import time
  (default vs. shorter-user-message, depending on
  MESOP_AI_INCLUDE_NEW_GOLDENS). Kept so existing callers of
  `format_messages` keep working unchanged.
  """
  return message_formatter.format_messages(code, user_input, line_number)
85141

86142

87143
def adjust_mesop_app_stream(
@@ -97,9 +153,12 @@ def adjust_mesop_app_stream(
97153
"""
98154
messages = format_messages(code, user_input, line_number)
99155

156+
if goldens_path:
157+
messages = [messages[0], *goldens_path, messages[1]]
158+
100159
return client.chat.completions.create(
101160
model=model,
102-
max_tokens=10_000,
161+
max_tokens=16_384,
103162
messages=messages,
104163
stream=True,
105164
)
@@ -117,10 +176,12 @@ def adjust_mesop_app_blocking(
117176
Returns the code diff.
118177
"""
119178
messages = format_messages(code, user_input, line_number)
179+
if goldens_path:
180+
messages = [messages[0], *goldens_path, messages[1]]
120181

121182
response = client.chat.completions.create(
122183
model=model,
123-
max_tokens=10_000,
184+
max_tokens=16_384,
124185
messages=messages,
125186
stream=False,
126187
)

ai/src/ai/prompts/revise_prompt.txt renamed to ai/src/ai/prompts/revise_prompt_base.txt

+1-11
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Your task is to modify a Mesop app given the code and a description.
33
Make sure to remember these rules when making modifications:
44
1. For the @me.page decorator, keep it the same as the original *unless* you need to modify on_load.
55
2. Event handler functions cannot use lambdas. You must use functions.
6-
3. Event handle functions only pass in the event type. They do not accept extra parameters.
6+
3. Event handler functions only pass in the event type. They do not accept extra parameters.
77
4. For padding, make sure to use the `me.Padding` object rather than a string or int.
88
5. For margin, make sure to use the `me.Margin` object rather than a string or int.
99
6. For border, make sure to use the `me.Border` and `me.BorderSide` objects rather than a string.
@@ -143,13 +143,3 @@ def page():
143143
>>>>>>> UPDATED
144144

145145
OK, now that I've shown you an example, let's do this for real.
146-
147-
Existing app code:
148-
```
149-
<APP_CODE>
150-
```
151-
152-
User instructions:
153-
<APP_CHANGES>
154-
155-
Diff output:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Existing app code:
2+
```
3+
<APP_CODE>
4+
```
5+
6+
User instructions:
7+
<APP_CHANGES>
8+
9+
Diff output:

ai/src/format_goldens.py

+38-5
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,22 @@
22
Formats the golden dataset for the fine-tuning process.
33
"""
44

5+
import argparse
56
import json
67
import os
8+
from datetime import datetime
79
from typing import Any
810

9-
from ai.common.llm_lib import format_messages
11+
from ai.common.llm_lib import (
12+
MakeDefaultMessageFormatter,
13+
MakeMessageFormatterShorterUserMsg,
14+
)
1015

16+
# TODO: Allow this to be configurable
17+
FINE_TUNING_CUTOFF = datetime(2024, 8, 2, 0, 0)
1118

12-
def process_goldens():
19+
20+
def process_goldens(skip_fine_tuned_goldens: bool = False):
1321
dataset: list[dict[str, Any]] = []
1422
outputs_dir = "ft/goldens"
1523

@@ -20,6 +28,13 @@ def process_goldens():
2028
diff_path = os.path.join(dir_path, "diff.txt")
2129
line_number: int | None = None
2230
meta_path = os.path.join(dir_path, "metadata.json")
31+
32+
_, timestamp = os.path.basename(dir_path).rsplit("_", 1)
33+
creation_date = datetime.strptime(timestamp, "%Y%m%d%H%M")
34+
print(creation_date)
35+
if skip_fine_tuned_goldens and creation_date < FINE_TUNING_CUTOFF:
36+
continue
37+
2338
if os.path.exists(meta_path):
2439
with open(meta_path) as meta_file:
2540
meta = json.load(meta_file)
@@ -39,10 +54,15 @@ def process_goldens():
3954
else:
4055
code = ""
4156

57+
if skip_fine_tuned_goldens:
58+
formatter = MakeMessageFormatterShorterUserMsg()
59+
else:
60+
formatter = MakeDefaultMessageFormatter()
61+
4262
dataset.append(
4363
{
4464
"messages": [
45-
*format_messages(code, prompt, line_number),
65+
*formatter.format_messages(code, prompt, line_number),
4666
{
4767
"role": "assistant",
4868
"content": diff,
@@ -55,11 +75,24 @@ def process_goldens():
5575

5676

5777
if __name__ == "__main__":
58-
formatted_dataset = process_goldens()
78+
parser = argparse.ArgumentParser()
79+
parser.add_argument(
80+
"--skip_fine_tuned_goldens",
81+
action="store_true",
82+
help="Generates a formatted dataset with goldens that have not been fine tuned.",
83+
)
84+
args = parser.parse_args()
85+
86+
formatted_dataset = process_goldens(args.skip_fine_tuned_goldens)
5987
print(f"Processed {len(formatted_dataset)} samples.")
6088
# create gen dir if it doesn't exist
6189
os.makedirs("ft/gen", exist_ok=True)
62-
full_path = os.path.join("ft/gen/formatted_dataset.jsonl")
90+
91+
if args.skip_fine_tuned_goldens:
92+
full_path = os.path.join("ft/gen/formatted_dataset_for_prompting.jsonl")
93+
else:
94+
full_path = os.path.join("ft/gen/formatted_dataset.jsonl")
95+
6396
# Append each sample as a JSON object on a separate line to a file
6497
with open(full_path, "w") as f:
6598
for sample in formatted_dataset:

0 commit comments

Comments
 (0)