1
+ import json
1
2
import re
2
3
from os import getenv
3
4
from typing import NamedTuple
4
5
6
+ from dotenv import load_dotenv
5
7
from openai import OpenAI
6
8
from openai .types .chat import (
7
9
ChatCompletionMessageParam ,
8
10
)
9
11
10
- SYSTEM_INSTRUCTION_PART_1_PATH = "src/ai/prompts/mesop_overview.txt"
11
- # SYSTEM_INSTRUCTION_PART_2_PATH = "src/ai/prompts/mini_docs.txt"
12
-
13
- with open (SYSTEM_INSTRUCTION_PART_1_PATH ) as f :
14
- SYSTEM_INSTRUCTION_PART_1 = f .read ()
15
-
16
- # with open(SYSTEM_INSTRUCTION_PART_2_PATH) as f:
17
- # SYSTEM_INSTRUCTION_PART_2 = f.read()
18
-
19
- # Intentionally skip the more extensive system instruction with docs for now.
20
- SYSTEM_INSTRUCTION = SYSTEM_INSTRUCTION_PART_1 # + SYSTEM_INSTRUCTION_PART_2
21
- PROMPT_PATH = "src/ai/prompts/revise_prompt.txt"
22
-
23
- with open (PROMPT_PATH ) as f :
24
- REVISE_APP_BASE_PROMPT = f .read ().strip ()
12
+ load_dotenv ()
25
13
26
14
EDIT_HERE_MARKER = " # <--- EDIT HERE"
27
15
@@ -31,6 +19,11 @@ class ApplyPatchResult(NamedTuple):
31
19
result : str
32
20
33
21
22
def read_file(filepath: str) -> str:
  """Return the contents of *filepath* with surrounding whitespace stripped."""
  with open(filepath) as handle:
    contents = handle.read()
  return contents.strip()
26
+
34
27
def apply_patch (original_code : str , patch : str ) -> ApplyPatchResult :
35
28
# Extract the diff content
36
29
diff_pattern = r"<<<<<<< ORIGINAL(.*?)=======\n(.*?)>>>>>>> UPDATED"
@@ -64,24 +57,87 @@ def apply_patch(original_code: str, patch: str) -> ApplyPatchResult:
64
57
)
65
58
66
59
60
class MessageFormatter:
  """Builds the chat messages sent to the model for an app-revision request.

  The revise-app prompt template is expected to contain the placeholders
  <APP_CODE> and <APP_CHANGES>, which are substituted per request.
  """

  def __init__(self, system_instruction: str, revise_app_prompt: str):
    self.system_instruction = system_instruction
    self.revise_app_prompt = revise_app_prompt

  def format_messages(
    self, code: str, user_input: str, line_number: int | None
  ) -> list[ChatCompletionMessageParam]:
    """Return [system, user] messages for the chat completion call.

    When line_number (1-indexed) is provided and within range, the edit
    sentinel is appended to that line of the code before substitution.
    """
    if line_number is not None:
      lines = code.splitlines()
      index = line_number - 1
      if 0 <= index < len(lines):
        lines[index] = lines[index] + EDIT_HERE_MARKER
      code = "\n".join(lines)

    prompt = self.revise_app_prompt
    prompt = prompt.replace("<APP_CODE>", code)
    prompt = prompt.replace("<APP_CHANGES>", user_input)

    return [
      {"role": "system", "content": self.system_instruction},
      {"role": "user", "content": prompt},
    ]
83
+
84
+
85
def MakeDefaultMessageFormatter():
  """Build the standard MessageFormatter from the on-disk prompt files."""
  overview = read_file("src/ai/prompts/mesop_overview.txt")
  revise_base = read_file("src/ai/prompts/revise_prompt_base.txt")
  revise_short = read_file("src/ai/prompts/revise_prompt_shorter.txt")
  combined_prompt = "\n\n".join([revise_base, revise_short])
  return MessageFormatter(overview, combined_prompt)
90
+
91
+
92
def MakeMessageFormatterShorterUserMsg():
  """Formats user messages with a shorter prompt.

  We use a shorter prompt since we will be including goldens that
  have not been fine-tuned yet. This allows us to test new training
  data without having to fine tune all the time.

  Instead the main user instruction prompt will be bundled with user
  instructions instead.
  """
  overview = read_file("src/ai/prompts/mesop_overview.txt")
  revise_base = read_file("src/ai/prompts/revise_prompt_base.txt")
  short_prompt = read_file("src/ai/prompts/revise_prompt_shorter.txt")
  # Fold the revise instructions into the system message so the per-request
  # user message can stay short.
  system = "\n\n".join([overview, revise_base])
  return MessageFormatter(system, short_prompt)
108
+
109
+
110
def load_unused_goldens() -> list:
  """Load golden user/assistant message pairs to use as few-shot examples.

  Reads a JSONL dataset where each row is {"messages": [system, user,
  assistant]} and collects the user and assistant messages in order.

  Returns:
    The collected messages (with the first user message dropped — per the
    original note it duplicates instructions already in the system prompt),
    or [] when the dataset file is missing.
  """
  goldens_path = "ft/gen/formatted_dataset_for_prompting.jsonl"
  new_goldens = []
  num_rows = 0
  try:
    with open(goldens_path) as f:
      for row in f:
        num_rows += 1
        messages = json.loads(row)["messages"]
        new_goldens.append(messages[1])
        new_goldens.append(messages[2])
    # Fix: guard the pop — an existing-but-empty dataset file previously
    # raised an uncaught IndexError here.
    if new_goldens:
      new_goldens.pop(0)  # Remove the redundant system instruction
    print(f"Adding {num_rows} additional examples to prompt.")
  except FileNotFoundError as e:
    print(e)

  return new_goldens
127
+
128
+
129
# Module-level configuration: pick the prompt strategy once, at import time,
# based on the MESOP_AI_INCLUDE_NEW_GOLDENS environment variable.
if getenv("MESOP_AI_INCLUDE_NEW_GOLDENS"):
  message_formatter = MakeMessageFormatterShorterUserMsg()
  # NOTE(review): despite the name, goldens_path holds the *list* of golden
  # few-shot messages returned by load_unused_goldens(), not a file path —
  # consider renaming (callers below reference it, so rename them together).
  goldens_path = load_unused_goldens()
else:
  message_formatter = MakeDefaultMessageFormatter()
  goldens_path = []
135
+
136
+
67
137
def format_messages(
  code: str, user_input: str, line_number: int | None
) -> list[ChatCompletionMessageParam]:
  """Format chat messages for an app-revision request.

  Delegates to the module-level formatter selected at import time (default
  prompts, or the shorter-user-message variant when
  MESOP_AI_INCLUDE_NEW_GOLDENS is set).
  """
  return message_formatter.format_messages(code, user_input, line_number)
85
141
86
142
87
143
def adjust_mesop_app_stream (
@@ -97,9 +153,12 @@ def adjust_mesop_app_stream(
97
153
"""
98
154
messages = format_messages (code , user_input , line_number )
99
155
156
+ if goldens_path :
157
+ messages = [messages [0 ], * goldens_path , messages [1 ]]
158
+
100
159
return client .chat .completions .create (
101
160
model = model ,
102
- max_tokens = 10_000 ,
161
+ max_tokens = 16_384 ,
103
162
messages = messages ,
104
163
stream = True ,
105
164
)
@@ -117,10 +176,12 @@ def adjust_mesop_app_blocking(
117
176
Returns the code diff.
118
177
"""
119
178
messages = format_messages (code , user_input , line_number )
179
+ if goldens_path :
180
+ messages = [messages [0 ], * goldens_path , messages [1 ]]
120
181
121
182
response = client .chat .completions .create (
122
183
model = model ,
123
- max_tokens = 10_000 ,
184
+ max_tokens = 16_384 ,
124
185
messages = messages ,
125
186
stream = False ,
126
187
)
0 commit comments