Optimizations and readability improvements #14

Open · wants to merge 5 commits into base: main
Changes from 1 commit
Makefile: 6 changes (5 additions & 1 deletion)

@@ -10,5 +10,9 @@ create_venv:
clean:
find . -type d -name "__pycache__" -exec rm -rf {} +

destroy: clean
reset_run:
find . -type d -name "results" -exec rm -rf {} +
find . -type d -name "contexts" -exec rm -rf {} +

destroy: clean reset_run
rm -rf ./$(VENV_NAME)
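
With these targets, `$ make reset_run` removes any `results` and `contexts` directories left over from a previous run, and `$ make destroy` now performs that cleanup (via the `clean` and `reset_run` prerequisites) before deleting the virtual environment.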
README.md: 2 changes (2 additions & 0 deletions)

@@ -19,6 +19,8 @@ $ pip install -r requirements.txt
You can then run the analysis on OpenAI or Anthropic models by running `main.py` with the command-line arguments shown below; an example invocation follows the list. `LLMNeedleHaystackTester` parameters can also be passed as command-line arguments, except for `model_to_test` and `evaluator`.
* `provider` - The provider of the model; available options are `openai` and `anthropic`. Defaults to `openai`.
* `evaluator` - The provider of the evaluator model; only `openai` is currently supported. Defaults to `openai`.
* `model_name` - Model name of the language model accessible by the provider. Defaults to `gpt-3.5-turbo-0125`.
* `evaluator_model_name` - Model name of the language model used by the evaluator. Defaults to `gpt-3.5-turbo-0125`.
* `api_key` - API key for the OpenAI or Anthropic provider. Can be passed as a command-line argument or as an environment variable named `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`, depending on the provider. Defaults to `None`.
* `evaluator_api_key` - API key for the OpenAI evaluator. Can be passed as a command-line argument or as an environment variable named `OPENAI_API_KEY`. Defaults to `None`.
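
For example, a hypothetical invocation exercising the new model flags might look like the line below; the exact flag syntax is an assumption here, since it depends on how `CommandArgs` is parsed into command-line arguments:

$ python main.py --provider openai --model_name gpt-4-0125-preview --evaluator_model_name gpt-3.5-turbo-0125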

main.py: 9 changes (6 additions & 3 deletions)

@@ -13,6 +13,8 @@
class CommandArgs():
provider: str = "openai"
evaluator: str = "openai"
model_name: Optional[str] = "gpt-3.5-turbo-0125"
evaluator_model_name: Optional[str] = "gpt-3.5-turbo-0125"
api_key: Optional[str] = None
evaluator_api_key: Optional[str] = None
needle: Optional[str] = "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n"
@@ -38,16 +40,17 @@ class CommandArgs():
def get_model_to_test(args: CommandArgs) -> ModelProvider:
match args.provider.lower():
case "openai":
return OpenAI(api_key=args.api_key)
return OpenAI(model_name=args.model_name, api_key=args.api_key)
case "anthropic":
return Anthropic(api_key=args.api_key)
return Anthropic(model_name=args.model_name, api_key=args.api_key)
case _:
raise ValueError(f"Invalid provider: {args.provider}")

def get_evaluator(args: CommandArgs) -> Evaluator:
match args.evaluator.lower():
case "openai":
return OpenAIEvaluator(question_asked=args.retrieval_question,
return OpenAIEvaluator(model_name=args.evaluator_model_name,
question_asked=args.retrieval_question,
true_answer=args.needle,
api_key=args.evaluator_api_key)
case _:
src/evaluators/openai_evaluator.py: 10 changes (7 additions & 3 deletions)

@@ -7,6 +7,7 @@
from langchain_community.chat_models import ChatOpenAI

class OpenAIEvaluator(Evaluator):
DEFAULT_MODEL_KWARGS: dict = dict(temperature=0)
CRITERIA = {"accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
@@ -17,11 +18,13 @@ class OpenAIEvaluator(Evaluator):

def __init__(self,
model_name: str = "gpt-3.5-turbo-0125",
model_kwargs: dict = DEFAULT_MODEL_KWARGS,
api_key: str = None,
true_answer: str = None,
question_asked: str = None):
question_asked: str = None,):
"""
:param model_name: The name of the model.
:param model_kwargs: Model configuration. Default is {temperature: 0}
:param api_key: The API key for OpenAI. Default is None.
:param true_answer: The true answer to the question asked.
:param question_asked: The question asked to the model.
@@ -31,6 +34,7 @@ def __init__(self,
raise ValueError("true_answer and question_asked must be supplied with init.")

self.model_name = model_name
self.model_kwargs = model_kwargs
self.true_answer = true_answer
self.question_asked = question_asked

@@ -40,8 +44,8 @@ def __init__(self,
self.api_key = api_key or os.getenv('OPENAI_API_KEY')

self.evaluator = ChatOpenAI(model=self.model_name,
temperature=0,
openai_api_key=self.api_key)
openai_api_key=self.api_key,
**self.model_kwargs)

def evaluate_response(self, response: str) -> int:
evaluator = load_evaluator(
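
For illustration, here is a minimal sketch of how a caller might override the new `model_kwargs` default of the evaluator; the import path, model name, and question/answer strings are assumptions for this example rather than values taken from the PR:

```python
# Hypothetical usage sketch -- not part of this PR.
from src.evaluators.openai_evaluator import OpenAIEvaluator  # import path assumed

evaluator = OpenAIEvaluator(
    model_name="gpt-4-0125-preview",
    # Forwarded to ChatOpenAI via **self.model_kwargs, replacing the
    # DEFAULT_MODEL_KWARGS value of dict(temperature=0).
    model_kwargs={"temperature": 0, "max_tokens": 256},
    true_answer="eat a sandwich and sit in Dolores Park on a sunny day",
    question_asked="What is the best thing to do in San Francisco?",
)
# api_key is omitted, so the constructor falls back to the
# OPENAI_API_KEY environment variable.
```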
src/llm_needle_haystack_tester.py: 145 changes (86 additions & 59 deletions)

@@ -64,6 +64,8 @@ def __init__(self,
"""
if not model_to_test:
raise ValueError("A language model must be provided to test.")
if not evaluator:
raise ValueError("An evaluator must be provided to evaluate the model's response.")
if not needle or not haystack_dir or not retrieval_question:
raise ValueError("Needle, haystack, and retrieval_question must be provided.")

@@ -77,13 +79,20 @@ def __init__(self,
self.save_contexts = save_contexts
self.seconds_to_sleep_between_completions = seconds_to_sleep_between_completions
self.print_ongoing_status = print_ongoing_status

self.context_dir = 'contexts'
self.results_dir = 'results'
self.result_file_format = '{model_name}_len_{context_length}_depth_{depth_percent}'
self.testing_results = []

if context_lengths is None:
if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None:
raise ValueError("Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied.")
else:
self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int)
self.context_lengths = self.get_intervals(context_lengths_min,
context_lengths_max,
context_lengths_num_intervals,
"linear")
else:
self.context_lengths = context_lengths

@@ -93,13 +102,13 @@ def __init__(self,
if document_depth_percents is None:
if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None:
raise ValueError("Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied.")

if document_depth_percent_interval_type == 'linear':
self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int)
elif document_depth_percent_interval_type == 'sigmoid':
self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)]
else:
if document_depth_percent_interval_type not in ['linear', 'sigmoid']:
raise ValueError("document_depth_percent_interval_type must be either 'sigmoid' or 'linear' if document_depth_percents is None.")

self.document_depth_percents = self.get_intervals(document_depth_percent_min,
document_depth_percent_max,
document_depth_percent_intervals,
document_depth_percent_interval_type)
else:
self.document_depth_percents = document_depth_percents

@@ -108,6 +117,17 @@ def __init__(self,

self.evaluation_model = evaluator

def get_intervals(self, min_depth, max_depth, num_intervals, interval_type):
linear_spacing = np.linspace(min_depth, max_depth, num=num_intervals, endpoint=True)

match interval_type:
case 'linear':
return np.round(linear_spacing).astype(int)
case 'sigmoid':
return [self.logistic(x) for x in linear_spacing]
case _:
return []

def logistic(self, x, L=100, x0=50, k=.1):
if x in [0, 100]:
return x
@@ -122,24 +142,20 @@ async def bound_evaluate_and_log(self, sem, *args):
await self.evaluate_and_log(*args)

async def run_test(self):
sem = Semaphore(self.num_concurrent_requests)

# Run through each iteration of context_lengths and depths
tasks = []
for context_length in self.context_lengths:
for depth_percent in self.document_depth_percents:
task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
tasks.append(task)
async with asyncio.TaskGroup() as tg:
sem = Semaphore(self.num_concurrent_requests)

# Wait for all tasks to complete
await asyncio.gather(*tasks)
# Run through each iteration of context_lengths and depths
for context_length in self.context_lengths:
for depth_percent in self.document_depth_percents:
task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
tg.create_task(task)

async def evaluate_and_log(self, context_length, depth_percent):
# Checks to see if you've already checked a length/percent/version.
# This helps if the program stop running and you want to restart later
if self.save_results:
if self.result_exists(context_length, depth_percent):
return
if self.save_results and self.result_exists(context_length, depth_percent):
return

# Go generate the required length context and place your needle statement in
context = await self.generate_context(context_length, depth_percent)
@@ -159,47 +175,45 @@ async def evaluate_and_log(self, context_length, depth_percent):
score = self.evaluation_model.evaluate_response(response)

results = {
# 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
'model' : self.model_name,
'context_length' : int(context_length),
'depth_percent' : float(depth_percent),
'version' : self.results_version,
'needle' : self.needle,
'model_response' : response,
'score' : score,
'test_duration_seconds' : test_elapsed_time,
'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
# 'context': context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
'model': self.model_name,
'context_length': int(context_length),
'depth_percent': float(depth_percent),
'version': self.results_version,
'needle': self.needle,
'model_response': response,
'score': score,
'test_duration_seconds': test_elapsed_time,
'test_timestamp_utc': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
}

self.testing_results.append(results)

if self.print_ongoing_status:
print (f"-- Test Summary -- ")
print (f"Duration: {test_elapsed_time:.1f} seconds")
print (f"Context: {context_length} tokens")
print (f"Depth: {depth_percent}%")
print (f"Score: {score}")
print (f"Response: {response}\n")
self.print_status(test_elapsed_time, context_length, depth_percent, score, response)

context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'
parsed_model_name = self.model_name.replace(".", "_")
context_file_location = self.result_file_format.format(model_name=parsed_model_name,
context_length=context_length,
depth_percent=int(depth_percent))

if self.save_contexts:
results['file_name'] = context_file_location

# Save the context to file for retesting
if not os.path.exists('contexts'):
os.makedirs('contexts')
if not os.path.exists(self.context_dir):
os.makedirs(self.context_dir)

with open(f'contexts/{context_file_location}_context.txt', 'w') as f:
with open(f'{self.context_dir}/{context_file_location}_context.txt', 'w') as f:
f.write(context)

if self.save_results:
# Save the context to file for retesting
if not os.path.exists('results'):
os.makedirs('results')
if not os.path.exists(self.results_dir):
os.makedirs(self.results_dir)

# Save the result to file for retesting
with open(f'results/{context_file_location}_results.json', 'w') as f:
with open(f'{self.results_dir}/{context_file_location}_results.json', 'w') as f:
json.dump(results, f)

if self.seconds_to_sleep_between_completions:
@@ -210,20 +224,21 @@ def result_exists(self, context_length, depth_percent):
Checks to see if a result has already been evaluated or not
"""

results_dir = 'results/'
if not os.path.exists(results_dir):
if not os.path.exists(self.results_dir):
return False

filename = self.result_file_format.format(model_name=self.model_name,
context_length=context_length,
depth_percent=depth_percent)
file_path = os.path.join(self.results_dir, f'{filename}.json')
if not os.path.exists(file_path):
[Review comment from the Contributor Author: We know the exact format of the filename, so instead of looping over every file, let's check whether that specific file exists; if it does, check the version number, otherwise return False.]

return False

for filename in os.listdir(results_dir):
if filename.endswith('.json'):
with open(os.path.join(results_dir, filename), 'r') as f:
result = json.load(f)
context_length_met = result['context_length'] == context_length
depth_percent_met = result['depth_percent'] == depth_percent
version_met = result.get('version', 1) == self.results_version
model_met = result['model'] == self.model_name
if context_length_met and depth_percent_met and version_met and model_met:
return True
with open(file_path, 'r') as f:
result = json.load(f)

if result.get('version', 1) == self.results_version:
return True
return False

async def generate_context(self, context_length, depth_percent):
@@ -265,9 +280,9 @@ def insert_needle(self, context, depth_percent, context_length):
period_tokens = self.model_to_test.encode_text_to_tokens('.')

# Then we iteration backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
while (insertion_point > 0) and (tokens_new_context[insertion_point-1] != period_tokens):
insertion_point -= 1
tokens_new_context = tokens_context[:insertion_point]
tokens_new_context = tokens_context[:insertion_point]

# Once we get there, then add in your needle, and stick the rest of your context in on the other end.
# Now we have a needle in a haystack
@@ -282,12 +297,16 @@ def get_context_length_in_tokens(self, context):

def read_context_files(self):
context = ""
current_context_length = 0
max_context_length = max(self.context_lengths)

while self.get_context_length_in_tokens(context) < max_context_length:
while current_context_length < max_context_length:
for file in glob.glob(f"{self.haystack_dir}/*.txt"):
with open(file, 'r') as f:
context += f.read()
file_content = f.read()

context += file_content
current_context_length += self.get_context_length_in_tokens(file_content)
return context

def encode_and_trim(self, context, context_length):
@@ -308,6 +327,14 @@ def print_start_test_summary(self):
print (f"- Needle: {self.needle.strip()}")
print ("\n\n")

def print_status(self, elapsed_time, context_length, depth_percent, score, response):
print (f"-- Test Summary -- ")
print (f"Duration: {elapsed_time:.1f} seconds")
print (f"Context: {context_length} tokens")
print (f"Depth: {depth_percent}%")
print (f"Score: {score}")
print (f"Response: {response}\n")

def start_test(self):
if self.print_ongoing_status:
self.print_start_test_summary()
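
A side note on the `run_test` rewrite above: `asyncio.TaskGroup` (Python 3.11+) awaits every task created inside its `async with` block when the block exits, which is why the explicit `asyncio.gather(*tasks)` call could be dropped, while the semaphore still caps how many requests run at once. A self-contained sketch of that pattern, independent of this repository:

```python
import asyncio

async def bounded_worker(sem: asyncio.Semaphore, item: int) -> None:
    # The semaphore caps how many workers run concurrently.
    async with sem:
        await asyncio.sleep(0.1)  # stand-in for an API call
        print(f"finished item {item}")

async def run_all(items: list[int], max_concurrency: int = 3) -> None:
    sem = asyncio.Semaphore(max_concurrency)
    # TaskGroup implicitly awaits every task it created when the
    # async-with block exits, so no asyncio.gather() call is needed.
    async with asyncio.TaskGroup() as tg:
        for item in items:
            tg.create_task(bounded_worker(sem, item))

if __name__ == "__main__":
    asyncio.run(run_all(list(range(10))))
```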
src/providers/anthropic.py: 13 changes (10 additions & 3 deletions)

@@ -6,9 +6,16 @@
from typing import Optional

class Anthropic(ModelProvider):
def __init__(self, model_name: str = "claude", api_key: str = None):
DEFAULT_MODEL_KWARGS: dict = dict(max_tokens_to_sample = 300,
temperature = 0)

def __init__(self,
model_name: str = "claude",
model_kwargs: dict = DEFAULT_MODEL_KWARGS,
api_key: str = None):
"""
:param model_name: The name of the model. Default is 'claude'.
:param model_kwargs: Model configuration. Default is {max_tokens_to_sample: 300, temperature: 0}
:param api_key: The API key for Anthropic. Default is None.
"""

@@ -19,6 +26,7 @@ def __init__(self, model_name: str = "claude", api_key: str = None):
raise ValueError("Either api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env.")

self.model_name = model_name
self.model_kwargs = model_kwargs
self.api_key = api_key or os.getenv('ANTHROPIC_API_KEY')

self.model = AsyncAnthropic(api_key=self.api_key)
@@ -32,9 +40,8 @@ def __init__(self, model_name: str = "claude", api_key: str = None):
async def evaluate_model(self, prompt: str) -> str:
response = await self.model.completions.create(
model=self.model_name,
max_tokens_to_sample=300,
prompt=prompt,
temperature=0)
**self.model_kwargs)
return response.completion

def generate_prompt(self, context: str, retrieval_question: str) -> str | list[dict[str, str]]: