From d8ae57453d64ebd3b7cddc47567dbef5baba19e8 Mon Sep 17 00:00:00 2001
From: Lazaro Hurtado
Date: Tue, 5 Mar 2024 00:03:45 -0800
Subject: [PATCH 1/3] small optimizations

---
 Makefile                           |   6 +-
 README.md                          |   2 +
 main.py                            |   9 +-
 src/evaluators/openai_evaluator.py |  10 +-
 src/llm_needle_haystack_tester.py  | 145 +++++++++++++++++------
 src/providers/anthropic.py         |  13 ++-
 src/providers/openai.py            |  13 ++-
 7 files changed, 126 insertions(+), 72 deletions(-)

diff --git a/Makefile b/Makefile
index 9062c0a1..9fd343ef 100644
--- a/Makefile
+++ b/Makefile
@@ -10,5 +10,9 @@ create_venv:
 clean:
 	find . -type d -name "__pycache__" -exec rm -rf {} +
 
-destroy: clean
+reset_run:
+	find . -type d -name "results" -exec rm -rf {} +
+	find . -type d -name "contexts" -exec rm -rf {} +
+
+destroy: clean reset_run
 	rm -rf ./$(VENV_NAME)
\ No newline at end of file
diff --git a/README.md b/README.md
index 2f21a37d..a1ecf63c 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,8 @@ $ pip install -r requirements.txt
 You can then run the analysis on OpenAI or Anthropic models by running `main.py` with the command line arguments shown below. `LLMNeedleHaystackTester` parameters can also be passed as command line arguments, except `model_to_test` and `evaluator` of course.
 * `provider` - The provider of the model, available options are `openai` and `anthropic`. Defaults to `openai`
 * `evaluator` - The provider for the evaluator model, only `openai` is currently supported. Defaults to `openai`.
+* `model_name` - Model name of the language model accessible by the provider. Defaults to `gpt-3.5-turbo-0125`
+* `evaluator_model_name` - Model name of the language model accessible by the evaluator. Defaults to `gpt-3.5-turbo-0125`
 * `api_key` - API key for either OpenAI or Anthropic provider. Can either be passed as a command line argument or an environment variable named `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` depending on the provider. Defaults to `None`.
 * `evaluator_api_key` - API key for OpenAI provider. Can either be passed as a command line argument or an environment variable named `OPENAI_API_KEY`. 
Defaults to `None` diff --git a/main.py b/main.py index 90b6cccf..c3f98d39 100644 --- a/main.py +++ b/main.py @@ -13,6 +13,8 @@ class CommandArgs(): provider: str = "openai" evaluator: str = "openai" + model_name: Optional[str] = "gpt-3.5-turbo-0125" + evaluator_model_name: Optional[str] = "gpt-3.5-turbo-0125" api_key: Optional[str] = None evaluator_api_key: Optional[str] = None needle: Optional[str] = "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" @@ -38,16 +40,17 @@ class CommandArgs(): def get_model_to_test(args: CommandArgs) -> ModelProvider: match args.provider.lower(): case "openai": - return OpenAI(api_key=args.api_key) + return OpenAI(model_name=args.model_name, api_key=args.api_key) case "anthropic": - return Anthropic(api_key=args.api_key) + return Anthropic(model_name=args.model_name, api_key=args.api_key) case _: raise ValueError(f"Invalid provider: {args.provider}") def get_evaluator(args: CommandArgs) -> Evaluator: match args.evaluator.lower(): case "openai": - return OpenAIEvaluator(question_asked=args.retrieval_question, + return OpenAIEvaluator(model_name=args.evaluator_model_name, + question_asked=args.retrieval_question, true_answer=args.needle, api_key=args.evaluator_api_key) case _: diff --git a/src/evaluators/openai_evaluator.py b/src/evaluators/openai_evaluator.py index 5d18bc8c..39b28295 100644 --- a/src/evaluators/openai_evaluator.py +++ b/src/evaluators/openai_evaluator.py @@ -7,6 +7,7 @@ from langchain_community.chat_models import ChatOpenAI class OpenAIEvaluator(Evaluator): + DEFAULT_MODEL_KWARGS: dict = dict(temperature=0) CRITERIA = {"accuracy": """ Score 1: The answer is completely unrelated to the reference. Score 3: The answer has minor relevance but does not align with the reference. @@ -17,11 +18,13 @@ class OpenAIEvaluator(Evaluator): def __init__(self, model_name: str = "gpt-3.5-turbo-0125", + model_kwargs: dict = DEFAULT_MODEL_KWARGS, api_key: str = None, true_answer: str = None, - question_asked: str = None): + question_asked: str = None,): """ :param model_name: The name of the model. + :param model_kwargs: Model configuration. Default is {temperature: 0} :param api_key: The API key for OpenAI. Default is None. :param true_answer: The true answer to the question asked. :param question_asked: The question asked to the model. 
@@ -31,6 +34,7 @@ def __init__(self, raise ValueError("true_answer and question_asked must be supplied with init.") self.model_name = model_name + self.model_kwargs = model_kwargs self.true_answer = true_answer self.question_asked = question_asked @@ -40,8 +44,8 @@ def __init__(self, self.api_key = api_key or os.getenv('OPENAI_API_KEY') self.evaluator = ChatOpenAI(model=self.model_name, - temperature=0, - openai_api_key=self.api_key) + openai_api_key=self.api_key, + **self.model_kwargs) def evaluate_response(self, response: str) -> int: evaluator = load_evaluator( diff --git a/src/llm_needle_haystack_tester.py b/src/llm_needle_haystack_tester.py index 5f188382..ee7be428 100644 --- a/src/llm_needle_haystack_tester.py +++ b/src/llm_needle_haystack_tester.py @@ -64,6 +64,8 @@ def __init__(self, """ if not model_to_test: raise ValueError("A language model must be provided to test.") + if not evaluator: + raise ValueError("An evaluator must be provided to evaluate the model's response.") if not needle or not haystack_dir or not retrieval_question: raise ValueError("Needle, haystack, and retrieval_question must be provided.") @@ -77,13 +79,20 @@ def __init__(self, self.save_contexts = save_contexts self.seconds_to_sleep_between_completions = seconds_to_sleep_between_completions self.print_ongoing_status = print_ongoing_status + + self.context_dir = 'contexts' + self.results_dir = 'results' + self.result_file_format = '{model_name}_len_{context_length}_depth_{depth_percent}' self.testing_results = [] if context_lengths is None: if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None: raise ValueError("Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied.") else: - self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int) + self.context_lengths = self.get_intervals(context_lengths_min, + context_lengths_max, + context_lengths_num_intervals, + "linear") else: self.context_lengths = context_lengths @@ -93,13 +102,13 @@ def __init__(self, if document_depth_percents is None: if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None: raise ValueError("Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied.") - - if document_depth_percent_interval_type == 'linear': - self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int) - elif document_depth_percent_interval_type == 'sigmoid': - self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)] - else: + if document_depth_percent_interval_type not in ['linear', 'sigmoid']: raise ValueError("document_depth_percent_interval_type must be either 'sigmoid' or 'linear' if document_depth_percents is None.") + + self.document_depth_percents = self.get_intervals(document_depth_percent_min, + document_depth_percent_max, + document_depth_percent_intervals, + document_depth_percent_interval_type) else: self.document_depth_percents = document_depth_percents @@ -108,6 +117,17 @@ def __init__(self, self.evaluation_model = evaluator + def get_intervals(self, 
min_depth, max_depth, num_intervals, interval_type): + linear_spacing = np.linspace(min_depth, max_depth, num=num_intervals, endpoint=True) + + match interval_type: + case 'linear': + return np.round(linear_spacing).astype(int) + case 'sigmoid': + return [self.logistic(x) for x in linear_spacing] + case _: + return [] + def logistic(self, x, L=100, x0=50, k=.1): if x in [0, 100]: return x @@ -122,24 +142,20 @@ async def bound_evaluate_and_log(self, sem, *args): await self.evaluate_and_log(*args) async def run_test(self): - sem = Semaphore(self.num_concurrent_requests) - - # Run through each iteration of context_lengths and depths - tasks = [] - for context_length in self.context_lengths: - for depth_percent in self.document_depth_percents: - task = self.bound_evaluate_and_log(sem, context_length, depth_percent) - tasks.append(task) + async with asyncio.TaskGroup() as tg: + sem = Semaphore(self.num_concurrent_requests) - # Wait for all tasks to complete - await asyncio.gather(*tasks) + # Run through each iteration of context_lengths and depths + for context_length in self.context_lengths: + for depth_percent in self.document_depth_percents: + task = self.bound_evaluate_and_log(sem, context_length, depth_percent) + tg.create_task(task) async def evaluate_and_log(self, context_length, depth_percent): # Checks to see if you've already checked a length/percent/version. # This helps if the program stop running and you want to restart later - if self.save_results: - if self.result_exists(context_length, depth_percent): - return + if self.save_results and self.result_exists(context_length, depth_percent): + return # Go generate the required length context and place your needle statement in context = await self.generate_context(context_length, depth_percent) @@ -159,47 +175,45 @@ async def evaluate_and_log(self, context_length, depth_percent): score = self.evaluation_model.evaluate_response(response) results = { - # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large. - 'model' : self.model_name, - 'context_length' : int(context_length), - 'depth_percent' : float(depth_percent), - 'version' : self.results_version, - 'needle' : self.needle, - 'model_response' : response, - 'score' : score, - 'test_duration_seconds' : test_elapsed_time, - 'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z') + # 'context': context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large. 
+ 'model': self.model_name, + 'context_length': int(context_length), + 'depth_percent': float(depth_percent), + 'version': self.results_version, + 'needle': self.needle, + 'model_response': response, + 'score': score, + 'test_duration_seconds': test_elapsed_time, + 'test_timestamp_utc': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z') } self.testing_results.append(results) if self.print_ongoing_status: - print (f"-- Test Summary -- ") - print (f"Duration: {test_elapsed_time:.1f} seconds") - print (f"Context: {context_length} tokens") - print (f"Depth: {depth_percent}%") - print (f"Score: {score}") - print (f"Response: {response}\n") + self.print_status(test_elapsed_time, context_length, depth_percent, score, response) - context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}' + parsed_model_name = self.model_name.replace(".", "_") + context_file_location = self.result_file_format.format(model_name=parsed_model_name, + context_length=context_length, + depth_percent=int(depth_percent)) if self.save_contexts: results['file_name'] = context_file_location # Save the context to file for retesting - if not os.path.exists('contexts'): - os.makedirs('contexts') + if not os.path.exists(self.context_dir): + os.makedirs(self.context_dir) - with open(f'contexts/{context_file_location}_context.txt', 'w') as f: + with open(f'{self.context_dir}/{context_file_location}_context.txt', 'w') as f: f.write(context) if self.save_results: # Save the context to file for retesting - if not os.path.exists('results'): - os.makedirs('results') + if not os.path.exists(self.results_dir): + os.makedirs(self.results_dir) # Save the result to file for retesting - with open(f'results/{context_file_location}_results.json', 'w') as f: + with open(f'{self.results_dir}/{context_file_location}_results.json', 'w') as f: json.dump(results, f) if self.seconds_to_sleep_between_completions: @@ -210,20 +224,21 @@ def result_exists(self, context_length, depth_percent): Checks to see if a result has already been evaluated or not """ - results_dir = 'results/' - if not os.path.exists(results_dir): + if not os.path.exists(self.results_dir): + return False + + filename = self.result_file_format.format(model_name=self.model_name, + context_length=context_length, + depth_percent=depth_percent) + file_path = os.path.join(self.results_dir, f'{filename}.json') + if not os.path.exists(file_path): return False - for filename in os.listdir(results_dir): - if filename.endswith('.json'): - with open(os.path.join(results_dir, filename), 'r') as f: - result = json.load(f) - context_length_met = result['context_length'] == context_length - depth_percent_met = result['depth_percent'] == depth_percent - version_met = result.get('version', 1) == self.results_version - model_met = result['model'] == self.model_name - if context_length_met and depth_percent_met and version_met and model_met: - return True + with open(file_path, 'r') as f: + result = json.load(f) + + if result.get('version', 1) == self.results_version: + return True return False async def generate_context(self, context_length, depth_percent): @@ -265,9 +280,9 @@ def insert_needle(self, context, depth_percent, context_length): period_tokens = self.model_to_test.encode_text_to_tokens('.') # Then we iteration backwards until we find the first period - while tokens_new_context and tokens_new_context[-1] not in period_tokens: + while (insertion_point > 0) and (tokens_new_context[insertion_point-1] != period_tokens): 
insertion_point -= 1 - tokens_new_context = tokens_context[:insertion_point] + tokens_new_context = tokens_context[:insertion_point] # Once we get there, then add in your needle, and stick the rest of your context in on the other end. # Now we have a needle in a haystack @@ -282,12 +297,16 @@ def get_context_length_in_tokens(self, context): def read_context_files(self): context = "" + current_context_length = 0 max_context_length = max(self.context_lengths) - while self.get_context_length_in_tokens(context) < max_context_length: + while current_context_length < max_context_length: for file in glob.glob(f"{self.haystack_dir}/*.txt"): with open(file, 'r') as f: - context += f.read() + file_content = f.read() + + context += file_content + current_context_length += self.get_context_length_in_tokens(file_content) return context def encode_and_trim(self, context, context_length): @@ -308,6 +327,14 @@ def print_start_test_summary(self): print (f"- Needle: {self.needle.strip()}") print ("\n\n") + def print_status(self, elapsed_time, context_length, depth_percent, score, response): + print (f"-- Test Summary -- ") + print (f"Duration: {elapsed_time:.1f} seconds") + print (f"Context: {context_length} tokens") + print (f"Depth: {depth_percent}%") + print (f"Score: {score}") + print (f"Response: {response}\n") + def start_test(self): if self.print_ongoing_status: self.print_start_test_summary() diff --git a/src/providers/anthropic.py b/src/providers/anthropic.py index 5bacde58..74931dfe 100644 --- a/src/providers/anthropic.py +++ b/src/providers/anthropic.py @@ -6,9 +6,16 @@ from typing import Optional class Anthropic(ModelProvider): - def __init__(self, model_name: str = "claude", api_key: str = None): + DEFAULT_MODEL_KWARGS: dict = dict(max_tokens_to_sample = 300, + temperature = 0) + + def __init__(self, + model_name: str = "claude", + model_kwargs: dict = DEFAULT_MODEL_KWARGS, + api_key: str = None): """ :param model_name: The name of the model. Default is 'claude'. + :param model_kwargs: Model configuration. Default is {max_tokens_to_sample: 300, temperature: 0} :param api_key: The API key for Anthropic. Default is None. """ @@ -19,6 +26,7 @@ def __init__(self, model_name: str = "claude", api_key: str = None): raise ValueError("Either api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env.") self.model_name = model_name + self.model_kwargs = model_kwargs self.api_key = api_key or os.getenv('ANTHROPIC_API_KEY') self.model = AsyncAnthropic(api_key=self.api_key) @@ -32,9 +40,8 @@ def __init__(self, model_name: str = "claude", api_key: str = None): async def evaluate_model(self, prompt: str) -> str: response = await self.model.completions.create( model=self.model_name, - max_tokens_to_sample=300, prompt=prompt, - temperature=0) + **self.model_kwargs) return response.completion def generate_prompt(self, context: str, retrieval_question: str) -> str | list[dict[str, str]]: diff --git a/src/providers/openai.py b/src/providers/openai.py index 494dcdbe..9fba2142 100644 --- a/src/providers/openai.py +++ b/src/providers/openai.py @@ -7,9 +7,16 @@ from typing import Optional class OpenAI(ModelProvider): - def __init__(self, model_name: str = "gpt-3.5-turbo-0125", api_key: str = None): + DEFAULT_MODEL_KWARGS: dict = dict(max_tokens = 300, + temperature = 0) + + def __init__(self, + model_name: str = "gpt-3.5-turbo-0125", + model_kwargs: dict = DEFAULT_MODEL_KWARGS, + api_key: str = None): """ :param model_name: The name of the model. Default is 'gpt-3.5-turbo-0125'. 
+ :param model_kwargs: Model configuration. Default is {max_tokens: 300, temperature: 0} :param api_key: The API key for OpenAI. Default is None. """ @@ -17,6 +24,7 @@ def __init__(self, model_name: str = "gpt-3.5-turbo-0125", api_key: str = None): raise ValueError("Either api_key must be supplied with init, or OPENAI_API_KEY must be in env. Used for evaluation model") self.model_name = model_name + self.model_kwargs = model_kwargs self.api_key = api_key or os.getenv('OPENAI_API_KEY') self.model = AsyncOpenAI(api_key=self.api_key) @@ -26,8 +34,7 @@ async def evaluate_model(self, prompt: str) -> str: response = await self.model.chat.completions.create( model=self.model_name, messages=prompt, - max_tokens=300, - temperature=0 + **self.model_kwargs ) return response.choices[0].message.content From d18dde91cf8ee95e434efc5fd2245552f568b4e1 Mon Sep 17 00:00:00 2001 From: Lazaro Hurtado Date: Tue, 5 Mar 2024 00:03:45 -0800 Subject: [PATCH 2/3] small optimizations --- src/llm_needle_haystack_tester.py | 145 ++++++++++++++++++------------ 1 file changed, 86 insertions(+), 59 deletions(-) diff --git a/src/llm_needle_haystack_tester.py b/src/llm_needle_haystack_tester.py index 5f188382..ee7be428 100644 --- a/src/llm_needle_haystack_tester.py +++ b/src/llm_needle_haystack_tester.py @@ -64,6 +64,8 @@ def __init__(self, """ if not model_to_test: raise ValueError("A language model must be provided to test.") + if not evaluator: + raise ValueError("An evaluator must be provided to evaluate the model's response.") if not needle or not haystack_dir or not retrieval_question: raise ValueError("Needle, haystack, and retrieval_question must be provided.") @@ -77,13 +79,20 @@ def __init__(self, self.save_contexts = save_contexts self.seconds_to_sleep_between_completions = seconds_to_sleep_between_completions self.print_ongoing_status = print_ongoing_status + + self.context_dir = 'contexts' + self.results_dir = 'results' + self.result_file_format = '{model_name}_len_{context_length}_depth_{depth_percent}' self.testing_results = [] if context_lengths is None: if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None: raise ValueError("Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied.") else: - self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int) + self.context_lengths = self.get_intervals(context_lengths_min, + context_lengths_max, + context_lengths_num_intervals, + "linear") else: self.context_lengths = context_lengths @@ -93,13 +102,13 @@ def __init__(self, if document_depth_percents is None: if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None: raise ValueError("Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied.") - - if document_depth_percent_interval_type == 'linear': - self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int) - elif document_depth_percent_interval_type == 'sigmoid': - self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)] - else: + if 
document_depth_percent_interval_type not in ['linear', 'sigmoid']: raise ValueError("document_depth_percent_interval_type must be either 'sigmoid' or 'linear' if document_depth_percents is None.") + + self.document_depth_percents = self.get_intervals(document_depth_percent_min, + document_depth_percent_max, + document_depth_percent_intervals, + document_depth_percent_interval_type) else: self.document_depth_percents = document_depth_percents @@ -108,6 +117,17 @@ def __init__(self, self.evaluation_model = evaluator + def get_intervals(self, min_depth, max_depth, num_intervals, interval_type): + linear_spacing = np.linspace(min_depth, max_depth, num=num_intervals, endpoint=True) + + match interval_type: + case 'linear': + return np.round(linear_spacing).astype(int) + case 'sigmoid': + return [self.logistic(x) for x in linear_spacing] + case _: + return [] + def logistic(self, x, L=100, x0=50, k=.1): if x in [0, 100]: return x @@ -122,24 +142,20 @@ async def bound_evaluate_and_log(self, sem, *args): await self.evaluate_and_log(*args) async def run_test(self): - sem = Semaphore(self.num_concurrent_requests) - - # Run through each iteration of context_lengths and depths - tasks = [] - for context_length in self.context_lengths: - for depth_percent in self.document_depth_percents: - task = self.bound_evaluate_and_log(sem, context_length, depth_percent) - tasks.append(task) + async with asyncio.TaskGroup() as tg: + sem = Semaphore(self.num_concurrent_requests) - # Wait for all tasks to complete - await asyncio.gather(*tasks) + # Run through each iteration of context_lengths and depths + for context_length in self.context_lengths: + for depth_percent in self.document_depth_percents: + task = self.bound_evaluate_and_log(sem, context_length, depth_percent) + tg.create_task(task) async def evaluate_and_log(self, context_length, depth_percent): # Checks to see if you've already checked a length/percent/version. # This helps if the program stop running and you want to restart later - if self.save_results: - if self.result_exists(context_length, depth_percent): - return + if self.save_results and self.result_exists(context_length, depth_percent): + return # Go generate the required length context and place your needle statement in context = await self.generate_context(context_length, depth_percent) @@ -159,47 +175,45 @@ async def evaluate_and_log(self, context_length, depth_percent): score = self.evaluation_model.evaluate_response(response) results = { - # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large. - 'model' : self.model_name, - 'context_length' : int(context_length), - 'depth_percent' : float(depth_percent), - 'version' : self.results_version, - 'needle' : self.needle, - 'model_response' : response, - 'score' : score, - 'test_duration_seconds' : test_elapsed_time, - 'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z') + # 'context': context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large. 
+ 'model': self.model_name, + 'context_length': int(context_length), + 'depth_percent': float(depth_percent), + 'version': self.results_version, + 'needle': self.needle, + 'model_response': response, + 'score': score, + 'test_duration_seconds': test_elapsed_time, + 'test_timestamp_utc': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z') } self.testing_results.append(results) if self.print_ongoing_status: - print (f"-- Test Summary -- ") - print (f"Duration: {test_elapsed_time:.1f} seconds") - print (f"Context: {context_length} tokens") - print (f"Depth: {depth_percent}%") - print (f"Score: {score}") - print (f"Response: {response}\n") + self.print_status(test_elapsed_time, context_length, depth_percent, score, response) - context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}' + parsed_model_name = self.model_name.replace(".", "_") + context_file_location = self.result_file_format.format(model_name=parsed_model_name, + context_length=context_length, + depth_percent=int(depth_percent)) if self.save_contexts: results['file_name'] = context_file_location # Save the context to file for retesting - if not os.path.exists('contexts'): - os.makedirs('contexts') + if not os.path.exists(self.context_dir): + os.makedirs(self.context_dir) - with open(f'contexts/{context_file_location}_context.txt', 'w') as f: + with open(f'{self.context_dir}/{context_file_location}_context.txt', 'w') as f: f.write(context) if self.save_results: # Save the context to file for retesting - if not os.path.exists('results'): - os.makedirs('results') + if not os.path.exists(self.results_dir): + os.makedirs(self.results_dir) # Save the result to file for retesting - with open(f'results/{context_file_location}_results.json', 'w') as f: + with open(f'{self.results_dir}/{context_file_location}_results.json', 'w') as f: json.dump(results, f) if self.seconds_to_sleep_between_completions: @@ -210,20 +224,21 @@ def result_exists(self, context_length, depth_percent): Checks to see if a result has already been evaluated or not """ - results_dir = 'results/' - if not os.path.exists(results_dir): + if not os.path.exists(self.results_dir): + return False + + filename = self.result_file_format.format(model_name=self.model_name, + context_length=context_length, + depth_percent=depth_percent) + file_path = os.path.join(self.results_dir, f'{filename}.json') + if not os.path.exists(file_path): return False - for filename in os.listdir(results_dir): - if filename.endswith('.json'): - with open(os.path.join(results_dir, filename), 'r') as f: - result = json.load(f) - context_length_met = result['context_length'] == context_length - depth_percent_met = result['depth_percent'] == depth_percent - version_met = result.get('version', 1) == self.results_version - model_met = result['model'] == self.model_name - if context_length_met and depth_percent_met and version_met and model_met: - return True + with open(file_path, 'r') as f: + result = json.load(f) + + if result.get('version', 1) == self.results_version: + return True return False async def generate_context(self, context_length, depth_percent): @@ -265,9 +280,9 @@ def insert_needle(self, context, depth_percent, context_length): period_tokens = self.model_to_test.encode_text_to_tokens('.') # Then we iteration backwards until we find the first period - while tokens_new_context and tokens_new_context[-1] not in period_tokens: + while (insertion_point > 0) and (tokens_new_context[insertion_point-1] != period_tokens): 
insertion_point -= 1 - tokens_new_context = tokens_context[:insertion_point] + tokens_new_context = tokens_context[:insertion_point] # Once we get there, then add in your needle, and stick the rest of your context in on the other end. # Now we have a needle in a haystack @@ -282,12 +297,16 @@ def get_context_length_in_tokens(self, context): def read_context_files(self): context = "" + current_context_length = 0 max_context_length = max(self.context_lengths) - while self.get_context_length_in_tokens(context) < max_context_length: + while current_context_length < max_context_length: for file in glob.glob(f"{self.haystack_dir}/*.txt"): with open(file, 'r') as f: - context += f.read() + file_content = f.read() + + context += file_content + current_context_length += self.get_context_length_in_tokens(file_content) return context def encode_and_trim(self, context, context_length): @@ -308,6 +327,14 @@ def print_start_test_summary(self): print (f"- Needle: {self.needle.strip()}") print ("\n\n") + def print_status(self, elapsed_time, context_length, depth_percent, score, response): + print (f"-- Test Summary -- ") + print (f"Duration: {elapsed_time:.1f} seconds") + print (f"Context: {context_length} tokens") + print (f"Depth: {depth_percent}%") + print (f"Score: {score}") + print (f"Response: {response}\n") + def start_test(self): if self.print_ongoing_status: self.print_start_test_summary() From 93653e4b8b25460135a092e63ffbd4275d89a031 Mon Sep 17 00:00:00 2001 From: Lazaro Hurtado Date: Wed, 13 Mar 2024 07:21:15 -0700 Subject: [PATCH 3/3] update --- needlehaystack/llm_needle_haystack_tester.py | 28 +++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/needlehaystack/llm_needle_haystack_tester.py b/needlehaystack/llm_needle_haystack_tester.py index b3099176..17297e1a 100644 --- a/needlehaystack/llm_needle_haystack_tester.py +++ b/needlehaystack/llm_needle_haystack_tester.py @@ -16,6 +16,11 @@ class LLMNeedleHaystackTester: """ This class is used to test the LLM Needle Haystack. 
""" + + CONTEXT_DIR = 'contexts' + RESULTS_DIR = 'results' + RESULT_FILE_FORMAT = '{model_name}_len_{context_length}_depth_{depth_percent}' + def __init__(self, model_to_test: ModelProvider = None, evaluator: Evaluator = None, @@ -81,9 +86,6 @@ def __init__(self, self.seconds_to_sleep_between_completions = seconds_to_sleep_between_completions self.print_ongoing_status = print_ongoing_status - self.context_dir = 'contexts' - self.results_dir = 'results' - self.result_file_format = '{model_name}_len_{context_length}_depth_{depth_percent}' self.testing_results = [] if context_lengths is None: @@ -194,7 +196,7 @@ async def evaluate_and_log(self, context_length, depth_percent): self.print_status(test_elapsed_time, context_length, depth_percent, score, response) parsed_model_name = self.model_name.replace(".", "_") - context_file_location = self.result_file_format.format(model_name=parsed_model_name, + context_file_location = self.RESULT_FILE_FORMAT.format(model_name=parsed_model_name, context_length=context_length, depth_percent=int(depth_percent)) @@ -202,19 +204,19 @@ async def evaluate_and_log(self, context_length, depth_percent): results['file_name'] = context_file_location # Save the context to file for retesting - if not os.path.exists(self.context_dir): - os.makedirs(self.context_dir) + if not os.path.exists(self.CONTEXT_DIR): + os.makedirs(self.CONTEXT_DIR) - with open(f'{self.context_dir}/{context_file_location}_context.txt', 'w') as f: + with open(f'{self.CONTEXT_DIR}/{context_file_location}_context.txt', 'w') as f: f.write(context) if self.save_results: # Save the context to file for retesting - if not os.path.exists(self.results_dir): - os.makedirs(self.results_dir) + if not os.path.exists(self.RESULTS_DIR): + os.makedirs(self.RESULTS_DIR) # Save the result to file for retesting - with open(f'{self.results_dir}/{context_file_location}_results.json', 'w') as f: + with open(f'{self.RESULTS_DIR}/{context_file_location}_results.json', 'w') as f: json.dump(results, f) if self.seconds_to_sleep_between_completions: @@ -225,13 +227,13 @@ def result_exists(self, context_length, depth_percent): Checks to see if a result has already been evaluated or not """ - if not os.path.exists(self.results_dir): + if not os.path.exists(self.RESULTS_DIR): return False - filename = self.result_file_format.format(model_name=self.model_name, + filename = self.RESULT_FILE_FORMAT.format(model_name=self.model_name, context_length=context_length, depth_percent=depth_percent) - file_path = os.path.join(self.results_dir, f'{filename}.json') + file_path = os.path.join(self.RESULTS_DIR, f'{filename}.json') if not os.path.exists(file_path): return False