diff --git a/.env.template b/.env.template
index dd690308dd0e..d4d99baa2cfe 100644
--- a/.env.template
+++ b/.env.template
@@ -71,6 +71,7 @@ OPENAI_API_KEY=your-openai-api-key
 # TEMPERATURE=0
 # USE_AZURE=False
+# OPENAI_ORGANIZATION=your-openai-organization-key-if-applicable
 
 ### AZURE
 # moved to `azure.yaml.template`
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 368930a15f3f..5219b9826bd6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,6 +5,7 @@ on:
     branches: [ master, ci-test*]
     paths-ignore:
       - 'tests/Auto-GPT-test-cassettes'
+      - 'tests/integration/challenges/current_score.json'
   pull_request_target:
     branches: [ master, stable , ci-test*]
@@ -119,6 +120,7 @@ jobs:
       - name: Run pytest tests with coverage
         run: |
           pytest -n auto --cov=autogpt --cov-report term-missing --cov-branch --cov-report xml --cov-report term
+          python tests/integration/challenges/utils/build_current_score.py
         env:
           CI: true
           PROXY: ${{ secrets.PROXY }}
@@ -131,11 +133,20 @@ jobs:
       - name: Update cassette submodule to push target if push event
         if: ${{ github.event_name == 'push' }}
         run: |
-          cd tests/Auto-GPT-test-cassettes
           current_branch=$(echo ${{ github.ref }} | sed -e "s/refs\/heads\///g")
-          git fetch origin $current_branch
           git config --global user.name "Auto-GPT-Bot"
           git config --global user.email "github-bot@agpt.co"
+          git add tests/integration/challenges/current_score.json
+
+          if ! git diff-index --quiet HEAD; then
+            git commit -m "Update current score"
+            git push origin HEAD:refs/heads/$current_branch
+          else
+            echo "The current score didn't change."
+          fi
+
+          cd tests/Auto-GPT-test-cassettes
+          git fetch origin $current_branch
           git add .
 
           # Check if there are any changes
@@ -150,7 +161,7 @@ jobs:
             git commit -m "Update submodule reference"
             git push origin HEAD:refs/heads/$current_branch
           else
-            echo "No changes to commit"
+            echo "No cassette changes to commit"
             exit 0
           fi
@@ -182,7 +193,7 @@ jobs:
             echo "DIFF_EXISTS=false" >> $GITHUB_ENV
           fi
 
-      - name: Apply or remove prompt change label and comment
+      - name: Apply or remove behaviour change label and comment
         if: ${{ github.event_name == 'pull_request_target' }}
         run: |
           PR_NUMBER=${{ github.event.pull_request.number }}
@@ -195,14 +206,14 @@ jobs:
             -H "Authorization: Bearer $TOKEN" \
             -H "Accept: application/vnd.github.v3+json" \
             https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels \
-            -d '{"labels":["prompt change"]}'
+            -d '{"labels":["behaviour change"]}'
 
           echo $TOKEN | gh auth login --with-token
-          gh api repos/$REPO/issues/$PR_NUMBER/comments -X POST -F body="You changed AutoGPT's prompt. The cassettes have been updated and will be merged to the submodule when this Pull Request gets merged."
+          gh api repos/$REPO/issues/$PR_NUMBER/comments -X POST -F body="You changed AutoGPT's behaviour. The cassettes have been updated and will be merged to the submodule when this Pull Request gets merged."
         else
           echo "Removing label..."
          curl -X DELETE \
            -H "Authorization: Bearer $TOKEN" \
            -H "Accept: application/vnd.github.v3+json" \
-            https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels/prompt%20change
+            https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels/behaviour%20change
          fi
diff --git a/.github/workflows/docker-ci.yml b/.github/workflows/docker-ci.yml
index 2cc3296b0499..a61b707d8e7c 100644
--- a/.github/workflows/docker-ci.yml
+++ b/.github/workflows/docker-ci.yml
@@ -3,6 +3,9 @@ name: Docker CI
 on:
   push:
     branches: [ master ]
+    paths-ignore:
+      - 'tests/Auto-GPT-test-cassettes'
+      - 'tests/integration/challenges/current_score.json'
   pull_request:
     branches: [ master, stable ]
diff --git a/.github/workflows/pr-label.yml b/.github/workflows/pr-label.yml
index ff4174ad81a0..0bab56385b3d 100644
--- a/.github/workflows/pr-label.yml
+++ b/.github/workflows/pr-label.yml
@@ -4,6 +4,9 @@ on:
   # So that PRs touching the same files as the push are updated
   push:
     branches: [ master ]
+    paths-ignore:
+      - 'tests/Auto-GPT-test-cassettes'
+      - 'tests/integration/challenges/current_score.json'
   # So that the `dirtyLabel` is removed if conflicts are resolve
   # We recommend `pull_request_target` so that github secrets are available.
   # In `pull_request` we wouldn't be able to change labels of fork PRs
diff --git a/BULLETIN.md b/BULLETIN.md
index 17c38b8c983c..70be3c3e3709 100644
--- a/BULLETIN.md
+++ b/BULLETIN.md
@@ -51,3 +51,9 @@ memory store was also temporarily removed but we aim to merge a new implementati
 before the next release.
 Whether built-in support for the others will be added back in the future is subject to
 discussion, feel free to pitch in: https://github.com/Significant-Gravitas/Auto-GPT/discussions/4280
+
+# Challenge Workflow 🏆
+If you have been working on challenges... Thank You!
+To run the debugger challenge or other challenges that use cassettes and VCR in Docker, you will now need to `pip uninstall vcrpy` and `pip install -r requirements.txt` again.
+This will install a version of vcrpy that is compatible with running VCR in Docker.
+This workflow will be fixed as soon as the VCRpy maintainer merges our changes.
diff --git a/autogpt/agent/agent.py b/autogpt/agent/agent.py
index a1673ad9b2d1..3dc4d390092b 100644
--- a/autogpt/agent/agent.py
+++ b/autogpt/agent/agent.py
@@ -128,11 +128,13 @@ def signal_handler(signum, frame):
             # Send message to AI, get response
             with Spinner("Thinking... ", plain_output=cfg.plain_output):
                 assistant_reply = chat_with_ai(
+                    cfg,
                     self,
                     self.system_prompt,
                     self.triggering_prompt,
                     cfg.fast_token_limit,
-                )  # TODO: This hardcodes the model to use GPT3.5. Make this an argument
+                    cfg.fast_llm_model,
+                )
 
             assistant_reply_json = fix_json_using_multiple_techniques(assistant_reply)
             for plugin in cfg.plugins:
diff --git a/autogpt/config/config.py b/autogpt/config/config.py
index 1e61b808561c..5f76bb745506 100644
--- a/autogpt/config/config.py
+++ b/autogpt/config/config.py
@@ -64,6 +64,7 @@ def __init__(self) -> None:
         )
 
         self.openai_api_key = os.getenv("OPENAI_API_KEY")
+        self.openai_organization = os.getenv("OPENAI_ORGANIZATION")
         self.temperature = float(os.getenv("TEMPERATURE", "0"))
         self.use_azure = os.getenv("USE_AZURE") == "True"
         self.execute_local_commands = (
@@ -79,6 +80,9 @@ def __init__(self) -> None:
             openai.api_base = self.openai_api_base
             openai.api_version = self.openai_api_version
 
+        if self.openai_organization is not None:
+            openai.organization = self.openai_organization
+
         self.elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
         self.elevenlabs_voice_1_id = os.getenv("ELEVENLABS_VOICE_1_ID")
         self.elevenlabs_voice_2_id = os.getenv("ELEVENLABS_VOICE_2_ID")
diff --git a/autogpt/configurator.py b/autogpt/configurator.py
index 6b855fe3243a..324f30843ea1 100644
--- a/autogpt/configurator.py
+++ b/autogpt/configurator.py
@@ -14,6 +14,9 @@
 if TYPE_CHECKING:
     from autogpt.config import Config
 
+GPT_4_MODEL = "gpt-4"
+GPT_3_MODEL = "gpt-3.5-turbo"
+
 
 def create_config(
     config: Config,
@@ -51,8 +54,6 @@ def create_config(
     config.set_debug_mode(False)
     config.set_continuous_mode(False)
     config.set_speak_mode(False)
-    config.set_fast_llm_model(check_model(config.fast_llm_model, "fast_llm_model"))
-    config.set_smart_llm_model(check_model(config.smart_llm_model, "smart_llm_model"))
 
     if debug:
         logger.typewriter_log("Debug Mode: ", Fore.GREEN, "ENABLED")
@@ -83,13 +84,26 @@ def create_config(
         logger.typewriter_log("Speak Mode: ", Fore.GREEN, "ENABLED")
         config.set_speak_mode(True)
 
+    # Set the default LLM models
     if gpt3only:
         logger.typewriter_log("GPT3.5 Only Mode: ", Fore.GREEN, "ENABLED")
-        config.set_smart_llm_model(config.fast_llm_model)
-
-    if gpt4only:
+        # --gpt3only should always use gpt-3.5-turbo, despite user's FAST_LLM_MODEL config
+        config.set_fast_llm_model(GPT_3_MODEL)
+        config.set_smart_llm_model(GPT_3_MODEL)
+
+    elif (
+        gpt4only
+        and check_model(GPT_4_MODEL, model_type="smart_llm_model") == GPT_4_MODEL
+    ):
         logger.typewriter_log("GPT4 Only Mode: ", Fore.GREEN, "ENABLED")
-        config.set_fast_llm_model(config.smart_llm_model)
+        # --gpt4only should always use gpt-4, despite user's SMART_LLM_MODEL config
+        config.set_fast_llm_model(GPT_4_MODEL)
+        config.set_smart_llm_model(GPT_4_MODEL)
+    else:
+        config.set_fast_llm_model(check_model(config.fast_llm_model, "fast_llm_model"))
+        config.set_smart_llm_model(
+            check_model(config.smart_llm_model, "smart_llm_model")
+        )
 
     if memory_type:
         supported_memory = get_supported_memory_backends()
diff --git a/autogpt/llm/chat.py b/autogpt/llm/chat.py
index 292990f5f044..7cb598256b72 100644
--- a/autogpt/llm/chat.py
+++ b/autogpt/llm/chat.py
@@ -13,29 +13,34 @@
 from autogpt.log_cycle.log_cycle import CURRENT_CONTEXT_FILE_NAME
 from autogpt.logs import logger
 
-cfg = Config()
-
 
 # TODO: Change debug from hardcode to argument
 def chat_with_ai(
+    config: Config,
     agent: Agent,
     system_prompt: str,
     user_input: str,
     token_limit: int,
+    model: str | None = None,
 ):
     """
     Interact with the OpenAI API, sending the prompt, user input,
         message history, and permanent memory.
 
     Args:
+        config (Config): The config to use.
+        agent (Agent): The agent to use.
         system_prompt (str): The prompt explaining the rules to the AI.
         user_input (str): The input from the user.
         token_limit (int): The maximum number of tokens allowed in the API call.
+        model (str, optional): The model to use. If None, the config.fast_llm_model will be used. Defaults to None.
 
     Returns:
         str: The AI's response.
     """
-    model = cfg.fast_llm_model  # TODO: Change model from hardcode to argument
+    if model is None:
+        model = config.fast_llm_model
+
     # Reserve 1000 tokens for the response
     logger.debug(f"Token limit: {token_limit}")
     send_token_limit = token_limit - 1000
@@ -140,8 +145,8 @@ def chat_with_ai(
     # Append user input, the length of this is accounted for above
     message_sequence.append(user_input_msg)
 
-    plugin_count = len(cfg.plugins)
-    for i, plugin in enumerate(cfg.plugins):
+    plugin_count = len(config.plugins)
+    for i, plugin in enumerate(config.plugins):
         if not plugin.can_handle_on_planning():
             continue
         plugin_response = plugin.on_planning(
@@ -157,7 +162,6 @@ def chat_with_ai(
             logger.debug(f"Plugins remaining at stop: {plugin_count - i}")
             break
         message_sequence.add("system", plugin_response)
-
     # Calculate remaining tokens
     tokens_remaining = token_limit - current_tokens_used
     # assert tokens_remaining >= 0, "Tokens remaining is negative.
diff --git a/autogpt/main.py b/autogpt/main.py
index 39bbf8b5ad9c..efc70aae27ff 100644
--- a/autogpt/main.py
+++ b/autogpt/main.py
@@ -22,6 +22,21 @@
 from autogpt.workspace import Workspace
 from scripts.install_plugin_deps import install_plugin_dependencies
 
+COMMAND_CATEGORIES = [
+    "autogpt.commands.analyze_code",
+    "autogpt.commands.audio_text",
+    "autogpt.commands.execute_code",
+    "autogpt.commands.file_operations",
+    "autogpt.commands.git_operations",
+    "autogpt.commands.google_search",
+    "autogpt.commands.image_gen",
+    "autogpt.commands.improve_code",
+    "autogpt.commands.web_selenium",
+    "autogpt.commands.write_tests",
+    "autogpt.app",
+    "autogpt.commands.task_statuses",
+]
+
 
 def run_auto_gpt(
     continuous: bool,
@@ -128,30 +143,18 @@ def run_auto_gpt(
 
     # Create a CommandRegistry instance and scan default folder
     command_registry = CommandRegistry()
-    command_categories = [
-        "autogpt.commands.analyze_code",
-        "autogpt.commands.audio_text",
-        "autogpt.commands.execute_code",
-        "autogpt.commands.file_operations",
-        "autogpt.commands.git_operations",
-        "autogpt.commands.google_search",
-        "autogpt.commands.image_gen",
-        "autogpt.commands.improve_code",
-        "autogpt.commands.web_selenium",
-        "autogpt.commands.write_tests",
-        "autogpt.app",
-        "autogpt.commands.task_statuses",
-    ]
     logger.debug(
         f"The following command categories are disabled: {cfg.disabled_command_categories}"
     )
-    command_categories = [
-        x for x in command_categories if x not in cfg.disabled_command_categories
+    enabled_command_categories = [
+        x for x in COMMAND_CATEGORIES if x not in cfg.disabled_command_categories
     ]
-    logger.debug(f"The following command categories are enabled: {command_categories}")
+    logger.debug(
+        f"The following command categories are enabled: {enabled_command_categories}"
+    )
 
-    for command_category in command_categories:
+    for command_category in enabled_command_categories:
         command_registry.import_commands(command_category)
 
     ai_name = ""
diff --git a/docs/challenges/information_retrieval/challenge_a.md b/docs/challenges/information_retrieval/challenge_a.md
index 51762fc421cd..de21066ea550 100644
--- a/docs/challenges/information_retrieval/challenge_a.md
+++ b/docs/challenges/information_retrieval/challenge_a.md
@@ -1,16 +1,19 @@
 # Information Retrieval Challenge A
 
-**Status**: Current level to beat: level 1
+**Status**: Current level to beat: level 2
 
 **Command to try**:
 
 ```
-pytest -s tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py
+pytest -s tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py --level=2
 ```
 
 ## Description
 
-The agent's goal is to find the revenue of Tesla in 2022.
+The agent's goal is to find the revenue of Tesla:
+- level 1 asks for the revenue of Tesla in 2022 and explicitly asks the agent to search for 'tesla revenue 2022'
+- level 2 is identical but doesn't ask to search for 'tesla revenue 2022'
+- level 3 asks for Tesla's revenue by year since its creation. It should write the result in a file called output.txt.
diff --git a/docs/challenges/information_retrieval/challenge_b.md b/docs/challenges/information_retrieval/challenge_b.md
new file mode 100644
index 000000000000..bf77a984f646
--- /dev/null
+++ b/docs/challenges/information_retrieval/challenge_b.md
@@ -0,0 +1,22 @@
+# Information Retrieval Challenge B
+
+**Status**: Beaten
+
+**Command to try**:
+
+```
+pytest -s tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_b.py
+```
+
+## Description
+
+The agent's goal is to find the names, affiliated university, and discovery of the individuals who won the Nobel Prize in Physics in 2010.
+
+It should write the result in a file called 2010_nobel_prize_winners.txt.
+
+The agent should be able to beat this test consistently (this is the hardest part).
+
+## Objective
+
+The objective of this challenge is to test the agent's ability to retrieve multiple pieces of related information in a consistent way.
+The agent should not use Google to perform the task, because it should already know the answer. This is why the task fails after 2 cycles (1 cycle to retrieve the information, 1 cycle to write the file).
diff --git a/docs/setup.md b/docs/setup.md
index 4bdf6a16b2e6..c4755a8d68db 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -34,13 +34,13 @@ Get your OpenAI API key from: [https://platform.openai.com/account/api-keys](htt
 ### Set up with Docker
 
 1. Make sure you have Docker installed, see [requirements](#requirements)
-2. Pull the latest image from [Docker Hub]
+2. Create a project directory for Auto-GPT
 
     :::shell
-        docker pull significantgravitas/auto-gpt
+        mkdir Auto-GPT
+        cd Auto-GPT
 
-3. Create a folder for Auto-GPT
-4. In the folder, create a file called `docker-compose.yml` with the following contents:
+3. In the project directory, create a file called `docker-compose.yml` with the following contents:
 
     :::yaml
     version: "3.9"
@@ -71,8 +71,13 @@ Get your OpenAI API key from: [https://platform.openai.com/account/api-keys](htt
     redis:
       image: "redis/redis-stack-server:latest"
 
-5. Create the necessary [configuration](#configuration) files. If needed, you can find
+4. Create the necessary [configuration](#configuration) files. If needed, you can find
     templates in the [repository].
+5. Pull the latest image from [Docker Hub]
+
+    :::shell
+        docker pull significantgravitas/auto-gpt
+
 6. Continue to [Run with Docker](#run-with-docker)
 
 !!! note "Docker only supports headless browsing"
 
     ai_config = AIConfig(
-        ai_name="Information Retrieval Agent",
-        ai_role="an autonomous agent that specializes in retrieving information.",
+        ai_name="Kubernetes",
+        ai_role="an autonomous agent that specializes in creating Kubernetes deployment templates.",
         ai_goals=[
-            "Search for 'tesla revenue 2022' and write the revenue of Tesla in 2022 to a file called output.txt. You should write the number without commas and you should not use signs like B for billion and M for million.",
+            "Write a simple kubernetes deployment file and save it as a kube.yaml.",
+            # You should make a simple nginx web server that uses docker and exposes the port 80.
         ],
     )
     ai_config.command_registry = command_registry
+    system_prompt = ai_config.construct_full_prompt()
     Config().set_continuous_mode(False)
     agent = Agent(
-        ai_name="Get-CompanyRevenue",
+        ai_name="Kubernetes-Demo",
         memory=memory_json_file,
         command_registry=command_registry,
         config=ai_config,
@@ -169,29 +211,69 @@ def get_company_revenue_agent(
         next_action_count=0,
         system_prompt=system_prompt,
         triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
         workspace_directory=workspace.root,
     )
+    return agent
 
 
 @pytest.fixture
-def kubernetes_agent(memory_json_file, workspace: Workspace):
+def get_nobel_prize_agent(agent_test_config, memory_json_file, workspace: Workspace):
     command_registry = CommandRegistry()
     command_registry.import_commands("autogpt.commands.file_operations")
     command_registry.import_commands("autogpt.app")
+    command_registry.import_commands("autogpt.commands.web_selenium")
 
     ai_config = AIConfig(
-        ai_name="Kubernetes",
-        ai_role="an autonomous agent that specializes in creating Kubernetes deployment templates.",
+        ai_name="Get-PhysicsNobelPrize",
+        ai_role="An autonomous agent that specializes in physics history.",
         ai_goals=[
-            "Write a simple kubernetes deployment file and save it as a kube.yaml.",
-            # You should make a simple nginx web server that uses docker and exposes the port 80.
+            "Write to file the winner's name(s), affiliated university, and discovery of the 2010 nobel prize in physics. Write your final answer to 2010_nobel_prize_winners.txt.",
         ],
     )
     ai_config.command_registry = command_registry
     system_prompt = ai_config.construct_full_prompt()
     Config().set_continuous_mode(False)
+
     agent = Agent(
-        ai_name="Kubernetes-Demo",
+        ai_name="Get-PhysicsNobelPrize",
+        memory=memory_json_file,
+        command_registry=command_registry,
+        config=ai_config,
+        next_action_count=0,
+        system_prompt=system_prompt,
+        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
+        workspace_directory=workspace.root,
+    )
+
+    return agent
+
+
+@pytest.fixture
+def debug_code_agent(agent_test_config, memory_json_file, workspace: Workspace):
+    command_registry = CommandRegistry()
+    command_registry.import_commands("autogpt.commands.file_operations")
+    command_registry.import_commands("autogpt.commands.execute_code")
+    command_registry.import_commands("autogpt.commands.improve_code")
+    command_registry.import_commands("autogpt.app")
+    command_registry.import_commands("autogpt.commands.task_statuses")
+
+    ai_config = AIConfig(
+        ai_name="Debug Code Agent",
+        ai_role="an autonomous agent that specializes in debugging python code",
+        ai_goals=[
+            "1-Run the code in the file named 'code.py' using the execute_code command.",
+            "2-Read code.py to understand why the code is not working as expected.",
+            "3-Modify code.py to fix the error.",
+            "Repeat step 1, 2 and 3 until the code is working as expected. When you're done use the task_complete command.",
+            "Do not use any other commands than execute_python_file and write_file",
+        ],
+    )
+    ai_config.command_registry = command_registry
+
+    system_prompt = ai_config.construct_full_prompt()
+    Config().set_continuous_mode(False)
+    agent = Agent(
+        ai_name="Debug Code Agent",
         memory=memory_json_file,
         command_registry=command_registry,
         config=ai_config,
diff --git a/tests/integration/challenges/challenge_decorator/challenge.py b/tests/integration/challenges/challenge_decorator/challenge.py
index baf821a1dd37..fd3b60cb6cb1 100644
--- a/tests/integration/challenges/challenge_decorator/challenge.py
+++ b/tests/integration/challenges/challenge_decorator/challenge.py
@@ -9,6 +9,7 @@ def __init__(
         name: str,
         category: str,
         max_level: int,
+        is_new_challenge: bool,
         max_level_beaten: Optional[int],
         level_to_run: Optional[int] = None,
     ) -> None:
@@ -19,3 +20,4 @@ def __init__(
         self.succeeded = False
        self.skipped = False
         self.level_to_run = level_to_run
+        self.is_new_challenge = is_new_challenge
diff --git a/tests/integration/challenges/challenge_decorator/challenge_decorator.py b/tests/integration/challenges/challenge_decorator/challenge_decorator.py
index 580dc0890697..fe12317eed8b 100644
--- a/tests/integration/challenges/challenge_decorator/challenge_decorator.py
+++ b/tests/integration/challenges/challenge_decorator/challenge_decorator.py
@@ -1,4 +1,3 @@
-import contextlib
 import os
 from functools import wraps
 from typing import Any, Callable, Optional
@@ -23,6 +22,7 @@ def challenge(func: Callable[..., Any]) -> Callable[..., None]:
     @wraps(func)
     def wrapper(*args: Any, **kwargs: Any) -> None:
         run_remaining = MAX_LEVEL_TO_IMPROVE_ON if Challenge.BEAT_CHALLENGES else 1
+        original_error = None
 
         while run_remaining > 0:
             current_score, new_score, new_score_location = get_scores()
@@ -32,9 +32,12 @@ def wrapper(*args: Any, **kwargs: Any) -> None:
                 )
                 if challenge.level_to_run is not None:
                     kwargs["level_to_run"] = challenge.level_to_run
-                    with contextlib.suppress(AssertionError):
+                    try:
                         func(*args, **kwargs)
                         challenge.succeeded = True
+                    except AssertionError as err:
+                        original_error = err
+                        challenge.succeeded = False
                 else:
                     challenge.skipped = True
                 if os.environ.get("CI") == "true":
@@ -48,9 +51,11 @@ def wrapper(*args: Any, **kwargs: Any) -> None:
                     pytest.skip("This test has not been unlocked yet.")
 
                 if not challenge.succeeded:
-                    if Challenge.BEAT_CHALLENGES:
+                    if Challenge.BEAT_CHALLENGES or challenge.is_new_challenge:
                         # xfail
                         pytest.xfail("Challenge failed")
+                    if original_error:
+                        raise original_error
                     raise AssertionError("Challenge failed")
 
                 run_remaining -= 1
diff --git a/tests/integration/challenges/challenge_decorator/challenge_utils.py b/tests/integration/challenges/challenge_decorator/challenge_utils.py
index b94f71649038..7db7648fa4bc 100644
--- a/tests/integration/challenges/challenge_decorator/challenge_utils.py
+++ b/tests/integration/challenges/challenge_decorator/challenge_utils.py
@@ -13,13 +13,13 @@ def create_challenge(
     level_to_run: Optional[int] = None,
 ) -> Challenge:
     challenge_category, challenge_name = get_challenge_identifiers(func)
-
+    is_new_challenge = challenge_name not in current_score.get(challenge_category, {})
     max_level = get_max_level(current_score, challenge_category, challenge_name)
     max_level_beaten = get_max_level_beaten(
         current_score, challenge_category, challenge_name
     )
     level_to_run = get_level_to_run(
-        is_beat_challenges, level_to_run, max_level, max_level_beaten
+        is_beat_challenges, level_to_run, max_level, max_level_beaten, is_new_challenge
     )
 
     return Challenge(
@@ -28,6 +28,7 @@ def create_challenge(
         max_level=max_level,
         max_level_beaten=max_level_beaten,
         level_to_run=level_to_run,
+        is_new_challenge=is_new_challenge,
     )
 
 
@@ -36,7 +37,10 @@ def get_level_to_run(
     level_to_run: Optional[int],
     max_level: int,
     max_level_beaten: Optional[int],
+    is_new_challenge: bool,
 ) -> Optional[int]:
+    if is_new_challenge:
+        return 1
     if level_to_run is not None:
         if level_to_run > max_level:
             raise ValueError(
diff --git a/tests/integration/challenges/conftest.py b/tests/integration/challenges/conftest.py
index 8a1b5c406bfd..5514a1293fba 100644
--- a/tests/integration/challenges/conftest.py
+++ b/tests/integration/challenges/conftest.py
@@ -1,9 +1,31 @@
+from typing import Any, Dict, Optional
+
 import pytest
 from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
 
 from tests.integration.challenges.challenge_decorator.challenge import Challenge
+from tests.integration.conftest import BASE_VCR_CONFIG
+from tests.vcr.vcr_filter import before_record_response
+
+
+def before_record_response_filter_errors(
+    response: Dict[str, Any]
+) -> Optional[Dict[str, Any]]:
+    """In challenges we don't want to record errors (See issue #4461)"""
+    if response["status"]["code"] >= 400:
+        return None
+
+    return before_record_response(response)
+
+
+@pytest.fixture(scope="module")
+def vcr_config() -> Dict[str, Any]:
+    # this fixture is called by the pytest-recording vcr decorator.
+    return BASE_VCR_CONFIG | {
+        "before_record_response": before_record_response_filter_errors,
+    }
 
 
 def pytest_addoption(parser: Parser) -> None:
diff --git a/tests/integration/challenges/current_score.json b/tests/integration/challenges/current_score.json
index a734ff5d674d..726613991d48 100644
--- a/tests/integration/challenges/current_score.json
+++ b/tests/integration/challenges/current_score.json
@@ -9,8 +9,18 @@
       "max_level_beaten": 1
     }
   },
+  "debug_code": {
+    "debug_code_challenge_a": {
+      "max_level": 1,
+      "max_level_beaten": 1
+    }
+  },
   "information_retrieval": {
     "information_retrieval_challenge_a": {
+      "max_level": 3,
+      "max_level_beaten": 1
+    },
+    "information_retrieval_challenge_b": {
       "max_level": 1,
       "max_level_beaten": 1
     }
@@ -28,7 +38,7 @@
     },
     "memory_challenge_b": {
       "max_level": 5,
-      "max_level_beaten": 1
+      "max_level_beaten": null
     },
     "memory_challenge_c": {
       "max_level": 5,
diff --git a/tests/integration/challenges/debug_code/data/two_sum.py b/tests/integration/challenges/debug_code/data/two_sum.py
new file mode 100644
index 000000000000..305cff4e41d0
--- /dev/null
+++ b/tests/integration/challenges/debug_code/data/two_sum.py
@@ -0,0 +1,19 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[int]:
+    seen = {}
+    for i, num in enumerate(nums):
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
+
+
+# Example usage:
+nums = [2, 7, 11, 15]
+target = 9
+result = two_sum(nums, target)
+print(result)  # Output: [0, 1]
diff --git a/tests/integration/challenges/debug_code/data/two_sum_tests.py b/tests/integration/challenges/debug_code/data/two_sum_tests.py
new file mode 100644
index 000000000000..0eb89bcbfc95
--- /dev/null
+++ b/tests/integration/challenges/debug_code/data/two_sum_tests.py
@@ -0,0 +1,30 @@
+# mypy: ignore-errors
+# we need a new line at the top of the file to avoid a syntax error
+
+
+def test_two_sum(nums, target, expected_result):
+    # These tests are appended to the two_sum file so we can ignore this error for now
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+# test the trivial case with the first two numbers
+nums = [2, 7, 11, 15]
+target = 9
+expected_result = [0, 1]
+test_two_sum(nums, target, expected_result)
+
+# test for ability to use zero and the same number twice
+nums = [2, 7, 0, 15, 12, 0]
+target = 0
+expected_result = [2, 5]
+test_two_sum(nums, target, expected_result)
+
+# test for first and last index usage and negative numbers
+nums = [-6, 7, 11, 4]
+target = -2
+expected_result = [0, 3]
+test_two_sum(nums, target, expected_result)
diff --git a/tests/integration/challenges/debug_code/test_debug_code_challenge_a.py b/tests/integration/challenges/debug_code/test_debug_code_challenge_a.py
new file mode 100644
index 000000000000..008e562ce307
--- /dev/null
+++ b/tests/integration/challenges/debug_code/test_debug_code_challenge_a.py
@@ -0,0 +1,51 @@
+from pathlib import Path
+
+import pytest
+from pytest_mock import MockerFixture
+
+from autogpt.agent import Agent
+from autogpt.commands.execute_code import execute_python_file
+from autogpt.commands.file_operations import append_to_file, write_to_file
+from autogpt.config import Config
+from tests.integration.challenges.challenge_decorator.challenge_decorator import (
+    challenge,
+)
+from tests.integration.challenges.utils import run_interaction_loop
+from tests.utils import requires_api_key
+
+CYCLE_COUNT = 5
+
+
+@pytest.mark.vcr
+@requires_api_key("OPENAI_API_KEY")
+@challenge
+def test_debug_code_challenge_a(
+    debug_code_agent: Agent,
+    monkeypatch: pytest.MonkeyPatch,
+    patched_api_requestor: MockerFixture,
+    config: Config,
+    level_to_run: int,
+) -> None:
+    """
+    Test whether the agent can debug a simple code snippet.
+
+    :param debug_code_agent: The agent to test.
+    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
+    :param patched_api_requestor: Sends API requests to our API CI pipeline.
+    :param config: The config object for the agent.
+    :param level_to_run: The level to run.
+    """
+
+    file_path = str(debug_code_agent.workspace.get_path("code.py"))
+
+    code_file_path = Path(__file__).parent / "data" / "two_sum.py"
+    test_file_path = Path(__file__).parent / "data" / "two_sum_tests.py"
+
+    write_to_file(file_path, code_file_path.read_text(), config)
+
+    run_interaction_loop(monkeypatch, debug_code_agent, CYCLE_COUNT)
+
+    append_to_file(file_path, test_file_path.read_text(), config)
+
+    output = execute_python_file(file_path, config)
+    assert "error" not in output.lower(), f"Errors found in output: {output}!"
diff --git a/tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py b/tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py
index 7a9de8ab3dd6..6b970e8b227d 100644
--- a/tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py
+++ b/tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py
@@ -9,6 +9,7 @@
 from tests.utils import requires_api_key
 
 CYCLE_COUNT = 3
+EXPECTED_REVENUES = [["81"], ["81"], ["81", "53", "24", "21", "11", "7", "4", "3", "2"]]
 
 from autogpt.agent import Agent
 
@@ -16,7 +17,7 @@
 @requires_api_key("OPENAI_API_KEY")
 @challenge
 def test_information_retrieval_challenge_a(
-    get_company_revenue_agent: Agent,
+    information_retrieval_agents: Agent,
     monkeypatch: pytest.MonkeyPatch,
     patched_api_requestor: None,
     config: Config,
@@ -28,8 +29,13 @@ def test_information_retrieval_challenge_a(
     :param get_company_revenue_agent: The agent to test.
     :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
     """
-    run_interaction_loop(monkeypatch, get_company_revenue_agent, CYCLE_COUNT)
+    information_retrieval_agent = information_retrieval_agents[level_to_run - 1]
+    run_interaction_loop(monkeypatch, information_retrieval_agent, CYCLE_COUNT)
 
-    file_path = str(get_company_revenue_agent.workspace.get_path("output.txt"))
+    file_path = str(information_retrieval_agent.workspace.get_path("output.txt"))
     content = read_file(file_path, config)
-    assert "81" in content, "Expected the file to contain 81"
+    expected_revenues = EXPECTED_REVENUES[level_to_run - 1]
+    for revenue in expected_revenues:
+        assert (
+            f"{revenue}." in content or f"{revenue}," in content
+        ), f"Expected the file to contain {revenue}"
diff --git a/tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_b.py b/tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_b.py
new file mode 100644
index 000000000000..feac95a0f646
--- /dev/null
+++ b/tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_b.py
@@ -0,0 +1,51 @@
+import contextlib
+
+import pytest
+
+from autogpt.agent import Agent
+from autogpt.commands.file_operations import read_file
+from autogpt.config import Config
+from tests.integration.challenges.challenge_decorator.challenge_decorator import (
+    challenge,
+)
+from tests.integration.challenges.utils import run_interaction_loop
+from tests.utils import requires_api_key
+
+CYCLE_COUNT = 3
+
+
+@pytest.mark.vcr
+@requires_api_key("OPENAI_API_KEY")
+@challenge
+def test_information_retrieval_challenge_b(
+    get_nobel_prize_agent: Agent,
+    monkeypatch: pytest.MonkeyPatch,
+    patched_api_requestor: None,
+    level_to_run: int,
+    config: Config,
+) -> None:
+    """
+    Test the challenge_b function in a given agent by mocking user inputs and checking the output file content.
+
+    :param get_nobel_prize_agent: The agent to test.
+    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
+    :param patched_api_requestor: APIRequestor Patch to override the openai.api_requestor module for testing.
+    :param level_to_run: The level to run.
+    :param config: The config object.
+ """ + + with contextlib.suppress(SystemExit): + run_interaction_loop(monkeypatch, get_nobel_prize_agent, CYCLE_COUNT) + + file_path = str( + get_nobel_prize_agent.workspace.get_path("2010_nobel_prize_winners.txt") + ) + content = read_file(file_path, config) + assert "Andre Geim" in content, "Expected the file to contain Andre Geim" + assert ( + "Konstantin Novoselov" in content + ), "Expected the file to contain Konstantin Novoselov" + assert ( + "University of Manchester" in content + ), "Expected the file to contain University of Manchester" + assert "graphene" in content, "Expected the file to contain graphene" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index cb49bc13a626..686f50be405f 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -7,21 +7,23 @@ from tests.conftest import PROXY from tests.vcr.vcr_filter import before_record_request, before_record_response +BASE_VCR_CONFIG = { + "record_mode": "new_episodes", + "before_record_request": before_record_request, + "before_record_response": before_record_response, + "filter_headers": [ + "Authorization", + "X-OpenAI-Client-User-Agent", + "User-Agent", + ], + "match_on": ["method", "body"], +} + @pytest.fixture(scope="session") def vcr_config(): # this fixture is called by the pytest-recording vcr decorator. - return { - "record_mode": "new_episodes", - "before_record_request": before_record_request, - "before_record_response": before_record_response, - "filter_headers": [ - "Authorization", - "X-OpenAI-Client-User-Agent", - "User-Agent", - ], - "match_on": ["method", "body"], - } + return BASE_VCR_CONFIG def patch_api_base(requestor): diff --git a/tests/test_image_gen.py b/tests/integration/test_image_gen.py similarity index 97% rename from tests/test_image_gen.py rename to tests/integration/test_image_gen.py index 5c04921b5a52..0156c9e5bdc9 100644 --- a/tests/test_image_gen.py +++ b/tests/integration/test_image_gen.py @@ -16,11 +16,9 @@ def image_size(request): return request.param -@pytest.mark.xfail( - reason="The image is too big to be put in a cassette for a CI pipeline. We're looking into a solution." -) @requires_api_key("OPENAI_API_KEY") -def test_dalle(config, workspace, image_size): +@pytest.mark.vcr +def test_dalle(config, workspace, image_size, patched_api_requestor): """Test DALL-E image generation.""" generate_and_validate( config, diff --git a/tests/test_config.py b/tests/test_config.py index 81d151cd242e..eb6946c91d22 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2,14 +2,17 @@ Test cases for the Config class, which handles the configuration settings for the AI and ensures it behaves as a singleton. """ +from unittest import mock from unittest.mock import patch import pytest -from autogpt.configurator import create_config +from autogpt.config.config import Config +from autogpt.configurator import GPT_3_MODEL, GPT_4_MODEL, create_config +from autogpt.workspace.workspace import Workspace -def test_initial_values(config): +def test_initial_values(config: Config): """ Test if the initial values of the Config class attributes are set correctly. """ @@ -22,7 +25,7 @@ def test_initial_values(config): assert config.smart_token_limit == 8000 -def test_set_continuous_mode(config): +def test_set_continuous_mode(config: Config): """ Test if the set_continuous_mode() method updates the continuous_mode attribute. 
""" @@ -36,7 +39,7 @@ def test_set_continuous_mode(config): config.set_continuous_mode(continuous_mode) -def test_set_speak_mode(config): +def test_set_speak_mode(config: Config): """ Test if the set_speak_mode() method updates the speak_mode attribute. """ @@ -50,7 +53,7 @@ def test_set_speak_mode(config): config.set_speak_mode(speak_mode) -def test_set_fast_llm_model(config): +def test_set_fast_llm_model(config: Config): """ Test if the set_fast_llm_model() method updates the fast_llm_model attribute. """ @@ -64,7 +67,7 @@ def test_set_fast_llm_model(config): config.set_fast_llm_model(fast_llm_model) -def test_set_smart_llm_model(config): +def test_set_smart_llm_model(config: Config): """ Test if the set_smart_llm_model() method updates the smart_llm_model attribute. """ @@ -78,7 +81,7 @@ def test_set_smart_llm_model(config): config.set_smart_llm_model(smart_llm_model) -def test_set_fast_token_limit(config): +def test_set_fast_token_limit(config: Config): """ Test if the set_fast_token_limit() method updates the fast_token_limit attribute. """ @@ -92,7 +95,7 @@ def test_set_fast_token_limit(config): config.set_fast_token_limit(fast_token_limit) -def test_set_smart_token_limit(config): +def test_set_smart_token_limit(config: Config): """ Test if the set_smart_token_limit() method updates the smart_token_limit attribute. """ @@ -106,7 +109,7 @@ def test_set_smart_token_limit(config): config.set_smart_token_limit(smart_token_limit) -def test_set_debug_mode(config): +def test_set_debug_mode(config: Config): """ Test if the set_debug_mode() method updates the debug_mode attribute. """ @@ -121,7 +124,7 @@ def test_set_debug_mode(config): @patch("openai.Model.list") -def test_smart_and_fast_llm_models_set_to_gpt4(mock_list_models, config): +def test_smart_and_fast_llm_models_set_to_gpt4(mock_list_models, config: Config): """ Test if models update to gpt-3.5-turbo if both are set to gpt-4. 
""" @@ -158,7 +161,7 @@ def test_smart_and_fast_llm_models_set_to_gpt4(mock_list_models, config): config.set_smart_llm_model(smart_llm_model) -def test_missing_azure_config(config, workspace): +def test_missing_azure_config(config: Config, workspace: Workspace): config_file = workspace.get_path("azure_config.yaml") with pytest.raises(FileNotFoundError): config.load_azure_config(str(config_file)) @@ -170,3 +173,61 @@ def test_missing_azure_config(config, workspace): assert config.openai_api_base == "" assert config.openai_api_version == "2023-03-15-preview" assert config.azure_model_to_deployment_id_map == {} + + +def test_create_config_gpt4only(config: Config) -> None: + fast_llm_model = config.fast_llm_model + smart_llm_model = config.smart_llm_model + with mock.patch("autogpt.llm.api_manager.ApiManager.get_models") as mock_get_models: + mock_get_models.return_value = [{"id": GPT_4_MODEL}] + create_config( + config=config, + continuous=False, + continuous_limit=None, + ai_settings_file=None, + prompt_settings_file=None, + skip_reprompt=False, + speak=False, + debug=False, + gpt3only=False, + gpt4only=True, + memory_type=None, + browser_name=None, + allow_downloads=False, + skip_news=False, + ) + assert config.fast_llm_model == GPT_4_MODEL + assert config.smart_llm_model == GPT_4_MODEL + + # Reset config + config.set_fast_llm_model(fast_llm_model) + config.set_smart_llm_model(smart_llm_model) + + +def test_create_config_gpt3only(config: Config) -> None: + fast_llm_model = config.fast_llm_model + smart_llm_model = config.smart_llm_model + with mock.patch("autogpt.llm.api_manager.ApiManager.get_models") as mock_get_models: + mock_get_models.return_value = [{"id": GPT_3_MODEL}] + create_config( + config=config, + continuous=False, + continuous_limit=None, + ai_settings_file=None, + prompt_settings_file=None, + skip_reprompt=False, + speak=False, + debug=False, + gpt3only=True, + gpt4only=False, + memory_type=None, + browser_name=None, + allow_downloads=False, + skip_news=False, + ) + assert config.fast_llm_model == GPT_3_MODEL + assert config.smart_llm_model == GPT_3_MODEL + + # Reset config + config.set_fast_llm_model(fast_llm_model) + config.set_smart_llm_model(smart_llm_model)