diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml
index 424eccc716..50ee87b2a9 100644
--- a/.github/workflows/linux-cpu-x64-build.yml
+++ b/.github/workflows/linux-cpu-x64-build.yml
@@ -96,7 +96,7 @@ jobs:
           cmake --build --preset linux_gcc_cpu_release
           cmake --build --preset linux_gcc_cpu_release --target PyPackageBuild
 
-      - name: Install the python wheel and test dependencies
+      - name: Install the Python wheel and test dependencies
         run: |
           python3 -m pip install -r test/python/requirements.txt --user
           python3 -m pip install -r test/python/cpu/torch/requirements.txt --user
@@ -110,9 +110,14 @@ jobs:
           ls -l ${{ github.workspace }}/build/cpu
           ls -l ${{ github.workspace }}/build/cpu/wheel
 
+      - name: Build the Java API and Run the Java Tests
+        run: |
+          set -e -x
+          python3 build.py --config=Release --build_dir build/cpu --build_java --parallel --cmake_generator "Ninja"
+
       # This will also download all the test models to the test/test_models directory
       # These models are used by the python tests as well as C#, C++ and others.
-      - name: Run the python tests
+      - name: Run the Python tests
         run: |
           export ORTGENAI_LOG_ORT_LIB=1
           python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models
@@ -123,10 +128,19 @@ jobs:
           cd test/csharp
           dotnet test /p:Configuration=Release /p:NativeBuildOutputDir="../../build/cpu/" /p:OrtLibDir="../../ort/lib/" --verbosity normal
 
-      - name: Build the Java API and Run the Java Tests
+      - name: Build the C# Examples
         run: |
-          set -e -x
-          python3 build.py --config=Release --build_dir build/cpu --build_java --parallel --cmake_generator "Ninja"
+          export ORTGENAI_LOG_ORT_LIB=1
+          cd examples/csharp/ModelChat
+          dotnet build -c Release
+          cd ../ModelMM
+          dotnet build -c Release
+
+      - name: Test the C# LLM Example with Tool Calling
+        run: |
+          export ORTGENAI_LOG_ORT_LIB=1
+          python3 test/python/special_tokens.py -p test/test_models/qwen-2.5-0.5b/int4/cpu/tokenizer.json -s "<tool_call>" -e "</tool_call>"
+          ./examples/csharp/ModelChat/bin/Release/net8.0/ModelChat -m test/test_models/qwen-2.5-0.5b/int4/cpu/ -e cpu --response_format lark_grammar --tools_file test/test_models/tool-definitions/weather.json --tool_call_start "<tool_call>" --tool_call_end "</tool_call>" --user_prompt "What is the weather in Redmond, WA?" --tool_output --non_interactive --verbose
 
       - name: Run tests
         run: |
diff --git a/.github/workflows/linux-cpu-x64-nightly-build.yml b/.github/workflows/linux-cpu-x64-nightly-build.yml
index 5621f7be71..6b4b89c010 100644
--- a/.github/workflows/linux-cpu-x64-nightly-build.yml
+++ b/.github/workflows/linux-cpu-x64-nightly-build.yml
@@ -66,7 +66,7 @@ jobs:
       - name: Run Q&A Example
         run: |
           python3 -m onnxruntime_genai.models.builder -i /data/ortgenai/pytorch/qwen2.5-0.5b-instruct -e cpu -p int4 -o ./example-models/qwen2.5-0.5b-instruct
-          python3 examples/python/model-qa.py -m ./example-models/qwen2.5-0.5b-instruct -e cpu --input_prompt "what is 10+4?" > output.log 2>&1
+          python3 examples/python/model-qa.py -m ./example-models/qwen2.5-0.5b-instruct -e cpu --user_prompt "what is 10+4?" --non_interactive > output.log 2>&1
           if cat output.log | grep -Eq "14|fourteen"; then
             echo "Result seems correct"
           else
diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml
index f689754c4b..e6e5f73a9a 100644
--- a/.github/workflows/linux-gpu-x64-build.yml
+++ b/.github/workflows/linux-gpu-x64-build.yml
@@ -112,7 +112,7 @@ jobs:
             --multiple_repos \
             --repository onnxruntimecudabuildx64
 
-      - name: Config with Cmake in Docker
+      - name: Config with CMake in Docker
         run: |
           set -e -x
           docker run \
@@ -125,7 +125,7 @@ jobs:
                 -DMANYLINUX=ON \
                 -DPYTHON_EXECUTABLE=${{ env.PYTHON_EXECUTABLE }} "
 
-      - name: Build with Cmake in Docker
+      - name: Build with CMake in Docker
         run: |
           set -e -x
           docker run \
@@ -136,7 +136,23 @@ jobs:
             bash -c " \
               /usr/bin/cmake --build --preset linux_gcc_cuda_release && /usr/bin/cmake --build --preset linux_gcc_cuda_release --target PyPackageBuild"
 
-      - name: Install the onnxruntime-genai Python wheel and run python test
+      - name: Build the Java API and Run the Java Tests in Docker
+        run: |
+          set -e -x
+          docker run \
+            --gpus all \
+            --rm \
+            --user 0 \
+            --volume $GITHUB_WORKSPACE:/ort_genai_src \
+            -w /ort_genai_src onnxruntimecudabuildx64 bash -c " \
+              alias python3=${{ env.PYTHON_EXECUTABLE }} && \
+              dnf -y update && dnf install -y python3.11-devel && dnf install -y python3-pip python3-setuptools python3-wheel && \
+              ${{ env.PYTHON_EXECUTABLE }} -m pip install -r test/python/requirements.txt --user && \
+              ${{ env.PYTHON_EXECUTABLE }} -m pip install -r test/python/cuda/torch/requirements.txt --user && \
+              ${{ env.PYTHON_EXECUTABLE }} -m pip install -r test/python/cuda/ort/requirements.txt --user && \
+              ${{ env.PYTHON_EXECUTABLE }} build.py --config=Release --build_dir build/cuda --build_java --parallel --cmake_generator Ninja --cmake_extra_defines PYTHON_EXECUTABLE=${{ env.PYTHON_EXECUTABLE }}"
+
+      - name: Install the onnxruntime-genai Python wheel and run Python tests
         run: |
           echo "Installing the onnxruntime-genai Python wheel and running the Python tests"
           docker run \
diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml
index fc1c068494..127d164654 100644
--- a/.github/workflows/mac-cpu-arm64-build.yml
+++ b/.github/workflows/mac-cpu-arm64-build.yml
@@ -108,7 +108,7 @@ jobs:
           cmake --build --preset macos_arm64_cpu_release --target PyPackageBuild
         continue-on-error: false
 
-      - name: Install the python wheel and test dependencies
+      - name: Install the Python wheel and test dependencies
         run: |
           python3 -m venv genai-macos-venv
           source genai-macos-venv/bin/activate
@@ -117,6 +117,12 @@ jobs:
           python3 -m pip install -r test/python/macos/ort/requirements.txt
           python3 -m pip install build/cpu/osx-arm64/wheel/onnxruntime_genai*.whl --no-deps
 
+      - name: Build the Java API and Run the Java Tests
+        run: |
+          set -e -x
+          source genai-macos-venv/bin/activate
+          python3 build.py --config=Release --build_dir build/cpu/osx-arm64 --build_java --parallel --cmake_generator "Unix Makefiles" --macos MacOSX --osx_arch arm64 --apple_deploy_target 12.0 --apple_sysroot macosx
+
       - name: Remove the ort lib and header files
         run: |
           rm -rf ort
@@ -130,7 +136,7 @@ jobs:
 
       # This will also download all the test models to the test/test_models directory
       # These models are used by the python tests as well as C#, C++ and others.
-      - name: Run the python tests
+      - name: Run the Python tests
         run: |
           source genai-macos-venv/bin/activate
           export HF_TOKEN="12345"
@@ -144,11 +150,21 @@ jobs:
           cd test/csharp
           dotnet test /p:Configuration=Release /p:NativeBuildOutputDir="../../build/cpu/osx-arm64" --verbosity normal
 
-      - name: Build the Java API and Run the Java Tests
+      - name: Build the C# Examples
         run: |
-          set -e -x
-          source genai-macos-venv/bin/activate
-          python3 build.py --config=Release --build_dir build/cpu/osx-arm64 --build_java --parallel --cmake_generator "Unix Makefiles" --macos MacOSX --osx_arch arm64 --apple_deploy_target 12.0 --apple_sysroot macosx
+          export ORTGENAI_LOG_ORT_LIB=1
+          cd examples/csharp/ModelChat
+          dotnet build -c Release
+          cd ../ModelMM
+          dotnet build -c Release
+
+      - name: Test the C# LLM Example with Tool Calling
+        run: |
+          # Run the helper script with the venv interpreter, like the other Python steps in this workflow
+          source genai-macos-venv/bin/activate
+          export ORTGENAI_LOG_ORT_LIB=1
+          python3 test/python/special_tokens.py -p test/test_models/qwen-2.5-0.5b/int4/cpu/tokenizer.json -s "<tool_call>" -e "</tool_call>"
+          ./examples/csharp/ModelChat/bin/Release/net8.0/ModelChat -m test/test_models/qwen-2.5-0.5b/int4/cpu/ -e cpu --response_format lark_grammar --tools_file test/test_models/tool-definitions/weather.json --tool_call_start "<tool_call>" --tool_call_end "</tool_call>" --user_prompt "What is the weather in Redmond, WA?" --tool_output --non_interactive --verbose
 
       - name: Run tests
         run: |
diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml
index 057151daf4..41f05cdcc6 100644
--- a/.github/workflows/win-cpu-arm64-build.yml
+++ b/.github/workflows/win-cpu-arm64-build.yml
@@ -93,10 +93,16 @@ jobs:
         run: |
           # Uninstalling LLVM/Clang as it is no longer required and causes issues with numpy installation
           choco uninstall llvm --yes
-          python -m pip install "numpy<2" coloredlogs flatbuffers packaging protobuf sympy pytest
+          python -m pip install -r test\python\requirements.txt --user
+          python -m pip install -r test\python\cpu\torch\requirements.txt --user
+          python -m pip install -r test\python\cpu\ort\requirements.txt --user
           python -m pip install onnxruntime-qnn
           python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
 
+      - name: Build the Java API and Run the Java Tests
+        run: |
+          python build.py --config=Release --build_dir $env:binaryDir --build_java --parallel
+
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
@@ -106,9 +112,17 @@ jobs:
           cd test\csharp
           dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" /p:OrtLibDir="$env:GITHUB_WORKSPACE\ort\lib"
 
-      - name: Build the Java API and Run the Java Tests
+      - name: Build the C# Examples
         run: |
-          python build.py --config=Release --build_dir $env:binaryDir --build_java --parallel
+          cd examples\csharp\ModelChat
+          dotnet build -c Release
+          cd ..\ModelMM
+          dotnet build -c Release
+
+      - name: Test the C# LLM Example with Tool Calling
+        run: |
+          python test\python\special_tokens.py -p test\test_models\qwen-2.5-0.5b\int4\cpu\tokenizer.json -s "<tool_call>" -e "</tool_call>"
+          .\examples\csharp\ModelChat\bin\Release\net8.0\ModelChat.exe -m test\test_models\qwen-2.5-0.5b\int4\cpu\ -e cpu --response_format lark_grammar --tools_file test\test_models\tool-definitions\weather.json --tool_call_start "<tool_call>" --tool_call_end "</tool_call>" --user_prompt "What is the weather in Redmond, WA?" --tool_output --non_interactive --verbose
 
       - name: Verify Build Artifacts
         if: always()
diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml
index 723f52c624..9f6069e6e8 100644
--- a/.github/workflows/win-cpu-x64-build.yml
+++ b/.github/workflows/win-cpu-x64-build.yml
@@ -102,13 +102,17 @@ jobs:
           cmake --build --preset windows_x64_cpu_release --parallel
           cmake --build --preset windows_x64_cpu_release --target PyPackageBuild
 
-      - name: Install the python wheel and test dependencies
+      - name: Install the Python wheel and test dependencies
         run: |
           python3 -m pip install -r test\python\requirements.txt --user
           python3 -m pip install -r test\python\cpu\torch\requirements.txt --user
           python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
           python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
 
+      - name: Build the Java API and Run the Java Tests
+        run: |
+          python3 build.py --config=Release --build_dir $env:binaryDir --build_java --parallel
+
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
@@ -118,9 +122,17 @@ jobs:
           cd test\csharp
           dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" /p:OrtLibDir="$env:GITHUB_WORKSPACE\ort\lib" --verbosity normal
 
-      - name: Build the Java API and Run the Java Tests
+      - name: Build the C# Examples
         run: |
-          python3 build.py --config=Release --build_dir $env:binaryDir --build_java --parallel
+          cd examples\csharp\ModelChat
+          dotnet build -c Release
+          cd ..\ModelMM
+          dotnet build -c Release
+
+      - name: Test the C# LLM Example with Tool Calling
+        run: |
+          python3 test\python\special_tokens.py -p test\test_models\qwen-2.5-0.5b\int4\cpu\tokenizer.json -s "<tool_call>" -e "</tool_call>"
+          .\examples\csharp\ModelChat\bin\Release\net8.0\ModelChat.exe -m test\test_models\qwen-2.5-0.5b\int4\cpu\ -e cpu --response_format lark_grammar --tools_file test\test_models\tool-definitions\weather.json --tool_call_start "<tool_call>" --tool_call_end "</tool_call>" --user_prompt "What is the weather in Redmond, WA?" --tool_output --non_interactive --verbose
 
       - name: Verify Build Artifacts
         if: always()
diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml
index bd634fef5c..d036850864 100644
--- a/.github/workflows/win-cuda-x64-build.yml
+++ b/.github/workflows/win-cuda-x64-build.yml
@@ -98,6 +98,10 @@ jobs:
           python -m pip install -r test\python\cuda\ort\requirements.txt
           python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
 
+      - name: Build the Java API and Run the Java Tests
+        run: |
+          python build.py --config=Release --build_dir $env:binaryDir --build_java --parallel
+
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
@@ -115,6 +119,18 @@ jobs:
           cd test\csharp
           dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" /p:OrtLibDir="$env:GITHUB_WORKSPACE\ort\lib"
 
+      - name: Build the C# Examples
+        run: |
+          cd examples\csharp\ModelChat
+          dotnet build -c Release
+          cd ..\ModelMM
+          dotnet build -c Release
+
+      - name: Test the C# LLM Example with Tool Calling
+        run: |
+          python test\python\special_tokens.py -p test\test_models\qwen-2.5-0.5b\int4\cpu\tokenizer.json -s "<tool_call>" -e "</tool_call>"
+          .\examples\csharp\ModelChat\bin\Release\net8.0\ModelChat.exe -m test\test_models\qwen-2.5-0.5b\int4\cpu\ -e cpu --response_format lark_grammar --tools_file test\test_models\tool-definitions\weather.json --tool_call_start "<tool_call>" --tool_call_end "</tool_call>" --user_prompt "What is the weather in Redmond, WA?" --tool_output --non_interactive --verbose
+
       - name: Prepend CUDA to PATH and Run tests
         run: |-
           $env:PATH = "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin;" + $env:PATH 
diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml
index 6377573575..b477ae459a 100644
--- a/.github/workflows/win-directml-x64-build.yml
+++ b/.github/workflows/win-directml-x64-build.yml
@@ -114,6 +114,10 @@ jobs:
           python -m pip install -r test\python\directml\ort\requirements.txt
           python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
 
+      - name: Build the Java API and Run the Java Tests
+        run: |
+          python build.py --config=Release --build_dir $env:binaryDir --build_java --parallel
+
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
diff --git a/.gitignore b/.gitignore
index e89b87511e..dcd6a9ebe8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,7 +27,7 @@ models_outputs_cpu
 benchmark/python/output
 examples/python/genai_models
 examples/python/hf_cache
-examples/csharp/HelloPhi/models
+examples/csharp/ModelChat/models
 
 !test/test_models/hf-internal-testing/
 !test/test_models/hf-internal-testing/tiny-random-gpt2*/*.onnx
diff --git a/.pipelines/stages/jobs/custom-nuget-packaging-job.yml b/.pipelines/stages/jobs/custom-nuget-packaging-job.yml
index 3308363a1c..0152a6ef1b 100644
--- a/.pipelines/stages/jobs/custom-nuget-packaging-job.yml
+++ b/.pipelines/stages/jobs/custom-nuget-packaging-job.yml
@@ -167,4 +167,3 @@ jobs:
     inputs:
       targetPath: '$(Build.ArtifactStagingDirectory)\nuget'
       artifactName: $(genai_nuget_package_name)
-      
\ No newline at end of file
diff --git a/.pipelines/stages/jobs/nuget-validation-job.yml b/.pipelines/stages/jobs/nuget-validation-job.yml
index 80df35a438..9d3472421e 100644
--- a/.pipelines/stages/jobs/nuget-validation-job.yml
+++ b/.pipelines/stages/jobs/nuget-validation-job.yml
@@ -98,6 +98,16 @@ jobs:
     ${{ else }}:
       value: 'cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4'
 
+  - name: prebuild_phi4_mm_model_folder
+    ${{ if eq(parameters.ep, 'cpu') }}:
+      value: 'cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4'
+    ${{ elseif eq(parameters.ep, 'cuda') }}:
+      value: 'gpu/gpu-int4-rtn-block-32'
+    ${{ elseif eq(parameters.ep, 'directml')}}:
+      value: 'gpu/gpu-int4-rtn-block-32'
+    ${{ else }}:
+      value: 'cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4'
+
   - name: cuda_docker_image
     ${{ if eq(parameters.cuda_version, '11.8') }}:
       value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250714.2
@@ -143,14 +153,14 @@ jobs:
       HuggingFaceRepo: 'microsoft/Phi-3-mini-4k-instruct-onnx'
       LocalFolder: 'phi3-mini'
       RepoFolder: $(prebuild_phi3_mini_model_folder)
-      WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/HelloPhi'
+      WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/ModelChat'
       HuggingFaceToken: $(HF_TOKEN)
       os: ${{ parameters.os }}
 
   - template: steps/nuget-validation-step.yml
     parameters:
-      CsprojFolder: "examples/csharp/HelloPhi"
-      CsprojName: "HelloPhi"
+      CsprojFolder: "examples/csharp/ModelChat"
+      CsprojName: "ModelChat"
       CsprojConfiguration: $(csproj_configuration)
       LocalFolder: 'phi3-mini'
       ModelFolder: $(prebuild_phi3_mini_model_folder)
@@ -160,14 +170,14 @@ jobs:
       HuggingFaceRepo: 'microsoft/Phi-3.5-vision-instruct-onnx'
       LocalFolder: 'phi3.5-vision'
       RepoFolder: $(prebuild_phi3_5_vision_model_folder)
-      WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/HelloPhi3V'
+      WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/ModelMM'
       HuggingFaceToken: $(HF_TOKEN)
       os: ${{ parameters.os }}
 
   - template: steps/nuget-validation-step.yml
     parameters:
-      CsprojFolder: "examples/csharp/HelloPhi3V"
-      CsprojName: "HelloPhi3V"
+      CsprojFolder: "examples/csharp/ModelMM"
+      CsprojName: "ModelMM"
       CsprojConfiguration: $(csproj_configuration)
       LocalFolder: 'phi3.5-vision'
       ModelFolder: $(prebuild_phi3_5_vision_model_folder)
@@ -177,14 +187,14 @@ jobs:
       HuggingFaceRepo: 'microsoft/Phi-4-multimodal-instruct-onnx'
       LocalFolder: 'phi4-mm'
       RepoFolder: $(prebuild_phi4_mm_model_folder)
-      WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/HelloPhi4MM'
+      WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/ModelMM'
       HuggingFaceToken: $(HF_TOKEN)
       os: ${{ parameters.os }}
 
   - template: steps/nuget-validation-step.yml
     parameters:
-      CsprojFolder: "examples/csharp/HelloPhi4MM"
-      CsprojName: "HelloPhi4MM"
+      CsprojFolder: "examples/csharp/ModelMM"
+      CsprojName: "ModelMM"
       CsprojConfiguration: $(csproj_configuration)
       LocalFolder: 'phi4-mm'
       ModelFolder: $(prebuild_phi4_mm_model_folder)
diff --git a/.pipelines/stages/jobs/py-validation-job.yml b/.pipelines/stages/jobs/py-validation-job.yml
index 426be14ba2..1b825c094b 100644
--- a/.pipelines/stages/jobs/py-validation-job.yml
+++ b/.pipelines/stages/jobs/py-validation-job.yml
@@ -211,7 +211,7 @@ jobs:
   - template: steps/python-validation-step.yml
     parameters:
       PythonScriptFolder: "examples/python"
-      PythonScriptName: "phi4-mm.py"
+      PythonScriptName: "model-mm.py"
       LocalFolder: 'phi4-mm'
       ModelFolder: $(prebuild_phi4_mm_model_folder)
 
diff --git a/.pipelines/stages/jobs/steps/nuget-validation-step.yml b/.pipelines/stages/jobs/steps/nuget-validation-step.yml
index e5004545fd..c4b9a5682d 100644
--- a/.pipelines/stages/jobs/steps/nuget-validation-step.yml
+++ b/.pipelines/stages/jobs/steps/nuget-validation-step.yml
@@ -34,7 +34,7 @@ steps:
       Copy-Item -Force -Recurse -Verbose $(Build.BinariesDirectory)/nuget/* -Destination ${{ parameters.CsprojFolder }}
       cd ${{ parameters.CsprojFolder }}
       dotnet restore -r $(os)-$(arch) /property:Configuration=${{ parameters.CsprojConfiguration }} --source https://api.nuget.org/v3/index.json --source https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json --source $PWD --disable-parallel --verbosity detailed
-      dotnet run -r $(os)-$(arch) --configuration ${{ parameters.CsprojConfiguration }} --no-restore --verbosity normal -- -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non-interactive
+      dotnet run -r $(os)-$(arch) --configuration ${{ parameters.CsprojConfiguration }} --no-restore --verbosity normal -- -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non_interactive
     displayName: 'Run ${{ parameters.CsprojName }} With Artifact on Windows'
     workingDirectory: '$(Build.Repository.LocalPath)'
     condition: eq(variables['os'], 'win')
@@ -49,7 +49,7 @@ steps:
       cd ${{ parameters.CsprojFolder }}
       dotnet restore -r $(os)-$(arch) /property:Configuration=${{ parameters.CsprojConfiguration }} --source https://api.nuget.org/v3/index.json --source https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json --source $PWD --disable-parallel --verbosity detailed
       dotnet build ./${{ parameters.CsprojName }}.csproj -r $(os)-$(arch) /property:Configuration=${{ parameters.CsprojConfiguration }} --no-restore --self-contained --verbosity normal
-      ls -l ./bin/${{ parameters.CsprojConfiguration }}/net6.0/$(os)-$(arch)/
+      ls -l ./bin/${{ parameters.CsprojConfiguration }}/net8.0/$(os)-$(arch)/
     displayName: 'Perform dotnet restore & build'
     workingDirectory: '$(Build.Repository.LocalPath)'
     condition: or(eq(variables['os'], 'linux'), eq(variables['os'], 'osx'))
@@ -70,8 +70,8 @@ steps:
         bash -c " \
             export ORTGENAI_LOG_ORT_LIB=1 && \
             cd /ort_genai_src/${{ parameters.CsprojFolder }} && \
-            chmod +x ./bin/Release_Cuda/net6.0/linux-x64/${{ parameters.CsprojName }} && \
-            ./bin/Release_Cuda/net6.0/linux-x64/${{ parameters.CsprojName }} -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non-interactive"
+            chmod +x ./bin/Release_Cuda/net8.0/linux-x64/${{ parameters.CsprojName }} && \
+            ./bin/Release_Cuda/net8.0/linux-x64/${{ parameters.CsprojName }} -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non_interactive"
 
     displayName: 'Run ${{ parameters.CsprojName }} With Artifact on Linux CUDA'
     workingDirectory: '$(Build.Repository.LocalPath)'
@@ -80,7 +80,7 @@ steps:
   - bash: |
       export ORTGENAI_LOG_ORT_LIB=1
       cd ${{ parameters.CsprojFolder }}
-      dotnet run -r $(os)-$(arch) --configuration ${{ parameters.CsprojConfiguration }} --no-build --verbosity normal -- -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non-interactive
+      dotnet run -r $(os)-$(arch) --configuration ${{ parameters.CsprojConfiguration }} --no-build --verbosity normal -- -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non_interactive
     displayName: 'Run ${{ parameters.CsprojName }} With Artifact on Linux/macOS CPU'
     workingDirectory: '$(Build.Repository.LocalPath)'
     condition: and(or(eq(variables['os'], 'linux'), eq(variables['os'], 'osx')), eq(variables['ep'], 'cpu'))
diff --git a/.pipelines/stages/jobs/steps/python-validation-step.yml b/.pipelines/stages/jobs/steps/python-validation-step.yml
index 48bbb6e691..7424541526 100644
--- a/.pipelines/stages/jobs/steps/python-validation-step.yml
+++ b/.pipelines/stages/jobs/steps/python-validation-step.yml
@@ -46,9 +46,9 @@ steps:
       python -m pip install --no-index --find-links=$(Build.BinariesDirectory)/wheel $(pip_package_name)
 
       if ("$(ep)" -eq "directml") {
-        python ${{ parameters.PythonScriptName }} -m .\${{ parameters.LocalFolder }}\${{ parameters.ModelFolder }} -e dml --non-interactive
+        python ${{ parameters.PythonScriptName }} -m .\${{ parameters.LocalFolder }}\${{ parameters.ModelFolder }} -e dml --non_interactive
       } else {
-        python ${{ parameters.PythonScriptName }} -m .\${{ parameters.LocalFolder }}\${{ parameters.ModelFolder }} -e $(ep) --non-interactive
+        python ${{ parameters.PythonScriptName }} -m .\${{ parameters.LocalFolder }}\${{ parameters.ModelFolder }} -e $(ep) --non_interactive
       }
     displayName: 'Run ${{ parameters.PythonScriptName }} With Artifact on Windows'
     workingDirectory: '$(Build.Repository.LocalPath)'
@@ -73,7 +73,7 @@ steps:
             $python_exe -m pip install -r /ort_genai_src/test/python/cuda/ort/requirements.txt && \
             cd /ort_genai_src/${{ parameters.PythonScriptFolder }} && \
             $python_exe -m pip install --no-index --find-links=/ort_genai_binary/wheel $(pip_package_name) && \
-            $python_exe ${{ parameters.PythonScriptName }} -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non-interactive"
+            $python_exe ${{ parameters.PythonScriptName }} -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non_interactive"
 
     displayName: 'Run ${{ parameters.PythonScriptName }} With Artifact on Linux CUDA'
     workingDirectory: '$(Build.Repository.LocalPath)'
@@ -92,7 +92,7 @@ steps:
       fi
       cd ${{ parameters.PythonScriptFolder }}
       python -m pip install --no-index --find-links=$(Build.BinariesDirectory)/wheel $(pip_package_name)
-      python ${{ parameters.PythonScriptName }} -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non-interactive
+      python ${{ parameters.PythonScriptName }} -m ./${{ parameters.LocalFolder }}/${{ parameters.ModelFolder }} -e $(ep) --non_interactive
     displayName: 'Run ${{ parameters.PythonScriptName }} With Artifact on Linux/macOS CPU'
     workingDirectory: '$(Build.Repository.LocalPath)'
     condition: and(or(eq(variables['os'], 'linux'), eq(variables['os'], 'osx')), eq(variables['ep'], 'cpu'))
\ No newline at end of file
diff --git a/README.md b/README.md
index 29313d8feb..aceeaedf66 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)
 
    model = og.Model('cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4')
    tokenizer = og.Tokenizer(model)
-   tokenizer_stream = tokenizer.create_stream()
+   stream = tokenizer.create_stream()
     
    # Set the max length to something sensible by default,
    # since otherwise it will be set to the entire context length
@@ -81,7 +81,7 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)
       while not generator.is_done():
          generator.generate_next_token()
          new_token = generator.get_next_tokens()[0]
-         print(tokenizer_stream.decode(new_token), end='', flush=True)
+         print(stream.decode(new_token), end='', flush=True)
    except KeyboardInterrupt:
          print("  --control+c pressed, aborting generation--")
 
diff --git a/build.py b/build.py
index c3b6e915ca..ff595232ce 100644
--- a/build.py
+++ b/build.py
@@ -775,16 +775,17 @@ def build_examples(args: argparse.Namespace, env: dict[str, str]):
     samples_to_build = [
         "-DMODEL_QA=ON",
         "-DMODEL_CHAT=ON",
-        "-DMODEL_VISION=ON",
-        "-DPHI4-MM=ON",
+        "-DMODEL_MM=ON",
         "-DWHISPER=ON",
     ]
 
-    include_dir = REPO_ROOT / "src"
-    lib_dir = args.build_dir
+    ort_include_dir = REPO_ROOT / "ort" / "include"
+    ort_lib_dir = REPO_ROOT / "ort" / "lib"
+    oga_include_dir = REPO_ROOT / "src"
+    oga_lib_dir = args.build_dir
     if util.is_windows():
         # On Windows, the library files are in a subdirectory named after the configuration (e.g. Debug, Release, etc.)
-        lib_dir = lib_dir / args.config
+        oga_lib_dir = oga_lib_dir / args.config
 
     cmake_command = (
         [
@@ -798,8 +799,10 @@ def build_examples(args: argparse.Namespace, env: dict[str, str]):
         ]
         + samples_to_build
         + [
-            "-DORT_GENAI_INCLUDE_DIR=" + str(include_dir),
-            "-DORT_GENAI_LIB_DIR=" + str(lib_dir),
+            "-DORT_INCLUDE_DIR=" + str(ort_include_dir),
+            "-DORT_LIB_DIR=" + str(ort_lib_dir),
+            "-DOGA_INCLUDE_DIR=" + str(oga_include_dir),
+            "-DOGA_LIB_DIR=" + str(oga_lib_dir),
         ]
     )
 
diff --git a/docs/ConstrainedDecoding.md b/docs/ConstrainedDecoding.md
new file mode 100644
index 0000000000..b45d3f2110
--- /dev/null
+++ b/docs/ConstrainedDecoding.md
@@ -0,0 +1,10 @@
+## Constrained Decoding
+
+Constrained Decoding is useful when using function/tool calling as it helps in ensuring the output is in the correct format (i.e. ensures structured outputs).
+
+We have integrated [LLGuidance](https://github.com/guidance-ai/llguidance) for constrained decoding. There are three types of constrained decoding enabled right now:
+1. Lark Grammar (Recommended): This option allows either regular output or function/tool output in JSON format.
+2. JSON Schema: The output will be JSON conforming to a JSON schema, and it will be one of the functions/tools provided.
+3. Regex: If a particular regular expression is desired.
+
+To ensure that function/tool calling works correctly with constrained decoding, you need to modify your `tokenizer.json` file. For each model that has its own tool calling tokens, the `special` attribute of each of those tokens needs to be set to `true`. For example, Phi-4 mini uses the `<|tool_call|>` and `<|/tool_call|>` tokens, so you should set their `special` attribute to `true` inside `tokenizer.json`.
diff --git a/documents/DownloadModels.md b/docs/DownloadModels.md
similarity index 88%
rename from documents/DownloadModels.md
rename to docs/DownloadModels.md
index 3221b992f4..88f0342537 100644
--- a/documents/DownloadModels.md
+++ b/docs/DownloadModels.md
@@ -1,10 +1,7 @@
 # Download Options for ONNX Runtime GenAI Models
 
-This guide covers two easy ways to download models for use with ONNX Runtime GenAI:
+This guide covers ways to download models for use with ONNX Runtime GenAI.
 
-Using Foundry Local
-
-Using Hugging Face CLI
 
 ## Download via Foundry Local
 
@@ -32,7 +29,7 @@ Using Hugging Face CLI
    huggingface-cli download <model_name> --include <subfolder_name>/* --local-dir .
    ```
 
-   For example, to download the Phi-4 mini instruct gpu model:
+   For example, to download the Phi-4 mini instruct generic-GPU model:
    ```
    huggingface-cli download microsoft/Phi-4-mini-instruct-onnx --include gpu/* --local-dir .
    ```
diff --git a/documents/Runtime_option.md b/docs/RuntimeOptions.md
similarity index 78%
rename from documents/Runtime_option.md
rename to docs/RuntimeOptions.md
index b8490c1cde..1c3c3c2ee1 100644
--- a/documents/Runtime_option.md
+++ b/docs/RuntimeOptions.md
@@ -1,6 +1,6 @@
 # Runtime Options
 
-This file will provide details on the usage of SetRuntimeOption API. It will list all the current key value pairs which can be used as an input for this API.
+This file provides details on the usage of the `SetRuntimeOption` API. It lists all the current key-value pairs that can be used as input for this API.
 
 ## Set Terminate
 
diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt
index d6d514cb3b..226e88d994 100644
--- a/examples/c/CMakeLists.txt
+++ b/examples/c/CMakeLists.txt
@@ -3,49 +3,91 @@ cmake_minimum_required(VERSION 3.18.1)
 project(ortgenaiapp)
 set(CMAKE_CXX_STANDARD 20)
 
+# Download and make available nlohmann/json
+include(FetchContent)
+FetchContent_Declare(
+  nlohmann_json
+  GIT_REPOSITORY https://github.com/nlohmann/json.git
+  GIT_TAG        v3.12.0     # Or update to latest release
+)
+FetchContent_MakeAvailable(nlohmann_json)
+
+# Download and make available CLI11
+include(FetchContent)
+FetchContent_Declare(
+  CLI11
+  GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
+  GIT_TAG        v2.6.1      # Or update to latest release
+)
+FetchContent_MakeAvailable(CLI11)
+
 option(USE_CXX "Invoke the C++ example" ON)
 option(MODEL_CHAT "Build the Model Chat example" OFF)
-option(MODEL_QA "Build the Model Q&A example without multi-turn prompting" OFF)
-option(MODEL_VISION "Build the Model Vision example" OFF)
-option(PHI4-MM "Build the Phi-4 mm example" OFF)
+option(MODEL_QA "Build the Model Q&A example" OFF)
+option(MODEL_MM "Build the Model Multimodal example" OFF)
 option(WHISPER "Build the Whisper example" OFF)
 
 if(USE_CXX)
   add_compile_definitions(USE_CXX)
 endif()
 
+# Set expected library filenames
 if(WIN32)
-  set(ONNXRUNTIME_GENAI_LIB "onnxruntime-genai.dll")
+  set(ORT_LIB_FILE "onnxruntime.dll")
+  set(OGA_LIB_FILE "onnxruntime-genai.dll")
 elseif(APPLE)
-  set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.dylib")
+  set(ORT_LIB_FILE "libonnxruntime.dylib")
+  set(OGA_LIB_FILE "libonnxruntime-genai.dylib")
 elseif(CMAKE_SYSTEM_NAME MATCHES "AIX")
-  set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.a")
+  set(ORT_LIB_FILE "libonnxruntime.a")
+  set(OGA_LIB_FILE "libonnxruntime-genai.a")
 else()
-  set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.so")
+  set(ORT_LIB_FILE "libonnxruntime.so")
+  set(OGA_LIB_FILE "libonnxruntime-genai.so")
 endif()
 
-# Set default library directory if not specified
-if(NOT ORT_GENAI_LIB_DIR)
-  set(ORT_GENAI_LIB_DIR "${CMAKE_SOURCE_DIR}/lib")
+# Set default variables to examples/c/include and examples/c/lib if not specified
+if(NOT ORT_INCLUDE_DIR)
+  set(ORT_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include")
+endif()
+if(NOT OGA_INCLUDE_DIR)
+  set(OGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include")
+endif()
+if(NOT ORT_LIB_DIR)
+  set(ORT_LIB_DIR "${CMAKE_SOURCE_DIR}/lib")
+endif()
+if(NOT OGA_LIB_DIR)
+  set(OGA_LIB_DIR "${CMAKE_SOURCE_DIR}/lib")
 endif()
 
-file(GLOB ort_genai_libs "${ORT_GENAI_LIB_DIR}/*")
-
-message(STATUS "ORT_GENAI_LIB_DIR: ${ORT_GENAI_LIB_DIR}")
+# Store all library files in each directory
+file(GLOB ort_libs "${ORT_LIB_DIR}/*")
+file(GLOB oga_libs "${OGA_LIB_DIR}/*")
 
 function(prepare_executable executable)
-  target_link_directories(${executable} PRIVATE ${ORT_GENAI_LIB_DIR})
-  target_link_libraries(${executable} PRIVATE ${ONNXRUNTIME_GENAI_LIB})
+  # Link directory and library for ORT and ORT GenAI
+  target_link_directories(${executable} PRIVATE ${ORT_LIB_DIR})
+  target_link_libraries(${executable} PRIVATE ${ORT_LIB_FILE})
+  target_link_directories(${executable} PRIVATE ${OGA_LIB_DIR})
+  target_link_libraries(${executable} PRIVATE ${OGA_LIB_FILE})
 
-  if (ORT_GENAI_INCLUDE_DIR)
-    target_include_directories(${executable} PRIVATE ${ORT_GENAI_INCLUDE_DIR})
-  else()
-    target_include_directories(${executable} PRIVATE ${CMAKE_SOURCE_DIR}/include)
-  endif()
+  # Add include directories for each executable
+  target_include_directories(${executable} PRIVATE ${ORT_INCLUDE_DIR})
+  target_include_directories(${executable} PRIVATE ${OGA_INCLUDE_DIR})
 
+  target_link_libraries(${executable} PUBLIC onnxruntime)
   target_link_libraries(${executable} PUBLIC onnxruntime-genai)
 
-  foreach(DEPENDENCY_FILE ${ort_genai_libs})
+  foreach(DEPENDENCY_FILE ${ort_libs})
+    if (NOT IS_DIRECTORY ${DEPENDENCY_FILE})
+      add_custom_command(
+        TARGET ${executable} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DEPENDENCY_FILE} $<TARGET_FILE_DIR:${executable}>
+      )
+    endif()
+  endforeach()
+
+  foreach(DEPENDENCY_FILE ${oga_libs})
     if (NOT IS_DIRECTORY ${DEPENDENCY_FILE})
       add_custom_command(
         TARGET ${executable} POST_BUILD
@@ -60,24 +102,27 @@ set(EXAMPLES_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src)
 if(MODEL_CHAT)
   add_executable(model_chat ${EXAMPLES_SOURCE_DIR}/model_chat.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
   prepare_executable(model_chat)
+  target_link_libraries(model_chat PRIVATE nlohmann_json::nlohmann_json)
+  target_link_libraries(model_chat PRIVATE CLI11::CLI11)
 endif()
 
 if(MODEL_QA)
   add_executable(model_qa ${EXAMPLES_SOURCE_DIR}/model_qa.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
   prepare_executable(model_qa)
+  target_link_libraries(model_qa PRIVATE nlohmann_json::nlohmann_json)
+  target_link_libraries(model_qa PRIVATE CLI11::CLI11)
 endif()
 
-if(MODEL_VISION)
-  add_executable(model_vision ${EXAMPLES_SOURCE_DIR}/model_vision.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
-  prepare_executable(model_vision)
-endif()
-
-if(PHI4-MM)
-  add_executable(phi4-mm ${CMAKE_SOURCE_DIR}/src/phi4-mm.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
-  prepare_executable(phi4-mm)
+if(MODEL_MM)
+  add_executable(model_mm ${EXAMPLES_SOURCE_DIR}/model_mm.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
+  prepare_executable(model_mm)
+  target_link_libraries(model_mm PRIVATE nlohmann_json::nlohmann_json)
+  target_link_libraries(model_mm PRIVATE CLI11::CLI11)
 endif()
 
 if(WHISPER)
-  add_executable(whisper ${CMAKE_SOURCE_DIR}/src/whisper.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
+  add_executable(whisper ${EXAMPLES_SOURCE_DIR}/whisper.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
   prepare_executable(whisper)
+  target_link_libraries(whisper PRIVATE nlohmann_json::nlohmann_json)
+  target_link_libraries(whisper PRIVATE CLI11::CLI11)
 endif()
diff --git a/examples/c/README.md b/examples/c/README.md
index a947b06493..2301ce37a9 100644
--- a/examples/c/README.md
+++ b/examples/c/README.md
@@ -1,21 +1,16 @@
-# ONNX Runtime GenAI C Example
+# ONNX Runtime GenAI C/C++ Examples
 
-> 📝 **Note:** The examples from the main branch of this repository are compatible with the binaries build from the same commit. Therefore, if using the example from `main`, ONNX Runtime GenAI needs to be built from source. If this is your scenario, just build the library and the examples will be auto built along with the library.
-If this is not your scenario, please use prebuilt binaries from the release you're interested in and use the examples from the same version tag and follow the steps below.
+> 📝 **Note:** The examples from the main branch of this repository are compatible with the binaries built from the same commit. Therefore, if using the example from `main`, ONNX Runtime GenAI needs to be built from source. If this is your scenario, just build the library and the examples will be auto built along with the library. If this is not your scenario, please use prebuilt binaries from the release you're interested in and use the examples from the same version tag and follow the steps below.
 
-## Download the model
+## Install ONNX Runtime GenAI
 
-1. Download and install [foundry-local](https://github.com/microsoft/Foundry-Local/releases)
-2. List available models: `foundry model list`
-3. Download a model you would like to run. For example: `foundry model download Phi-4-generic-cpu`
-4. Find out where the model is saved on disk: `foundry cache location`
-5. Identify the path to the model on disk. For example: `C:\Users\<user>\.foundry\Microsoft\Phi-4-generic-cpu\cpu-int4-rtn-block-32-acc-level-4`
+Install the C headers according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install) or [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html).
 
-> 📝 **Note:** Foundry Local CLI is not available on Linux at the moment. Please download the model from a Windows or a macOS machine and copy it over to your Linux machine if you would like to run on Linux.
+## Download a Model
 
-For other options to download models, read through [our download options](https://github.com/microsoft/onnxruntime-genai/blob/main/documents/DownloadModels.md).
+There are many places to obtain a model. Please read through [our download options](https://github.com/microsoft/onnxruntime-genai/blob/main/docs/DownloadModels.md).
 
-## Build the C++ Example
+## Build a C/C++ Example
 
 1. Clone the repo: `git clone https://github.com/microsoft/onnxruntime-genai.git`
    - Use the relevant release tag that aligns with the version of the libraries you're planning to use.
@@ -59,11 +54,65 @@ For other options to download models, read through [our download options](https:
       cmake --build build --parallel --config Debug
       ```
 
-## Run the sample
+## Run an Example
 
 1. On Windows:
-   - cd build\Debug
-   - .\model_qa.exe <path/to/model/from/above> <execution_provider>
+
+```powershell
+# Prerequisite: navigate to the compiled binaries.
+cd build\Debug
+```
+
+```powershell
+# The `model_chat` example allows for multi-turn conversations.
+.\model_chat.exe -m {path to model folder} -e {execution provider}
+```
+
+```powershell
+# The `model_qa` example streams the output text token by token.
+.\model_qa.exe -m {path to model folder} -e {execution provider}
+```
+
+```powershell
+# The `model_mm` example works for multi-modal models and streams the output text token by token.
+.\model_mm.exe -m {path to model folder} -e {execution provider}
+```
+
 2. On Linux and macOS:
-   - cd build
-   - ./model_qa <path/to/model/from/above> <execution_provider>
+
+```bash
+# Prerequisite: navigate to the compiled binaries.
+cd build
+```
+
+```bash
+# The `model_chat` example allows for multi-turn conversations.
+./model_chat -m {path to model folder} -e {execution provider}
+```
+
+```bash
+# The `model_qa` example streams the output text token by token.
+./model_qa -m {path to model folder} -e {execution provider}
+```
+
+```bash
+# The `model_mm` example works for multi-modal models and streams the output text token by token.
+./model_mm -m {path to model folder} -e {execution provider}
+```
+
+## Tool Calling
+
+Please read through [our constrained decoding](https://github.com/microsoft/onnxruntime-genai/blob/main/docs/ConstrainedDecoding.md) options to learn more.
+
+Here are some examples of how you can run the C/C++ examples with function/tool calling.
+
+```bash
+# Using JSON Schema with only tool call output
+./model_qa -m {path to model folder} -e {execution provider} --response_format json_schema --tools_file {path to json file} --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+
+# Using Lark Grammar with only tool call output
+./model_mm -m {path to model folder} -e {execution provider} --response_format lark_grammar --tools_file {path to json file} --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+
+# Using Lark Grammar with text or tool call output
+./model_chat -m {path to model folder} -e {execution provider} --response_format lark_grammar --tools_file {path to json file} --text_output --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+```
diff --git a/examples/c/src/common.cpp b/examples/c/src/common.cpp
index b91bfc3c81..15159e779a 100644
--- a/examples/c/src/common.cpp
+++ b/examples/c/src/common.cpp
@@ -1,9 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "common.h"
 #include <cassert>
 
+#include "common.h"
+
 void Timing::RecordStartTimestamp() {
   assert(start_timestamp_.time_since_epoch().count() == 0);
   start_timestamp_ = Clock::now();
@@ -39,10 +40,6 @@ void Timing::Log(const int prompt_tokens_length, const int new_tokens_length) {
   std::cout << "-------------" << std::endl;
 }
 
-bool FileExists(const char* path) {
-  return static_cast<bool>(std::ifstream(path));
-}
-
 std::string Trim(const std::string& str) {
   const size_t first = str.find_first_not_of(' ');
   if (std::string::npos == first) {
@@ -52,69 +49,642 @@ std::string Trim(const std::string& str) {
   return str.substr(first, (last - first + 1));
 }
 
-static void print_usage(int /*argc*/, char** argv) {
-  std::cerr << "usage: " << argv[0] << " <model_path> [execution_provider] [ep_library_path]" << std::endl;
-  std::cerr << "  model_path:         [required] Path to the folder containing onnx models, genai_config.json, etc." << std::endl;
-  std::cerr << "  execution_provider: [optional] Force use of a particular execution provider (e.g. \"cpu\", \"cuda\", \"NvTensorRtRtx\")" << std::endl;
-  std::cerr << "                      If not specified, EP / provider options specified in genai_config.json will be used." << std::endl;
-  std::cerr << "  ep_library_path:    [optional] Path to execution provider DLL/SO for plug-in providers" << std::endl;
-  std::cerr << "                      Example: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll" << std::endl;
-  std::cerr << std::endl;
-  std::cerr << "Examples:" << std::endl;
-  std::cerr << "  " << argv[0] << " /path/to/model" << std::endl;
-  std::cerr << "  " << argv[0] << " /path/to/model cuda" << std::endl;
-  std::cerr << "  " << argv[0] << " /path/to/model cuda /path/to/onnxruntime_providers_cuda.dll" << std::endl;
-  std::cerr << "  " << argv[0] << " /path/to/model NvTensorRtRtx /path/to/onnxruntime_providers_tensorrt.dll" << std::endl;
-}
-
-bool parse_args(int argc, char** argv, std::string& model_path, std::string& ep, std::string* ep_library_path) {
-  if (argc < 2) {
-    print_usage(argc, argv);
+// Define to_json and from_json for std::optional<T>
+// Must be done within nlohmann::adl_serializer and not as standalone methods
+namespace nlohmann {
+template <class T>
+struct adl_serializer<std::optional<T>> {
+  static void to_json(nlohmann::ordered_json& j, const std::optional<T>& opt) {
+    if (opt.has_value()) {
+      j = *opt;
+    } else {
+      j = nullptr;
+    }
+  }
+  static void from_json(const nlohmann::ordered_json& j, std::optional<T>& opt) {
+    if (j.is_null()) {
+      opt = std::nullopt;
+      return;
+    }
+    opt = j.get<T>();
+  }
+};
+}  // namespace nlohmann
+
+void to_json(nlohmann::ordered_json& j, const ToolSchema& tool) {
+  j = nlohmann::ordered_json{{"description", tool.description}, {"type", tool.type}, {"properties", tool.properties}, {"required", tool.required}, {"additionalProperties", tool.additionalProperties}};
+}
+
+void from_json(const nlohmann::ordered_json& j, ToolSchema& tool) {
+  j.at("type").get_to(tool.type);
+
+  if (j.contains("description")) {
+    j.at("description").get_to(tool.description);
+  }
+
+  if (j.contains("properties")) {
+    tool.properties = j.at("properties");
+  }
+
+  if (j.contains("required")) {
+    j.at("required").get_to(tool.required);
+  }
+
+  if (j.contains("additionalProperties")) {
+    j.at("additionalProperties").get_to(tool.additionalProperties);
+  } else {
+    tool.additionalProperties = false;
+  }
+}
+
+void to_json(nlohmann::ordered_json& j, const JsonSchema& schema) {
+  j = nlohmann::ordered_json{{"x-guidance", schema.xGuidance}, {"type", schema.type}, {"items", schema.items}, {"minItems", schema.minItems}};
+}
+
+void from_json(const nlohmann::ordered_json& j, JsonSchema& schema) {
+  j.at("x-guidance").get_to(schema.xGuidance);
+  j.at("type").get_to(schema.type);
+  j.at("items").get_to(schema.items);
+  j.at("minItems").get_to(schema.minItems);
+}
+
+void to_json(nlohmann::ordered_json& j, const FunctionDefinition& func) {
+  j = nlohmann::ordered_json{{"name", func.name}, {"description", func.description}, {"parameters", func.parameters}};
+}
+
+void from_json(const nlohmann::ordered_json& j, FunctionDefinition& func) {
+  j.at("name").get_to(func.name);
+
+  if (j.contains("description")) {
+    j.at("description").get_to(func.description);
+  }
+
+  if (j.contains("parameters")) {
+    func.parameters = j.at("parameters");
+  }
+}
+
+void to_json(nlohmann::ordered_json& j, const Tool& t) {
+  j = nlohmann::ordered_json{{"type", t.type}, {"function", t.function}};
+}
+
+void from_json(const nlohmann::ordered_json& j, Tool& t) {
+  j.at("type").get_to(t.type);
+  j.at("function").get_to(t.function);
+}
+
+void to_json(nlohmann::ordered_json& j, const GeneratorParamsArgs& a) {
+  j = nlohmann::ordered_json{{"batch_size", a.batch_size}, {"num_beams", a.num_beams}, {"num_return_sequences", a.num_return_sequences}};
+  // Add optional generator params if provided
+  if (a.chunk_size != 0) j["chunk_size"] = a.chunk_size;
+  if (a.do_sample) j["do_sample"] = a.do_sample.value();
+  if (a.min_length) j["min_length"] = a.min_length.value();
+  if (a.max_length) j["max_length"] = a.max_length.value();
+  if (a.repetition_penalty) j["repetition_penalty"] = a.repetition_penalty.value();
+  if (a.temperature) j["temperature"] = a.temperature.value();
+  if (a.top_k) j["top_k"] = a.top_k.value();
+  if (a.top_p) j["top_p"] = a.top_p.value();
+}
+
+void from_json(const nlohmann::ordered_json& j, GeneratorParamsArgs& a) {
+  if (j.contains("batch_size")) j.at("batch_size").get_to(a.batch_size);
+  if (j.contains("chunk_size")) j.at("chunk_size").get_to(a.chunk_size);
+  if (j.contains("do_sample")) j.at("do_sample").get_to(a.do_sample);
+  if (j.contains("min_length")) j.at("min_length").get_to(a.min_length);
+  if (j.contains("max_length")) j.at("max_length").get_to(a.max_length);
+  if (j.contains("num_beams")) j.at("num_beams").get_to(a.num_beams);
+  if (j.contains("num_return_sequences")) j.at("num_return_sequences").get_to(a.num_return_sequences);
+  if (j.contains("repetition_penalty")) j.at("repetition_penalty").get_to(a.repetition_penalty);
+  if (j.contains("temperature")) j.at("temperature").get_to(a.temperature);
+  if (j.contains("top_k")) j.at("top_k").get_to(a.top_k);
+  if (j.contains("top_p")) j.at("top_p").get_to(a.top_p);
+}
+
+// Parses the command-line arguments of the C/C++ examples into the output parameters. Returns false (after printing the parse error or the help text) when parsing fails or help is requested.
+bool ParseArgs(
+    int argc,
+    char** argv,
+    GeneratorParamsArgs& generator_params_args,
+    GuidanceArgs& guidance_args,
+    std::string& model_path,
+    std::string& ep,
+    std::string& ep_path,
+    std::string& system_prompt,
+    std::string& user_prompt,
+    bool& verbose,
+    bool& debug,
+    bool& interactive,
+    bool& rewind,
+    std::vector<std::string>& image_paths,
+    std::vector<std::string>& audio_paths) {
+  CLI::App app{"Command-line arguments for ORT GenAI C/C++ examples"};
+  argv = app.ensure_utf8(argv);
+
+  std::string generator_params("Generator Params");
+  std::string guidance("Guidance Arguments");
+
+  app.add_option("-b,--batch_size", generator_params_args.batch_size, "Batch size used during inference.")->group(generator_params);
+  app.add_option("-c,--chunk_size", generator_params_args.chunk_size, "Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)")->group(generator_params);
+  app.add_option("-s,--do_sample", generator_params_args.do_sample, "Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false")->group(generator_params);
+  app.add_option("-i,--min_length", generator_params_args.min_length, "Min number of tokens to generate including the prompt")->group(generator_params);
+  app.add_option("-l,--max_length", generator_params_args.max_length, "Max number of tokens to generate including the prompt")->group(generator_params);
+  app.add_option("-n,--num_beams", generator_params_args.num_beams, "Number of beams to create")->group(generator_params);
+  app.add_option("-q,--num_return_sequences", generator_params_args.num_return_sequences, "Number of return sequences to produce")->group(generator_params);
+  app.add_option("-r,--repetition_penalty", generator_params_args.repetition_penalty, "Repetition penalty to sample with")->group(generator_params);
+  app.add_option("-t,--temperature", generator_params_args.temperature, "Temperature to sample with")->group(generator_params);
+  app.add_option("-k,--top_k", generator_params_args.top_k, "Top k tokens to sample from")->group(generator_params);
+  app.add_option("-p,--top_p", generator_params_args.top_p, "Top p probability to sample with")->group(generator_params);
+
+  app.add_option("--response_format", guidance_args.response_format, "Provide response format for the model")->group(guidance);
+  app.add_option("--tools_file", guidance_args.tools_file, "Path to file containing list of OpenAI-compatible tool definitions. Ex: test/test_models/tool-definitions/weather.json")->group(guidance);
+  app.add_flag("--text_output", guidance_args.text_output, "Produce a text response in the output")->group(guidance);
+  app.add_flag("--tool_output", guidance_args.tool_output, "Produce a tool call in the output")->group(guidance);
+  app.add_option("--tool_call_start", guidance_args.tool_call_start, "String representation of tool call start (ex: <|tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.")->group(guidance);
+  app.add_option("--tool_call_end", guidance_args.tool_call_end, "String representation of tool call end (ex: <|/tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.")->group(guidance);
+
+  app.add_option("-m,--model_path", model_path, "ONNX model folder path (must contain genai_config.json and model.onnx)")->required();
+  app.add_option("-e,--execution_provider", ep, "Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.");
+  app.add_flag("-v,--verbose", verbose, "Print verbose output and timing information. Defaults to false");
+  app.add_flag("-d,--debug", debug, "Dump input and output tensors with debug mode. Defaults to false");
+
+  app.add_option("--ep_path", ep_path, "Path to execution provider DLL/SO for plug-in providers (ex: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll)");
+  app.add_option("--system_prompt", system_prompt, "System prompt to use for the model.");
+  app.add_option("--user_prompt", user_prompt, "User prompt to use for the model.");
+  app.add_flag("--rewind", rewind, "Rewind to the system prompt after each generation. Defaults to false. Only used in model_chat.");
+  app.add_flag_callback("--non_interactive", [&] { interactive = false; }, "Disable interactive mode");
+
+  app.add_option("--image_paths", image_paths, "Space-separated list of paths to images. Only used in model_mm.")->expected(0, -1);
+  app.add_option("--audio_paths", audio_paths, "Space-separated list of paths to audios. Only used in model_mm.")->expected(0, -1);
+
+  try {
+    app.parse(argc, argv);
+  } catch (const CLI::ParseError& e) {
+    (void)app.exit(e);  // Prints the help text for -h/--help, or the specific parse error for invalid arguments
     return false;
   }
-  model_path = argv[1];
-  if (argc > 2) {
-    ep = argv[2];
+  return true;
+}
+
+void SetLogger(bool inputs, bool outputs) {
+  Oga::SetLogBool("enabled", true);
+  Oga::SetLogBool("model_input_values", inputs);
+  Oga::SetLogBool("model_output_values", outputs);
+}
+
+void RegisterEP(const std::string& ep, const std::string& ep_path) {
+  if (ep_path.empty()) {
+    return;  // No library path specified, skip registration
+  }
+
+  std::cout << "Registering execution provider: " << ep_path << std::endl;
+  auto env = Ort::Env();
+  if (ep.compare("cuda") == 0) {
+    env.RegisterExecutionProviderLibrary("CUDAExecutionProvider", std::filesystem::path(ep_path).c_str());
+  } else if (ep.compare("NvTensorRtRtx") == 0) {
+    env.RegisterExecutionProviderLibrary("NvTensorRTRTXExecutionProvider", std::filesystem::path(ep_path).c_str());
   } else {
-    ep = "follow_config";
+    std::cout << "Warning: EP registration not supported for " << ep << std::endl;
+    std::cout << "Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries." << std::endl;
+    return;
+  }
+
+  std::cout << "Registered " << ep << " successfully!" << std::endl;
+}
+
+std::unique_ptr<OgaConfig> GetConfig(const std::string& path, const std::string& ep, const std::unordered_map<std::string, std::string>& ep_options, GeneratorParamsArgs& search_options) {
+  auto config = OgaConfig::Create(path.c_str());
+  if (ep.compare("follow_config") != 0) {
+    config->ClearProviders();
+    if (ep.compare("cpu") != 0) {
+      std::cout << "Setting model to " << ep << std::endl;
+      config->AppendProvider(ep.c_str());
+    }
+
+    // Set any EP-specific options
+    for (const auto& [key, val] : ep_options) {
+      if (key.compare("enable_cuda_graph") == 0 && (ep.compare("cuda") == 0 || ep.compare("NvTensorRtRtx") == 0) && search_options.num_beams > 1) {
+        config->SetProviderOption(ep.c_str(), "enable_cuda_graph", "0");
+      } else {
+        config->SetProviderOption(ep.c_str(), key.c_str(), val.c_str());
+      }
+    }
+  }
+
+  // Set any search-specific options that need to be known before constructing a Model object
+  // Otherwise they can be set with params.SetSearchOptions(search_options)
+  nlohmann::ordered_json j = search_options;
+  std::string s = j.dump();
+  config->Overlay(s.c_str());
+  return config;
+}
+
+void SetSearchOptions(OgaGeneratorParams& generatorParams, GeneratorParamsArgs& args, bool verbose) {
+  std::vector<std::string> opts;
+  if (args.batch_size) {
+    generatorParams.SetSearchOption("batch_size", args.batch_size);
+    opts.push_back("batch_size: " + std::to_string(args.batch_size));
+  }
+  if (args.do_sample) {
+    generatorParams.SetSearchOptionBool("do_sample", args.do_sample.value());
+    opts.push_back("do_sample: " + std::to_string(args.do_sample.value()));
+  }
+  if (args.min_length) {
+    generatorParams.SetSearchOption("min_length", args.min_length.value());
+    opts.push_back("min_length: " + std::to_string(args.min_length.value()));
+  }
+  if (args.num_beams) {
+    generatorParams.SetSearchOption("num_beams", args.num_beams);
+    opts.push_back("num_beams: " + std::to_string(args.num_beams));
+  }
+  if (args.num_return_sequences) {
+    generatorParams.SetSearchOption("num_return_sequences", args.num_return_sequences);
+    opts.push_back("num_return_sequences: " + std::to_string(args.num_return_sequences));
+  }
+  if (args.repetition_penalty) {
+    generatorParams.SetSearchOption("repetition_penalty", args.repetition_penalty.value());
+    opts.push_back("repetition_penalty: " + std::to_string(args.repetition_penalty.value()));
+  }
+  if (args.temperature) {
+    generatorParams.SetSearchOption("temperature", args.temperature.value());
+    opts.push_back("temperature: " + std::to_string(args.temperature.value()));
+  }
+  if (args.top_k) {
+    generatorParams.SetSearchOption("top_k", args.top_k.value());
+    opts.push_back("top_k: " + std::to_string(args.top_k.value()));
+  }
+  if (args.top_p) {
+    generatorParams.SetSearchOption("top_p", args.top_p.value());
+    opts.push_back("top_p: " + std::to_string(args.top_p.value()));
+  }
+  if (verbose) {
+    std::cout << "GeneratorParams created: {";
+    for (int i = 0; i < opts.size(); i++) {
+      std::cout << opts[i];
+      if (i != opts.size() - 1) std::cout << ", ";
+    }
+    std::cout << "}" << std::endl;
   }
-  if (ep_library_path) {
-    if (argc > 3) {
-      *ep_library_path = argv[3];
+}
+
+std::string ApplyChatTemplate(const std::string& model_path, OgaTokenizer& tokenizer, const std::string& messages, bool add_generation_prompt, const std::string& tools) {
+  std::string template_str = "";
+  std::filesystem::path jinja_path = std::filesystem::path(model_path) / "chat_template.jinja";
+  if (std::filesystem::exists(jinja_path)) {
+    std::ifstream file(jinja_path, std::ios::binary);
+    if (file) {
+      std::ostringstream oss;
+      oss << file.rdbuf();
+      template_str = oss.str();
     } else {
-      *ep_library_path = "";
+      // If the file exists but can't be opened, fall back to empty template.
+      template_str.clear();
     }
   }
-  return true;
+
+  std::string prompt = std::string(tokenizer.ApplyChatTemplate(template_str.c_str(), messages.c_str(), tools.c_str(), add_generation_prompt));
+  return prompt;
 }
 
-void append_provider(OgaConfig& config, const std::string& provider) {
-  if (provider.compare("follow_config") != 0) {
-    config.ClearProviders();
-    if (provider.compare("cpu") != 0) {
-      config.AppendProvider(provider.c_str());
-      if (provider.compare("cuda") == 0) {
-        config.SetProviderOption(provider.c_str(), "enable_cuda_graph", "0");
-      }
+// Returns `prompt` when non-interactive (throws std::invalid_argument if it is empty, as
+// re-reading the same empty string would loop forever); otherwise the first non-empty stdin line.
+std::string GetUserPrompt(const std::string& prompt, bool interactive) {
+  if (!interactive) {
+    // Use provided prompt (whether default or user-provided)
+    if (prompt.empty()) {
+      throw std::invalid_argument("Empty prompt. Please provide a valid prompt with --user_prompt.");
+    }
+    return prompt;
+  }
+
+  std::string text;
+  while (true) {
+    std::cout << "Prompt (Use quit() to exit):" << std::endl;
+    // Clear any cin error flags because of SIGINT
+    std::cin.clear();
+    std::getline(std::cin, text);
+    if (!text.empty()) {
+      break;
+    }
+    std::cout << "Empty input. Please enter a valid prompt." << std::endl;
+  }
+
+  return text;
+}
+
+// Resolves media file paths from `media_paths` or, when that is empty and `interactive` is set, from a comma-separated stdin line. `media_type` ("image" or "audio") is matched case-insensitively. Throws if the type is invalid or a file is missing.
+std::vector<std::string> GetUserMediaPaths(const std::vector<std::string>& media_paths, bool interactive, std::string& media_type) {
+  // Normalize the media type so that validation and messages do not depend on the caller's casing
+  std::string media_type_lower = media_type;
+  std::transform(media_type_lower.begin(), media_type_lower.end(), media_type_lower.begin(), [](unsigned char c) { return std::tolower(c); });
+  if (!(media_type_lower == "audio" || media_type_lower == "image")) {
+    throw std::invalid_argument("Media type must be 'image' or 'audio'");
+  }
+  std::string media_type_capitalized = static_cast<char>(std::toupper(static_cast<unsigned char>(media_type_lower[0]))) + media_type_lower.substr(1);
+
+  std::vector<std::string> paths;
+  if (!media_paths.empty()) {
+    // If user-provided media paths
+    paths = media_paths;
+  } else if (interactive) {
+    // If interactive mode is on
+    std::string paths_str;
+    std::cout << media_type_capitalized << " Path (comma separated; leave empty if no " << media_type_lower << "):" << std::endl;
+    std::getline(std::cin, paths_str);
+
+    for (size_t start = 0, end = 0; end < paths_str.size(); start = end + 1) {
+      end = paths_str.find(',', start);
+      paths.push_back(Trim(paths_str.substr(start, end - start)));
+    }
+  }
+
+  paths.erase(std::remove_if(paths.begin(), paths.end(), [](const std::string& s) { return s.empty(); }), paths.end());
+  for (const auto& path : paths) {
+    if (!std::filesystem::exists(path)) {
+      std::string error_message = media_type_capitalized + " file not found: " + path;
+      throw std::runtime_error(error_message);
     }
+    std::cout << "Using " << media_type_lower << ": " << path << std::endl;
   }
+
+  return paths;
 }
 
-void register_provider_library(const std::string& provider, const std::string& library_path) {
-  if (library_path.empty()) {
-    return;  // No library path specified, skip registration
+std::tuple<std::unique_ptr<OgaImages>, int> GetUserImages(const std::vector<std::string>& image_paths, bool interactive) {
+  // Resolve the image file paths via GetUserMediaPaths; an empty result means there is nothing to load
+  std::string kind = "image";
+  std::vector<std::string> resolved = GetUserMediaPaths(image_paths, interactive, kind);
+  if (resolved.empty()) {
+    std::cout << "No " << kind << " provided" << std::endl;
+    return std::make_tuple(nullptr, 0);
+  }
+
+  // OgaImages::Load is handed C-string pointers; they stay valid because `resolved` outlives the call
+  std::vector<const char*> c_paths(resolved.size());
+  for (size_t i = 0; i < resolved.size(); ++i) c_paths[i] = resolved[i].c_str();
+
+  std::unique_ptr<OgaImages> loaded = OgaImages::Load(c_paths);
+  return std::make_tuple(std::move(loaded), static_cast<int>(resolved.size()));
+}
+
+std::tuple<std::unique_ptr<OgaAudios>, int> GetUserAudios(const std::vector<std::string>& audio_paths, bool interactive) {
+  // Resolve the audio file paths via GetUserMediaPaths; an empty result means there is nothing to load
+  std::string kind = "audio";
+  std::vector<std::string> resolved = GetUserMediaPaths(audio_paths, interactive, kind);
+  if (resolved.empty()) {
+    std::cout << "No " << kind << " provided" << std::endl;
+    return std::make_tuple(nullptr, 0);
+  }
+
+  // OgaAudios::Load is handed C-string pointers; they stay valid because `resolved` outlives the call
+  std::vector<const char*> c_paths(resolved.size());
+  for (size_t i = 0; i < resolved.size(); ++i) c_paths[i] = resolved[i].c_str();
+
+  std::unique_ptr<OgaAudios> loaded = OgaAudios::Load(c_paths);
+  return std::make_tuple(std::move(loaded), static_cast<int>(resolved.size()));
+}
+
+nlohmann::ordered_json GetUserContent(const std::string& model_type, int num_images, int num_audios, const std::string& prompt) {
+  // Combine media placeholders and the user prompt into the 'content' value of a user message.
+  // Tag-based model types produce a single JSON string; every other model type produces a
+  // JSON array of typed blocks.
+
+  // Builds "<prefix>1|>\\n<prefix>2|>\\n..." for `count` items (numbering starts at 1).
+  // NOTE(review): "\\n" is a backslash followed by 'n', not a newline character - confirm this is intended.
+  auto numbered_tags = [](const std::string& prefix, int count) {
+    std::string tags;
+    for (int n = 1; n <= count; ++n) {
+      tags += prefix + std::to_string(n) + "|>\\n";
+    }
+    return tags;
+  };
+
+  if (model_type == "phi3v") {
+    // Phi-3 vision, Phi-3.5 vision: numbered image tags followed by the prompt
+    const std::string image_tags = numbered_tags("<|image_", num_images);
+    return nlohmann::ordered_json(image_tags + prompt);
+  }
+
+  if (model_type == "phi4mm") {
+    // Phi-4 multimodal: image tags, then audio tags, then the prompt
+    const std::string image_tags = numbered_tags("<|image_", num_images);
+    const std::string audio_tags = numbered_tags("<|audio_", num_audios);
+    return nlohmann::ordered_json(image_tags + audio_tags + prompt);
+  }
+
+  if (model_type == "qwen2_5_vl" || model_type == "fara") {
+    // Qwen-2.5 VL, Fara: one fixed vision placeholder per image, then the prompt
+    std::string placeholders;
+    for (int n = 0; n < num_images; ++n) {
+      placeholders += "<|vision_start|><|image_pad|><|vision_end|>";
+    }
+    return nlohmann::ordered_json(placeholders + prompt);
+  }
+
+  // Any other model type (Gemma-3 style): structured content with one image block per image
+  nlohmann::ordered_json blocks = nlohmann::ordered_json::array();
+  for (int n = 0; n < num_images; ++n) {
+    blocks.push_back(nlohmann::ordered_json::object({{"type", "image"}}));
+  }
+
+  // A text block carrying the user prompt is always appended last
+  blocks.push_back(nlohmann::ordered_json::object({{"type", "text"}, {"text", prompt}}));
+  return blocks;
+}
+
+std::vector<ToolSchema> ToolsToSchemas(std::vector<Tool>& tools) {
+  std::vector<ToolSchema> tool_schemas;
+  for (Tool tool : tools) {
+    std::unordered_map<std::string, std::string> name;
+    name["const"] = tool.function.name;
+    // The schema's "name" property is constrained ("const") to exactly this tool's function name
+    nlohmann::ordered_json properties = {};
+    properties["name"] = name;
+
+    bool tool_parameters_exist = tool.function.parameters.size() != 0;  // "parameters" is emitted (and required) only in this case
+    if (tool_parameters_exist) {
+      nlohmann::ordered_json parameters = {};
+      parameters["type"] = tool.function.parameters.contains("type") ? tool.function.parameters["type"] : "object";
+      // `ordered_json x = {}` default-constructs a JSON null, so the empty-object fallback is built explicitly
+      nlohmann::ordered_json empty_map = nlohmann::ordered_json::object();
+      parameters["properties"] = tool.function.parameters.contains("properties") ? tool.function.parameters["properties"] : empty_map;
+      std::vector<std::string> empty_list;
+      parameters["required"] = tool.function.parameters.contains("required") ? tool.function.parameters["required"].get<std::vector<std::string>>() : empty_list;
+      properties["parameters"] = parameters;
+    }
+
+    ToolSchema tool_schema;
+    tool_schema.description = tool.function.description;
+    tool_schema.type = "object";
+    tool_schema.properties = properties;
+    tool_schema.required = tool_parameters_exist ? std::vector<std::string>{"name", "parameters"} : std::vector<std::string>{"name"};
+    tool_schema.additionalProperties = false;
+
+    tool_schemas.push_back(tool_schema);
   }
+  return tool_schemas;
+}
+
+std::string GetJsonSchema(std::vector<Tool>& tools, bool tool_output) {
+  JsonSchema schema;
+  schema.type = "array";
+
+  // Each array item must match one of the per-tool schemas
+  schema.items["anyOf"] = ToolsToSchemas(tools);
+
+  // Require at least one item when tool_output is set; otherwise an empty array is allowed
+  if (tool_output) {
+    schema.minItems = 1;
+  } else {
+    schema.minItems = 0;
+  }
+
+  // Whitespace and separator settings stored in the schema's xGuidance member
+  schema.xGuidance["whitespace_flexible"] = false;
+  schema.xGuidance["key_separator"] = ": ";
+  schema.xGuidance["item_separator"] = ", ";
+
+  // Serialize JSON schema to string
+  return nlohmann::ordered_json(schema).dump();
+}
+
+std::string GetLarkGrammar(std::vector<Tool>& tools, bool text_output, bool tool_output, const std::string& tool_call_start, const std::string& tool_call_end) {
+  bool known_tool_call_ids = tool_call_start != "" && tool_call_end != "";
+  std::string call_type = known_tool_call_ids ? "toolcall" : "functioncall";
+  // The start rule allows free text, a tool call, or either one, depending on the requested outputs
+  std::vector<std::string> rows;
+  std::string start_row;
+  if (text_output && !tool_output) {
+    start_row = "start: TEXT";
+  } else if (!text_output && tool_output) {
+    start_row = "start: " + call_type;
+  } else if (text_output && tool_output) {
+    start_row = "start: TEXT | " + call_type;
+  } else {
+    throw std::runtime_error("At least one of 'text_output' and 'tool_output' must be true");  // by value, so catch (const std::exception&) sees it
+  }
+  rows.push_back(start_row);
+
+  if (text_output) {
+    std::string text_row = "TEXT: /[^{<](.|\\n)*/";
+    rows.push_back(text_row);
+  }
+
+  if (tool_output) {
+    std::string schema = GetJsonSchema(tools, tool_output);
+    if (known_tool_call_ids) {
+      std::string tool_row = "toolcall: " + tool_call_start + " functioncall " + tool_call_end;
+      rows.push_back(tool_row);
+    }
+
+    std::string func_row = "functioncall: %json " + schema;
+    rows.push_back(func_row);
+  }
+  // Join the rules with newlines (no trailing newline); size_t index avoids a signed/unsigned comparison
+  std::string grammar = "";
+  for (size_t i = 0; i < rows.size(); i++) {
+    grammar += rows[i];
+    if (i + 1 != rows.size()) grammar += "\n";
+  }
+  return grammar;
+}
+
+std::vector<Tool> ToTool(std::vector<nlohmann::ordered_json>& tool_defs) {
+  // Deserialize each JSON definition into a Tool via get<Tool>(), keeping the input order
+  std::vector<Tool> converted;
+  for (size_t i = 0; i < tool_defs.size(); ++i) {
+    converted.push_back(tool_defs[i].get<Tool>());
+  }
+  return converted;
+}
 
-  std::cout << "Registering execution provider library: " << library_path << std::endl;
+std::tuple<std::string, std::string, std::string> GetGuidance(
+    const std::string& response_format,
+    const std::string& filepath,
+    const std::string& tools_str,
+    std::vector<nlohmann::ordered_json>* tools,
+    bool text_output,
+    bool tool_output,
+    const std::string& tool_call_start,
+    const std::string& tool_call_end) {
+  std::string guidance_type = "";
+  std::string guidance_data = "";
+  std::vector<Tool> all_tools;
 
-  if (provider.compare("cuda") == 0) {
-    OgaRegisterExecutionProviderLibrary("CUDAExecutionProvider", library_path.c_str());
-    std::cout << "Successfully registered CUDAExecutionProvider from " << library_path << std::endl;
-  } else if (provider.compare("NvTensorRtRtx") == 0) {
-    OgaRegisterExecutionProviderLibrary("NvTensorRTRTXExecutionProvider", library_path.c_str());
-    std::cout << "Successfully registered NvTensorRTRTXExecutionProvider from " << library_path << std::endl;
+  // Get list of tools, trying in order: filepath, JSON-serialized string, in-memory list. Errors are thrown by value.
+  if (tool_output) {
+    if (std::filesystem::exists(filepath)) {
+      std::string json_str;
+      std::ifstream file(filepath, std::ios::binary);
+      if (file) {
+        std::ostringstream oss;
+        oss << file.rdbuf();
+        json_str = oss.str();
+      }
+      if (json_str.empty()) {
+        throw std::runtime_error("Error: JSON file is empty.");
+      }
+
+      nlohmann::ordered_json j = nlohmann::ordered_json::parse(json_str);
+      if (j.empty()) {
+        throw std::runtime_error("Error: Tools did not de-serialize correctly");
+      }
+
+      std::vector<nlohmann::ordered_json> defs;
+      defs.reserve(j.size());
+      for (const auto& item : j) {
+        defs.push_back(item);
+      }
+      all_tools = ToTool(defs);
+    } else if (!tools_str.empty()) {
+      nlohmann::ordered_json j = nlohmann::ordered_json::parse(tools_str);
+      if (j.empty()) {
+        throw std::runtime_error("Error: Tools did not de-serialize correctly");
+      }
+
+      std::vector<nlohmann::ordered_json> defs;
+      defs.reserve(j.size());
+      for (const auto& item : j) {
+        defs.push_back(item);
+      }
+      all_tools = ToTool(defs);
+    } else if (tools && !tools->empty()) {
+      try {
+        all_tools = ToTool(*tools);
+      } catch (...) {
+        throw std::runtime_error("Could not convert tools from vector<nlohmann::ordered_json> to vector<Tool>");
+      }
+    } else {
+      throw std::runtime_error("Error: Please provide the list of tools through a file, JSON-serialized string, or a list of tools");
+    }
+
+    if (all_tools.empty()) {
+      throw std::runtime_error("Error: Could not obtain a list of tools in memory");
+    }
+  }
+  // Build the guidance payload; both 'text' and 'lark_grammar' are emitted as a Lark grammar
+  if (response_format == "text" || response_format == "lark_grammar") {
+    if (response_format == "text") {
+      bool right_settings = text_output && !tool_output;
+      if (!right_settings) {
+        throw std::runtime_error("Error: A response format of 'text' requires text_output = true and tool_output = false");
+      }
+    }
+
+    guidance_type = "lark_grammar";
+    guidance_data = GetLarkGrammar(all_tools, text_output, tool_output, tool_call_start, tool_call_end);
+  } else if (response_format == "json_schema" || response_format == "json_object") {
+    bool right_settings = tool_output && !text_output;
+    if (!right_settings) {
+      throw std::runtime_error("Error: A response format of 'json_schema' or 'json_object' requires text_output = false and tool_output = true");
+    }
+
+    guidance_type = "json_schema";
+    guidance_data = GetJsonSchema(all_tools, tool_output);
   } else {
-    std::cerr << "Warning: Provider library registration not supported for provider '" << provider << "'" << std::endl;
-    std::cerr << "         Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries." << std::endl;
+    throw std::runtime_error("Error: Invalid response format provided");
   }
-}
\ No newline at end of file
+  // The resolved tools are also returned, serialized as a JSON string
+  nlohmann::ordered_json j = all_tools;
+  std::string s = j.dump();
+  return std::make_tuple(guidance_type, guidance_data, s);
+}
diff --git a/examples/c/src/common.h b/examples/c/src/common.h
index 3322a4de9e..b161acfaac 100644
--- a/examples/c/src/common.h
+++ b/examples/c/src/common.h
@@ -2,12 +2,20 @@
 // Licensed under the MIT License.
 
 #pragma once
+
 #include <chrono>
 #include <condition_variable>
-#include <mutex>
-#include <iostream>
 #include <fstream>
+#include <iostream>
 #include <iomanip>
+#include <optional>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include <CLI/CLI.hpp>
+#include <nlohmann/json.hpp>
+#include "onnxruntime_cxx_api.h"
 #include "ort_genai.h"
 
 using Clock = std::chrono::high_resolution_clock;
@@ -19,9 +27,7 @@ class Timing {
  public:
   Timing(const Timing&) = delete;
   Timing& operator=(const Timing&) = delete;
-
   Timing() = default;
-
   ~Timing() = default;
 
   void RecordStartTimestamp();
@@ -35,20 +41,367 @@ class Timing {
   TimePoint end_timestamp_;
 };
 
-bool FileExists(const char* path);
-
+/**
+ * @brief Trim user-provided filepath
+ *
+ * @param str Filepath to trim
+ *
+ * @return Trimmed filepath
+ */
 std::string Trim(const std::string& str);
 
-// Returns true if model_path, ep, and ep_library_path were able to be set from user cmd-line args.
-// Returns false if insufficient cmd-line arguments were passed.
-// Note: ep will be set to "follow_config" if user only gives model_path
-// Note: ep_library_path will be empty if not provided (or if nullptr is passed)
-bool parse_args(int /*argc*/, char** argv, std::string& model_path, std::string& ep, std::string* ep_library_path = nullptr);
+/**
+ * @brief A class for defining a tool in a JSON schema compatible way
+ */
+struct ToolSchema {
+  std::string description;             // Tool description (ToolsToSchemas copies the function's description here)
+  std::string type;                    // JSON schema type (ToolsToSchemas sets "object")
+  nlohmann::ordered_json properties;   // Property schemas (ToolsToSchemas fills "name" and, when present, "parameters")
+  std::vector<std::string> required;   // Names of the properties that must be present
+  bool additionalProperties = false;   // Initialized so a default-constructed schema never holds an indeterminate value
+};
+
+/**
+ * @brief Convert ToolSchema to JSON
+ *
+ * @param j JSON object
+ * @param tool ToolSchema object
+ *
+ * @return None
+ */
+void to_json(nlohmann::ordered_json& j, const ToolSchema& tool);
+
+/**
+ * @brief Convert JSON to ToolSchema
+ *
+ * @param j JSON object
+ * @param tool ToolSchema object
+ *
+ * @return None
+ */
+void from_json(const nlohmann::ordered_json& j, ToolSchema& tool);
+
+/**
+ * @brief A class for defining a JSON schema for guidance
+ */
+struct JsonSchema {
+  nlohmann::ordered_json xGuidance;                                // Whitespace/separator settings (filled by GetJsonSchema)
+  std::string type;                                                // JSON schema type (GetJsonSchema sets "array")
+  std::unordered_map<std::string, std::vector<ToolSchema>> items;  // Item schemas keyed by combinator (GetJsonSchema uses "anyOf")
+  int minItems = 0;                                                // Initialized so a default-constructed schema never holds an indeterminate value
+};
+
+/**
+ * @brief Convert JsonSchema to JSON
+ *
+ * @param j JSON object
+ * @param schema JsonSchema object
+ *
+ * @return None
+ */
+void to_json(nlohmann::ordered_json& j, const JsonSchema& schema);
+
+/**
+ * @brief Convert JSON to JsonSchema
+ *
+ * @param j JSON object
+ * @param schema JsonSchema object
+ *
+ * @return None
+ */
+void from_json(const nlohmann::ordered_json& j, JsonSchema& schema);
+
+/**
+ * @brief A class for defining a function in an OpenAI-compatible way
+ */
+struct FunctionDefinition {
+  std::string name;                   // Function name; ToolsToSchemas pins the schema's "name" property to this value
+  std::string description;            // Function description; ToolsToSchemas copies it into ToolSchema::description
+  nlohmann::ordered_json parameters;  // Parameter schema; ToolsToSchemas reads its "type", "properties" and "required" keys
+};
+
+/**
+ * @brief Convert FunctionDefinition to JSON
+ *
+ * @param j JSON object
+ * @param func FunctionDefinition object
+ *
+ * @return None
+ */
+void to_json(nlohmann::ordered_json& j, const FunctionDefinition& func);
+
+/**
+ * @brief Convert JSON to FunctionDefinition
+ *
+ * @param j JSON object
+ * @param func FunctionDefinition object
+ *
+ * @return None
+ */
+void from_json(const nlohmann::ordered_json& j, FunctionDefinition& func);
+
+/**
+ * @brief A class for defining a tool in an OpenAI-compatible way
+ */
+struct Tool {
+  std::string type;             // Tool kind; presumably "function" as in OpenAI tool definitions - TODO confirm
+  FunctionDefinition function;  // Name, description and parameter schema of the callable
+};
+
+/**
+ * @brief Convert Tool to JSON
+ *
+ * @param j JSON object
+ * @param t Tool object
+ *
+ * @return None
+ */
+void to_json(nlohmann::ordered_json& j, const Tool& t);
+
+/**
+ * @brief Convert JSON to Tool
+ *
+ * @param j JSON object
+ * @param t Tool object
+ *
+ * @return None
+ */
+void from_json(const nlohmann::ordered_json& j, Tool& t);
+
+/**
+ * @brief A class for holding parsed values for generator params
+ */
+struct GeneratorParamsArgs {
+  int batch_size = 1;  // Defaults to 1
+  int chunk_size = 0;  // Defaults to 0; NOTE(review): 0 presumably means chunking is disabled - confirm
+  std::optional<bool> do_sample;  // Holds no value until explicitly set
+  std::optional<int> min_length;  // Holds no value until explicitly set
+  std::optional<int> max_length;  // Holds no value until explicitly set
+  int num_beams = 1;  // Defaults to 1
+  int num_return_sequences = 1;  // Defaults to 1
+  std::optional<double> repetition_penalty;  // Holds no value until explicitly set
+  std::optional<double> temperature;  // Holds no value until explicitly set
+  std::optional<int> top_k;  // Holds no value until explicitly set
+  std::optional<double> top_p;  // Holds no value until explicitly set
+};
+
+/**
+ * @brief Convert GeneratorParamsArgs to JSON
+ *
+ * @param j JSON object
+ * @param a Args object
+ *
+ * @return None
+ */
+void to_json(nlohmann::ordered_json& j, const GeneratorParamsArgs& a);
+
+/**
+ * @brief Convert JSON to GeneratorParamsArgs
+ *
+ * @param j JSON object
+ * @param a Args object
+ *
+ * @return None
+ */
+void from_json(const nlohmann::ordered_json& j, GeneratorParamsArgs& a);
+
+/**
+ * @brief A class for holding parsed values for guidance
+ */
+struct GuidanceArgs {
+  std::string response_format = "";  // Requested format; model_chat.cpp only requests guidance when this is non-empty
+  std::string tools_file = "";       // Path to the tool definitions file (passed to GetGuidance as `filepath`)
+  bool text_output = false;          // Output can have text
+  bool tool_output = false;          // Output can have a tool call
+  std::string tool_call_start = "";  // String representation of the tool call starting token
+  std::string tool_call_end = "";    // String representation of the tool call ending token
+};
+
+/**
+ * @brief Parse command-line arguments from user
+ *
+ * @param argc Number of command-line arguments provided
+ * @param argv Contents of command-line arguments provided
+ * @param generator_params_args Struct to hold args for generation params
+ * @param guidance_args Struct to hold args for guidance
+ * @param model_path Path to model folder containing GenAI config
+ * @param ep Name of execution provider to set
+ * @param ep_path Path to execution provider to register
+ * @param system_prompt System prompt to use for the model
+ * @param user_prompt User prompt to use for the model
+ * @param verbose Use verbose logging
+ * @param debug Use debug mode to dump input and output tensors
+ * @param interactive Run in interactive mode
+ * @param rewind Rewind to the system prompt after each generation
+ * @param image_paths File paths to images
+ * @param audio_paths File paths to audios
+ *
+ * @return true if command-line arguments can be parsed, else false
+ */
+bool ParseArgs(int argc, char** argv, GeneratorParamsArgs& generator_params_args, GuidanceArgs& guidance_args, std::string& model_path, std::string& ep, std::string& ep_path, std::string& system_prompt, std::string& user_prompt, bool& verbose, bool& debug, bool& interactive, bool& rewind, std::vector<std::string>& image_paths, std::vector<std::string>& audio_paths);
+
+/**
+ * @brief Set log options inside ORT GenAI
+ *
+ * @param inputs Dump inputs to the model in the console
+ * @param outputs Dump outputs to the model in the console
+ *
+ * @return None
+ */
+void SetLogger(bool inputs = true, bool outputs = true);
+
+/**
+ * @brief Register execution provider if path is provided
+ *
+ * @param ep Name of execution provider
+ * @param ep_path Path to execution provider to register
+ *
+ * @return None
+ */
+void RegisterEP(const std::string& ep, const std::string& ep_path);
+
+/**
+ * @brief Get OgaConfig object and set EP-specific and search-specific options inside it
+ *
+ * @param path Path to model folder containing GenAI config
+ * @param ep Name of execution provider to set
+ * @param ep_options Map of EP-specific option names and their values
+ * @param search_options Map of search-specific option names and their values
+ *
+ * @return ORT GenAI config object with all options set
+ */
+std::unique_ptr<OgaConfig> GetConfig(const std::string& path, const std::string& ep, const std::unordered_map<std::string, std::string>& ep_options, GeneratorParamsArgs& search_options);
+
+/**
+ * @brief Set search options for a generator's params during decoding
+ *
+ * @param generatorParams Generator params object to set on
+ * @param args Arguments provided by user
+ * @param verbose Use verbose logging
+ *
+ * @return None
+ */
+void SetSearchOptions(OgaGeneratorParams& generatorParams, GeneratorParamsArgs& args, bool verbose);
+
+/**
+ * @brief Apply the chat template with various fallback options
+ *
+ * @param model_path Path to folder containing model
+ * @param tokenizer Tokenizer object to use
+ * @param messages String-encoded list of messages
+ * @param add_generation_prompt Add tokens to indicate the start of the AI's response
+ * @param tools String-encoded list of tools
+ *
+ * @return Prompt to encode
+ */
+std::string ApplyChatTemplate(const std::string& model_path, OgaTokenizer& tokenizer, const std::string& messages, bool add_generation_prompt, const std::string& tools = "");
+
+/**
+ * @brief Get prompt for 'user' role in chat template
+ *
+ * @param prompt Provided prompt
+ * @param interactive Interactive mode (otherwise uses either user-provided prompt or default)
+ *
+ * @return Prompt to use
+ */
+std::string GetUserPrompt(const std::string& prompt, bool interactive);
+
+/**
+ * @brief Get paths to media for user
+ *
+ * @param media_paths User-provided media paths
+ * @param interactive Interactive mode (otherwise uses either user-provided media paths or default)
+ * @param media_type The media type being obtained
+ *
+ * @return all media filepaths to read and encode
+ */
+std::vector<std::string> GetUserMediaPaths(const std::vector<std::string>& media_paths, bool interactive, const std::string& media_type);
+
+/**
+ * @brief Get images for user
+ *
+ * @param image_paths User-provided image paths
+ * @param interactive Interactive mode (otherwise uses either user-provided image paths or default)
+ *
+ * @return (all images, number of images) as a tuple
+ */
+std::tuple<std::unique_ptr<OgaImages>, int> GetUserImages(const std::vector<std::string>& image_paths, bool interactive);
+
+/**
+ * @brief Get audios for user
+ *
+ * @param audio_paths User-provided audio paths
+ * @param interactive Interactive mode (otherwise uses either user-provided audio paths or default)
+ *
+ * @return (all audios, number of audios) as a tuple
+ */
+std::tuple<std::unique_ptr<OgaAudios>, int> GetUserAudios(const std::vector<std::string>& audio_paths, bool interactive);
+
+/**
+ * @brief Get content for 'user' role in chat template
+ *
+ * @param model_type Model type inside ORT GenAI
+ * @param num_images Number of images
+ * @param num_audios Number of audios
+ * @param prompt User prompt
+ *
+ * @return JSON-encoded combined content for 'user' role
+ */
+nlohmann::ordered_json GetUserContent(const std::string& model_type, int num_images, int num_audios, const std::string& prompt);
+
+/**
+ * @brief Convert a list of tools to a list of tool schemas
+ *
+ * @param tools List of OpenAI-compatible tools
+ *
+ * @return List of JSON schema compatible tools
+ */
+std::vector<ToolSchema> ToolsToSchemas(std::vector<Tool>& tools);
+
+/**
+ * @brief Create a JSON schema from a list of tools
+ *
+ * @param tools List of OpenAI-compatible tools
+ * @param tool_output Output can have a tool call
+ *
+ * @return JSON schema as a JSON-compatible string
+ */
+std::string GetJsonSchema(std::vector<Tool>& tools, bool tool_output);
+
+/**
+ * @brief Create a LARK grammar from a list of tools
+ *
+ * @param tools List of OpenAI-compatible tools
+ * @param text_output Output can have text
+ * @param tool_output Output can have a tool call
+ * @param tool_call_start String representation of tool call starting token
+ * @param tool_call_end String representation of tool call ending token
+ *
+ * @return LARK grammar as a string
+ */
+std::string GetLarkGrammar(std::vector<Tool>& tools, bool text_output, bool tool_output, const std::string& tool_call_start, const std::string& tool_call_end);
 
-// Append provider / options to config.
-// This is a no-op if provider=="follow_config"
-void append_provider(OgaConfig& config, const std::string& provider);
+/**
+ * @brief Convert a JSON-deserialized object of tools to a list of Tool objects
+ *
+ * @param tool_defs JSON-deserialized object containing OpenAI-compatible tool definitions
+ *
+ * @return List of Tool objects
+ */
+std::vector<Tool> ToTool(std::vector<nlohmann::ordered_json>& tool_defs);
 
-// Register execution provider library if specified
-// This enables plug-in provider support for CUDA and NvTensorRT
-void register_provider_library(const std::string& provider, const std::string& library_path);
\ No newline at end of file
+/**
+ * @brief Create a grammar to use with LLGuidance
+ *
+ * @param response_format Type of format requested
+ * @param filepath Path to file containing OpenAI-compatible tool definitions
+ * @param tools_str JSON-serialized string containing OpenAI-compatible tool definitions
+ * @param tools List of OpenAI-compatible tools defined in memory
+ * @param text_output Output can have text
+ * @param tool_output Output can have a tool call
+ * @param tool_call_start String representation of tool call starting token
+ * @param tool_call_end String representation of tool call ending token
+ *
+ * @return (grammar type, grammar data, tools) as a tuple of strings
+ */
+std::tuple<std::string, std::string, std::string> GetGuidance(const std::string& response_format = "", const std::string& filepath = "", const std::string& tools_str = "", std::vector<nlohmann::ordered_json>* tools = nullptr, bool text_output = true, bool tool_output = false, const std::string& tool_call_start = "", const std::string& tool_call_end = "");
diff --git a/examples/c/src/model_chat.cpp b/examples/c/src/model_chat.cpp
index b4725d67e0..37a537fb40 100644
--- a/examples/c/src/model_chat.cpp
+++ b/examples/c/src/model_chat.cpp
@@ -1,18 +1,19 @@
+// -----------------------------------------------------------------------------------------------
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
-#include <iomanip>
-#include <string>
-#include <csignal>
-
-#include "ort_genai.h"
-#include "common.h"
-
+//
 // C++ API Example for Model Chat
 // This example demonstrates how to use the C++ API of the ONNX Runtime GenAI library
 // to perform model chat tasks (i.e. continuous decoding). It includes functionalities
 // to create a model, tokenizer, and generator, and to handle user input for generating
 // responses based on prompts.
+// -----------------------------------------------------------------------------------------------
+
+#include <csignal>
+#include <iomanip>
+#include <string>
+
+#include "common.h"
 
 OgaGenerator* g_generator = nullptr;
 
@@ -24,76 +25,139 @@ void TerminateGeneration(int signum) {
   g_generator->SetRuntimeOption("terminate_session", "1");
 }
 
-void CXX_API(const char* model_path, const char* execution_provider, const char* ep_library_path) {
-  // Register execution provider library if specified (for plug-in providers)
-  std::string provider(execution_provider);
-  std::string library_path(ep_library_path);
-  register_provider_library(provider, library_path);
-
-  std::cout << "Creating config..." << std::endl;
-  auto config = OgaConfig::Create(model_path);
-
-  append_provider(*config, provider);
-
-  std::cout << "Creating model..." << std::endl;
+void CXX_API(
+    GeneratorParamsArgs& generator_params_args,
+    GuidanceArgs& guidance_args,
+    const std::string& model_path,
+    const std::string& ep,
+    const std::string& ep_path,
+    const std::string& system_prompt,
+    const std::string& user_prompt,
+    bool verbose,
+    bool debug,
+    bool interactive,
+    bool rewind) {
+  if (debug) SetLogger();
+  RegisterEP(ep, ep_path);
+
+  if (verbose) std::cout << "Creating config..." << std::endl;
+  std::unordered_map<std::string, std::string> ep_options;
+  auto config = GetConfig(model_path, ep, ep_options, generator_params_args);
+
+  if (verbose) std::cout << "Creating model..." << std::endl;
   auto model = OgaModel::Create(*config);
 
-  std::cout << "Creating tokenizer..." << std::endl;
+  if (verbose) std::cout << "Creating tokenizer..." << std::endl;
   auto tokenizer = OgaTokenizer::Create(*model);
-  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+  auto stream = OgaTokenizerStream::Create(*tokenizer);
 
+  // Set search options for generator params
   auto params = OgaGeneratorParams::Create(*model);
-  params->SetSearchOption("max_length", 1024);
+  SetSearchOptions(*params, generator_params_args, verbose);
+
+  // Create system message
+  nlohmann::ordered_json message = nlohmann::ordered_json::array();
+  message.push_back({{"role", "system"}, {"content", system_prompt}});
+
+  // Get and set guidance info if requested
+  std::string guidance_type, guidance_data, tools;
+  if (!guidance_args.response_format.empty()) {
+    std::cout << "Make sure your tool call start id and tool call end id are marked as special in tokenizer.json" << std::endl;
+    std::tie(guidance_type, guidance_data, tools) = GetGuidance(
+        guidance_args.response_format,
+        guidance_args.tools_file,
+        "",       // tools_str
+        nullptr,  // tools
+        guidance_args.text_output,
+        guidance_args.tool_output,
+        guidance_args.tool_call_start,
+        guidance_args.tool_call_end);
+    message[0]["tools"] = tools;
+
+    params->SetGuidance(guidance_type.c_str(), guidance_data.c_str());
+    if (verbose) {
+      std::cout << std::endl;
+      std::cout << "Guidance type is: " << guidance_type << std::endl;
+      std::cout << "Guidance data is: \n"
+                << guidance_data << std::endl;
+      std::cout << std::endl;
+    }
+  }
 
+  // Create generator
   auto generator = OgaGenerator::Create(*model, *params);
   g_generator = generator.get();  // Store the current generator for termination
+  if (verbose) std::cout << "Generator created" << std::endl;
 
-  // Define System Prompt
-  std::string system_prompt = "You are a helpful AI assistant.";
+  // Apply chat template
+  std::string prompt;
+  try {
+    bool add_generation_prompt = false;
+    prompt = ApplyChatTemplate(model_path, *tokenizer, message.dump(), add_generation_prompt, tools);
+  } catch (...) {
+    prompt = system_prompt;
+  }
+  if (verbose) std::cout << "System prompt: " << prompt << "\n"
+                         << std::endl;
+
+  // Encode system prompt and append tokens to model
+  auto sequences = OgaSequences::Create();
+  tokenizer->Encode(prompt.c_str(), *sequences);
+  const int prompt_tokens_length = sequences->SequenceCount(0);
+  generator->AppendTokenSequences(*sequences);
 
+  // Keep asking for input prompts in a loop
   while (true) {
-    signal(SIGINT, TerminateGeneration);
+    // Get user prompt
     std::string text;
-    std::cout << "Prompt: (Use quit() to exit) Or (To terminate current output generation, press Ctrl+C)" << std::endl;
-    // Clear Any cin error flags because of SIGINT
-    std::cin.clear();
-    std::getline(std::cin, text);
-
-    if (text.empty()) {
-      std::cout << "Empty input. Please enter a valid prompt." << std::endl;
-      continue;  // Skip to the next iteration if input is empty
-    } else if (text == "quit()") {
-      break;  // Exit the loop
+
+    if (interactive) {
+      std::cout << "Prompt (Use quit() to exit):" << std::endl;
+      // Clear any cin error flags because of SIGINT
+      std::cin.clear();
+      std::getline(std::cin, text);
+
+      if (text.empty()) {
+        std::cout << "Empty input. Please enter a valid prompt." << std::endl;
+        continue;  // Skip to the next iteration if input is empty
+      } else if (text == "quit()") {
+        break;  // Exit the loop
+      }
+    } else {
+      text = user_prompt;
     }
 
-    const std::string messages = R"(
-      [
-        {
-          "role": "system",
-          "content": ")" + system_prompt +
-                                 R"("
-        },
-        {
-          "role": "user",
-          "content": ")" + text + R"("
-        }
-      ]
-    )";
-    system_prompt.clear();  // Clear the system prompt to avoid reusing it in the next iteration
-    std::string prompt = std::string(tokenizer->ApplyChatTemplate("", messages.c_str(), "", true));
+    signal(SIGINT, TerminateGeneration);
 
+    // Start timings
     bool is_first_token = true;
     Timing timing;
     timing.RecordStartTimestamp();
 
-    auto sequences = OgaSequences::Create();
-    tokenizer->Encode(prompt.c_str(), *sequences);
+    // Create user message
+    message = nlohmann::ordered_json::array();
+    message.push_back({{"role", "user"}, {"content", text}});
 
-    std::cout << "Generating response..." << std::endl;
-    generator->SetRuntimeOption("terminate_session", "0");
+    // Apply chat template
+    try {
+      bool add_generation_prompt = true;
+      prompt = ApplyChatTemplate(model_path, *tokenizer, message.dump(), add_generation_prompt);
+    } catch (...) {
+      prompt = text;
+    }
+    if (verbose) std::cout << "User prompt: " << prompt << "\n"
+                           << std::endl;
+
+    // Encode user prompt and append tokens to model
+    sequences = OgaSequences::Create();
+    tokenizer->Encode(prompt.c_str(), *sequences);
     generator->AppendTokenSequences(*sequences);
-    const auto current_token_count = generator->GetSequenceCount(0);
 
+    // Run generation loop
+    if (verbose) std::cout << "Running generation loop..." << std::endl;
+    std::cout << std::endl;
+    std::cout << "Output: ";
+    const auto current_token_count = generator->GetSequenceCount(0);
     try {
       while (!generator->IsDone()) {
         generator->GenerateNextToken();
@@ -104,40 +168,62 @@ void CXX_API(const char* model_path, const char* execution_provider, const char*
         }
 
         const auto new_token = generator->GetNextTokens()[0];
-        std::cout << tokenizer_stream->Decode(new_token) << std::flush;
+        std::cout << stream->Decode(new_token) << std::flush;
       }
     } catch (const std::exception& e) {
-      std::cout << "\n\033[31mTerminating generation: " << e.what() << "\033[0m" << std::endl;
+      std::cout << "\n"
+                << "Terminating generation: " << e.what() << std::endl;
       generator->RewindTo(current_token_count);  // Rewind to the last valid state
     }
-
     timing.RecordEndTimestamp();
-    const int prompt_tokens_length = sequences->SequenceCount(0);
+
     const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
     timing.Log(prompt_tokens_length, new_tokens_length);
 
-    for (int i = 0; i < 3; ++i)
-      std::cout << std::endl;
+    std::cout << "\n\n"
+              << std::endl;
+    if (!interactive) break;
+
+    // Rewind the generator to the system prompt. This will erase all the chat history with the model.
+    if (rewind) generator->RewindTo(prompt_tokens_length);
   }
 }
 
 int main(int argc, char** argv) {
-  std::string model_path, ep, ep_library_path;
-  if (!parse_args(argc, argv, model_path, ep, &ep_library_path)) {
+  // Get command-line args
+  GeneratorParamsArgs generator_params_args;
+  GuidanceArgs guidance_args;
+  std::string model_path, ep = "follow_config", ep_path = "", system_prompt = "You are a helpful AI assistant.", user_prompt = "What color is the sky?";
+  bool verbose = false, debug = false, interactive = true, rewind = false;
+  std::vector<std::string> image_paths;
+  std::vector<std::string> audio_paths;
+  if (!ParseArgs(argc, argv, generator_params_args, guidance_args, model_path, ep, ep_path, system_prompt, user_prompt, verbose, debug, interactive, rewind, image_paths, audio_paths)) {
     return -1;
   }
 
   // Responsible for cleaning up the library during shutdown
   OgaHandle handle;
 
-  std::cout << "---------------------------" << std::endl;
+  std::cout << "----------------------------" << std::endl;
   std::cout << "Hello, ORT GenAI Model Chat!" << std::endl;
-  std::cout << "---------------------------" << std::endl;
+  std::cout << "----------------------------" << std::endl;
+
+  std::cout << "Model path: " << model_path << std::endl;
+  std::cout << "Execution provider: " << ep << std::endl;
+  if (!ep_path.empty()) std::cout << "Execution provider path: " << ep_path << std::endl;
+  std::cout << "System prompt: " << system_prompt << std::endl;
+  if (!interactive) std::cout << "User prompt: " << user_prompt << std::endl;
+  std::cout << "Verbose: " << verbose << std::endl;
+  std::cout << "Debug: " << debug << std::endl;
+  std::cout << "Interactive: " << interactive << std::endl;
+  std::cout << "Rewind: " << rewind << std::endl;
+  std::cout << "--------------------------" << std::endl;
+  std::cout << std::endl;
 
   try {
-    CXX_API(model_path.c_str(), ep.c_str(), ep_library_path.c_str());
+    CXX_API(generator_params_args, guidance_args, model_path, ep, ep_path, system_prompt, user_prompt, verbose, debug, interactive, rewind);
   } catch (const std::exception& e) {
-    std::cerr << "\033[31mError: " << e.what() << "\033[0m" << std::endl;
+    std::cerr << "Error: " << e.what() << std::endl;
     return -1;
   }
 
diff --git a/examples/c/src/model_mm.cpp b/examples/c/src/model_mm.cpp
new file mode 100644
index 0000000000..e01393c41a
--- /dev/null
+++ b/examples/c/src/model_mm.cpp
@@ -0,0 +1,222 @@
+// -----------------------------------------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// C++ API Example for Multi-Modal Models
+// This example demonstrates how to use the C++ API of the ONNX Runtime GenAI library
+// to generate responses from text prompts with optional image and audio inputs. It shows how to create
+// a model, tokenizer, multi-modal processor, and generator, and how to handle user input in a loop.
+// -----------------------------------------------------------------------------------------------
+
+#include <csignal>
+#include <iomanip>
+#include <string>
+
+#include "common.h"
+
+OgaGenerator* g_generator = nullptr;  // Generator of the turn in progress; set in CXX_API and reset to nullptr after each turn
+
+void TerminateGeneration(int signum) {  // Installed as the SIGINT handler by CXX_API; signum is not used
+  if (g_generator == nullptr) {  // No generator is registered right now, so there is nothing to flag
+    return;
+  }
+  g_generator->SetRuntimeOption("terminate_session", "1");  // Sets the "terminate_session" runtime option on the registered generator
+}
+
+void CXX_API(  // Loads the model once, then loops: read images/audios/prompt, generate, stream tokens to stdout
+    GeneratorParamsArgs& generator_params_args,  // Forwarded to GetConfig() and SetSearchOptions()
+    GuidanceArgs& guidance_args,  // Guidance is applied only when response_format is non-empty
+    const std::string& model_path,  // Passed to GetConfig() and ApplyChatTemplate()
+    const std::string& ep,  // Passed to RegisterEP() and GetConfig()
+    const std::string& ep_path,  // Passed to RegisterEP()
+    const std::vector<std::string>& image_paths,  // Passed to GetUserImages() together with `interactive`
+    const std::vector<std::string>& audio_paths,  // Passed to GetUserAudios() together with `interactive`
+    const std::string& system_prompt,  // Content of the system message kept at input_list[0]
+    const std::string& user_prompt,  // Passed to GetUserPrompt() together with `interactive`
+    bool verbose,  // Gates the progress/diagnostic messages written to std::cout
+    bool debug,  // When true, SetLogger() is called before anything else
+    bool interactive) {  // When false, the loop below exits after a single iteration
+  if (debug) SetLogger();
+  RegisterEP(ep, ep_path);
+
+  if (verbose) std::cout << "Creating config..." << std::endl;
+  std::unordered_map<std::string, std::string> ep_options;  // Never populated here; handed to GetConfig() empty
+  auto config = GetConfig(model_path, ep, ep_options, generator_params_args);
+
+  if (verbose) std::cout << "Creating model..." << std::endl;
+  auto model = OgaModel::Create(*config);
+
+  if (verbose) std::cout << "Creating tokenizer..." << std::endl;
+  auto tokenizer = OgaTokenizer::Create(*model);
+  auto stream = OgaTokenizerStream::Create(*tokenizer);  // Used below to decode each generated token for printing
+
+  if (verbose) std::cout << "Creating processor..." << std::endl;
+  auto processor = OgaMultiModalProcessor::Create(*model);
+
+  // Create running list of messages (the system message stays at index 0 for every turn)
+  std::vector<nlohmann::ordered_json> input_list;
+  nlohmann::ordered_json system_message = nlohmann::ordered_json{{"role", "system"}, {"content", system_prompt}};
+  input_list.push_back(system_message);
+
+  // Get and set guidance info if requested
+  std::string guidance_type, guidance_data, tools;  // All three stay empty when no response_format was given
+  if (!guidance_args.response_format.empty()) {
+    std::cout << "Make sure your tool call start id and tool call end id are marked as special in tokenizer.json" << std::endl;
+    std::tie(guidance_type, guidance_data, tools) = GetGuidance(
+        guidance_args.response_format,
+        guidance_args.tools_file,
+        "",       // tools_str
+        nullptr,  // tools
+        guidance_args.text_output,
+        guidance_args.tool_output,
+        guidance_args.tool_call_start,
+        guidance_args.tool_call_end);
+
+    input_list[0]["tools"] = tools;  // Attach the tools string to the system message
+  }
+
+  // Keep asking for input prompts in a loop
+  while (true) {
+    // Get images
+    std::unique_ptr<OgaImages> images;
+    int num_images;
+    std::tie(images, num_images) = GetUserImages(image_paths, interactive);
+
+    // Get audios
+    std::unique_ptr<OgaAudios> audios;
+    int num_audios;
+    std::tie(audios, num_audios) = GetUserAudios(audio_paths, interactive);
+
+    // Get user prompt
+    std::string text = GetUserPrompt(user_prompt, interactive);
+    signal(SIGINT, TerminateGeneration);  // (Re)installed every iteration, after the prompt has been read
+    if (text == "quit()") {
+      break;  // Exit the loop
+    }
+
+    // Construct user content based on inputs
+    auto type = model->GetType();
+    nlohmann::ordered_json user_content = GetUserContent(std::string(type), num_images, num_audios, text);
+
+    // Add user message to list of messages
+    nlohmann::ordered_json user_message = nlohmann::ordered_json{{"role", "user"}, {"content", user_content}};
+    input_list.push_back(user_message);
+    nlohmann::ordered_json j = input_list;  // Wrap the vector in a single JSON value so it can be dumped as one string
+    std::string messages = j.dump();
+
+    // Start timings (generator creation, chat templating and input processing below all run after this start mark)
+    bool is_first_token = true;
+    Timing timing;
+    timing.RecordStartTimestamp();
+
+    // Initialize generator params
+    auto params = OgaGeneratorParams::Create(*model);
+    SetSearchOptions(*params, generator_params_args, verbose);
+
+    // Initialize guidance info
+    if (!guidance_args.response_format.empty()) {
+      params->SetGuidance(guidance_type.c_str(), guidance_data.c_str());
+      if (verbose) {
+        std::cout << std::endl;
+        std::cout << "Guidance type is: " << guidance_type << std::endl;
+        std::cout << "Guidance data is: \n"
+                  << guidance_data << std::endl;
+        std::cout << std::endl;
+      }
+    }
+
+    // Create generator (a fresh one per loop iteration)
+    auto generator = OgaGenerator::Create(*model, *params);
+    g_generator = generator.get();  // Store the current generator for termination
+    if (verbose) std::cout << "Generator created" << std::endl;
+
+    // Apply chat template
+    std::string prompt;
+    try {
+      bool add_generation_prompt = true;
+      prompt = ApplyChatTemplate(model_path, *tokenizer, messages, add_generation_prompt, tools);
+    } catch (...) {
+      prompt = text;  // Any exception from templating is swallowed; fall back to the raw user text
+    }
+    if (verbose) std::cout << "Prompt: " << prompt << "\n"
+                           << std::endl;
+
+    // Encode combined system + user prompt and append inputs to model
+    auto input_tensors = processor->ProcessImagesAndAudios(prompt.c_str(), images.get(), audios.get());
+    generator->SetInputs(*input_tensors);
+    const int prompt_tokens_length = generator->GetSequenceCount(0);  // Captured before generation; used below as the prompt token count
+
+    // Run generation loop
+    if (verbose) std::cout << "Running generation loop..." << std::endl;
+    std::cout << std::endl;
+    std::cout << "Output: ";
+    try {
+      while (!generator->IsDone()) {
+        generator->GenerateNextToken();
+
+        if (is_first_token) {  // Mark the first-token timestamp exactly once per turn
+          timing.RecordFirstTokenTimestamp();
+          is_first_token = false;
+        }
+
+        const auto new_token = generator->GetNextTokens()[0];
+        std::cout << stream->Decode(new_token) << std::flush;  // Print each decoded token as soon as it is produced
+      }
+    } catch (const std::exception& e) {
+      std::cout << "\n"
+                << "Terminating generation: " << e.what() << std::endl;  // Report and continue to the timing summary
+    }
+    timing.RecordEndTimestamp();
+
+    // Clear the generator after use
+    g_generator = nullptr;
+
+    // Remove user message from list of messages (only the system message carries over to the next turn)
+    input_list.pop_back();
+
+    const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
+    timing.Log(prompt_tokens_length, new_tokens_length);
+
+    std::cout << "\n\n\n";
+    if (!interactive) break;  // Non-interactive mode handles a single prompt and returns
+  }
+}
+
+int main(int argc, char** argv) {  // Entry point: parse arguments, print the settings banner, run CXX_API; returns 0 on success, -1 on failure
+  // Get command-line args (the initializers below are the values in place before ParseArgs runs)
+  GeneratorParamsArgs generator_params_args;
+  GuidanceArgs guidance_args;
+  std::string model_path, ep = "follow_config", ep_path = "", system_prompt = "You are a helpful AI assistant.", user_prompt = "What color is the sky?";
+  bool verbose = false, debug = false, interactive = true, rewind = true;  // NOTE(review): rewind is handed to ParseArgs but never passed to CXX_API here
+  std::vector<std::string> image_paths;
+  std::vector<std::string> audio_paths;
+  if (!ParseArgs(argc, argv, generator_params_args, guidance_args, model_path, ep, ep_path, system_prompt, user_prompt, verbose, debug, interactive, rewind, image_paths, audio_paths)) {
+    return -1;  // ParseArgs returned false
+  }
+
+  // Responsible for cleaning up the library during shutdown
+  OgaHandle handle;
+
+  std::cout << "--------------------------" << std::endl;
+  std::cout << "Hello, ORT GenAI Model-MM!" << std::endl;
+  std::cout << "--------------------------" << std::endl;
+
+  std::cout << "Model path: " << model_path << std::endl;
+  std::cout << "Execution provider: " << ep << std::endl;
+  if (!ep_path.empty()) std::cout << "Execution provider path: " << ep_path << std::endl;
+  std::cout << "System prompt: " << system_prompt << std::endl;
+  if (!interactive) std::cout << "User prompt: " << user_prompt << std::endl;  // Printed only in non-interactive mode
+  std::cout << "Verbose: " << verbose << std::endl;
+  std::cout << "Interactive: " << interactive << std::endl;
+  std::cout << "--------------------------" << std::endl;
+  std::cout << std::endl;
+
+  try {
+    CXX_API(generator_params_args, guidance_args, model_path, ep, ep_path, image_paths, audio_paths, system_prompt, user_prompt, verbose, debug, interactive);
+  } catch (const std::exception& e) {
+    std::cerr << "Error: " << e.what() << std::endl;  // Any std::exception escaping CXX_API is reported on stderr
+    return -1;
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/examples/c/src/model_qa.cpp b/examples/c/src/model_qa.cpp
index 22a459f02e..25b39dfce7 100644
--- a/examples/c/src/model_qa.cpp
+++ b/examples/c/src/model_qa.cpp
@@ -1,18 +1,19 @@
+// -----------------------------------------------------------------------------------------------
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+//
+// C++ API Example for Model Question-Answering
+// This example demonstrates how to use the C++ API of the ONNX Runtime GenAI library
+// to perform model question-answering tasks. It includes functionalities to create a model,
+// tokenizer, and generator, and to handle user input for generating responses based on prompts.
+// -----------------------------------------------------------------------------------------------
 
+#include <csignal>
 #include <iomanip>
 #include <string>
-#include <csignal>
 
-#include "ort_genai.h"
 #include "common.h"
 
-// C++ API Example for Model Question-Answering
-// This example demonstrates how to use the C++ API of the ONNX Runtime GenAI library
-// to perform model question-answering tasks. It includes functionalities to create a model,
-// tokenizer, and generator, and to handle user input for generating responses based on prompts.
-
 OgaGenerator* g_generator = nullptr;
 
 void TerminateGeneration(int signum) {
@@ -22,69 +23,114 @@ void TerminateGeneration(int signum) {
   g_generator->SetRuntimeOption("terminate_session", "1");
 }
 
-void CXX_API(const char* model_path, const char* execution_provider, const char* ep_library_path) {
-  // Register execution provider library if specified (for plug-in providers)
-  std::string provider(execution_provider);
-  std::string library_path(ep_library_path);
-  register_provider_library(provider, library_path);
-
-  std::cout << "Creating config..." << std::endl;
-  auto config = OgaConfig::Create(model_path);
-
-  append_provider(*config, provider);
-
-  std::cout << "Creating model..." << std::endl;
+void CXX_API(
+    GeneratorParamsArgs& generator_params_args,
+    GuidanceArgs& guidance_args,
+    const std::string& model_path,
+    const std::string& ep,
+    const std::string& ep_path,
+    const std::string& system_prompt,
+    const std::string& user_prompt,
+    bool verbose,
+    bool debug,
+    bool interactive) {
+  if (debug) SetLogger();
+  RegisterEP(ep, ep_path);
+
+  if (verbose) std::cout << "Creating config..." << std::endl;
+  std::unordered_map<std::string, std::string> ep_options;
+  auto config = GetConfig(model_path, ep, ep_options, generator_params_args);
+
+  if (verbose) std::cout << "Creating model..." << std::endl;
   auto model = OgaModel::Create(*config);
 
-  std::cout << "Creating tokenizer..." << std::endl;
+  if (verbose) std::cout << "Creating tokenizer..." << std::endl;
   auto tokenizer = OgaTokenizer::Create(*model);
-  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+  auto stream = OgaTokenizerStream::Create(*tokenizer);
+
+  // Create running list of messages
+  std::vector<nlohmann::ordered_json> input_list;
+  nlohmann::ordered_json system_message = nlohmann::ordered_json{{"role", "system"}, {"content", system_prompt}};
+  input_list.push_back(system_message);
+
+  // Get and set guidance info if requested
+  std::string guidance_type, guidance_data, tools;
+  if (!guidance_args.response_format.empty()) {
+    std::cout << "Make sure your tool call start id and tool call end id are marked as special in tokenizer.json" << std::endl;
+    std::tie(guidance_type, guidance_data, tools) = GetGuidance(
+        guidance_args.response_format,
+        guidance_args.tools_file,
+        "",       // tools_str
+        nullptr,  // tools
+        guidance_args.text_output,
+        guidance_args.tool_output,
+        guidance_args.tool_call_start,
+        guidance_args.tool_call_end);
+
+    input_list[0]["tools"] = tools;
+  }
 
+  // Keep asking for input prompts in a loop
   while (true) {
-    std::string text;
-    std::cout << "Prompt: (Use quit() to exit) Or (To terminate current output generation, press Ctrl+C)" << std::endl;
-    // Clear Any cin error flags because of SIGINT
-    std::cin.clear();
-    std::getline(std::cin, text);
-
-    if (text.empty()) {
-      std::cout << "Empty input. Please enter a valid prompt." << std::endl;
-      continue;  // Skip to the next iteration if input is empty
-    } else if (text == "quit()") {
+    // Get user prompt
+    std::string text = GetUserPrompt(user_prompt, interactive);
+    signal(SIGINT, TerminateGeneration);
+    if (text == "quit()") {
       break;  // Exit the loop
     }
 
-    signal(SIGINT, TerminateGeneration);
-
-    const std::string messages = R"(
-      [
-        {
-          "role": "system",
-          "content": "You are a helpful AI assistant."
-        },
-        {
-          "role": "user",
-          "content": ")" + text + R"("
-        }
-      ]
-    )";
-    const std::string prompt = std::string(tokenizer->ApplyChatTemplate("", messages.c_str(), "", true));
+    // Add user message to list of messages
+    nlohmann::ordered_json user_message = nlohmann::ordered_json{{"role", "user"}, {"content", text}};
+    input_list.push_back(user_message);
+    nlohmann::ordered_json j = input_list;
+    std::string messages = j.dump();
 
+    // Start timings
     bool is_first_token = true;
     Timing timing;
     timing.RecordStartTimestamp();
 
-    auto sequences = OgaSequences::Create();
-    tokenizer->Encode(prompt.c_str(), *sequences);
-
-    std::cout << "Generating response..." << std::endl;
-
+    // Initialize generator params
     auto params = OgaGeneratorParams::Create(*model);
-    params->SetSearchOption("max_length", 1024);
+    SetSearchOptions(*params, generator_params_args, verbose);
+
+    // Initialize guidance info
+    if (!guidance_args.response_format.empty()) {
+      params->SetGuidance(guidance_type.c_str(), guidance_data.c_str());
+      if (verbose) {
+        std::cout << std::endl;
+        std::cout << "Guidance type is: " << guidance_type << std::endl;
+        std::cout << "Guidance data is: \n"
+                  << guidance_data << std::endl;
+        std::cout << std::endl;
+      }
+    }
+
+    // Create generator
     auto generator = OgaGenerator::Create(*model, *params);
     g_generator = generator.get();  // Store the current generator for termination
+    if (verbose) std::cout << "Generator created" << std::endl;
+
+    // Apply chat template
+    std::string prompt;
+    try {
+      bool add_generation_prompt = true;
+      prompt = ApplyChatTemplate(model_path, *tokenizer, messages, add_generation_prompt, tools);
+    } catch (...) {
+      prompt = text;
+    }
+    if (verbose) std::cout << "Prompt: " << prompt << "\n"
+                           << std::endl;
+
+    // Encode combined system + user prompt and append tokens to model
+    auto sequences = OgaSequences::Create();
+    tokenizer->Encode(prompt.c_str(), *sequences);
     generator->AppendTokenSequences(*sequences);
 
+    // Run generation loop
+    if (verbose) std::cout << "Running generation loop..." << std::endl;
+    std::cout << std::endl;
+    std::cout << "Output: ";
     try {
       while (!generator->IsDone()) {
         generator->GenerateNextToken();
@@ -95,40 +141,60 @@ void CXX_API(const char* model_path, const char* execution_provider, const char*
         }
 
         const auto new_token = generator->GetNextTokens()[0];
-        std::cout << tokenizer_stream->Decode(new_token) << std::flush;
+        std::cout << stream->Decode(new_token) << std::flush;
       }
     } catch (const std::exception& e) {
-      std::cout << "\n\033[31mTerminating generation: " << e.what() << "\033[0m" << std::endl;
+      std::cout << "\n"
+                << "Terminating generation: " << e.what() << std::endl;
     }
-
     timing.RecordEndTimestamp();
+
+    // Clear the generator after use
+    g_generator = nullptr;
+
+    // Remove user message from list of messages
+    input_list.pop_back();
+
     const int prompt_tokens_length = sequences->SequenceCount(0);
     const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
     timing.Log(prompt_tokens_length, new_tokens_length);
 
-    for (int i = 0; i < 3; ++i)
-      std::cout << std::endl;
-
-    g_generator = nullptr;  // Clear the generator after use
+    std::cout << "\n\n\n";
+    if (!interactive) break;
   }
 }
 
 int main(int argc, char** argv) {
-  std::string model_path, ep, ep_library_path;
-  if (!parse_args(argc, argv, model_path, ep, &ep_library_path)) {
+  // Get command-line args
+  GeneratorParamsArgs generator_params_args;
+  GuidanceArgs guidance_args;
+  std::string model_path, ep = "follow_config", ep_path = "", system_prompt = "You are a helpful AI assistant.", user_prompt = "What color is the sky?";
+  bool verbose = false, debug = false, interactive = true, rewind = true;
+  std::vector<std::string> image_paths;
+  std::vector<std::string> audio_paths;
+  if (!ParseArgs(argc, argv, generator_params_args, guidance_args, model_path, ep, ep_path, system_prompt, user_prompt, verbose, debug, interactive, rewind, image_paths, audio_paths)) {
     return -1;
   }
 
   // Responsible for cleaning up the library during shutdown
   OgaHandle handle;
 
-  std::cout << "-------------------------" << std::endl;
+  std::cout << "--------------------------" << std::endl;
   std::cout << "Hello, ORT GenAI Model-QA!" << std::endl;
-  std::cout << "-------------------------" << std::endl;
+  std::cout << "--------------------------" << std::endl;
+
+  std::cout << "Model path: " << model_path << std::endl;
+  std::cout << "Execution provider: " << ep << std::endl;
+  if (!ep_path.empty()) std::cout << "Execution provider path: " << ep_path << std::endl;
+  std::cout << "System prompt: " << system_prompt << std::endl;
+  if (!interactive) std::cout << "User prompt: " << user_prompt << std::endl;
+  std::cout << "Verbose: " << verbose << std::endl;
+  std::cout << "Interactive: " << interactive << std::endl;
+  std::cout << "--------------------------" << std::endl;
+  std::cout << std::endl;
 
-  std::cout << "C++ API" << std::endl;
   try {
-    CXX_API(model_path.c_str(), ep.c_str(), ep_library_path.c_str());
+    CXX_API(generator_params_args, guidance_args, model_path, ep, ep_path, system_prompt, user_prompt, verbose, debug, interactive);
   } catch (const std::exception& e) {
     std::cerr << "Error: " << e.what() << std::endl;
     return -1;
diff --git a/examples/c/src/model_vision.cpp b/examples/c/src/model_vision.cpp
deleted file mode 100644
index a4afe40bc0..0000000000
--- a/examples/c/src/model_vision.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include <cstring>
-#include <memory>
-#include "common.h"
-#include "ort_genai.h"
-
-// C++ API Example
-
-void CXX_API(const char* model_path, const char* execution_provider, const char* ep_library_path) {
-  // Register execution provider library if specified (for plug-in providers)
-  std::string provider(execution_provider);
-  std::string library_path(ep_library_path);
-  register_provider_library(provider, library_path);
-
-  std::cout << "Creating config..." << std::endl;
-  auto config = OgaConfig::Create(model_path);
-
-  append_provider(*config, provider);
-
-  std::cout << "Creating model..." << std::endl;
-  auto model = OgaModel::Create(*config);
-
-  std::cout << "Creating multimodal processor..." << std::endl;
-  auto processor = OgaMultiModalProcessor::Create(*model);
-
-  auto tokenizer = OgaTokenizer::Create(*model);
-
-  auto stream = OgaTokenizerStream::Create(*processor);
-
-  while (true) {
-    std::string image_paths_str;
-    std::cout << "Image Path (comma separated; leave empty if no image):" << std::endl;
-    std::getline(std::cin, image_paths_str);
-    std::unique_ptr<OgaImages> images;
-    std::vector<std::string> image_paths;
-    for (size_t start = 0, end = 0; end < image_paths_str.size(); start = end + 1) {
-      end = image_paths_str.find(',', start);
-      image_paths.push_back(Trim(image_paths_str.substr(start, end - start)));
-    }
-    if (image_paths.empty()) {
-      std::cout << "No image provided" << std::endl;
-    } else {
-      std::cout << "Loading images..." << std::endl;
-      for (const auto& image_path : image_paths) {
-        if (!FileExists(image_path.c_str())) {
-          throw std::runtime_error(std::string("Image file not found: ") + image_path);
-        }
-      }
-      std::vector<const char*> image_paths_c;
-      for (const auto& image_path : image_paths) image_paths_c.push_back(image_path.c_str());
-      images = OgaImages::Load(image_paths_c);
-    }
-
-    std::string text;
-    std::cout << "Prompt: " << std::endl;
-    std::getline(std::cin, text);
-
-    // Construct messages string with special tokens for ApplyChatTemplate.
-
-    // Note: The Phi-3 Vision chat template expects content to be string, whereas in
-    // Gemma-3-like models, content type is supported, so we handle these differently.
-
-    std::string messages;
-    if (std::string(model->GetType()) == "phi3v") {
-      // Phi-3 Vision-style multimodal usage with image tags
-      std::string content;
-      for (size_t i = 0; i < image_paths.size(); ++i)
-        content += "<|image_" + std::to_string(i + 1) + "|>\\n";
-      content += text;
-      messages = R"([{"role": "user", "content": ")" + content + R"("}])";
-    } else {
-      // Gemma-style multimodal usage with content type
-      const std::string image_content = R"({ "type": "image" })";
-      std::string content = "[";
-      for (size_t i = 0; i < image_paths.size(); ++i) {
-        content += image_content + ", ";
-      }
-      const std::string text_content = R"({ "type": "text", "text": ")";
-      content += text_content + text + R"(" }])";
-      messages = R"([{"role": "user", "content": )" + content + R"(}])";
-    }
-
-    std::string prompt = std::string(tokenizer->ApplyChatTemplate("", messages.c_str(), "", true));
-
-    std::cout << "Processing images and prompt..." << std::endl;
-    auto input_tensors = processor->ProcessImages(prompt.c_str(), images.get());
-
-    std::cout << "Generating response..." << std::endl;
-    auto params = OgaGeneratorParams::Create(*model);
-    params->SetSearchOption("max_length", 7680);
-
-    auto generator = OgaGenerator::Create(*model, *params);
-    generator->SetInputs(*input_tensors);
-
-    while (!generator->IsDone()) {
-      generator->GenerateNextToken();
-      const auto new_token = generator->GetNextTokens()[0];
-      std::cout << stream->Decode(new_token) << std::flush;
-    }
-
-    for (int i = 0; i < 3; ++i)
-      std::cout << std::endl;
-  }
-}
-
-int main(int argc, char** argv) {
-  std::string model_path, ep, ep_library_path;
-  if (!parse_args(argc, argv, model_path, ep, &ep_library_path)) {
-    return -1;
-  }
-
-  std::cout << "-----------------------------" << std::endl;
-  std::cout << "Hello, ORT GenAI Model-Vision" << std::endl;
-  std::cout << "-----------------------------" << std::endl;
-
-  std::cout << "C++ API" << std::endl;
-  CXX_API(model_path.c_str(), ep.c_str(), ep_library_path.c_str());
-
-  return 0;
-}
\ No newline at end of file
diff --git a/examples/c/src/phi4-mm.cpp b/examples/c/src/phi4-mm.cpp
deleted file mode 100644
index 8c172fe97e..0000000000
--- a/examples/c/src/phi4-mm.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include <iostream>
-#include <string>
-#include <fstream>
-#include <memory>
-#include "common.h"
-#include "ort_genai.h"
-
-// C++ API Example
-
-void CXX_API(const char* model_path, const char* execution_provider, const char* ep_library_path) {
-  // Register execution provider library if specified (for plug-in providers)
-  std::string provider(execution_provider);
-  std::string library_path(ep_library_path);
-  register_provider_library(provider, library_path);
-
-  std::cout << "Creating config..." << std::endl;
-  auto config = OgaConfig::Create(model_path);
-
-  append_provider(*config, provider);
-
-  std::cout << "Creating model..." << std::endl;
-  auto model = OgaModel::Create(*config);
-
-  std::cout << "Creating multimodal processor..." << std::endl;
-  auto processor = OgaMultiModalProcessor::Create(*model);
-
-  auto stream = OgaTokenizerStream::Create(*processor);
-  auto tokenizer = OgaTokenizer::Create(*model);
-
-  while (true) {
-    // Get images
-    std::string image_paths_str;
-    std::cout << "Image Path (comma separated; leave empty if no image):" << std::endl;
-    std::getline(std::cin, image_paths_str);
-    std::unique_ptr<OgaImages> images;
-    std::vector<std::string> image_paths;
-    for (size_t start = 0, end = 0; end < image_paths_str.size(); start = end + 1) {
-      end = image_paths_str.find(',', start);
-      image_paths.push_back(Trim(image_paths_str.substr(start, end - start)));
-    }
-    if (image_paths.empty()) {
-      std::cout << "No image provided" << std::endl;
-    } else {
-      std::cout << "Loading images..." << std::endl;
-      for (const auto& image_path : image_paths) {
-        if (!FileExists(image_path.c_str())) {
-          throw std::runtime_error(std::string("Image file not found: ") + image_path);
-        }
-      }
-      std::vector<const char*> image_paths_c;
-      for (const auto& image_path : image_paths) image_paths_c.push_back(image_path.c_str());
-      images = OgaImages::Load(image_paths_c);
-    }
-
-    // Get audios
-    std::string audio_paths_str;
-    std::cout << "Audio Path (comma separated; leave empty if no audio):" << std::endl;
-    std::getline(std::cin, audio_paths_str);
-    std::unique_ptr<OgaAudios> audios;
-    std::vector<std::string> audio_paths;
-    for (size_t start = 0, end = 0; end < audio_paths_str.size(); start = end + 1) {
-      end = audio_paths_str.find(',', start);
-      audio_paths.push_back(Trim(audio_paths_str.substr(start, end - start)));
-    }
-    if (audio_paths.empty()) {
-      std::cout << "No audio provided" << std::endl;
-    } else {
-      std::cout << "Loading audios..." << std::endl;
-      for (const auto& audio_path : audio_paths) {
-        if (!FileExists(audio_path.c_str())) {
-          throw std::runtime_error(std::string("Audio file not found: ") + audio_path);
-        }
-      }
-      std::vector<const char*> audio_paths_c;
-      for (const auto& audio_path : audio_paths) audio_paths_c.push_back(audio_path.c_str());
-      audios = OgaAudios::Load(audio_paths_c);
-    }
-
-    std::string text;
-    std::cout << "Prompt: " << std::endl;
-    std::getline(std::cin, text);
-
-    // Construct messages string with special tokens for ApplyChatTemplate
-    std::string content;
-    for (size_t i = 0; i < image_paths.size(); ++i)
-      content += "<|image_" + std::to_string(i + 1) + "|>\\n";
-    for (size_t i = 0; i < audio_paths.size(); ++i)
-      content += "<|audio_" + std::to_string(i + 1) + "|>\\n";
-    content += text;
-
-    const std::string messages = R"([{"role": "user", "content": ")" + content + R"("}])";
-
-    std::string prompt = std::string(tokenizer->ApplyChatTemplate("", messages.c_str(), "", true));
-
-    std::cout << "Processing images, audios, and prompt..." << std::endl;
-    auto input_tensors = processor->ProcessImagesAndAudios(prompt.c_str(), images.get(), audios.get());
-
-    std::cout << "Generating response..." << std::endl;
-    auto params = OgaGeneratorParams::Create(*model);
-    params->SetSearchOption("max_length", 7680);
-
-    auto generator = OgaGenerator::Create(*model, *params);
-    generator->SetInputs(*input_tensors);
-
-    while (!generator->IsDone()) {
-      generator->GenerateNextToken();
-      const auto new_token = generator->GetNextTokens()[0];
-      std::cout << stream->Decode(new_token) << std::flush;
-    }
-
-    for (int i = 0; i < 3; ++i)
-      std::cout << std::endl;
-  }
-}
-
-int main(int argc, char** argv) {
-  std::string model_path, ep, ep_library_path;
-  if (!parse_args(argc, argv, model_path, ep, &ep_library_path)) {
-    return -1;
-  }
-
-  std::cout << "--------------------" << std::endl;
-  std::cout << "Hello, Phi-4-Multimodal!" << std::endl;
-  std::cout << "--------------------" << std::endl;
-  CXX_API(model_path.c_str(), ep.c_str(), ep_library_path.c_str());
-
-  return 0;
-}
\ No newline at end of file
diff --git a/examples/c/src/whisper.cpp b/examples/c/src/whisper.cpp
index 32e8c9e029..11b9c37ee2 100644
--- a/examples/c/src/whisper.cpp
+++ b/examples/c/src/whisper.cpp
@@ -31,7 +31,8 @@ void CXX_API(const char* model_path, int32_t num_beams) {
     } else {
       std::cout << "Loading audios..." << std::endl;
       for (const auto& audio_path : audio_paths) {
-        if (!FileExists(audio_path.c_str())) {
+        std::filesystem::path p(audio_path);
+        if (!std::filesystem::exists(p)) {
           throw std::runtime_error(std::string("Audio file not found: ") + audio_path);
         }
       }
@@ -69,8 +70,7 @@ void CXX_API(const char* model_path, int32_t num_beams) {
       std::cout << processor->Decode(tokens, num_tokens) << std::endl;
     }
 
-    for (int i = 0; i < 3; ++i)
-      std::cout << std::endl;
+    std::cout << "\n\n\n";
   }
 }
 
@@ -111,7 +111,8 @@ void C_API(const char* model_path, int32_t num_beams) {
     } else {
       std::cout << "Loading audios..." << std::endl;
       for (const auto& audio_path : audio_paths) {
-        if (!FileExists(audio_path.c_str())) {
+        std::filesystem::path p(audio_path);
+        if (!std::filesystem::exists(p)) {
           throw std::runtime_error(std::string("Audio file not found: ") + audio_path);
         }
         std::vector<const char*> audio_paths_c;
@@ -161,8 +162,8 @@ void C_API(const char* model_path, int32_t num_beams) {
       std::cout << str << std::endl;
     }
 
-    for (int i = 0; i < 3; ++i)
-      std::cout << std::endl;
+    std::cout << "\n\n"
+              << std::endl;
 
     OgaDestroyGenerator(generator);
     OgaDestroyGeneratorParams(params);
diff --git a/examples/chat_app/README.md b/examples/chat_app/README.md
deleted file mode 100755
index 3755325c51..0000000000
--- a/examples/chat_app/README.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# LLM Chat UI <!-- omit in toc -->
-
-This is a chat demo using the various versions of the LLMs
-
-> The app supports all of the CPU, CUDA and DirectML. CUDA is used as an example.
-
-**Contents**:
-- [Setup](#setup)
-- [Get the model](#get-the-model)
-- [Launch the app](#launch-the-app)
-
-## Setup
-
-1. Install **onnxruntime-genai-cuda** 
-    > If you want to use DirectML model, you can download `onnxruntime-genai-directml` package.
-   
-   ```
-   pip install numpy
-   pip install --pre onnxruntime-genai-cuda
-   ```
-
-2. Get this example
-
-   ```bash
-   git clone -n --depth=1 --filter=tree:0  https://github.com/microsoft/onnxruntime-genai.git
-   cd onnxruntime-genai
-   git sparse-checkout set --no-cone examples/chat_app
-   git checkout
-   cd examples/chat_app
-   ```
-
-3. Install the requirements
-
-    ```bash
-    pip install huggingface-hub mdtex2html
-    pip install gradio==4.36.0 # Gradio 3.47 breaks the UI and versions between 3.42 and 3.47 haven't been tested
-    ```
-
-
-## Get the model
-
-> If you already downloaded your model, you can skip this part and add `--model_path` when launching the app
-> For example. `python chat_app/app.py -m "/mnt/onnx/Phi-3-vision"`
-
-```bash
-cd ..
-huggingface-cli download microsoft/Phi-3-vision-128k-instruct-onnx-cuda --include cuda-int4-rtn-block-32/* --local-dir .
-mkdir -p models/cuda
-mv cuda-int4-rtn-block-32 models/cuda-int4/Phi-3-vision
-```
-
-If you would like the app to discover your models, please create the following folder structure, with the `models` folder at the same level as `chat_app`, one folder containing a set of models, and the actual models below this.
-
-```
---chat_app
---models
-   --directml
-      --phi-3-vision-directml-int4-awq-block-128
-      --meta-llama_Llama-2-7b-chat-hf
-      --mistralai_Mistral-7B-Instruct-v0.1
-            ...
-   --cuda-int4
-      --Phi-3-vision
-```
-
-If there is the word `vision` in the folder name containing the model files, the app will create a UI that processes images. If not, it will create a UI that processes language only.
-
-## Launch the app
-
-```
-python app.py
-```
-
-You can also attach your model that is outside of `models` folder to the app by passing arguments of `--model_path` and `--model_name`.
-
-```bash
-python chat_app/app.py --model_name "Phi-3-vision" --model_path "/mnt/onnx/Phi-3-vision"
-```
-
-You should see output from console
-```
-Running on local URL:  http://127.0.0.1:7860
-
-To create a public link, set `share=True` in `launch()`.
-```
-
-Then open the local URL in browser
-![alt text](image.png)
-
-For vision model, you will have the below UI interface.
-
-![alt text](vision_UI_interface.png)
diff --git a/examples/chat_app/__init__.py b/examples/chat_app/__init__.py
deleted file mode 100755
index cc2c489b27..0000000000
--- a/examples/chat_app/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
diff --git a/examples/chat_app/app.py b/examples/chat_app/app.py
deleted file mode 100755
index cff38054e2..0000000000
--- a/examples/chat_app/app.py
+++ /dev/null
@@ -1,261 +0,0 @@
-import argparse
-import gc
-import os
-from pathlib import Path
-
-import gradio as gr
-from app_modules.overwrites import postprocess
-from app_modules.presets import description, small_and_beautiful_theme, title
-from app_modules.utils import cancel_outputing, delete_last_conversation, reset_state, reset_textbox, transfer_input
-from interface.hddr_llm_onnx_interface import ONNXModel
-from interface.multimodal_onnx_interface import MultiModal_ONNXModel
-
-top_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
-optimized_directory = os.path.join(top_directory, "models")
-available_models = {}
-
-interface = None
-
-
-def change_model_listener(new_model_name):
-    global interface
-
-    # if a model exists - shut it down before trying to create the new one
-    if interface is not None:
-        interface.shutdown()
-        del interface
-        gc.collect()
-
-    d = available_models[new_model_name]
-
-    if "vision" in new_model_name:
-        print("Configuring for multi-modal model")
-        interface = MultiModal_ONNXModel(
-            model_path=d["model_dir"],
-            execution_provider=d["provider"],
-        )
-    else:
-        print("Configuring for language-only model")
-        interface = ONNXModel(
-            model_path=d["model_dir"],
-            execution_provider=d["provider"],
-        )
-
-    # interface.initialize()
-
-    return [
-        new_model_name,
-        gr.update(visible="vision" in new_model_name),
-        [],
-        [],
-        gr.update(value=""),
-        "",
-    ]
-
-
-def change_image_visibility(new_model_name):
-    if "vision" in new_model_name:
-        return gr.update(visible=True)
-
-    return gr.update(visible=False)
-
-
-gr.Chatbot.postprocess = postprocess
-
-with Path(f"{top_directory}/chat_app/assets/custom.css").open() as f:
-    custom_css = f.read()
-
-
-def interface_predict(*args):
-    res = interface.predict(*args)
-    yield from res
-
-
-def interface_retry(*args):
-    res = interface.retry(*args)
-    yield from res
-
-
-def get_ep_name(name):
-    new_name = name.lower().replace("directml", "dml")
-    if "cpu" in new_name:
-        return "cpu"
-    elif "cuda" in new_name:
-        return "cuda"
-    elif "dml" in new_name:
-        return "dml"
-    elif "nvtensorrtrtx" in new_name:
-        return "NvTensorRtRtx"
-    raise ValueError(f"{new_name} is not recognized.")
-
-
-def launch_chat_app(expose_locally: bool = False, model_name: str = "", model_path: str = ""):
-    if os.path.exists(optimized_directory):
-        for ep_name in os.listdir(optimized_directory):
-            sub_optimized_directory = os.path.join(optimized_directory, ep_name)
-            for model_name in os.listdir(sub_optimized_directory):
-                available_models[model_name] = {
-                    "model_dir": os.path.join(sub_optimized_directory, model_name),
-                    "provider": get_ep_name(ep_name),
-                }
-
-    if model_path:
-        available_models[model_name] = {"model_dir": model_path, "provider": get_ep_name(model_path)}
-
-    with gr.Blocks(css=custom_css, theme=small_and_beautiful_theme) as demo:
-        history = gr.State([])
-        user_question = gr.State("")
-        with gr.Row():
-            gr.HTML(title)
-            status_display = gr.Markdown("Success", elem_id="status_display")
-
-        with gr.Row():
-            with gr.Column(scale=4):
-                with gr.Row():
-                    chatbot = gr.Chatbot(elem_id="chuanhu_chatbot", height=650)
-                with gr.Row():
-                    with gr.Column(scale=12):
-                        user_input = gr.Textbox(show_label=False, placeholder="Enter text")
-                    with gr.Column(min_width=70, scale=1):
-                        submit_button = gr.Button("Send")
-                    with gr.Column(min_width=70, scale=1):
-                        cancel_button = gr.Button("Stop")
-                with gr.Row():
-                    empty_button = gr.Button(
-                        "🧹 New Conversation",
-                    )
-                    retry_button = gr.Button("🔄 Regenerate")
-                    delete_last_button = gr.Button("🗑️ Remove Last Turn")
-            reset_args = {"fn": reset_textbox, "inputs": [], "outputs": [user_input, status_display]}
-            with gr.Column(), gr.Column(min_width=50, scale=1), gr.Tab(label="Parameter Setting"):
-                gr.Markdown("# Model")
-                model_name = gr.Dropdown(
-                    choices=list(available_models.keys()),
-                    label="Model",
-                    show_label=False,  # default="Empty STUB",
-                    value=next(iter(available_models.keys())),
-                )
-                max_length_tokens = gr.Slider(
-                    minimum=0,
-                    maximum=131072,
-                    value=8192,
-                    step=128,
-                    interactive=True,
-                    label="Max Token Length",
-                )
-                max_context_length_tokens = gr.Slider(
-                    minimum=0,
-                    maximum=131072,
-                    value=8192,
-                    step=128,
-                    interactive=True,
-                    label="Max History Token Length",
-                )
-                token_printing_step = gr.Slider(
-                    minimum=1, maximum=50, value=4, step=1, interactive=True, label="Token Printing Step", visible=False
-                )
-                images = gr.File(file_count="multiple", file_types=["image"], label="Upload image(s)", visible=False)
-                images.change(
-                    reset_state,
-                    outputs=[chatbot, history, status_display],
-                    show_progress=True,
-                )
-                images.change(**reset_args)
-
-                model_name.change(
-                    change_model_listener,
-                    inputs=[model_name],
-                    outputs=[model_name, images, chatbot, history, user_input, status_display],
-                )
-        gr.Markdown(description)
-
-        predict_args = {
-            "fn": interface_predict,
-            "inputs": [
-                user_question,
-                chatbot,
-                history,
-                max_length_tokens,
-                max_context_length_tokens,
-                token_printing_step,
-                images,
-            ],
-            "outputs": [chatbot, history, status_display],
-            "show_progress": True,
-        }
-        retry_args = {
-            "fn": interface_retry,
-            "inputs": [chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, images],
-            "outputs": [chatbot, history, status_display],
-            "show_progress": True,
-        }
-
-        # Chatbot
-        transfer_input_args = {
-            "fn": transfer_input,
-            "inputs": [user_input],
-            "outputs": [user_question, user_input, submit_button],
-            "show_progress": True,
-        }
-
-        predict_event1 = user_input.submit(**transfer_input_args).then(**predict_args)
-
-        predict_event2 = submit_button.click(**transfer_input_args).then(**predict_args)
-
-        empty_button.click(
-            reset_state,
-            outputs=[chatbot, history, status_display],
-            show_progress=True,
-        )
-        empty_button.click(**reset_args)
-
-        predict_event3 = retry_button.click(**retry_args)
-
-        delete_last_button.click(
-            delete_last_conversation,
-            [chatbot, history],
-            [chatbot, history, status_display],
-            show_progress=True,
-        )
-        cancel_button.click(
-            cancel_outputing,
-            [],
-            [status_display],
-            cancels=[predict_event1, predict_event2, predict_event3],
-        )
-
-        demo.load(change_model_listener, inputs=[model_name], outputs=[model_name, images], concurrency_limit=1)
-
-    demo.title = "Local Model UI"
-
-    if expose_locally:
-        demo.launch(server_name="0.0.0.0", server_port=5000)
-    else:
-        demo.launch(share=True, server_port=5000)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--expose_locally", action="store_true")
-    parser.add_argument(
-        "--model_path", "-m", type=str, required=False, help="The location where your model is located."
-    )
-    parser.add_argument("--model_name", "-n", type=str, required=False, help="The name of your model")
-    args = parser.parse_args()
-    model_path = args.model_path
-
-    if not os.path.exists(optimized_directory) and not model_path:
-        raise ValueError("Please download the model into models folder or load the model by passing --model_path")
-
-    if args.model_path:
-        model_name = os.path.basename(model_path)
-        # check if genai_config.json in the model foler
-        if "genai_config.json" not in os.listdir(model_path):
-            raise ValueError(
-                f"Your model_path folder do not include 'genai.json' file, please double check your model_path '{model_path}'"
-            )
-
-    if args.model_name:
-        model_name = args.model_name
-
-    launch_chat_app(args.expose_locally, model_name, model_path)
diff --git a/examples/chat_app/app_modules/overwrites.py b/examples/chat_app/app_modules/overwrites.py
deleted file mode 100755
index 8807b89027..0000000000
--- a/examples/chat_app/app_modules/overwrites.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from __future__ import annotations
-
-from .presets import gr
-from .utils import convert_asis, convert_mdtext, detect_converted_mark
-
-
-def postprocess(self, y: list[tuple[str | None, str | None]]) -> list[tuple[str | None, str | None]]:
-    """Each message and response should be a string, which may be in Markdown format.
-
-    Returns:
-        List of tuples representing the message and response.
-        Each message and response will be a string of HTML.
-
-    """
-    if y is None or y == []:
-        return []
-    temp = []
-    for x in y:
-        user, bot = x
-        if not detect_converted_mark(user):
-            user = convert_asis(user)
-        if not detect_converted_mark(bot):
-            bot = convert_mdtext(bot)
-        temp.append((user, bot))
-    return temp
-
-
-GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse
diff --git a/examples/chat_app/app_modules/presets.py b/examples/chat_app/app_modules/presets.py
deleted file mode 100755
index 64a5398ea3..0000000000
--- a/examples/chat_app/app_modules/presets.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import gradio as gr
-
-title = """<h1 align="left" style="min-width:200px; margin-top:0;">LLM Chat UI, Powered By ONNX</h1>"""
-description = """\
-<div align="center" style="margin:16px 0">
-This is a chat demo using the various versions of the LLMs
-</div>
-"""
-CONCURRENT_COUNT = 100
-
-
-ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
-
-small_and_beautiful_theme = gr.themes.Soft(
-    primary_hue=gr.themes.Color(
-        c50="#02C160",
-        c100="rgba(2, 193, 96, 0.2)",
-        c200="#02C160",
-        c300="rgba(2, 193, 96, 0.32)",
-        c400="rgba(2, 193, 96, 0.32)",
-        c500="rgba(2, 193, 96, 1.0)",
-        c600="rgba(2, 193, 96, 1.0)",
-        c700="rgba(2, 193, 96, 0.32)",
-        c800="rgba(2, 193, 96, 0.32)",
-        c900="#02C160",
-        c950="#02C160",
-    ),
-    secondary_hue=gr.themes.Color(
-        c50="#576b95",
-        c100="#576b95",
-        c200="#576b95",
-        c300="#576b95",
-        c400="#576b95",
-        c500="#576b95",
-        c600="#576b95",
-        c700="#576b95",
-        c800="#576b95",
-        c900="#576b95",
-        c950="#576b95",
-    ),
-    neutral_hue=gr.themes.Color(
-        name="gray",
-        c50="#f9fafb",
-        c100="#f3f4f6",
-        c200="#e5e7eb",
-        c300="#d1d5db",
-        c400="#B2B2B2",
-        c500="#808080",
-        c600="#636363",
-        c700="#515151",
-        c800="#393939",
-        c900="#272727",
-        c950="#171717",
-    ),
-    radius_size=gr.themes.sizes.radius_sm,
-).set(
-    button_primary_background_fill="#06AE56",
-    button_primary_background_fill_dark="#06AE56",
-    button_primary_background_fill_hover="#07C863",
-    button_primary_border_color="#06AE56",
-    button_primary_border_color_dark="#06AE56",
-    button_primary_text_color="#FFFFFF",
-    button_primary_text_color_dark="#FFFFFF",
-    button_secondary_background_fill="#F2F2F2",
-    button_secondary_background_fill_dark="#2B2B2B",
-    button_secondary_text_color="#393939",
-    button_secondary_text_color_dark="#FFFFFF",
-    background_fill_primary="#F7F7F7",
-    background_fill_primary_dark="#1F1F1F",
-    block_title_text_color="*primary_500",
-    block_title_background_fill="*primary_100",
-    input_background_fill="#F6F6F6",
-)
diff --git a/examples/chat_app/app_modules/utils.py b/examples/chat_app/app_modules/utils.py
deleted file mode 100755
index 1ce8ef0060..0000000000
--- a/examples/chat_app/app_modules/utils.py
+++ /dev/null
@@ -1,222 +0,0 @@
-from __future__ import annotations
-
-import html
-import re
-
-import gradio as gr
-import mdtex2html
-from markdown import markdown
-from pygments import highlight
-from pygments.formatters import HtmlFormatter
-from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer
-
-from .presets import ALREADY_CONVERTED_MARK
-
-
-def markdown_to_html_with_syntax_highlight(md_str):
-    def replacer(match):
-        lang = match.group(1) or "text"
-        code = match.group(2)
-        lang = lang.strip()
-        # print(1,lang)
-        if lang == "text":
-            lexer = guess_lexer(code)
-            lang = lexer.name
-            # print(2,lang)
-        try:
-            lexer = get_lexer_by_name(lang, stripall=True)
-        except ValueError:
-            lexer = get_lexer_by_name("python", stripall=True)
-        formatter = HtmlFormatter()
-        # print(3,lexer.name)
-        highlighted_code = highlight(code, lexer, formatter)
-
-        return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'
-
-    code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
-    md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)
-
-    return markdown(md_str)
-
-
-def normalize_markdown(md_text: str) -> str:
-    lines = md_text.split("\n")
-    normalized_lines = []
-    inside_list = False
-
-    for i, line in enumerate(lines):
-        if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()):
-            if not inside_list and i > 0 and lines[i - 1].strip() != "":
-                normalized_lines.append("")
-            inside_list = True
-            normalized_lines.append(line)
-        elif inside_list and line.strip() == "":
-            if i < len(lines) - 1 and not re.match(r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip()):
-                normalized_lines.append(line)
-            continue
-        else:
-            inside_list = False
-            normalized_lines.append(line)
-
-    return "\n".join(normalized_lines)
-
-
-def convert_mdtext(md_text):
-    code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
-    inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
-    code_blocks = code_block_pattern.findall(md_text)
-    non_code_parts = code_block_pattern.split(md_text)[::2]
-
-    result = []
-    for non_code, code in zip(non_code_parts, [*code_blocks, ""], strict=False):
-        if non_code.strip():
-            formatted_non_code = normalize_markdown(non_code)
-            if inline_code_pattern.search(formatted_non_code):
-                result.append(markdown(formatted_non_code, extensions=["tables"]))
-            else:
-                result.append(mdtex2html.convert(formatted_non_code, extensions=["tables"]))
-        if code.strip():
-            formatted_code = f"\n```{code}\n\n```"
-            formatted_code = markdown_to_html_with_syntax_highlight(formatted_code)
-            result.append(formatted_code)
-    result = "".join(result)
-    result += ALREADY_CONVERTED_MARK
-    return result
-
-
-def convert_asis(userinput):
-    return f'<p style="white-space:pre-wrap;">{html.escape(userinput)}</p>' + ALREADY_CONVERTED_MARK
-
-
-def detect_converted_mark(userinput):
-    return bool(userinput.endswith(ALREADY_CONVERTED_MARK))
-
-
-def detect_language(code):
-    if code.startswith("\n"):
-        first_line = ""
-    else:
-        first_line = code.strip().split("\n", 1)[0]
-    language = first_line.lower() if first_line else ""
-    first_line_length = len(first_line)
-    code_without_language = code[first_line_length:].lstrip() if first_line else code
-    return language, code_without_language
-
-
-def convert_to_markdown(text):
-    text = text.replace("$", "&#36;")
-
-    def replace_leading_tabs_and_spaces(line):
-        new_line = []
-
-        for char in line:
-            if char == "\t":
-                new_line.append("&#9;")
-            elif char == " ":
-                new_line.append("&nbsp;")
-            else:
-                break
-        new_line_length = len(new_line)
-        return "".join(new_line) + line[new_line_length:]
-
-    markdown_text = ""
-    lines = text.split("\n")
-    in_code_block = False
-
-    for line in lines:
-        if in_code_block is False and line.startswith("```"):
-            in_code_block = True
-            markdown_text += f"{line}\n"
-        elif in_code_block is True and line.startswith("```"):
-            in_code_block = False
-            markdown_text += f"{line}\n"
-        elif in_code_block:
-            markdown_text += f"{line}\n"
-        else:
-            stripped_line = replace_leading_tabs_and_spaces(line)
-            stripped_line = re.sub(r"^(#)", r"\\\1", stripped_line)
-            markdown_text += f"{stripped_line}  \n"
-
-    return markdown_text
-
-
-def add_language_tag(text):
-    def detect_language(code_block):
-        try:
-            lexer = guess_lexer(code_block)
-            return lexer.name.lower()
-        except ClassNotFound:
-            return ""
-
-    code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)
-
-    def replacement(match):
-        code_block = match.group(2)
-        if match.group(2).startswith("\n"):
-            language = detect_language(code_block)
-            if language:
-                return f"```{language}{code_block}```"
-            else:
-                return f"```\n{code_block}```"
-        else:
-            return match.group(1) + code_block + "```"
-
-    return code_block_pattern.sub(replacement, text)
-
-
-def delete_last_conversation(chatbot, history):
-    if len(chatbot) > 0:
-        chatbot.pop()
-
-    if len(history) > 0:
-        history.pop()
-
-    return (
-        chatbot,
-        history,
-        "Delete Done",
-    )
-
-
-def reset_state():
-    return [], [], "Reset Done"
-
-
-def reset_textbox():
-    return gr.update(value=""), ""
-
-
-def cancel_outputing():
-    return "Stop Done"
-
-
-def transfer_input(inputs):
-    return (
-        inputs,
-        gr.update(value=""),
-        gr.Button(visible=True),
-    )
-
-
-class State:
-    interrupted = False
-
-    def interrupt(self):
-        self.interrupted = True
-
-    def recover(self):
-        self.interrupted = False
-
-
-shared_state = State()
-
-
-def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
-    for stop_word in stop_words:
-        if s.endswith(stop_word):
-            return True
-        for i in range(1, len(stop_word)):
-            if s.endswith(stop_word[:i]):
-                return True
-
-    return False
diff --git a/examples/chat_app/assets/custom.css b/examples/chat_app/assets/custom.css
deleted file mode 100755
index d9c46c0908..0000000000
--- a/examples/chat_app/assets/custom.css
+++ /dev/null
@@ -1,487 +0,0 @@
-:root {
-    --chatbot-color-light: #F3F3F3;
-    --chatbot-color-dark: #121111;
-}
-
-/* status_display */
-#status_display {
-    display: flex;
-    min-height: 2.5em;
-    align-items: flex-end;
-    justify-content: flex-end;
-}
-
-#status_display p {
-    font-size: .85em;
-    font-family: monospace;
-    color: var(--body-text-color-subdued);
-}
-
-
-
-/* usage_display */
-#usage_display {
-    height: 1em;
-}
-
-#usage_display p {
-    padding: 0 1em;
-    font-size: .85em;
-    font-family: monospace;
-    color: var(--body-text-color-subdued);
-}
-
-/* list */
-ol:not(.options),
-ul:not(.options) {
-    padding-inline-start: 2em !important;
-}
-
-/* Thank @Keldos-Li for fixing it */
-/* Light mode (default) */
-#chuanhu_chatbot {
-    background-color: var(--chatbot-color-light) !important;
-    color: #000000 !important;
-}
-
-[data-testid="bot"] {
-}
-
-[data-testid="user"] {
-    background-color: #02C160 !important;
-    color: #F3F3F3 !important;
-    font-size: medium;
-}
-
-/* Dark mode */
-.dark #chuanhu_chatbot {
-    background-color: var(--chatbot-color-dark) !important;
-    color: #F3F3F3 !important;
-}
-
-.dark [data-testid="bot"] {
-    background-color: #2C2C2C !important;
-}
-
-.dark [data-testid="user"] {
-    background-color: #26B561 !important;
-}
-
-#chuanhu_chatbot {
-    height: 100%;
-    min-height: 400px;
-}
-
-[class *="message"] {
-    border-radius: var(--radius-xl) !important;
-    border: none;
-    font-size: var(--text-md) !important;
-    line-height: var(--line-md) !important;
-    min-width: calc(var(--text-md)*var(--line-md) + 2*var(--spacing-xl));
-}
-
-[data-testid="bot"] {
-    max-width: 85%;
-    border-bottom-left-radius: 0 !important;
-}
-
-[data-testid="user"] {
-    max-width: 85%;
-    width: auto !important;
-    border-bottom-right-radius: 0 !important;
-}
-
-/* Table */
-table {
-    margin: 1em 0;
-    border-collapse: collapse;
-    empty-cells: show;
-}
-
-td,
-th {
-    border: 1.2px solid var(--border-color-primary) !important;
-    padding: 0.2em;
-}
-
-thead {
-    background-color: rgba(175, 184, 193, 0.2);
-}
-
-thead th {
-    padding: .5em .2em;
-}
-
-/* Inline code */
-#chuanhu_chatbot code {
-    display: inline;
-    white-space: break-spaces;
-    border-radius: 6px;
-    margin: 0 2px 0 2px;
-    padding: .2em .4em .1em .4em;
-    background-color: rgba(175, 184, 193, 0.2);
-}
-
-/* Code block */
-#chuanhu_chatbot pre code {
-    display: block;
-    overflow: auto;
-    white-space: pre;
-    background-color: hsla(0, 0%, 0%, 80%) !important;
-    border-radius: 10px;
-    padding: 1.4em 1.2em 0em 1.4em;
-    margin: 1.2em 2em 1.2em 0.5em;
-    color: #F3F3F3;
-    box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
-}
-
-/* Hightlight */
-#chuanhu_chatbot .highlight {
-    background-color: transparent
-}
-
-#chuanhu_chatbot .highlight .hll {
-    background-color: #49483e
-}
-
-#chuanhu_chatbot .highlight .c {
-    color: #75715e
-}
-
-/* Comment */
-#chuanhu_chatbot .highlight .err {
-    color: #960050;
-    background-color: #1e0010
-}
-
-/* Error */
-#chuanhu_chatbot .highlight .k {
-    color: #66d9ef
-}
-
-/* Keyword */
-#chuanhu_chatbot .highlight .l {
-    color: #ae81ff
-}
-
-/* Literal */
-#chuanhu_chatbot .highlight .n {
-    color: #8828f2
-}
-
-/* Name */
-#chuanhu_chatbot .highlight .o {
-    color: #f92672
-}
-
-/* Operator */
-#chuanhu_chatbot .highlight .p {
-    color: #482822
-}
-
-/* Punctuation */
-#chuanhu_chatbot .highlight .ch {
-    color: #75715e
-}
-
-/* Comment.Hashbang */
-#chuanhu_chatbot .highlight .cm {
-    color: #75715e
-}
-
-/* Comment.Multiline */
-#chuanhu_chatbot .highlight .cp {
-    color: #75715e
-}
-
-/* Comment.Preproc */
-#chuanhu_chatbot .highlight .cpf {
-    color: #75715e
-}
-
-/* Comment.PreprocFile */
-#chuanhu_chatbot .highlight .c1 {
-    color: #75715e
-}
-
-/* Comment.Single */
-#chuanhu_chatbot .highlight .cs {
-    color: #75715e
-}
-
-/* Comment.Special */
-#chuanhu_chatbot .highlight .gd {
-    color: #f92672
-}
-
-/* Generic.Deleted */
-#chuanhu_chatbot .highlight .ge {
-    font-style: italic
-}
-
-/* Generic.Emph */
-#chuanhu_chatbot .highlight .gi {
-    color: #a6e22e
-}
-
-/* Generic.Inserted */
-#chuanhu_chatbot .highlight .gs {
-    font-weight: bold
-}
-
-/* Generic.Strong */
-#chuanhu_chatbot .highlight .gu {
-    color: #75715e
-}
-
-/* Generic.Subheading */
-#chuanhu_chatbot .highlight .kc {
-    color: #66d9ef
-}
-
-/* Keyword.Constant */
-#chuanhu_chatbot .highlight .kd {
-    color: #66d9ef
-}
-
-/* Keyword.Declaration */
-#chuanhu_chatbot .highlight .kn {
-    color: #f92672
-}
-
-/* Keyword.Namespace */
-#chuanhu_chatbot .highlight .kp {
-    color: #66d9ef
-}
-
-/* Keyword.Pseudo */
-#chuanhu_chatbot .highlight .kr {
-    color: #66d9ef
-}
-
-/* Keyword.Reserved */
-#chuanhu_chatbot .highlight .kt {
-    color: #66d9ef
-}
-
-/* Keyword.Type */
-#chuanhu_chatbot .highlight .ld {
-    color: #162b74
-}
-
-/* Literal.Date */
-#chuanhu_chatbot .highlight .m {
-    color: #ae81ff
-}
-
-/* Literal.Number */
-#chuanhu_chatbot .highlight .s {
-    color: #062b84
-}
-
-/* Literal.String */
-#chuanhu_chatbot .highlight .na {
-    color: #a6e22e
-}
-
-/* Name.Attribute */
-#chuanhu_chatbot .highlight .nb {
-    color: #482822
-}
-
-/* Name.Builtin */
-#chuanhu_chatbot .highlight .nc {
-    color: #a6e22e
-}
-
-/* Name.Class */
-#chuanhu_chatbot .highlight .no {
-    color: #66d9ef
-}
-
-/* Name.Constant */
-#chuanhu_chatbot .highlight .nd {
-    color: #a6e22e
-}
-
-/* Name.Decorator */
-#chuanhu_chatbot .highlight .ni {
-    color: #482822
-}
-
-/* Name.Entity */
-#chuanhu_chatbot .highlight .ne {
-    color: #a6e22e
-}
-
-/* Name.Exception */
-#chuanhu_chatbot .highlight .nf {
-    color: #a6e22e
-}
-
-/* Name.Function */
-#chuanhu_chatbot .highlight .nl {
-    color: #1818f2
-}
-
-/* Name.Label */
-#chuanhu_chatbot .highlight .nn {
-    color: #482822
-}
-
-/* Name.Namespace */
-#chuanhu_chatbot .highlight .nx {
-    color: #a6e22e
-}
-
-/* Name.Other */
-#chuanhu_chatbot .highlight .py {
-    color: #482822
-}
-
-/* Name.Property */
-#chuanhu_chatbot .highlight .nt {
-    color: #f92672
-}
-
-/* Name.Tag */
-#chuanhu_chatbot .highlight .nv {
-    color: #482822
-}
-
-/* Name.Variable */
-#chuanhu_chatbot .highlight .ow {
-    color: #f92672
-}
-
-/* Operator.Word */
-#chuanhu_chatbot .highlight .w {
-    color: #482822
-}
-
-/* Text.Whitespace */
-#chuanhu_chatbot .highlight .mb {
-    color: #ae81ff
-}
-
-/* Literal.Number.Bin */
-#chuanhu_chatbot .highlight .mf {
-    color: #ae81ff
-}
-
-/* Literal.Number.Float */
-#chuanhu_chatbot .highlight .mh {
-    color: #ae81ff
-}
-
-/* Literal.Number.Hex */
-#chuanhu_chatbot .highlight .mi {
-    color: #ae81ff
-}
-
-/* Literal.Number.Integer */
-#chuanhu_chatbot .highlight .mo {
-    color: #ae81ff
-}
-
-/* Literal.Number.Oct */
-#chuanhu_chatbot .highlight .sa {
-    color: #162b74
-}
-
-/* Literal.String.Affix */
-#chuanhu_chatbot .highlight .sb {
-    color: #161b74
-}
-
-/* Literal.String.Backtick */
-#chuanhu_chatbot .highlight .sc {
-    color: #162b74
-}
-
-/* Literal.String.Char */
-#chuanhu_chatbot .highlight .dl {
-    color: #162b74
-}
-
-/* Literal.String.Delimiter */
-#chuanhu_chatbot .highlight .sd {
-    color: #162b74
-}
-
-/* Literal.String.Doc */
-#chuanhu_chatbot .highlight .s2 {
-    color: #162b74
-}
-
-/* Literal.String.Double */
-#chuanhu_chatbot .highlight .se {
-    color: #ae81ff
-}
-
-/* Literal.String.Escape */
-#chuanhu_chatbot .highlight .sh {
-    color: #162b74
-}
-
-/* Literal.String.Heredoc */
-#chuanhu_chatbot .highlight .si {
-    color: #162b74
-}
-
-/* Literal.String.Interpol */
-#chuanhu_chatbot .highlight .sx {
-    color: #162b74
-}
-
-/* Literal.String.Other */
-#chuanhu_chatbot .highlight .sr {
-    color: #162b74
-}
-
-/* Literal.String.Regex */
-#chuanhu_chatbot .highlight .s1 {
-    color: #162b74
-}
-
-/* Literal.String.Single */
-#chuanhu_chatbot .highlight .ss {
-    color: #162b74
-}
-
-/* Literal.String.Symbol */
-#chuanhu_chatbot .highlight .bp {
-    color: #482822
-}
-
-/* Name.Builtin.Pseudo */
-#chuanhu_chatbot .highlight .fm {
-    color: #a6e22e
-}
-
-/* Name.Function.Magic */
-#chuanhu_chatbot .highlight .vc {
-    color: #482822
-}
-
-/* Name.Variable.Class */
-#chuanhu_chatbot .highlight .vg {
-    color: #482822
-}
-
-/* Name.Variable.Global */
-#chuanhu_chatbot .highlight .vi {
-    color: #482822
-}
-
-/* Name.Variable.Instance */
-#chuanhu_chatbot .highlight .vm {
-    color: #482822
-}
-
-/* Name.Variable.Magic */
-#chuanhu_chatbot .highlight .il {
-    color: #ae81ff
-}
-
-/* Literal.Number.Integer.Long */
diff --git a/examples/chat_app/assets/custom.js b/examples/chat_app/assets/custom.js
deleted file mode 100755
index 219691448b..0000000000
--- a/examples/chat_app/assets/custom.js
+++ /dev/null
@@ -1 +0,0 @@
-// custom javascript here
diff --git a/examples/chat_app/consts.py b/examples/chat_app/consts.py
deleted file mode 100755
index 44db59915a..0000000000
--- a/examples/chat_app/consts.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import logging
-
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-    level=logging.INFO,
-)
-
-default_prompt = "<|user|>\n<|image_1|>\nWhat is shown in this image?<|end|>\n<|assistant|>\n"
diff --git a/examples/chat_app/image.png b/examples/chat_app/image.png
deleted file mode 100755
index dc7fc90bb7..0000000000
Binary files a/examples/chat_app/image.png and /dev/null differ
diff --git a/examples/chat_app/interface/hddr_llm_onnx_interface.py b/examples/chat_app/interface/hddr_llm_onnx_interface.py
deleted file mode 100755
index 8c7941a0fd..0000000000
--- a/examples/chat_app/interface/hddr_llm_onnx_interface.py
+++ /dev/null
@@ -1,198 +0,0 @@
-import gc
-import logging
-import os
-import sys
-
-import onnxruntime_genai as og
-from app_modules.utils import convert_to_markdown, is_stop_word_or_prefix, shared_state
-
-current_dir = os.path.dirname(os.path.realpath(__file__))
-sys.path.append(os.path.join(current_dir, "..", "..", ".."))
-
-
-class ONNXModel:
-    """A wrapper for OnnxRuntime-GenAI to run ONNX LLM model."""
-
-    def __init__(self, model_path, execution_provider):
-        self.og = og
-
-        logging.info("Loading model...")
-        self.config = og.Config(model_path)
-        self.config.clear_providers()
-        if execution_provider != "cpu":
-            self.config.append_provider(execution_provider)
-        self.model = og.Model(self.config)
-        logging.info("Loaded model...")
-
-        self.tokenizer = og.Tokenizer(self.model)
-        self.tokenizer_stream = self.tokenizer.create_stream()
-        self.model_path = model_path
-
-        if "phi" in self.model_path:
-            self.template_header = ""
-            self.enable_history_max = 10 if "mini" in self.model_path else 2
-            self.history_template = "<|user|>{input}<|end|><|assistant|>{response}<|end|>"
-            self.chat_template = "<|user|>{input}<|end|><|assistant|>"
-        elif "Llama-3" in self.model_path:
-            self.enable_history_max = 2
-            self.template_header = """<|start_header_id|>system<|end_header_id|>
-You are a helpful AI assistant.<|eot_id|>"""
-            self.history_template = """<|start_header_id|>user<|end_header_id|>
-{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-{response}<|eot_id|>"""
-
-            self.chat_template = """<|start_header_id|>user<|end_header_id|>
-{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
-
-            # self.chat_template = llama3_template
-        else:
-            self.enable_history_max = 2
-            self.template_header = "<s>"
-            self.history_template = "[INST] {input} [/INST]{response}</s>"
-            self.chat_template = "[INST] {input} [/INST]"
-
-    def generate_prompt_with_history(self, text, history, max_length=2048):
-        prompt = ""
-
-        for dialog in history[-self.enable_history_max :]:
-            prompt += f"{self.history_template.format(input=dialog[0], response=dialog[1])}"
-
-        prompt = self.template_header + prompt
-
-        prompt += f"{self.chat_template.format(input=text)}"
-
-        input_ids = self.tokenizer.encode(prompt)
-
-        if len(input_ids) <= max_length:
-            return input_ids
-        else:
-            history.clear()
-            if "Llama-3" in self.model_path:
-                prompt = self.template_header
-            prompt += f"{self.chat_template.format(input=text)}"
-            return self.tokenizer.encode(prompt)
-
-    def search(
-        self,
-        input_ids,
-        max_length: int,
-        token_printing_step: int = 4,
-    ):
-        output_tokens = []
-
-        params = og.GeneratorParams(self.model)
-        search_options = {"max_length": max_length}
-        params.set_search_options(**search_options)
-
-        generator = og.Generator(self.model, params)
-        generator.append_tokens(input_ids)
-
-        idx = 0
-        while not generator.is_done():
-            idx += 1
-            generator.generate_next_token()
-            next_token = generator.get_next_tokens()[0]
-            output_tokens.append(next_token)
-
-            if idx % token_printing_step == 0:
-                yield self.tokenizer.decode(output_tokens)
-
-    def predict(self, text, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, *args):
-        if text == "":
-            yield chatbot, history, "Empty context."
-            return
-
-        inputs = self.generate_prompt_with_history(text, history, max_length=max_context_length_tokens)
-
-        if inputs is None:
-            yield chatbot, history, "Input too long."
-            return
-
-        input_ids = inputs[-max_context_length_tokens:]
-
-        human_tokens = [
-            "[|Human|]",
-            "Human:",
-            "### HUMAN:",
-            "### User:",
-            "USER:",
-            "<|im_start|>user",
-            "<|user|>",
-            "### Instruction:",
-            "GPT4 Correct User:",
-        ]
-
-        ai_tokens = [
-            "[|AI|]",
-            "AI:",
-            "### RESPONSE:",
-            "### Response:",
-            "ASSISTANT:",
-            "<|im_start|>assistant",
-            "<|assistant|>",
-            "GPT4 Correct Assistant:",
-            "### Assistant:",
-        ]
-
-        for x in self.search(
-            input_ids,
-            max_length=max_length_tokens,
-            token_printing_step=token_printing_step,
-        ):
-            sentence = x
-
-            if is_stop_word_or_prefix(sentence, ["[|Human|]", "[|AI|]", "Human:", "AIL"]) is False:
-                for human_token in human_tokens:
-                    if human_token in sentence:
-                        sentence = sentence[: sentence.index(human_token)].strip()
-                        break
-
-                for ai_token in ai_tokens:
-                    if ai_token in sentence:
-                        sentence = sentence[: sentence.index(ai_token)].strip()
-                        break
-                sentence = sentence.strip()
-                a, b = (
-                    [[y[0], convert_to_markdown(y[1])] for y in history] + [[text, convert_to_markdown(sentence)]],
-                    [
-                        *history,
-                        [text, sentence],
-                    ],
-                )
-                yield a, b, "Generating..."
-
-            if shared_state.interrupted:
-                shared_state.recover()
-                try:
-                    yield a, b, "Stop: Success"
-                    return
-                except Exception as e:
-                    print(type(e).__name__, e)
-
-        del input_ids
-        gc.collect()
-
-        try:
-            yield a, b, "Generate: Success"
-        except Exception as e:
-            print(type(e).__name__, e)
-
-        return
-
-    def shutdown(self):
-        pass
-
-    def retry(self, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step):
-        if len(history) == 0:
-            yield chatbot, history, "Empty context"
-            return
-        chatbot.pop()
-        inputs = history.pop()[0]
-        yield from self.predict(
-            inputs,
-            chatbot,
-            history,
-            max_length_tokens,
-            max_context_length_tokens,
-            token_printing_step,
-        )
diff --git a/examples/chat_app/interface/multimodal_onnx_interface.py b/examples/chat_app/interface/multimodal_onnx_interface.py
deleted file mode 100755
index 909915a540..0000000000
--- a/examples/chat_app/interface/multimodal_onnx_interface.py
+++ /dev/null
@@ -1,130 +0,0 @@
-import gc
-
-import onnxruntime_genai as og
-from app_modules.utils import convert_to_markdown, shared_state
-from consts import default_prompt, logging
-
-logging.getLogger("interface")
-
-
-class MultiModal_ONNXModel:
-    """A wrapper for ONNXRuntime GenAI to run ONNX Multimodal model"""
-
-    def __init__(self, model_path, execution_provider):
-        self.og = og
-
-        logging.info("Loading model...")
-        self.config = og.Config(model_path)
-        self.config.clear_providers()
-        if execution_provider != "cpu":
-            self.config.append_provider(execution_provider)
-        self.model = og.Model(self.config)
-        logging.info("Loaded model ...")
-
-        self.processor = self.model.create_multimodal_processor()
-        self.tokenizer = self.processor.create_stream()
-
-        self.enable_history_max = 2
-        self.template_header = "<s>"
-        self.history_template = "[INST] {input} [/INST]{response}</s>"
-        self.chat_template = "<|user|>\n{tags}\n{input}<|end|>\n<|assistant|>\n"
-
-    def generate_prompt_with_history(self, images, history, text=default_prompt, max_length=3072):
-        prompt = ""
-
-        for dialog in history[-self.enable_history_max :]:
-            prompt += f"{self.history_template.format(input=dialog[0], response=dialog[1])}"
-
-        prompt = self.template_header + prompt
-
-        image_tags = ""
-        for i in range(len(images)):
-            image_tags += f"<|image_{i + 1}|>\n"
-
-        prompt += f"{self.chat_template.format(input=text, tags=image_tags)}"
-        if len(prompt) > max_length:
-            history.clear()
-            prompt = f"{self.chat_template.format(input=text, tags=image_tags)}"
-
-        self.images = og.Images.open(*images)
-
-        logging.info("Preprocessing images and prompt ...")
-        inputs = self.processor(prompt, images=self.images)
-        return inputs
-
-    def search(self, inputs, max_length: int = 3072, token_printing_step: int = 1):
-        output = ""
-        params = og.GeneratorParams(self.model)
-        params.set_inputs(inputs)
-
-        search_options = {"max_length": max_length}
-        params.set_search_options(**search_options)
-        generator = og.Generator(self.model, params)
-
-        idx = 0
-        while not generator.is_done():
-            idx += 1
-            generator.generate_next_token()
-            next_token = generator.get_next_tokens()[0]
-            output += self.tokenizer.decode(next_token)
-
-        return output
-
-    def predict(self, text, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, *args):
-        if text == "":
-            yield chatbot, history, "Empty context"
-            return
-
-        inputs = self.generate_prompt_with_history(
-            text=text, history=history, images=args[0], max_length=max_context_length_tokens
-        )
-
-        sentence = self.search(
-            inputs,
-            max_length=max_length_tokens,
-            token_printing_step=token_printing_step,
-        )
-
-        sentence = sentence.strip()
-        a, b = (
-            [[y[0], convert_to_markdown(y[1])] for y in history] + [[text, convert_to_markdown(sentence)]],
-            [
-                *history,
-                [text, sentence],
-            ],
-        )
-        yield a, b, "Generating ... "
-
-        if shared_state.interrupted:
-            shared_state.recover()
-            try:
-                yield a, b, "Stop: Success"
-                return
-            except Exception as e:
-                print(type(e).__name__, e)
-
-        del inputs
-        gc.collect()
-
-        try:
-            yield a, b, "Generate: Success"
-
-        except Exception as e:
-            print(type(e).__name__, e)
-
-        return
-
-    def shutdown(self):
-        pass
-
-    def retry(self, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, *args):
-        if len(history) == 0:
-            yield chatbot, history, "Empty context"
-            return
-
-        chatbot.pop()
-        inputs = history.pop()[0]
-
-        yield from self.predict(
-            inputs, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, args[0]
-        )
diff --git a/examples/chat_app/vision_UI_interface.png b/examples/chat_app/vision_UI_interface.png
deleted file mode 100644
index 48fecec3c1..0000000000
Binary files a/examples/chat_app/vision_UI_interface.png and /dev/null differ
diff --git a/examples/csharp/Common/Common.cs b/examples/csharp/Common/Common.cs
new file mode 100644
index 0000000000..d15476c374
--- /dev/null
+++ b/examples/csharp/Common/Common.cs
@@ -0,0 +1,1101 @@
+﻿using Microsoft.ML.OnnxRuntime;
+using Microsoft.ML.OnnxRuntimeGenAI;
+using System.CommandLine;
+using System.Reflection;
+using System.Reflection.Metadata.Ecma335;
+using System.Text;
+using System.Text.Encodings.Web;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace CommonUtils
+{
+    public static class Common
+    {
+        /// <summary>
+        /// Turn on logging inside ORT GenAI and choose which model values are dumped
+        /// </summary>
+        /// <param name="inputs">Dump inputs to the model (sets the "model_input_values" log option)</param>
+        /// <param name="outputs">Dump outputs to the model (sets the "model_output_values" log option)</param>
+        /// <remarks>
+        /// The "enabled" log option is always set to true, even when both arguments are false.
+        /// </remarks>
+        public static void SetLogger(bool inputs = true, bool outputs = true)
+        {
+            Utils.SetLogBool("enabled", true);
+            Utils.SetLogBool("model_input_values", inputs);
+            Utils.SetLogBool("model_output_values", outputs);
+        }
+
+        /// <summary>
+        /// Register execution provider if path is provided
+        /// </summary>
+        /// <param name="ep">Name of execution provider to set; only "cuda" and "NvTensorRtRtx" are registered (compared case-insensitively)</param>
+        /// <param name="ep_path">Path to execution provider library to register; null or empty skips registration</param>
+        /// <remarks>
+        /// Any other execution provider name only prints a warning and returns without registering anything.
+        /// </remarks>
+        public static void RegisterEP(string ep, string ep_path)
+        {
+            if (string.IsNullOrEmpty(ep_path))
+            {
+                return; // No library path specified, skip registration
+            }
+
+            Console.WriteLine($"Registering execution provider: {ep_path}");
+
+            var ortEnv = OrtEnv.Instance();
+            if (string.Equals(ep, "cuda", StringComparison.OrdinalIgnoreCase))
+            {
+                ortEnv.RegisterExecutionProviderLibrary("CUDAExecutionProvider", ep_path);
+            }
+            else if (string.Equals(ep, "NvTensorRtRtx", StringComparison.OrdinalIgnoreCase))
+            {
+                ortEnv.RegisterExecutionProviderLibrary("NvTensorRTRTXExecutionProvider", ep_path);
+            }
+            else
+            {
+                Console.WriteLine($"Warning: EP registration not supported for {ep}");
+                Console.WriteLine("Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries.");
+                return;
+            }
+
+            Console.WriteLine($"Registered {ep} successfully!");
+        }
+
+        /// <summary>
+        /// Get Config object and set EP-specific and search-specific options inside it
+        /// </summary>
+        /// <param name="path">Path to model folder containing GenAI config</param>
+        /// <param name="ep">Name of execution provider to set</param>
+        /// <param name="ep_options">Map of EP-specific option names and their values</param>
+        /// <param name="search_options">Class of search-specific option names and their values</param>
+        /// <returns>
+        /// ORT GenAI config object with all options set
+        /// </returns>
+        public static Config GetConfig(string path, string ep, Dictionary<string, string>? ep_options, GeneratorParamsArgs search_options)
+        {
+            var config = new Config(path);
+            if (ep != "follow_config")
+            {
+                config.ClearProviders();
+                if (ep != "cpu")
+                {
+                    Console.WriteLine($"Setting model to {ep}");
+                    config.AppendProvider(ep);
+                }
+
+                // Set any EP-specific options
+                if (ep_options != null)
+                {
+                    // Match EP names case-insensitively, as RegisterEP does, so e.g. "CUDA" still gets the beam-search guard
+                    var is_cuda_graph_ep = string.Equals(ep, "cuda", StringComparison.OrdinalIgnoreCase) || string.Equals(ep, "NvTensorRtRtx", StringComparison.OrdinalIgnoreCase);
+                    foreach (var (k, v) in ep_options)
+                    {
+                        if (k == "enable_cuda_graph" && is_cuda_graph_ep && search_options.num_beams > 1)
+                        {
+                            // Disable CUDA graph if using beam search (num_beams > 1),
+                            // num_beams > 1 requires past_present_share_buffer to be false so enable_cuda_graph must be false
+                            config.SetProviderOption(ep, "enable_cuda_graph", "0");
+                        }
+                        else
+                        {
+                            config.SetProviderOption(ep, k, v);
+                        }
+                    }
+                }
+            }
+
+            /**
+             * TODO: Uncomment the below snippet to use config.Overlay once the C# binding to Config.Overlay
+             * is in a stable package release.
+             */
+
+            // // Create serializer context to skip null attributes
+            // var options = new JsonSerializerOptions()
+            // {
+            //     WriteIndented = true,
+            //     PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
+            //     DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
+            // };
+            // var ctx = new ArgsSerializerContext(options);
+            // var json = JsonSerializer.Serialize(search_options, ctx.GeneratorParamsArgs);
+
+            // // Set any search-specific options that need to be known before constructing a Model object
+            // // Otherwise they can be set with params.SetSearchOptions(search_options)
+            // config.Overlay(json);
+            return config;
+        }
+
+        /// <summary>
+        /// Set search options for a generator's params during decoding
+        /// </summary>
+        /// <param name="generatorParams">Generator params object to set on</param>
+        /// <param name="args">Arguments provided by user; every non-null public instance property except chunk_size is forwarded under its own name</param>
+        /// <param name="verbose">When true, print the list of options that were set</param>
+        /// <remarks>
+        /// do_sample is sent as a bool (Convert.ToBoolean); every other option is sent as a double (Convert.ToDouble).
+        /// </remarks>
+        public static void SetSearchOptions(GeneratorParams generatorParams, GeneratorParamsArgs args, bool verbose)
+        {
+            var type = args.GetType();
+            var options = new List<string>();
+            foreach (var prop in type.GetProperties(BindingFlags.Instance | BindingFlags.Public))
+            {
+                var name = prop.Name;
+                var value = prop.GetValue(args);
+                if (value == null || name == "chunk_size") continue;
+
+                if (name == "do_sample")
+                {
+                    var val = Convert.ToBoolean(value);
+                    options.Add($"{name}: {val}");
+                    generatorParams.SetSearchOption(name, val);
+                }
+                else
+                {
+                    var val = Convert.ToDouble(value);
+                    options.Add($"{name}: {val}");
+                    generatorParams.SetSearchOption(name, val);
+                }
+            }
+            
+            if (verbose) Console.WriteLine("GeneratorParams created: {" + string.Join(", ", options) + "}");
+        }
+
+        /// <summary>
+        /// Apply the chat template, using chat_template.jinja from the model folder when that file exists
+        /// </summary>
+        /// <param name="model_path">Path to folder containing model (searched for chat_template.jinja)</param>
+        /// <param name="tokenizer">Tokenizer object to use</param>
+        /// <param name="messages">String-encoded list of messages</param>
+        /// <param name="add_generation_prompt">Add tokens to indicate the start of the AI's response</param>
+        /// <param name="tools">String-encoded list of tools</param>
+        /// <returns>
+        /// Prompt to encode
+        /// </returns>
+        public static string ApplyChatTemplate(string model_path, Tokenizer tokenizer, string messages, bool add_generation_prompt, string tools = "")
+        {
+            var template_str = "";
+            var jinja_path = Path.Combine(model_path, "chat_template.jinja");
+            if (File.Exists(jinja_path))
+            {
+                template_str = File.ReadAllText(jinja_path, Encoding.UTF8);
+            }
+
+            var prompt = tokenizer.ApplyChatTemplate(
+                messages: messages,
+                tools: tools,
+                add_generation_prompt: add_generation_prompt,
+                template_str: template_str
+            );
+            return prompt;
+        }
+
+        /// <summary>
+        /// Get prompt for 'user' role in chat template
+        /// </summary>
+        /// <param name="prompt">Provided prompt (only used when not in interactive mode)</param>
+        /// <param name="interactive">Interactive mode: read the prompt from the console, asking again while the input is empty</param>
+        /// <returns>
+        /// Non-empty prompt to use
+        /// </returns>
+        /// <exception cref="ArgumentException">Not in interactive mode and the provided prompt is null or empty</exception>
+        /// <exception cref="EndOfStreamException">Interactive mode and standard input was closed before a prompt was entered</exception>
+        public static string GetUserPrompt(string prompt, bool interactive)
+        {
+            if (!interactive)
+            {
+                // A fixed prompt never changes, so retrying an empty one would loop forever
+                if (string.IsNullOrEmpty(prompt))
+                {
+                    throw new ArgumentException("Empty input. Please enter a valid prompt.", nameof(prompt));
+                }
+
+                return prompt;
+            }
+
+            while (true)
+            {
+                Console.Write("Prompt (Use quit() to exit): ");
+                // ReadLine returns null once stdin reaches end-of-file; asking again could never succeed
+                var text = Console.ReadLine() ?? throw new EndOfStreamException("Standard input was closed before a prompt was entered.");
+                if (text.Length > 0)
+                {
+                    return text;
+                }
+
+                Console.WriteLine("Empty input. Please enter a valid prompt.");
+            }
+        }
+
+        /// <summary>
+        /// Get paths to media for user
+        /// </summary>
+        /// <param name="media_paths">User-provided media paths; when non-empty they are used and the console is not consulted</param>
+        /// <param name="interactive">Interactive mode: when no media paths were provided, read a comma-separated list from the console</param>
+        /// <param name="media_type">The media type being obtained: 'image' or 'audio' (case-insensitive)</param>
+        /// <returns>All media filepaths to read and encode (empty when none were given)</returns>
+        /// <exception cref="ArgumentException">Media type is neither 'image' nor 'audio'</exception>
+        /// <exception cref="FileNotFoundException">One of the paths does not point to an existing file</exception>
+        public static List<string> GetUserMediaPaths(List<string> media_paths, bool interactive, string media_type)
+        {
+            // Check media type
+            var media_type_lower = media_type.ToLowerInvariant();
+            if (media_type_lower != "audio" && media_type_lower != "image")
+            {
+                throw new ArgumentException("Media type must be 'image' or 'audio'");
+            }
+            var media_type_capitalized = char.ToUpperInvariant(media_type_lower[0]) + media_type_lower[1..];
+
+            var paths = new List<string>();
+            if (media_paths.Count > 0)
+            {
+                // If user-provided media paths
+                paths = media_paths;
+            }
+            else if (interactive)
+            {
+                // If interactive mode is on
+                Console.Write($"{media_type_capitalized} Path (comma separated; leave empty if no {media_type_lower}): ");
+                var line = Console.ReadLine() ?? string.Empty;
+
+                // Split by comma, trim whitespace and surrounding quotes
+                paths = line.Split(',', StringSplitOptions.RemoveEmptyEntries)
+                            .Select(p =>
+                            {
+                                // Trim quotes
+                                var s = p.Trim();
+                                if (s.Length >= 2 && ((s[0] == '"' && s[^1] == '"') || (s[0] == '\'' && s[^1] == '\'')))
+                                {
+                                    s = s[1..^1]; // strip surrounding quotes
+                                }
+                                return s;
+                            })
+                            .Where(p => !string.IsNullOrWhiteSpace(p))
+                            .ToList();
+            }
+
+            paths = paths.Where(p => !string.IsNullOrWhiteSpace(p)).Select(p => p.Trim()).ToList();
+            foreach (var path in paths)
+            {
+                if (!File.Exists(path))
+                {
+                    throw new FileNotFoundException($"{media_type_capitalized} file not found: {path}", path);
+                }
+                Console.WriteLine($"Using {media_type_lower}: {path}");
+            }
+
+            return paths;
+        }
+
+        /// <summary>
+        /// Get images for user
+        /// </summary>
+        /// <param name="image_paths">User-provided image paths</param>
+        /// <param name="interactive">Interactive mode (passed through to GetUserMediaPaths, which resolves the final list of paths)</param>
+        /// <returns>
+        /// (all images, number of images) as a tuple; (null, 0) when there are no image paths
+        /// </returns>
+        public static (Images?, int) GetUserImages(List<string> image_paths, bool interactive)
+        {
+            var media_type = "image";
+            List<string> paths = GetUserMediaPaths(image_paths, interactive, media_type);
+            if (paths.Count == 0)
+            {
+                Console.WriteLine($"No {media_type} provided");
+                return (null, 0);
+            }
+
+            var images = Images.Load(paths.ToArray());
+            return (images, paths.Count);
+        }
+
+        /// <summary>
+        /// Get audios for user
+        /// </summary>
+        /// <param name="audio_paths">User-provided audio paths</param>
+        /// <param name="interactive">Interactive mode (passed through to GetUserMediaPaths, which resolves the final list of paths)</param>
+        /// <returns>
+        /// (all audios, number of audios) as a tuple; (null, 0) when there are no audio paths
+        /// </returns>
+        public static (Audios?, int) GetUserAudios(List<string> audio_paths, bool interactive)
+        {
+            var media_type = "audio";
+            List<string> paths = GetUserMediaPaths(audio_paths, interactive, media_type);
+            if (paths.Count == 0)
+            {
+                Console.WriteLine($"No {media_type} provided");
+                return (null, 0);
+            }
+
+            var audios = Audios.Load(paths.ToArray());
+            return (audios, paths.Count);
+        }
+
+        /// <summary>
+        /// Get content for 'user' role in chat template
+        /// </summary>
+        /// <param name="model_type">Model type inside ORT GenAI; selects the tag format ("phi3v", "phi4mm", "qwen2_5_vl"/"fara", anything else)</param>
+        /// <param name="num_images">Number of images (one image tag or entry is emitted per image)</param>
+        /// <param name="num_audios">Number of audios (only used for the "phi4mm" model type)</param>
+        /// <param name="prompt">User prompt</param>
+        /// <returns>
+        /// Combined content for 'user' role: tags followed by the prompt, or a JSON list for the fallback model types
+        /// </returns>
+        public static string GetUserContent(string model_type, int num_images, int num_audios, string prompt)
+        {
+            string content;
+            // Combine all image tags, audio tags, and text into one user content
+            if (model_type == "phi3v")
+            {
+                // Phi-3 vision, Phi-3.5 vision: 1-based image tags, one per line, then the prompt
+                var image_tags = "";
+                for (int i = 0; i < num_images; i++)
+                {
+                    image_tags += $"<|image_{i + 1}|>\n";
+                }
+                content = image_tags + prompt;
+            }
+            else if (model_type == "phi4mm")
+            {
+                // Phi-4 multimodal: image tags first, then audio tags, then the prompt
+                var image_tags = "";
+                for (int i = 0; i < num_images; i++)
+                {
+                    image_tags += $"<|image_{i + 1}|>\n";
+                }
+                var audio_tags = "";
+                for (int i = 0; i < num_audios; i++)
+                {
+                    audio_tags += $"<|audio_{i + 1}|>\n";
+                }
+                content = image_tags + audio_tags + prompt;
+            }
+            else if (model_type == "qwen2_5_vl" || model_type == "fara")
+            {
+                // Qwen-2.5 VL, Fara: the same unnumbered placeholder repeated once per image
+                var image_tags = "";
+                for (int i = 0; i < num_images; i++)
+                {
+                    image_tags += "<|vision_start|><|image_pad|><|vision_end|>";
+                }
+                content = image_tags + prompt;
+            }
+            else
+            {
+                // Any other model type, Gemma-3 style: structured content serialized as a JSON list
+                var list = new List<Dictionary<string, string>>();
+                for (int i = 0; i < num_images; i++)
+                {
+                    list.Add(new Dictionary<string, string>
+                    {
+                        ["type"] = "image"
+                    });
+                }
+                list.Add(new Dictionary<string, string>
+                {
+                    ["type"] = "text",
+                    ["text"] = prompt
+                });
+                content = JsonSerializer.Serialize(list);
+            }
+
+            return content;
+        }
+
+        /// <summary>
+        /// Convert a list of tools to a list of tool schemas (one per tool, with "name" pinned to the function name via "const")
+        /// </summary>
+        /// <param name="tools">List of OpenAI-compatible tools</param>
+        /// <returns>
+        /// List of JSON schema compatible tools; "parameters" is present and required only for tools that declare parameters
+        /// </returns>
+        public static IList<ToolSchema> ToolsToSchemas(IList<Tool> tools)
+        {
+            var tool_schemas = new List<ToolSchema> { };
+            foreach (var tool in tools)
+            {
+                var name = new Dictionary<string, string>()
+                {
+                    { "const", tool.Function.Name }
+                };
+                var properties = new Dictionary<string, object>
+                {
+                    { "name", name }
+                };
+
+                var tool_parameters_exist = tool.Function.Parameters.Count != 0;
+                if (tool_parameters_exist)
+                {
+                    var parameters = new Dictionary<string, object>
+                    {
+                        { "type", tool.Function.Parameters.GetValueOrDefault("type", "object") },
+                        { "properties", tool.Function.Parameters.GetValueOrDefault("properties", new Dictionary<string, object>{}) },
+                        { "required", tool.Function.Parameters.GetValueOrDefault("required", new List<string>{}) }
+                    };
+                    properties.Add("parameters", parameters);
+                }
+
+                var tool_schema = new ToolSchema()
+                {
+                    Description = tool.Function.Description,
+                    Type = "object",
+                    Properties = properties,
+                    Required = tool_parameters_exist ? ["name", "parameters"] : ["name"],
+                    AdditionalProperties = false
+                };
+                tool_schemas.Add(tool_schema);
+            }
+            return tool_schemas;
+        }
+
+        /// <summary>
+        /// Create a JSON schema from a list of tools: an array whose items match any one of the tool schemas
+        /// </summary>
+        /// <param name="tools">List of OpenAI-compatible tools</param>
+        /// <param name="tool_output">Output can have a tool call; when true the array needs at least one item (MinItems = 1), otherwise MinItems = 0</param>
+        /// <returns>
+        /// JSON schema as a JSON-compatible string
+        /// </returns>
+        public static string GetJsonSchema(IList<Tool> tools, bool tool_output)
+        {
+            var schemas = ToolsToSchemas(tools);
+            var x_guidance = new Dictionary<string, object>
+            {
+                { "whitespace_flexible", false },
+                { "key_separator",  ": "},
+                { "item_separator", ", " }
+            };
+            var json_schema = new JsonSchema
+            {
+                XGuidance = x_guidance,
+                Type = "array",
+                Items = new Dictionary<string, IList<ToolSchema>>{
+                    { "anyOf", schemas }
+                },
+                MinItems = tool_output ? 1 : 0
+            };
+
+            // Create serializer context with a relaxed encoder so characters are not over-escaped (e.g. don't convert '&' to \u0026).
+            // NOTE(review): WhenWritingDefault skips nulls AND value-type defaults (0, false) - confirm MinItems/AdditionalProperties are nullable
+            var options = new JsonSerializerOptions()
+            {
+                WriteIndented = true,
+                PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
+                Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
+                DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
+            };
+            var ctx = new ToolSerializerContext(options);
+
+            return JsonSerializer.Serialize(json_schema, ctx.JsonSchema);
+        }
+
+        /// <summary>
+        /// Build a LARK grammar that restricts output to text, a tool call, or either of the two
+        /// </summary>
+        /// <param name="tools">List of OpenAI-compatible tools</param>
+        /// <param name="text_output">Output can have text</param>
+        /// <param name="tool_output">Output can have a tool call</param>
+        /// <param name="tool_call_start">String representation of tool call starting token</param>
+        /// <param name="tool_call_end">String representation of tool call ending token</param>
+        /// <returns>
+        /// LARK grammar as a string with one rule per line: start, then TEXT, toolcall and functioncall as applicable
+        /// </returns>
+        /// <exception cref="Exception">Neither text_output nor tool_output is true</exception>
+        /// <remarks>
+        /// When both delimiters are non-empty the tool call is wrapped as "toolcall"; otherwise "functioncall" is used directly
+        /// </remarks>
+        public static string GetLarkGrammar(IList<Tool> tools, bool text_output, bool tool_output, string tool_call_start, string tool_call_end)
+        {
+            // The delimited "toolcall" rule is only used when both delimiter strings are non-empty
+            bool hasStartToken = !string.IsNullOrEmpty(tool_call_start);
+            bool hasEndToken = !string.IsNullOrEmpty(tool_call_end);
+            bool hasDelimiters = hasStartToken && hasEndToken;
+            string entryRule = hasDelimiters ? "toolcall" : "functioncall";
+
+            // The start rule lists every alternative the output may take
+            string startRule = (text_output, tool_output) switch
+            {
+                (true, false) => "start: TEXT",
+                (false, true) => $"start: {entryRule}",
+                (true, true) => $"start: TEXT | {entryRule}",
+                _ => throw new Exception("At least one of 'text_output' and 'tool_output' must be true")
+            };
+
+            // Rules are collected in the order they are emitted
+            var grammarRules = new List<string> { startRule };
+
+            if (text_output)
+            {
+                // Text: a first character other than '{' or '<', followed by anything
+                const string textRule = "TEXT: /[^{<](.|\\n)*/";
+                grammarRules.Add(textRule);
+            }
+
+            if (tool_output)
+            {
+                // A tool call must match the JSON schema generated from the tools
+                var jsonSchema = GetJsonSchema(tools: tools, tool_output: tool_output);
+                if (hasDelimiters)
+                {
+                    // Surround the JSON payload with the tool call delimiters
+                    grammarRules.Add($"toolcall: {tool_call_start} functioncall {tool_call_end}");
+                }
+
+                // The serialized schema follows the %json directive
+                grammarRules.Add($"functioncall: %json {jsonSchema}");
+            }
+
+            return string.Join("\n", grammarRules);
+        }
+
+        /// <summary>
+        /// Convert a JSON-deserialized object of tools to a list of Tool objects
+        /// </summary>
+        /// <param name="tool_defs">JSON-deserialized object containing OpenAI-compatible tool definitions</param>
+        /// <returns>
+        /// List of Tool objects; entries without a "function" key are skipped
+        /// </returns>
+        public static IList<Tool> ToTool(IList<Dictionary<string, object>> tool_defs)
+        {
+            var tools = new List<Tool> { };
+            foreach (var tool_def in tool_defs)
+            {
+                if (!tool_def.TryGetValue("function", out var functionObj)) continue;
+
+                // Round-trip through JSON so the nested value becomes a string-keyed dictionary
+                var functionStr = JsonSerializer.Serialize(functionObj);
+                var functionDict = JsonSerializer.Deserialize(functionStr, ToolSerializerContext.Default.DictionaryStringObject);
+                if (functionDict == null) continue;
+
+                var name = functionDict.TryGetValue("name", out var nameObj) ? nameObj?.ToString() ?? string.Empty : string.Empty;
+                var description = functionDict.TryGetValue("description", out var descObj) ? descObj?.ToString() ?? string.Empty : string.Empty;
+
+                // A function that takes no arguments may omit "parameters" (or set it to null). Keep the tool
+                // with an empty parameter set instead of silently dropping it from the result.
+                var parameters = new Dictionary<string, object>();
+                if (functionDict.TryGetValue("parameters", out var paramObj) && paramObj != null)
+                {
+                    var paramStr = JsonSerializer.Serialize(paramObj);
+                    parameters = JsonSerializer.Deserialize(paramStr, ToolSerializerContext.Default.DictionaryStringObject) ?? parameters;
+                }
+
+                tools.Add(new Tool()
+                {
+                    Type = "function",
+                    Function = new FunctionDefinition
+                    {
+                        Name = name,
+                        Description = description,
+                        Parameters = parameters
+                    }
+                });
+            }
+            return tools;
+        }
+
+        /// <summary>
+        /// Create a grammar to use with LLGuidance
+        /// </summary>
+        /// <param name="response_format">Type of format requested: "text", "lark_grammar", "json_schema" or "json_object"</param>
+        /// <param name="filepath">Path to file containing OpenAI-compatible tool definitions</param>
+        /// <param name="tools_str">JSON-serialized string containing OpenAI-compatible tool definitions</param>
+        /// <param name="tools">In-memory tools: either all Dictionary&lt;string, object&gt; definitions or all Tool objects</param>
+        /// <param name="text_output">Output can have text</param>
+        /// <param name="tool_output">Output can have a tool call</param>
+        /// <param name="tool_call_start">String representation of tool call starting token (e.g. &lt;tool_call&gt;)</param>
+        /// <param name="tool_call_end">String representation of tool call ending token (e.g. &lt;/tool_call&gt;)</param>
+        /// <returns>
+        /// (grammar type, grammar data, tools) as a tuple of strings
+        /// </returns>
+        public static (string, string, string) GetGuidance(
+            string response_format = "",
+            string filepath = "",
+            string tools_str = "",
+            List<object>? tools = null,
+            bool text_output = true,
+            bool tool_output = false,
+            string tool_call_start = "",
+            string tool_call_end = "")
+        {
+            var guidance_type = "";
+            var guidance_data = "";
+            IList<Tool> all_tools = [];
+
+            // Get list of tools from a range of sources, in priority order: file, JSON-serialized string, in-memory list
+            if (tool_output)
+            {
+                // The file and string sources both hold JSON, so they share one deserialization path below
+                string? json_str = null;
+                if (File.Exists(filepath))
+                {
+                    json_str = File.ReadAllText(filepath);
+                    if (string.IsNullOrWhiteSpace(json_str))
+                    {
+                        throw new Exception("Error: JSON file is empty.");
+                    }
+                }
+                else if (!string.IsNullOrEmpty(tools_str))
+                {
+                    json_str = tools_str;
+                }
+
+                if (json_str != null)
+                {
+                    var tool_defs = JsonSerializer.Deserialize(json_str, ToolSerializerContext.Default.IListDictionaryStringObject);
+                    if (tool_defs == null)
+                    {
+                        throw new Exception("Error: Tools did not de-serialize correctly");
+                    }
+                    all_tools = ToTool(tool_defs);
+                }
+                else if (tools != null && tools.Count > 0)
+                {
+                    // Inspect the element types up front rather than casting inside a catch-all try block: that
+                    // reported a failure for every valid list of Tool objects and hid the real error raised by
+                    // ToTool for malformed definitions.
+                    if (tools.All(t => t is Dictionary<string, object>))
+                    {
+                        all_tools = ToTool(tools.Cast<Dictionary<string, object>>().ToList());
+                    }
+                    else if (tools.All(t => t is Tool))
+                    {
+                        all_tools = tools.Cast<Tool>().ToList();
+                    }
+                    else
+                    {
+                        throw new Exception("Error: Tools must all be of type Dictionary<string, object> or all be of type Tool");
+                    }
+                }
+                else
+                {
+                    throw new Exception("Error: Please provide the list of tools through a file, JSON-serialized string, or a list of tools");
+                }
+
+                if (all_tools.Count <= 0)
+                {
+                    throw new Exception("Error: Could not obtain a list of tools in memory");
+                }
+            }
+
+            // Create guidance based on user-provided response format
+            if (response_format == "text" || response_format == "lark_grammar")
+            {
+                if (response_format == "text")
+                {
+                    var right_settings = text_output && !tool_output;
+                    if (!right_settings)
+                    {
+                        throw new Exception("Error: A response format of 'text' requires text_output = true and tool_output = false");
+                    }
+                }
+
+                guidance_type = "lark_grammar";
+                guidance_data = GetLarkGrammar(
+                    tools: all_tools,
+                    text_output: text_output,
+                    tool_output: tool_output,
+                    tool_call_start: tool_call_start,
+                    tool_call_end: tool_call_end);
+            }
+            else if (response_format == "json_schema" || response_format == "json_object")
+            {
+                var right_settings = tool_output && !text_output;
+                if (!right_settings)
+                {
+                    throw new Exception("Error: A response format of 'json_schema' or 'json_object' requires text_output = false and tool_output = true");
+                }
+
+                guidance_type = "json_schema";
+                guidance_data = GetJsonSchema(tools: all_tools, tool_output: tool_output);
+            }
+            else
+            {
+                throw new Exception("Error: Invalid response format provided");
+            }
+
+            // The third element is the tool list serialized back to JSON (an empty list when tool_output is false)
+            return (guidance_type, guidance_data, JsonSerializer.Serialize(all_tools, ToolSerializerContext.Default.IListTool));
+        }
+
+        /// <summary>
+        /// Register the generator params options on the parser
+        /// </summary>
+        /// <param name="parser">Original parser object with existing arguments</param>
+        /// <remarks>
+        /// Options are added to <paramref name="parser"/> in the order they are declared below.
+        /// batch_size, chunk_size, do_sample, num_beams and num_return_sequences declare a default value;
+        /// the remaining options use nullable types and declare none. Nothing is returned.
+        /// </remarks>
+        public static void GetGeneratorParamsArgs(RootCommand parser)
+        {
+            // Batch size (default 1)
+            var batchSize = new Option<int>(
+                name: "batch_size", aliases: ["-b", "--batch_size"])
+            {
+                Description = "Batch size for input payload",
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = _ => 1
+            };
+
+            // Prefill chunk size (default 0)
+            var chunkSize = new Option<int>(
+                name: "chunk_size", aliases: ["-c", "--chunk_size"])
+            {
+                Description = "Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)",
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = _ => 0
+            };
+
+            // Sampling switch: a flag that takes no value (default false)
+            var doSample = new Option<bool>(
+                name: "do_sample", aliases: ["-s", "--do_sample"])
+            {
+                Description = "Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false",
+                Arity = ArgumentArity.Zero,
+                DefaultValueFactory = _ => false
+            };
+
+            // Minimum length (no default declared)
+            var minLength = new Option<int?>(
+                name: "min_length", aliases: ["-i", "--min_length"])
+            {
+                Description = "Min number of tokens to generate including the prompt",
+                Arity = ArgumentArity.ExactlyOne
+            };
+
+            // Maximum length (no default declared)
+            var maxLength = new Option<int?>(
+                name: "max_length", aliases: ["-l", "--max_length"])
+            {
+                Description = "Max number of tokens to generate including the prompt",
+                Arity = ArgumentArity.ExactlyOne
+            };
+
+            // Beam count (default 1)
+            var numBeams = new Option<int>(
+                name: "num_beams", aliases: ["-nb", "--num_beams"])
+            {
+                Description = "Number of beams to create",
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = _ => 1
+            };
+
+            // Number of returned sequences (default 1)
+            var numReturnSequences = new Option<int>(
+                name: "num_return_sequences", aliases: ["-rs", "--num_return_sequences"])
+            {
+                Description = "Number of return sequences to produce",
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = _ => 1
+            };
+
+            // Repetition penalty (no default declared)
+            var repetitionPenalty = new Option<double?>(
+                name: "repetition_penalty", aliases: ["-r", "--repetition_penalty"])
+            {
+                Description = "Repetition penalty to sample with",
+                Arity = ArgumentArity.ExactlyOne
+            };
+
+            // Sampling temperature (no default declared)
+            var samplingTemperature = new Option<double?>(
+                name: "temperature", aliases: ["-t", "--temperature"])
+            {
+                Description = "Temperature to sample with",
+                Arity = ArgumentArity.ExactlyOne
+            };
+
+            // Top-k cutoff (no default declared)
+            var topK = new Option<int?>(
+                name: "top_k", aliases: ["-k", "--top_k"])
+            {
+                Description = "Top k tokens to sample from",
+                Arity = ArgumentArity.ExactlyOne
+            };
+
+            // Top-p cutoff (no default declared)
+            var topP = new Option<double?>(
+                name: "top_p", aliases: ["-p", "--top_p"])
+            {
+                Description = "Top p probability to sample with",
+                Arity = ArgumentArity.ExactlyOne
+            };
+
+            // Registration order matches declaration order
+            Option[] generatorOptions =
+            [
+                batchSize,
+                chunkSize,
+                doSample,
+                minLength,
+                maxLength,
+                numBeams,
+                numReturnSequences,
+                repetitionPenalty,
+                samplingTemperature,
+                topK,
+                topP
+            ];
+
+            foreach (var option in generatorOptions)
+            {
+                parser.Add(option);
+            }
+        }
+
+        /// <summary>
+        /// Add arguments for guidance options
+        /// </summary>
+        /// <param name="parser">Original parser object with existing arguments</param>
+        /// <remarks>
+        /// All six options default to an empty string (or false for the two flags). Validators attached to
+        /// response_format and tools_file report an error for unsupported non-empty values.
+        /// The options are added to <paramref name="parser"/>; nothing is returned.
+        /// </remarks>
+        public static void GetGuidanceArgs(RootCommand parser)
+        {
+            // Requested response format; an empty value (the default) skips validation
+            var response_format = new Option<string>(
+                name: "response_format", aliases: ["-rf", "--response_format"])
+            {
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = (_) => "",
+                Description = "Provide response format for the model",
+            };
+            response_format.Validators.Add(result =>
+            {
+                var value = result.GetValue(response_format)!;
+                if (string.IsNullOrEmpty(value)) return;
+
+                var options = new List<string> { "text", "json_object", "json_schema", "lark_grammar" };
+                if (!options.Contains(value))
+                {
+                    var options_str = string.Join(", ", options);
+                    result.AddError($"Response format must be from one of the options: {options_str}");
+                }
+            });
+
+            // Path to the tool definitions file; an empty value (the default) skips validation
+            var tools_file = new Option<string>(
+                name: "tools_file", aliases: ["-tf", "--tools_file"])
+            {
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = (_) => "",
+                Description = "Path to file containing list of OpenAI-compatible tool definitions. Ex: test/test_models/tool-definitions/weather.json"
+            };
+            tools_file.Validators.Add(result =>
+            {
+                var value = result.GetValue(tools_file)!;
+                if (string.IsNullOrEmpty(value)) return;
+
+                // Ordinal, case-insensitive match: a culture-sensitive EndsWith can behave differently per locale,
+                // and an upper-case extension such as ".JSON" names the same kind of file
+                if (!value.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
+                {
+                    result.AddError("Path must be to a .json file");
+                }
+                if (!File.Exists(value))
+                {
+                    result.AddError("JSON file does not exist");
+                }
+            });
+
+            // Flag (takes no value): allow text in the output
+            var text_output = new Option<bool>(
+                name: "text_output", aliases: ["-text", "--text_output"])
+            {
+                Arity = ArgumentArity.Zero,
+                DefaultValueFactory = (_) => false,
+                Description = "Produce a text response in the output"
+            };
+
+            // Flag (takes no value): allow a tool call in the output
+            var tool_output = new Option<bool>(
+                name: "tool_output", aliases: ["-tool", "--tool_output"])
+            {
+                Arity = ArgumentArity.Zero,
+                DefaultValueFactory = (_) => false,
+                Description = "Produce a tool call in the output"
+            };
+
+            // String that opens a tool call (default empty)
+            var tool_call_start = new Option<string>(
+                name: "tool_call_start", aliases: ["-tcs", "--tool_call_start"])
+            {
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = (_) => "",
+                Description = "String representation of tool call start (ex: <|tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work."
+            };
+
+            // String that closes a tool call (default empty)
+            var tool_call_end = new Option<string>(
+                name: "tool_call_end", aliases: ["-tce", "--tool_call_end"])
+            {
+                Arity = ArgumentArity.ExactlyOne,
+                DefaultValueFactory = (_) => "",
+                Description = "String representation of tool call end (ex: <|/tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work."
+            };
+
+            // Attach all guidance options to the parser
+            parser.Add(response_format);
+            parser.Add(tools_file);
+            parser.Add(text_output);
+            parser.Add(tool_output);
+            parser.Add(tool_call_start);
+            parser.Add(tool_call_end);
+        }
+
+        /// <summary>
+        /// Collect the parsed option values into the generator params and guidance argument holders
+        /// </summary>
+        /// <param name="parseResult">Parsed result with user-provided arguments</param>
+        /// <returns>
+        /// (GeneratorParamsArgs, GuidanceArgs) as a tuple of user-provided arguments
+        /// </returns>
+        public static (GeneratorParamsArgs, GuidanceArgs) SetGroupedArgs(ParseResult parseResult)
+        {
+            // String-valued options fall back to "" when the parsed value is null
+            string TextOrEmpty(string optionName) => parseResult.GetValue<string>(optionName) ?? "";
+
+            // Each value is looked up by option name with the type matching the holder's property
+            var generatorValues = new GeneratorParamsArgs();
+            generatorValues.batch_size = parseResult.GetValue<int>("batch_size");
+            generatorValues.chunk_size = parseResult.GetValue<int>("chunk_size");
+            generatorValues.do_sample = parseResult.GetValue<bool>("do_sample");
+            generatorValues.min_length = parseResult.GetValue<int?>("min_length");
+            generatorValues.max_length = parseResult.GetValue<int?>("max_length");
+            generatorValues.num_beams = parseResult.GetValue<int>("num_beams");
+            generatorValues.num_return_sequences = parseResult.GetValue<int>("num_return_sequences");
+            generatorValues.repetition_penalty = parseResult.GetValue<double?>("repetition_penalty");
+            generatorValues.temperature = parseResult.GetValue<double?>("temperature");
+            generatorValues.top_k = parseResult.GetValue<int?>("top_k");
+            generatorValues.top_p = parseResult.GetValue<double?>("top_p");
+
+            var guidanceValues = new GuidanceArgs();
+            guidanceValues.response_format = TextOrEmpty("response_format");
+            guidanceValues.tools_file = TextOrEmpty("tools_file");
+            guidanceValues.text_output = parseResult.GetValue<bool>("text_output");
+            guidanceValues.tool_output = parseResult.GetValue<bool>("tool_output");
+            guidanceValues.tool_call_start = TextOrEmpty("tool_call_start");
+            guidanceValues.tool_call_end = TextOrEmpty("tool_call_end");
+
+            return (generatorValues, guidanceValues);
+        }
+    }
+
+    /// <summary>
+    /// A tool described as a JSON schema object
+    /// </summary>
+    public class ToolSchema
+    {
+        /// <summary>Description of the tool</summary>
+        [JsonPropertyName("description")] public required string Description { get; set; }
+        /// <summary>JSON schema type of the tool entry</summary>
+        [JsonPropertyName("type")] public required string Type { get; set; }
+        /// <summary>Schema properties, keyed by property name</summary>
+        [JsonPropertyName("properties")] public required Dictionary<string, object> Properties { get; set; }
+        /// <summary>Names of the properties that must be present</summary>
+        [JsonPropertyName("required")] public required IList<string> Required { get; set; }
+        /// <summary>Whether properties beyond those listed are allowed</summary>
+        [JsonPropertyName("additionalProperties")] public required bool AdditionalProperties { get; set; }
+    }
+
+    /// <summary>
+    /// Top-level JSON schema handed to guidance
+    /// </summary>
+    public class JsonSchema
+    {
+        /// <summary>Settings serialized under the "x-guidance" key</summary>
+        [JsonPropertyName("x-guidance")] public required Dictionary<string, object> XGuidance { get; set; }
+        /// <summary>JSON schema type of the top-level value</summary>
+        [JsonPropertyName("type")] public required string Type { get; set; }
+        /// <summary>Item constraints: schema keyword mapped to the candidate tool schemas</summary>
+        [JsonPropertyName("items")] public required Dictionary<string, IList<ToolSchema>> Items { get; set; }
+        /// <summary>Minimum number of items</summary>
+        [JsonPropertyName("minItems")] public required int MinItems { get; set; }
+    }
+
+    /// <summary>
+    /// An OpenAI-compatible function definition
+    /// </summary>
+    public class FunctionDefinition
+    {
+        /// <summary>Name of the function</summary>
+        [JsonPropertyName("name")] public required string Name { get; set; }
+        /// <summary>Description of the function</summary>
+        [JsonPropertyName("description")] public required string Description { get; set; }
+        /// <summary>Parameter definition of the function, keyed by string</summary>
+        [JsonPropertyName("parameters")] public required Dictionary<string, object> Parameters { get; set; }
+    }
+
+    /// <summary>
+    /// An OpenAI-compatible tool
+    /// </summary>
+    public class Tool
+    {
+        /// <summary>Kind of tool, serialized as "type"</summary>
+        [JsonPropertyName("type")] public required string Type { get; set; }
+        /// <summary>Function definition carried by the tool</summary>
+        [JsonPropertyName("function")] public required FunctionDefinition Function { get; set; }
+    }
+
+    /// <summary>Source-generated JSON serialization context for the tool and schema types</summary>
+    /// <remarks>Default options: indented output, camelCase naming policy</remarks>
+    [JsonSourceGenerationOptions(PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase, WriteIndented = true)]
+    [JsonSerializable(typeof(ToolSchema))]
+    [JsonSerializable(typeof(JsonSchema))]
+    [JsonSerializable(typeof(FunctionDefinition))]
+    [JsonSerializable(typeof(Tool))]
+    [JsonSerializable(typeof(JsonElement))]
+    [JsonSerializable(typeof(Dictionary<string, string>))]
+    [JsonSerializable(typeof(Dictionary<string, object>))]
+    [JsonSerializable(typeof(IList<Dictionary<string, object>>))]
+    [JsonSerializable(typeof(List<Dictionary<string, object>>))]
+    [JsonSerializable(typeof(IList<Tool>))]
+    [JsonSerializable(typeof(List<Tool>))]
+    public sealed partial class ToolSerializerContext : JsonSerializerContext { }
+
+    /// <summary>
+    /// Holder for the parsed generator params values
+    /// </summary>
+    public class GeneratorParamsArgs
+    {
+        /// <summary>Batch size; 1 unless the user provides a value</summary>
+        public int batch_size { get; set; } = 1;
+        /// <summary>Chunk size; 0 unless the user provides a value</summary>
+        public int chunk_size { get; set; }
+        public bool? do_sample { get; set; }
+        public int? min_length { get; set; }
+        public int? max_length { get; set; }
+        /// <summary>Number of beams; 1 unless the user provides a value</summary>
+        public int num_beams { get; set; } = 1;
+        /// <summary>Number of return sequences; 1 unless the user provides a value</summary>
+        public int num_return_sequences { get; set; } = 1;
+        public double? repetition_penalty { get; set; }
+        public double? temperature { get; set; }
+        public int? top_k { get; set; }
+        public double? top_p { get; set; }
+    }
+
+    /// <summary>
+    /// Holder for the parsed guidance values; strings start empty and flags start false
+    /// </summary>
+    public class GuidanceArgs
+    {
+        public string response_format { get; set; } = string.Empty;
+        public string tools_file { get; set; } = string.Empty;
+        public bool text_output { get; set; }
+        public bool tool_output { get; set; }
+        public string tool_call_start { get; set; } = string.Empty;
+        public string tool_call_end { get; set; } = string.Empty;
+    }
+
+    /// <summary>Source-generated JSON serialization context for the parsed argument holders</summary>
+    /// <remarks>Default options: indented output, camelCase naming policy</remarks>
+    [JsonSourceGenerationOptions(PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase, WriteIndented = true)]
+    [JsonSerializable(typeof(GeneratorParamsArgs))]
+    [JsonSerializable(typeof(GuidanceArgs))]
+    public sealed partial class ArgsSerializerContext : JsonSerializerContext { }
+}
diff --git a/examples/csharp/Genny/.gitignore b/examples/csharp/Genny/.gitignore
deleted file mode 100644
index 4961924315..0000000000
--- a/examples/csharp/Genny/.gitignore
+++ /dev/null
@@ -1,346 +0,0 @@
-## Ignore Visual Studio temporary files, build results, and
-## files generated by popular Visual Studio add-ons.
-##
-## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
-
-# User-specific files
-*.suo
-*.user
-*.userosscache
-*.sln.docstates
-
-# User-specific files (MonoDevelop/Xamarin Studio)
-*.userprefs
-
-# Build results
-[Dd]ebug/
-[Dd]ebugPublic/
-[Rr]elease/
-[Rr]eleases/
-x64/
-x86/
-bld/
-[Bb]in/
-[Oo]bj/
-[Ll]og/
-
-# Visual Studio 2015/2017 cache/options directory
-.vs/
-# Uncomment if you have tasks that create the project's static files in wwwroot
-#wwwroot/
-
-# Visual Studio 2017 auto generated files
-Generated\ Files/
-
-# MSTest test Results
-[Tt]est[Rr]esult*/
-[Bb]uild[Ll]og.*
-
-# NUNIT
-*.VisualState.xml
-TestResult.xml
-
-# Build Results of an ATL Project
-[Dd]ebugPS/
-[Rr]eleasePS/
-dlldata.c
-
-# Benchmark Results
-BenchmarkDotNet.Artifacts/
-
-# .NET Core
-project.lock.json
-project.fragment.lock.json
-artifacts/
-**/Properties/launchSettings.json
-
-# StyleCop
-StyleCopReport.xml
-
-# Files built by Visual Studio
-*_i.c
-*_p.c
-*_i.h
-*.ilk
-*.obj
-*.iobj
-*.pch
-*.pdb
-*.ipdb
-*.pgc
-*.pgd
-*.rsp
-*.sbr
-*.tlb
-*.tli
-*.tlh
-*.tmp
-*.tmp_proj
-*.log
-*.vspscc
-*.vssscc
-.builds
-*.pidb
-*.svclog
-*.scc
-
-# Chutzpah Test files
-_Chutzpah*
-
-# Visual C++ cache files
-ipch/
-*.aps
-*.ncb
-*.opendb
-*.opensdf
-*.sdf
-*.cachefile
-*.VC.db
-*.VC.VC.opendb
-
-# Visual Studio profiler
-*.psess
-*.vsp
-*.vspx
-*.sap
-
-# Visual Studio Trace Files
-*.e2e
-
-# TFS 2012 Local Workspace
-$tf/
-
-# Guidance Automation Toolkit
-*.gpState
-
-# ReSharper is a .NET coding add-in
-_ReSharper*/
-*.[Rr]e[Ss]harper
-*.DotSettings.user
-
-# JustCode is a .NET coding add-in
-.JustCode
-
-# TeamCity is a build add-in
-_TeamCity*
-
-# DotCover is a Code Coverage Tool
-*.dotCover
-
-# AxoCover is a Code Coverage Tool
-.axoCover/*
-!.axoCover/settings.json
-
-# Visual Studio code coverage results
-*.coverage
-*.coveragexml
-
-# NCrunch
-_NCrunch_*
-.*crunch*.local.xml
-nCrunchTemp_*
-
-# MightyMoose
-*.mm.*
-AutoTest.Net/
-
-# Web workbench (sass)
-.sass-cache/
-
-# Installshield output folder
-[Ee]xpress/
-
-# DocProject is a documentation generator add-in
-DocProject/buildhelp/
-DocProject/Help/*.HxT
-DocProject/Help/*.HxC
-DocProject/Help/*.hhc
-DocProject/Help/*.hhk
-DocProject/Help/*.hhp
-DocProject/Help/Html2
-DocProject/Help/html
-
-# Click-Once directory
-publish/
-
-# Publish Web Output
-*.[Pp]ublish.xml
-*.azurePubxml
-# Note: Comment the next line if you want to checkin your web deploy settings,
-# but database connection strings (with potential passwords) will be unencrypted
-*.pubxml
-*.publishproj
-
-# Microsoft Azure Web App publish settings. Comment the next line if you want to
-# checkin your Azure Web App publish settings, but sensitive information contained
-# in these scripts will be unencrypted
-PublishScripts/
-
-# NuGet Packages
-*.nupkg
-# The packages folder can be ignored because of Package Restore
-**/[Pp]ackages/*
-# except build/, which is used as an MSBuild target.
-!**/[Pp]ackages/build/
-# Uncomment if necessary however generally it will be regenerated when needed
-#!**/[Pp]ackages/repositories.config
-# NuGet v3's project.json files produces more ignorable files
-*.nuget.props
-*.nuget.targets
-
-# Microsoft Azure Build Output
-csx/
-*.build.csdef
-
-# Microsoft Azure Emulator
-ecf/
-rcf/
-
-# Windows Store app package directories and files
-AppPackages/
-BundleArtifacts/
-Package.StoreAssociation.xml
-_pkginfo.txt
-*.appx
-
-# Visual Studio cache files
-# files ending in .cache can be ignored
-*.[Cc]ache
-# but keep track of directories ending in .cache
-!*.[Cc]ache/
-
-# Others
-ClientBin/
-~$*
-*~
-*.dbmdl
-*.dbproj.schemaview
-*.jfm
-*.pfx
-*.publishsettings
-orleans.codegen.cs
-
-# Including strong name files can present a security risk 
-# (https://github.com/github/gitignore/pull/2483#issue-259490424)
-#*.snk
-
-# Since there are multiple workflows, uncomment next line to ignore bower_components
-# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
-#bower_components/
-
-# RIA/Silverlight projects
-Generated_Code/
-
-# Backup & report files from converting an old project file
-# to a newer Visual Studio version. Backup files are not needed,
-# because we have git ;-)
-_UpgradeReport_Files/
-Backup*/
-UpgradeLog*.XML
-UpgradeLog*.htm
-ServiceFabricBackup/
-*.rptproj.bak
-
-# SQL Server files
-*.mdf
-*.ldf
-*.ndf
-
-# Business Intelligence projects
-*.rdl.data
-*.bim.layout
-*.bim_*.settings
-*.rptproj.rsuser
-
-# Microsoft Fakes
-FakesAssemblies/
-
-# GhostDoc plugin setting file
-*.GhostDoc.xml
-
-# Node.js Tools for Visual Studio
-.ntvs_analysis.dat
-node_modules/
-
-# Visual Studio 6 build log
-*.plg
-
-# Visual Studio 6 workspace options file
-*.opt
-
-# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
-*.vbw
-
-# Visual Studio LightSwitch build output
-**/*.HTMLClient/GeneratedArtifacts
-**/*.DesktopClient/GeneratedArtifacts
-**/*.DesktopClient/ModelManifest.xml
-**/*.Server/GeneratedArtifacts
-**/*.Server/ModelManifest.xml
-_Pvt_Extensions
-
-# Paket dependency manager
-.paket/paket.exe
-paket-files/
-
-# FAKE - F# Make
-.fake/
-
-# JetBrains Rider
-.idea/
-*.sln.iml
-
-# CodeRush
-.cr/
-
-# Python Tools for Visual Studio (PTVS)
-__pycache__/
-*.pyc
-
-# Cake - Uncomment if you are using it
-# tools/**
-# !tools/packages.config
-
-# Tabs Studio
-*.tss
-
-# Telerik's JustMock configuration file
-*.jmconfig
-
-# BizTalk build output
-*.btp.cs
-*.btm.cs
-*.odx.cs
-*.xsd.cs
-
-# OpenCover UI analysis results
-OpenCover/
-
-# Azure Stream Analytics local run output 
-ASALocalRun/
-
-# MSBuild Binary and Structured Log
-*.binlog
-
-# NVidia Nsight GPU debugger configuration file
-*.nvuser
-
-# MFractors (Xamarin productivity tool) working folder 
-.mfractor/
-/docs/build
-src/TensorFlowNET.Native/bazel-*
-src/TensorFlowNET.Native/c_api.h
-/.vscode
-test/TensorFlowNET.Examples/mnist
-
-
-# training model resources
-.resources
-/redist
-*.xml
-*.xsd
-
-# docs
-site/
-
-docker-test-output/*
diff --git a/examples/csharp/Genny/Assets/Screenshot1.PNG b/examples/csharp/Genny/Assets/Screenshot1.PNG
deleted file mode 100644
index 59ef9f19ad..0000000000
Binary files a/examples/csharp/Genny/Assets/Screenshot1.PNG and /dev/null differ
diff --git a/examples/csharp/Genny/Assets/Screenshot2.PNG b/examples/csharp/Genny/Assets/Screenshot2.PNG
deleted file mode 100644
index d1c6354813..0000000000
Binary files a/examples/csharp/Genny/Assets/Screenshot2.PNG and /dev/null differ
diff --git a/examples/csharp/Genny/Genny.sln b/examples/csharp/Genny/Genny.sln
deleted file mode 100644
index 860fbaaa02..0000000000
--- a/examples/csharp/Genny/Genny.sln
+++ /dev/null
@@ -1,37 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.9.34622.214
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Genny", "Genny\Genny.csproj", "{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug_Cuda|x64 = Debug_Cuda|x64
-		Debug_DirectML|x64 = Debug_DirectML|x64
-		Debug|x64 = Debug|x64
-		Release_Cuda|x64 = Release_Cuda|x64
-		Release_DirectML|x64 = Release_DirectML|x64
-		Release|x64 = Release|x64
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug_Cuda|x64.ActiveCfg = Debug_Cuda|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug_Cuda|x64.Build.0 = Debug_Cuda|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug_DirectML|x64.ActiveCfg = Debug_DirectML|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug_DirectML|x64.Build.0 = Debug_DirectML|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug|x64.ActiveCfg = Debug|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug|x64.Build.0 = Debug|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release_Cuda|x64.ActiveCfg = Release_Cuda|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release_Cuda|x64.Build.0 = Release_Cuda|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release_DirectML|x64.ActiveCfg = Release_DirectML|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release_DirectML|x64.Build.0 = Release_DirectML|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release|x64.ActiveCfg = Release|x64
-		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release|x64.Build.0 = Release|x64
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-	GlobalSection(ExtensibilityGlobals) = postSolution
-		SolutionGuid = {A7159277-CA72-45A9-8327-E3BF29214643}
-	EndGlobalSection
-EndGlobal
diff --git a/examples/csharp/Genny/Genny/App.xaml b/examples/csharp/Genny/Genny/App.xaml
deleted file mode 100644
index ec5ea8fd14..0000000000
--- a/examples/csharp/Genny/Genny/App.xaml
+++ /dev/null
@@ -1,10 +0,0 @@
-﻿<Application x:Class="Genny.App"
-             xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
-             xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-             xmlns:local="clr-namespace:Genny"
-             StartupUri="MainWindow.xaml">
-    <Application.Resources>
-        <BitmapImage x:Key="ImageAvatarUser" UriSource="/Images/user.png" />
-        <BitmapImage x:Key="ImageAvatarRobot" UriSource="/Images/robot.png" />
-    </Application.Resources>
-</Application>
diff --git a/examples/csharp/Genny/Genny/App.xaml.cs b/examples/csharp/Genny/Genny/App.xaml.cs
deleted file mode 100644
index b6e61e540f..0000000000
--- a/examples/csharp/Genny/Genny/App.xaml.cs
+++ /dev/null
@@ -1,11 +0,0 @@
-﻿using System.Windows;
-
-namespace Genny
-{
-    /// <summary>
-    /// Interaction logic for App.xaml
-    /// </summary>
-    public partial class App : Application
-    {
-    }
-}
diff --git a/examples/csharp/Genny/Genny/AssemblyInfo.cs b/examples/csharp/Genny/Genny/AssemblyInfo.cs
deleted file mode 100644
index b0ec827578..0000000000
--- a/examples/csharp/Genny/Genny/AssemblyInfo.cs
+++ /dev/null
@@ -1,10 +0,0 @@
-using System.Windows;
-
-[assembly: ThemeInfo(
-    ResourceDictionaryLocation.None,            //where theme specific resource dictionaries are located
-                                                //(used if a resource is not found in the page,
-                                                // or application resource dictionaries)
-    ResourceDictionaryLocation.SourceAssembly   //where the generic resource dictionary is located
-                                                //(used if a resource is not found in the page,
-                                                // app, or any theme specific resource dictionaries)
-)]
diff --git a/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml b/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml
deleted file mode 100644
index 2983243b59..0000000000
--- a/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml
+++ /dev/null
@@ -1,82 +0,0 @@
-﻿<UserControl x:Class="Genny.Controls.SearchOptionsControl"
-             xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
-             xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-             xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" 
-             xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
-             mc:Ignorable="d"
-             d:DesignWidth="300"
-             Name="UI">
-    <Grid DataContext="{Binding ElementName=UI}">
-        <StackPanel Margin="3">
-            <UniformGrid Columns="2" >
-
-                <StackPanel Margin="0,0,4,0">
-                    <DockPanel>
-                        <Label>TopK</Label>
-                        <TextBlock Text="{Binding ElementName=SliderTopK, Path=Value}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderTopK" Value="{Binding SearchOptions.TopK}" Minimum="0" Maximum="200" TickFrequency="1" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-                <StackPanel Margin="4,0,0,0">
-                    <DockPanel>
-                        <Label>TopP</Label>
-                        <TextBlock Text="{Binding ElementName=SliderTopP, Path=Value, StringFormat={}{0:N2}}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderTopP" Value="{Binding SearchOptions.TopP}" Minimum="0" Maximum="1" TickFrequency="0.01" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-
-                <StackPanel Margin="0,0,4,0">
-                    <DockPanel>
-                        <Label>Temperature</Label>
-                        <TextBlock Text="{Binding ElementName=SliderTemperature, Path=Value, StringFormat={}{0:N2}}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderTemperature" Value="{Binding SearchOptions.Temperature}" Minimum="0" Maximum="5" TickFrequency="0.01" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-                <StackPanel Margin="4,0,0,0">
-                    <DockPanel>
-                        <Label>RepetitionPenalty</Label>
-                        <TextBlock Text="{Binding ElementName=SliderRepetitionPenalty, Path=Value, StringFormat={}{0:N2}}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderRepetitionPenalty" Value="{Binding SearchOptions.RepetitionPenalty}" Minimum="0" Maximum="5" TickFrequency="0.01" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-
-                <StackPanel Margin="0,0,4,0">
-                    <DockPanel>
-                        <Label>MinLength</Label>
-                        <TextBlock Text="{Binding ElementName=SliderMinLength, Path=Value}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderMinLength" Value="{Binding SearchOptions.MinLength}" Minimum="0" Maximum="2048" TickFrequency="1" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-                <StackPanel Margin="4,0,0,0">
-                    <DockPanel>
-                        <Label>MaxLength</Label>
-                        <TextBlock Text="{Binding ElementName=SliderMaxLength, Path=Value}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderMaxLength" Value="{Binding SearchOptions.MaxLength}" Minimum="1" Maximum="2048" TickFrequency="1" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-
-                <StackPanel Margin="0,0,4,0">
-                    <DockPanel>
-                        <Label>LengthPenalty</Label>
-                        <TextBlock Text="{Binding ElementName=SliderLengthPenalty, Path=Value, StringFormat={}{0:N2}}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderLengthPenalty" Value="{Binding SearchOptions.LengthPenalty}" Minimum="0" Maximum="1" TickFrequency="0.01" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-                <StackPanel Margin="4,0,0,0">
-                    <DockPanel>
-                        <Label>DiversityPenalty</Label>
-                        <TextBlock Text="{Binding ElementName=SliderDiversityPenalty, Path=Value, StringFormat={}{0:N2}}" VerticalAlignment="Center" HorizontalAlignment="Right" FontSize="11"/>
-                    </DockPanel>
-                    <Slider Name="SliderDiversityPenalty" Value="{Binding SearchOptions.DiversityPenalty}" Minimum="0" Maximum="1" TickFrequency="0.01" IsSnapToTickEnabled="true"/>
-                </StackPanel>
-
-            </UniformGrid>
-
-            <StackPanel >
-                <CheckBox Content="DoSample" IsChecked="{Binding SearchOptions.DoSample}" Margin="0,15,0,0"/>
-                <CheckBox Content="EarlyStopping" IsChecked="{Binding SearchOptions.EarlyStopping}" Margin="0,6,0,0"/>
-                <CheckBox Content="PastPresentShareBuffer" IsChecked="{Binding SearchOptions.PastPresentShareBuffer}" Margin="0,6,0,0"/>
-            </StackPanel>
-        </StackPanel>
-    </Grid>
-</UserControl>
diff --git a/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml.cs b/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml.cs
deleted file mode 100644
index 6386a43ded..0000000000
--- a/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml.cs
+++ /dev/null
@@ -1,30 +0,0 @@
-﻿using Genny.ViewModel;
-using System.Windows;
-using System.Windows.Controls;
-
-namespace Genny.Controls
-{
-    /// <summary>
-    /// Interaction logic for SearchOptionsControl.xaml
-    /// </summary>
-    public partial class SearchOptionsControl : UserControl
-    {
-        public SearchOptionsControl()
-        {
-            InitializeComponent();
-        }
-
-        public static readonly DependencyProperty SearchOptionsProperty =
-           DependencyProperty.Register(nameof(SearchOptions), typeof(SearchOptionsModel), typeof(SearchOptionsControl), new PropertyMetadata(new SearchOptionsModel()));
-
-
-        /// <summary>
-        /// Gets or sets the search options.
-        /// </summary>
-        public SearchOptionsModel SearchOptions
-        {
-            get { return (SearchOptionsModel)GetValue(SearchOptionsProperty); }
-            set { SetValue(SearchOptionsProperty, value); }
-        }
-    }
-}
diff --git a/examples/csharp/Genny/Genny/Extensions.cs b/examples/csharp/Genny/Genny/Extensions.cs
deleted file mode 100644
index 5074df1e25..0000000000
--- a/examples/csharp/Genny/Genny/Extensions.cs
+++ /dev/null
@@ -1,50 +0,0 @@
-﻿using Genny.ViewModel;
-using Microsoft.ML.OnnxRuntimeGenAI;
-using System.Threading;
-using System.Threading.Tasks;
-using System.Windows;
-
-namespace Genny
-{
-    internal static class Extensions
-    {
-
-        /// <summary>
-        /// Applies the search options to the generator parameters.
-        /// </summary>
-        /// <param name="generatorParams">The generator parameters.</param>
-        /// <param name="searchOptions">The search options.</param>
-        internal static void ApplySearchOptions(this GeneratorParams generatorParams, SearchOptionsModel searchOptions)
-        {
-            generatorParams.SetSearchOption("top_p", searchOptions.TopP);
-            generatorParams.SetSearchOption("top_k", searchOptions.TopK);
-            generatorParams.SetSearchOption("temperature", searchOptions.Temperature);
-            generatorParams.SetSearchOption("repetition_penalty", searchOptions.RepetitionPenalty);
-            generatorParams.SetSearchOption("past_present_share_buffer", searchOptions.PastPresentShareBuffer);
-            generatorParams.SetSearchOption("num_return_sequences", searchOptions.NumReturnSequences);
-            generatorParams.SetSearchOption("no_repeat_ngram_size", searchOptions.NoRepeatNgramSize);
-            generatorParams.SetSearchOption("min_length", searchOptions.MinLength);
-            generatorParams.SetSearchOption("max_length", searchOptions.MaxLength);
-            generatorParams.SetSearchOption("length_penalty", searchOptions.LengthPenalty);
-            generatorParams.SetSearchOption("early_stopping", searchOptions.EarlyStopping);
-            generatorParams.SetSearchOption("do_sample", searchOptions.DoSample);
-            generatorParams.SetSearchOption("diversity_penalty", searchOptions.DiversityPenalty);
-        }
-
-        internal static Task<Sequences> EncodeAsync(this Tokenizer tokenizer, string input, CancellationToken cancellationToken = default)
-        {
-            return Application.Current.Dispatcher.Invoke(() =>
-            {
-                return Task.Run(() => tokenizer.Encode(input), cancellationToken);
-            });
-        }
-
-        internal static Task<string> DecodeAsync(this Tokenizer tokenizer, int[] input, CancellationToken cancellationToken = default)
-        {
-            return Application.Current.Dispatcher.Invoke(() =>
-            {
-                return Task.Run(() => tokenizer.Decode(input), cancellationToken);
-            });
-        }
-    }
-}
diff --git a/examples/csharp/Genny/Genny/Genny.csproj b/examples/csharp/Genny/Genny/Genny.csproj
deleted file mode 100644
index 10a39d7e2f..0000000000
--- a/examples/csharp/Genny/Genny/Genny.csproj
+++ /dev/null
@@ -1,26 +0,0 @@
-﻿<Project Sdk="Microsoft.NET.Sdk">
-
-  <PropertyGroup>
-    <OutputType>WinExe</OutputType>
-    <TargetFramework>net6.0-windows</TargetFramework>
-    <Nullable>disable</Nullable>
-    <ImplicitUsings>disable</ImplicitUsings>
-    <UseWPF>true</UseWPF>
-    <UseWindowsForms>true</UseWindowsForms>
-    <PlatformTarget>x64</PlatformTarget>
-    <Platforms>x64</Platforms>
-    <Configurations>Debug;Release;Debug_Cuda;Release_Cuda;Debug_DirectML;Release_DirectML</Configurations>
-  </PropertyGroup>
-
-  <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.4.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' "/>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.4.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.4.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' "/>
-  </ItemGroup>
-
-  <ItemGroup>
-    <Resource Include="Images\user.png" />
-    <Resource Include="Images\robot.png" />
-  </ItemGroup>
-
-</Project>
diff --git a/examples/csharp/Genny/Genny/Images/robot.png b/examples/csharp/Genny/Genny/Images/robot.png
deleted file mode 100644
index 96edd0fb10..0000000000
Binary files a/examples/csharp/Genny/Genny/Images/robot.png and /dev/null differ
diff --git a/examples/csharp/Genny/Genny/Images/user.png b/examples/csharp/Genny/Genny/Images/user.png
deleted file mode 100644
index dcaf32f594..0000000000
Binary files a/examples/csharp/Genny/Genny/Images/user.png and /dev/null differ
diff --git a/examples/csharp/Genny/Genny/MainWindow.xaml b/examples/csharp/Genny/Genny/MainWindow.xaml
deleted file mode 100644
index 3d721f96b5..0000000000
--- a/examples/csharp/Genny/Genny/MainWindow.xaml
+++ /dev/null
@@ -1,72 +0,0 @@
-﻿<Window x:Class="Genny.MainWindow"
-        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
-        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-        xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
-        xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
-        xmlns:views="clr-namespace:Genny.Views"
-        xmlns:controls="clr-namespace:Genny.Controls"
-        mc:Ignorable="d"
-        Title="Genny" Height="450" Width="800" Name="UI">
-    <Grid DataContext="{Binding ElementName=UI}">
-        <DockPanel>
-
-            <!--Input-->
-            <DockPanel DockPanel.Dock="Left" Width="300">
-
-                <GroupBox DockPanel.Dock="Top" Header="Model Folder" Margin="2,5,0,1">
-                    <DockPanel Margin="3">
-                        <UniformGrid DockPanel.Dock="Bottom" Columns="1">
-                            <Button Content="Load Model" Command="{Binding LoadModelCommand}" />
-                        </UniformGrid>
-                        <DockPanel>
-                            <Button DockPanel.Dock="Right" Content="Open" Command="{Binding OpenModelCommand}"/>
-                            <TextBox Text="{Binding ModelPath}"/>
-                        </DockPanel>
-                    </DockPanel>
-                </GroupBox>
-
-                <DockPanel>
-                    <GroupBox DockPanel.Dock="Top" Header="Search Options" Margin="2,5,0,1">
-                        <controls:SearchOptionsControl SearchOptions="{Binding Configuration.SearchOptions}"  />
-                    </GroupBox>
-                </DockPanel>
-
-            </DockPanel>
-
-            <!--Content-->
-            <TabControl>
-
-                <!--StatelessView Tab-->
-                <TabItem Header="Stateless">
-                    <Grid IsEnabled="{Binding IsModelLoaded}">
-                        <views:StatelessView
-                            Model="{Binding Model}"
-                            Tokenizer="{Binding Tokenizer}"
-                            ModelOptions="{Binding Configuration.ModelOptions}"
-                            SearchOptions="{Binding Configuration.SearchOptions}" />
-                    </Grid>
-                </TabItem>
-
-                <!--StatefulView Tab-->
-                <TabItem Header="Stateful">
-                    <Grid IsEnabled="{Binding IsModelLoaded}">
-                        <views:StatefulView 
-                            Model="{Binding Model}"
-                            Tokenizer="{Binding Tokenizer}"
-                            ModelOptions="{Binding Configuration.ModelOptions}"
-                            SearchOptions="{Binding Configuration.SearchOptions}" />
-                    </Grid>
-                </TabItem>
-
-                <!--Tokenizer Tab-->
-                <TabItem Header="Tokenizer">
-                    <Grid IsEnabled="{Binding IsModelLoaded}">
-                        <views:TokenizerView Tokenizer="{Binding Tokenizer}" />
-                    </Grid>
-                </TabItem>
-
-            </TabControl>
-
-        </DockPanel>
-    </Grid>
-</Window>
diff --git a/examples/csharp/Genny/Genny/MainWindow.xaml.cs b/examples/csharp/Genny/Genny/MainWindow.xaml.cs
deleted file mode 100644
index 10522632a5..0000000000
--- a/examples/csharp/Genny/Genny/MainWindow.xaml.cs
+++ /dev/null
@@ -1,132 +0,0 @@
-﻿using Genny.Utils;
-using Genny.ViewModel;
-using Microsoft.ML.OnnxRuntimeGenAI;
-using System;
-using System.ComponentModel;
-using System.IO;
-using System.Runtime.CompilerServices;
-using System.Text.Json;
-using System.Threading.Tasks;
-using System.Windows;
-
-namespace Genny
-{
-    /// <summary>
-    /// Interaction logic for MainWindow.xaml
-    /// </summary>
-    public partial class MainWindow : Window, INotifyPropertyChanged
-    {
-        private Model _model;
-        private Tokenizer _tokenizer;
-        private ConfigurationModel _configuration;
-        private string _modelPath = "D:\\Repositories\\phi2_onnx";
-        private bool _isModelLoaded;
-
-        public MainWindow()
-        {
-            OpenModelCommand = new RelayCommand(OpenModelAsync);
-            LoadModelCommand = new RelayCommand(LoadModelAsync, CanExecuteLoadModel);
-            InitializeComponent();
-        }
-
-        public RelayCommand OpenModelCommand { get; }
-        public RelayCommand LoadModelCommand { get; }
-
-        public Model Model
-        {
-            get { return _model; }
-            set { _model = value; NotifyPropertyChanged(); }
-        }
-
-        public Tokenizer Tokenizer
-        {
-            get { return _tokenizer; }
-            set { _tokenizer = value; NotifyPropertyChanged(); }
-        }
-
-        public ConfigurationModel Configuration
-        {
-            get { return _configuration; }
-            set { _configuration = value; NotifyPropertyChanged(); }
-        }
-
-
-        public bool IsModelLoaded
-        {
-            get { return _isModelLoaded; }
-            set { _isModelLoaded = value; NotifyPropertyChanged(); }
-        }
-
-        public string ModelPath
-        {
-            get { return _modelPath; }
-            set { _modelPath = value; NotifyPropertyChanged(); }
-        }
-
-
-        private Task OpenModelAsync()
-        {
-            var folderBrowserDialog = new System.Windows.Forms.FolderBrowserDialog
-            {
-                Description = "Model Folder Path",
-                UseDescriptionForTitle = true,
-            };
-            var dialogResult = folderBrowserDialog.ShowDialog();
-            if (dialogResult == System.Windows.Forms.DialogResult.OK)
-                ModelPath = folderBrowserDialog.SelectedPath;
-
-            return Task.CompletedTask;
-        }
-
-
-        private async Task LoadModelAsync()
-        {
-            await UnloadModelAsync();
-            try
-            {
-                Configuration = await LoadConfigAsync(ModelPath);
-                await Task.Run(() =>
-                {
-                    Model = new Model(ModelPath);
-                    Tokenizer = new Tokenizer(_model);
-                });
-                IsModelLoaded = true;
-            }
-            catch (Exception ex)
-            {
-                MessageBox.Show(ex.Message, "Model Load Error", MessageBoxButton.OK, MessageBoxImage.Error);
-            }
-        }
-
-
-        private bool CanExecuteLoadModel()
-        {
-            return !string.IsNullOrWhiteSpace(ModelPath);
-        }
-
-
-        private Task UnloadModelAsync()
-        {
-            _model?.Dispose();
-            _tokenizer?.Dispose();
-            IsModelLoaded = false;
-            return Task.CompletedTask;
-        }
-
-
-        private static async Task<ConfigurationModel> LoadConfigAsync(string modelPath)
-        {
-            var configPath = Path.Combine(modelPath, "genai_config.json");
-            var configJson = await File.ReadAllTextAsync(configPath);
-            return JsonSerializer.Deserialize<ConfigurationModel>(configJson);
-        }
-
-        #region INotifyPropertyChanged
-        public event PropertyChangedEventHandler PropertyChanged;
-        public void NotifyPropertyChanged([CallerMemberName] string property = "")
-        {
-            PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(property));
-        }
-        #endregion
-    }
-}
\ No newline at end of file
diff --git a/examples/csharp/Genny/Genny/Utils/AutoScrollBehavior.cs b/examples/csharp/Genny/Genny/Utils/AutoScrollBehavior.cs
deleted file mode 100644
index afc99ee711..0000000000
--- a/examples/csharp/Genny/Genny/Utils/AutoScrollBehavior.cs
+++ /dev/null
@@ -1,47 +0,0 @@
-﻿using System.Windows;
-using System.Windows.Controls;
-
-namespace Genny.Utils
-{
-    /// <summary>
-    /// Behaviour to auto scroll to the bottom went the content changes, e.g appending text
-    /// </summary>
-    public static class AutoScrollBehavior
-    {
-        public static readonly DependencyProperty AutoScrollProperty =
-            DependencyProperty.RegisterAttached("AutoScroll", typeof(bool), typeof(AutoScrollBehavior), new PropertyMetadata(false, AutoScrollPropertyChanged));
-
-        public static void AutoScrollPropertyChanged(DependencyObject obj, DependencyPropertyChangedEventArgs args)
-        {
-            var scrollViewer = obj as ScrollViewer;
-            if (scrollViewer != null && (bool)args.NewValue)
-            {
-                scrollViewer.ScrollChanged += ScrollViewer_ScrollChanged;
-                scrollViewer.ScrollToEnd();
-            }
-            else
-            {
-                scrollViewer.ScrollChanged -= ScrollViewer_ScrollChanged;
-            }
-        }
-
-        private static void ScrollViewer_ScrollChanged(object sender, ScrollChangedEventArgs e)
-        {
-            if (e.ExtentHeightChange != 0)
-            {
-                var scrollViewer = sender as ScrollViewer;
-                scrollViewer?.ScrollToBottom();
-            }
-        }
-
-        public static bool GetAutoScroll(DependencyObject obj)
-        {
-            return (bool)obj.GetValue(AutoScrollProperty);
-        }
-
-        public static void SetAutoScroll(DependencyObject obj, bool value)
-        {
-            obj.SetValue(AutoScrollProperty, value);
-        }
-    }
-}
diff --git a/examples/csharp/Genny/Genny/Utils/RelayCommand.cs b/examples/csharp/Genny/Genny/Utils/RelayCommand.cs
deleted file mode 100644
index 131f27b011..0000000000
--- a/examples/csharp/Genny/Genny/Utils/RelayCommand.cs
+++ /dev/null
@@ -1,111 +0,0 @@
-﻿using System;
-using System.Threading;
-using System.Threading.Tasks;
-using System.Windows.Input;
-
-namespace Genny.Utils
-{
-    /// <summary>
-    /// Basic Relay command implemtation
-    /// </summary>
-    /// <seealso cref="System.Windows.Input.ICommand" />
-    public class RelayCommand : ICommand
-    {
-        private readonly Func<Task> _execute;
-        private readonly Func<bool> _canExecute;
-        private long _isExecuting;
-
-        public RelayCommand(Func<Task> execute, Func<bool> canExecute = null)
-        {
-            _execute = execute;
-            _canExecute = canExecute ?? (() => true);
-        }
-
-        public event EventHandler CanExecuteChanged
-        {
-            add { CommandManager.RequerySuggested += value; }
-            remove { CommandManager.RequerySuggested -= value; }
-        }
-
-        public void RaiseCanExecuteChanged()
-        {
-            CommandManager.InvalidateRequerySuggested();
-        }
-
-        public bool CanExecute(object parameter)
-        {
-            if (Interlocked.Read(ref _isExecuting) != 0)
-                return false;
-
-            return _canExecute();
-        }
-
-        public async void Execute(object parameter)
-        {
-            Interlocked.Exchange(ref _isExecuting, 1);
-            RaiseCanExecuteChanged();
-
-            try
-            {
-                await _execute();
-            }
-            finally
-            {
-                Interlocked.Exchange(ref _isExecuting, 0);
-                RaiseCanExecuteChanged();
-            }
-        }
-    }
-
-    /// <summary>
-    /// Basic Relay command with type argument implemtation
-    /// </summary>
-    /// <seealso cref="System.Windows.Input.ICommand" />
-    public class RelayCommand<T> : ICommand
-    {
-        private readonly Func<T, Task> _execute;
-        private readonly Func<T, bool> _canExecute;
-        private long _isExecuting;
-
-        public RelayCommand(Func<T, Task> execute, Func<T, bool> canExecute = null)
-        {
-            _execute = execute;
-            _canExecute = canExecute ?? (o => true);
-        }
-
-        public event EventHandler CanExecuteChanged
-        {
-            add { CommandManager.RequerySuggested += value; }
-            remove { CommandManager.RequerySuggested -= value; }
-        }
-
-        public void RaiseCanExecuteChanged()
-        {
-            CommandManager.InvalidateRequerySuggested();
-        }
-
-        public bool CanExecute(object parameter)
-        {
-            if (Interlocked.Read(ref _isExecuting) != 0)
-                return false;
-
-            return _canExecute(parameter is T r ? r : default);
-        }
-
-        public async void Execute(object parameter)
-        {
-            Interlocked.Exchange(ref _isExecuting, 1);
-            RaiseCanExecuteChanged();
-
-            try
-            {
-                await _execute((T)parameter);
-            }
-            finally
-            {
-                Interlocked.Exchange(ref _isExecuting, 0);
-                RaiseCanExecuteChanged();
-            }
-        }
-    }
-}
diff --git a/examples/csharp/Genny/Genny/Utils/ShiftEnterBehavior.cs b/examples/csharp/Genny/Genny/Utils/ShiftEnterBehavior.cs
deleted file mode 100644
index 1d922c2fda..0000000000
--- a/examples/csharp/Genny/Genny/Utils/ShiftEnterBehavior.cs
+++ /dev/null
@@ -1,66 +0,0 @@
-﻿using System;
-using System.Windows;
-using System.Windows.Controls;
-using System.Windows.Input;
-
-namespace Genny.Utils
-{
-    /// <summary>
-    /// Behaviour to use Shift + Enfer to add a new line to a TextBox allowing IsDefault Commands to be fired on Enter
-    /// </summary>
-    public class ShiftEnterBehavior
-    {
-        public static readonly DependencyProperty EnableProperty =
-            DependencyProperty.RegisterAttached("Enable", typeof(bool), typeof(ShiftEnterBehavior), new PropertyMetadata(false, OnEnableChanged));
-
-        public static bool GetEnable(DependencyObject obj)
-        {
-            return (bool)obj.GetValue(EnableProperty);
-        }
-
-        public static void SetEnable(DependencyObject obj, bool value)
-        {
-            obj.SetValue(EnableProperty, value);
-        }
-
-        private static void OnEnableChanged(DependencyObject obj, DependencyPropertyChangedEventArgs e)
-        {
-            if (obj is TextBox textBox)
-            {
-                bool attach = (bool)e.NewValue;
-
-                if (attach)
-                {
-                    DataObject.AddPastingHandler(textBox, TextBox_OnPaste);
-                    textBox.PreviewKeyDown += TextBox_PreviewKeyDown;
-                }
-                else
-                {
-                    DataObject.RemovePastingHandler(textBox, TextBox_OnPaste);
-                    textBox.PreviewKeyDown -= TextBox_PreviewKeyDown;
-                }
-            }
-        }
-
-        private static void TextBox_PreviewKeyDown(object sender, KeyEventArgs e)
-        {
-            // If Shift + Enter is pressed append a new line
-            if (e.Key == Key.Enter && Keyboard.Modifiers == ModifierKeys.Shift && sender is TextBox textBox)
-            {
-                e.Handled = true;
-                textBox.AppendText(Environment.NewLine);
-                textBox.CaretIndex = textBox.Text.Length;
-            }
-        }
-
-        private static void TextBox_OnPaste(object sender, DataObjectPastingEventArgs e)
-        {
-            // Because AcceptsReturn is false we need to intercept paste to allow new lines
-            if (sender is TextBox textBox && e.DataObject.GetDataPresent(DataFormats.UnicodeText))
-            {
-                e.CancelCommand();
-                textBox.AppendText(e.DataObject.GetData(DataFormats.UnicodeText) as string);
-            }
-        }
-    }
-}
diff --git a/examples/csharp/Genny/Genny/ViewModel/ConfigurationModel.cs b/examples/csharp/Genny/Genny/ViewModel/ConfigurationModel.cs
deleted file mode 100644
index 5e78ff95b7..0000000000
--- a/examples/csharp/Genny/Genny/ViewModel/ConfigurationModel.cs
+++ /dev/null
@@ -1,13 +0,0 @@
-﻿using System.Text.Json.Serialization;
-
-namespace Genny.ViewModel
-{
-    public class ConfigurationModel
-    {
-        [JsonPropertyName("model")]
-        public ModelOptionsModel ModelOptions { get; set; }
-
-        [JsonPropertyName("search")]
-        public SearchOptionsModel SearchOptions { get; set; }
-    }
-}
diff --git a/examples/csharp/Genny/Genny/ViewModel/ModelOptionsModel.cs b/examples/csharp/Genny/Genny/ViewModel/ModelOptionsModel.cs
deleted file mode 100644
index bb7fc341d8..0000000000
--- a/examples/csharp/Genny/Genny/ViewModel/ModelOptionsModel.cs
+++ /dev/null
@@ -1,14 +0,0 @@
-﻿using System.Text.Json.Serialization;
-
-namespace Genny.ViewModel
-{
-    public class ModelOptionsModel
-    {
-        [JsonPropertyName("type")]
-        public string Type { get; set; }
-
-        [JsonPropertyName("context_length")]
-        public int ContextLength { get; set; }
-    }
-
-}
diff --git a/examples/csharp/Genny/Genny/ViewModel/ResultModel.cs b/examples/csharp/Genny/Genny/ViewModel/ResultModel.cs
deleted file mode 100644
index b51bd66db6..0000000000
--- a/examples/csharp/Genny/Genny/ViewModel/ResultModel.cs
+++ /dev/null
@@ -1,34 +0,0 @@
-﻿using System;
-using System.ComponentModel;
-using System.Runtime.CompilerServices;
-
-namespace Genny.ViewModel
-{
-    public class ResultModel : INotifyPropertyChanged
-    {
-        private string _content;
-        private bool _isUserInput;
-
-        public string Content
-        {
-            get { return _content; }
-            set { _content = value; NotifyPropertyChanged(); }
-        }
-
-        public bool IsUserInput
-        {
-            get { return _isUserInput; }
-            set { _isUserInput = value; NotifyPropertyChanged(); }
-        }
-
-        public DateTime Timestamp { get; } = DateTime.Now;
-
-        #region INotifyPropertyChanged
-        public event PropertyChangedEventHandler PropertyChanged;
-        public void NotifyPropertyChanged([CallerMemberName] string property = "")
-        {
-            PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(property));
-        }
-        #endregion
-    }
-}
\ No newline at end of file
diff --git a/examples/csharp/Genny/Genny/ViewModel/SearchOptionsModel.cs b/examples/csharp/Genny/Genny/ViewModel/SearchOptionsModel.cs
deleted file mode 100644
index 2fe6b3ab72..0000000000
--- a/examples/csharp/Genny/Genny/ViewModel/SearchOptionsModel.cs
+++ /dev/null
@@ -1,132 +0,0 @@
-﻿using System.ComponentModel;
-using System.Runtime.CompilerServices;
-using System.Text.Json.Serialization;
-
-namespace Genny.ViewModel
-{
-    public class SearchOptionsModel : INotifyPropertyChanged
-    {
-        private int _topK = 50;
-        private float _topP = 0.9f;
-        private float _temperature = 1;
-        private float _repetitionPenalty = 1;
-        private bool _pastPresentShareBuffer = false;
-        private int _numReturnSequences = 1;
-        private int _numBeams = 1;
-        private int _noRepeatNgramSize = 0;
-        private int _minLength = 0;
-        private int _maxLength = 200;
-        private float _lengthPenalty = 1;
-        private bool _earlyStopping = true;
-        private bool _doSample = false;
-        private float _diversityPenalty = 0;
-
-        [JsonPropertyName("top_k")]
-        public int TopK
-        {
-            get { return _topK; }
-            set { _topK = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("top_p")]
-        public float TopP
-        {
-            get { return _topP; }
-            set { _topP = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("temperature")]
-        public float Temperature
-        {
-            get { return _temperature; }
-            set { _temperature = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("repetition_penalty")]
-        public float RepetitionPenalty
-        {
-            get { return _repetitionPenalty; }
-            set { _repetitionPenalty = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("past_present_share_buffer")]
-        public bool PastPresentShareBuffer
-        {
-            get { return _pastPresentShareBuffer; }
-            set { _pastPresentShareBuffer = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("num_return_sequences")]
-        public int NumReturnSequences
-        {
-            get { return _numReturnSequences; }
-            set { _numReturnSequences = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("num_beams")]
-        public int NumBeams
-        {
-            get { return _numBeams; }
-            set { _numBeams = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("no_repeat_ngram_size")]
-        public int NoRepeatNgramSize
-        {
-            get { return _noRepeatNgramSize; }
-            set { _noRepeatNgramSize = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("min_length")]
-        public int MinLength
-        {
-            get { return _minLength; }
-            set { _minLength = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("max_length")]
-        public int MaxLength
-        {
-            get { return _maxLength; }
-            set { _maxLength = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("length_penalty")]
-        public float LengthPenalty
-        {
-            get { return _lengthPenalty; }
-            set { _lengthPenalty = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("diversity_penalty")]
-        public float DiversityPenalty
-        {
-            get { return _diversityPenalty; }
-            set { _diversityPenalty = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("early_stopping")]
-        public bool EarlyStopping
-        {
-            get { return _earlyStopping; }
-            set { _earlyStopping = value; NotifyPropertyChanged(); }
-        }
-
-        [JsonPropertyName("do_sample")]
-        public bool DoSample
-        {
-            get { return _doSample; }
-            set { _doSample = value; NotifyPropertyChanged(); }
-        }
-
-     
-
-        #region INotifyPropertyChanged
-        public event PropertyChangedEventHandler PropertyChanged;
-        public void NotifyPropertyChanged([CallerMemberName] string property = "")
-        {
-            PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(property));
-        }
-        #endregion
-    }
-}
\ No newline at end of file
diff --git a/examples/csharp/Genny/Genny/ViewModel/TokenModel.cs b/examples/csharp/Genny/Genny/ViewModel/TokenModel.cs
deleted file mode 100644
index a87d9a50e1..0000000000
--- a/examples/csharp/Genny/Genny/ViewModel/TokenModel.cs
+++ /dev/null
@@ -1,4 +0,0 @@
-﻿namespace Genny.ViewModel
-{
-    public record TokenModel(int Id, string Content);
-}
\ No newline at end of file
diff --git a/examples/csharp/Genny/Genny/Views/StatefulView.xaml b/examples/csharp/Genny/Genny/Views/StatefulView.xaml
deleted file mode 100644
index 71473e6d89..0000000000
--- a/examples/csharp/Genny/Genny/Views/StatefulView.xaml
+++ /dev/null
@@ -1,80 +0,0 @@
-﻿<UserControl x:Class="Genny.Views.StatefulView"
-             xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
-             xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-             xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" 
-             xmlns:d="http://schemas.microsoft.com/expression/blend/2008" 
-             xmlns:local="clr-namespace:Genny.Views"
-             xmlns:scm="clr-namespace:System.ComponentModel;assembly=WindowsBase"
-             xmlns:utils="clr-namespace:Genny.Utils"
-             Name="UI"
-             mc:Ignorable="d" 
-             d:DesignHeight="450" d:DesignWidth="800">
-    <DockPanel DataContext="{Binding ElementName=UI}">
-
-    <!--Input Controls-->
-        <DockPanel DockPanel.Dock="Bottom" Height="100" Margin="2">
-
-            <!--Buttons-->
-            <DockPanel DockPanel.Dock="Right">
-                <DockPanel>
-                    <UniformGrid Columns="2" DockPanel.Dock="Bottom" Height="30" Width="100">
-                        <Button Content="Clear" Command="{Binding ClearCommand}"/>
-                        <Button Content="Cancel" Command="{Binding CancelCommand}"/>
-                    </UniformGrid>
-                    <Button Content="Send" Command="{Binding GenerateCommand}" IsDefault="True"/>
-                </DockPanel>
-            </DockPanel>
-
-            <!--Prompt-->
-            <TextBox Text="{Binding Prompt, UpdateSourceTrigger=PropertyChanged}" utils:ShiftEnterBehavior.Enable="True"/>
-
-        </DockPanel>
-
-
-        <!--Result List-->
-        <Border BorderBrush="Gainsboro" BorderThickness="1" Margin="2">
-            <ScrollViewer CanContentScroll="False" utils:AutoScrollBehavior.AutoScroll="True" HorizontalScrollBarVisibility="Disabled" >
-                <ItemsControl ItemsSource="{Binding}" ScrollViewer.HorizontalScrollBarVisibility="Disabled" ScrollViewer.VerticalScrollBarVisibility="Disabled">
-                    <ItemsControl.DataContext>
-                        <CollectionViewSource Source="{Binding ResultHistory, ElementName=UI}">
-                            <CollectionViewSource.SortDescriptions>
-                                <scm:SortDescription PropertyName="Timestamp" Direction="Ascending" />
-                            </CollectionViewSource.SortDescriptions>
-                        </CollectionViewSource>
-                    </ItemsControl.DataContext>
-                    <ItemsControl.ItemTemplate>
-                        <DataTemplate>
-                            <Border BorderBrush="Gainsboro" BorderThickness="1" Margin="2">
-                                <DockPanel>
-                                    <DockPanel DockPanel.Dock="Left" Margin="10">
-                                        <Border BorderBrush="Gainsboro" BorderThickness="1" Margin="2" VerticalAlignment="Top">
-                                            <Image Width="60" Height="60" RenderOptions.BitmapScalingMode="Fant">
-                                                <Image.Style>
-                                                    <Style TargetType="{x:Type Image}">
-                                                        <Setter Property="Source" Value="{StaticResource ImageAvatarRobot}" />
-                                                        <Style.Triggers>
-                                                            <DataTrigger Binding="{Binding IsUserInput}" Value="True">
-                                                                <Setter Property="Source" Value="{StaticResource ImageAvatarUser}" />
-                                                            </DataTrigger>
-                                                        </Style.Triggers>
-                                                    </Style>
-                                                </Image.Style>
-                                            </Image>
-                                        </Border>
-                                    </DockPanel>
-                                    <DockPanel DockPanel.Dock="Bottom">
-                                        <TextBlock Text="{Binding Timestamp}" HorizontalAlignment="Right" Margin="0,0,4,2" />
-                                    </DockPanel>
-                                    <DockPanel Margin="5">
-                                        <TextBox Text="{Binding Content}" TextWrapping="Wrap" Style="{x:Null}" BorderThickness="0" IsReadOnly="True" />
-                                    </DockPanel>
-                                </DockPanel>
-                            </Border>
-                        </DataTemplate>
-                    </ItemsControl.ItemTemplate>
-                </ItemsControl>
-            </ScrollViewer>
-        </Border>
-
-    </DockPanel>
-</UserControl>
diff --git a/examples/csharp/Genny/Genny/Views/StatefulView.xaml.cs b/examples/csharp/Genny/Genny/Views/StatefulView.xaml.cs
deleted file mode 100644
index d399d30055..0000000000
--- a/examples/csharp/Genny/Genny/Views/StatefulView.xaml.cs
+++ /dev/null
@@ -1,196 +0,0 @@
-﻿using Genny.Utils;
-using Genny.ViewModel;
-using Microsoft.ML.OnnxRuntimeGenAI;
-using System;
-using System.Collections.Generic;
-using System.Collections.ObjectModel;
-using System.ComponentModel;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Threading;
-using System.Threading.Tasks;
-using System.Windows;
-using System.Windows.Controls;
-
-namespace Genny.Views
-{
-    /// <summary>
-    /// Interaction logic for StatefulView.xaml
-    /// </summary>
-    public partial class StatefulView : UserControl, INotifyPropertyChanged
-    {
-        private string _prompt;
-        private readonly List<int> _pastTokens;
-        private CancellationTokenSource _cancellationTokenSource;
-
-        public StatefulView()
-        {
-            _pastTokens = new List<int>();
-            ClearCommand = new RelayCommand(ClearAsync);
-            CancelCommand = new RelayCommand(CancelAsync);
-            GenerateCommand = new RelayCommand(GenerateAsync, CanExecuteGenerate);
-            ResultHistory = new ObservableCollection<ResultModel>();
-            InitializeComponent();
-        }
-
-        public static readonly DependencyProperty ModelProperty =
-          DependencyProperty.Register(nameof(Model), typeof(Model), typeof(StatefulView));
-
-        public static readonly DependencyProperty TokenizerProperty =
-            DependencyProperty.Register(nameof(Tokenizer), typeof(Tokenizer), typeof(StatefulView));
-
-        public static readonly DependencyProperty ModelOptionsProperty =
-            DependencyProperty.Register(nameof(ModelOptions), typeof(ModelOptionsModel), typeof(StatefulView));
-
-        public static readonly DependencyProperty SearchOptionsProperty =
-            DependencyProperty.Register(nameof(SearchOptions), typeof(SearchOptionsModel), typeof(StatefulView));
-
-        public RelayCommand ClearCommand { get; }
-        public RelayCommand CancelCommand { get; }
-        public RelayCommand GenerateCommand { get; }
-        public ResultModel CurrentResult { get; set; }
-        public ObservableCollection<ResultModel> ResultHistory { get; }
-
-        public Model Model
-        {
-            get { return (Model)GetValue(ModelProperty); }
-            set { SetValue(ModelProperty, value); }
-        }
-
-        public Tokenizer Tokenizer
-        {
-            get { return (Tokenizer)GetValue(TokenizerProperty); }
-            set { SetValue(TokenizerProperty, value); }
-        }
-
-        public ModelOptionsModel ModelOptions
-        {
-            get { return (ModelOptionsModel)GetValue(ModelOptionsProperty); }
-            set { SetValue(ModelOptionsProperty, value); }
-        }
-
-        public SearchOptionsModel SearchOptions
-        {
-            get { return (SearchOptionsModel)GetValue(SearchOptionsProperty); }
-            set { SetValue(SearchOptionsProperty, value); }
-        }
-
-        public string Prompt
-        {
-            get { return _prompt; }
-            set { _prompt = value; NotifyPropertyChanged(); }
-        }
-
-
-        private async Task GenerateAsync()
-        {
-            try
-            {
-                var userInput = new ResultModel
-                {
-                    Content = Prompt,
-                    IsUserInput = true
-                };
-
-                Prompt = null;
-                CurrentResult = null;
-                ResultHistory.Add(userInput);
-                _cancellationTokenSource = new CancellationTokenSource();
-                await foreach (var sentencePiece in RunInferenceAsync(userInput.Content, _cancellationTokenSource.Token))
-                {
-                    if (CurrentResult == null)
-                    {
-                        if (string.IsNullOrWhiteSpace(sentencePiece.Content)) // Ingore preceding '\n'
-                            continue;
-
-                        ResultHistory.Add(CurrentResult = new ResultModel());
-                    }
-                    CurrentResult.Content += sentencePiece.Content;
-                }
-            }
-            catch (OperationCanceledException)
-            {
-                CurrentResult.Content += "\n\n[Operation Canceled]";
-            }
-            catch (Exception ex)
-            {
-                MessageBox.Show(ex.Message, "Inference Error", MessageBoxButton.OK, MessageBoxImage.Error);
-            }
-        }
-
-
-        private bool CanExecuteGenerate()
-        {
-            return !string.IsNullOrWhiteSpace(Prompt);
-        }
-
-
-        private Task CancelAsync()
-        {
-            _cancellationTokenSource?.Cancel();
-            return Task.CompletedTask;
-        }
-
-
-        private Task ClearAsync()
-        {
-            _pastTokens.Clear();
-            ResultHistory.Clear();
-            return Task.CompletedTask;
-        }
-
-
-        private async IAsyncEnumerable<TokenModel> RunInferenceAsync(string prompt, [EnumeratorCancellation] CancellationToken cancellationToken)
-        {
-            var sequences = await Tokenizer.EncodeAsync(prompt, cancellationToken);
-
-            // Add Tokens to history
-            AddPastTokens(sequences);
-
-            using var generatorParams = new GeneratorParams(Model);
-            generatorParams.ApplySearchOptions(SearchOptions);
-
-            // max_length is per message, so increment max_length for next call
-            var newMaxLength = Math.Min(_pastTokens.Count + SearchOptions.MaxLength, ModelOptions.ContextLength);
-            generatorParams.SetSearchOption("max_length", newMaxLength); 
-
-            generatorParams.SetInputIDs(CollectionsMarshal.AsSpan(_pastTokens), (ulong)_pastTokens.Count, 1);
-
-            using var tokenizerStream = Tokenizer.CreateStream();
-            using var generator = new Generator(Model, generatorParams);
-            while (!generator.IsDone())
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-
-                yield return await Task.Run(() =>
-                {
-                    generator.ComputeLogits();
-                    generator.GenerateNextToken();
-
-                    var tokenId = generator.GetSequence(0)[^1];
-                    return new TokenModel(tokenId, tokenizerStream.Decode(tokenId));
-                }, cancellationToken);
-            }
-        }
-
-
-        private void AddPastTokens(Sequences sequences)
-        {
-            _pastTokens.AddRange(sequences[0].ToArray());
-
-            // Only keep (context_length - max_length) worth of history
-            while (_pastTokens.Count > ModelOptions.ContextLength - SearchOptions.MaxLength)
-            {
-                _pastTokens.RemoveAt(0);
-            }
-        }
-
-        #region INotifyPropertyChanged
-        public event PropertyChangedEventHandler PropertyChanged;
-        public void NotifyPropertyChanged([CallerMemberName] string property = "")
-        {
-            PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(property));
-        }
-        #endregion
-    }
-}
diff --git a/examples/csharp/Genny/Genny/Views/StatelessView.xaml b/examples/csharp/Genny/Genny/Views/StatelessView.xaml
deleted file mode 100644
index b36b103bf5..0000000000
--- a/examples/csharp/Genny/Genny/Views/StatelessView.xaml
+++ /dev/null
@@ -1,80 +0,0 @@
-﻿<UserControl x:Class="Genny.Views.StatelessView"
-             xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
-             xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-             xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" 
-             xmlns:d="http://schemas.microsoft.com/expression/blend/2008" 
-             xmlns:local="clr-namespace:Genny.Views"
-             xmlns:scm="clr-namespace:System.ComponentModel;assembly=WindowsBase"
-             xmlns:utils="clr-namespace:Genny.Utils"
-             Name="UI"
-             mc:Ignorable="d" 
-             d:DesignHeight="450" d:DesignWidth="800">
-    <DockPanel DataContext="{Binding ElementName=UI}">
-
-    <!--Input Controls-->
-        <DockPanel DockPanel.Dock="Bottom" Height="100" Margin="2">
-
-            <!--Buttons-->
-            <DockPanel DockPanel.Dock="Right">
-                <DockPanel>
-                    <UniformGrid Columns="2" DockPanel.Dock="Bottom" Height="30" Width="100">
-                        <Button Content="Clear" Command="{Binding ClearCommand}"/>
-                        <Button Content="Cancel" Command="{Binding CancelCommand}"/>
-                    </UniformGrid>
-                    <Button Content="Send" Command="{Binding GenerateCommand}" IsDefault="True"/>
-                </DockPanel>
-            </DockPanel>
-
-            <!--Prompt-->
-            <TextBox Text="{Binding Prompt, UpdateSourceTrigger=PropertyChanged}" utils:ShiftEnterBehavior.Enable="True"/>
-
-        </DockPanel>
-
-
-        <!--Result List-->
-        <Border BorderBrush="Gainsboro" BorderThickness="1" Margin="2">
-            <ScrollViewer CanContentScroll="False" utils:AutoScrollBehavior.AutoScroll="True" HorizontalScrollBarVisibility="Disabled" >
-                <ItemsControl ItemsSource="{Binding}" ScrollViewer.HorizontalScrollBarVisibility="Disabled" ScrollViewer.VerticalScrollBarVisibility="Disabled">
-                    <ItemsControl.DataContext>
-                        <CollectionViewSource Source="{Binding ResultHistory, ElementName=UI}">
-                            <CollectionViewSource.SortDescriptions>
-                                <scm:SortDescription PropertyName="Timestamp" Direction="Ascending" />
-                            </CollectionViewSource.SortDescriptions>
-                        </CollectionViewSource>
-                    </ItemsControl.DataContext>
-                    <ItemsControl.ItemTemplate>
-                        <DataTemplate>
-                            <Border BorderBrush="Gainsboro" BorderThickness="1" Margin="2">
-                                <DockPanel>
-                                    <DockPanel DockPanel.Dock="Left" Margin="10">
-                                        <Border BorderBrush="Gainsboro" BorderThickness="1" Margin="2" VerticalAlignment="Top">
-                                            <Image Width="60" Height="60" RenderOptions.BitmapScalingMode="Fant">
-                                                <Image.Style>
-                                                    <Style TargetType="{x:Type Image}">
-                                                        <Setter Property="Source" Value="{StaticResource ImageAvatarRobot}" />
-                                                        <Style.Triggers>
-                                                            <DataTrigger Binding="{Binding IsUserInput}" Value="True">
-                                                                <Setter Property="Source" Value="{StaticResource ImageAvatarUser}" />
-                                                            </DataTrigger>
-                                                        </Style.Triggers>
-                                                    </Style>
-                                                </Image.Style>
-                                            </Image>
-                                        </Border>
-                                    </DockPanel>
-                                    <DockPanel DockPanel.Dock="Bottom">
-                                        <TextBlock Text="{Binding Timestamp}" HorizontalAlignment="Right" Margin="0,0,4,2" />
-                                    </DockPanel>
-                                    <DockPanel Margin="5">
-                                        <TextBox Text="{Binding Content}" TextWrapping="Wrap" Style="{x:Null}" BorderThickness="0" IsReadOnly="True" />
-                                    </DockPanel>
-                                </DockPanel>
-                            </Border>
-                        </DataTemplate>
-                    </ItemsControl.ItemTemplate>
-                </ItemsControl>
-            </ScrollViewer>
-        </Border>
-
-    </DockPanel>
-</UserControl>
diff --git a/examples/csharp/Genny/Genny/Views/StatelessView.xaml.cs b/examples/csharp/Genny/Genny/Views/StatelessView.xaml.cs
deleted file mode 100644
index c0227fbf10..0000000000
--- a/examples/csharp/Genny/Genny/Views/StatelessView.xaml.cs
+++ /dev/null
@@ -1,171 +0,0 @@
-﻿using Genny.Utils;
-using Genny.ViewModel;
-using Microsoft.ML.OnnxRuntimeGenAI;
-using System;
-using System.Collections.Generic;
-using System.Collections.ObjectModel;
-using System.ComponentModel;
-using System.Runtime.CompilerServices;
-using System.Threading;
-using System.Threading.Tasks;
-using System.Windows;
-using System.Windows.Controls;
-
-namespace Genny.Views
-{
-    /// <summary>
-    /// Interaction logic for StatelessView.xaml
-    /// </summary>
-    public partial class StatelessView : UserControl, INotifyPropertyChanged
-    {
-        private string _prompt;
-        private CancellationTokenSource _cancellationTokenSource;
-
-        public StatelessView()
-        {
-            ClearCommand = new RelayCommand(ClearAsync);
-            CancelCommand = new RelayCommand(CancelAsync);
-            GenerateCommand = new RelayCommand(GenerateAsync, CanExecuteGenerate);
-            ResultHistory = new ObservableCollection<ResultModel>();
-            InitializeComponent();
-        }
-
-        public static readonly DependencyProperty ModelProperty =
-          DependencyProperty.Register(nameof(Model), typeof(Model), typeof(StatelessView));
-
-        public static readonly DependencyProperty TokenizerProperty =
-            DependencyProperty.Register(nameof(Tokenizer), typeof(Tokenizer), typeof(StatelessView));
-
-        public static readonly DependencyProperty ModelOptionsProperty =
-            DependencyProperty.Register(nameof(ModelOptions), typeof(ModelOptionsModel), typeof(StatelessView));
-
-        public static readonly DependencyProperty SearchOptionsProperty =
-            DependencyProperty.Register(nameof(SearchOptions), typeof(SearchOptionsModel), typeof(StatelessView));
-
-        public RelayCommand ClearCommand { get; }
-        public RelayCommand CancelCommand { get; }
-        public RelayCommand GenerateCommand { get; }
-        public ResultModel CurrentResult { get; set; }
-        public ObservableCollection<ResultModel> ResultHistory { get; }
-
-        public Model Model
-        {
-            get { return (Model)GetValue(ModelProperty); }
-            set { SetValue(ModelProperty, value); }
-        }
-
-        public Tokenizer Tokenizer
-        {
-            get { return (Tokenizer)GetValue(TokenizerProperty); }
-            set { SetValue(TokenizerProperty, value); }
-        }
-
-        public ModelOptionsModel ModelOptions
-        {
-            get { return (ModelOptionsModel)GetValue(ModelOptionsProperty); }
-            set { SetValue(ModelOptionsProperty, value); }
-        }
-
-        public SearchOptionsModel SearchOptions
-        {
-            get { return (SearchOptionsModel)GetValue(SearchOptionsProperty); }
-            set { SetValue(SearchOptionsProperty, value); }
-        }
-
-        public string Prompt
-        {
-            get { return _prompt; }
-            set { _prompt = value; NotifyPropertyChanged(); }
-        }
-
-
-        private async Task GenerateAsync()
-        {
-            try
-            {
-                var userInput = new ResultModel
-                {
-                    Content = Prompt,
-                    IsUserInput = true
-                };
-
-                Prompt = null;
-                CurrentResult = null;
-                ResultHistory.Add(userInput);
-                _cancellationTokenSource = new CancellationTokenSource();
-                await foreach (var sentencePiece in RunInferenceAsync(userInput.Content, _cancellationTokenSource.Token))
-                {
-                    if (CurrentResult == null)
-                    {
-                        if (string.IsNullOrWhiteSpace(sentencePiece.Content)) // Ingore preceding '\n'
-                            continue;
-
-                        ResultHistory.Add(CurrentResult = new ResultModel());
-                    }
-                    CurrentResult.Content += sentencePiece.Content;
-                }
-            }
-            catch (OperationCanceledException)
-            {
-                CurrentResult.Content += "\n\n[Operation Canceled]";
-            }
-            catch (Exception ex)
-            {
-                MessageBox.Show(ex.Message, "Inference Error", MessageBoxButton.OK, MessageBoxImage.Error);
-            }
-        }
-
-
-        private bool CanExecuteGenerate()
-        {
-            return !string.IsNullOrWhiteSpace(Prompt);
-        }
-
-
-        private Task CancelAsync()
-        {
-            _cancellationTokenSource?.Cancel();
-            return Task.CompletedTask;
-        }
-
-
-        private Task ClearAsync()
-        {
-            ResultHistory.Clear();
-            return Task.CompletedTask;
-        }
-
-        private async IAsyncEnumerable<TokenModel> RunInferenceAsync(string prompt, [EnumeratorCancellation] CancellationToken cancellationToken)
-        {
-            var sequences = await Tokenizer.EncodeAsync($"<|user|>{prompt}<|end|><|assistant|>", cancellationToken);
-
-            using var generatorParams = new GeneratorParams(Model);
-            generatorParams.ApplySearchOptions(SearchOptions);
-            generatorParams.SetInputSequences(sequences);
-
-            using var tokenizerStream = Tokenizer.CreateStream();
-            using var generator = new Generator(Model, generatorParams);
-            while (!generator.IsDone())
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-
-                yield return await Task.Run(() =>
-                {
-                    generator.ComputeLogits();
-                    generator.GenerateNextToken();
-
-                    var tokenId = generator.GetSequence(0)[^1];
-                    return new TokenModel(tokenId, tokenizerStream.Decode(tokenId));
-                }, cancellationToken);
-            }
-        }
-
-        #region INotifyPropertyChanged
-        public event PropertyChangedEventHandler PropertyChanged;
-        public void NotifyPropertyChanged([CallerMemberName] string property = "")
-        {
-            PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(property));
-        }
-        #endregion
-    }
-}
diff --git a/examples/csharp/Genny/Genny/Views/TokenizerView.xaml b/examples/csharp/Genny/Genny/Views/TokenizerView.xaml
deleted file mode 100644
index b69e64dee6..0000000000
--- a/examples/csharp/Genny/Genny/Views/TokenizerView.xaml
+++ /dev/null
@@ -1,33 +0,0 @@
-﻿<UserControl x:Class="Genny.Views.TokenizerView"
-             xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
-             xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
-             xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" 
-             xmlns:d="http://schemas.microsoft.com/expression/blend/2008" 
-             xmlns:local="clr-namespace:Genny.Views"
-             Name="UI"
-             mc:Ignorable="d" 
-             d:DesignHeight="450" d:DesignWidth="800">
-    <UniformGrid DataContext="{Binding ElementName=UI}" Rows="2">
-
-        <!--Encode-->
-        <DockPanel Grid.Row="0" Margin="10">
-            <TextBlock DockPanel.Dock="Top" Text="Encode" />
-            <Button DockPanel.Dock="Bottom" HorizontalAlignment="Right" Content="Encode" Command="{Binding EncodeCommand}" CommandParameter="{Binding Text, ElementName=EncodeInputTextbox, UpdateSourceTrigger=PropertyChanged}" Margin="0,4,0,0" Padding="3"/>
-            <UniformGrid Rows="2">
-                <TextBox x:Name="EncodeInputTextbox" TextWrapping="Wrap"/>
-                <TextBox Text="{Binding EncodeResult}" IsReadOnly="True" TextWrapping="Wrap" BorderThickness="1,0,1,1"/>
-            </UniformGrid>
-        </DockPanel>
-
-        <!--Decode-->
-        <DockPanel Grid.Row="1" Margin="10">
-            <TextBlock DockPanel.Dock="Top" Text="Decode" />
-            <Button DockPanel.Dock="Bottom" HorizontalAlignment="Right" Content="Decode" Command="{Binding DecodeCommand}" CommandParameter="{Binding Text, ElementName=DecodeInputTextbox, UpdateSourceTrigger=PropertyChanged}" Margin="0,4,0,0" Padding="3"/>
-            <UniformGrid Rows="2">
-                <TextBox x:Name="DecodeInputTextbox" TextWrapping="Wrap"/>
-                <TextBox Text="{Binding DecodeResult}" IsReadOnly="True" TextWrapping="Wrap" BorderThickness="1,0,1,1"/>
-            </UniformGrid>
-        </DockPanel>
-
-    </UniformGrid>
-</UserControl>
diff --git a/examples/csharp/Genny/Genny/Views/TokenizerView.xaml.cs b/examples/csharp/Genny/Genny/Views/TokenizerView.xaml.cs
deleted file mode 100644
index a6b488fa6d..0000000000
--- a/examples/csharp/Genny/Genny/Views/TokenizerView.xaml.cs
+++ /dev/null
@@ -1,93 +0,0 @@
-﻿using Genny.Utils;
-using Microsoft.ML.OnnxRuntimeGenAI;
-using System;
-using System.ComponentModel;
-using System.Linq;
-using System.Runtime.CompilerServices;
-using System.Threading.Tasks;
-using System.Windows;
-using System.Windows.Controls;
-
-namespace Genny.Views
-{
-    /// <summary>
-    /// Interaction logic for TokenizerView.xaml
-    /// </summary>
-    public partial class TokenizerView : UserControl, INotifyPropertyChanged
-    {
-        private string _encodeResult;
-        private string _decodeResult;
-
-        public TokenizerView()
-        {
-            EncodeCommand = new RelayCommand<string>(EncodeAsync);
-            DecodeCommand = new RelayCommand<string>(DecodeAsync);
-            InitializeComponent();
-        }
-
-        public static readonly DependencyProperty TokenizerProperty =
-           DependencyProperty.Register(nameof(Tokenizer), typeof(Tokenizer), typeof(TokenizerView));
-
-        public RelayCommand<string> EncodeCommand { get; }
-        public RelayCommand<string> DecodeCommand { get; }
-
-        public Tokenizer Tokenizer
-        {
-            get { return (Tokenizer)GetValue(TokenizerProperty); }
-            set { SetValue(TokenizerProperty, value); }
-        }
-
-        public string EncodeResult
-        {
-            get { return _encodeResult; }
-            set { _encodeResult = value; NotifyPropertyChanged(); }
-        }
-
-        public string DecodeResult
-        {
-            get { return _decodeResult; }
-            set { _decodeResult = value; NotifyPropertyChanged(); }
-        }
-
-
-        private async Task EncodeAsync(string input)
-        {
-            EncodeResult = null;
-            try
-            {
-                var sequences = await Tokenizer.EncodeAsync(input);
-                EncodeResult = string.Join(", ", sequences[0].ToArray());
-            }
-            catch (Exception ex)
-            {
-                MessageBox.Show(ex.Message, "Tokenizer Encode Error", MessageBoxButton.OK, MessageBoxImage.Error);
-            }
-        }
-
-
-        private async Task DecodeAsync(string input)
-        {
-            DecodeResult = null;
-            try
-            {
-                var intArray = input
-                     .Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)
-                     .Select(int.Parse)
-                     .ToArray();
-                DecodeResult = await Tokenizer.DecodeAsync(intArray);
-            }
-            catch (Exception ex)
-            {
-                MessageBox.Show(ex.Message, "Tokenizer Decode Error", MessageBoxButton.OK, MessageBoxImage.Error);
-            }
-        }
-
-        #region INotifyPropertyChanged
-        public event PropertyChangedEventHandler PropertyChanged;
-        public void NotifyPropertyChanged([CallerMemberName] string property = "")
-        {
-            PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(property));
-        }
-        #endregion
-    }
-}
diff --git a/examples/csharp/Genny/README.md b/examples/csharp/Genny/README.md
deleted file mode 100644
index 3ea42c2425..0000000000
--- a/examples/csharp/Genny/README.md
+++ /dev/null
@@ -1,55 +0,0 @@
-## Genny
-A example UI for debugging and testing models with OnnxRuntime-GenAI
-
-|   |  |
-| :--- | :--- |
-<img src="Assets/Screenshot1.PNG" /> | <img src="Assets/Screenshot2.PNG" />
-
-______________________
-
-## Run Genny
-* Open `Genny.sln` in VisualStudio and run `Debug` or `Release` to launch the application
-* Enter or Select your model folder path
-* Click Load Model (this may take a few minutes)
-
-
-## CPU or GPU
-* `Debug` or `Release` to launch the application with CPU support
-* `Debug_Cuda` or `Release_Cuda`  to launch the application with CUDA GPU support
-* `Debug_DirectML` or `Release_DirectML`  to launch the application with DirectML GPU support
-
-
-## Models
-You can generate the model using the ONNX Runtime Generative AI model builder, or bring your own model.
-
-To generate the model with model builder:
-
-1. Install the python package
-
-   Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install).
-
-2. Install the model builder script dependencies
-
-   ```bash
-   pip install numpy
-   pip install transformers
-   pip install torch
-   pip install onnx
-   pip install onnxruntime
-   ```
-   
-3. Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../../src/python/py/models/README.md)
-
-   ```bash
-   python -m onnxruntime_genai.models.builder -m models/phi-2 -e cpu -p int4 -o models/phi2-int4
-   ```
-
-The model builder also generates the configuration needed by the API to run generation. You can modify the config according to your scenario.
-
-If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config).
-
-The Phi-3 ONNX models are hosted [here](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) in a collection on Hugging Face.
-
-   ```bash
-   huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include directml/* --local-dir .
-   ```
\ No newline at end of file
diff --git a/examples/csharp/HelloPhi/Program.cs b/examples/csharp/HelloPhi/Program.cs
deleted file mode 100644
index e2f64dfc8f..0000000000
--- a/examples/csharp/HelloPhi/Program.cs
+++ /dev/null
@@ -1,204 +0,0 @@
-﻿// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-using Microsoft.ML.OnnxRuntimeGenAI;
-
-void PrintUsage()
-{
-    Console.WriteLine("Usage:");
-    Console.WriteLine("  -m model_path");
-    Console.WriteLine("\t\t\t\tPath to the model");
-    Console.WriteLine("  -e execution_provider");
-    Console.WriteLine("\t\t\t\tExecution provider to run the model");
-    Console.WriteLine("  --non-interactive (optional)");
-    Console.WriteLine("\t\t\t\tInteractive mode");
-}
-
-using OgaHandle ogaHandle = new OgaHandle();
-
-if (args.Length < 1)
-{
-    PrintUsage();
-    Environment.Exit(-1);
-}
-
-bool interactive = true;
-string modelPath = string.Empty;
-string executionProvider = string.Empty;
-
-uint i = 0;
-while (i < args.Length)
-{
-    var arg = args[i];
-    if (arg == "--non-interactive")
-    {
-        interactive = false;
-    }
-    else if (arg == "-m")
-    {
-        if (i + 1 < args.Length)
-        {
-            modelPath = Path.Combine(args[i+1]);
-        }
-    }
-    else if (arg == "-e")
-    {
-        if (i + 1 < args.Length)
-        {
-            executionProvider = Path.Combine(args[i+1]);
-        }
-    }
-    i++;
-}
-
-if (string.IsNullOrEmpty(modelPath))
-{
-    throw new Exception("Model path must be specified");
-}
-if (string.IsNullOrEmpty(executionProvider))
-{
-    throw new Exception("Execution provider must be specified");
-}
-
-Console.WriteLine("-------------");
-Console.WriteLine("Hello, Phi!");
-Console.WriteLine("-------------");
-
-Console.WriteLine("Model path: " + modelPath);
-Console.WriteLine("Execution provider: " + executionProvider);
-Console.WriteLine("Interactive: " + interactive);
-
-using Config config = new Config(modelPath);
-config.ClearProviders();
-if (executionProvider != "cpu") {
-    config.AppendProvider(executionProvider);
-    if (executionProvider == "cuda") {
-        config.SetProviderOption(executionProvider, "enable_cuda_graph", "0");
-    }
-}
-using Model model = new Model(config);
-using Tokenizer tokenizer = new Tokenizer(model);
-
-var option = 2;
-if (interactive)
-{
-    Console.WriteLine("Please enter option number:");
-    Console.WriteLine("1. Complete Q&A");
-    Console.WriteLine("2. Streaming Q&A");
-    Console.WriteLine("3. Streaming Chat (not supported for DirectML and QNN currently)");
-    int.TryParse(Console.ReadLine(), out option);
-}
-
-int minLength = 50;
-int maxLength = 500;
-
-static string GetPrompt(bool interactive)
-{
-    string prompt = "def is_prime(num):"; // Example prompt
-    if (interactive)
-    {
-        Console.WriteLine("Prompt: (Use quit() to exit)");
-        prompt = Console.ReadLine();
-    }
-    return prompt;
-}
-
-if (option == 1 || option == 2)
-{
-    do
-    {
-        string prompt = GetPrompt(interactive);
-        if (string.IsNullOrEmpty(prompt))
-        {
-            continue;
-        }
-        if (string.Compare(prompt, "quit()", StringComparison.OrdinalIgnoreCase) == 0)
-        {
-            break;
-        }
-        string messages = $@"[{{""role"":""system"",""content"":""You are a helpful AI assistant.""}},{{""role"":""user"",""content"":""{prompt}""}}]";
-        var sequences = tokenizer.Encode(tokenizer.ApplyChatTemplate("", messages, "", true));
-
-        if (option == 1) // Complete Output
-        {
-            using GeneratorParams generatorParams = new GeneratorParams(model);
-            generatorParams.SetSearchOption("min_length", minLength);
-            generatorParams.SetSearchOption("max_length", maxLength);
-            using var generator = new Generator(model, generatorParams);
-            generator.AppendTokenSequences(sequences);
-            var watch = System.Diagnostics.Stopwatch.StartNew();
-            while (!generator.IsDone())
-            {
-                generator.GenerateNextToken();
-            }
-
-            var outputSequence = generator.GetSequence(0);
-            var outputString = tokenizer.Decode(outputSequence);
-            watch.Stop();
-            var runTimeInSeconds = watch.Elapsed.TotalSeconds;
-            Console.WriteLine("Output:");
-            Console.WriteLine(outputString);
-            var totalTokens = outputSequence.Length;
-            Console.WriteLine($"Tokens: {totalTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
-        }
-
-        else if (option == 2) //Streaming Output
-        {
-            using GeneratorParams generatorParams = new GeneratorParams(model);
-            generatorParams.SetSearchOption("min_length", minLength);
-            generatorParams.SetSearchOption("max_length", maxLength);
-            using var tokenizerStream = tokenizer.CreateStream();
-            using var generator = new Generator(model, generatorParams);
-            generator.AppendTokenSequences(sequences);
-            var watch = System.Diagnostics.Stopwatch.StartNew();
-            while (!generator.IsDone())
-            {
-                generator.GenerateNextToken();
-                Console.Write(tokenizerStream.Decode(generator.GetNextTokens()[0]));
-            }
-            Console.WriteLine();
-            watch.Stop();
-            var runTimeInSeconds = watch.Elapsed.TotalSeconds;
-            var outputSequence = generator.GetSequence(0);
-            var totalTokens = outputSequence.Length;
-            Console.WriteLine($"Streaming Tokens: {totalTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
-        }
-    } while (interactive);
-}
-
-if (option == 3) // Streaming Chat
-{
-    using GeneratorParams generatorParams = new GeneratorParams(model);
-    generatorParams.SetSearchOption("min_length", minLength);
-    generatorParams.SetSearchOption("max_length", maxLength);
-    using var tokenizerStream = tokenizer.CreateStream();
-    using var generator = new Generator(model, generatorParams);
-    var prevTotalTokens = 0;
-    do{
-        string prompt = GetPrompt(interactive);
-        if (string.IsNullOrEmpty(prompt))
-        {
-            continue;
-        }
-        if (string.Compare(prompt, "quit()", StringComparison.OrdinalIgnoreCase) == 0)
-        {
-            break;
-        }
-        string messages = $@"[{{""role"":""system"",""content"":""You are a helpful AI assistant.""}},{{""role"":""user"",""content"":""{prompt}""}}]";
-        var sequences = tokenizer.Encode(tokenizer.ApplyChatTemplate("", messages, "", true));
-        var watch = System.Diagnostics.Stopwatch.StartNew();
-        generator.AppendTokenSequences(sequences);
-        while (!generator.IsDone())
-        {
-            generator.GenerateNextToken();
-            Console.Write(tokenizerStream.Decode(generator.GetNextTokens()[0]));
-        }
-        Console.WriteLine();
-        watch.Stop();
-        var runTimeInSeconds = watch.Elapsed.TotalSeconds;
-        var outputSequence = generator.GetSequence(0);
-        var totalNewTokens = outputSequence.Length - prevTotalTokens;
-        prevTotalTokens = totalNewTokens;
-        Console.WriteLine($"Streaming Tokens: {totalNewTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalNewTokens / runTimeInSeconds:0.00}");
-    } while (interactive);
-}
diff --git a/examples/csharp/HelloPhi/README.md b/examples/csharp/HelloPhi/README.md
deleted file mode 100644
index 7d54c0bf01..0000000000
--- a/examples/csharp/HelloPhi/README.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# ONNX Runtime GenAI C# example 
-
-## Obtain a model
-
-You can download a published model from Hugging Face. For example, this is Phi-3.5 mini optimized for CPU and mobile. You can find other models here: 
-
-```script
-huggingface-cli download microsoft/Phi-3.5-mini-instruct-onnx --include cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4/* --local-dir models
-move models\cpu_and_mobile\cpu-int4-rtn-block-32-acc-level-4 models\phi-3
-```
-
-Alternatively you can build a model yourself using the model builder. See [here](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md) for more details.
-
-
-## Run the model
-
-Open [HelloPhi.sln](HelloPhi.sln) and run the console application.
-
-Notes:
-
-1. The `executionProvider` must be one of the following: `cpu`, `cuda`, or `dml`.
-
-2. This application does not add a template to the prompt that you enter. If your model needs a template (e.g. `<|user|>\n{input} <|end|>\n<|assistant|>` for Phi-3.5) then please add this to your prompt.
diff --git a/examples/csharp/HelloPhi3V/HelloPhi3V.sln b/examples/csharp/HelloPhi3V/HelloPhi3V.sln
deleted file mode 100644
index 170c36c1ef..0000000000
--- a/examples/csharp/HelloPhi3V/HelloPhi3V.sln
+++ /dev/null
@@ -1,37 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.9.34902.65
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HelloPhi3V", "HelloPhi3V.csproj", "{75C05439-20D3-44C3-883A-15E150E9F93E}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug_Cuda|Any CPU = Debug_Cuda|Any CPU
-		Debug_DirectML|Any CPU = Debug_DirectML|Any CPU
-		Debug|Any CPU = Debug|Any CPU
-		Release_Cuda|Any CPU = Release_Cuda|Any CPU
-		Release_DirectML|Any CPU = Release_DirectML|Any CPU
-		Release|Any CPU = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Debug_Cuda|Any CPU.ActiveCfg = Debug_Cuda|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Debug_Cuda|Any CPU.Build.0 = Debug_Cuda|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Debug_DirectML|Any CPU.ActiveCfg = Debug_DirectML|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Debug_DirectML|Any CPU.Build.0 = Debug_DirectML|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Release_Cuda|Any CPU.ActiveCfg = Release_Cuda|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Release_Cuda|Any CPU.Build.0 = Release_Cuda|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Release_DirectML|Any CPU.ActiveCfg = Debug|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Release_DirectML|Any CPU.Build.0 = Debug|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{75C05439-20D3-44C3-883A-15E150E9F93E}.Release|Any CPU.Build.0 = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-	GlobalSection(ExtensibilityGlobals) = postSolution
-		SolutionGuid = {58510186-ED6B-46C0-8D3D-DB5300239D3A}
-	EndGlobalSection
-EndGlobal
diff --git a/examples/csharp/HelloPhi3V/Program.cs b/examples/csharp/HelloPhi3V/Program.cs
deleted file mode 100644
index 1d32cac199..0000000000
--- a/examples/csharp/HelloPhi3V/Program.cs
+++ /dev/null
@@ -1,185 +0,0 @@
-﻿// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-using Microsoft.ML.OnnxRuntimeGenAI;
-using System.Linq;
-using System.Runtime.CompilerServices;
-
-static string GetDirectoryInTreeThatContains(string currentDirectory, string targetDirectoryName)
-{
-    bool found = false;
-    foreach (string d in Directory.GetDirectories(currentDirectory, searchPattern: targetDirectoryName))
-    {
-        found = true;
-        return Path.Combine(currentDirectory, targetDirectoryName);
-    }
-    if (!found)
-    {
-        DirectoryInfo dirInfo = new DirectoryInfo(currentDirectory);
-        if (dirInfo.Parent != null)
-        {
-            return GetDirectoryInTreeThatContains(Path.GetFullPath(Path.Combine(currentDirectory, "..")), targetDirectoryName);
-        }
-        else
-        {
-            return null;
-        }
-    }
-    return null;
-}
-
-void PrintUsage()
-{
-    Console.WriteLine("Usage:");
-    Console.WriteLine("  -m model_path");
-    Console.WriteLine("\t\t\t\tPath to the model");
-    Console.WriteLine("  -e execution_provider");
-    Console.WriteLine("\t\t\t\tExecution provider for the model");
-    Console.WriteLine("  --image_paths");
-    Console.WriteLine("\t\t\t\tPath to the images");
-    Console.WriteLine("  --non-interactive (optional), mainly for CI usage");
-    Console.WriteLine("\t\t\t\tInteractive mode");
-}
-
-using OgaHandle ogaHandle = new OgaHandle();
-
-if (args.Length < 1)
-{
-    PrintUsage();
-    Environment.Exit(-1);
-}
-
-bool interactive = true;
-string modelPath = string.Empty;
-string executionProvider = string.Empty;
-List<string> imagePaths = new List<string>();
-
-uint i_arg = 0;
-while (i_arg < args.Length)
-{
-    var arg = args[i_arg];
-    if (arg == "--non-interactive")
-    {
-        interactive = false;
-    }
-    else if (arg == "-m")
-    {
-        if (i_arg + 1 < args.Length)
-        {
-            modelPath = Path.Combine(args[i_arg+1]);
-        }
-    }
-    else if (arg == "-e")
-    {
-        if (i_arg + 1 < args.Length)
-        {
-            executionProvider = Path.Combine(args[i_arg+1]);
-        }
-    }
-    else if (arg == "--image_paths")
-    {
-        if (i_arg + 1 < args.Length)
-        {
-            imagePaths = args[i_arg + 1].Split(',').ToList<string>().Select(i => i.ToString().Trim()).ToList();
-        }
-    }
-    i_arg++;
-}
-
-if (string.IsNullOrEmpty(modelPath))
-{
-    throw new Exception("Model path must be specified");
-}
-if (string.IsNullOrEmpty(executionProvider))
-{
-    throw new Exception("Execution provider must be specified");
-}
-
-Console.WriteLine("--------------------");
-Console.WriteLine("Hello, Phi-3-Vision!");
-Console.WriteLine("--------------------");
-
-Console.WriteLine("Model path: " + modelPath);
-Console.WriteLine("Execution provider: " + executionProvider);
-Console.WriteLine("Interactive: " + interactive);
-
-using Config config = new Config(modelPath);
-config.ClearProviders();
-if (executionProvider != "cpu") {
-    config.AppendProvider(executionProvider);
-    if (executionProvider == "cuda") {
-        config.SetProviderOption(executionProvider, "enable_cuda_graph", "0");
-    }
-}
-using Model model = new Model(config);
-using MultiModalProcessor processor = new MultiModalProcessor(model);
-using Tokenizer tokenizer = new Tokenizer(model);
-using var stream = processor.CreateStream();
-
-do
-{
-    if (interactive)
-    {
-        Console.WriteLine("Image Path (comma separated; leave empty if no image):");
-        imagePaths = Console.ReadLine().Split(',').ToList<string>().Select(i => i.ToString().Trim()).ToList();
-    }
-
-    if (imagePaths.Count == 0)
-    {
-        Console.WriteLine("No image provided. Using default image.");
-        imagePaths.Add(Path.Combine(
-            GetDirectoryInTreeThatContains(Directory.GetCurrentDirectory(), "test"), "test_models", "images", "australia.jpg"));
-    }
-    for (int i = 0; i < imagePaths.Count; i++)
-    {
-        string imagePath = Path.GetFullPath(imagePaths[i].Trim());
-        if (!File.Exists(imagePath))
-        {
-            throw new Exception("Image file not found: " + imagePath);
-        }
-        Console.WriteLine("Using image: " + imagePath);
-    }
-
-    Images images = imagePaths.Count > 0 ? Images.Load(imagePaths.ToArray()) : null;
-
-    string text = "What is shown in this image?";
-    if (interactive) {
-        Console.WriteLine("Prompt:");
-        text = Console.ReadLine();
-    }
-
-    string content = "";
-    if (images != null)
-    {
-        content = string.Join("\\n", imagePaths.Select((_, idx) => $"<|image_{idx + 1}|>")) + "\\n";
-    }
-    content += text;
-
-    string messages = $"[{{\"role\":\"user\",\"content\":\"{content}\"}}]";
-    string prompt = tokenizer.ApplyChatTemplate("", messages, "", true);
-
-    Console.WriteLine("Processing image and prompt...");
-    using var inputTensors = processor.ProcessImages(prompt, images);
-
-    Console.WriteLine("Generating response...");
-    using GeneratorParams generatorParams = new GeneratorParams(model);
-    generatorParams.SetSearchOption("max_length", 7680);
-
-    using var generator = new Generator(model, generatorParams);
-    generator.SetInputs(inputTensors);
-    var watch = System.Diagnostics.Stopwatch.StartNew();
-    while (!generator.IsDone())
-    {
-        generator.GenerateNextToken();
-        Console.Write(stream.Decode(generator.GetNextTokens()[0]));
-    }
-    watch.Stop();
-    var runTimeInSeconds = watch.Elapsed.TotalSeconds;
-    Console.WriteLine();
-    Console.WriteLine($"Total Time: {runTimeInSeconds:0.00}");
-
-    if (images != null)
-    {
-        images.Dispose();
-    }
-} while (interactive);
\ No newline at end of file
diff --git a/examples/csharp/HelloPhi4MM/HelloPhi4MM.csproj b/examples/csharp/HelloPhi4MM/HelloPhi4MM.csproj
deleted file mode 100644
index 49ca4fb4db..0000000000
--- a/examples/csharp/HelloPhi4MM/HelloPhi4MM.csproj
+++ /dev/null
@@ -1,17 +0,0 @@
-﻿<Project Sdk="Microsoft.NET.Sdk">
-
-  <PropertyGroup>
-    <OutputType>Exe</OutputType>
-    <TargetFramework>net6.0</TargetFramework>
-    <ImplicitUsings>enable</ImplicitUsings>
-    <Nullable>enable</Nullable>
-    <Configurations>Debug;Release;Debug_Cuda;Release_Cuda;Debug_DirectML;Release_DirectML</Configurations>
-  </PropertyGroup>
-
-  <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
-  </ItemGroup>
-
-</Project>
diff --git a/examples/csharp/HelloPhi4MM/Program.cs b/examples/csharp/HelloPhi4MM/Program.cs
deleted file mode 100644
index ce0ddf359d..0000000000
--- a/examples/csharp/HelloPhi4MM/Program.cs
+++ /dev/null
@@ -1,236 +0,0 @@
-﻿// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-using Microsoft.ML.OnnxRuntimeGenAI;
-using System.Linq;
-using System.Runtime.CompilerServices;
-
-static string GetDirectoryInTreeThatContains(string currentDirectory, string targetDirectoryName)
-{
-    bool found = false;
-    foreach (string d in Directory.GetDirectories(currentDirectory, searchPattern: targetDirectoryName))
-    {
-        found = true;
-        return Path.Combine(currentDirectory, targetDirectoryName);
-    }
-    if (!found)
-    {
-        DirectoryInfo dirInfo = new DirectoryInfo(currentDirectory);
-        if (dirInfo.Parent != null)
-        {
-            return GetDirectoryInTreeThatContains(Path.GetFullPath(Path.Combine(currentDirectory, "..")), targetDirectoryName);
-        }
-        else
-        {
-            return null;
-        }
-    }
-    return null;
-}
-
-void PrintUsage()
-{
-    Console.WriteLine("Usage:");
-    Console.WriteLine("  -m model_path");
-    Console.WriteLine("\t\t\t\tPath to the model");
-    Console.WriteLine("  -e execution_provider");
-    Console.WriteLine("\t\t\t\tExecution provider for the model");
-    Console.WriteLine("  --image_paths");
-    Console.WriteLine("\t\t\t\tPath to the images");
-    Console.WriteLine("  --audio_paths");
-    Console.WriteLine("\t\t\t\tPath to the audios");
-    Console.WriteLine("  --non-interactive (optional), mainly for CI usage");
-    Console.WriteLine("\t\t\t\tInteractive mode");
-}
-
-using OgaHandle ogaHandle = new OgaHandle();
-
-if (args.Length < 1)
-{
-    PrintUsage();
-    Environment.Exit(-1);
-}
-
-bool interactive = true;
-string modelPath = string.Empty;
-string executionProvider = string.Empty;
-List<string> imagePaths = new List<string>();
-List<string> audioPaths = new List<string>();
-
-uint i_arg = 0;
-while (i_arg < args.Length)
-{
-    var arg = args[i_arg];
-    if (arg == "--non-interactive")
-    {
-        interactive = false;
-    }
-    else if (arg == "-m")
-    {
-        if (i_arg + 1 < args.Length)
-        {
-            modelPath = Path.Combine(args[i_arg+1]);
-        }
-    }
-    else if (arg == "-e")
-    {
-        if (i_arg + 1 < args.Length)
-        {
-            executionProvider = Path.Combine(args[i_arg+1]);
-        }
-    }
-    else if (arg == "--image_paths")
-    {
-        if (i_arg + 1 < args.Length)
-        {
-            imagePaths = args[i_arg + 1].Split(',').ToList<string>().Select(i => i.ToString().Trim()).ToList();
-        }
-    }
-    else if (arg == "--audio_paths")
-    {
-        if (i_arg + 1 < args.Length)
-        {
-            audioPaths = args[i_arg + 1].Split(',').ToList<string>().Select(i => i.ToString().Trim()).ToList();
-        }
-    }
-    i_arg++;
-}
-
-if (string.IsNullOrEmpty(modelPath))
-{
-    throw new Exception("Model path must be specified");
-}
-if (string.IsNullOrEmpty(executionProvider))
-{
-    throw new Exception("Execution provider must be specified");
-}
-
-Console.WriteLine("--------------------");
-Console.WriteLine("Hello, Phi-4-Multimodal!");
-Console.WriteLine("--------------------");
-
-Console.WriteLine("Model path: " + modelPath);
-Console.WriteLine("Execution provider: " + executionProvider);
-Console.WriteLine("Interactive: " + interactive);
-
-using Config config = new Config(modelPath);
-config.ClearProviders();
-if (executionProvider != "cpu") {
-    config.AppendProvider(executionProvider);
-    if (executionProvider == "cuda") {
-        config.SetProviderOption(executionProvider, "enable_cuda_graph", "0");
-    }
-}
-using Model model = new Model(config);
-using Tokenizer tokenizer = new Tokenizer(model);
-using MultiModalProcessor processor = new MultiModalProcessor(model);
-using var stream = processor.CreateStream();
-
-do
-{
-    // Get images
-    if (interactive)
-    {
-        Console.WriteLine("Image Path (comma separated; leave empty if no image):");
-        imagePaths = Console.ReadLine().Split(',').ToList<string>().Select(i => i.ToString().Trim()).ToList();
-    }
-    if (imagePaths.Count == 0)
-    {
-        Console.WriteLine("No image provided. Using default image.");
-        imagePaths.Add(Path.Combine(
-            GetDirectoryInTreeThatContains(Directory.GetCurrentDirectory(), "test"), "test_models", "images", "australia.jpg"));
-    }
-    for (int i = 0; i < imagePaths.Count; i++)
-    {
-        string imagePath = Path.GetFullPath(imagePaths[i].Trim());
-        if (!File.Exists(imagePath))
-        {
-            throw new Exception("Image file not found: " + imagePath);
-        }
-        Console.WriteLine("Using image: " + imagePath);
-    }
-    Images images = imagePaths.Count > 0 ? Images.Load(imagePaths.ToArray()) : null;
-
-    // Get audios
-    if (interactive)
-    {
-        Console.WriteLine("Audio Path (comma separated; leave empty if no audio):");
-        audioPaths = Console.ReadLine().Split(',').ToList<string>().Select(i => i.ToString().Trim()).ToList();
-    }
-    if (audioPaths.Count == 0)
-    {
-        Console.WriteLine("No audio provided. Using default audio.");
-        audioPaths.Add(Path.Combine(
-            GetDirectoryInTreeThatContains(Directory.GetCurrentDirectory(), "test"), "test_models", "audios", "1272-141231-0002.mp3"));
-    }
-    for (int i = 0; i < audioPaths.Count; i++)
-    {
-        string audioPath = Path.GetFullPath(audioPaths[i].Trim());
-        if (!File.Exists(audioPath))
-        {
-            throw new Exception("Audio file not found: " + audioPath);
-        }
-        Console.WriteLine("Using audio: " + audioPath);
-    }
-    Audios audios = audioPaths.Count > 0 ? Audios.Load(audioPaths.ToArray()) : null;
-
-    // Get prompt
-    string text = "Does the audio summarize what is in the picture? If not, what is different?";
-    if (interactive) {
-        Console.WriteLine("Prompt:");
-        text = Console.ReadLine();
-    }
-
-    // Combine prompt, images, and audios and construct multimodal content
-    string content = "";
-    if (images != null)
-    {
-        for (int i = 0; i < imagePaths.Count; i++)
-        {
-            content += $"<|image_{i + 1}|>\n";
-        }
-    }
-    if (audios != null)
-    {
-        for (int i = 0; i < audioPaths.Count; i++)
-        {
-            content += $"<|audio_{i + 1}|>\n";
-        }
-    }
-    content += text;
-
-    // Format message string
-    string messages = $"[{{\"role\":\"user\",\"content\":\"{content}\"}}]";
-
-    // Apply chat template to get prompt
-    string prompt = tokenizer.ApplyChatTemplate("", messages, "", true);
-
-    Console.WriteLine("Processing inputs...");
-    using var inputTensors = processor.ProcessImagesAndAudios(prompt, images, audios);
-
-    Console.WriteLine("Generating response...");
-    using GeneratorParams generatorParams = new GeneratorParams(model);
-    generatorParams.SetSearchOption("max_length", 7680);
-
-    using var generator = new Generator(model, generatorParams);
-    generator.SetInputs(inputTensors);
-    var watch = System.Diagnostics.Stopwatch.StartNew();
-    while (!generator.IsDone())
-    {
-        generator.GenerateNextToken();
-        Console.Write(stream.Decode(generator.GetNextTokens()[0]));
-    }
-    watch.Stop();
-    var runTimeInSeconds = watch.Elapsed.TotalSeconds;
-    Console.WriteLine();
-    Console.WriteLine($"Total Time: {runTimeInSeconds:0.00}");
-
-    if (images != null)
-    {
-        images.Dispose();
-    }
-    if (audios != null)
-    {
-        audios.Dispose();
-    }
-} while (interactive);
\ No newline at end of file
diff --git a/examples/csharp/HelloPhi/HelloPhi.csproj b/examples/csharp/ModelChat/ModelChat.csproj
similarity index 65%
rename from examples/csharp/HelloPhi/HelloPhi.csproj
rename to examples/csharp/ModelChat/ModelChat.csproj
index 71e1818fed..ebde151928 100644
--- a/examples/csharp/HelloPhi/HelloPhi.csproj
+++ b/examples/csharp/ModelChat/ModelChat.csproj
@@ -1,18 +1,21 @@
 ﻿<Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
+    <TargetFramework>net8.0</TargetFramework>
     <OutputType>Exe</OutputType>
-    <TargetFramework>net6.0</TargetFramework>
-    <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
-    <Configurations>Debug;Release;Debug_Cuda;Release_Cuda;Debug_DirectML;Release_DirectML</Configurations>
-    <Platforms>AnyCPU;x64</Platforms>
+    <ImplicitUsings>enable</ImplicitUsings>
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="System.CommandLine" Version="2.0.1" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <Compile Include="../Common/Common.cs" Link="Common/Common.cs" />
   </ItemGroup>
 
   <ItemGroup>
diff --git a/examples/csharp/HelloPhi/HelloPhi.sln b/examples/csharp/ModelChat/ModelChat.sln
similarity index 96%
rename from examples/csharp/HelloPhi/HelloPhi.sln
rename to examples/csharp/ModelChat/ModelChat.sln
index 2caee7ca47..edfc4caba1 100644
--- a/examples/csharp/HelloPhi/HelloPhi.sln
+++ b/examples/csharp/ModelChat/ModelChat.sln
@@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 17
 VisualStudioVersion = 17.9.34902.65
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HelloPhi", "HelloPhi.csproj", "{89932021-18FC-490C-8675-73F2AD1DEB2A}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ModelChat", "ModelChat.csproj", "{89932021-18FC-490C-8675-73F2AD1DEB2A}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
diff --git a/examples/csharp/ModelChat/Program.cs b/examples/csharp/ModelChat/Program.cs
new file mode 100644
index 0000000000..47a9e6aaec
--- /dev/null
+++ b/examples/csharp/ModelChat/Program.cs
@@ -0,0 +1,642 @@
+﻿// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+using CommonUtils;
+using Microsoft.ML.OnnxRuntimeGenAI;
+using System.CommandLine;
+using System.Text.Json;
+
+/// <summary>
+/// Example of model-generate: answers each prompt independently and prints the decoded
+/// output sequence, with timings, once generation has finished (no streaming).
+/// </summary>
+/// <param name="model">Model to use</param>
+/// <param name="tokenizer">Tokenizer to use</param>
+/// <param name="generatorParamsArgs">Generator params arguments to use</param>
+/// <param name="modelPath">Path to folder containing model</param>
+/// <param name="systemPrompt">System prompt to use with model</param>
+/// <param name="userPrompt">User prompt to use with model</param>
+/// <param name="interactive">Ask user or use pre-defined value</param>
+/// <param name="verbose">Use verbose logging</param>
+/// <returns>None</returns>
+void ModelGenerate(
+    Model model,
+    Tokenizer tokenizer,
+    GeneratorParamsArgs generatorParamsArgs,
+    string modelPath,
+    string systemPrompt,
+    string userPrompt,
+    bool interactive,
+    bool verbose
+)
+{
+    // Complete Q&A: a fresh generator is created for every prompt
+    do
+    {
+        // Get user prompt; "quit()" in any casing ends the loop
+        string user_prompt = Common.GetUserPrompt(userPrompt, interactive);
+        if (string.Compare(user_prompt, "quit()", StringComparison.OrdinalIgnoreCase) == 0)
+        {
+            break;
+        }
+
+        // Get input tokens. The messages are built with a JSON serializer rather than string
+        // interpolation so quotes, backslashes and newlines in a prompt are escaped correctly
+        string messages = JsonSerializer.Serialize(new List<Dictionary<string, string>>
+        {
+            new Dictionary<string, string> { { "role", "system" }, { "content", systemPrompt } },
+            new Dictionary<string, string> { { "role", "user" }, { "content", user_prompt } }
+        });
+        string prompt = Common.ApplyChatTemplate(modelPath, tokenizer, messages, add_generation_prompt: true);
+        var sequences = tokenizer.Encode(prompt);
+        if (verbose) Console.WriteLine($"Prompt encoded: {prompt}");
+
+        // Set search options for generator params
+        using GeneratorParams generatorParams = new GeneratorParams(model);
+        Common.SetSearchOptions(generatorParams, generatorParamsArgs, verbose);
+
+        // Create generator and append input tokens
+        using Generator generator = new Generator(model, generatorParams);
+        if (verbose) Console.WriteLine("Generator created");
+
+        generator.AppendTokenSequences(sequences);
+        if (verbose) Console.WriteLine("Input tokens added");
+
+        // Run generation loop until the generator reports that it is done
+        if (verbose) Console.WriteLine("Running generation loop...\n");
+        var watch = System.Diagnostics.Stopwatch.StartNew();
+        do
+        {
+            generator.GenerateNextToken();
+        } while (!generator.IsDone());
+        watch.Stop();
+        var runTimeInSeconds = watch.Elapsed.TotalSeconds;
+
+        // Get output tokens and decode to string
+        var outputSequence = generator.GetSequence(0);
+        var outputString = tokenizer.Decode(outputSequence);
+
+        // Display output and timings
+        Console.WriteLine("Output:");
+        Console.WriteLine(outputString);
+        var totalTokens = outputSequence.Length;
+        Console.WriteLine($"Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
+        Console.WriteLine();
+
+    } while (interactive);
+}
+
+/// <summary>
+/// Example of model-qa: streams the answer to each prompt as tokens are generated.
+/// Every question is asked on its own: a new generator is created per turn and the
+/// user message is removed from the message list again once the turn is over.
+/// </summary>
+/// <param name="model">Model to use</param>
+/// <param name="tokenizer">Tokenizer to use</param>
+/// <param name="tokenizerStream">Tokenizer stream to use</param>
+/// <param name="generatorParamsArgs">Generator params arguments to use</param>
+/// <param name="guidanceArgs">Guidance arguments to use</param>
+/// <param name="modelPath">Path to folder containing model</param>
+/// <param name="systemPrompt">System prompt to use with model</param>
+/// <param name="userPrompt">User prompt to use with model</param>
+/// <param name="interactive">Ask user or use pre-defined value</param>
+/// <param name="verbose">Use verbose logging</param>
+/// <returns>
+/// None
+/// </returns>
+void ModelQA(
+    Model model,
+    Tokenizer tokenizer,
+    TokenizerStream tokenizerStream,
+    GeneratorParamsArgs generatorParamsArgs,
+    GuidanceArgs guidanceArgs,
+    string modelPath,
+    string systemPrompt,
+    string userPrompt,
+    bool interactive,
+    bool verbose
+)
+{
+    // Seed the running list of messages with the system message
+    var systemMessage = new Dictionary<string, string>
+    {
+        ["role"] = "system",
+        ["content"] = systemPrompt
+    };
+    var conversation = new List<Dictionary<string, string>> { systemMessage };
+
+    // Get and set guidance info if requested; all three values stay empty otherwise
+    string guidanceType = "", guidanceData = "", tools = "";
+    bool guidanceRequested = !string.IsNullOrEmpty(guidanceArgs.response_format);
+    if (guidanceRequested)
+    {
+        Console.WriteLine("Make sure your tool call start id and tool call end id are marked as special in tokenizer.json");
+        (guidanceType, guidanceData, tools) = Common.GetGuidance(
+            response_format: guidanceArgs.response_format,
+            filepath: guidanceArgs.tools_file,
+            text_output: guidanceArgs.text_output,
+            tool_output: guidanceArgs.tool_output,
+            tool_call_start: guidanceArgs.tool_call_start,
+            tool_call_end: guidanceArgs.tool_call_end
+        );
+        systemMessage["tools"] = tools;
+    }
+
+    // Streaming Q&A
+    do
+    {
+        // Ask for (or reuse) the user prompt; "quit()" in any casing stops the loop
+        string question = Common.GetUserPrompt(userPrompt, interactive);
+        if (string.Equals(question, "quit()", StringComparison.OrdinalIgnoreCase))
+        {
+            break;
+        }
+
+        // Append the user message for this turn to the list of messages
+        conversation.Add(new Dictionary<string, string>
+        {
+            ["role"] = "user",
+            ["content"] = question
+        });
+
+        // Set search options for generator params
+        using var generatorParams = new GeneratorParams(model);
+        Common.SetSearchOptions(generatorParams, generatorParamsArgs, verbose);
+
+        // Initialize guidance only when both its type and its data are available,
+        // and echo what was applied when verbose logging is on
+        bool haveGuidance = !(string.IsNullOrEmpty(guidanceType) || string.IsNullOrEmpty(guidanceData));
+        if (haveGuidance)
+        {
+            generatorParams.SetGuidance(guidanceType, guidanceData);
+        }
+        if (haveGuidance && verbose)
+        {
+            Console.WriteLine();
+            Console.WriteLine($"Guidance type is: {guidanceType}");
+            Console.WriteLine($"Guidance data is: \n{guidanceData}");
+            Console.WriteLine();
+        }
+
+        // Create generator
+        using var generator = new Generator(model, generatorParams);
+        if (verbose) Console.WriteLine("Generator created");
+
+        // Apply chat template, falling back to the raw user prompt if that fails
+        string prompt;
+        try
+        {
+            string messages = JsonSerializer.Serialize(conversation);
+            prompt = Common.ApplyChatTemplate(modelPath, tokenizer, messages, add_generation_prompt: true, tools);
+        }
+        catch
+        {
+            prompt = question;
+        }
+        if (verbose) Console.WriteLine($"Prompt: {prompt}");
+
+        // Encode combined system + user prompt and append tokens to model
+        var promptTokens = tokenizer.Encode(prompt);
+        generator.AppendTokenSequences(promptTokens);
+
+        // Run generation loop
+        if (verbose) Console.WriteLine("Running generation loop...\n");
+        Console.Write("Output: ");
+        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
+        // Generate the first token up front so the loop condition can be checked first
+        generator.GenerateNextToken();
+        while (!generator.IsDone())
+        {
+            // Decode and print the newest token, then move on to the next one
+            Console.Write(tokenizerStream.Decode(generator.GetNextTokens()[0]));
+            generator.GenerateNextToken();
+        }
+        stopwatch.Stop();
+        double elapsedSeconds = stopwatch.Elapsed.TotalSeconds;
+
+        // Remove user message from list of messages so that the next question
+        // starts again from the system message only
+        conversation.RemoveAt(conversation.Count - 1);
+
+        // Display output and timings
+        var tokenCount = generator.GetSequence(0).Length;
+        Console.WriteLine();
+        Console.WriteLine($"Streaming Tokens: {tokenCount}, Time: {elapsedSeconds:0.00}, Tokens per second: {tokenCount / elapsedSeconds:0.00}");
+        Console.WriteLine();
+
+    } while (interactive);
+}
+
+/// <summary>
+/// Example of model-chat: a single generator is kept for the whole conversation and each
+/// user prompt is appended to it; with rewind it is rewound to the system prompt per turn
+/// </summary>
+/// <param name="model">Model to use</param>
+/// <param name="tokenizer">Tokenizer to use</param>
+/// <param name="tokenizerStream">Tokenizer stream to use</param>
+/// <param name="generatorParamsArgs">Generator params arguments to use</param>
+/// <param name="guidanceArgs">Guidance arguments to use</param>
+/// <param name="modelPath">Path to folder containing model</param>
+/// <param name="systemPrompt">System prompt to use with model</param>
+/// <param name="userPrompt">User prompt to use with model</param>
+/// <param name="interactive">Ask user or use pre-defined value</param>
+/// <param name="rewind">Rewind to system prompt after each user prompt</param>
+/// <param name="verbose">Use verbose logging</param>
+/// <returns>None</returns>
+void ModelChat(
+    Model model,
+    Tokenizer tokenizer,
+    TokenizerStream tokenizerStream,
+    GeneratorParamsArgs generatorParamsArgs,
+    GuidanceArgs guidanceArgs,
+    string modelPath,
+    string systemPrompt,
+    string userPrompt,
+    bool interactive,
+    bool rewind,
+    bool verbose
+)
+{
+    // Set search options for generator params
+    using GeneratorParams generatorParams = new GeneratorParams(model);
+    Common.SetSearchOptions(generatorParams, generatorParamsArgs, verbose);
+
+    // Create system message
+    var system_message = new Dictionary<string, string>
+    {
+        { "role", "system" },
+        { "content", systemPrompt }
+    };
+
+    // Get and set guidance info if requested
+    string tools = "";
+    if (!string.IsNullOrEmpty(guidanceArgs.response_format))
+    {
+        Console.WriteLine("Make sure your tool call start id and tool call end id are marked as special in tokenizer.json");
+        string guidance_type = "";
+        string guidance_data = "";
+        (guidance_type, guidance_data, tools) = Common.GetGuidance(
+            response_format: guidanceArgs.response_format,
+            filepath: guidanceArgs.tools_file,
+            text_output: guidanceArgs.text_output,
+            tool_output: guidanceArgs.tool_output,
+            tool_call_start: guidanceArgs.tool_call_start,
+            tool_call_end: guidanceArgs.tool_call_end
+        );
+        system_message["tools"] = tools;
+
+        generatorParams.SetGuidance(guidance_type, guidance_data);
+        if (verbose)
+        {
+            Console.WriteLine();
+            Console.WriteLine($"Guidance type is: {guidance_type}");
+            Console.WriteLine($"Guidance data is: \n{guidance_data}");
+            Console.WriteLine();
+        }
+    }
+
+    // Create generator
+    using Generator generator = new Generator(model, generatorParams);
+    if (verbose) Console.WriteLine("Generator created");
+
+    // Apply chat template
+    string prompt = "";
+    try
+    {
+        string messages = JsonSerializer.Serialize(new List<Dictionary<string, string>> { system_message });
+        prompt = Common.ApplyChatTemplate(modelPath, tokenizer, messages, add_generation_prompt: false, tools);
+    }
+    catch
+    {
+        prompt = systemPrompt;
+    }
+    if (verbose) Console.WriteLine($"System prompt: {prompt}\n");
+
+    // Encode system prompt and append tokens to model
+    var sequences = tokenizer.Encode(prompt);
+    var system_prompt_length = sequences[0].Length;
+    generator.AppendTokenSequences(sequences);
+
+    // Streaming Chat; prevTotalTokens is the sequence length before the current turn
+    var prevTotalTokens = system_prompt_length;
+    do
+    {
+        // Get user prompt
+        string user_prompt = Common.GetUserPrompt(userPrompt, interactive);
+        if (string.Compare(user_prompt, "quit()", StringComparison.OrdinalIgnoreCase) == 0)
+        {
+            break;
+        }
+
+        // Create user message
+        var user_message = new Dictionary<string, string>
+        {
+            { "role", "user" },
+            { "content", user_prompt }
+        };
+
+        // Apply chat template, falling back to the raw user prompt if that fails
+        prompt = "";
+        try
+        {
+            string messages = JsonSerializer.Serialize(new List<Dictionary<string, string>> { user_message });
+            prompt = Common.ApplyChatTemplate(modelPath, tokenizer, messages, add_generation_prompt: true);
+        }
+        catch
+        {
+            prompt = user_prompt;
+        }
+        if (verbose) Console.WriteLine($"User prompt: {prompt}");
+
+        // Encode user prompt and append tokens to model
+        sequences = tokenizer.Encode(prompt);
+        generator.AppendTokenSequences(sequences);
+
+        // Run generation loop
+        if (verbose) Console.WriteLine("Running generation loop...\n");
+        Console.Write("Output: ");
+        var watch = System.Diagnostics.Stopwatch.StartNew();
+        while (true)
+        {
+            generator.GenerateNextToken();
+            if (generator.IsDone())
+            {
+                break;
+            }
+            Console.Write(tokenizerStream.Decode(generator.GetNextTokens()[0]));
+        }
+        watch.Stop();
+        var runTimeInSeconds = watch.Elapsed.TotalSeconds;
+
+        // Display timings for the tokens added this turn, then remember the running total
+        var outputSequence = generator.GetSequence(0);
+        var totalNewTokens = outputSequence.Length - prevTotalTokens;
+        prevTotalTokens = outputSequence.Length;
+        Console.WriteLine();
+        Console.WriteLine($"Streaming Tokens: {totalNewTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalNewTokens / runTimeInSeconds:0.00}");
+        Console.WriteLine();
+
+        if (rewind)
+        {
+            // NOTE(review): RewindTo is expected to cut the sequence back to the system prompt; keep the counter in step
+            generator.RewindTo((ulong)system_prompt_length);
+            prevTotalTokens = system_prompt_length;
+        }
+    } while (interactive);
+}
+
+/// <summary>
+/// Define the command-line arguments: the main options declared here plus the generator
+/// params and guidance options that the Common helpers add to the same parser
+/// </summary>
+/// <returns>RootCommand object with all possible command-line arguments</returns>
+RootCommand GetArgs()
+{
+    var parser = new RootCommand("ModelChat Arguments");
+
+    var model_path = new Option<string>(
+        name: "model_path",
+        aliases: ["-m", "--model_path"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        Description = "Path to the model",
+        Required = true
+    };
+    model_path.Validators.Add(result =>
+    {
+        var value = result.GetValue(model_path);
+        if (string.IsNullOrEmpty(value))
+        {
+            result.AddError("Model path must be specified");
+        }
+        else if (!Path.Exists(value))
+        {
+            result.AddError("Path must be to a model folder on disk");
+        }
+    });
+
+    var execution_provider = new Option<string>(
+        name: "execution_provider",
+        aliases: ["-e", "--execution_provider"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "follow_config",
+        Description = "Execution provider to run the model"
+    };
+    execution_provider.Validators.Add(result => {
+        var value = result.GetValue(execution_provider);
+        if (string.IsNullOrEmpty(value))
+        {
+            result.AddError("Execution provider must be specified. Use 'follow_config' to not specify one.");
+        }
+    });
+
+    var verbose = new Option<bool>(
+        name: "verbose",
+        aliases: ["-v", "--verbose"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Print verbose output. Defaults to false"
+    };
+
+    var debug = new Option<bool>(
+        name: "debug",
+        aliases: ["-d", "--debug"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Dump input and output tensors with debug mode. Defaults to false"
+    };
+
+    var non_interactive = new Option<bool>(
+        name: "non_interactive",
+        aliases: ["--non_interactive"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Run in non-interactive mode: use the given user prompt instead of asking for input. Defaults to false"
+    };
+
+    var ep_path = new Option<string>(
+        name: "ep_path",
+        aliases: ["--ep_path"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "",
+        Description = "Path to execution provider DLL/SO for plug-in providers (ex: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll)"
+    };
+
+    var system_prompt = new Option<string>(
+        name: "system_prompt",
+        aliases: ["-sp", "--system_prompt"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "You are a helpful AI assistant.",
+        Description = "System prompt to use for the model."
+    };
+
+    var user_prompt = new Option<string>(
+        name: "user_prompt",
+        aliases: ["-up", "--user_prompt"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "What color is the sky?",
+        Description = "User prompt to use for the model."
+    };
+
+    var rewind = new Option<bool>(
+        name: "rewind",
+        aliases: ["-rw", "--rewind"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Rewind to the system prompt after each generation. Defaults to false"
+    };
+
+    // Register the main options; main() reads them back by the names given above
+    parser.Add(model_path);
+    parser.Add(execution_provider);
+    parser.Add(ep_path);
+    parser.Add(system_prompt);
+    parser.Add(user_prompt);
+    parser.Add(verbose);
+    parser.Add(debug);
+    parser.Add(non_interactive);
+    parser.Add(rewind);
+
+    Common.GetGeneratorParamsArgs(parser);
+    Common.GetGuidanceArgs(parser);
+
+    return parser;
+}
+
+/// <summary>
+/// Main method for inference: parses the arguments, loads the model and tokenizer,
+/// then runs the scenario that was selected
+/// </summary>
+/// <param name="args">Command-line arguments</param>
+/// <returns>
+/// None
+/// </returns>
+void main(string[] args)
+{
+    // Build the parser, parse the command line and invoke the parse result
+    ParseResult parseResult = GetArgs().Parse(args);
+    parseResult.Invoke();
+
+    // Stop early when nothing was passed, parsing failed, or help was requested
+    bool runnable = args.Length >= 1
+        && parseResult.Errors.Count == 0
+        && !parseResult.Tokens.Any(t => t.Value is "-h" or "--help" or "-?");
+    if (!runnable)
+    {
+        Console.WriteLine("Run this with -h/--help/-? to see which arguments you need to set.");
+        foreach (var parseError in parseResult.Errors)
+        {
+            Console.WriteLine("Error: " + parseError.Message);
+        }
+        return;
+    }
+
+    // Get main argument values
+    // (looked up by the option names that GetArgs registers)
+    string modelPath = parseResult.GetValue<string>("model_path")!;
+    string executionProvider = parseResult.GetValue<string>("execution_provider")!;
+    string epPath = parseResult.GetValue<string>("ep_path")!;
+    string systemPrompt = parseResult.GetValue<string>("system_prompt")!;
+    string userPrompt = parseResult.GetValue<string>("user_prompt")!;
+    bool verbose = parseResult.GetValue<bool>("verbose");
+    bool debug = parseResult.GetValue<bool>("debug");
+    bool interactive = !parseResult.GetValue<bool>("non_interactive");
+    bool rewind = parseResult.GetValue<bool>("rewind");
+
+    var (generatorParamsArgs, guidanceArgs) = Common.SetGroupedArgs(parseResult);
+
+    // Print main argument values
+    const string rule = "-----------------";
+    Console.WriteLine(rule);
+    Console.WriteLine("Hello, ModelChat!");
+    Console.WriteLine(rule);
+
+    Console.WriteLine("Model path: " + modelPath);
+    Console.WriteLine("Execution provider: " + executionProvider);
+    if (!string.IsNullOrEmpty(epPath)) Console.WriteLine("Execution provider path: " + epPath);
+    Console.WriteLine("System prompt: " + systemPrompt);
+    if (!interactive) Console.WriteLine("User prompt: " + userPrompt);
+    Console.WriteLine("Verbose: " + verbose);
+    Console.WriteLine("Debug: " + debug);
+    Console.WriteLine("Interactive: " + interactive);
+    Console.WriteLine("Rewind: " + rewind);
+    Console.WriteLine(rule);
+    Console.WriteLine();
+
+    // Enable debugging if requested
+    if (debug)
+    {
+        Common.SetLogger();
+    }
+
+    // TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once
+    // the C# binding to Utils.RegisterEPLibrary is in a stable package release.
+    // RegisterEP(executionProvider, epPath);
+
+    // Create model
+    if (verbose) Console.WriteLine("Loading model...");
+    using var config = Common.GetConfig(path: modelPath, ep: executionProvider, null, generatorParamsArgs);
+    using var model = new Model(config);
+    if (verbose) Console.WriteLine("Model loaded");
+
+    // Create tokenizer
+    using var tokenizer = new Tokenizer(model);
+    using var tokenizerStream = tokenizer.CreateStream();
+    if (verbose) Console.WriteLine("Tokenizer created");
+
+    // Get scenario to run from user; non-interactive runs keep the default of 2
+    int scenario = 2;
+    while (interactive)
+    {
+        Console.WriteLine("Please enter option number:");
+        Console.WriteLine("1. Complete Q&A");
+        Console.WriteLine("2. Streaming Q&A");
+        Console.WriteLine("3. Streaming Chat");
+        Console.Write("> ");
+        int.TryParse(Console.ReadLine(), out scenario);
+
+        // Leave the menu as soon as a valid choice (1-3) was entered
+        if (scenario >= 1 && scenario <= 3)
+        {
+            break;
+        }
+        Console.WriteLine("Invalid option. Please try again.");
+    }
+
+    // Get prompt and run chosen scenario
+    switch (scenario)
+    {
+        case 1:
+            if (verbose) Console.WriteLine("Entering option 1\n");
+            ModelGenerate(model, tokenizer, generatorParamsArgs, modelPath, systemPrompt, userPrompt, interactive, verbose);
+            break;
+        case 2:
+            if (verbose) Console.WriteLine("Entering option 2\n");
+            ModelQA(model, tokenizer, tokenizerStream, generatorParamsArgs, guidanceArgs, modelPath, systemPrompt, userPrompt, interactive, verbose);
+            break;
+        default:
+            if (verbose) Console.WriteLine("Entering option 3\n");
+            ModelChat(model, tokenizer, tokenizerStream, generatorParamsArgs, guidanceArgs, modelPath, systemPrompt, userPrompt, interactive, rewind, verbose);
+            break;
+    }
+}
+
+using var ogaHandle = new OgaHandle();
+main(args);
diff --git a/examples/csharp/HelloPhi3V/HelloPhi3V.csproj b/examples/csharp/ModelMM/ModelMM.csproj
similarity index 57%
rename from examples/csharp/HelloPhi3V/HelloPhi3V.csproj
rename to examples/csharp/ModelMM/ModelMM.csproj
index 49ca4fb4db..272ed0d776 100644
--- a/examples/csharp/HelloPhi3V/HelloPhi3V.csproj
+++ b/examples/csharp/ModelMM/ModelMM.csproj
@@ -1,17 +1,21 @@
 ﻿<Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
+    <TargetFramework>net8.0</TargetFramework>
     <OutputType>Exe</OutputType>
-    <TargetFramework>net6.0</TargetFramework>
-    <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
-    <Configurations>Debug;Release;Debug_Cuda;Release_Cuda;Debug_DirectML;Release_DirectML</Configurations>
+    <ImplicitUsings>enable</ImplicitUsings>
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="System.CommandLine" Version="2.0.1" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <Compile Include="../Common/Common.cs" Link="Common/Common.cs" />
   </ItemGroup>
 
 </Project>
diff --git a/examples/csharp/HelloPhi4MM/HelloPhi4MM.sln b/examples/csharp/ModelMM/ModelMM.sln
similarity index 93%
rename from examples/csharp/HelloPhi4MM/HelloPhi4MM.sln
rename to examples/csharp/ModelMM/ModelMM.sln
index fd3fd547fb..6ae784b067 100644
--- a/examples/csharp/HelloPhi4MM/HelloPhi4MM.sln
+++ b/examples/csharp/ModelMM/ModelMM.sln
@@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 17
 VisualStudioVersion = 17.9.34902.65
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HelloPhi4MM", "HelloPhi4MM.csproj", "{A1EF39E6-2808-493D-8BED-BEAB3E5FC932}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ModelMM", "ModelMM.csproj", "{A1EF39E6-2808-493D-8BED-BEAB3E5FC932}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
diff --git a/examples/csharp/ModelMM/Program.cs b/examples/csharp/ModelMM/Program.cs
new file mode 100644
index 0000000000..e4bf1baf39
--- /dev/null
+++ b/examples/csharp/ModelMM/Program.cs
@@ -0,0 +1,425 @@
+﻿// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+using CommonUtils;
+using Microsoft.ML.OnnxRuntimeGenAI;
+using System.CommandLine;
+using System.Text.Json;
+
+/// <summary>
+/// Run a streaming multi-modal (text, images, audios) question-answering loop
+/// </summary>
+/// <param name="model">Model to use</param>
+/// <param name="tokenizer">Tokenizer to use</param>
+/// <param name="tokenizerStream">Tokenizer stream used to decode tokens as they are generated</param>
+/// <param name="processor">Processor that turns the prompt, images and audios into model inputs</param>
+/// <param name="generatorParamsArgs">Generator params arguments to use</param>
+/// <param name="guidanceArgs">Guidance arguments to use</param>
+/// <param name="imagePaths">File paths to images</param>
+/// <param name="audioPaths">File paths to audios</param>
+/// <param name="modelPath">Path to folder containing model</param>
+/// <param name="systemPrompt">System prompt to use with model</param>
+/// <param name="userPrompt">User prompt to use with model</param>
+/// <param name="interactive">Ask user or use pre-defined value; also keeps the loop running</param>
+/// <param name="verbose">Use verbose logging</param>
+/// <returns>
+/// None
+/// </returns>
+void ModelMM(
+    Model model,
+    Tokenizer tokenizer,
+    TokenizerStream tokenizerStream,
+    MultiModalProcessor processor,
+    GeneratorParamsArgs generatorParamsArgs,
+    GuidanceArgs guidanceArgs,
+    List<string> imagePaths,
+    List<string> audioPaths,
+    string modelPath,
+    string systemPrompt,
+    string userPrompt,
+    bool interactive,
+    bool verbose
+)
+{
+    // Running list of messages; it always starts with the system message
+    var system_message = new Dictionary<string, string>
+    {
+        { "role", "system" },
+        { "content", systemPrompt }
+    };
+    var input_list = new List<Dictionary<string, string>>() { system_message };
+
+    // Get and set guidance info if requested
+    string guidance_type = "";
+    string guidance_data = "";
+    string tools = "";
+    if (!string.IsNullOrEmpty(guidanceArgs.response_format))
+    {
+        Console.WriteLine("Make sure your tool call start id and tool call end id are marked as special in tokenizer.json");
+        (guidance_type, guidance_data, tools) = Common.GetGuidance(
+            response_format: guidanceArgs.response_format,
+            filepath: guidanceArgs.tools_file,
+            text_output: guidanceArgs.text_output,
+            tool_output: guidanceArgs.tool_output,
+            tool_call_start: guidanceArgs.tool_call_start,
+            tool_call_end: guidanceArgs.tool_call_end
+        );
+        input_list[0]["tools"] = tools;
+    }
+
+    // Streaming Q&A
+    do
+    {
+        // Get images; the using declaration releases them at the end of this turn
+        var (images, num_images) = Common.GetUserImages(imagePaths, interactive);
+        using var imagesScope = images;
+
+        // Get audios; the using declaration releases them at the end of this turn
+        var (audios, num_audios) = Common.GetUserAudios(audioPaths, interactive);
+        using var audiosScope = audios;
+
+        // Get user prompt
+        string text = Common.GetUserPrompt(userPrompt, interactive);
+        if (string.Compare(text, "quit()", StringComparison.OrdinalIgnoreCase) == 0)
+        {
+            break;
+        }
+
+        // Construct user content based on inputs
+        /**
+         * TODO: Uncomment the below snippet to use model.GetModelType() once
+         * the C# binding to Model.GetModelType() is in a stable package release.
+         */
+        //var user_content = Common.GetUserContent(model.GetModelType(), num_images, num_audios, text);
+        var user_content = Common.GetUserContent("phi4mm", num_images, num_audios, text);
+
+        // Add user message to list of messages
+        var user_message = new Dictionary<string, string>
+        {
+            { "role", "user" },
+            { "content", user_content }
+        };
+        input_list.Add(user_message);
+
+        // Set search options for generator params
+        using GeneratorParams generatorParams = new GeneratorParams(model);
+        Common.SetSearchOptions(generatorParams, generatorParamsArgs, verbose);
+
+        // Initialize guidance if requested
+        if (!string.IsNullOrEmpty(guidance_type) && !string.IsNullOrEmpty(guidance_data))
+        {
+            generatorParams.SetGuidance(guidance_type, guidance_data);
+            if (verbose)
+            {
+                Console.WriteLine();
+                Console.WriteLine($"Guidance type is: {guidance_type}");
+                Console.WriteLine($"Guidance data is: \n{guidance_data}");
+                Console.WriteLine();
+            }
+        }
+
+        // Create generator
+        using Generator generator = new Generator(model, generatorParams);
+        if (verbose) Console.WriteLine("Generator created");
+
+        // Apply chat template
+        string prompt = "";
+        try
+        {
+            string messages = JsonSerializer.Serialize(input_list);
+            prompt = Common.ApplyChatTemplate(modelPath, tokenizer, messages, add_generation_prompt: true, tools);
+        }
+        catch (Exception ex)
+        {
+            // Best-effort fallback to the raw user text, but report why the template failed
+            if (verbose) Console.WriteLine($"Chat template could not be applied: {ex.Message}");
+            prompt = text;
+        }
+        if (verbose) Console.WriteLine($"Prompt: {prompt}");
+
+        // Encode combined system + user prompt and append inputs to model
+        using var inputTensors = processor.ProcessImagesAndAudios(prompt, images, audios);
+        generator.SetInputs(inputTensors);
+
+        // Run generation loop
+        if (verbose) Console.WriteLine("Running generation loop...\n");
+        Console.Write("Output: ");
+        var watch = System.Diagnostics.Stopwatch.StartNew();
+        while (true)
+        {
+            generator.GenerateNextToken();
+            if (generator.IsDone())
+            {
+                break;
+            }
+            // Decode and print the next token
+            Console.Write(tokenizerStream.Decode(generator.GetNextTokens()[0]));
+        }
+        watch.Stop();
+        var runTimeInSeconds = watch.Elapsed.TotalSeconds;
+
+        // Remove user message so the next turn starts again from the system message only
+        input_list.RemoveAt(input_list.Count - 1);
+
+        // Display output and timings
+        var outputSequence = generator.GetSequence(0);
+        var totalTokens = outputSequence.Length;
+        Console.WriteLine();
+        Console.WriteLine($"Streaming Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
+        Console.WriteLine();
+
+    } while (interactive);
+}
+
+/// <summary>
+/// Build the command-line definition for the ModelMM example
+/// </summary>
+/// <returns>
+/// RootCommand object with all possible command-line arguments
+/// </returns>
+RootCommand GetArgs()
+{
+    var parser = new RootCommand("ModelMM Arguments");
+
+    var model_path = new Option<string>(
+        name: "model_path",
+        aliases: ["-m", "--model_path"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        Description = "Path to the model",
+        Required = true
+    };
+    model_path.Validators.Add(result =>
+    {
+        var value = result.GetValue(model_path);
+        if (string.IsNullOrEmpty(value))
+        {
+            result.AddError("Model path must be specified");
+        }
+        else if (!Path.Exists(value))
+        {
+            result.AddError("Path must be to a model folder on disk");
+        }
+    });
+
+    var execution_provider = new Option<string>(
+        name: "execution_provider",
+        aliases: ["-e", "--execution_provider"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "follow_config",
+        Description = "Execution provider to run the model"
+    };
+    execution_provider.Validators.Add(result =>
+    {
+        var value = result.GetValue(execution_provider);
+        if (string.IsNullOrEmpty(value))
+        {
+            result.AddError("Execution provider must be specified. Use 'follow_config' to not specify one.");
+        }
+    });
+
+    var verbose = new Option<bool>(
+        name: "verbose",
+        aliases: ["-v", "--verbose"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Print verbose output. Defaults to false"
+    };
+
+    var debug = new Option<bool>(
+        name: "debug",
+        aliases: ["-d", "--debug"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Dump input and output tensors with debug mode. Defaults to false"
+    };
+
+    var non_interactive = new Option<bool>(
+        name: "non_interactive",
+        aliases: ["--non_interactive"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Run in non-interactive mode"
+    };
+
+    var ep_path = new Option<string>(
+        name: "ep_path",
+        aliases: ["--ep_path"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "",
+        Description = "Path to execution provider DLL/SO for plug-in providers (ex: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll)"
+    };
+
+    var system_prompt = new Option<string>(
+        name: "system_prompt",
+        aliases: ["-sp", "--system_prompt"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "You are a helpful AI assistant.",
+        Description = "System prompt to use for the model."
+    };
+
+    var user_prompt = new Option<string>(
+        name: "user_prompt",
+        aliases: ["-up", "--user_prompt"]
+    )
+    {
+        Arity = ArgumentArity.ExactlyOne,
+        DefaultValueFactory = (_) => "What color is the sky?",
+        Description = "User prompt to use for the model."
+    };
+
+    var rewind = new Option<bool>(
+        name: "rewind",
+        aliases: ["-rw", "--rewind"]
+    )
+    {
+        Arity = ArgumentArity.Zero,
+        DefaultValueFactory = (_) => false,
+        Description = "Rewind to the system prompt after each generation. Defaults to false"
+    };
+
+    var image_paths = new Option<List<string>>(
+        name: "image_paths",
+        aliases: ["--image_paths"]
+    )
+    {
+        Arity = ArgumentArity.ZeroOrMore,
+        AllowMultipleArgumentsPerToken = true,
+        DefaultValueFactory = (_) => [],
+        Description = "File paths to the images"
+    };
+
+    var audio_paths = new Option<List<string>>(
+        name: "audio_paths",
+        aliases: ["--audio_paths"]
+    )
+    {
+        Arity = ArgumentArity.ZeroOrMore,
+        AllowMultipleArgumentsPerToken = true,
+        DefaultValueFactory = (_) => [],
+        Description = "File paths to the audios"
+    };
+
+    parser.Add(model_path);
+    parser.Add(execution_provider);
+    parser.Add(ep_path);
+    parser.Add(system_prompt);
+    parser.Add(user_prompt);
+    parser.Add(verbose);
+    parser.Add(debug);
+    parser.Add(non_interactive);
+    parser.Add(rewind);
+    parser.Add(image_paths);
+    parser.Add(audio_paths);
+
+    // Shared helpers receive the parser after the example-specific options are added
+    Common.GetGeneratorParamsArgs(parser);
+    Common.GetGuidanceArgs(parser);
+
+    return parser;
+}
+
+/// <summary>
+/// Parse the command line, load the model, tokenizer and processor, then run ModelMM
+/// </summary>
+/// <param name="args">Command-line arguments</param>
+/// <returns>
+/// None
+/// </returns>
+void main(string[] args) {
+    // Obtain and parse command-line arguments
+    RootCommand parser = GetArgs();
+    ParseResult parseResult = parser.Parse(args);
+    parseResult.Invoke(); // NOTE(review): presumably prints help / parse errors — confirm
+
+    // Stop when no arguments were given, parsing failed, or help was requested
+    if (args.Length < 1 || parseResult.Errors.Count > 0 || parseResult.Tokens.Any(t => t.Value is "-h" or "--help" or "-?"))
+    {
+        Console.WriteLine("Run this with -h/--help/-? to see which arguments you need to set.");
+        foreach (var error in parseResult.Errors)
+        {
+            Console.WriteLine("Error: " + error.Message);
+        }
+        // Exit early
+        return;
+    }
+
+    // Get main argument values
+    string modelPath = parseResult.GetValue<string>("model_path")!;
+    string executionProvider = parseResult.GetValue<string>("execution_provider")!;
+    string epPath = parseResult.GetValue<string>("ep_path")!; // Only printed below while RegisterEP stays commented out
+    string systemPrompt = parseResult.GetValue<string>("system_prompt")!;
+    string userPrompt = parseResult.GetValue<string>("user_prompt")!;
+    bool verbose = parseResult.GetValue<bool>("verbose");
+    bool debug = parseResult.GetValue<bool>("debug");
+    bool interactive = !parseResult.GetValue<bool>("non_interactive"); // Interactive unless --non_interactive is set
+    bool rewind = parseResult.GetValue<bool>("rewind"); // Only printed below; it is not passed to ModelMM
+    List<string> imagePaths = parseResult.GetValue<List<string>>("image_paths") ?? [];
+    List<string> audioPaths = parseResult.GetValue<List<string>>("audio_paths") ?? [];
+
+    var (generatorParamsArgs, guidanceArgs) = Common.SetGroupedArgs(parseResult);
+
+    // Print main argument values
+    Console.WriteLine("-----------------");
+    Console.WriteLine("Hello, ModelMM!");
+    Console.WriteLine("-----------------");
+
+    Console.WriteLine("Model path: " + modelPath);
+    Console.WriteLine("Execution provider: " + executionProvider);
+    if (!string.IsNullOrEmpty(epPath))
+    {
+        Console.WriteLine("Execution provider path: " + epPath);
+    }
+    Console.WriteLine("System prompt: " + systemPrompt);
+    if (!interactive)
+    {
+        Console.WriteLine("User prompt: " + userPrompt);
+    }
+    Console.WriteLine("Verbose: " + verbose);
+    Console.WriteLine("Debug: " + debug);
+    Console.WriteLine("Interactive: " + interactive);
+    Console.WriteLine("Rewind: " + rewind);
+    Console.WriteLine("-----------------");
+    Console.WriteLine();
+
+    // Enable debugging if requested
+    if (debug) Common.SetLogger();
+    /**
+     * TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once
+     * the C# binding to Utils.RegisterEPLibrary is in a stable package release.
+     */
+    // RegisterEP(executionProvider, epPath);
+
+    // Create model from a config built for the chosen path and execution provider
+    if (verbose) Console.WriteLine("Loading model...");
+    using Config config = Common.GetConfig(path: modelPath, ep: executionProvider, null, generatorParamsArgs);
+    using Model model = new Model(config);
+    if (verbose) Console.WriteLine("Model loaded");
+
+    // Create tokenizer and the stream used to decode generated tokens
+    using Tokenizer tokenizer = new Tokenizer(model);
+    using TokenizerStream tokenizerStream = tokenizer.CreateStream();
+    if (verbose) Console.WriteLine("Tokenizer created");
+
+    // Create the multi-modal processor
+    using MultiModalProcessor processor = new MultiModalProcessor(model);
+    if (verbose) Console.WriteLine("Processor created");
+
+    // Run the scenario with everything created above
+    if (verbose) Console.WriteLine("Entering model-mm\n");
+    ModelMM(model, tokenizer, tokenizerStream, processor, generatorParamsArgs, guidanceArgs, imagePaths, audioPaths, modelPath, systemPrompt, userPrompt, interactive, verbose);
+}
+
+using OgaHandle ogaHandle = new OgaHandle(); // NOTE(review): presumably shuts down the native GenAI library on dispose — confirm
+main(args); // Run the example with the process command-line arguments
diff --git a/examples/csharp/README.md b/examples/csharp/README.md
new file mode 100644
index 0000000000..0e32539546
--- /dev/null
+++ b/examples/csharp/README.md
@@ -0,0 +1,87 @@
+# ONNX Runtime GenAI C# Examples
+
+> 📝 **Note:** The examples from the main branch of this repository are compatible with the binaries built from the same commit. Therefore, if using the example from `main`, ONNX Runtime GenAI needs to be built from source. If this is your scenario, just build the library and the examples will be auto built along with the library. If this is not your scenario, please use prebuilt binaries from the release you're interested in and use the examples from the same version tag and follow the steps below.
+
+## Install ONNX Runtime GenAI
+
+Install the C# package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install) or [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html).
+
+## Download a Model
+
+There are many places to obtain a model. Please read through [our download options](https://github.com/microsoft/onnxruntime-genai/blob/main/docs/DownloadModels.md).
+
+## Build an Example
+
+```bash
+# Prerequisite: navigate to the C# examples folder
+cd examples/csharp/
+```
+
+ModelChat:
+
+```bash
+# Change `Release` to your desired target. This is just an example.
+dotnet build ModelChat -c Release
+```
+
+ModelMM:
+
+```bash
+# Change `Release` to your desired target. This is just an example.
+dotnet build ModelMM -c Release
+```
+
+## Run an Example
+
+1. On Windows:
+
+```powershell
+# Prerequisite: navigate to the compiled binaries. This is an example. Your navigation may change depending on your target.
+cd ./ModelChat/bin/Release/net8.0/
+
+# The `model-chat` script allows for multi-turn conversations.
+.\ModelChat.exe -m {path to model folder} -e {execution provider}
+```
+
+```powershell
+# Prerequisite: navigate to the compiled binaries. This is an example. Your navigation may change depending on your target.
+cd ./ModelMM/bin/Release/net8.0/
+
+# The `model-mm` script works for multi-modal models and streams the output text token by token.
+.\ModelMM.exe -m {path to model folder} -e {execution provider}
+```
+
+2. On Linux and macOS:
+
+```bash
+# Prerequisite: navigate to the compiled binaries. This is an example. Your navigation may change depending on your target.
+cd ./ModelChat/bin/Release/net8.0/
+
+# The `model-chat` script allows for multi-turn conversations.
+./ModelChat -m {path to model folder} -e {execution provider}
+```
+
+```bash
+# Prerequisite: navigate to the compiled binaries. This is an example. Your navigation may change depending on your target.
+cd ./ModelMM/bin/Release/net8.0/
+
+# The `model-mm` script works for multi-modal models and streams the output text token by token.
+./ModelMM -m {path to model folder} -e {execution provider}
+```
+
+## Tool Calling
+
+Please read through [our constrained decoding](https://github.com/microsoft/onnxruntime-genai/blob/main/docs/ConstrainedDecoding.md) options to learn more.
+
+Here are some examples of how you can run the C# examples with function/tool calling.
+
+```powershell
+# Using JSON Schema with only tool call output
+.\ModelChat.exe -m {path to model folder} -e {execution provider} --response_format json_schema --tools_file {path to json file} --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+
+# Using Lark Grammar with only tool call output
+.\ModelMM.exe -m {path to model folder} -e {execution provider} --response_format lark_grammar --tools_file {path to json file} --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+
+# Using Lark Grammar with text or tool call output
+.\ModelChat.exe -m {path to model folder} -e {execution provider} --response_format lark_grammar --tools_file {path to json file} --text_output --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+```
diff --git a/examples/python/README.md b/examples/python/README.md
index 286b1fe275..6802c498c6 100644
--- a/examples/python/README.md
+++ b/examples/python/README.md
@@ -1,62 +1,49 @@
 # ONNX Runtime GenAI Python Examples
 
-## Install ONNX Runtime GenAI
-
-Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install) or [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html).
-
-## Get the model
-
-You can generate the model using the model builder with this library, download the model from huggingface ([example](https://github.com/microsoft/onnxruntime-genai?tab=readme-ov-file#sample-code-for-phi-3-in-python)), or bring your own model.
-
-If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config).
+> 📝 **Note:** The examples from the main branch of this repository are compatible with the binaries built from the same commit. Therefore, if using the example from `main`, ONNX Runtime GenAI needs to be built from source. If this is your scenario, just build the library and the examples will be auto built along with the library. If this is not your scenario, please use prebuilt binaries from the release you're interested in and use the examples from the same version tag and follow the steps below.
 
-To generate the model with model builder:
-
-1. Install the model builder's dependencies
-
-   ```bash
-   pip install numpy transformers torch onnx onnxruntime
-   ```
-
-2. Choose a model. Examples of supported ones are listed on the repo's main [README](../../README.md).
+## Install ONNX Runtime GenAI
 
-3. Run the model builder to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md)
+Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install) or [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html).
 
-   ```bash
-   cd examples/python
-   python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu
-   ```
+## Download a Model
 
-## Run the example model script
+There are many places to obtain a model. Please read through [our download options](https://github.com/microsoft/onnxruntime-genai/blob/main/docs/DownloadModels.md).
 
-See accompanying qa-e2e-example.sh and generate-e2e-example.sh scripts for end-to-end examples of workflow.
+## Run an Example
 
-The `model-generate` script generates the output sequence all on one function call.
+```bash
+# The `model-chat` script allows for multi-turn conversations.
+python model-chat.py -m {path to model folder} -e {execution provider}
+```
 
-The `model-qa` script streams the output text token by token.
+```bash
+# The `model-generate` script generates the entire output sequence in one function call
+python model-generate.py -m {path to model folder} -e {execution provider}
+```
 
-To run the python examples...
 ```bash
-python model-generate.py -m {path to model folder} -e {execution provider} -pr {input prompt}
+# The `model-qa` script streams the output text token by token.
 python model-qa.py -m {path to model folder} -e {execution provider}
 ```
 
-## Use Constrained Decoding for the model output
+```bash
+# The `model-mm` script works for multi-modal models and streams the output text token by token.
+python model-mm.py -m {path to model folder} -e {execution provider}
+```
 
-Constrained Decoding is useful when using function/tool calling as it helps in ensuring the output is in the correct format.
+## Tool Calling
 
-We have integrated [LLGuidance](https://github.com/guidance-ai/llguidance) for constrained decoding. There are three types of constrained decoding enabled right now:
-1. Lark Grammar (Recommended): This option allows you to have an option for a regular output as well as function/tool output in JSON format.
-2. JSON Schema: Output will be JSON schema and it will be one of the function/tools provided.
-3. Regex: If a particular regular expression is desired.
+Please read through [our constrained decoding](https://github.com/microsoft/onnxruntime-genai/blob/main/docs/ConstrainedDecoding.md) options to learn more.
 
-To ensure that the function/tool call works correctly with constrained decoding, you need to modify your tokenizer.json file. For each model that has its own tool calling token, the tool calling token's `special` attribute needs to be set to true. For example, Phi-4 mini uses the <|tool_call|> token so you should set the `special` attribute for <|tool_call|> as `true` inside `tokenizer.json`.
+Here are some examples of how you can run the Python examples with function/tool calling.
 
-To run the Python examples with function/tool calling:
-```
-# Using Lark Grammar with 1 function/tool call
-python model-qa.py -m {path to model folder} -e {execution provider} --guidance_type "lark_grammar"  --guidance_info '[{"name": "get_weather", "description": "Get weather of a city.", "parameters": {"city": {"description": "The city for which weather information is requested", "type": "string", "default": "Dallas"}}}]'
+```bash
+# Using JSON Schema with only tool call output
+python model-qa.py -m {path to model folder} -e {execution provider} --response_format json_schema --tools_file {path to json file} --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
 
-# With 2 function/tool calls in chat mode
-python model-chat.py -m {path to model folder} -e {execution provider} --guidance_type "lark_grammar"  --guidance_info '[{"name": "get_weather", "description": "Get weather of a city.", "parameters": {"city": {"description": "The city for which weather information is requested", "type": "string", "default": "Dallas"}}},{"name": "get_population", "description": "Get population of a city.", "parameters": {"city": {"description": "The city for which population information is requested", "type": "string", "default": "Dallas"}}}]'
-```
+# Using Lark Grammar with only tool call output
+python model-mm.py -m {path to model folder} -e {execution provider} --response_format lark_grammar --tools_file {path to json file} --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+
+# Using Lark Grammar with text or tool call output
+python model-chat.py -m {path to model folder} -e {execution provider} --response_format lark_grammar --tools_file {path to json file} --text_output --tool_output --tool_call_start "{starting tool call token}" --tool_call_end "{ending tool call token}"
+```
diff --git a/examples/python/awq-quantized-model.py b/examples/python/awq-quantized-model.py
index 465935e264..41a95c63af 100644
--- a/examples/python/awq-quantized-model.py
+++ b/examples/python/awq-quantized-model.py
@@ -78,7 +78,7 @@ def run_model(args):
     model = og.Model(config)
     print("Model loaded")
     tokenizer = og.Tokenizer(model)
-    tokenizer_stream = tokenizer.create_stream()
+    stream = tokenizer.create_stream()
 
     # Override any default search options in `genai_config.json`
     search_options = {
@@ -111,7 +111,7 @@ def run_model(args):
             while not generator.is_done():
                 generator.generate_next_token()
                 new_token = generator.get_next_tokens()[0]
-                print(tokenizer_stream.decode(new_token), end="", flush=True)
+                print(stream.decode(new_token), end="", flush=True)
         except KeyboardInterrupt:
             print("  --control+c pressed, aborting generation--")
         print()
diff --git a/examples/python/common.py b/examples/python/common.py
new file mode 100644
index 0000000000..887f30609c
--- /dev/null
+++ b/examples/python/common.py
@@ -0,0 +1,548 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import argparse
+import json
+import os
+import onnxruntime_genai as og
+
+from dataclasses import dataclass, asdict
+from typing import Any
+
+def set_logger(inputs: bool = True, outputs: bool = True) -> None:
+    """
+    Enable ORT GenAI logging and choose which model values get dumped
+
+    Args:
+        inputs (bool): print the model's input values to the console
+        outputs (bool): print the model's output values to the console
+    """
+    # Logging itself is always switched on; the two flags only pick what is dumped
+    log_options = {"enabled": True, "model_input_values": inputs, "model_output_values": outputs}
+    og.set_log_options(**log_options)
+
+def register_ep(ep: str, ep_path: str, use_winml: bool) -> None:
+    """
+    Register an execution provider from a plug-in library path or via Windows ML
+
+    Args:
+        ep (str): Name of execution provider
+        ep_path (str): Path to execution provider library to register (not needed for Windows ML)
+        use_winml (bool): Use Windows ML to register execution providers
+    Returns:
+        None
+    """
+    if not ep_path and not use_winml:
+        return  # Neither a library path nor Windows ML was requested, skip registration
+
+    print(f"Registering execution provider: {ep}")
+
+    if use_winml:
+        # Requires winml.py file, modified from: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing
+        try:
+            import winml
+            print(winml.register_execution_providers(ort=False, ort_genai=True))
+        except ImportError:
+            print("WinML not available, using default execution providers")
+            return  # Nothing was registered, so do not report success
+        except Exception as e:
+            print(f"Failed to register WinML execution providers: {e}")
+            return  # Nothing was registered, so do not report success
+    elif ep == "cuda":
+        og.register_execution_provider_library("CUDAExecutionProvider", ep_path)
+    elif ep == "NvTensorRtRtx":
+        og.register_execution_provider_library("NvTensorRTRTXExecutionProvider", ep_path)
+    else:
+        print(f"Warning: EP registration not supported for {ep}")
+        print("Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries. Use Windows ML via '--use_winml' to register EPs.")
+        return
+    print(f"Registered {ep} successfully!")
+
+def get_config(path: str, ep: str, ep_options: dict[str, str] = {}, search_options: dict[str, int] = {}) -> og.Config:
+    """
+    Build an og.Config for a model folder with the requested EP and search overrides applied
+
+    Args:
+        path (str): folder that holds the model and its GenAI config
+        ep (str): execution provider to use ('follow_config' keeps the providers stored in the config)
+        ep_options (dict[str, str]): provider option names mapped to their values
+        search_options (dict[str, int]): search option names mapped to their values
+    Returns:
+        og.Config: config object with the provider and search overrides set
+    """
+    config = og.Config(path)
+
+    # 'follow_config' keeps whatever providers the GenAI config already lists.
+    # Any other value replaces them: the list is cleared and, unless CPU was asked for, `ep` is appended.
+    overriding_ep = ep != "follow_config"
+    if overriding_ep:
+        config.clear_providers()
+        if ep != "cpu":
+            print(f"Setting model to {ep}")
+            config.append_provider(ep)
+
+        # Forward every EP option; CUDA graph is forced off under beam search (num_beams > 1)
+        # because beam search needs past_present_share_buffer to be false
+        for name, value in ep_options.items():
+            force_off = name == "enable_cuda_graph" and ep in {"cuda", "NvTensorRtRtx"} and search_options.get("num_beams", 1) > 1
+            config.set_provider_option(ep, name, "0" if force_off else value)
+
+    # A chunk_size of 0 means chunking is disabled, so drop the key (this edits the caller's dict)
+    if search_options.get("chunk_size") == 0:
+        search_options.pop("chunk_size")
+
+    # Search options that must be known before the og.Model is constructed go into the config;
+    # the rest can still be applied later with params.set_search_options(**search_options)
+    search_overlay = json.dumps({"search": search_options})
+    config.overlay(search_overlay)
+
+    return config
+
+def get_search_options(args: argparse.Namespace):
+    """
+    Collect the search options present on the parsed arguments for a generator's params
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+    Returns:
+        dict[str, Any]: search option names mapped to the values found on `args`
+    """
+    # Search options that are looked up on the namespace, in this order
+    supported = (
+        "batch_size",
+        "do_sample",
+        "max_length",
+        "min_length",
+        "num_beams",
+        "num_return_sequences",
+        "repetition_penalty",
+        "temperature",
+        "top_k",
+        "top_p",
+    )
+
+    # Copy only the attributes that actually exist on the namespace
+    options = {key: getattr(args, key) for key in supported if key in args}
+
+    # A missing batch size falls back to 1
+    options.setdefault("batch_size", 1)
+    return options
+
+def apply_chat_template(model_path: str, tokenizer: og.Tokenizer, messages: str, add_generation_prompt: bool, tools: str = "") -> str:
+    """
+    Render chat messages into a prompt, using the model folder's chat_template.jinja when present
+
+    Args:
+        model_path (str): folder containing the model (and optionally chat_template.jinja)
+        tokenizer (og.Tokenizer): tokenizer whose apply_chat_template does the rendering
+        messages (str): string-encoded list of chat messages
+        add_generation_prompt (bool): add tokens to indicate the start of the AI's response
+        tools (str): string-encoded list of tools
+    Returns:
+        str: prompt to encode
+    """
+    # An empty template string is passed along when the folder has no chat_template.jinja
+    template_text = ""
+    template_file = os.path.join(model_path, "chat_template.jinja")
+    if os.path.exists(template_file):
+        with open(template_file, encoding="utf-8") as handle:
+            template_text = handle.read()
+    return tokenizer.apply_chat_template(
+        messages=messages, tools=tools, add_generation_prompt=add_generation_prompt, template_str=template_text
+    )
+
+def get_user_prompt(prompt: str, non_interactive: bool) -> str:
+    """
+    Get prompt for 'user' role in chat template
+
+    Args:
+        prompt (str): provided prompt
+        non_interactive (bool): non-interactive mode (uses either provided prompt or default)
+    Returns:
+        str: prompt to encode
+    Raises:
+        ValueError: if non-interactive mode is on and the provided prompt is empty
+    """
+    if non_interactive:
+        # The provided prompt never changes between attempts, so re-checking an
+        # empty one in a loop would never terminate; fail fast instead
+        if not prompt:
+            raise ValueError("Error, input cannot be empty")
+        return prompt
+
+    # Interactive mode: keep asking until the user enters a non-empty prompt
+    while True:
+        text = input("Prompt (Use quit() to exit): ")
+        if text:
+            return text
+
+        # Empty input: report it and ask again
+        print("Error, input cannot be empty")
+
+def get_user_media_paths(media_paths: list[str], non_interactive: bool, media_type: str) -> list[str]:
+    """
+    Resolve the media files to use, prompting for them only in interactive mode
+
+    Args:
+        media_paths (list[str]): paths supplied up front (take priority over prompting)
+        non_interactive (bool): when true, never prompt; only the supplied paths are used
+        media_type (str): either 'image' or 'audio' (case-insensitive)
+    Returns:
+        list[str]: the non-empty paths, each verified to exist
+    """
+    kind = media_type.lower()
+    assert kind in {"audio", "image"}, "Media type must be 'image' or 'audio'"
+    label = kind.capitalize()
+
+    if media_paths:
+        candidates = media_paths
+    elif non_interactive:
+        candidates = []
+    else:
+        answer = input(f"{label} Path (comma separated; leave empty if no {kind}): ")
+        candidates = [piece.strip() for piece in answer.split(",")]
+
+    # Skip blank entries and make sure every remaining path exists
+    existing = []
+    for candidate in candidates:
+        if not candidate:
+            continue
+        if not os.path.exists(candidate):
+            raise FileNotFoundError(f"{label} file not found: {candidate}")
+        print(f"Using {kind}: {candidate}")
+        existing.append(candidate)
+    return existing
+
+def get_user_images(image_paths: list[str], non_interactive: bool) -> tuple[og.Images, int]:
+    """
+    Load the user's images, if any were supplied or entered
+
+    Args:
+        image_paths (list[str]): image paths supplied up front
+        non_interactive (bool): non-interactive mode (no prompting for image paths)
+    Returns:
+        (og.Images, int): (loaded images, image count), or (None, 0) when there are none
+    """
+    # The shared helper validates the paths (and prompts when interactive)
+    image_files = get_user_media_paths(image_paths, non_interactive, "image")
+    if image_files:
+        return og.Images.open(*image_files), len(image_files)
+
+    # Nothing to load
+    print("No image provided")
+    return None, 0
+
+def get_user_audios(audio_paths: list[str], non_interactive: bool) -> tuple[og.Audios, int]:
+    """
+    Load the user's audio clips, if any were supplied or entered
+
+    Args:
+        audio_paths (list[str]): audio paths supplied up front
+        non_interactive (bool): non-interactive mode (no prompting for audio paths)
+    Returns:
+        (og.Audios, int): (loaded audios, audio count), or (None, 0) when there are none
+    """
+    # The shared helper validates the paths (and prompts when interactive)
+    audio_files = get_user_media_paths(audio_paths, non_interactive, "audio")
+    if audio_files:
+        return og.Audios.open(*audio_files), len(audio_files)
+
+    # Nothing to load
+    print("No audio provided")
+    return None, 0
+
+def get_user_content(model_type: str, num_images: int, num_audios: int, prompt: str) -> str | list[dict[str, str]]:
+    """
+    Combine media placeholders and the prompt into the 'user' content the model type expects
+
+    Args:
+        model_type (str): model type inside ORT GenAI
+        num_images (int): how many image placeholders to emit
+        num_audios (int): how many audio placeholders to emit (only used for 'phi4mm')
+        prompt (str): user prompt, always placed after the placeholders
+    Returns:
+        str | list[dict[str, str]]: a tagged string, or a structured list for all other model types
+    """
+    # Phi-3 vision, Phi-3.5 vision and Phi-4 multimodal use numbered, newline-terminated tags
+    if model_type in {"phi3v", "phi4mm"}:
+        numbered_images = "".join(f"<|image_{idx}|>\n" for idx in range(1, num_images + 1))
+        if model_type == "phi3v":
+            # Vision-only models take no audio tags
+            return numbered_images + prompt
+
+        # Phi-4 multimodal also takes audio tags, placed after the image tags
+        numbered_audios = "".join(f"<|audio_{idx}|>\n" for idx in range(1, num_audios + 1))
+        return numbered_images + numbered_audios + prompt
+
+    # Qwen-2.5 VL and Fara repeat one fixed placeholder per image
+    if model_type in {"qwen2_5_vl", "fara"}:
+        placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+        return placeholder * num_images + prompt
+
+    # Everything else (Gemma-3 style) gets structured content: one entry per image, then the text
+    parts = [{"type": "image"} for _ in range(num_images)]
+    parts.append({"type": "text", "text": prompt})
+    return parts
+
+@dataclass
+class ToolSchema:
+    """
+    JSON-schema description of one tool call, as produced by tools_to_schemas
+    """
+    description: str  # the tool's function description
+    type: str  # tools_to_schemas always sets this to "object"
+    properties: dict[str, Any]  # "name" pinned via "const", plus "parameters" when the tool declares any
+    required: list[str]  # ["name"] or ["name", "parameters"]
+    additionalProperties: bool  # tools_to_schemas always sets this to False
+
+@dataclass
+class JsonSchema:
+    """
+    Top-level JSON schema used for guidance, as assembled by get_json_schema
+    """
+    x_guidance: dict[str, Any]  # guidance settings; get_json_schema serializes this key as "x-guidance"
+    type: str  # get_json_schema sets this to "array"
+    items: dict[str, list[ToolSchema]]  # get_json_schema stores the tool schemas under "anyOf"
+    minItems: int  # int(tool_output): 1 when tool_output is true, else 0
+
+@dataclass
+class FunctionDefinition:
+    """
+    The "function" part of a tool defined in an OpenAI-compatible way
+    """
+    name: str  # function name; tools_to_schemas pins it as a "const" in the schema
+    description: str  # copied into ToolSchema.description by tools_to_schemas
+    parameters: dict[str, Any]  # read for "type", "properties" and "required"; {} means no parameters
+
+@dataclass
+class Tool:
+    """
+    A tool defined in an OpenAI-compatible way: a type tag plus the function it exposes
+    """
+    type: str  # to_tool always sets this to "function"
+    function: FunctionDefinition  # the function's name, description and parameters
+
+def tools_to_schemas(tools: list[Tool]) -> list[ToolSchema]:
+    """
+    Turn each OpenAI-compatible tool into the JSON schema of a call to that tool
+
+    Args:
+        tools (list[Tool]): list of OpenAI-compatible tools
+    Returns:
+        list[ToolSchema]: one JSON schema compatible tool per input tool, in the same order
+    """
+    converted = []
+    for tool in tools:
+        fn = tool.function
+        # Pin the call's "name" to this tool's function name
+        props = {"name": {"const": fn.name}}
+        required_keys = ["name"]
+        # Parameters only become part of the schema when the tool declares some
+        if fn.parameters != {}:
+            props["parameters"] = {
+                "type": fn.parameters.get("type", "object"),
+                "properties": fn.parameters.get("properties", {}),
+                "required": fn.parameters.get("required", []),
+            }
+            required_keys.append("parameters")
+
+        converted.append(
+            ToolSchema(
+                description=fn.description, type="object", properties=props,
+                required=required_keys, additionalProperties=False,
+            )
+        )
+    return converted
+
+def get_json_schema(tools: list[Tool], tool_output: bool) -> str:
+    """
+    Build the guidance JSON schema describing an array of calls to the given tools
+
+    Args:
+        tools (list[Tool]): list of OpenAI-compatible tools
+        tool_output (bool): output can have a tool call (becomes minItems: 1 if true, else 0)
+    Returns:
+        str: JSON schema as a JSON-compatible string
+    """
+    settings = {"whitespace_flexible": False, "key_separator": ": ", "item_separator": ", "}
+    schema = JsonSchema(x_guidance=settings, type="array", items={"anyOf": tools_to_schemas(tools)}, minItems=int(tool_output))
+    # A dataclass field cannot be spelled with '-', so x_guidance is renamed to "x-guidance" when serializing
+    renamed = ((name.replace("x_guidance", "x-guidance"), value) for name, value in asdict(schema).items())
+    return json.dumps(dict(renamed))
+
+def get_lark_grammar(
+    tools: list[Tool],
+    text_output: bool,
+    tool_output: bool,
+    tool_call_start: str,
+    tool_call_end: str,
+) -> str:
+    """
+    Assemble a LARK grammar that allows free text, a tool call, or either one
+
+    Args:
+        tools (list[Tool]): list of OpenAI-compatible tools
+        text_output (bool): output can have text
+        tool_output (bool): output can have a tool call
+        tool_call_start (str): string representation of tool call starting token (e.g. <tool_call>)
+        tool_call_end (str): string representation of tool call ending token (e.g. </tool_call>)
+    Returns:
+        str: LARK grammar as a string, one rule per line
+    """
+    if not (text_output or tool_output):
+        raise Exception("At least one of 'text_output' and 'tool_output' must be true")
+
+    # The delimited 'toolcall' rule is only usable when both delimiter tokens are known;
+    # otherwise the start rule points at the bare 'functioncall' rule
+    has_delimiters = tool_call_start != "" and tool_call_end != ""
+    call_rule = "toolcall" if has_delimiters else "functioncall"
+
+    # start: every kind of output that is allowed, separated by '|'
+    alternatives = []
+    if text_output:
+        alternatives.append("TEXT")
+    if tool_output:
+        alternatives.append(call_rule)
+    grammar = ["start: " + " | ".join(alternatives)]
+
+    if text_output:
+        # Free text may start with anything except '{' or '<'
+        grammar.append("TEXT: /[^{<](.|\\n)*/")
+
+    if tool_output:
+        if has_delimiters:
+            grammar.append(f"toolcall: {tool_call_start} functioncall {tool_call_end}")
+        # The function call itself is constrained by the JSON schema built from the tools
+        schema = get_json_schema(tools=tools, tool_output=tool_output)
+        grammar.append(f"functioncall: %json {schema}")
+    return "\n".join(grammar)
+
+def to_tool(tool_defs: list[dict[str, Any]]) -> list[Tool]:
+    """
+    Convert a JSON-deserialized object of tools to a list of Tool objects
+
+    Args:
+        tool_defs (list[dict[str, Any]]): JSON-deserialized object containing OpenAI-compatible tool definitions
+    Returns:
+        list[Tool]: list of Tool objects (missing descriptions become "", missing parameters become {})
+    """
+    tools = []
+    for tool_def in tool_defs:
+        func = FunctionDefinition(
+            name=tool_def["function"]["name"],
+            # Tolerate a missing description/parameters; tools_to_schemas treats {} as "no parameters"
+            description=tool_def["function"].get("description", ""),
+            parameters=tool_def["function"].get("parameters", {}),
+        )
+        tools.append(Tool(type="function", function=func))
+    return tools
+
+def get_guidance(
+    response_format: str = "",
+    filepath: str = "",
+    tools_str: str = "",
+    tools: list[dict[str, Any] | Tool] = [],
+    text_output: bool = True,
+    tool_output: bool = False,
+    tool_call_start: str = "",
+    tool_call_end: str = "",
+) -> tuple[str, str, str]:
+    """
+    Create a grammar to use with LLGuidance
+
+    Args:
+        response_format (str): type of format requested
+        filepath (str): path to file containing OpenAI-compatible tool definitions
+        tools_str (str): JSON-serialized string containing OpenAI-compatible tool definitions
+        tools (list[dict[str, Any] | Tool]): list of OpenAI-compatible tools defined in memory
+        text_output (bool): output can have text
+        tool_output (bool): output can have a tool call
+        tool_call_start (str): string representation of tool call starting token (e.g. <tool_call>)
+        tool_call_end (str): string representation of tool call ending token (e.g. </tool_call>)
+    Returns:
+        (str, str, str): (grammar type, grammar data, tools) as a tuple of strings
+    Raises:
+        ValueError: if tool_output is set but no tools can be obtained, or the response format is invalid
+    """
+    guidance_type, guidance_data = "", ""
+
+    # Get list of tools from a range of sources (filepath, JSON-serialized string, in-memory)
+    if tool_output:
+        if os.path.exists(filepath):
+            # Tools are provided as a file; read it as UTF-8 rather than the platform's locale encoding
+            with open(filepath, 'r', encoding="utf-8") as f:
+                tool_defs = json.load(f)
+                tools = to_tool(tool_defs)
+        elif tools_str != "":
+            # If tools are provided as a JSON-serialized string
+            try:
+                tool_defs = json.loads(tools_str)
+                tools = to_tool(tool_defs)
+            except json.JSONDecodeError as err:
+                raise ValueError("Invalid JSON format for tools list. Format must be a JSON-serialized string.") from err
+        elif len(tools) > 0:
+            if not isinstance(tools[0], Tool):
+                tools = to_tool(tools)
+        else:
+            raise ValueError("Please provide the list of tools through a file, JSON-serialized string, or a list of tools")
+
+        assert len(tools) > 0, "Could not obtain a list of tools in memory"
+
+    # Create guidance based on user-provided response format
+    if response_format in {"text", "lark_grammar"}:
+        if response_format == "text":
+            assert text_output and not tool_output, "A response format of 'text' requires text_output = True and tool_output = False"
+        guidance_type = "lark_grammar"
+        guidance_data = get_lark_grammar(
+            tools=tools,
+            text_output=text_output,
+            tool_output=tool_output,
+            tool_call_start=tool_call_start,
+            tool_call_end=tool_call_end,
+        )
+    elif response_format in {"json_schema", "json_object"}:
+        assert tool_output and not text_output, "A response format of 'json_schema' or 'json_object' requires text_output = False and tool_output = True"
+        guidance_type = "json_schema"
+        guidance_data = get_json_schema(tools=tools, tool_output=tool_output)
+    else:
+        raise ValueError("Invalid response format provided")
+    # In-memory tools are only converted above when tool_output is set, so plain dicts may still be present here
+    return guidance_type, guidance_data, json.dumps([asdict(tool) if isinstance(tool, Tool) else tool for tool in tools])
+
+def get_generator_params_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Register the "Generator Params" command-line options on an existing parser
+
+    Args:
+        parser (argparse.ArgumentParser): parser that receives the new argument group
+    Returns:
+        None
+    """
+    group = parser.add_argument_group("Generator Params")
+    group.add_argument("-c", "--chunk_size", type=int, default=0, help="Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)")
+    group.add_argument("-s", "--do_sample", action="store_true", help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false")
+    group.add_argument("-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt")
+    group.add_argument("-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt")
+    group.add_argument("-b", "--num_beams", type=int, default=1, help="Number of beams to create")
+    group.add_argument("-rs", "--num_return_sequences", type=int, default=1, help="Number of return sequences to produce")
+    group.add_argument("-r", "--repetition_penalty", type=float, help="Repetition penalty to sample with")
+    group.add_argument("-t", "--temperature", type=float, help="Temperature to sample with")
+    group.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from")
+    group.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with")
+
+def get_guidance_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Register the "Guidance Arguments" command-line options on an existing parser
+
+    Args:
+        parser (argparse.ArgumentParser): parser that receives the new argument group
+    Returns:
+        None
+    """
+    group = parser.add_argument_group("Guidance Arguments")
+    group.add_argument("-rf", "--response_format", type=str, default="", choices=["", "text", "json_object", "json_schema", "lark_grammar"], help="Provide response format for the model")
+    group.add_argument("-tf", "--tools_file", type=str, default="", help="Path to file containing list of OpenAI-compatible tool definitions. Ex: test/test_models/tool-definitions/weather.json")
+    group.add_argument("-text", "--text_output", action="store_true", default=False, help="Produce a text response in the output")
+    group.add_argument("-tool", "--tool_output", action="store_true", default=False, help="Produce a tool call in the output")
+    group.add_argument("-tcs", "--tool_call_start", type=str, default="", help="String representation of tool call start (ex: <|tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.")
+    group.add_argument("-tce", "--tool_call_end", type=str, default="", help="String representation of tool call end (ex: <|/tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.")
diff --git a/examples/python/gemma-3-vision-tutorial.md b/examples/python/gemma-3-vision-tutorial.md
index 5b235f9b47..5e04b89b56 100644
--- a/examples/python/gemma-3-vision-tutorial.md
+++ b/examples/python/gemma-3-vision-tutorial.md
@@ -144,20 +144,20 @@ Currently, both JSON files needed to run with ONNX Runtime GenAI are created by
 
 ## 4. Run Gemma-3 vision ONNX models
 
-[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-vision.py) is an example of how you can run your Gemma-3 vision model with ONNX Runtime GenAI.
+[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-mm.py) is an example of how you can run your Gemma-3 vision model with ONNX Runtime GenAI.
 
 ### CPU
 ```bash
-$ python model-vision.py -m ./gemma3-vision-it/cpu -e cpu
+$ python model-mm.py -m ./gemma3-vision-it/cpu -e cpu
 ```
 
 ### CUDA
 ```bash
-$ python model-vision.py -m ./gemma3-vision-it/cuda -e cuda
+$ python model-mm.py -m ./gemma3-vision-it/cuda -e cuda
 ```
 
 ### DirectML
 
 ```bash
-$ python model-vision.py -m ./gemma3-vision-it/dml -e dml
+$ python model-mm.py -m ./gemma3-vision-it/dml -e dml
 ```
diff --git a/examples/python/generate-e2e-example.sh b/examples/python/generate-e2e-example.sh
deleted file mode 100755
index e4ab9b3d52..0000000000
--- a/examples/python/generate-e2e-example.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-# Description: Example of generate end-to-end usage, including model building and running
-pip install numpy transformers torch onnx onnxruntime
-python3 -m onnxruntime_genai.models.builder -m microsoft/phi-2 -o genai_models/phi2-int4-cpu -p int4 -e cpu -c hf_cache
-python3 model-generate.py -m genai_models/phi2-int4-cpu -e cpu -pr "my favorite movie is" "write a function that always returns True" "I am very happy" -p 0.0 -k 1 -v 
diff --git a/examples/python/guidance-example.py b/examples/python/guidance-example.py
index 3d9a068783..9b144daa04 100644
--- a/examples/python/guidance-example.py
+++ b/examples/python/guidance-example.py
@@ -22,7 +22,7 @@ def main(args):
     model = og.Model(config)
 
     tokenizer = og.Tokenizer(model)
-    tokenizer_stream = tokenizer.create_stream()
+    stream = tokenizer.create_stream()
 
     search_options = {
         name: getattr(args, name)
@@ -57,13 +57,13 @@ def main(args):
 
         # NOTE: since get_next_tokens returns only the last token, we'll need to use get_sequence instead
         # new_tokens = generator.get_next_tokens()[0]
-        # print(tokenizer_stream.decode(new_tokens), end='', flush=True)
+        # print(stream.decode(new_tokens), end='', flush=True)
 
         seq = generator.get_sequence(0)
         new_tokens = seq[prev_len:]
         seq_str = ""
         for token in new_tokens:
-            seq_str += tokenizer_stream.decode(token)
+            seq_str += stream.decode(token)
         print(seq_str, end="", flush=True)
         prev_len = len(seq)
         full_seq_str += seq_str
diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py
index e77ad54380..9d27d70dc6 100644
--- a/examples/python/model-chat.py
+++ b/examples/python/model-chat.py
@@ -3,202 +3,83 @@
 
 import argparse
 import json
-import os
 import time
 
 import onnxruntime_genai as og
-
-
-def get_tools_list(input_tools):
-    # input_tools format: '[{"name": "fn1", "description": "fn details", "parameters": {"p1": {"description": "details", "type": "string"}}},
-    # {"fn2": 2},{"fn3": 3}]'
-    tools_list = []
-    try:
-        tools_list = json.loads(input_tools)
-    except json.JSONDecodeError:
-        raise ValueError('Invalid JSON format for tools list, expected format: \'[{"name": "fn1"},{"name": "fn2"}]\'')
-    if len(tools_list) == 0:
-        raise ValueError("Tools list cannot be empty")
-    return tools_list
-
-
-def create_prompt_tool_input(tools_list):
-    tool_input = str(tools_list[0])
-    for tool in tools_list[1:]:
-        tool_input += "," + str(tool)
-    return tool_input
-
-
-def get_json_grammar(input_tools):
-    tools_list = get_tools_list(input_tools)
-    prompt_tool_input = create_prompt_tool_input(tools_list)
-    if len(tools_list) == 1:
-        return prompt_tool_input, json.dumps(tools_list[0])
-    else:
-        output = '{ "anyOf": [' + json.dumps(tools_list[0])
-        for tool in tools_list[1:]:
-            output += "," + json.dumps(tool)
-        output += "] }"
-        return prompt_tool_input, output
-
-
-def get_lark_grammar(input_tools):
-    tools_list = get_tools_list(input_tools)
-    prompt_tool_input = create_prompt_tool_input(tools_list)
-    if len(tools_list) == 1:
-        # output = ("start: TEXT | fun_call\n" "TEXT: /[^{](.|\\n)*/\n" " fun_call: <|tool_call|> %json " + json.dumps(tools_list[0]))
-        output = "start: TEXT | fun_call\nTEXT: /[^{](.|\\n)*/\n fun_call: <|tool_call|> %json " + json.dumps(
-            convert_tool_to_grammar_input(tools_list[0])
-        )
-        return prompt_tool_input, output
-    else:
-        return (
-            prompt_tool_input,
-            'start: TEXT | fun_call \n TEXT: /[^{](.|\n)*/ \n fun_call: <|tool_call|> %json {"anyOf": ['
-            + ",".join([json.dumps(tool) for tool in tools_list])
-            + "]}",
-        )
-
-
-def convert_tool_to_grammar_input(tool):
-    param_props = {}
-    required_params = []
-    for param_name, param_info in tool.get("parameters", {}).items():
-        param_props[param_name] = {
-            "type": param_info.get("type", "string"),
-            "description": param_info.get("description", ""),
-        }
-        required_params.append(param_name)
-    output_schema = {
-        "description": tool.get("description", ""),
-        "type": "object",
-        "required": ["name", "parameters"],
-        "additionalProperties": False,
-        "properties": {
-            "name": {"const": tool["name"]},
-            "parameters": {
-                "type": "object",
-                "properties": param_props,
-                "required": required_params,
-                "additionalProperties": False,
-            },
-        },
-    }
-    if len(param_props) == 0:
-        output_schema["required"] = ["name"]
-    return output_schema
+from common import apply_chat_template, get_config, get_generator_params_args, get_guidance, get_guidance_args, get_search_options, register_ep, set_logger
 
 
 def main(args):
+    if args.debug:
+        set_logger()
+    register_ep(args.execution_provider, args.ep_path, args.use_winml)
+
     if args.verbose:
         print("Loading model...")
-    if args.timings:
-        started_timestamp = 0
-        first_token_timestamp = 0
-
-    # Register execution provider library if specified (for plug-in providers)
-    if args.ep_library_path:
-        if args.verbose:
-            print(f"Registering execution provider library: {args.ep_library_path}")
-
-        # Determine the provider registration name based on execution provider
-        provider_registration_name = None
-        if args.execution_provider == "cuda":
-            provider_registration_name = "CUDAExecutionProvider"
-        elif args.execution_provider == "NvTensorRtRtx":
-            provider_registration_name = "NvTensorRTRTXExecutionProvider"
-        else:
-            raise ValueError(
-                f"Provider library registration not supported for '{args.execution_provider}'. Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries."
-            )
-
-        og.register_execution_provider_library(provider_registration_name, args.ep_library_path)
-        if args.verbose:
-            print(f"Successfully registered {provider_registration_name} from {args.ep_library_path}")
 
-    config = og.Config(args.model_path)
-    if args.execution_provider != "follow_config":
-        config.clear_providers()
-        if args.execution_provider != "cpu":
-            if args.verbose:
-                print(f"Setting model to {args.execution_provider}")
-            config.append_provider(args.execution_provider)
+    # Create model
+    config = get_config(args.model_path, args.execution_provider)
     model = og.Model(config)
-
     if args.verbose:
         print("Model loaded")
 
+    # Create tokenizer
     tokenizer = og.Tokenizer(model)
-    tokenizer_stream = tokenizer.create_stream()
+    stream = tokenizer.create_stream()
     if args.verbose:
         print("Tokenizer created")
-    if args.verbose:
-        print()
-
-    search_options = {
-        name: getattr(args, name)
-        for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"]
-        if name in args
-    }
-    search_options["batch_size"] = 1
-
-    if args.verbose:
-        print(search_options)
-
-    system_prompt = args.system_prompt
-    guidance_type = ""
-    prompt_tool_input = ""
-    guidance_input = ""
-    if args.guidance_type != "none":
-        guidance_type = args.guidance_type
-        if not args.guidance_info:
-            raise ValueError("Guidance information is required if guidance type is provided")
-        if guidance_type == "json_schema" or guidance_type == "lark_grammar":
-            tools_list = args.guidance_info
-            if guidance_type == "json_schema":
-                prompt_tool_input, guidance_input = get_json_grammar(tools_list)
-            elif guidance_type == "lark_grammar":
-                prompt_tool_input, guidance_input = get_lark_grammar(tools_list)
-        elif guidance_type == "regex":
-            guidance_input = args.guidance_info
-        else:
-            raise ValueError("Guidance Type can only be [json_schema, regex, or lark_grammar]")
 
+    # Get and set search options for generator params
     params = og.GeneratorParams(model)
+    search_options = get_search_options(args)
     params.set_search_options(**search_options)
-    if guidance_type:
-        params.set_guidance(guidance_type, guidance_input)
+    if args.verbose:
+        print(f"GeneratorParams created: {search_options}")
+
+    # Create system message
+    message = [{"role": "system", "content": args.system_prompt}]
+
+    # Get and set guidance info if requested
+    if args.response_format != "":
+        print("Make sure your tool call start id and tool call end id are marked as special in tokenizer.json")
+        guidance_type, guidance_data, tools = get_guidance(
+            response_format=args.response_format,
+            filepath=args.tools_file,
+            text_output=args.text_output,
+            tool_output=args.tool_output,
+            tool_call_start=args.tool_call_start,
+            tool_call_end=args.tool_call_end,
+        )
+        message[0]["tools"] = tools
+
+        params.set_guidance(guidance_type, guidance_data)
         if args.verbose:
-            print("Guidance type is set to:", guidance_type)
-            print("Guidance input is:", guidance_input)
+            print()
+            print(f"Guidance type is: {guidance_type}")
+            print(f"Guidance data is: \n{guidance_data}")
+            print()
 
+    # Create generator
     generator = og.Generator(model, params)
     if args.verbose:
         print("Generator created")
-    if guidance_type == "json_schema" or guidance_type == "lark_grammar":
-        messages = f"""[{{"role": "system", "content": "{system_prompt}", "tools": "{prompt_tool_input}"}}]"""
-    else:
-        messages = f"""[{{"role": "system", "content": "{system_prompt}"}}]"""
 
-    # Apply Chat Template
-    template_str = ""
-    tokenizer_input_system_prompt = None
-    jinja_path = os.path.join(args.model_path, "chat_template.jinja")
-    if os.path.exists(jinja_path):
-        with open(jinja_path, encoding="utf-8") as f:
-            template_str = f.read()
-            tokenizer_input_system_prompt = tokenizer.apply_chat_template(
-                messages=messages, add_generation_prompt=False, template_str=template_str
-            )
-    else:
-        tokenizer_input_system_prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=False)
+    # Apply chat template
+    try:
+        system_prompt = apply_chat_template(model_path=args.model_path, tokenizer=tokenizer, messages=json.dumps(message), tools=tools, add_generation_prompt=False)
+    except:
+        system_prompt = args.system_prompt
+    if args.verbose:
+        print(f"System prompt: {system_prompt}")
+
+    # Encode system prompt and append tokens to model
+    system_tokens = tokenizer.encode(system_prompt)
+    system_prompt_length = len(system_tokens)
+    generator.append_tokens(system_tokens)
 
-    input_tokens = tokenizer.encode(tokenizer_input_system_prompt)
-    # Ignoring the last end of text token as it is messes up the generation when grammar is enabled
-    if guidance_type:
-        input_tokens = input_tokens[:-1]
-    system_prompt_length = len(input_tokens)
-    generator.append_tokens(input_tokens)
+    if args.timings:
+        started_timestamp = 0
+        first_token_timestamp = 0
 
     # Keep asking for input prompts in a loop
     while True:
@@ -213,21 +94,23 @@ def main(args):
         if args.timings:
             started_timestamp = time.time()
 
-        messages = f"""[{{"role": "user", "content": "{text}"}}]"""
+        # Create user message
+        message = [{"role": "user", "content": text}]
+        
+        # Apply chat template
+        try:
+            user_prompt = apply_chat_template(model_path=args.model_path, tokenizer=tokenizer, messages=json.dumps(message), add_generation_prompt=True)
+        except:
+            user_prompt = text
+        if args.verbose:
+            print(f"User prompt: {user_prompt}")
 
-        # Apply Chat Template
-        user_prompt = ""
-        if os.path.exists(jinja_path):
-            user_prompt = tokenizer.apply_chat_template(
-                messages=messages, add_generation_prompt=True, template_str=template_str
-            )
-        else:
-            user_prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True)
-        input_tokens = tokenizer.encode(user_prompt)
-        generator.append_tokens(input_tokens)
+        # Encode user prompt and append tokens to model
+        user_tokens = tokenizer.encode(user_prompt)
+        generator.append_tokens(user_tokens)
 
         if args.verbose:
-            print("Running generation loop ...")
+            print("Running generation loop...")
         if args.timings:
             first = True
             new_tokens = []
@@ -235,6 +118,7 @@ def main(args):
         print()
         print("Output: ", end="", flush=True)
 
+        # Run generation loop
         try:
             while not generator.is_done():
                 generator.generate_next_token()
@@ -244,9 +128,8 @@ def main(args):
                         first = False
 
                 new_token = generator.get_next_tokens()[0]
-                print(tokenizer_stream.decode(new_token), end="", flush=True)
-                if args.timings:
-                    new_tokens.append(new_token)
+                print(stream.decode(new_token), end='', flush=True)
+                if args.timings: new_tokens.append(new_token)
         except KeyboardInterrupt:
             print("  --control+c pressed, aborting generation--")
         print()
@@ -259,95 +142,25 @@ def main(args):
                 f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
             )
 
-        # Rewind the generator to the system prompt, this will erase all the memory of the model.
+        # Rewind the generator to the system prompt. This will erase all the chat history with the model.
         if args.rewind:
             generator.rewind_to(system_prompt_length)
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai"
-    )
-    parser.add_argument(
-        "-m",
-        "--model_path",
-        type=str,
-        required=True,
-        help="Onnx model folder path (must contain genai_config.json and model.onnx)",
-    )
-    parser.add_argument(
-        "-e",
-        "--execution_provider",
-        type=str,
-        required=False,
-        default="follow_config",
-        choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"],
-        help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.",
-    )
-    parser.add_argument(
-        "-epl",
-        "--ep_library_path",
-        type=str,
-        required=False,
-        default=None,
-        help="Path to the execution provider library DLL/SO for plug-in providers. "
-        "Use this to load CUDA or NvTensorRT as plug-in providers instead of built-in. "
-        "Example: -epl 'C:\\path\\to\\onnxruntime_providers_cuda.dll' or -epl '/usr/lib/libonnxruntime_providers_cuda.so'",
-    )
-    parser.add_argument("-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt")
-    parser.add_argument("-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt")
-    parser.add_argument(
-        "-ds",
-        "--do_sample",
-        action="store_true",
-        help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false",
-    )
-    parser.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with")
-    parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from")
-    parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with")
-    parser.add_argument("-re", "--repetition_penalty", type=float, help="Repetition penalty to sample with")
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-        default=False,
-        help="Print verbose output and timing information. Defaults to false",
-    )
-    parser.add_argument(
-        "-g",
-        "--timings",
-        action="store_true",
-        default=False,
-        help="Print timing information for each generation step. Defaults to false",
-    )
-    parser.add_argument(
-        "-gtype",
-        "--guidance_type",
-        type=str,
-        default="none",
-        choices=["none", "json_schema", "regex", "lark_grammar"],
-        help="Provide guidance type for the model, options are json_schema, regex, or lark_grammar.",
-    )
-    parser.add_argument(
-        "-ginfo",
-        "--guidance_info",
-        type=str,
-        default="",
-        help="Provide information of the guidance type used, it could be either tools or regex string. It is required if guidance_type is provided",
-    )
-    parser.add_argument(
-        "-s",
-        "--system_prompt",
-        type=str,
-        default="You are a helpful AI assistant.",
-        help="System prompt to use for the prompt.",
-    )
-    parser.add_argument(
-        "-r",
-        "--rewind",
-        action="store_true",
-        default=False,
-        help="Rewind to the system prompt after each generation. Defaults to false",
-    )
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI chat example for ORT GenAI")
+    parser.add_argument('-m', '--model_path', type=str, required=True, help='ONNX model folder path (must contain genai_config.json and model.onnx)')
+    parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.")
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Dump input and output tensors with debug mode. Defaults to false')
+    parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false')
+    parser.add_argument('-sp', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the model.')
+    parser.add_argument('-rw', '--rewind', action='store_true', default=False, help='Rewind to the system prompt after each generation. Defaults to false')
+    parser.add_argument("--ep_path", type=str, required=False, default='', help='Path to execution provider DLL/SO for plug-in providers (ex: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll)')
+    parser.add_argument("--use_winml", action=argparse.BooleanOptionalAction, required=False, default=False, help='Use WinML to register execution providers')
+    # The generation and guidance flags are presumably registered on the parser by these two shared helpers.
+    get_generator_params_args(parser)
+    get_guidance_args(parser)
+    # argument_default=SUPPRESS (set above): an option without an explicit default is absent from args unless passed.
     args = parser.parse_args()
     main(args)
diff --git a/examples/python/model-generate.py b/examples/python/model-generate.py
index 86be56922b..39d813b064 100644
--- a/examples/python/model-generate.py
+++ b/examples/python/model-generate.py
@@ -1,11 +1,19 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
 import argparse
 import json
 import time
 
 import onnxruntime_genai as og
+from common import get_config, get_generator_params_args, get_search_options, register_ep, set_logger
 
 
 def main(args):
+    if args.debug:
+        set_logger()
+    register_ep(args.execution_provider, args.ep_path, args.use_winml)
+
     if args.verbose:
         print("Loading model...")
 
@@ -22,39 +30,13 @@ def main(args):
             text = input("Input: ")
             prompts = [text]
 
-    batch_size = len(prompts)
-
-    config = og.Config(args.model_path)
-
-    # Configure search options
-    search_config = {"batch_size": batch_size, "num_beams": args.num_beams}
-
-    # Configure execution provider if specified
-    if args.execution_provider != "follow_config":
-        config.clear_providers()
-        if args.execution_provider != "cpu":
-            if args.verbose:
-                print(f"Setting model to {args.execution_provider}...")
-            config.append_provider(args.execution_provider)
-
-    # Disable CUDA graph if using beam search (num_beams > 1),
-    # num_beams > 1 requires past_present_share_buffer to be false so enable_cuda_graph must be false
-    if args.num_beams > 1:
-        config.set_provider_option(args.execution_provider, "enable_cuda_graph", "0")
-        if args.verbose:
-            print("Set enable_cuda_graph to '0' via set_provider_option()")
-
-    # Add chunk_size only for NvTensorRtRtx execution provider
-    if args.execution_provider == "NvTensorRtRtx" and args.chunk_size > 0:
-        search_config["chunk_size"] = args.chunk_size
-
-    # Apply search configuration overlay
-    config.overlay(json.dumps({"search": search_config}))
+    search_config = {"batch_size": len(prompts), "chunk_size": args.chunk_size, "num_beams": args.num_beams}
+    config = get_config(args.model_path, args.execution_provider, ep_options={}, search_options=search_config)
 
     model = og.Model(config)
-
     if args.verbose:
         print("Model loaded")
+
     tokenizer = og.Tokenizer(model)
     if args.verbose:
         print("Tokenizer created")
@@ -72,21 +54,10 @@ def main(args):
         print(f"Prompt(s) encoded: {prompts}")
 
     params = og.GeneratorParams(model)
-
-    search_options = {
-        name: getattr(args, name)
-        for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"]
-        if name in args
-    }
-
-    if args.verbose:
-        print(f"Args: {args}")
-    if args.verbose:
-        print(f"Search options: {search_options}")
-
+    search_options = get_search_options(args)
     params.set_search_options(**search_options)
     if args.verbose:
-        print("GeneratorParams created")
+        print(f"GeneratorParams created: {search_options}")
 
     generator = og.Generator(model, params)
     if args.verbose:
@@ -97,7 +68,7 @@ def main(args):
         print("Input tokens added")
 
     if args.verbose:
-        print("Generating tokens ...\n")
+        print("Running generation loop...\n")
     start_time = time.time()
     while not generator.is_done():
         generator.generate_next_token()
@@ -116,77 +87,18 @@ def main(args):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for gen-ai"
-    )
-    parser.add_argument(
-        "-m",
-        "--model_path",
-        type=str,
-        required=True,
-        help="Onnx model folder path (must contain genai_config.json and model.onnx)",
-    )
-    parser.add_argument(
-        "-e",
-        "--execution_provider",
-        type=str,
-        required=False,
-        default="follow_config",
-        choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"],
-        help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.",
-    )
-    parser.add_argument(
-        "-pr",
-        "--prompts",
-        nargs="*",
-        required=False,
-        help="Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts",
-    )
-    parser.add_argument(
-        "-i", "--min_length", type=int, default=25, help="Min number of tokens to generate including the prompt"
-    )
-    parser.add_argument(
-        "-l", "--max_length", type=int, default=50, help="Max number of tokens to generate including the prompt"
-    )
-    parser.add_argument(
-        "-ds",
-        "--do_sample",
-        action="store_true",
-        help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false",
-    )
-    parser.add_argument("--top_p", type=float, help="Top p probability to sample with")
-    parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from")
-    parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with")
-    parser.add_argument("-r", "--repetition_penalty", type=float, help="Repetition penalty to sample with")
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-        default=False,
-        help="Print verbose output and timing information. Defaults to false",
-    )
-    parser.add_argument("-b", "--batch_size_for_cuda_graph", type=int, default=1, help="Max batch size for CUDA graph")
-    parser.add_argument(
-        "-c",
-        "--chat_template",
-        type=str,
-        default="",
-        help="Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.",
-    )
-    parser.add_argument(
-        "--chunk_size",
-        type=int,
-        default=0,
-        help="Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)",
-    )
-    parser.add_argument("-n", "--num_beams", type=int, default=3, help="Number of beams for beam search (default: 3)")
-    parser.add_argument(
-        "--non-interactive",
-        action=argparse.BooleanOptionalAction,
-        required=False,
-        default=False,
-        help="Non-interactive mode, mainly for CI usage",
-    )
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for ORT GenAI")
+    parser.add_argument("-m", "--model_path", type=str, required=True, help="ONNX model folder path (must contain genai_config.json and model.onnx)")
+    parser.add_argument("-e", "--execution_provider", type=str, required=False, default="follow_config", choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.")
+    parser.add_argument("-v", "--verbose", action="store_true", default=False, help="Print verbose output and timing information. Defaults to false")
+    parser.add_argument("-d", "--debug", action="store_true", default=False, help="Dump input and output tensors with debug mode. Defaults to false")
+    parser.add_argument("-pr", "--prompts", nargs="*", required=False, help="Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts")
+    parser.add_argument("-ct", "--chat_template", type=str, default="", help="Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.")
+    parser.add_argument("--non_interactive", "--non-interactive", action=argparse.BooleanOptionalAction, required=False, default=False, help="Non-interactive mode, mainly for CI usage")
+    parser.add_argument("--ep_path", type=str, required=False, default="", help="Path to execution provider DLL/SO for plug-in providers (ex: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll)")
+    parser.add_argument("--use_winml", action=argparse.BooleanOptionalAction, required=False, default=False, help="Use WinML to register execution providers")
+    # Search flags read by main() (e.g. args.num_beams, args.chunk_size) are presumably registered by this helper from common.
+    get_generator_params_args(parser)
 
     args = parser.parse_args()
     main(args)
diff --git a/examples/python/model-mm.py b/examples/python/model-mm.py
new file mode 100644
index 0000000000..eb0b0db496
--- /dev/null
+++ b/examples/python/model-mm.py
@@ -0,0 +1,220 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import argparse
+import json
+import time
+
+import onnxruntime_genai as og
+from common import (
+    apply_chat_template,
+    get_config,
+    get_generator_params_args,
+    get_guidance,
+    get_guidance_args,
+    get_search_options,
+    get_user_audios,
+    get_user_content,
+    get_user_images,
+    get_user_prompt,
+    register_ep,
+    set_logger,
+)
+
+
+def main(args):
+    """Run the interactive multimodal question/answer loop.
+
+    Every turn builds a new generator from the system message plus the current
+    user message; the user message is popped again afterwards, so earlier turns
+    are not carried over into later prompts.
+
+    Args:
+        args: Parsed command-line namespace created in the __main__ section of
+            this file.
+    """
+    if args.debug:
+        set_logger()
+    register_ep(args.execution_provider, args.ep_path, args.use_winml)
+
+    if args.verbose:
+        print("Loading model...")
+
+    # Create model
+    config = get_config(args.model_path, args.execution_provider)
+    model = og.Model(config)
+    if args.verbose:
+        print("Model loaded")
+
+    # Create tokenizer
+    tokenizer = og.Tokenizer(model)
+    stream = tokenizer.create_stream()
+    if args.verbose:
+        print("Tokenizer created")
+
+    # Create processor
+    processor = model.create_multimodal_processor()
+    if args.verbose:
+        print("Processor created")
+
+    # Get search options for generator params
+    search_options = get_search_options(args)
+
+    # Create running list of messages
+    input_list = [
+        {"role": "system", "content": args.system_prompt},
+    ]
+
+    # Get guidance info if requested
+    guidance_type, guidance_data, tools = "", "", ""
+    if args.response_format != "":
+        print("Make sure your tool call start id and tool call end id are marked as special in tokenizer.json")
+        guidance_type, guidance_data, tools = get_guidance(
+            response_format=args.response_format,
+            filepath=args.tools_file,
+            text_output=args.text_output,
+            tool_output=args.tool_output,
+            tool_call_start=args.tool_call_start,
+            tool_call_end=args.tool_call_end,
+        )
+        input_list[0]["tools"] = tools
+
+    # Keep track of timings if requested
+    if args.timings:
+        started_timestamp = 0
+        first_token_timestamp = 0
+
+    # Keep asking for input prompts in a loop
+    while True:
+        # Get images
+        images, num_images = get_user_images(args.image_paths, args.non_interactive)
+
+        # Get audios
+        audios, num_audios = get_user_audios(args.audio_paths, args.non_interactive)
+
+        # Get user prompt
+        text = get_user_prompt(args.user_prompt, args.non_interactive)
+        if text == "quit()":
+            break
+
+        # Construct user content based on inputs
+        user_content = get_user_content(model.type, num_images, num_audios, text)
+
+        # Add user message to list of messages
+        input_list.append({"role": "user", "content": user_content})
+        messages = json.dumps(input_list)
+
+        if args.timings:
+            started_timestamp = time.time()
+
+        # Initialize generator params
+        params = og.GeneratorParams(model)
+        params.set_search_options(**search_options)
+        if args.verbose:
+            print(f"GeneratorParams created: {search_options}")
+
+        # Initialize guidance info
+        if args.response_format != "":
+            params.set_guidance(guidance_type, guidance_data)
+            if args.verbose:
+                print()
+                print(f"Guidance type is: {guidance_type}")
+                print(f"Guidance data is: \n{guidance_data}")
+                print()
+
+        # Create generator
+        generator = og.Generator(model, params)
+        if args.verbose:
+            print("Generator created")
+
+        # Apply chat template
+        try:
+            prompt = apply_chat_template(model_path=args.model_path, tokenizer=tokenizer, messages=messages, tools=tools, add_generation_prompt=True)
+        except Exception as e:
+            # Best-effort fallback: use the raw user text when templating fails.
+            # `except Exception` (not a bare `except`) lets Ctrl+C and SystemExit propagate.
+            if args.verbose:
+                print(f"Failed to apply chat template ({e}); using the raw user prompt instead")
+            prompt = text
+        if args.verbose:
+            print(f"Prompt: {prompt}")
+
+        # Encode combined system + user prompt and append inputs to model
+        inputs = processor(prompt, images=images, audios=audios)
+        generator.set_inputs(inputs)
+
+        if args.verbose:
+            print("Running generation loop...")
+        if args.timings:
+            first = True
+            new_tokens = []
+
+        print()
+        print("Output: ", end="", flush=True)
+
+        # Run generation loop
+        try:
+            while not generator.is_done():
+                generator.generate_next_token()
+                if args.timings:
+                    if first:
+                        first_token_timestamp = time.time()
+                        first = False
+
+                new_token = generator.get_next_tokens()[0]
+                print(stream.decode(new_token), end="", flush=True)
+                if args.timings:
+                    new_tokens.append(new_token)
+        except KeyboardInterrupt:
+            print("  --control+c pressed, aborting generation--")
+        print()
+        print()
+
+        # Delete the generator to free the captured graph for the next generator (if graph capture is enabled)
+        del generator
+
+        # Remove user message from list of messages
+        input_list.pop()
+
+        if args.timings:
+            if first:
+                # No token was produced this turn, so first_token_timestamp is stale and the rates would be meaningless
+                print("No tokens were generated; skipping timing statistics.")
+            else:
+                prompt_time = first_token_timestamp - started_timestamp
+                run_time = time.time() - first_token_timestamp
+                # Token count of the templated text prompt, computed after the timestamps so it does not skew them.
+                # NOTE(review): image/audio inputs are presumably not reflected in this count - confirm.
+                prompt_length = len(tokenizer.encode(prompt))
+                print(
+                    f"Prompt length: {prompt_length}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {prompt_length / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
+                )
+
+        # If non-interactive is requested, it will just run the model for the user prompt and exit
+        if args.non_interactive:
+            break
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI question/answer example for ORT GenAI")
+    parser.add_argument('-m', '--model_path', type=str, required=True, help='ONNX model folder path (must contain genai_config.json and model.onnx)')
+    parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.")
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Dump input and output tensors with debug mode. Defaults to false')
+    parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false')
+    parser.add_argument('-sp', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the model.')
+    parser.add_argument('-up', '--user_prompt', type=str, default='What color is the sky?', help='User prompt to use for the model.')
+    # type=str: argparse applies `type` to each value, so type=list would split every path into single characters
+    parser.add_argument("--image_paths", nargs="*", type=str, required=False, default=[], help="Paths to the images, mainly for CI usage")
+    parser.add_argument("--audio_paths", nargs="*", type=str, required=False, default=[], help="Paths to the audios, mainly for CI usage")
+    parser.add_argument("--non_interactive", action=argparse.BooleanOptionalAction, required=False, default=False, help="Non-interactive mode, mainly for CI usage")
+    parser.add_argument("--ep_path", type=str, required=False, default='', help='Path to execution provider DLL/SO for plug-in providers (ex: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll)')
+    parser.add_argument("--use_winml", action=argparse.BooleanOptionalAction, required=False, default=False, help='Use WinML to register execution providers')
+
+    get_generator_params_args(parser)
+    get_guidance_args(parser)
+
+    args = parser.parse_args()
+    # max_length may be absent from args (argument_default=SUPPRESS); fall back to 7680 in that case
+    args.max_length = args.max_length if hasattr(args, "max_length") else 7680
+    main(args)
diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py
index f4d93517a6..7004edd661 100644
--- a/examples/python/model-qa.py
+++ b/examples/python/model-qa.py
@@ -6,219 +6,114 @@
 import time
 
 import onnxruntime_genai as og
-
-
-def get_tools_list(input_tools):
-    # input_tools format: '[{"name": "fn1", "description": "fn details", "parameters": {"p1": {"description": "details", "type": "string"}}},
-    # {"fn2": 2},{"fn3": 3}]'
-    tools_list = []
-    try:
-        tools_list = json.loads(input_tools)
-    except json.JSONDecodeError:
-        raise ValueError('Invalid JSON format for tools list, expected format: \'[{"name": "fn1"},{"name": "fn2"}]\'')
-    if len(tools_list) == 0:
-        raise ValueError("Tools list cannot be empty")
-    return tools_list
-
-
-def create_prompt_tool_input(tools_list):
-    tool_input = str(tools_list[0])
-    for tool in tools_list[1:]:
-        tool_input += "," + str(tool)
-    return tool_input
-
-
-def get_json_grammar(input_tools):
-    tools_list = get_tools_list(input_tools)
-    prompt_tool_input = create_prompt_tool_input(tools_list)
-    if len(tools_list) == 1:
-        return prompt_tool_input, json.dumps(tools_list[0])
-    else:
-        output = '{ "anyOf": [' + json.dumps(tools_list[0])
-        for tool in tools_list[1:]:
-            output += "," + json.dumps(tool)
-        output += "] }"
-        return prompt_tool_input, output
-
-
-def get_lark_grammar(input_tools):
-    tools_list = get_tools_list(input_tools)
-    prompt_tool_input = create_prompt_tool_input(tools_list)
-    if len(tools_list) == 1:
-        # output = ("start: TEXT | fun_call\n" "TEXT: /[^{](.|\\n)*/\n" " fun_call: <|tool_call|> %json " + json.dumps(tools_list[0]))
-        output = "start: TEXT | fun_call\nTEXT: /[^{](.|\\n)*/\n fun_call: <|tool_call|> %json " + json.dumps(
-            convert_tool_to_grammar_input(tools_list[0])
-        )
-        return prompt_tool_input, output
-    else:
-        return (
-            prompt_tool_input,
-            'start: TEXT | fun_call \n TEXT: /[^{](.|\n)*/ \n fun_call: <|tool_call|> %json {"anyOf": ['
-            + ",".join([json.dumps(tool) for tool in tools_list])
-            + "]}",
-        )
-
-
-def convert_tool_to_grammar_input(tool):
-    param_props = {}
-    required_params = []
-    for param_name, param_info in tool.get("parameters", {}).items():
-        param_props[param_name] = {
-            "type": param_info.get("type", "string"),
-            "description": param_info.get("description", ""),
-        }
-        required_params.append(param_name)
-    output_schema = {
-        "description": tool.get("description", ""),
-        "type": "object",
-        "required": ["name", "parameters"],
-        "additionalProperties": False,
-        "properties": {
-            "name": {"const": tool["name"]},
-            "parameters": {
-                "type": "object",
-                "properties": param_props,
-                "required": required_params,
-                "additionalProperties": False,
-            },
-        },
-    }
-    if len(param_props) == 0:
-        output_schema["required"] = ["name"]
-    return output_schema
+from common import (
+    apply_chat_template,
+    get_config,
+    get_generator_params_args,
+    get_guidance,
+    get_guidance_args,
+    get_user_prompt,
+    get_search_options,
+    register_ep,
+    set_logger,
+)
 
 
 def main(args):
+    if args.debug:
+        set_logger()
+    register_ep(args.execution_provider, args.ep_path, args.use_winml)
+
     if args.verbose:
         print("Loading model...")
-    if args.timings:
-        started_timestamp = 0
-        first_token_timestamp = 0
-
-    # Register execution provider library if specified (for plug-in providers)
-    if args.ep_library_path:
-        if args.verbose:
-            print(f"Registering execution provider library: {args.ep_library_path}")
-
-        # Determine the provider registration name based on execution provider
-        provider_registration_name = None
-        if args.execution_provider == "cuda":
-            provider_registration_name = "CUDAExecutionProvider"
-        elif args.execution_provider == "NvTensorRtRtx":
-            provider_registration_name = "NvTensorRTRTXExecutionProvider"
-        else:
-            raise ValueError(
-                f"Provider library registration not supported for '{args.execution_provider}'. Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries."
-            )
-
-        og.register_execution_provider_library(provider_registration_name, args.ep_library_path)
-        if args.verbose:
-            print(f"Successfully registered {provider_registration_name} from {args.ep_library_path}")
 
-    config = og.Config(args.model_path)
-    if args.execution_provider != "follow_config":
-        config.clear_providers()
-        if args.execution_provider != "cpu":
-            if args.verbose:
-                print(f"Setting model to {args.execution_provider}")
-            config.append_provider(args.execution_provider)
+    # Create model
+    config = get_config(args.model_path, args.execution_provider)
     model = og.Model(config)
-
     if args.verbose:
         print("Model loaded")
 
+    # Create tokenizer
     tokenizer = og.Tokenizer(model)
-    tokenizer_stream = tokenizer.create_stream()
+    stream = tokenizer.create_stream()
     if args.verbose:
         print("Tokenizer created")
-    if args.verbose:
-        print()
-
-    search_options = {
-        name: getattr(args, name)
-        for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"]
-        if name in args
-    }
-    search_options["batch_size"] = 1
 
-    if args.verbose:
-        print(search_options)
+    # Get search options for generator params
+    search_options = get_search_options(args)
+
+    # Create running list of messages
+    input_list = [
+        {"role": "system", "content": args.system_prompt},
+    ]
+
+    # Get guidance info if requested
+    guidance_type, guidance_data, tools = "", "", ""
+    if args.response_format != "":
+        print("Make sure your tool call start id and tool call end id are marked as special in tokenizer.json")
+        guidance_type, guidance_data, tools = get_guidance(
+            response_format=args.response_format,
+            filepath=args.tools_file,
+            text_output=args.text_output,
+            tool_output=args.tool_output,
+            tool_call_start=args.tool_call_start,
+            tool_call_end=args.tool_call_end,
+        )
+        input_list[0]["tools"] = tools
 
-    system_prompt = args.system_prompt
-    guidance_type = ""
-    prompt_tool_input = ""
-    guidance_input = ""
-    if args.guidance_type != "none":
-        guidance_type = args.guidance_type
-        if not args.guidance_info:
-            raise ValueError("Guidance information is required if guidance type is provided")
-        if guidance_type == "json_schema" or guidance_type == "lark_grammar":
-            tools_list = args.guidance_info
-            if guidance_type == "json_schema":
-                prompt_tool_input, guidance_input = get_json_grammar(tools_list)
-            elif guidance_type == "lark_grammar":
-                prompt_tool_input, guidance_input = get_lark_grammar(tools_list)
-        elif guidance_type == "regex":
-            guidance_input = args.guidance_info
-        else:
-            raise ValueError("Guidance Type can only be [json_schema, regex, or lark_grammar]")
+    # Keep track of timings if requested
+    if args.timings:
+        started_timestamp = 0
+        first_token_timestamp = 0
 
     # Keep asking for input prompts in a loop
     while True:
-        if args.input_prompt:
-            text = args.input_prompt
-        else:
-            text = input("Prompt (Use quit() to exit): ")
-        if not text:
-            print("Error, input cannot be empty")
-            continue
-
+        # Get user prompt
+        text = get_user_prompt(args.user_prompt, args.non_interactive)
         if text == "quit()":
             break
 
+        # Add user message to list of messages
+        input_list.append({"role": "user", "content": text})
+        messages = json.dumps(input_list)
+
         if args.timings:
             started_timestamp = time.time()
 
+        # Initialize generator params
         params = og.GeneratorParams(model)
         params.set_search_options(**search_options)
+        if args.verbose:
+            print(f"GeneratorParams created: {search_options}")
 
-        if guidance_type:
-            params.set_guidance(guidance_type, guidance_input)
+        # Initialize guidance info
+        if args.response_format != "":
+            params.set_guidance(guidance_type, guidance_data)
             if args.verbose:
-                print("Guidance type is set to:", guidance_type)
-                print("Guidance input is:", guidance_input)
+                print()
+                print(f"Guidance type is: {guidance_type}")
+                print(f"Guidance data is: \n{guidance_data}")
+                print()
 
+        # Create generator
         generator = og.Generator(model, params)
         if args.verbose:
             print("Generator created")
 
-        # Create messages with proper JSON encoding
-        # Gemma2 models don't support system role, so we prepend system prompt to user message
-        if model.type == "gemma2":
-            combined_message = f"{system_prompt}\n\n{text}" if system_prompt else text
-            messages_list = [{"role": "user", "content": combined_message}]
-        elif guidance_type == "json_schema" or guidance_type == "lark_grammar":
-            messages_list = [
-                {"role": "system", "content": system_prompt, "tools": prompt_tool_input},
-                {"role": "user", "content": text},
-            ]
-        else:
-            messages_list = [{"role": "system", "content": system_prompt}, {"role": "user", "content": text}]
-
-        # Convert to JSON string for tokenizer
-        messages = json.dumps(messages_list)
-
-        # Apply Chat Template
-        if model.type == "marian-ssru":
+        # Apply chat template
+        try:
+            prompt = apply_chat_template(model_path=args.model_path, tokenizer=tokenizer, messages=messages, tools=tools, add_generation_prompt=True)
+        except:
             prompt = text
-        else:
-            prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True)
+        if args.verbose:
+            print(f"Prompt: {prompt}")
 
+        # Encode combined system + user prompt and append tokens to model
         input_tokens = tokenizer.encode(prompt)
         generator.append_tokens(input_tokens)
 
         if args.verbose:
-            print("Running generation loop ...")
+            print("Running generation loop...")
         if args.timings:
             first = True
             new_tokens = []
@@ -226,6 +121,7 @@ def main(args):
         print()
         print("Output: ", end="", flush=True)
 
+        # Run generation loop
         try:
             while not generator.is_done():
                 generator.generate_next_token()
@@ -235,7 +131,7 @@ def main(args):
                         first = False
 
                 new_token = generator.get_next_tokens()[0]
-                print(tokenizer_stream.decode(new_token), end="", flush=True)
+                print(stream.decode(new_token), end="", flush=True)
                 if args.timings:
                     new_tokens.append(new_token)
         except KeyboardInterrupt:
@@ -243,105 +139,39 @@ def main(args):
         print()
         print()
 
-        # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
-
+        # Delete the generator to free the captured graph for the next generator (if graph capture is enabled)
         del generator
 
+        # Remove user message from list of messages
+        input_list.pop()
+
         if args.timings:
             prompt_time = first_token_timestamp - started_timestamp
             run_time = time.time() - first_token_timestamp
             print(
                 f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
             )
-        # If Input prompt is provided it will just run the model for the input prompt and exit
-        if args.input_prompt:
+
+        # If non-interactive is requested, it will just run the model for the user prompt and exit
+        if args.non_interactive:
             break
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai"
-    )
-    parser.add_argument(
-        "-m",
-        "--model_path",
-        type=str,
-        required=True,
-        help="Onnx model folder path (must contain genai_config.json and model.onnx)",
-    )
-    parser.add_argument(
-        "-e",
-        "--execution_provider",
-        type=str,
-        required=False,
-        default="follow_config",
-        choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"],
-        help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.",
-    )
-    parser.add_argument(
-        "-epl",
-        "--ep_library_path",
-        type=str,
-        required=False,
-        default=None,
-        help="Path to the execution provider library DLL for plug-in providers. "
-        "Use this to load CUDA or NvTensorRT as plug-in providers instead of built-in. "
-        "Example: -epl 'C:\\path\\to\\onnxruntime_providers_cuda.dll'",
-    )
-    parser.add_argument("-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt")
-    parser.add_argument("-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt")
-    parser.add_argument(
-        "-ds",
-        "--do_sample",
-        action="store_true",
-        help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false",
-    )
-    parser.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with")
-    parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from")
-    parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with")
-    parser.add_argument("-re", "--repetition_penalty", type=float, help="Repetition penalty to sample with")
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-        default=False,
-        help="Print verbose output and timing information. Defaults to false",
-    )
-    parser.add_argument(
-        "-g",
-        "--timings",
-        action="store_true",
-        default=False,
-        help="Print timing information for each generation step. Defaults to false",
-    )
-    parser.add_argument(
-        "-gtype",
-        "--guidance_type",
-        type=str,
-        default="none",
-        choices=["none", "json_schema", "regex", "lark_grammar"],
-        help="Provide guidance type for the model, options are json_schema, regex, or lark_grammar.",
-    )
-    parser.add_argument(
-        "-ginfo",
-        "--guidance_info",
-        type=str,
-        default="",
-        help="Provide information of the guidance type used, it could be either tools or regex string. It is required if guidance_type is provided",
-    )
-    parser.add_argument(
-        "-s",
-        "--system_prompt",
-        type=str,
-        default="You are a helpful AI assistant.",
-        help="System prompt to use for the prompt.",
-    )
-    parser.add_argument(
-        "-inp",
-        "--input_prompt",
-        type=str,
-        default="",
-        help="Input Prompt, if provided it will just run the prompt and exit",
-    )
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI question/answer example for ORT GenAI")
+    parser.add_argument('-m', '--model_path', type=str, required=True, help='ONNX model folder path (must contain genai_config.json and model.onnx)')
+    parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.")
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Dump input and output tensors with debug mode. Defaults to false')
+    parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false')
+    parser.add_argument('-sp', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the model.')
+    parser.add_argument('-up', '--user_prompt', type=str, default='What color is the sky?', help='User prompt to use for the model.')
+    parser.add_argument("--non_interactive", action=argparse.BooleanOptionalAction, required=False, default=False, help="Non-interactive mode, mainly for CI usage")
+    parser.add_argument("--ep_path", type=str, required=False, default='', help='Path to execution provider DLL/SO for plug-in providers (ex: onnxruntime_providers_cuda.dll or onnxruntime_providers_tensorrt.dll)')
+    parser.add_argument("--use_winml", action=argparse.BooleanOptionalAction, required=False, default=False, help='Use WinML to register execution providers')
+
+    get_generator_params_args(parser)
+    get_guidance_args(parser)
+
     args = parser.parse_args()
     main(args)
diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py
deleted file mode 100644
index 46ac6d79b0..0000000000
--- a/examples/python/model-vision.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License
-
-import argparse
-import glob
-import json
-import os
-import readline
-import time
-from pathlib import Path
-
-import onnxruntime_genai as og
-
-# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True)
-
-# Tool-calling system prompt for Qwen/Fara models
-FARA_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
-
-The functions at your disposal are:
-<tools>
-{"type": "function", "function": {"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer based on screenshots.\\n- This is an interface to a web browser. You do not have access to a terminal or applications menu, only the browser.\\n- Some pages, etc. may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click a home page icon and a window doesn't change, try wait and taking another screenshot.\\n- Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\\n- If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\\n- Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\\n- When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().\\nScreen resolution: 1428x896", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\\n* `key`: Press keyboard keys, like \\"Enter\\", \\"Alt\\", \\"Shift\\", \\"Tab\\", \\"Control\\", \\"Backspace\\", \\"Delete\\", \\"Escape\\", etc. 
Keys are pressed down in the order given, then released in reverse order.\\n* `type`: Type a string of text on the keyboard.\\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `left_click`: Click the left mouse button.\\n* `scroll`: Performs a scroll of the mouse scroll wheel.\\n* `visit_url`: Visit a specified URL.\\n* `web_search`: Perform a web search with a specified query.\\n* `history_back`: Go back to the previous page in the browser history.\\n* `pause_and_memorize_fact`: Pause and memorize a fact for future reference.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "scroll", "visit_url", "web_search", "history_back", "pause_and_memorize_fact", "wait", "terminate"], "type": "string"}, "keys": {"description": "Keyboard keys to be pressed in order. Required only by `action=key`.", "type": "array"}, "text": {"description": "Text to type. Required only by `action=type`.", "type": "string"}, "press_enter": {"description": "Whether to press the 'Enter' key after typing. Required only by `action=type`.", "type": "boolean"}, "delete_existing_text": {"description": "Whether to delete existing text before typing. Required only by `action=type`.", "type": "boolean"}, "coordinate": {"description": "[x, y]: The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=left_click`, `action=mouse_move`, and `action=type`.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}, "url": {"description": "The URL to visit. Required only by `action=visit_url`.", "type": "string"}, "query": {"description": "The query to search for. 
Required only by `action=web_search`.", "type": "string"}, "fact": {"description": "The fact to remember for the future. Required only by `action=pause_and_memorize_fact`.", "type": "string"}, "time": {"description": "Number of seconds to wait. Required only by `action=wait`.", "type": "number"}, "status": {"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}}}
-</tools>
-
-To make a function call, you should output a json object inside <tool_call></tool_call> XML tags. The json object must contain the function name and its arguments, like this:
-<tool_call>
-{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}
-</tool_call>
-"""
-
-
-def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
-    curr_path = Path(current_dir).absolute()
-    target_dir = glob.glob(target_dir_name, root_dir=curr_path)
-    if target_dir:
-        return Path(curr_path / target_dir[0]).absolute()
-    else:
-        if curr_path.parent == curr_path:
-            # Root dir
-            return None
-        return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name)
-
-
-def _complete(text, state):
-    return [*glob.glob(text + "*"), None][state]
-
-
-def run(args: argparse.Namespace):
-    if args.use_winml:
-        try:
-            import winml
-
-            print(winml.register_execution_providers(ort=False, ort_genai=True))
-        except ImportError:
-            print("WinML not available, using default execution providers")
-        except Exception as e:
-            print(f"Failed to register WinML execution providers: {e}")
-
-    print("Loading model...")
-
-    # Register execution provider library if specified (for plug-in providers)
-    if args.ep_library_path:
-        print(f"Registering execution provider library: {args.ep_library_path}")
-
-        # Determine the provider registration name based on execution provider
-        provider_registration_name = None
-        if args.execution_provider == "cuda":
-            provider_registration_name = "CUDAExecutionProvider"
-        elif args.execution_provider == "NvTensorRtRtx":
-            provider_registration_name = "NvTensorRTRTXExecutionProvider"
-        else:
-            raise ValueError(
-                f"Provider library registration not supported for '{args.execution_provider}'. Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries."
-            )
-
-        og.register_execution_provider_library(provider_registration_name, args.ep_library_path)
-        print(f"Successfully registered {provider_registration_name} from {args.ep_library_path}")
-
-    config = og.Config(args.model_path)
-    if args.execution_provider != "follow_config":
-        config.clear_providers()
-        if args.execution_provider != "cpu":
-            print(f"Setting model to {args.execution_provider}...")
-            config.append_provider(args.execution_provider)
-    model = og.Model(config)
-    print("Model loaded")
-
-    tokenizer = og.Tokenizer(model)
-    processor = model.create_multimodal_processor()
-    stream = processor.create_stream()
-
-    interactive = not args.non_interactive
-
-    while True:
-        if interactive:
-            try:
-                readline.set_completer_delims(" \t\n;")
-                readline.parse_and_bind("tab: complete")
-                readline.set_completer(_complete)
-            except ImportError:
-                # Not available on some platforms. Ignore it.
-                pass
-            image_paths = [
-                image_path.strip()
-                for image_path in input("Image Path (comma separated; leave empty if no image): ").split(",")
-            ]
-        else:
-            if args.image_paths:
-                image_paths = args.image_paths
-            else:
-                image_paths = [
-                    str(
-                        _find_dir_contains_sub_dir(Path(__file__).parent, "test")
-                        / "test_models"
-                        / "images"
-                        / "australia.jpg"
-                    )
-                ]
-
-        image_paths = [image_path for image_path in image_paths if image_path]
-
-        images = None
-        if len(image_paths) == 0:
-            print("No image provided")
-        else:
-            for _, image_path in enumerate(image_paths):
-                if not os.path.exists(image_path):
-                    raise FileNotFoundError(f"Image file not found: {image_path}")
-                print(f"Using image: {image_path}")
-
-            images = og.Images.open(*image_paths)
-
-        if interactive:
-            text = input("Prompt: ")
-        else:
-            if args.prompt:
-                text = args.prompt
-            else:
-                text = "What is shown in this image?"
-
-        # Construct the "messages" argument passed to apply_chat_template
-        messages = []
-        if model.type == "phi3v":
-            # Combine all image tags and text into one user message
-            content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text
-            messages.append({"role": "user", "content": content})
-        elif model.type in ["qwen2_5_vl", "fara"]:
-            messages.append({"role": "system", "content": FARA_SYSTEM_PROMPT})
-            content = "".join(["<|vision_start|><|image_pad|><|vision_end|>" for _ in image_paths]) + text
-            messages.append({"role": "user", "content": content})
-        else:
-            # Gemma3-style multimodal: structured content
-            content_list = [{"type": "image"} for _ in image_paths]
-            content_list.append({"type": "text", "text": text})
-            messages.append({"role": "user", "content": content_list})
-
-        # Apply the chat template using the tokenizer
-        message_json = json.dumps(messages)
-        prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True)
-
-        print("Processing images and prompt...")
-        inputs = processor(prompt, images=images)
-
-        print("Generating response...")
-        params = og.GeneratorParams(model)
-        max_length = args.max_length if args.max_length else 7680
-        params.set_search_options(max_length=max_length)
-
-        generator = og.Generator(model, params)
-        generator.set_inputs(inputs)
-        start_time = time.time()
-
-        while not generator.is_done():
-            generator.generate_next_token()
-            new_token = generator.get_next_tokens()[0]
-            print(stream.decode(new_token), end="", flush=True)
-
-        print()
-        total_run_time = time.time() - start_time
-        print(f"Total Time : {total_run_time:.2f}")
-
-        for _ in range(3):
-            print()
-
-        # Delete the generator to free the captured graph before creating another one
-        del generator
-
-        if not interactive:
-            break
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-m", "--model_path", type=str, required=True, help="Path to the folder containing the model")
-    parser.add_argument(
-        "-e",
-        "--execution_provider",
-        type=str,
-        required=False,
-        default="follow_config",
-        choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"],
-        help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.",
-    )
-    parser.add_argument(
-        "-epl",
-        "--ep_library_path",
-        type=str,
-        required=False,
-        default=None,
-        help="Path to the execution provider library DLL/SO for plug-in providers. "
-        "Use this to load CUDA or NvTensorRT as plug-in providers instead of built-in. "
-        "Example: -epl 'C:\\path\\to\\onnxruntime_providers_cuda.dll' or -epl '/usr/lib/libonnxruntime_providers_cuda.so'",
-    )
-    parser.add_argument(
-        "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage"
-    )
-    parser.add_argument(
-        "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage"
-    )
-    parser.add_argument(
-        "--max_length",
-        type=int,
-        required=False,
-        default=None,
-        help="Maximum generation length. Defaults to model's context_length from config.",
-    )
-    parser.add_argument(
-        "--non-interactive",
-        action=argparse.BooleanOptionalAction,
-        required=False,
-        help="Non-interactive mode, mainly for CI usage",
-    )
-    parser.add_argument(
-        "--use-winml",
-        action="store_true",
-        required=False,
-        help="Register WinML execution providers before loading the model",
-    )
-    args = parser.parse_args()
-    run(args)
diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md
index 32b09375df..1bbadcc081 100644
--- a/examples/python/phi-3-tutorial.md
+++ b/examples/python/phi-3-tutorial.md
@@ -66,12 +66,11 @@ Are you on a Windows machine with GPU?
 
 3. Run the model
 
-   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable version of ONNX Runtime GenAI then you need to download from the release branch (example for v0.5.2 is given below).
+   Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). If you are using a stable version of ONNX Runtime GenAI, download the script from the matching release branch instead (the example below downloads it from `main`).
 
    ```bash
-   curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
-   # For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
-   python phi3-qa.py -m directml\directml-int4-awq-block-128 -e dml
+   curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py
+   python model-qa.py -m directml\directml-int4-awq-block-128 -e dml
    ```
 
    Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example:
@@ -106,12 +105,11 @@ Are you on a Windows machine with GPU?
 
 3. Run the model
 
-   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable version of ONNX Runtime GenAI then you need to download from the release branch (example for v0.5.2 is given below).
+   Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). If you are using a stable version of ONNX Runtime GenAI, download the script from the matching release branch instead (the example below downloads it from `main`).
 
    ```bash
-   curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
-   # For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
-   python phi3-qa.py -m cuda/cuda-int4-rtn-block-32 -e cuda
+   curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py
+   python model-qa.py -m cuda/cuda-int4-rtn-block-32 -e cuda
    ```
 
       Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example:
@@ -140,12 +138,11 @@ Are you on a Windows machine with GPU?
 
 3. Run the model
 
-   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable version of ONNX Runtime GenAI then you need to download from the release branch (example for v0.5.2 is given below).
+   Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). If you are using a stable version of ONNX Runtime GenAI, download the script from the matching release branch instead (the example below downloads it from `main`).
 
    ```bash
-   curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
-   # For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
-   python phi3-qa.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu
+   curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py
+   python model-qa.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu
    ```
 
    Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example:
diff --git a/examples/python/phi-3-vision.md b/examples/python/phi-3-vision.md
index 51a06bb263..164e61c48a 100644
--- a/examples/python/phi-3-vision.md
+++ b/examples/python/phi-3-vision.md
@@ -107,15 +107,15 @@ Currently, both JSON files needed to run with ONNX Runtime GenAI are created by
 
 ## 4. Run Phi-3 vision ONNX models
 
-[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3v.py) is an example of how you can run your Phi-3 vision model with ONNX Runtime GenAI.
+[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-mm.py) is an example of how you can run your Phi-3 vision model with ONNX Runtime GenAI.
 
 ### CUDA
 ```bash
-$ python .\phi3v.py -m .\phi3-vision-128k-instruct\cuda -e cuda
+$ python model-mm.py -m ./phi3-vision-128k-instruct/cuda -e cuda
 ```
 
 ### DirectML
 
 ```bash
-$ python .\phi3v.py -m .\phi3-vision-128k-instruct\dml -e dml
+$ python model-mm.py -m ./phi3-vision-128k-instruct/dml -e dml
 ```
diff --git a/examples/python/phi-3.5-vision.md b/examples/python/phi-3.5-vision.md
index 093440db2b..9526a79421 100644
--- a/examples/python/phi-3.5-vision.md
+++ b/examples/python/phi-3.5-vision.md
@@ -104,15 +104,15 @@ Currently, both JSON files needed to run with ONNX Runtime GenAI are created by
 
 ## 4. Run Phi-3.5 vision ONNX models
 
-[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3v.py) is an example of how you can run your Phi-3.5 vision model with ONNX Runtime GenAI.
+[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-mm.py) is an example of how you can run your Phi-3.5 vision model with ONNX Runtime GenAI.
 
 ### CUDA
 ```bash
-$ python .\phi3v.py -m .\phi3.5-vision-instruct\cuda -e cuda
+$ python model-mm.py -m ./phi3.5-vision-instruct/cuda -e cuda
 ```
 
 ### DirectML
 
 ```bash
-$ python .\phi3v.py -m .\phi3.5-vision-instruct\dml -e dml
+$ python model-mm.py -m ./phi3.5-vision-instruct/dml -e dml
 ```
diff --git a/examples/python/phi-4-multi-modal.md b/examples/python/phi-4-multi-modal.md
index 184d54d742..e6d5442a61 100644
--- a/examples/python/phi-4-multi-modal.md
+++ b/examples/python/phi-4-multi-modal.md
@@ -179,20 +179,20 @@ Currently, the JSON files needed to run with ONNX Runtime GenAI are created by h
 
 ## 4. Run Phi-4 Multimodal ONNX models
 
-[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi4-mm.py) is an example of how you can run your Phi-4 multimodal model with ONNX Runtime GenAI.
+[Here](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-mm.py) is an example of how you can run your Phi-4 multimodal model with ONNX Runtime GenAI.
 
 ### CPU
 ```bash
-$ python3 phi4-mm.py -m ./phi4-mm/cpu -e cpu
+$ python3 model-mm.py -m ./phi4-mm/cpu -e cpu
 ```
 
 ### CUDA
 ```bash
-$ python3 phi4-mm.py -m ./phi4-mm/cuda -e cuda
+$ python3 model-mm.py -m ./phi4-mm/cuda -e cuda
 ```
 
 ### DirectML
 
 ```bash
-$ python phi4-mm.py -m ./phi4-mm/dml -e dml
+$ python model-mm.py -m ./phi4-mm/dml -e dml
 ```
diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py
deleted file mode 100644
index 645954fee3..0000000000
--- a/examples/python/phi3-qa.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import argparse
-import json
-import time
-
-import onnxruntime_genai as og
-
-
-def main(args):
-    if args.verbose:
-        print("Loading model...")
-    if args.timings:
-        started_timestamp = 0
-        first_token_timestamp = 0
-
-    config = og.Config(args.model_path)
-    if args.execution_provider != "follow_config":
-        config.clear_providers()
-        if args.execution_provider != "cpu":
-            if args.verbose:
-                print(f"Setting model to {args.execution_provider}")
-            config.append_provider(args.execution_provider)
-    model = og.Model(config)
-
-    if args.verbose:
-        print("Model loaded")
-
-    tokenizer = og.Tokenizer(model)
-    tokenizer_stream = tokenizer.create_stream()
-    if args.verbose:
-        print("Tokenizer created")
-    if args.verbose:
-        print()
-    search_options = {
-        name: getattr(args, name)
-        for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"]
-        if name in args
-    }
-
-    # Set the max length to something sensible by default, unless it is specified by the user,
-    # since otherwise it will be set to the entire context length
-    if "max_length" not in search_options:
-        search_options["max_length"] = 2048
-
-    # Keep asking for input prompts in a loop
-    while True:
-        text = input("Input: ")
-        if not text:
-            print("Error, input cannot be empty")
-            continue
-
-        if args.timings:
-            started_timestamp = time.time()
-
-        # If there is a chat template, use it
-        input_message = [{"role": "user", "content": text}]
-        input_prompt = tokenizer.apply_chat_template(json.dumps(input_message), add_generation_prompt=True)
-
-        input_tokens = tokenizer.encode(input_prompt)
-
-        params = og.GeneratorParams(model)
-        params.set_search_options(**search_options)
-        generator = og.Generator(model, params)
-
-        generator.append_tokens(input_tokens)
-        if args.verbose:
-            print("Generator created")
-
-        if args.verbose:
-            print("Running generation loop ...")
-        if args.timings:
-            first = True
-            new_tokens = []
-
-        print()
-        print("Output: ", end="", flush=True)
-
-        try:
-            while not generator.is_done():
-                generator.generate_next_token()
-                if args.timings:
-                    if first:
-                        first_token_timestamp = time.time()
-                        first = False
-
-                new_token = generator.get_next_tokens()[0]
-                print(tokenizer_stream.decode(new_token), end="", flush=True)
-                if args.timings:
-                    new_tokens.append(new_token)
-        except KeyboardInterrupt:
-            print("  --control+c pressed, aborting generation--")
-        print()
-        print()
-
-        if args.timings:
-            prompt_time = first_token_timestamp - started_timestamp
-            run_time = time.time() - first_token_timestamp
-            print(
-                f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
-            )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai"
-    )
-    parser.add_argument(
-        "-m",
-        "--model_path",
-        type=str,
-        required=True,
-        help="Onnx model folder path (must contain genai_config.json and model.onnx)",
-    )
-    parser.add_argument(
-        "-e",
-        "--execution_provider",
-        type=str,
-        required=False,
-        default="follow_config",
-        choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"],
-        help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.",
-    )
-    parser.add_argument("-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt")
-    parser.add_argument("-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt")
-    parser.add_argument(
-        "-ds",
-        "--do_sample",
-        action="store_true",
-        default=False,
-        help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false",
-    )
-    parser.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with")
-    parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from")
-    parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with")
-    parser.add_argument("-r", "--repetition_penalty", type=float, help="Repetition penalty to sample with")
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-        default=False,
-        help="Print verbose output and timing information. Defaults to false",
-    )
-    parser.add_argument(
-        "-g",
-        "--timings",
-        action="store_true",
-        default=False,
-        help="Print timing information for each generation step. Defaults to false",
-    )
-    args = parser.parse_args()
-    main(args)
diff --git a/examples/python/phi4-mm.py b/examples/python/phi4-mm.py
deleted file mode 100644
index e3ef84f282..0000000000
--- a/examples/python/phi4-mm.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License
-
-import argparse
-import glob
-import os
-import time
-from pathlib import Path
-
-import onnxruntime_genai as og
-
-# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True)
-
-
-def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
-    curr_path = Path(current_dir).absolute()
-    target_dir = glob.glob(target_dir_name, root_dir=curr_path)
-    if target_dir:
-        return Path(curr_path / target_dir[0]).absolute()
-    else:
-        if curr_path.parent == curr_path:
-            # Root dir
-            return None
-        return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name)
-
-
-def _complete(text, state):
-    return (glob.glob(text + "*") + [None])[state]
-
-
-def get_paths(modality, user_provided_paths, default_paths, interactive):
-    paths = None
-
-    if interactive:
-        try:
-            import readline
-
-            readline.set_completer_delims(" \t\n;")
-            readline.parse_and_bind("tab: complete")
-            readline.set_completer(_complete)
-        except ImportError:
-            # Not available on some platforms. Ignore it.
-            pass
-        paths = [
-            path.strip()
-            for path in input(f"{modality.capitalize()} Path (comma separated; leave empty if no {modality}): ").split(
-                ","
-            )
-        ]
-    else:
-        paths = user_provided_paths if user_provided_paths else default_paths
-
-    paths = [path for path in paths if path]
-    return paths
-
-
-def run(args: argparse.Namespace):
-    print("Loading model...")
-
-    # Register execution provider library if specified (for plug-in providers)
-    if args.ep_library_path:
-        print(f"Registering execution provider library: {args.ep_library_path}")
-
-        # Determine the provider registration name based on execution provider
-        provider_registration_name = None
-        if args.execution_provider == "cuda":
-            provider_registration_name = "CUDAExecutionProvider"
-        elif args.execution_provider == "NvTensorRtRtx":
-            provider_registration_name = "NvTensorRTRTXExecutionProvider"
-        else:
-            raise ValueError(
-                f"Provider library registration not supported for '{args.execution_provider}'. Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries."
-            )
-
-        og.register_execution_provider_library(provider_registration_name, args.ep_library_path)
-        print(f"Successfully registered {provider_registration_name} from {args.ep_library_path}")
-
-    config = og.Config(args.model_path)
-    if args.execution_provider != "follow_config":
-        config.clear_providers()
-        if args.execution_provider != "cpu":
-            print(f"Setting model to {args.execution_provider}...")
-            config.append_provider(args.execution_provider)
-    model = og.Model(config)
-    print("Model loaded")
-
-    processor = model.create_multimodal_processor()
-    tokenizer_stream = processor.create_stream()
-
-    interactive = not args.non_interactive
-
-    while True:
-        image_paths = get_paths(
-            modality="image",
-            user_provided_paths=args.image_paths,
-            default_paths=[
-                str(
-                    _find_dir_contains_sub_dir(Path(__file__).parent, "test")
-                    / "test_models"
-                    / "images"
-                    / "australia.jpg"
-                )
-            ],
-            interactive=interactive,
-        )
-        audio_paths = get_paths(
-            modality="audio",
-            user_provided_paths=args.audio_paths,
-            default_paths=[
-                str(
-                    _find_dir_contains_sub_dir(Path(__file__).parent, "test")
-                    / "test_models"
-                    / "audios"
-                    / "1272-141231-0002.mp3"
-                )
-            ],
-            interactive=interactive,
-        )
-
-        images = None
-        audios = None
-        prompt = "<|user|>\n"
-
-        # Get images
-        if len(image_paths) == 0:
-            print("No image provided")
-        else:
-            for i, image_path in enumerate(image_paths):
-                if not os.path.exists(image_path):
-                    raise FileNotFoundError(f"Image file not found: {image_path}")
-                print(f"Using image: {image_path}")
-                prompt += f"<|image_{i + 1}|>\n"
-            images = og.Images.open(*image_paths)
-
-        # Get audios
-        if len(audio_paths) == 0:
-            print("No audio provided")
-        else:
-            for i, audio_path in enumerate(audio_paths):
-                if not os.path.exists(audio_path):
-                    raise FileNotFoundError(f"Audio file not found: {audio_path}")
-                print(f"Using audio: {audio_path}")
-                prompt += f"<|audio_{i + 1}|>\n"
-            audios = og.Audios.open(*audio_paths)
-
-        if interactive:
-            text = input("Prompt: ")
-        else:
-            if args.prompt:
-                text = args.prompt
-            else:
-                text = "Does the audio summarize what is shown in the image? If not, what is different?"
-        prompt += f"{text}<|end|>\n<|assistant|>\n"
-
-        print("Processing inputs...")
-        inputs = processor(prompt, images=images, audios=audios)
-        print("Processor complete.")
-
-        print("Generating response...")
-        params = og.GeneratorParams(model)
-        params.set_search_options(max_length=7680)
-
-        generator = og.Generator(model, params)
-        generator.set_inputs(inputs)
-        start_time = time.time()
-
-        while not generator.is_done():
-            generator.generate_next_token()
-            new_token = generator.get_next_tokens()[0]
-            print(tokenizer_stream.decode(new_token), end="", flush=True)
-
-        print()
-        total_run_time = time.time() - start_time
-        print(f"Total Time : {total_run_time:.2f}")
-
-        for _ in range(3):
-            print()
-
-        # Delete the generator to free the captured graph before creating another one
-        del generator
-
-        if not interactive:
-            break
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-m", "--model_path", type=str, required=True, help="Path to the folder containing the model")
-    parser.add_argument(
-        "-e",
-        "--execution_provider",
-        type=str,
-        required=False,
-        default="follow_config",
-        choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"],
-        help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.",
-    )
-    parser.add_argument(
-        "-epl",
-        "--ep_library_path",
-        type=str,
-        required=False,
-        default=None,
-        help="Path to the execution provider library DLL/SO for plug-in providers. "
-        "Use this to load CUDA or NvTensorRT as plug-in providers instead of built-in. "
-        "Example: -epl 'C:\\path\\to\\onnxruntime_providers_cuda.dll' or -epl '/usr/lib/libonnxruntime_providers_cuda.so'",
-    )
-    parser.add_argument(
-        "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage"
-    )
-    parser.add_argument(
-        "--audio_paths", nargs="*", type=str, required=False, help="Path to the audios, mainly for CI usage"
-    )
-    parser.add_argument(
-        "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage"
-    )
-    parser.add_argument(
-        "--non-interactive",
-        action=argparse.BooleanOptionalAction,
-        required=False,
-        help="Non-interactive mode, mainly for CI usage",
-    )
-    args = parser.parse_args()
-    run(args)
diff --git a/examples/python/qa-e2e-example.sh b/examples/python/qa-e2e-example.sh
deleted file mode 100755
index 034e9ff414..0000000000
--- a/examples/python/qa-e2e-example.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-# Description: Example of chatbot end-to-end usage, including model building and running.
-python3 -m onnxruntime_genai.models.builder -m microsoft/phi-2 -o genai_models/phi2-int4-cpu -p int4 -e cpu -c hf_cache
-python3 model-qa.py -m genai_models/phi2-int4-cpu -e cpu -p 0.0 -k 1
diff --git a/examples/python/winml.py b/examples/python/winml.py
new file mode 100644
index 0000000000..e2451d3932
--- /dev/null
+++ b/examples/python/winml.py
@@ -0,0 +1,152 @@
+import sys
+from pathlib import Path
+import traceback
+
+# Process-wide WinML singleton; created on first use by WinML.__new__.
+_winml_instance = None
+
+
+class WinML:
+    """Singleton wrapper around the Windows ML execution provider catalog.
+
+    The first successful construction bootstraps the Windows App SDK, queries
+    the catalog for all execution providers and records the library path of
+    every provider that reports one. Later ``WinML()`` calls return that instance.
+    """
+
+    def __new__(cls, *args, **kwargs):
+        global _winml_instance
+        if _winml_instance is None:
+            # object.__new__ rejects extra arguments, so they are not forwarded.
+            _winml_instance = super().__new__(cls)
+            _winml_instance._initialized = False
+        return _winml_instance
+
+    def __init__(self):
+        # __init__ runs on every WinML() call; only the first success does the work.
+        if self._initialized:
+            return
+
+        # Set defaults first so __del__ and register_execution_providers stay
+        # usable even if the bootstrap below raises part-way through.
+        self._win_app_sdk_handle = None
+        self._providers = None
+        self._ep_paths: dict[str, str] = {}
+        self._registered_eps: dict[str, list[str]] = {"onnxruntime": [], "onnxruntime_genai": []}
+
+        self._fix_winrt_runtime()
+        from winui3.microsoft.windows.applicationmodel.dynamicdependency.bootstrap import (
+            InitializeOptions,
+            initialize,
+        )
+        import winui3.microsoft.windows.ai.machinelearning as winml
+
+        handle = initialize(options=InitializeOptions.ON_NO_MATCH_SHOW_UI)
+        handle.__enter__()
+        self._win_app_sdk_handle = handle
+        try:
+            catalog = winml.ExecutionProviderCatalog.get_default()
+            self._providers = catalog.find_all_providers()
+            for provider in self._providers:
+                provider.ensure_ready_async().get()
+                # Skip providers that report no library path.
+                if provider.library_path == '':
+                    continue
+                self._ep_paths[provider.name] = provider.library_path
+        except BaseException:
+            # Leave the bootstrap context so a later WinML() call can retry cleanly.
+            self._providers = None
+            self._win_app_sdk_handle = None
+            handle.__exit__(None, None, None)
+            raise
+
+        # Mark the singleton as ready only after everything above succeeded.
+        self._initialized = True
+
+    def __del__(self):
+        # Drop provider references before leaving the bootstrap context.
+        self._providers = None
+        # The handle is missing or None when __init__ did not complete.
+        handle = getattr(self, "_win_app_sdk_handle", None)
+        if handle is not None:
+            self._win_app_sdk_handle = None
+            handle.__exit__(None, None, None)
+
+    def _fix_winrt_runtime(self):
+        """Delete the msvcp140.dll bundled in the winrt-runtime package.
+
+        The copy is removed so that it does not cause issues with other libraries.
+        """
+        from importlib import metadata
+        site_packages_path = Path(str(metadata.distribution('winrt-runtime').locate_file('')))
+        dll_path = site_packages_path / 'winrt' / 'msvcp140.dll'
+        if dll_path.exists():
+            dll_path.unlink()
+
+    def register_execution_providers(self, ort=True, ort_genai=False) -> dict[str, list[str]]:
+        """Register every discovered provider library with the selected modules.
+
+        Args:
+            ort (bool): Whether to register with ``onnxruntime``.
+            ort_genai (bool): Whether to register with ``onnxruntime_genai``.
+
+        Returns:
+            dict[str, list[str]]: Provider names registered so far, keyed by module name.
+        """
+        modules = []
+        if ort:
+            import onnxruntime
+            modules.append(onnxruntime)
+        if ort_genai:
+            import onnxruntime_genai
+            modules.append(onnxruntime_genai)
+        for name, path in self._ep_paths.items():
+            for module in modules:
+                # Skip providers already registered with this module by an earlier call.
+                if name not in self._registered_eps[module.__name__]:
+                    try:
+                        module.register_execution_provider_library(name, path)
+                        self._registered_eps[module.__name__].append(name)
+                    except Exception as e:
+                        # Best effort: report the failure and continue with the rest.
+                        print(f"Failed to register execution provider {name}: {e}", file=sys.stderr)
+                        traceback.print_exc()
+        return self._registered_eps
+
+
+def register_execution_providers(ort=True, ort_genai=False) -> dict[str, list[str]]:
+    """Register WinML execution providers for ONNX Runtime and ONNX Runtime GenAI.
+
+    Args:
+        ort (bool): Whether to register for ONNX Runtime.
+        ort_genai (bool): Whether to register for ONNX Runtime GenAI.
+
+    Returns:
+        dict[str, list[str]]: Dictionary of registered execution provider names by module.
+    """
+    return WinML().register_execution_providers(ort=ort, ort_genai=ort_genai)
+
+
+def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):
+    """Ensures correct EP device selection for WinML. NEVER modify this function.
+    ep_name is one of:
+        - "CPUExecutionProvider"
+        - "DmlExecutionProvider"
+        - "WebGpuExecutionProvider"
+        - "QNNExecutionProvider"
+        - "OpenVINOExecutionProvider"
+        - "VitisAIExecutionProvider"
+        - "NvTensorRTRTXExecutionProvider"
+
+    device_type is one of:
+        - ort.OrtHardwareDeviceType.CPU
+        - ort.OrtHardwareDeviceType.GPU
+        - ort.OrtHardwareDeviceType.NPU
+    """
+    import onnxruntime as ort
+    ep_devices = ort.get_ep_devices()
+    for ep_device in ep_devices:
+        if ep_device.ep_name == ep_name and ep_device.device.type == device_type:
+            print(f"Adding {ep_name} for {device_type}")
+            session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)
+            break
diff --git a/examples/slm_engine/README.md b/examples/slm_engine/README.md
deleted file mode 100644
index 67f606baa2..0000000000
--- a/examples/slm_engine/README.md
+++ /dev/null
@@ -1,535 +0,0 @@
-# SLM Engine for running GenAI on the Edge Compute
-
-SLM Engine is a C++ based application that uses ONNX Runtime (ORT) genrate() API library and runs GenAI models on the edge computing resources. The SLM engine is developed to deploy generative AI models (sometimes fine-tuned for API function calling specific to certain Edge use cases). Any small language based generative AI model quantized to ONNX can run on SLM engine.
-
-The following diagram illustrates a high level architecture of the SLM Engine and its relationship with the ONNX Runtime libraries.
-
-<div align="center">
-    <img src="architecture.drawio.png">
-    <p>SLM Engine Architecture</p>
-</div>
-
-SLM Engine is designed to be built from sources for different types of target hardware to maximize the portability. Building from source allows maximum customization to ensure most efficient execution of workloads for specific target hardware.
-
-The current version is tested on Windows 11, MacOS, Linux and Android running on the CPU of various platforms. The SLM Engine also runs on various accelerators (such as GPU and NPU) via the execution provider mechanism of the ONNX runtime.
-
-## Installation
-
-Since this is targeted for various devices running on the Edge we provide a simple to use build setup that the developers can use to build for any system of their choosing.
-
-### Prerequisites
-
-The SLM Engine first builds the `onnxruntime-genai` library from source. For some targets you also need to build `onnxruntime` from source. Therefore, any prerequisites that apply to build from source of these two libraries also applicable to SLM Engine. There are no additional requirements for building SLM engine.
-
-In order to build the software you will need C++ toolchain such as clang/llvm and cmake. Since the build scripts use Python3, any Python 3 would work. However, to enable Qualcomm QNN support, you need to use a Linux host and Python 3.8. Note that Python module `requests` is required. Use `pip install requests` when setting up the Python for building this software.
-
-<details>
- <summary><b>Windows Long File Path</b></summary>
-
-For Windows, often the maximum path length is 260 which results in breaking the onnxruntime dependency build. Therefore, the long file path needs to be enabled in the group policy editor using the following steps:
-
-- Open the 'Run' command (Win+R) and type gpedit.msc, then press Enter.
-- Navigate to: Computer Configuration > Administrative Templates > System > Filesystem.
-- Double-click Enable Win32 long paths and set it to Enabled, then click Apply.
-
-Also, enable long filenames by opening a terminal window as **Administrator** and then running the following command:
-
-```
-c:\> git config --system core.longpaths true
-
-```
-
-</details>
-
-#### Build
-
-Building is as easy as following these steps:
-
-#### 1. Build the Dependencies
-
-```bash
-$ cd build_scripts
-$ python build_deps.py
-...
-```
-
-All the dependency artifacts are stored in `slm_deps/artifacts/<PLATFORM>`. For example, if you are building on MacOS then the built artifacts will be stored in `slm_deps/artifacts/MacOS-armh64/`. Similarly, if you are cross compiling for Android, then the artifacts will be stored in `slm_deps/artifacts/Android-aarch64/`.
-
-<details>
-
- <summary><b>Dependency Build Details</b></summary>
- <p>
-
-Depending on the build options specified, the `onnxruntime` is either built from pre-built binaries or source. The pre-built option is not available for Android or MacOS Sequoia. Next, `onnxruntime-genai` library is built from source. Finally, the script will clone and build few other `header only` dependencies.
-
-At the end of the build, all the built artifacts such as the header files and libraries are stored inside the deps/artifacts directory under a subdirectory that's named after the target platform.
-
-#### build_deps.py
-
-Following are the command line options applicable for the dependency build:
-
-```bash
-usage: build_deps.py [-h] [--android_sdk_path ANDROID_SDK_PATH] [--android_ndk_path ANDROID_NDK_PATH]
-                     [--api_level API_LEVEL] [--qnn_sdk_path QNN_SDK_PATH] [--build_type BUILD_TYPE] [--build_ort_from_source] [--ort_version_to_use ORT_VERSION_TO_USE]
-
-Build script for dependency libraries
-
-options:
-  -h, --help            show this help message and exit
-  --android_sdk_path ANDROID_SDK_PATH
-                        Path to ANDROID SDK
-  --android_ndk_path ANDROID_NDK_PATH
-                        Path to ANDROID NDK
-  --api_level API_LEVEL
-                        Android API Level
-  --qnn_sdk_path QNN_SDK_PATH
-                        Path to Qualcomm QNN SDK (AI Engine Direct)
-  --build_type BUILD_TYPE
-                        {Release|RelWithDebInfo|Debug}
-  --build_ort_from_source
-                        If set, ONNX Runtime is built from source
-
-  --ort_version_to_use ORT_VERSION_TO_USE
-                        ONNX Runtime version to use. Must be a git tag or branch
-
-```
-
-#### Notes
-
-1. Use the option `--ort_version_to_use` and provide a git commit hash or a branch name or a tag, if you want to use a specific version of the ONNX Runtime library. However, if you previously used a different ONNX Runtime version, then delete the `deps/src/onnxruntime` directory before running the `build_deps.py` again with this option provided.
-
-1. For Android builds, the `--build_ort_from_source` option must be set as for Android build, only build from source is supported.
-
-#### Android Build With QNN Support
-
-The following example illustrates how to cross compile the dependencies for Android CPU from Linux host.
-
-```bash
-$ export ANDROID_SDK_ROOT=<Android SDK Directory>
-$ export NDK_ROOT=$ANDROID_SDK_ROOT/ndk/<Version Number>
-$ export QNN_SDK_ROOT=<qualcomm/qairt/VERSION>
-$ python build_deps.py \
-    --android_sdk_path $ANDROID_SDK_ROOT \
-    --android_ndk_path $NDK_ROOT \
-    --build_ort_from_source \
-    --qnn_sdk_path $QNN_SDK_ROOT
-...
-
-```
-
-If you are building just for Android CPU, then omit the `qnn_sdk_path`.
-
-After the dependencies are built, it's time to build the SLM Engine as described in the next section.
-
-</details>
-
-#### 2. Build SLM Engine
-
-Next step is to build the program itself. For that use the script `build.py` with appropriate command line options as needed for Android build.
-
-```bash
-$ python build.py
-```
-
-<details>
-
- <summary><b>Build Details</b></summary>
- <p>
-
-For Android build, the following commandline options are important:
-
-```bash
-usage: build.py [-h] [--android_ndk_path ANDROID_NDK_PATH] [--build_type BUILD_TYPE]
-
-Build script for this repo
-
-options:
-  -h, --help            show this help message and exit
-  --android_ndk_path ANDROID_NDK_PATH
-                        Path to ANDROID NDK
-  --build_type BUILD_TYPE
-                        {Release|RelWithDebInfo|Debug}
-
-```
-
-For Android builds - use the following example:
-
-```bash
-$ python build.py --android_ndk_path $NDK_ROOT
-...
-```
-
-Notice that no need to specify any QNN flags as QNN device is handled by the ONNX Runtime via the [Execution Provider](https://onnxruntime.ai/docs/execution-providers/) mechanism.
-
-For building on a Linux host we also provide a Dockerfile and a shell script to build using docker. Use the following command:
-
-```bash
-$ ./build_linux.sh
-...
-```
-
-</details>
-
-### Target Specific onnxruntime Library Build Information
-
-Following table provides target specific build information for the `onnxruntime` library.
-
-| Build Host       | Target Platform  | Using prebuilt <br>onnxruntime | Output <br>Directory | Relevant <br>Build Options                                           |
-| ---------------- | ---------------- | ------------------------------ | -------------------- | -------------------------------------------------------------------- |
-| MacOS/arm64      | Android-aarch64  | ❌                             | Android-aarch64      | --android_sdk_path <br>--android_ndk_path<br>--build_ort_from_source |
-| MacOS/arm64      | MacOS-arm64      | ❌                             | Darwin-arm64         | --build_ort_from_source                                              |
-| Ubuntu 24.04/x86 | Android-aarch64  | ❌                             | Android-aarch64      | --android_sdk_path <br>--android_ndk_path<br>--build_ort_from_source |
-| Ubuntu 24.04/x86 | Ubuntu 24.04/x86 | ✅                             | Linux-x86_64         | --build_ort_from_source (optional)                                   |
-| Ubuntu 24.04/ARM | Ubuntu 24.04/ARM | ✅                             | Linux-aarch64        | --build_ort_from_source (optional)                                   |
-| Windows 11/AMD64 | Windows 11 AMD64 | ✅                             | Windows-AMD64        | --build_ort_from_source (optional)                                   |
-| Windows 11/ARM64 | Windows 11 ARM64 | ✅                             | Windows-ARM64        | --build_ort_from_source (optional)                                   |
-
-## Using SLM Engine
-
-To use the SLM Engine in your AI application, use one of the following methods:
-
-- Use the C++ library from your C++ application by including the `slm_engine.h` and creating an instance of the `SLMEngine` class.
-
-OR
-
-- Run the command line program `slm-server` which starts a web server and exposes an API endpoint that accepts OpenAI like chat completion API. Then make REST API calls pointing at this endpoint from your application.
-
-### SLM Server
-
-The command line options of the slm-server are the following:
-
-```shell
-$ ./slm-server  --help
-SLM Runner Version: 1.0.0
-ORT GenAI Version: 0.7.0-dev
-ORT Version: 1.20.1
-Usage: slm_server --model_path VAR [--port_number VAR] [--verbose]
-
-Optional arguments:
-  -m, --model_path     Path to the model file [required]
-  -p, --port_number    HTTP Port Number to use (default 8080)
-  -v, --verbose        If provided, more debugging information printed on standard output
-```
-
-### Example Launch Command
-
-```shell
-$ ./slm-server -m <path to the ONNX model> -v
-
-```
-
-Once the server is running, you can use a HTTP client to send user queries to the server and generate responses. Following is an example cURL command that talks to the SLM Engine vis the REST API:
-
-### Example cURL
-
-```bash
-curl -X POST http://localhost:8000/completions -H "Content-Type: application/json" --data '{"messages": [{"role": "system", "content": "You are a helpful AI Assistant. Please answer the questions very accurately. Use emojis and markdown as appropriate"},{"role": "user", "content": "What are the top 5 places to visit in San Diego? Be brief."}], "max_tokens": 1200, "temperature": 0.7}'
-```
-
-The SLM server supports the following REST APIs (click to expand):
-
-<details>
- <summary><code>GET</code> <code><b>/</b></code> <code>Returns SLM server status</code></summary>
-
-##### Parameters
-
-> None
-
-##### Responses
-
-> | http code | content-type       | response      |
-> | --------- | ------------------ | ------------- |
-> | `200`     | `application/json` | `JSON Object` |
-
-##### Example cURL
-
-```javascript
->  curl -X GET http://localhost:8000
-```
-
-##### JSON Schema for the Response for GET /
-
-```json
-{
-    "response":
-    {
-        "status": "success",
-        "engine_state":
-        {
-            "engine_version": <Version String>,
-            "model": <Model name>
-        }
-    }
-}
-```
-
-</details>
-
-<details>
- <summary><code>POST</code> <code><b>/complete</b></code> <code>Given the prompt, generates response from SLM</code></summary>
-
-##### Parameters
-
-> | name | type     | data type                                   | description |
-> | ---- | -------- | ------------------------------------------- | ----------- |
-> | None | required | object (JSON formatted using OpenAI schema) | N/A         |
-
-##### Responses
-
-> | http code | content-type       | response      |
-> | --------- | ------------------ | ------------- |
-> | `200`     | `application/json` | `JSON Object` |
-
-##### JSON Schema for the Response for /completions (success)
-
-```json
-{
-    "response":
-    {
-        "status": "success",
-        "choices": [
-            "index": 0,
-            "message": {
-                "role": "assistant",
-                "content": "<LLM Generated content>"
-            },
-        ]
-        "kpi": {
-            "generated_toks": <Value>,
-            "prompt_toks": <Value>,
-            "tok_rate": <value>,
-            "total_time": <Value>,
-            "ttft": <Value>
-        },
-        "question": <User's Question>,
-        "llm_input": "<Actual String input to LLM"
-    }
-}
-```
-
-##### JSON Schema for the Response for /completions (error)
-
-```json
-{
-  "response": {
-    "status": "error",
-    "message": "Error Message"
-  }
-}
-```
-
-</details>
-
-### Function Calling Support
-
-The SLM Engine enabling the model to intelligently select and invoke predefined functions based on user requests. This feature allows developers to extend the model's capabilities by providing custom tools and functions that the AI can use to perform specific tasks.
-
-#### Key Features:
-- **Tool Definition**: Define custom functions with parameters and descriptions
-- **Intelligent Function Selection**: The model automatically determines which function to call based on user input
-- **Structured Output**: Returns function calls in a standardized JSON format
-
-#### Example Function Calling Request
-
-The following example demonstrates how to use function calling with the SLM Engine for booking flights and hotels:
-
-```bash
-curl -X POST http://localhost:8000/completions -H "Content-Type: application/json" --data '{
-    "messages": [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant with these tools."
-        },
-        {
-            "role": "user",
-            "content": "book flight ticket from Beijing to Paris(using airport code) in 2025-12-04 to 2025-12-10 , then book hotel from 2025-12-04 to 2025-12-10 in Paris"
-        }
-    ],
-    "tools": [
-        {
-            "name": "booking_flight_tickets", 
-            "description": "booking flights", 
-            "parameters": {
-                "origin_airport_code": {
-                    "description": "The name of Departure airport code", 
-                    "type": "string"
-                }, 
-                "destination_airport_code": {
-                    "description": "The name of Destination airport code", 
-                    "type": "string"
-                }, 
-                "departure_date": {
-                    "description": "The date of outbound flight", 
-                    "type": "string"
-                }, 
-                "return_date": {
-                    "description": "The date of return flight", 
-                    "type": "string"
-                }
-            }
-        }, 
-        {
-            "name": "booking_hotels", 
-            "description": "booking hotel", 
-            "parameters": {
-                "destination": {
-                    "description": "The name of the city", 
-                    "type": "string"
-                }, 
-                "check_in_date": {
-                    "description": "The date of check in", 
-                    "type": "string"
-                }, 
-                "checkout_date": {
-                    "description": "The date of check out", 
-                    "type": "string"
-                }
-            }
-        }
-    ],
-    "temperature": 0.00001,
-    "max_tokens": 4096,
-    "top_p": 1.0,
-    "do_sample": false
-}'
-```
-
-The model will analyze the user's request and generate appropriate function calls with the correct parameters, enabling seamless integration with external APIs and services.
-
-***Note*** - This time we just support Phi and Llama,Qwen3 model
-
-### C++ Application using the SLMEngine
-
-The SLMEngine is designed to be used from another C++ application running on the Edge. Integrating the SLMEngine into another C++ project using cmake is illustrated below.
-
-First build the SLM Engine from source using the build instructions provided in this document. The build output are stored in the target specific `install/include` and `install/bin` directories.
-
-#### CMakeLists.txt
-
-```cmake
-cmake_minimum_required(VERSION 3.28)
-project(HelloSLM)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-add_library(slm_engine SHARED IMPORTED)
-set_target_properties(
-    slm_engine
-    PROPERTIES
-    IMPORTED_IMPLIB
-    <location-of-slm-artifacts>/bin/libslmengine.so
-)
-add_library(ort SHARED IMPORTED)
-set_target_properties(
-    ort
-    PROPERTIES
-    IMPORTED_IMPLIB
-    <location-of-slm-artifacts>/bin/libonnxruntime.so
-
-add_library(ort_genai SHARED IMPORTED)
-set_target_properties(
-    ort_genai
-    PROPERTIES
-    IMPORTED_IMPLIB
-    <location-of-slm-artifacts>/bin/libonnxruntime-genai.so
-
-include_directories(<location-of-slm-artifacts>/include)
-
-add_executable(hello_slm hello_slm.cpp)
-target_link_libraries(inference_server slm_engine ort ort_genai)
-
-```
-
-#### hello_slm.cpp
-
-```c++
-
-#include <string>
-#include <iostream>
-#include "slm_engine.h"
-
-int main(int argc, char **argv) {
-
-    auto slm_engine = microsoft::slm_engine::SLMEngine::Create(
-        "path to ONNX Model Directory", "phi", true);
-
-    if (!slm_engine) {
-        std::cout << "Cannot create engine!\n";
-        return;
-    }
-
-    microsoft::slm_engine::SLMEngine::GenerationOptions generator_options;
-    generator_options.MaxGeneratedTokens = 2400;
-    std::string response_str;
-    microsoft::slm_engine::SLMEngine::RuntimePerf kpi;
-
-    // Call the SLM engine
-    slm_engine->generate("What is 2 + 2?", generator_options, response_str, kpi);
-
-    std::cout << "Generated Response: " << response_str << std::endl;
-}
-
-```
-
-See the [slm_engine.h](src/cpp/slm_engine.h) for more details of the C++ API.
-
-See the following reference CLI applications to learn more about how to use an HTTP server [slm_server.cpp](src/cpp/slm_server.cpp) or a CLI program for batch generation processing [slm_runner.cpp](src/cpp/slm_runner.cpp) using this library.
-
-## Running SLM Engine
-
-After the build is complete, the binaries are available in the build_scripts/builds/<TARGET_NAME>/install/bin directory. To test the build, download the ONNX model first and then run following command to execute SLM engine with a sample input file.
-
-### Download ONNX Model
-
-1. Navigate to the Hugging Face and download an SLM such as [Microsoft Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx)
-1. Next run the `slm-runner` CLI program pointing at the CPU version of the quantized model.
-
-Following example show how to test this on a Windows 11:
-
-```Powershell
->  cd .\build_scripts\builds\Windows-ARM64\install\bin\
->  .\slm-runner.exe -mf phi -t ..\..\..\..\..\test\batch-input.jsonl -m ..\..\..\..\..\..\..\..\..\models\Phi-3-mini-4k-instruct-onnx\cpu_and_mobile\cpu-int4-rtn-block-32-acc-level-4\ -o output.jsonl -v
-
-```
-
-The SLM Runner used above is a CLI program for testing the SLM engine in a batch mode. The following section provides more details.
-
-#### SLM Runner
-
-The `slm-runner` CLI application works in a batch mode and thus useful for benchmarking and testing. In addition to the ONNX model, you will also need to prepare a JSONL file that contains the system and user prompts formatted like OpenAI API. Following is an example of a line of JSON fragment that contains the `system` and `user` messages:
-
-```shell
-$ ./slm-runner --help
-SLM Runner Version: 1.0.0
-ORT GenAI Version: 0.7.0-dev
-ORT Version: 1.20.1
-Unknown argument: --help
-Usage: slm_runner --model_path VAR --test_data_file VAR --output_file VAR [--verbose]
-
-Optional arguments:
-  -m, --model_path             Path to the model file [required]
-  -t, --test_data_file         Path to the test data file (JSONL) [required]
-  -o, --output_file            Path to the output file (JSONL) [required]
-  -w, --wait_between_requests  Wait time between requests in milliseconds
-  -v, --verbose                If provided, more debugging information printed on standard output
-```
-
-```JSON
-{"messages":
-    [
-        {
-            "role": "system",
-            "content": "You are an in car virtual assistant that maps user's inputs to the corresponding function call in the vehicle. You must respond with only a JSON object matching the following schema: {\"function_name\": <name of the function>, \"arguments\": <arguments of the function>}"
-        },
-        {
-            "role": "user", "content": "Can you please set the radio to 90.3"
-        }
-    ],
-    "max_tokens": 300,
-    "temperature": 0.0,
-    "stop": ["\n"]
-}
-
-```
diff --git a/examples/slm_engine/architecture.drawio.png b/examples/slm_engine/architecture.drawio.png
deleted file mode 100644
index 148738631f..0000000000
Binary files a/examples/slm_engine/architecture.drawio.png and /dev/null differ
diff --git a/examples/slm_engine/build_scripts/.gitignore b/examples/slm_engine/build_scripts/.gitignore
deleted file mode 100644
index ba1880171c..0000000000
--- a/examples/slm_engine/build_scripts/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-builds/
-__pycache__
-slm_deps/
diff --git a/examples/slm_engine/build_scripts/Dockerfile b/examples/slm_engine/build_scripts/Dockerfile
deleted file mode 100644
index f9bd248029..0000000000
--- a/examples/slm_engine/build_scripts/Dockerfile
+++ /dev/null
@@ -1,40 +0,0 @@
-# Use a base image that supports multiple architectures
-FROM ubuntu:24.04
-
-# Ensure that all interactions for apt are non-interactive
-ENV DEBIAN_FRONTEND=noninteractive
-
-# Install necessary packages for building gRPC and C++ applications
-RUN apt-get update && apt-get install -y \
-    git \
-    git-lfs \
-    cmake \
-    build-essential \
-    ninja-build \
-    automake \
-    autoconf \
-    libtool \
-    pkg-config \
-    unzip \
-    wget \
-    vim \
-    nano \
-    curl \
-    openjdk-11-jdk \
-    python3.12 \
-    python3.12-venv \
-    python3.12-dev \
-    python3-pip \
-    python3-requests \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install Android SDK
-ENV ANDROID_HOME=/opt/android-sdk
-
-RUN mkdir -p $ANDROID_HOME && \
-    wget -O cmd.zip https://dl.google.com/android/repository/commandlinetools-linux-9477386_latest.zip && \
-    unzip cmd.zip -d $ANDROID_HOME && rm cmd.zip && \
-    yes | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=$ANDROID_HOME --licenses && \
-    $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=$ANDROID_HOME 'platforms;android-27' && \
-    $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=$ANDROID_HOME 'build-tools;30.0.2' && \
-    $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=$ANDROID_HOME 'ndk;27.2.12479018' 
diff --git a/examples/slm_engine/build_scripts/build.py b/examples/slm_engine/build_scripts/build.py
deleted file mode 100755
index 844b1ce5ea..0000000000
--- a/examples/slm_engine/build_scripts/build.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import os
-import pathlib
-import platform
-import subprocess
-import sys
-
-from build_deps import get_machine_type
-
-BLUE = "\033[34m"
-RED = "\033[31m"
-CLEAR = "\033[0m"
-
-
-def cmake_options_android(ndk_dir):
-    if not os.path.exists(ndk_dir):
-        raise Exception(f"{RED}NDK Directory doesn't exist: {ndk_dir}{CLEAR}")
-    else:
-        cmake_option = [
-            f"-DCMAKE_TOOLCHAIN_FILE={ndk_dir}/build/cmake/android.toolchain.cmake",
-            "-DANDROID_PLATFORM=android-33",
-            "-DANDROID_ABI=arm64-v8a",
-        ]
-        return cmake_option
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Build script for this repo")
-
-    # Adding arguments
-    parser.add_argument("--android_ndk_path", type=str, help="Path to ANDROID NDK")
-    parser.add_argument(
-        "--build_type",
-        type=str,
-        default="Release",
-        help="{Release|RelWithDebInfo|Debug}",
-    )
-    parser.add_argument(
-        "--cmake_generator",
-        type=str,
-        help="{Unix Makefiles|Ninja|Visual Studio 17 2022|Xcode}",
-    )
-
-    # Parsing arguments
-    args = parser.parse_args()
-
-    # Determine the toplevel directory
-    path = pathlib.Path(__file__).parent.resolve()
-    TOPLEVEL_DIR = path.parent.absolute()
-
-    # We need to get the name of the toplevel/src directory
-    TOPLEVEL_DIR = f"{TOPLEVEL_DIR}/src"
-
-    # Set up the cmake generator
-    cmake_generator = args.cmake_generator
-    if cmake_generator is None:
-        if sys.platform.startswith("win"):
-            cmake_generator = "Visual Studio 17 2022"
-        elif sys.platform.startswith("linux"):
-            cmake_generator = "Ninja"
-        else:
-            cmake_generator = "Unix Makefiles"
-
-    print(f"Using CMake generator: {cmake_generator}")
-
-    artifacts_dir = os.path.abspath("slm_deps/artifacts/")
-    cmake_options = [
-        "cmake",
-        "-G",
-        cmake_generator,
-        TOPLEVEL_DIR,
-        f"-DARTIFACTS_DIR={artifacts_dir}",
-        f"-DCMAKE_BUILD_TYPE={args.build_type}",
-    ]
-
-    # We keep the build directory prefix as same as that's returned by the
-    # platform.system() call which maps 1:1 with the Linux uname -s command.
-    # When cross-compiling for Android, we use Android as the prefix.
-
-    dir_prefix = platform.system()
-    if args.android_ndk_path:
-        cmake_options.extend(cmake_options_android(args.android_ndk_path))
-        dir_prefix = "Android"
-        args.android = True
-    else:
-        args.android = False
-
-    build_dir = f"builds/{dir_prefix}-{get_machine_type(args)}"
-
-    # Launch build
-    print("BUILD Dir:", build_dir)
-    os.makedirs(build_dir, exist_ok=True)
-
-    print(f"{BLUE}CMAKE Options: {cmake_options}{CLEAR}")
-
-    print("Building ...")
-    os.chdir(build_dir)
-    result = subprocess.call(cmake_options)
-    if result != 0:
-        raise Exception(f"{RED}CMake error!{CLEAR}")
-
-    result = subprocess.call(
-        [
-            "cmake",
-            "--build",
-            ".",
-            "--parallel",
-            "--config",
-            args.build_type,
-        ]
-    )
-    if result != 0:
-        raise Exception(f"{RED}Build error!{CLEAR}")
-
-    # Now run the installation
-    print("Installing...")
-    result = subprocess.call(["cmake", "--install", "."])
-    if result != 0:
-        raise Exception(f"{RED}Installation error!{CLEAR}")
-
-    print(f"{BLUE}Build complete{CLEAR}")
-    print(f"{BLUE}Artifacts are available here: {artifacts_dir}{CLEAR}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/slm_engine/build_scripts/build_android.sh b/examples/slm_engine/build_scripts/build_android.sh
deleted file mode 100755
index 17567a84c7..0000000000
--- a/examples/slm_engine/build_scripts/build_android.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/sh
-
-# This script builds the slm_engine for Android using docker.
-# It uses the Dockerfile in the current directory to build a docker image
-# that contains all the necessary dependencies for building the slm_engine.
-# The script then runs the docker image to build the slm_engine.
-# The script assumes that the Dockerfile is in the same directory as this script.
-# The script also assumes that the android-sdk and android-ndk are installed
-# in the /opt/android-sdk directory.
-# 
-
-# Check the architecture
-if [ "$(uname -m)" != "x86_64" ]; then
-    echo "This script is intended to run on x86_64 architecture only."
-    exit 1
-fi
-
-set -e
-set -x
-set -u
-
-# Build the docker image 
-docker build -t slm-engine-builder -f Dockerfile .
-
-# Define base build_deps command
-BUILD_DEPS_CMD="python3 build_deps.py \
-    --build_ort_from_source \
-    --android_sdk_path /opt/android-sdk/ \
-    --android_ndk_path /opt/android-sdk/ndk/27.2.12479018/"
-
-# Docker volume mount options
-VOLUME_MOUNTS="-v `pwd`/../../../:`pwd`/../../../"
-
-# Check if USE_ORT_VERSION is defined
-if [ ! -z "${USE_ORT_VERSION:-}" ]; then
-    BUILD_DEPS_CMD="$BUILD_DEPS_CMD --ort_version_to_use $USE_ORT_VERSION"
-    echo "Using ONNX Runtime version: $USE_ORT_VERSION"
-fi
-
-# Check if QNN_SDK_HOME is defined
-if [ ! -z "${QNN_SDK_HOME:-}" ]; then
-    # Create Docker mount point for QNN SDK
-    QNN_SDK_DOCKER_PATH="/opt/qnn_sdk"
-    
-    # Add mount for QNN SDK
-    VOLUME_MOUNTS="$VOLUME_MOUNTS -v $QNN_SDK_HOME:$QNN_SDK_DOCKER_PATH"
-    
-    # Use the Docker path in build command
-    BUILD_DEPS_CMD="$BUILD_DEPS_CMD --qnn_sdk_path $QNN_SDK_DOCKER_PATH"
-    
-    echo "QNN SDK path detected, building with QNN support"
-    echo "Mounting $QNN_SDK_HOME to $QNN_SDK_DOCKER_PATH in container"
-fi
-
-# Run the docker to build dependencies
-docker run --rm $VOLUME_MOUNTS \
-    -u $(id -u):$(id -g) -w `pwd` \
-    slm-engine-builder $BUILD_DEPS_CMD
-
-# Next build the slm_engine
-docker run --rm -v \
-    `pwd`/../../../:`pwd`/../../../  \
-    -u $(id -u):$(id -g) -w `pwd` \
-    slm-engine-builder python3 build.py \
-    --android_ndk_path /opt/android-sdk/ndk/27.2.12479018/ \
diff --git a/examples/slm_engine/build_scripts/build_deps.py b/examples/slm_engine/build_scripts/build_deps.py
deleted file mode 100755
index 2c6286dbb4..0000000000
--- a/examples/slm_engine/build_scripts/build_deps.py
+++ /dev/null
@@ -1,614 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import glob
-import os
-import platform
-import shutil
-import subprocess
-import sys
-import time
-
-MAGENTA = "\033[35m"
-RED = "\033[31m"
-CLEAR = "\033[0m"
-
-
-def cmake_options_android(ndk_dir):
-    if not os.path.exists(ndk_dir):
-        raise Exception(f"NDK Directory doesn't exist: {ndk_dir}")
-    cmake_option = [
-        f"-DCMAKE_TOOLCHAIN_FILE={ndk_dir}/build/cmake/android.toolchain.cmake",
-        "-DANDROID_PLATFORM=android-33",
-        "-DANDROID_ABI=arm64-v8a",
-    ]
-    return cmake_option
-
-
-def get_platform_dirname(args):
-    # Get the name of the OS
-
-    platform_name = platform.system()
-    if platform_name == "Darwin":
-        platform_name = "MacOS"
-
-    if args.android:
-        platform_name = "Android"
-
-    return platform_name
-
-
-def get_target_specific_oga_build_dirname():
-    # Get the name of the OS
-    platform_name = platform.system()
-    if platform_name == "Darwin":
-        platform_name = "macOS"
-
-    return platform_name
-
-
-def get_machine_type(args):
-    machine_type = platform.machine()
-    if args.android:
-        machine_type = "aarch64"
-
-    return machine_type
-
-
-def copy_files_without_hidden(src, dest):
-    """
-    Recursively copies files from the source directory to the destination directory,
-    excluding hidden files and directories.
-
-    Args:
-      src: Path to the source directory.
-      dest: Path to the destination directory.
-    """
-    try:
-        os.makedirs(dest, exist_ok=True)  # Create destination directory if it doesn't exist
-
-        for root, dirs, files in os.walk(src):
-            for file in files:
-                if not file.startswith("."):  # Exclude hidden files
-                    src_file = os.path.join(root, file)
-                    dest_file = os.path.join(dest, os.path.relpath(src_file, src))
-                    os.makedirs(os.path.dirname(dest_file), exist_ok=True)  # Create necessary directories
-                    shutil.copy2(src_file, dest_file)
-
-    except OSError as e:
-        print(f"Error: {e}")
-        raise e
-
-
-def copy_files_keeping_symlinks(src_files, dest):
-    if not type(src_files) == list:
-        raise Exception("src_files must be a list")
-
-    for file in src_files:
-        # Preserve symlinks
-        if os.path.islink(file):
-            # Get the name of the link without the rest of the path
-            linkname = f"{dest}/{os.path.basename(file)}"
-            linkto = os.readlink(file)
-
-            if not os.path.exists(linkname):
-                os.symlink(linkto, linkname)
-        elif os.path.isdir(file):
-            shutil.copytree(file, f"{dest}/{os.path.basename(file)}", dirs_exist_ok=True)
-        else:
-            shutil.copy2(file, dest)
-
-
-def download_ort(ort_version, artifacts_dir):
-    """
-    Copy the artifacts from the prebuilt directory to the artifacts directory
-    """
-    # Determine the top level directory
-    build_scripts_dir = f"{os.path.dirname(os.path.realpath(__file__))}"
-    os.chdir(build_scripts_dir)
-
-    # Get the platform and machine type
-    platform_name = platform.system()
-    if platform_name == "Darwin":
-        platform_name = "osx"
-    elif platform_name == "Linux":
-        platform_name = "linux"
-    elif platform_name == "Windows":
-        platform_name = "win"
-
-    machine_type = platform.machine().lower()
-    if machine_type == "x86_64":
-        machine_type = "x64"
-    elif machine_type == "amd64":
-        machine_type = "x64"
-
-    # Prepare the URL
-    URL_PREFIX = "https://github.com/microsoft/onnxruntime/releases/download"
-    FILE_PREFIX = f"onnxruntime-{platform_name}-{machine_type}-{ort_version}"
-
-    FILE_NAME = f"{FILE_PREFIX}.tgz"
-    if platform_name == "win":
-        FILE_NAME = f"{FILE_PREFIX}.zip"
-
-    download_url = f"{URL_PREFIX}/v{ort_version}/{FILE_NAME}"
-    print(f"Downloading from: {download_url}")
-
-    # Download the file
-    if not os.path.exists(artifacts_dir):
-        os.makedirs(artifacts_dir, exist_ok=True)
-    os.chdir(artifacts_dir)
-
-    if not os.path.exists(FILE_NAME):
-        result = subprocess.call(["curl", "-L", download_url, "-o", FILE_NAME])
-        if result != 0:
-            raise Exception(f"Failed to download {download_url}")
-        # Extract the file
-        shutil.unpack_archive(FILE_NAME, ".")
-        os.remove(FILE_NAME)
-
-    # Now copy the files to the artifacts directory
-
-    # Copy the include files
-    src_dir = os.path.abspath(FILE_PREFIX)
-    os.makedirs(f"{artifacts_dir}/include", exist_ok=True)
-    copy_files_keeping_symlinks(
-        glob.glob(f"{src_dir}/include/*"),
-        f"{artifacts_dir}/include/",
-    )
-    # Copy the lib files
-    os.makedirs(f"{artifacts_dir}/lib", exist_ok=True)
-    copy_files_keeping_symlinks(
-        glob.glob(f"{src_dir}/lib/*"),
-        f"{artifacts_dir}/lib/",
-    )
-
-
-def build_ort(args, build_dir, artifacts_dir):
-    """
-    Build the ONNX Runtime library and ORT-GenAI library
-    """
-    start_time = time.time()
-
-    os.chdir(build_dir)
-
-    # Make the src directory if needed
-    os.makedirs("src", exist_ok=True)
-    os.chdir("src")
-
-    if not os.path.exists("onnxruntime"):
-        # Clone the ORT Repo
-        print("Cloning ONNX Runtime")
-        if subprocess.call(["git", "clone", "https://github.com/microsoft/onnxruntime.git"]) != 0:
-            raise Exception("Failed to clone ONNX Runtime")
-
-    # Now get the dependencies
-    os.chdir("onnxruntime")
-
-    # Checkout the correct version
-    version = args.ort_version_to_use
-    print(f"Checking out ONNX Runtime version: {version}")
-    if subprocess.call(["git", "fetch", "--tags", "origin"]) != 0:
-        raise Exception("Failed to fetch tags for ONNX Runtime")
-    if subprocess.call(["git", "checkout", version]) != 0:
-        raise Exception("Failed to checkout ONNX Runtime version")
-
-    # Return to the original directory
-    os.chdir("../..")
-
-    # Prepare the command arguments
-    cmd_args = [
-        "--build_shared_lib",
-        "--skip_tests",
-        "--parallel",
-        "--config",
-        args.build_type,
-        # "--use_guidance",
-    ]
-    if args.android:
-        cmd_args.extend(
-            [
-                "--android",
-                "--android_sdk_path",
-                args.android_sdk_path,
-                "--android_ndk_path",
-                args.android_ndk_path,
-                "--android_abi",
-                "arm64-v8a",
-                "--android_api",
-                args.api_level,
-            ]
-        )
-        if args.qnn_sdk_path:
-            cmd_args.extend(["--use_qnn", "static_lib", "--qnn_home", args.qnn_sdk_path])
-
-    cmd_args.extend(["--cmake_extra_defines", "onnxruntime_BUILD_UNIT_TESTS=OFF"])
-
-    # now build the ORT library
-    print(f"{MAGENTA}Building ONNX Runtime{CLEAR}")
-    os.chdir("src/onnxruntime")
-
-    build_script = "build.bat" if platform.system() == "Windows" else "./build.sh"
-    print(f"{MAGENTA}Running {build_script} with args: {cmd_args}{CLEAR}")
-    result = subprocess.call([build_script] + cmd_args)
-    if result != 0:
-        raise Exception("Failed to build ONNX Runtime")
-
-    # Now add the symbolic links
-    # First save the current directory
-    current_dir = os.getcwd()
-
-    # Get the absolute path tot he build directory
-    build_dir_name = f"build/{get_platform_dirname(args)}/{args.build_type}"
-    build_dir_name = os.path.abspath(build_dir_name)
-    ort_home = os.path.abspath(f"{build_dir_name}/install")
-    print(f"{MAGENTA}ORT Home: {ort_home}{CLEAR}")
-
-    os.chdir(build_dir_name)
-
-    # Run install
-    print(f"{MAGENTA}Running install{CLEAR}")
-    result = subprocess.call(
-        [
-            "cmake",
-            "--install",
-            ".",
-            "--prefix",
-            ort_home,
-        ]
-    )
-
-    if result != 0:
-        raise Exception("Failed to install ONNX Runtime")
-
-    # Now create the symbolic links only if Android Build
-    os.chdir(ort_home)
-    if args.android:
-        # Create the symbolic links only in doesn't exist
-        if not os.path.exists("headers"):
-            os.symlink("include/onnxruntime", "headers")
-
-        # Make the jni directory
-        os.makedirs("jni", exist_ok=True)
-        os.chdir("jni")
-        if not os.path.exists("arm64-v8a"):
-            os.symlink("../lib", "arm64-v8a")
-        os.chdir("..")
-    else:
-        # If we are on Windows - then we need to copy the .dll files to the
-        # lib directory as well
-        if platform.system() == "Windows":
-            copy_files_keeping_symlinks(
-                glob.glob("bin/*.dll"),
-                "lib",
-            )
-
-    # Copy the include/onnxruntime/* to include directory
-    copy_files_keeping_symlinks(
-        glob.glob("include/onnxruntime/*"),
-        "include",
-    )
-
-    print(f"{MAGENTA}Copying ORT artifacts to 3P Artifacts: \n{artifacts_dir}{CLEAR}")
-    os.makedirs(artifacts_dir, exist_ok=True)
-    copy_files_keeping_symlinks(glob.glob(f"{ort_home}/*"), artifacts_dir)
-
-    # Back to the original directory
-    os.chdir(current_dir)
-    time_build_end = time.time()
-    print(f"ORT Build Time: {time_build_end - start_time:.2f} seconds")
-
-    return ort_home
-
-
-def build_ort_genai(args, artifacts_dir, ort_home):
-    time_build_start = time.time()
-
-    # Navigate to the directory where this Python file is located
-    os.chdir(os.path.dirname(os.path.realpath(__file__)))
-
-    # Save the current directory
-    current_dir = os.getcwd()
-
-    # Go to the toplevel directory. To determine the top level directory, we need to
-    # find the directory of this python file and then go from there
-    top_level_dir = "../../../"
-    os.chdir(top_level_dir)
-
-    if subprocess.call(["git", "submodule", "update", "--init", "--recursive"]) != 0:
-        raise Exception("Failed to update submodules")
-
-    # Now build the ORT-GenAI library
-    print(f"{MAGENTA}Building ONNX Runtime-GenAI with Guidance Support for Function Calling{CLEAR}")
-    # Prepare the command arguments
-    cmd_args = [
-        "--skip_wheel",
-        "--skip_tests",
-        "--parallel",
-        "--config",
-        args.build_type,
-        "--cmake_extra_defines",
-        "ENABLE_PYTHON=OFF",
-        # "USE_GUIDANCE=ON",
-        # "--use_guidance",  # Enable guidance support for constrained JSON generation
-        # Note: If Python linking issues occur, comment out --use_guidance above
-        # Function calling will work in both guidance and fallback modes
-    ]
-    if ort_home is None:
-        raise Exception(f"{RED}ORT Home is None. Please build ORT from source first{CLEAR}")
-
-    print(f"{MAGENTA}ORT Home: {ort_home}{CLEAR}")
-    cmd_args.extend(["--ort_home", ort_home])
-
-    if args.android:
-        cmd_args.extend(
-            [
-                "--android",
-                "--android_home",
-                args.android_sdk_path,
-                "--android_ndk_path",
-                args.android_ndk_path,
-                "--android_abi",
-                "arm64-v8a",
-                "--android_api",
-                args.api_level,
-            ]
-        )
-
-    print(f"{MAGENTA}Running build.py with args: {cmd_args}{CLEAR}")
-    python_executable = sys.executable
-    result = subprocess.call([python_executable, "build.py"] + cmd_args)
-    if result != 0:
-        # If guidance build fails, try fallback mode
-        print(f"{RED}Guidance build failed. Attempting fallback mode without guidance...{CLEAR}")
-        # Remove --use_guidance from cmd_args
-        if "--use_guidance" in cmd_args:
-            cmd_args.remove("--use_guidance")
-
-        print(f"{MAGENTA}Running build.py with fallback args: {cmd_args}{CLEAR}")
-        result = subprocess.call([python_executable, "build.py"] + cmd_args)
-        if result != 0:
-            raise Exception(f"{RED}Failed to build ORT-GenAI in both guidance and fallback modes{CLEAR}")
-        else:
-            print(f"{MAGENTA}Successfully built ORT-GenAI in fallback mode{CLEAR}")
-    else:
-        print(f"{MAGENTA}Successfully built ORT-GenAI with guidance support{CLEAR}")
-
-    # Now install the ORT-GenAI library
-    build_dir_name = f"build/{get_platform_dirname(args)}/{args.build_type}"
-    build_dir_name = os.path.abspath(build_dir_name)
-
-    os.chdir(build_dir_name)
-
-    # Run install
-    print("Running install")
-    result = subprocess.call(
-        [
-            "cmake",
-            "--install",
-            ".",
-            "--prefix",
-            f"{build_dir_name}/install",
-        ]
-    )
-
-    if result != 0:
-        print(f"Current Directory: {os.getcwd()}")
-        raise Exception(f"{RED}Failed to install ONNX Runtime{CLEAR}")
-
-    # The "current_dir" is the "build_scripts" directory.
-    os.chdir(current_dir)
-
-    os.makedirs(f"{artifacts_dir}/include", exist_ok=True)
-    os.makedirs(f"{artifacts_dir}/lib", exist_ok=True)
-
-    copy_files_keeping_symlinks(glob.glob(f"{build_dir_name}/install/lib/*"), f"{artifacts_dir}/lib")
-    copy_files_keeping_symlinks(glob.glob(f"{build_dir_name}/install/bin/*"), f"{artifacts_dir}/lib")
-
-    copy_files_keeping_symlinks(
-        glob.glob(f"{build_dir_name}/install/include/*"),
-        f"{artifacts_dir}/include",
-    )
-    print(f"{MAGENTA}Artifacts are available in: \n{artifacts_dir}{CLEAR}")
-
-    print(f"{MAGENTA}ONNX Runtime Built{CLEAR}")
-    time_build_end = time.time()
-    print(f"GenAI Build Time: {time_build_end - time_build_start:.2f} seconds")
-
-
-def build_header_only(args, build_dir, artifacts_dir):
-    """
-    Fetch the header-only dependencies and copy their headers into the
-    artifacts tree.
-
-    Each library is cloned once into ``<build_dir>/src/<name>``, switched to
-    the tag given by its ``version`` entry, and then either the listed
-    ``files``, the listed ``directory`` or (if neither key is present) the
-    whole checkout is copied into ``<artifacts_dir>/include``.
-
-    Args:
-        args: Parsed command-line arguments (not used by this function).
-        build_dir: Directory that holds the ``src`` checkout folder. The
-            process working directory is changed to this directory.
-        artifacts_dir: Root directory that receives the ``include`` folder.
-
-    Returns:
-        None. A failing git command is reported on stdout and stops the
-        processing of the remaining libraries.
-    """
-    # List of header-only libraries
-    header_only_libs = [
-        {
-            "name": "json",
-            "url": "https://github.com/nlohmann/json.git",
-            "version": "v3.11.3",
-            "common_dest": False,
-            "directory": "include",
-        },
-        {
-            "name": "argparse",
-            "url": "https://github.com/p-ranav/argparse.git",
-            "version": "v3.2",
-            "common_dest": False,
-            "directory": "include",
-        },
-        {
-            "name": "cpp-httplib",
-            "url": "https://github.com/yhirose/cpp-httplib.git",
-            "version": "v0.18.5",
-            "common_dest": True,
-            "files": ["httplib.h"],
-        },
-    ]
-
-    # Resolve the destination before changing directory so that a relative
-    # artifacts_dir is interpreted against the caller's working directory.
-    dest_root_dir = os.path.abspath(f"{artifacts_dir}/include")
-
-    os.chdir(build_dir)
-
-    print(f"Current Directory: {os.getcwd()}")
-
-    # Everything below works with absolute paths and passes cwd= to git
-    # instead of calling os.chdir(). The working directory therefore stays at
-    # build_dir for every library and on every exit path, regardless of which
-    # copy branch a library takes or whether a git command fails.
-    src_dir = os.path.abspath("src")
-    os.makedirs(src_dir, exist_ok=True)
-
-    for lib in header_only_libs:
-        print(f"Building {lib['name']}")
-        repo_dir = os.path.join(src_dir, lib["name"])
-
-        # Clone the repo only when there is no checkout yet
-        if not os.path.exists(repo_dir):
-            print(f"{MAGENTA}Cloning {lib['name']}{CLEAR}")
-            # The explicit target directory keeps the checkout name equal to
-            # lib["name"] even if the repository is named differently.
-            result = subprocess.call(["git", "clone", lib["url"], repo_dir])
-            if result != 0:
-                print(f"Failed to clone {lib['name']}")
-                return
-
-        # Checkout the specific version
-        result = subprocess.call(["git", "fetch", "--tags", "origin"], cwd=repo_dir)
-        if result != 0:
-            print(f"{RED}Failed to get tags for {lib['name']}{CLEAR}")
-            return
-
-        result = subprocess.call(["git", "checkout", lib["version"]], cwd=repo_dir)
-        if result != 0:
-            print(f"{RED}Failed to checkout version: {lib['version']} {lib['name']}{CLEAR}")
-            return
-
-        os.makedirs(dest_root_dir, exist_ok=True)
-
-        if "files" in lib:
-            # Only the listed files are copied, taken from the checkout root
-            for file in lib["files"]:
-                shutil.copy2(os.path.join(repo_dir, file), dest_root_dir)
-        elif "directory" in lib:
-            copy_files_without_hidden(os.path.join(repo_dir, lib["directory"]), dest_root_dir)
-        else:
-            # Copy the entire directory
-            copy_files_without_hidden(repo_dir, dest_root_dir)
-
-    print(f"{MAGENTA}Header Only Libraries Built{CLEAR}")
-    print(f"{MAGENTA}Artifacts are available in: \n{dest_root_dir}{CLEAR}")
-
-
-def main():
-    """
-    Command-line entry point: prepare ONNX Runtime, build ONNX Runtime GenAI
-    and stage the header-only dependencies.
-
-    ONNX Runtime comes from one of three places, chosen by the arguments:
-    a build from source (--build_ort_from_source), an existing location
-    (--ort_home_dir), or a download of --ort_version_to_use (1.23.0 when the
-    option is not given).
-
-    Raises:
-        Exception: If an Android SDK or NDK path is given without
-            --build_ort_from_source.
-    """
-    parser = argparse.ArgumentParser(description="Build script for dependency libraries")
-
-    # Adding arguments
-    parser.add_argument("--android_sdk_path", type=str, help="Path to ANDROID SDK")
-    parser.add_argument("--android_ndk_path", type=str, help="Path to ANDROID NDK")
-    parser.add_argument("--api_level", type=str, help="Android API Level", default="27")  # e.g., 29
-    parser.add_argument(
-        "--qnn_sdk_path",
-        type=str,
-        help="Path to Qualcomm QNN SDK (AI Engine Direct)",
-    )
-    parser.add_argument(
-        "--build_type",
-        type=str,
-        default="Release",
-        help="{Release|RelWithDebInfo|Debug}",
-    )
-
-    parser.add_argument(
-        "--build_ort_from_source",
-        action="store_true",
-        help="If set, ONNX Runtime is built from source",
-    )
-
-    parser.add_argument(
-        "--ort_version_to_use",
-        type=str,
-        help="ONNX Runtime version to use when building from source. Must be a git tag or branch",
-    )
-
-    parser.add_argument(
-        "--ort_home_dir",
-        type=str,
-        help="Location of the prebuilt ORT artifacts.",
-    )
-
-    # Parsing arguments
-    args = parser.parse_args()
-
-    # Either Android path switches the script into Android mode; the flag is
-    # stored on args so the helpers called below can read it.
-    if args.android_sdk_path or args.android_ndk_path:
-        args.android = True
-        # Android builds need ONNX Runtime built from source, so fail early
-        # when the user did not ask for it
-        if not args.build_ort_from_source:
-            raise Exception("For Android build ONNX Runtime use: --build_ort_from_source")
-    else:
-        args.android = False
-
-    # Change directory to where this Python file is located to avoid any issues
-    # related to running this script from another directory
-    os.chdir(os.path.dirname(os.path.realpath(__file__)))
-
-    # Directory layout (all relative paths are resolved against this script's
-    # directory):
-    # - ../../../build/slm_deps
-    #   -- Handed to build_ort() and build_header_only() as their working
-    #      directory for dependency sources
-    # - slm_deps/artifacts/<platform>-<machine>
-    #   -- Receives the onnxruntime and onnxruntime-genai artifacts,
-    #      whether they were downloaded or built from source
-    # - slm_deps/artifacts/common
-    #   -- Receives the headers of the header-only libraries
-    dep_src_dir = os.path.abspath("../../../build/slm_deps")
-    os.makedirs(dep_src_dir, exist_ok=True)
-
-    artifacts_dir = os.path.abspath(f"slm_deps/artifacts/{get_platform_dirname(args)}-{get_machine_type(args)}")
-
-    os.makedirs(artifacts_dir, exist_ok=True)
-
-    common_artifacts_dir = os.path.abspath("slm_deps/artifacts/common")
-    os.makedirs(common_artifacts_dir, exist_ok=True)
-
-    time_build_start = time.time()
-    # Initialize the ort_home to None. Default behavior is to download the
-    # ONNX Runtime library and use that. If the user however chooses to build
-    # the ONNX Runtime library from source (e.g., for Android or other embedded targets)
-    # then we will use the location of the ort_home as set by the build_ort()
-    ort_home = None
-    if args.build_ort_from_source:
-        if args.ort_version_to_use is None:
-            # Default git ref: the v1.23.0 tag everywhere except Windows,
-            # where the main branch is used
-            if platform.system() != "Windows":
-                args.ort_version_to_use = "v1.23.0"
-            else:
-                args.ort_version_to_use = "main"
-        ort_home = build_ort(args, dep_src_dir, artifacts_dir)
-    else:
-        if args.ort_home_dir:
-            # Use the ORT location supplied by the user as-is
-            ort_home = os.path.abspath(args.ort_home_dir)
-        else:
-            # No ORT location was given: download the requested release
-            # (1.23.0 by default) into the artifacts directory.
-            # Note that this version string has no leading "v", unlike the
-            # git ref used for source builds above.
-            if args.ort_version_to_use is None:
-                ORT_VERSION = "1.23.0"
-            else:
-                ORT_VERSION = args.ort_version_to_use
-            # Copy the ORT artifacts to the artifacts directory.
-            download_ort(ORT_VERSION, artifacts_dir)
-            ort_home = artifacts_dir
-
-    # Now build the ORT-GenAI library
-    build_ort_genai(args, artifacts_dir, ort_home)
-
-    # Now build the header-only libraries
-    build_header_only(args, dep_src_dir, common_artifacts_dir)
-
-    # NOTE(review): this moves one level up from whatever directory the build
-    # helpers left as the cwd; the cwd at entry to main() is never saved, so
-    # it is not restored here.
-    os.chdir("..")
-    time_build_end = time.time()
-
-    print(f"Total Build Time: {time_build_end - time_build_start:.2f} seconds")
-
-
-# Run the build only when the file is executed as a script
-if __name__ == "__main__":
-    main()
diff --git a/examples/slm_engine/build_scripts/build_linux.sh b/examples/slm_engine/build_scripts/build_linux.sh
deleted file mode 100755
index 1b4feea1d2..0000000000
--- a/examples/slm_engine/build_scripts/build_linux.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/sh
-
-# Builds the slm_engine for Linux inside a docker container.
-# The Dockerfile next to this script is used to build an image that contains
-# the build dependencies. That image is then run twice with the repository
-# root mounted at the same path as on the host: first to build the
-# dependencies (build_deps.py), then to build the slm_engine (build.py).
-# Docker must be installed and running on the host machine.
-
-set -e
-set -x
-set -u
-
-# Work from the directory that holds this script so that the Dockerfile,
-# build_deps.py and build.py are found no matter where the script is
-# started from.
-cd "$(dirname "$0")"
-SCRIPT_DIR="$(pwd)"
-
-# The repository root is three levels above this directory. All expansions
-# below are quoted so that paths containing spaces survive word splitting.
-REPO_ROOT="$SCRIPT_DIR/../../../"
-
-# Build the docker image
-docker build -t slm-engine-builder -f Dockerfile .
-
-# Run the docker to build dependencies
-docker run --rm -v \
-    "$REPO_ROOT:$REPO_ROOT" \
-    -u "$(id -u):$(id -g)" -w "$SCRIPT_DIR" \
-    slm-engine-builder python3 build_deps.py
-
-# Next build the slm_engine
-docker run --rm -v \
-    "$REPO_ROOT:$REPO_ROOT" \
-    -u "$(id -u):$(id -g)" -w "$SCRIPT_DIR" \
-    slm-engine-builder python3 build.py
\ No newline at end of file
diff --git a/examples/slm_engine/build_scripts/docker_shell.sh b/examples/slm_engine/build_scripts/docker_shell.sh
deleted file mode 100755
index a64f4298c9..0000000000
--- a/examples/slm_engine/build_scripts/docker_shell.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/sh
-
-# Opens an interactive shell inside the slm-engine-builder docker image.
-# It uses the Dockerfile in the current directory to build a docker image
-# that contains all the necessary dependencies for building the slm_engine,
-# mounts the repository root (and, when QNN_SDK_HOME is set, the QNN SDK)
-# into the container and then starts bash.
-# The script assumes that the Dockerfile is in the current directory.
-# The build_deps command assembled below refers to the android-sdk and
-# android-ndk under /opt/android-sdk.
-# NOTE(review): BUILD_DEPS_CMD is assembled but never executed by this
-# script; presumably it is meant to be run by hand inside the shell - confirm.
-
-# Check the architecture
-if [ "$(uname -m)" != "x86_64" ]; then
-    echo "This script is intended to run on x86_64 architecture only."
-    exit 1
-fi
-
-set -e
-set -x
-set -u
-
-# Build the docker image
-docker build -t slm-engine-builder -f Dockerfile .
-
-# Define base build_deps command
-BUILD_DEPS_CMD="python3 build_deps.py \
-    --build_ort_from_source \
-    --android_sdk_path /opt/android-sdk/ \
-    --android_ndk_path /opt/android-sdk/ndk/27.2.12479018/"
-
-# Docker volume mount options
-VOLUME_MOUNTS="-v `pwd`/../../../:`pwd`/../../../"
-
-# Append the ONNX Runtime version to the command when USE_ORT_VERSION is set
-# and not empty
-if [ ! -z "${USE_ORT_VERSION:-}" ]; then
-    BUILD_DEPS_CMD="$BUILD_DEPS_CMD --ort_version_to_use $USE_ORT_VERSION"
-    echo "Using ONNX Runtime version: $USE_ORT_VERSION"
-fi
-
-# Add the QNN SDK mount and option when QNN_SDK_HOME is set and not empty
-if [ ! -z "${QNN_SDK_HOME:-}" ]; then
-    # Create Docker mount point for QNN SDK
-    QNN_SDK_DOCKER_PATH="/opt/qnn_sdk"
-    
-    # Add mount for QNN SDK
-    VOLUME_MOUNTS="$VOLUME_MOUNTS -v $QNN_SDK_HOME:$QNN_SDK_DOCKER_PATH"
-    
-    # Use the Docker path in build command
-    BUILD_DEPS_CMD="$BUILD_DEPS_CMD --qnn_sdk_path $QNN_SDK_DOCKER_PATH"
-    
-    echo "QNN SDK path detected, building with QNN support"
-    echo "Mounting $QNN_SDK_HOME to $QNN_SDK_DOCKER_PATH in container"
-fi
-
-# Start an interactive bash in the container as the calling user.
-# VOLUME_MOUNTS is expanded unquoted on purpose: it holds several options.
-# NOTE(review): -w is passed twice (`pwd` and $HOME); presumably only the
-# last one takes effect - confirm which working directory is intended.
-docker run --rm $VOLUME_MOUNTS \
-    -u $(id -u):$(id -g) -w `pwd` \
-    -w $HOME \
-    -it slm-engine-builder bash
diff --git a/examples/slm_engine/src/CMakeLists.txt b/examples/slm_engine/src/CMakeLists.txt
deleted file mode 100644
index 4c04f2d29d..0000000000
--- a/examples/slm_engine/src/CMakeLists.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-# Top-level build description of the SLM Engine example.
-# It locates the prebuilt ONNX Runtime and ONNX Runtime GenAI artifacts,
-# copies their libraries into the install tree, fetches GoogleTest and then
-# hands over to the cpp subdirectory, where the targets are defined.
-cmake_minimum_required(VERSION 3.28)
-project(SLMEngine)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-set(TOPLEVEL_DIR ${CMAKE_CURRENT_LIST_DIR})
-# Name of the per-platform artifacts folder: <system>-<processor>, with
-# "MacOS" used in place of "Darwin"
-set(SYS_TARGET_ID ${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR})
-if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") 
-    set(SYS_TARGET_ID MacOS-${CMAKE_SYSTEM_PROCESSOR})
-endif()
-
-# ARTIFACTS_DIR is not set anywhere in this file; presumably it is passed on
-# the cmake command line (-DARTIFACTS_DIR=...) - confirm against the caller
-set(ARTIFACTS ${ARTIFACTS_DIR})
-
-get_filename_component(ARTIFACTS ${ARTIFACTS} ABSOLUTE)
-message(STATUS "ARTIFACTS: ${ARTIFACTS}")
-
-# Read the version number from the first line of VERSION.txt and expose it
-# to the sources as the SW_VERSION_NUMBER compile definition
-file (
-    STRINGS 
-    ${CMAKE_CURRENT_LIST_DIR}/VERSION.txt 
-    VERSION_NUMBER
-    LIMIT_COUNT 1)
-message(STATUS "SLM Engine VERSION: ${VERSION_NUMBER}")
-add_compile_definitions(SW_VERSION_NUMBER="${VERSION_NUMBER}")
-add_compile_definitions(BUILDING_SLM_ENGINE)
-
-# Read the version number of the Gen AI library from VERSION_INFO at the
-# repository root (three levels above this file); fall back to "UNKNOWN"
-set(REPO_ROOT ${CMAKE_CURRENT_LIST_DIR}/../../..)
-if (EXISTS ${REPO_ROOT}/VERSION_INFO)
-    file(READ "${REPO_ROOT}/VERSION_INFO" ver)
-    set(ORT_GENAI_VERSION ${ver})
-else()
-    message(WARNING "VERSION_INFO file not found. Setting to UNKNOWN")
-    set(ORT_GENAI_VERSION "UNKNOWN")  
-endif()
-message(STATUS "ORT-GENAI-VERSION: ${ORT_GENAI_VERSION}")
-add_compile_definitions(ORT_GENAI_VERSION="${ORT_GENAI_VERSION}")
-
-# Per-platform library extension and install RPATH. Any system other than
-# Darwin, Android, Linux or Windows stops the configuration.
-if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") 
-    set(CMAKE_INSTALL_RPATH "@loader_path")
-    set(LIB_EXT "dylib")
-elseif( (${CMAKE_SYSTEM_NAME} STREQUAL "Android") OR (${CMAKE_SYSTEM_NAME} STREQUAL "Linux"))
-    set(LIB_EXT "so")
-    if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
-        set(CMAKE_INSTALL_RPATH "\$ORIGIN")
-    endif()
-elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
-    set(LIB_EXT "lib")
-else()
-    message(FATAL_ERROR "Unsupported OS: ${CMAKE_SYSTEM_NAME}")
-endif()
-
-# Setup Google Test
-include(FetchContent)
-FetchContent_Declare(
-  googletest
-  GIT_REPOSITORY https://github.com/google/googletest.git
-  GIT_TAG v1.15.2
-)
-# For Windows: Prevent overriding the parent project's compiler/linker settings
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-# Neither GoogleMock nor GoogleTest's install rules are needed here
-set(BUILD_GMOCK OFF CACHE BOOL "" FORCE)
-set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
-FetchContent_MakeAvailable(googletest)
-
-# Library file names carry the "lib" prefix everywhere except on Windows
-if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
-    set(LIBNAME_PREFIX lib)
-endif()
-
-# Import the onnxruntime
-# NOTE(review): LIB_VERSION_ORT is not set in this file, so it expands to
-# nothing unless the caller defines it - confirm
-add_library(ort SHARED IMPORTED)
-set_target_properties(
-    ort 
-    PROPERTIES 
-    IMPORTED_IMPLIB 
-    ${ARTIFACTS}/${SYS_TARGET_ID}/lib/${LIBNAME_PREFIX}onnxruntime${LIB_VERSION_ORT}.${LIB_EXT})
-
-# Collect every onnxruntime* file in the artifacts lib folder ...
-file(GLOB ORT_LIB_FILES 
-    "${ARTIFACTS}/${SYS_TARGET_ID}/lib/${LIBNAME_PREFIX}onnxruntime*")
-
-# ... plus the lib folder itself and anything matching onnxruntime*/ in it
-file(GLOB ORT_LIB_DIRS "${ARTIFACTS}/${SYS_TARGET_ID}/lib"
-    "${ARTIFACTS}/${SYS_TARGET_ID}/lib/${LIBNAME_PREFIX}onnxruntime*/")
-
-list(APPEND IMPORTED_LIB_LIST ${ORT_LIB_FILES})
-list(APPEND IMPORTED_LIB_LIST ${ORT_LIB_DIRS})
-
-# Import the ONNX Runtime GenAI library
-add_library(ort-genai SHARED IMPORTED)
-set_target_properties(
-    ort-genai 
-    PROPERTIES 
-    IMPORTED_IMPLIB 
-    ${ARTIFACTS}/${SYS_TARGET_ID}/lib/${LIBNAME_PREFIX}onnxruntime-genai.${LIB_EXT})
-
-list(APPEND IMPORTED_LIB_LIST 
-${ARTIFACTS}/${SYS_TARGET_ID}/lib/${LIBNAME_PREFIX}onnxruntime-genai.${LIB_EXT})
-
-# Everything is installed below the build directory
-set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install/")
-
-# Copy every collected file and directory into <install prefix>/bin.
-# "install(FILES" cannot be used because the list contains both files and
-# directories. file(COPY) runs while CMake configures, not at install time.
-foreach (lib ${IMPORTED_LIB_LIST})
-    get_filename_component(lib_name ${lib} NAME)
-    message(STATUS "Copying ${lib_name} to install directory")
-    file(COPY ${lib} DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
-endforeach()
-
-# Include directories: the per-platform headers and the common
-# (header-only library) headers
-message(STATUS "Adding include directories: ${ARTIFACTS}/${SYS_TARGET_ID}/include")
-include_directories(${ARTIFACTS}/${SYS_TARGET_ID}/include)
-include_directories(${ARTIFACTS}/common/include)
-# NOTE(review): the message below announces ${CMAKE_CURRENT_LIST_DIR}/include,
-# but no include_directories() call adds that path in this file
-message(STATUS "Adding include directories: ${CMAKE_CURRENT_LIST_DIR}/include")
-
-add_subdirectory(cpp)
diff --git a/examples/slm_engine/src/VERSION.txt b/examples/slm_engine/src/VERSION.txt
deleted file mode 100644
index 4a36342fca..0000000000
--- a/examples/slm_engine/src/VERSION.txt
+++ /dev/null
@@ -1 +0,0 @@
-3.0.0
diff --git a/examples/slm_engine/src/cpp/CMakeLists.txt b/examples/slm_engine/src/cpp/CMakeLists.txt
deleted file mode 100644
index b16fb4b637..0000000000
--- a/examples/slm_engine/src/cpp/CMakeLists.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-
-# The SLM engine itself, built as a shared library
-add_library(
-    slmengine
-    SHARED
-    slm_engine.cpp
-    input_decoder.cpp
-)
-
-target_link_libraries(slmengine ort-genai ort)
-
-# Stand-alone driver for the input decoder. It compiles input_decoder.cpp
-# directly instead of linking slmengine and has no install rule below.
-add_executable(input_decoder-test input_decoder.cpp input_decoder_test.cpp)
-target_link_libraries(input_decoder-test GTest::gtest)
-
-# Unit tests of the engine; gtest_main.cpp supplies main()
-add_executable(
-    unit-test 
-    slm_engine_test.cpp
-    gtest_main.cpp
-
-)
-target_link_libraries(
-    unit-test  
-    slmengine 
-    GTest::gtest
-)
-
-# Executables built on top of the engine library
-add_executable(slm-runner slm_runner.cpp)
-target_link_libraries(slm-runner slmengine ort-genai ort)
-
-add_executable(slm-server slm_server.cpp)
-target_link_libraries(slm-server slmengine ort-genai ort)
-    
-# Install the *.h files of the per-platform artifacts include folder
-install(DIRECTORY ${ARTIFACTS}/${SYS_TARGET_ID}/include
-    DESTINATION .
-    FILES_MATCHING PATTERN "*.h"
-)
-
-# Public headers of the engine
-install(FILES slm_engine.h DESTINATION include)
-install(FILES input_decoder.h DESTINATION include)
-
-# The library and the executables all go to bin
-install(TARGETS slmengine DESTINATION bin)
-install(TARGETS slm-runner DESTINATION bin)
-install(TARGETS slm-server DESTINATION bin)
-install(TARGETS unit-test DESTINATION bin)
diff --git a/examples/slm_engine/src/cpp/gtest_main.cpp b/examples/slm_engine/src/cpp/gtest_main.cpp
deleted file mode 100644
index a188303a16..0000000000
--- a/examples/slm_engine/src/cpp/gtest_main.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-
-#include "gtest/gtest.h"
-#include <argparse/argparse.hpp>
-
-#include "httplib.h"
-#include "ort_genai.h"
-
-using namespace std;
-
-extern const char* MODEL_FILE_PATH;
-extern const char* ADAPTER_ROOT_DIR;
-
-/// @brief Entry point of the unit-test executable.
-///
-/// GoogleTest consumes its own flags first. The remaining command line is
-/// then parsed for the optional model and adapter locations, which are
-/// published through the MODEL_FILE_PATH / ADAPTER_ROOT_DIR globals before
-/// all tests are run.
-/// @return The result of RUN_ALL_TESTS(); the process exits with -1 when the
-///         command line cannot be parsed.
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-
-  argparse::ArgumentParser program("slm_engine_test", "1.0",
-                                   argparse ::default_arguments::none);
-  string model_path;
-  program.add_argument("-m", "--model_path")
-      .help("Path to the model file")
-      .store_into(model_path);
-
-  // The short flag must differ from the one of --model_path: both options
-  // used to be registered as "-m", which made the short form ambiguous.
-  string adapter_root_path;
-  program.add_argument("-a", "--adapter_root_path")
-      .help("Path to the LoRA adapter root directory")
-      .store_into(adapter_root_path);
-
-  try {
-    program.parse_args(argc, argv);
-  } catch (const std::exception& err) {
-    std::cerr << err.what() << std::endl;
-    std::cerr << program;
-    std::exit(-1);
-  }
-
-  // The globals only borrow the character data. This is safe because both
-  // strings live until main() returns, i.e. for the whole test run.
-  if (!model_path.empty()) {
-    cout << "Setting Model path: " << model_path << endl;
-    MODEL_FILE_PATH = model_path.c_str();
-  }
-
-  if (!adapter_root_path.empty()) {
-    cout << "Setting Adapter path: " << adapter_root_path << endl;
-    ADAPTER_ROOT_DIR = adapter_root_path.c_str();
-  }
-
-  auto status = RUN_ALL_TESTS();
-
-  OgaShutdown();
-  return status;
-}
diff --git a/examples/slm_engine/src/cpp/input_decoder.cpp b/examples/slm_engine/src/cpp/input_decoder.cpp
deleted file mode 100644
index b9b8f186dc..0000000000
--- a/examples/slm_engine/src/cpp/input_decoder.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-#include "input_decoder.h"
-
-#include <iostream>
-#include <nlohmann/json.hpp>
-using json = nlohmann::json;
-
-#define RED "\033[31;1m"
-#define GREEN "\033[32m"
-#define CLEAR "\033[0m"
-
-using namespace std;
-
-namespace microsoft {
-namespace slm_engine {
-
-// clang-format off
-// OpenAI API example
-// {
-//     "model": "name-of-the-adapter (optional)", 
-//     "messages": [
-// 		{
-// 			"role": "system",
-// 			"content": "You are an in car virtual assistant that maps user's inputs to the corresponding function call in the vehicle. You must respond with only a JSON object matching the following schema: {\"function_name\": <name of the function>, \"arguments\": <arguments of the function>}"
-// 		},
-// 		{
-// 			"role": "user",
-// 			"content": "Would you mind changing the radio station to BBC Radio 1, please?"
-// 		}
-// 	],
-// 	"temperature": 0,
-// 	"stop": [
-// 		"\n\n",
-// 		"\n\n\n"
-// 	],
-// 	"max_tokens": 250
-// }
-
-// clang-format on
-
-/// @brief Decoder for request bodies that follow the OpenAI chat layout
-///        shown in the example above.
-class OpenAIInputDecoder : public InputDecoder {
- public:
-  /// @brief Parses a JSON request and fills in the decoded parameters.
-  ///
-  /// "messages" is mandatory and must not be empty. "model", "temperature",
-  /// "max_tokens", "top_k", "top_p", "stop" and "tools" are optional and
-  /// only overwrite the corresponding field when present.
-  /// @param message The request body as JSON text
-  /// @param decoded_params Receives the decoded values. It may be partially
-  ///        filled when false is returned.
-  /// @return True when the request was decoded. False when the JSON is
-  ///         malformed, a field has an unexpected type, or the messages are
-  ///         missing, empty or rejected by extract_messages().
-  bool decode(const string& message, InputParams& decoded_params) override {
-    try {
-      auto json_msg = json::parse(message);
-      // Look for "messages"
-      if (!json_msg.contains("messages")) {
-        cout << RED << "Required node 'messages' not found!" << CLEAR
-             << endl;
-        return false;
-      } else {
-        auto messages = json_msg.at("messages");
-        if (messages.size() == 0) {
-          cout << RED << "Empty \"messages\" node\n"
-               << CLEAR;
-          return false;
-        }
-        if (!extract_messages(messages, decoded_params)) {
-          cout << RED << "Error extracting messages\n"
-               << CLEAR;
-          return false;
-        }
-      }
-      if (json_msg.contains("model")) {
-        decoded_params.LoRAAdapterName = json_msg["model"].get<string>();
-      }
-      // The destination fields are plain float, so read them as float
-      // rather than as float_t (which would also require <cmath>)
-      if (json_msg.contains("temperature")) {
-        decoded_params.Temperature =
-            json_msg["temperature"].get<float>();
-      }
-      if (json_msg.contains("max_tokens")) {
-        decoded_params.MaxGeneratedTokens =
-            json_msg["max_tokens"].get<uint32_t>();
-      }
-      if (json_msg.contains("top_k")) {
-        decoded_params.TopK = json_msg["top_k"].get<uint32_t>();
-      }
-      if (json_msg.contains("top_p")) {
-        decoded_params.TopP = json_msg["top_p"].get<float>();
-      }
-
-      // Between one and four stop sequences are accepted; any other count
-      // is reported and ignored without failing the request
-      if (json_msg.contains("stop")) {
-        auto stop_tokens = json_msg.at("stop");
-        if (stop_tokens.size() > 0 && stop_tokens.size() < 5) {
-          for (auto& next_token : stop_tokens) {
-            decoded_params.StopTokens.push_back(next_token);
-          }
-        } else {
-          cout << RED
-               << "Wrong size of stop tokens: " << stop_tokens.size()
-               << CLEAR << endl;
-        }
-      }
-
-      // Handle tools parameter for function calling: the node is kept as
-      // its serialized JSON text
-      if (json_msg.contains("tools")) {
-        decoded_params.ToolsJson = json_msg["tools"].dump();
-        decoded_params.HasTools = true;
-      }
-    } catch (const json::exception& err) {
-      // json::exception is the base class of every nlohmann::json error.
-      // Catching it covers malformed JSON (parse_error) as well as fields
-      // of the wrong type (type_error), e.g. a numeric "model" or a
-      // non-string message content, so such requests yield false instead
-      // of an exception escaping to the caller.
-      cout << RED << "Error in JSON At: " << err.what() << CLEAR << endl;
-      return false;
-    }
-    return true;
-  }
-
- private:
-  /// @brief Appends the entries of the "messages" array to
-  ///        decoded_params.Messages and records the latest user prompt.
-  ///
-  /// Every entry needs "role" and "content". At most one system message is
-  /// accepted, and two user messages must be separated by an assistant
-  /// message. Roles other than system, user and assistant are rejected.
-  /// @param messages The "messages" JSON array
-  /// @param decoded_params Receives the messages and the user prompt
-  /// @return False as soon as one entry violates the rules above
-  bool extract_messages(const nlohmann::json& messages,
-                        InputParams& decoded_params) {
-    bool user_msg_found = false;
-    bool system_msg_found = false;
-
-    for (auto& next_msg : messages) {
-      if (next_msg.contains("role")) {
-        auto role = next_msg.at("role");
-        if (!next_msg.contains("content")) {
-          cout << RED << "Error: No content for role: " << role
-               << CLEAR << endl;
-          return false;
-        }
-        if (role == "system") {  // system message
-          if (system_msg_found) {
-            cout << RED << "Error: System message already exists"
-                 << CLEAR << endl;
-            return false;
-          }
-          decoded_params.Messages.push_back(
-              {InputParams::Role::SYSTEM, next_msg["content"]});
-          system_msg_found = true;
-        } else if (role == "user" || role == "assistant") {
-          // Check that the next message is what we expect.
-          // Meaning - the sequence is:
-          // system, user, assistant, user, assistant, ...
-          // The validity of the system message is checked above. So
-          // we just need to check the user and assistant messages
-          if (role == "user") {
-            if (user_msg_found) {
-              cout << RED << "Error: User message already exists"
-                   << CLEAR << endl;
-              return false;
-            } else {
-              decoded_params.Messages.push_back(
-                  {InputParams::Role::USER, next_msg["content"]});
-              user_msg_found = true;
-              decoded_params.UserPrompt = next_msg["content"];
-            }
-          } else {
-            // assistant message found
-            decoded_params.Messages.push_back(
-                {InputParams::Role::ASSISTANT,
-                 next_msg["content"]});
-            user_msg_found = false;  // reset the flag
-          }
-        } else {  // unknown role
-          cout << RED << "Unknown role: " << role << CLEAR << endl;
-          return false;
-        }
-      } else {  // role not found
-        cout << RED << "Role not found in message" << CLEAR << endl;
-        return false;
-      }
-    }
-    return true;
-  }
-};
-
-// Factory for the decoder that matches the given API name.
-// "openai" is the only name that is recognized; every other name is
-// reported on stdout and yields a null pointer.
-unique_ptr<InputDecoder> InputDecoder::CreateDecoder(const string& name) {
-  const bool is_known = (name == "openai");
-  if (!is_known) {
-    cout << RED << "Decoder not available: " << name << CLEAR << endl;
-    return nullptr;
-  }
-
-  return make_unique<OpenAIInputDecoder>();
-}
-
-}  // namespace slm_engine
-}  // namespace microsoft
diff --git a/examples/slm_engine/src/cpp/input_decoder.h b/examples/slm_engine/src/cpp/input_decoder.h
deleted file mode 100644
index 0213728740..0000000000
--- a/examples/slm_engine/src/cpp/input_decoder.h
+++ /dev/null
@@ -1,129 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-
-namespace microsoft {
-namespace slm_engine {
-/// @brief Abstract interface shared by all input decoders, i.e. the
-///        components that turn an API-specific request (such as OpenAI)
-///        into InputParams.
-class InputDecoder {
- public:
-  /// @brief Factory for the decoder that understands the named API
-  /// @param name Name of the API provider
-  /// @return The decoder instance, or nullptr when the name is not supported
-  static std::unique_ptr<InputDecoder> CreateDecoder(const std::string& name);
-
-  /// @brief The parameters extracted from one request
-  struct InputParams {
-    enum class Role { SYSTEM,
-                      USER,
-                      ASSISTANT,
-                      TOOL };
-
-    /// @brief Maps a role name to its enumerator; any name other than
-    ///        "system", "user" or "tool" maps to ASSISTANT
-    static Role ToRole(const std::string& role) {
-      if (role == "system") return Role::SYSTEM;
-      if (role == "user") return Role::USER;
-      if (role == "tool") return Role::TOOL;
-      return Role::ASSISTANT;
-    }
-
-    // Conversation so far: the system prompt first, then alternating
-    // user and assistant messages
-    std::vector<std::pair<Role, std::string>> Messages;
-    // Content of the last user message in the sequence
-    std::string UserPrompt;
-    // Sent by the client as "model" in the OpenAI API; here it is the name
-    // of the adapter that will be used
-    std::string LoRAAdapterName;
-    uint32_t MaxGeneratedTokens;
-    std::vector<std::string> StopTokens;
-    float Temperature;
-    float TopP;
-    uint32_t TopK;
-
-    // Function calling support
-    std::string ToolsJson;  // Raw tools JSON string from input
-    bool HasTools;
-
-    /// @brief Sets the defaults used when a request omits a parameter
-    explicit InputParams()
-        : MaxGeneratedTokens(512),
-          Temperature(0.00000000000001f),
-          TopP(1.0f),
-          TopK(50),
-          HasTools(false) {}
-
-    /// @brief Renders the messages one per line, as the quoted role followed
-    ///        by the quoted content inside a pair of braces
-    std::string get_messages() {
-      std::ostringstream rendered;
-      for (const auto& entry : Messages) {
-        const Role role = entry.first;
-        if (role == Role::SYSTEM) {
-          rendered << "{\"role\": \"system\", ";
-        } else if (role == Role::USER) {
-          rendered << "{\"role\": \"user\", ";
-        } else if (role == Role::TOOL) {
-          rendered << "{\"role\": \"tool\", ";
-        } else if (role == Role::ASSISTANT) {
-          rendered << "{\"role\": \"assistant\", ";
-        }
-        rendered << "\"" << entry.second << "\"}\n";
-      }
-      return rendered.str();
-    }
-
-    /// @brief Renders the messages one per line in the readable form
-    ///        "Role: <ROLE> Content: <text>"
-    std::string to_string() {
-      std::ostringstream rendered;
-      for (const auto& entry : Messages) {
-        const Role role = entry.first;
-        rendered << "Role: ";
-        if (role == Role::SYSTEM) {
-          rendered << "SYSTEM";
-        } else if (role == Role::USER) {
-          rendered << "USER";
-        } else if (role == Role::TOOL) {
-          rendered << "TOOL";
-        } else if (role == Role::ASSISTANT) {
-          rendered << "ASSISTANT";
-        }
-        rendered << " Content: " << entry.second << '\n';
-      }
-      return rendered.str();
-    }
-  };
-
-  /// @brief Virtual destructor so that derived decoders are destroyed
-  ///        correctly through a base pointer
-  virtual ~InputDecoder() = default;
-
-  /// @brief Extracts the fields of a request
-  /// @param message The request, encoded in JSON, in the API dialect this
-  /// decoder understands
-  /// @param decoded_params Receives the values decoded from the message
-  /// @return True when all the mandatory parameters are specified, False
-  /// otherwise
-  virtual bool decode(const std::string& message,
-                      InputParams& decoded_params) = 0;
-};
-}  // namespace slm_engine
-}  // namespace microsoft
\ No newline at end of file
diff --git a/examples/slm_engine/src/cpp/input_decoder_test.cpp b/examples/slm_engine/src/cpp/input_decoder_test.cpp
deleted file mode 100644
index 57fdc935a8..0000000000
--- a/examples/slm_engine/src/cpp/input_decoder_test.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "input_decoder.h"
-
-#include <gtest/gtest.h>
-
-#include <argparse/argparse.hpp>
-#include <filesystem>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#define MAGENTA_BOLD "\033[35;1m"
-#define MAGENTA "\033[35m"
-#define RED_BOLD "\033[31;1m"
-#define RED "\033[31m"
-#define BLUE_BOLD "\033[34;1m"
-#define BLUE "\033[34m"
-#define GREEN_BOLD "\033[32;1m"
-#define GREEN "\033[32m"
-#define CLEAR "\033[0m"
-
-using namespace std;
-
-/// @brief Runs every non-empty line of a JSONL file through the "openai"
-/// input decoder and prints the decoded messages
-/// @param test_data_file JSONL file with one request per line
-/// @return -1 when the file does not exist, 0 otherwise (a line that fails
-/// to decode is only reported, it does not change the result)
-int run_test(const string& test_data_file) {
-  // Nothing to do without the input file
-  const bool have_input = filesystem::exists(test_data_file);
-  if (!have_input) {
-    cout << "Error! Test Data file doesn't exist: " << test_data_file
-         << "\n";
-    return -1;
-  }
-
-  auto decoder =
-      microsoft::slm_engine::InputDecoder::CreateDecoder("openai");
-  ifstream requests(test_data_file);
-  for (string request; getline(requests, request);) {
-    if (request.empty()) {
-      continue;
-    }
-    // Each request is decoded into a fresh set of parameters
-    microsoft::slm_engine::InputDecoder::InputParams decoded;
-    if (decoder->decode(request, decoded)) {
-      cout << BLUE << decoded.get_messages() << CLEAR << endl;
-    } else {
-      cout << MAGENTA_BOLD << "Error in decoding\n"
-           << CLEAR;
-    }
-  }
-  return 0;
-}
-
-/// @brief Program entry point: reads the mandatory test data file option
-/// and runs the decoder over that file
-/// @return The result of run_test(); the process exits with -1 when the
-/// command line cannot be parsed
-int main(int argc, char** argv) {
-  string jsonl_path;
-
-  argparse::ArgumentParser cli("slm_runner", "1.0",
-                               argparse ::default_arguments::none);
-  cli.add_argument("-t", "--test_data_file")
-      .required()
-      .help("Path to the test data file (JSONL)")
-      .store_into(jsonl_path);
-
-  try {
-    cli.parse_args(argc, argv);
-  } catch (const std::exception& parse_failure) {
-    // Show the reason followed by the usage text, then give up
-    std::cerr << parse_failure.what() << std::endl;
-    std::cerr << cli;
-    std::exit(-1);
-  }
-
-  const int outcome = run_test(jsonl_path);
-  return outcome;
-}
\ No newline at end of file
diff --git a/examples/slm_engine/src/cpp/slm_engine.cpp b/examples/slm_engine/src/cpp/slm_engine.cpp
deleted file mode 100644
index ea7c028866..0000000000
--- a/examples/slm_engine/src/cpp/slm_engine.cpp
+++ /dev/null
@@ -1,1209 +0,0 @@
-#include "slm_engine.h"
-
-#include <chrono>
-#include <filesystem>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <regex>
-#include <sstream>
-#include <string>
-#include <stdio.h>
-#include <string.h>
-
-#if !defined(_WIN32)
-#include <sys/resource.h>
-#include <sys/time.h>
-#include <unistd.h>
-#else
-#include <windows.h>
-#include <psapi.h>
-#endif
-
-#include <nlohmann/json.hpp>
-
-#include "onnxruntime_cxx_api.h"
-
-using namespace std;
-using json = nlohmann::json;
-
-#define MAGENTA "\033[35;1m"
-#define RED "\033[31;1m"
-#define BLUE "\033[34;1m"
-#define GREEN "\033[32;1m"
-#define CLEAR "\033[0m"
-
-// Function calling instructions to be added to system prompts
-const std::string function_calling_instructions = R"(
-
-In addition to plain text responses, you can chose to call one or more of the provided functions.
-
-Use the following rule to decide when to call a function:
-  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
-  * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
-
-If you decide to call functions:
-  * prefix function calls with <|tool|> marker and end with <|/tool|> marker
-  * all function calls should be generated in a single JSON list formatted as [{"name": [function name], "arguments": [function arguments as JSON]}, ...]
-  * follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples
-  * respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
-  * make sure you pick the right functions that match the user intent
-
-Available functions as JSON spec:
-)";
-
-namespace microsoft {
-namespace slm_engine {
-
-SLMEngine::SupportedModelType SLMEngine::StringToModelType(const std::string& model_type) {
-  if (strncasecmp(model_type.c_str(), "phi", 3) == 0) {
-    return SLMEngine::SupportedModelType::PHI;
-  } else if (strncasecmp(model_type.c_str(), "llama", 5) == 0) {
-    return SLMEngine::SupportedModelType::Llama;
-  } else if (strncasecmp(model_type.c_str(), "qwen", 4) == 0) {
-    return SLMEngine::SupportedModelType::Qwen;
-  } else if (strncasecmp(model_type.c_str(), "custom", 6) == 0) {
-    return SLMEngine::SupportedModelType::CUSTOM;
-  }
-  return SLMEngine::SupportedModelType::UNKNOWN;
-}
-
-std::string SLMEngine::ModelTypeToString(SLMEngine::SupportedModelType model_type) {
-  switch (model_type) {
-    case SLMEngine::SupportedModelType::PHI:
-      return "phi";
-    case SLMEngine::SupportedModelType::Llama:
-      return "llama";
-    case SLMEngine::SupportedModelType::Qwen:
-      return "qwen";
-    case SLMEngine::SupportedModelType::CUSTOM:
-      return "custom";
-    case SLMEngine::SupportedModelType::UNKNOWN:
-    default:
-      return "unknown";
-  }
-}
-
-std::unique_ptr<SLMEngine> SLMEngine::Create(
-    const char* model_path, bool verbose) {
-  auto new_obj = std::unique_ptr<SLMEngine>(new SLMEngine(verbose));
-  if (!new_obj->load_model(model_path)) {
-    cout << RED << "Error creating the SLM Engine" << CLEAR << endl;
-    return nullptr;
-  }
-  return std::move(new_obj);
-}
-
-std::unique_ptr<SLMEngine> SLMEngine::Create(
-    const char* model_path,
-    const std::vector<LoRAAdapter> adapters,
-    bool verbose, Status& status_msg) {
-  // Load the model
-  auto new_obj = std::unique_ptr<SLMEngine>(new SLMEngine(verbose));
-  if (!new_obj->load_model(model_path)) {
-    cout << RED << "Error creating the SLM Engine" << CLEAR << endl;
-    status_msg.code = false;
-    status_msg.message = "Failed to load model: " + std::string(model_path);
-    return nullptr;
-  }
-
-  new_obj->m_adapters = OgaAdapters::Create(*new_obj->m_onnx_model.get());
-  if (!new_obj->m_adapters) {
-    status_msg.code = false;
-    status_msg.message = "Failed to create adapters";
-    return nullptr;
-  }
-
-  // Create the adapters
-  for (const auto& adapter : adapters) {
-    if (adapter.name.empty() || adapter.adapter_path.empty()) {
-      status_msg.code = false;
-      status_msg.message = "Adapter name or path is empty";
-      return nullptr;
-    }
-
-    // Check if the adapter path exists
-    std::ifstream file_check(adapter.adapter_path);
-    if (!file_check.good()) {
-      status_msg.code = false;
-      status_msg.message = "Adapter path does not exist: " + adapter.adapter_path;
-      return nullptr;
-    }
-    file_check.close();
-
-    // Load the adapter
-    new_obj->m_adapters->LoadAdapter(adapter.adapter_path.c_str(),
-                                     adapter.name.c_str());
-  }
-
-  new_obj->m_adapters_list = adapters;
-
-  return std::move(new_obj);
-}
-
-std::vector<SLMEngine::LoRAAdapter> SLMEngine::get_adapter_list() {
-  std::vector<SLMEngine::LoRAAdapter> adapter_list;
-  for (const auto& adapter : m_adapters_list) {
-    adapter_list.emplace_back(adapter.name, adapter.adapter_path);
-  }
-  return adapter_list;
-}
-
-void SLMEngine::GetVersion(std::string& slm_version, std::string& ortga_version,
-                           std::string& ort_version) {
-// SW_VERSION_NUMBER is defined in the CMakeLists.txt file
-#ifdef SW_VERSION_NUMBER
-  slm_version = std::string(SW_VERSION_NUMBER);
-#else
-  slm_version = "unknown";
-#endif
-
-#ifdef ORT_GENAI_VERSION
-  ortga_version = std::string(ORT_GENAI_VERSION);
-#else
-  ortga_version = "unknown";
-#endif
-
-  ort_version = Ort::GetVersionString();
-}
-
-std::string SLMEngine::GetModelFamily(const std::string& model_path) {
-  // Open the config.json file
-  std::ifstream config_file(model_path + "/config.json");
-  if (!config_file.is_open()) {
-    std::cout << RED << "Error opening config.json file" << CLEAR << std::endl;
-    return "";
-  }
-  // Parse the JSON file
-  json config_json;
-  config_file >> config_json;
-  config_file.close();
-
-  // Check if the "model_type" field exists
-  if (config_json.find("model_type") == config_json.end()) {
-    std::cout << RED << "Error: model_type field not found in config.json" << CLEAR << std::endl;
-    return "";
-  }
-  // Get the value of the "model_type" field
-  std::string model_type = config_json["model_type"];
-
-  return model_type;
-}
-
-std::string SLMEngine::format_prompt(
-    const std::string& system_prompt,
-    const std::string& user_prompt) {
-  std::stringstream ss_output;
-  ss_output << m_prompt_format.prompt_format.at(InputDecoder::InputParams::Role::SYSTEM).prefix
-            << system_prompt
-            << m_prompt_format.prompt_format.at(InputDecoder::InputParams::Role::SYSTEM).suffix;
-  ss_output << m_prompt_format.prompt_format.at(InputDecoder::InputParams::Role::USER).prefix
-            << user_prompt
-            << m_prompt_format.prompt_format.at(InputDecoder::InputParams::Role::USER).suffix;
-  ss_output << m_prompt_format.prompt_format.at(InputDecoder::InputParams::Role::ASSISTANT).prefix;
-
-  return ss_output.str();
-}
-
-SLMEngine::~SLMEngine() {
-  m_onnx_model.reset();
-  m_tokenizer.reset();
-  m_tokenizer_stream.reset();
-  m_input_decoder.reset();
-}
-
-std::unique_ptr<OgaGenerator> SLMEngine::create_generator(
-    const std::string& formatted_prompt,
-    const GenerationOptions& generation_options,
-    uint32_t& time_to_prefill) {
-  auto generator_params = OgaGeneratorParams::Create(*m_onnx_model);
-  if (!generator_params) {
-    return nullptr;
-  }
-
-  generator_params->SetSearchOption("max_length", generation_options.MaxGeneratedTokens);
-  generator_params->SetSearchOption("temperature", generation_options.Temperature);
-  generator_params->SetSearchOption("top_p", generation_options.TopP);
-  generator_params->SetSearchOption("top_k", generation_options.TopK);
-
-  auto mem_before = GetMemoryUsage();
-
-  // Create the generator
-  auto generator = OgaGenerator::Create(*m_onnx_model, *generator_params);
-  if (!generator) {
-    return nullptr;
-  }
-
-  auto sequences = OgaSequences::Create();
-
-  auto start = std::chrono::steady_clock::now();
-  m_tokenizer->Encode(formatted_prompt.c_str(), *sequences);
-  auto time_to_encode =
-      std::chrono::duration_cast<std::chrono::milliseconds>(
-          std::chrono::steady_clock::now() - start)
-          .count();
-
-  start = std::chrono::steady_clock::now();
-
-  generator->AppendTokenSequences(*sequences);
-  time_to_prefill =
-      std::chrono::duration_cast<std::chrono::milliseconds>(
-          std::chrono::steady_clock::now() - start)
-          .count();
-
-  auto mem_after = GetMemoryUsage();
-
-  if (m_verbose) {
-    cout << BLUE << "Time to encode: " << time_to_encode
-         << " ms Initial Tokens: " << generator->GetSequenceCount(0)
-         << " Time to append: " << time_to_prefill << " ms" << CLEAR << endl;
-
-    cout << BLUE << "Memory used: " << mem_after - mem_before << " bytes" << CLEAR << endl;
-  }
-
-  return std::move(generator);
-}
-
-SLMEngine::Status SLMEngine::generate(
-    const std::string& adapter_name,
-    const std::string& formatted_prompt,
-    const GenerationOptions& generation_options,
-    std::string& response_str,
-    RuntimePerf& kpi) {
-  // Verify that the adapter is a valid one
-  if (!m_adapters) {
-    return Status{false, "Adapter not found: " + adapter_name};
-  }
-  auto api_start = std::chrono::steady_clock::now();
-
-  uint32_t time_to_prefill;
-  auto generator = create_generator(
-      formatted_prompt, generation_options, time_to_prefill);
-
-  if (!generator) {
-    return Status{false, "Failed to create generator"};
-  }
-
-  // Set the adapter
-  generator->SetActiveAdapter(*(m_adapters.get()), adapter_name.c_str());
-
-  // Add the time_to_prefill to the KPI
-  kpi.TimeToFirstToken = time_to_prefill;
-
-  // Delegate to generate
-  auto status = generate(generator.get(), nullptr, response_str, kpi);
-  kpi.TotalTime = std::chrono::duration_cast<std::chrono::milliseconds>(
-                      std::chrono::steady_clock::now() - api_start)
-                      .count();
-  return status;
-}
-
-SLMEngine::Status SLMEngine::generate(
-    const std::string& formatted_prompt,
-    const GenerationOptions& generation_options,
-    std::string& response_str,
-    RuntimePerf& kpi) {
-  auto api_start = std::chrono::steady_clock::now();
-
-  uint32_t time_to_prefill;
-  auto generator = create_generator(
-      formatted_prompt, generation_options, time_to_prefill);
-
-  if (!generator) {
-    cout << RED << "Error creating the generator" << CLEAR << endl;
-    return Status{false, "Error creating the generator"};
-  }
-
-  kpi.TimeToFirstToken = time_to_prefill;
-  generate(generator.get(), nullptr, response_str, kpi);
-  kpi.TotalTime = std::chrono::duration_cast<std::chrono::milliseconds>(
-                      std::chrono::steady_clock::now() - api_start)
-                      .count();
-  return Status{true, "Generation successful"};
-}
-
-SLMEngine ::Status SLMEngine::generate(
-    OgaGenerator* generator,
-    std::function<bool(const std::string&, OgaTensor* logits)> generation_callback,
-    std::string& response_str,
-    RuntimePerf& kpi) {
-  std::lock_guard<std::mutex> lock(m_mutex);
-
-  auto start = std::chrono::steady_clock::now();
-  bool is_first_token = true;
-  auto time_count = 0;
-
-  auto initial_prompt_token_count = generator->GetSequenceCount(0);
-
-  int count = 0;
-  uint32_t total_generation_time = 0;
-  std::ostringstream response;
-  while (!generator->IsDone()) {
-    auto gen_start = std::chrono::steady_clock::now();
-    generator->GenerateNextToken();
-    auto gen_end = std::chrono::steady_clock::now();
-    auto elapsed =
-        std::chrono::duration_cast<std::chrono::milliseconds>(gen_end - gen_start)
-            .count();
-    total_generation_time += elapsed;
-    count++;
-    // cout << BLUE << "Generation time: " << elapsed << " us" << CLEAR
-    //      << endl;
-
-    const auto num_tokens = generator->GetSequenceCount(0);
-    const auto new_token = generator->GetSequenceData(0)[num_tokens - 1];
-    auto end = std::chrono::steady_clock::now();
-    if (is_first_token) {
-      is_first_token = false;
-      auto elapsed =
-          std::chrono::duration_cast<std::chrono::milliseconds>(end -
-                                                                start)
-              .count();
-      kpi.PromptTokenCount = initial_prompt_token_count;
-      kpi.TimeToFirstToken += elapsed;
-    } else {
-      time_count += std::chrono::duration_cast<std::chrono::milliseconds>(
-                        end - start)
-                        .count();
-    }
-
-    auto next_string_piece = m_tokenizer_stream->Decode(new_token);
-    // TODO: Use the actual token for the end or line below
-    if (strncmp(next_string_piece, "</s>", 10) != 0) {
-      // We received end of the text - so will exclude
-      response << next_string_piece;
-    } else {
-      cout << RED << "Got </s>!!!" << CLEAR << endl;
-    }
-
-    // Print next output string if the generation continues
-    if (m_verbose) {
-      cout << next_string_piece;
-      flush(cout);
-    }
-
-    // Call the generation callback if provided
-    if (generation_callback) {
-      auto logits = generator->GetLogits();
-      if (logits) {
-        if (!generation_callback(next_string_piece, logits.get())) {
-          cout << RED << "Sopping generation due to callback request." << endl;
-          break;
-        }
-      }
-    }
-
-    // Reset the start time for the next token
-    start = std::chrono::steady_clock::now();
-  }
-
-  // Find out the generation time
-  uint32_t avg_generation_time =
-      static_cast<float>(total_generation_time) / static_cast<float>(count);
-  kpi.GenerationTimePerToken = avg_generation_time;
-
-  if (m_verbose) {
-    cout << CLEAR << endl;
-  }
-
-  response_str = response.str();
-  m_llm_output_dbg_stream << response_str << endl;
-
-  kpi.GeneratedTokenCount =
-      generator->GetSequenceCount(0) - initial_prompt_token_count;
-  kpi.TokenRate = kpi.GeneratedTokenCount / (time_count / 1000.0f);
-
-  // // Get the current memory
-  kpi.CurrentMemoryUsed = GetMemoryUsage();
-  return Status({true, "Generation successful"});
-}
-
-std::string SLMEngine::complete(const char* user_prompt) {
-  InputDecoder::InputParams input_parameters;
-  // Decode the user prompt
-  if (!m_input_decoder->decode(user_prompt, input_parameters)) {
-    cout << RED << "❌ Error decoding input message: " << user_prompt << CLEAR << endl;
-    json output_json;
-    output_json["status"] = "error";
-    output_json["message"] = "Error decoding input message: " + string(user_prompt);
-    return output_json.dump();
-  }
-
-  // cout<< BLUE << input_parameters  << endl;
-
-  // Check if tools are provided for function calling
-  bool use_function_calling = input_parameters.HasTools && !input_parameters.ToolsJson.empty();
-
-  if (m_verbose) {
-    cout << "Input Parameters processed successfully" << endl;
-  }
-
-  cout << "Input Parameters has tools: " << input_parameters.HasTools << endl;
-
-  // Format prompt with tools if function calling is enabled
-  std::string formatted_prompt;
-  if (use_function_calling) {
-    formatted_prompt = format_input_with_tools(input_parameters);
-  } else {
-    formatted_prompt = format_input(input_parameters);
-  }
-
-  m_llm_input_dbg_stream << formatted_prompt << endl;
-
-  if (m_verbose) {
-    cout << BLUE << "User: " << input_parameters.UserPrompt << endl;
-    if (use_function_calling) {
-      cout << BLUE << "🔧 Function calling mode enabled with tools" << endl;
-    }
-    cout << GREEN;
-  }
-
-  RuntimePerf kpi;
-  std::string response;
-  FunctionCallResult function_result;
-
-  GenerationOptions generator_options;
-  generator_options.MaxGeneratedTokens = input_parameters.MaxGeneratedTokens;
-  generator_options.Temperature = input_parameters.Temperature;
-  generator_options.TopP = input_parameters.TopP;
-
-  SLMEngine::Status status;
-  auto api_start = std::chrono::steady_clock::now();
-
-  if (use_function_calling) {
-    cout << BLUE << "🔧 Function calling mode enabled" << CLEAR << endl;
-    // Setup function calling options
-    FunctionCallOptions function_options;
-    function_options.tools = parse_tools_from_json(input_parameters.ToolsJson);
-
-    if (m_verbose) {
-      cout << BLUE << "   Tools available: " << function_options.tools.size() << CLEAR << endl;
-      for (const auto& tool : function_options.tools) {
-        cout << BLUE << "   - " << tool.name << ": " << tool.description << CLEAR << endl;
-      }
-    }
-
-    if (m_verbose) {
-      cout << RED << "Function result status: " << "FUNCTION_CALL"
-           << ", calls: " << function_options.tools.size() << CLEAR << endl;
-    }
-
-    if (input_parameters.LoRAAdapterName.empty()) {
-      status = generate_with_functions(formatted_prompt, generator_options,
-                                       function_options, response, function_result, kpi);
-    } else {
-      status = generate_with_functions(input_parameters.LoRAAdapterName, formatted_prompt,
-                                       generator_options, function_options, response, function_result, kpi);
-    }
-  } else {
-    // Regular generation without function calling
-    if (input_parameters.LoRAAdapterName.empty()) {
-      status = generate(formatted_prompt, generator_options, response, kpi);
-    } else {
-      status = generate(input_parameters.LoRAAdapterName, formatted_prompt,
-                        generator_options, response, kpi);
-    }
-  }
-
-  kpi.TotalTime = std::chrono::duration_cast<std::chrono::milliseconds>(
-                      std::chrono::steady_clock::now() - api_start)
-                      .count();
-
-  m_llm_output_dbg_stream << response << endl;
-
-  // Remove stop tokens from response
-  for (const auto& stop_token : input_parameters.StopTokens) {
-    auto stop_token_pos = response.find(stop_token);
-    if (stop_token_pos != std::string::npos) {
-      response = response.substr(0, stop_token_pos);
-      break;
-    }
-  }
-
-  json output_json;
-  if (!status.code) {
-    output_json["status"] = "error";
-    output_json["message"] = status.message;
-    return output_json.dump();
-  }
-
-  output_json["status"] = "success";
-  output_json["question"] = input_parameters.UserPrompt;
-  output_json["llm_input"] = formatted_prompt;
-
-  // Handle function calling response
-  if (use_function_calling && function_result.is_function_call) {
-    // Function call(s) detected - format answer as JSON array string like in your examples
-    json function_calls_json = json::array();
-    for (const auto& call : function_result.function_calls) {
-      json call_json;
-      call_json["name"] = call.function_name;
-
-      // Parse the parameters_json string into a JSON object for arguments
-      try {
-        call_json["arguments"] = json::parse(call.parameters_json);
-      } catch (const json::exception& e) {
-        // If parsing fails, store as raw string
-        call_json["arguments"] = call.parameters_json;
-      }
-
-      function_calls_json.push_back(call_json);
-    }
-
-    json response_data = {
-        {"answer", function_calls_json.dump()}  // Store as JSON string like in your examples
-    };
-
-    cout << GREEN << "Function call detected" << CLEAR << endl;
-
-    // Always add the structured function_calls array (unified format)
-    json function_calls_array = json::array();
-    for (const auto& call : function_result.function_calls) {
-      json structured_call;
-      structured_call["name"] = call.function_name;
-      structured_call["arguments"] = call.parameters_json;  // Keep as string format as requested
-
-      function_calls_array.push_back(structured_call);
-    }
-    response_data["function_calls"] = function_calls_array;
-
-    output_json["response"] = response_data;
-
-    if (m_verbose) {
-      if (function_result.function_calls.size() == 1) {
-        cout << GREEN << "📞 Function call response: " << function_result.function_calls[0].function_name << CLEAR << endl;
-      } else {
-        cout << GREEN << "📞 Multiple function calls response (" << function_result.function_calls.size() << " calls):" << CLEAR << endl;
-        for (size_t i = 0; i < function_result.function_calls.size(); ++i) {
-          cout << GREEN << "   " << (i + 1) << ". " << function_result.function_calls[i].function_name << CLEAR << endl;
-        }
-      }
-    }
-  } else {
-    // Regular text response
-    json response_data = {
-        {"answer", use_function_calling ? function_result.text_response : response}};
-    output_json["response"] = response_data;
-
-    if (m_verbose && use_function_calling) {
-      cout << GREEN << "💬  Response  with tools available" << CLEAR << endl;
-    }
-  }
-
-  json kpi_json;
-  kpi_json["prompt_toks"] = kpi.PromptTokenCount;
-  kpi_json["ttft"] = kpi.TimeToFirstToken;
-  kpi_json["generated_toks"] = kpi.GeneratedTokenCount;
-  kpi_json["tok_rate"] = kpi.TokenRate;
-  kpi_json["total_time"] = kpi.TotalTime;
-  kpi_json["memory_usage"] = kpi.CurrentMemoryUsed;
-
-  output_json["kpi"] = kpi_json;
-
-  // Return the output_json directly (not wrapped in "response")
-  return output_json.dump();
-}
-
-// Use a Dictionary to store various types of prompt formatting
-// LLama3.2 and Phi3 have different prompt formats
-// Llama3.2 format described here:
-// https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md
-//
-
-// Following is a dictionary that stores the prompt format for different models
-const auto PromptFormatTable = R"(
-[
-   {
-      "llm_type": "phi",
-      "prompt_format": {
-         "system": { "prefix": "<|system|>", "suffix": "<|end|>" },
-         "user": { "prefix": "<|user|>", "suffix": "<|end|>" },
-         "assistant": { "prefix": "<|assistant|>", "suffix": "<|end|>" },
-         "tool": { "prefix": "<|tool|>", "suffix": "<|/tool|>" }
-      }
-   },
-   {     
-      "llm_type": "llama",
-      "prompt_format": {
-         "system": { "prefix": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n", "suffix": "<|eot_id|>" },
-         "user": { "prefix": "<|start_header_id|>user<|end_header_id|>\n\n", "suffix": "<|eot_id|>" },
-         "assistant": { "prefix": "<|start_header_id|>assistant<|end_header_id|>\n\n", "suffix": "<|eot_id|>" },
-         "tool": { "prefix": "<|tool|>", "suffix": "<|/tool|>" }
-      }
-   },
-   {     
-      "llm_type": "qwen",
-      "prompt_format": {
-         "system": { "prefix": "<|im_start|>system\n", "suffix": "<|im_end|>" },
-         "user": { "prefix": "<|im_start|>user\n/no_think ", "suffix": "<|im_end|>" },
-         "assistant": { "prefix": "<|im_start|>assistant\n", "suffix": "<|im_end|>" },
-         "tool": { "prefix": "<|tool|>", "suffix": "<|/tool|>" }
-      }
-   },
-   {
-      "llm_type": "custom",
-      "prompt_format": {
-         "system": { "prefix": "", "suffix": "" },
-         "user": { "prefix": "", "suffix": "" },
-         "assistant": { "prefix": "", "suffix": "" }
-      }
-   }
-]
-)";
-
-// Define a function to parse the JSON dictionary to the c++ data structure
-bool SLMEngine::parse_prompt_format_dict(
-    SupportedModelType model_type, const std::string& json_dict,
-    PromptFormatDictionary& prompt_format_dict) {
-  auto j = json::parse(json_dict);
-  for (const auto& llm_type : j) {
-    if (llm_type["llm_type"] != ModelTypeToString(model_type)) {
-      continue;
-    }
-
-    prompt_format_dict.llm_type = llm_type["llm_type"];
-
-    for (const auto& role : llm_type["prompt_format"].items()) {
-      PromptFormat pf;
-      pf.prefix = role.value()["prefix"];
-      pf.suffix = role.value()["suffix"];
-      prompt_format_dict
-          .prompt_format[InputDecoder::InputParams::ToRole(role.key())] =
-          pf;
-    }
-    return true;
-  }
-  return false;
-}
-
-bool SLMEngine::load_model(const char* model_path) {
-  if (m_verbose) {
-    cout << RED << "Memory Usage Before Model Load: " << GetMemoryUsage() << " MB"
-         << CLEAR << endl;
-  }
-
-  m_model_path = model_path;
-  m_model_type = GetModelFamily(m_model_path);
-
-  // Convert the model name to the SupportedModelType
-  auto model_type = StringToModelType(m_model_type);
-  if (model_type == SupportedModelType::UNKNOWN) {
-    cout << RED << "Error! Cannot detect the model type for model: " << model_path << CLEAR
-         << endl;
-    return false;
-  }
-
-  m_onnx_model = OgaModel::Create(model_path);
-  m_tokenizer = OgaTokenizer::Create(*m_onnx_model);
-  m_tokenizer_stream = OgaTokenizerStream::Create(*m_tokenizer);
-  // m_generator_params = OgaGeneratorParams::Create(*m_onnx_model);
-  // m_sequences = OgaSequences::Create();
-
-  m_input_decoder = InputDecoder::CreateDecoder("openai");
-  if (m_input_decoder == nullptr) {
-    cout << "Error!" << endl;
-    return false;
-  }
-
-  if (!parse_prompt_format_dict(model_type, PromptFormatTable,
-                                m_prompt_format)) {
-    cout << "Error parsing the prompt format dictionary" << endl;
-    return false;
-  }
-
-  if (m_verbose) {
-    string slm_ver, oga_ver, ort_ver;
-    GetVersion(slm_ver, oga_ver, ort_ver);
-
-    cout << "Loaded Model: " << model_path << endl;
-    cout << "Model Type: " << ModelTypeToString(model_type) << endl;
-    cout << "Prompt Format: " << m_prompt_format.llm_type << endl;
-    cout << "SLM Engine Initialized" << endl;
-    cout << "SLM VERSION: " << slm_ver << endl;
-    cout << "ORT GenAI VERSION: " << oga_ver << endl;
-    cout << "ORT VERSION: " << ort_ver << endl;
-  }
-  m_llm_input_dbg_stream.open("slm-input-records.jsonl");
-  m_llm_output_dbg_stream.open("slm-output-records.jsonl");
-
-  return true;
-}
-
-// Now define a function to format the input
-std::string SLMEngine::format_input(
-    const InputDecoder::InputParams& input_params) {
-  ostringstream ss_output;
-  bool no_assistant_messages = true;
-  for (const auto& msg : input_params.Messages) {
-    switch (msg.first) {
-      case InputDecoder::InputParams::Role::SYSTEM:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second
-                  << m_prompt_format.prompt_format.at(msg.first).suffix;
-        break;
-      case InputDecoder::InputParams::Role::USER:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second
-                  << m_prompt_format.prompt_format.at(msg.first).suffix;
-        // Each time we get a user message we reset the flag
-        no_assistant_messages = true;
-        break;
-      case InputDecoder::InputParams::Role::TOOL:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second
-                  << m_prompt_format.prompt_format.at(msg.first).suffix;
-        break;
-      case InputDecoder::InputParams::Role::ASSISTANT:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second;
-        // if there are more messages then add the assistant suffix
-        if (msg != input_params.Messages.back()) {
-          ss_output
-              << m_prompt_format.prompt_format.at(msg.first).suffix;
-        }
-        no_assistant_messages = false;
-        break;
-    }
-  }
-
-  if (no_assistant_messages) {
-    ss_output << m_prompt_format.prompt_format
-                     .at(InputDecoder::InputParams::Role::ASSISTANT)
-                     .prefix;
-  }
-
-  return ss_output.str();
-}
-
-// Format input with tools for function calling (Phi format)
-std::string SLMEngine::format_input_with_tools(
-    const InputDecoder::InputParams& input_params) {
-  ostringstream ss_output;
-  bool no_assistant_messages = true;
-
-  for (const auto& msg : input_params.Messages) {
-    switch (msg.first) {
-      case InputDecoder::InputParams::Role::SYSTEM:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second;
-
-        // Add function calling instructions and tools information to system message if available
-        if (input_params.HasTools && !input_params.ToolsJson.empty()) {
-          // Add function calling instructions
-          ss_output << function_calling_instructions;
-
-          // Add the actual tools JSON spec
-          ss_output << input_params.ToolsJson;
-        }
-
-        ss_output << m_prompt_format.prompt_format.at(msg.first).suffix;
-        break;
-
-      case InputDecoder::InputParams::Role::USER:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second
-                  << m_prompt_format.prompt_format.at(msg.first).suffix;
-        no_assistant_messages = true;
-        break;
-
-      case InputDecoder::InputParams::Role::TOOL:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second
-                  << m_prompt_format.prompt_format.at(msg.first).suffix;
-        break;
-
-      case InputDecoder::InputParams::Role::ASSISTANT:
-        ss_output << m_prompt_format.prompt_format.at(msg.first).prefix
-                  << msg.second;
-        if (msg != input_params.Messages.back()) {
-          ss_output << m_prompt_format.prompt_format.at(msg.first).suffix;
-        }
-        no_assistant_messages = false;
-        break;
-    }
-  }
-
-  if (no_assistant_messages) {
-    ss_output << m_prompt_format.prompt_format
-                     .at(InputDecoder::InputParams::Role::ASSISTANT)
-                     .prefix;
-  }
-
-  return ss_output.str();
-}
-
-uint32_t SLMEngine::GetMemoryUsage() {
-#if defined(_WIN32)
-  PROCESS_MEMORY_COUNTERS_EX pmc;
-  if (GetProcessMemoryInfo(
-          GetCurrentProcess(),
-          (PROCESS_MEMORY_COUNTERS*)&pmc, sizeof(pmc))) {
-    return pmc.WorkingSetSize / (1024 * 1024);
-  }
-  return 0;
-#else
-#if defined(__ANDROID__)
-  // Read the /proc/self/status file to get the memory usage
-  std::ifstream status_file("/proc/self/status");
-  std::string line;
-  while (std::getline(status_file, line)) {
-    if (line.find("VmRSS") != std::string::npos) {
-      // remove the non-numeric characters
-      line.erase(std::remove_if(
-                     line.begin(), line.end(),
-                     [](unsigned char c) { return !std::isdigit(c); }),
-                 line.end());
-
-      // Convert to MB
-      auto memory = std::stoul(line) / 1024;
-      return memory;
-    }
-  }
-  return 0;
-#else
-  struct rusage usage;
-  getrusage(RUSAGE_SELF, &usage);
-  auto current_memory = usage.ru_maxrss;
-
-#if defined(__linux__)
-  current_memory = current_memory / 1024;
-#elif defined(__aarch64__) && defined(__APPLE__)
-  current_memory = current_memory / (1024 * 1024);
-#endif
-  return current_memory;
-#endif
-#endif
-}
-
-bool SLMEngine::create_lark_grammar(const std::vector<FunctionTool>& tools,
-                                    std::string& prompt_tool_input,
-                                    std::string& grammar_input) {
-  if (tools.empty()) {
-    return false;
-  }
-
-  prompt_tool_input = create_prompt_tool_input(tools);
-
-  if (tools.size() == 1) {
-    // Single tool case
-    std::string tool_schema = convert_tool_to_grammar_input(tools[0]);
-    grammar_input =
-        "start: TEXT | fun_call\n"
-        "TEXT: /[^{](.|\\n)*/\n"
-        " fun_call: <|tool_call|> %json " +
-        tool_schema;
-  } else {
-    // Multiple tools case
-    std::string anyof_schema = "{\"anyOf\": [";
-    for (size_t i = 0; i < tools.size(); ++i) {
-      if (i > 0) anyof_schema += ",";
-      anyof_schema += convert_tool_to_grammar_input(tools[i]);
-    }
-    anyof_schema += "]}";
-
-    grammar_input =
-        "start: TEXT | fun_call\n"
-        "TEXT: /[^{](.|\\n)*/\n"
-        " fun_call: <|tool_call|> %json " +
-        anyof_schema;
-  }
-
-  return true;
-}
-
-std::string SLMEngine::convert_tool_to_grammar_input(const FunctionTool& tool) {
-  json param_props = json::object();
-  json required_params = json::array();
-
-  for (const auto& [param_name, param_info] : tool.parameters) {
-    param_props[param_name] = {
-        {"type", param_info.type},
-        {"description", param_info.description}};
-    required_params.push_back(param_name);
-  }
-
-  json output_schema = {
-      {"description", tool.description},
-      {"type", "object"},
-      {"required", {"name", "parameters"}},
-      {"additionalProperties", false},
-      {"properties", {{"name", {{"const", tool.name}}}, {"parameters", {{"type", "object"}, {"properties", param_props}, {"required", required_params}, {"additionalProperties", false}}}}}};
-
-  if (param_props.empty()) {
-    output_schema["required"] = json::array({"name"});
-  }
-
-  return output_schema.dump();
-}
-
-std::string SLMEngine::create_prompt_tool_input(const std::vector<FunctionTool>& tools) {
-  json tools_json = json::array();
-
-  for (const auto& tool : tools) {
-    json tool_json = {
-        {"name", tool.name},
-        {"description", tool.description},
-        {"parameters", json::object()}};
-
-    for (const auto& [param_name, param_info] : tool.parameters) {
-      tool_json["parameters"][param_name] = {
-          {"description", param_info.description},
-          {"type", param_info.type}};
-      if (!param_info.default_value.empty()) {
-        tool_json["parameters"][param_name]["default"] = param_info.default_value;
-      }
-    }
-
-    tools_json.push_back(tool_json);
-  }
-
-  return tools_json.dump();
-}
-
-bool SLMEngine::parse_function_call(const std::string& generated_text,
-                                    FunctionCallResult& function_result) {
-  // Look for multiple function call patterns: <|tool_call|>{...}
-  std::regex function_call_regex(R"(<\|tool_call\|>\s*(\{.*?\}))");
-  std::sregex_iterator iter(generated_text.begin(), generated_text.end(), function_call_regex);
-  std::sregex_iterator end;
-
-  std::vector<FunctionCall> detected_calls;
-  size_t first_call_pos = std::string::npos;
-
-  for (auto it = iter; it != end; ++it) {
-    std::smatch match = *it;
-    try {
-      std::string json_str = match[1].str();
-      json function_call = json::parse(json_str);
-
-      if (function_call.contains("name") && function_call.contains("parameters")) {
-        std::string name = function_call["name"];
-        std::string params = function_call["parameters"].dump();
-        detected_calls.emplace_back(name, params);
-
-        // Record position of first function call for text extraction
-        if (first_call_pos == std::string::npos) {
-          first_call_pos = match.position();
-        }
-
-        if (m_verbose) {
-          cout << GREEN << "📞 Detected function call: " << name << CLEAR << endl;
-        }
-      }
-    } catch (const json::exception& e) {
-      if (m_verbose) {
-        cout << RED << "Error parsing function call JSON: " << e.what() << CLEAR << endl;
-      }
-    }
-  }
-
-  if (!detected_calls.empty()) {
-    function_result.is_function_call = true;
-    function_result.function_calls = std::move(detected_calls);
-
-    // Extract text before first function call as text response
-    if (first_call_pos > 0) {
-      function_result.text_response = generated_text.substr(0, first_call_pos);
-      // Trim whitespace
-      function_result.text_response.erase(
-          function_result.text_response.find_last_not_of(" \n\r\t") + 1);
-    }
-
-    if (m_verbose) {
-      cout << GREEN << "🔧 Total function calls detected: " << detected_calls.size() << CLEAR << endl;
-    }
-
-    return true;
-  }
-
-  // No function call found, treat as regular text
-  function_result.is_function_call = false;
-  function_result.text_response = generated_text;
-  return false;
-}
-
-std::unique_ptr<OgaGenerator> SLMEngine::create_function_generator(
-    const std::string& formatted_prompt,
-    const GenerationOptions& generation_options,
-    const FunctionCallOptions& function_options,
-    uint32_t& time_to_prefill) {
-  auto generator_params = OgaGeneratorParams::Create(*m_onnx_model);
-  if (!generator_params) {
-    return nullptr;
-  }
-
-  generator_params->SetSearchOption("max_length", generation_options.MaxGeneratedTokens);
-  generator_params->SetSearchOption("temperature", generation_options.Temperature);
-  generator_params->SetSearchOption("top_k", generation_options.TopK);
-  generator_params->SetSearchOption("top_p", generation_options.TopP);
-
-  auto mem_before = GetMemoryUsage();
-
-  // Create the generator
-  auto generator = OgaGenerator::Create(*m_onnx_model, *generator_params);
-  if (!generator) {
-    return nullptr;
-  }
-
-  auto sequences = OgaSequences::Create();
-
-  auto start = std::chrono::steady_clock::now();
-  m_tokenizer->Encode(formatted_prompt.c_str(), *sequences);
-  auto time_to_encode =
-      std::chrono::duration_cast<std::chrono::milliseconds>(
-          std::chrono::steady_clock::now() - start)
-          .count();
-
-  start = std::chrono::steady_clock::now();
-
-  generator->AppendTokenSequences(*sequences);
-  time_to_prefill =
-      std::chrono::duration_cast<std::chrono::milliseconds>(
-          std::chrono::steady_clock::now() - start)
-          .count();
-
-  auto mem_after = GetMemoryUsage();
-
-  if (m_verbose) {
-    cout << BLUE << "Time to encode: " << time_to_encode
-         << " ms Initial Tokens: " << generator->GetSequenceCount(0)
-         << " Time to append: " << time_to_prefill << " ms" << CLEAR << endl;
-
-    cout << BLUE << "Memory used: " << mem_after - mem_before << " bytes" << CLEAR << endl;
-  }
-
-  return std::move(generator);
-}
-
-SLMEngine::Status SLMEngine::generate_with_functions(
-    const std::string& formatted_prompt,
-    const GenerationOptions& generation_options,
-    const FunctionCallOptions& function_options,
-    std::string& response_str,
-    FunctionCallResult& function_result,
-    RuntimePerf& kpi) {
-  auto api_start = std::chrono::steady_clock::now();
-
-  uint32_t time_to_prefill;
-  auto generator = create_function_generator(
-      formatted_prompt, generation_options, function_options, time_to_prefill);
-
-  if (!generator) {
-    cout << RED << "Error creating the function generator" << CLEAR << endl;
-    return Status{false, "Error creating the function generator"};
-  }
-
-  kpi.TimeToFirstToken = time_to_prefill;
-
-  // Generate response
-  auto status = generate(generator.get(), nullptr, response_str, kpi);
-
-  if (status.code) {
-    // Parse function call from response
-    parse_function_call(response_str, function_result);
-
-    if (m_verbose && function_result.is_function_call) {
-      cout << MAGENTA << "Function Call Detected: " << function_result.function_name()
-           << " with parameters: " << function_result.parameters_json() << CLEAR << endl;
-    }
-  }
-
-  kpi.TotalTime = std::chrono::duration_cast<std::chrono::milliseconds>(
-                      std::chrono::steady_clock::now() - api_start)
-                      .count();
-  return status;
-}
-
-SLMEngine::Status SLMEngine::generate_with_functions(
-    const std::string& adapter_name,
-    const std::string& formatted_prompt,
-    const GenerationOptions& generation_options,
-    const FunctionCallOptions& function_options,
-    std::string& response_str,
-    FunctionCallResult& function_result,
-    RuntimePerf& kpi) {
-  // Verify that the adapter is a valid one
-  if (!m_adapters) {
-    return Status{false, "Adapter not found: " + adapter_name};
-  }
-
-  auto api_start = std::chrono::steady_clock::now();
-
-  uint32_t time_to_prefill;
-  auto generator = create_function_generator(
-      formatted_prompt, generation_options, function_options, time_to_prefill);
-
-  if (!generator) {
-    return Status{false, "Failed to create function generator"};
-  }
-
-  // Set the adapter
-  generator->SetActiveAdapter(*(m_adapters.get()), adapter_name.c_str());
-
-  // Add the time_to_prefill to the KPI
-  kpi.TimeToFirstToken = time_to_prefill;
-
-  // Generate response
-  auto status = generate(generator.get(), nullptr, response_str, kpi);
-
-  if (status.code) {
-    // Parse function call from response
-    parse_function_call(response_str, function_result);
-
-    if (m_verbose && function_result.is_function_call) {
-      cout << MAGENTA << "Function Call Detected: " << function_result.function_name()
-           << " with parameters: " << function_result.parameters_json() << CLEAR << endl;
-    }
-  }
-
-  kpi.TotalTime = std::chrono::duration_cast<std::chrono::milliseconds>(
-                      std::chrono::steady_clock::now() - api_start)
-                      .count();
-  return status;
-}
-
-std::vector<SLMEngine::FunctionTool> SLMEngine::parse_tools_from_json(const std::string& tools_json) {
-  std::vector<FunctionTool> tools;
-
-  try {
-    json tools_array = json::parse(tools_json);
-
-    for (const auto& tool_json : tools_array) {
-      if (tool_json.contains("name") && tool_json.contains("description")) {
-        FunctionTool tool(tool_json["name"], tool_json["description"]);
-
-        if (tool_json.contains("parameters")) {
-          const auto& params = tool_json["parameters"];
-          for (auto it = params.begin(); it != params.end(); ++it) {
-            std::string param_name = it.key();
-            const auto& param_info = it.value();
-
-            FunctionParameter param;
-            if (param_info.contains("description")) {
-              param.description = param_info["description"];
-            }
-            if (param_info.contains("type")) {
-              param.type = param_info["type"];
-            }
-            if (param_info.contains("default")) {
-              param.default_value = param_info["default"];
-            }
-
-            tool.parameters[param_name] = param;
-          }
-        }
-
-        tools.push_back(tool);
-      }
-    }
-  } catch (const json::exception& e) {
-    if (m_verbose) {
-      cout << RED << "Error parsing tools JSON: " << e.what() << CLEAR << endl;
-    }
-  }
-
-  return tools;
-}
-
-}  // namespace slm_engine
-}  // namespace microsoft
diff --git a/examples/slm_engine/src/cpp/slm_engine.h b/examples/slm_engine/src/cpp/slm_engine.h
deleted file mode 100644
index e38658fa06..0000000000
--- a/examples/slm_engine/src/cpp/slm_engine.h
+++ /dev/null
@@ -1,512 +0,0 @@
-#pragma once
-
-#include <fstream>
-#include <functional>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <functional>
-
-#if defined(_WIN32) || defined(_WIN64)
-#include <string.h>
-#define strcasecmp _stricmp
-#define strncasecmp _strnicmp
-#else
-#include <strings.h>
-#endif
-
-#include "input_decoder.h"
-#include "ort_genai.h"
-
-#ifdef _WIN32
-#ifdef BUILDING_SLM_ENGINE
-#define SLM_ENGINE_EXPORT __declspec(dllexport)
-#else
-#define SLM_ENGINE_EXPORT __declspec(dllimport)
-#endif
-#else
-// To make symbols visible on macOS/iOS
-#ifdef __APPLE__
-#define SLM_ENGINE_EXPORT __attribute__((visibility("default")))
-#else
-#define SLM_ENGINE_EXPORT
-#endif
-#endif
-
-namespace microsoft {
-namespace slm_engine {
-
-/// @brief SLM Engine class to interact with the GenAI Model
-///
-/// The SLM Engine class is responsible for loading the GenAI Model and
-/// interacting with it to generate responses to user prompts. The class
-/// provides a complete() function that takes a user prompt and returns the
-/// generated response.
-///
-/// The class provides a Create() function to create a new instance
-/// of the SLM Engine and initialize it.
-///
-/// The class also provides a struct to hold the runtime performance metrics
-/// of the SLM Engine.
-///
-/// Example Usage:
-/// @code
-/// // Create a new instance of the SLM Engine
-/// auto slm_engine = SLMEngine::Create("path/to/model", true);
-/// if (!slm_engine) {
-///     std::cout << "Error creating the SLM Engine" << std::endl;
-///     return -1;
-/// }
-///
-/// // Generate a response to a user prompt
-/// std::string prompt =
-///     "{\"role\": \"user\", \"content\": \"Hello, how are you?\"}";
-/// std::string response = slm_engine->complete(prompt.c_str());
-/// std::cout << "Response: " << response["response"]["answer"] << std::endl;
-/// @endcode
-///
-
-class SLM_ENGINE_EXPORT SLMEngine {
- public:
-  /// @brief Status of the operation
-  /// @param code True if the operation was successful, false otherwise
-  /// @param message Message providing additional information about the status
-  /// @note The message is empty if the operation was successful
-  struct SLM_ENGINE_EXPORT Status {
-    bool code;
-    std::string message;
-  };
-
-  /// @brief Struct to represent a function tool parameter
-  struct SLM_ENGINE_EXPORT FunctionParameter {
-    std::string description;
-    std::string type;
-    std::string default_value;
-
-    FunctionParameter() : type("string") {}
-    FunctionParameter(const std::string& desc, const std::string& param_type = "string",
-                      const std::string& default_val = "")
-        : description(desc), type(param_type), default_value(default_val) {}
-  };
-
-  /// @brief Struct to represent a function tool
-  struct SLM_ENGINE_EXPORT FunctionTool {
-    std::string name;
-    std::string description;
-    std::map<std::string, FunctionParameter> parameters;
-
-    FunctionTool() = default;
-    FunctionTool(const std::string& tool_name, const std::string& tool_desc)
-        : name(tool_name), description(tool_desc) {}
-  };
-
-  /// @brief Function calling generation options
-  struct SLM_ENGINE_EXPORT FunctionCallOptions {
-    std::vector<FunctionTool> tools;
-    bool force_function_call;
-
-    FunctionCallOptions() : force_function_call(false) {}
-  };
-
-  /// @brief Single function call information
-  struct SLM_ENGINE_EXPORT FunctionCall {
-    std::string function_name;
-    std::string parameters_json;
-
-    FunctionCall() = default;
-    FunctionCall(const std::string& name, const std::string& params)
-        : function_name(name), parameters_json(params) {}
-  };
-
-  /// @brief Function call result structure supporting multiple calls
-  struct SLM_ENGINE_EXPORT FunctionCallResult {
-    std::vector<FunctionCall> function_calls;
-    bool is_function_call;
-    std::string text_response;
-
-    FunctionCallResult() : is_function_call(false) {}
-
-    // Legacy compatibility methods
-    std::string function_name() const {
-      return function_calls.empty() ? "" : function_calls[0].function_name;
-    }
-
-    std::string parameters_json() const {
-      return function_calls.empty() ? "" : function_calls[0].parameters_json;
-    }
-  };
-
-  /// @brief Get the version of the SLM Engine
-  /// @param slm_version SLM Engine version
-  /// @param ortga_version ORT GenAI version
-  /// @param ort_version ORT version
-  static void GetVersion(
-      std::string& slm_version,
-      std::string& ortga_version,
-      std::string& ort_version);
-
-  /// @brief Creates a new instance of the SLM Engine and initializes it
-  /// @param model_path Path to ONNX GenAI Model Directory
-  /// @param verbose When set, the LLM Generated output is displayed on stdout
-  /// @return New object or null if unsuccessful
-  static std::unique_ptr<SLMEngine> Create(
-      const char* model_path, bool verbose);
-
-  struct SLM_ENGINE_EXPORT LoRAAdapter {
-    std::string name;
-    std::string adapter_path;
-    explicit LoRAAdapter(const std::string& name,
-                         const std::string& adapter_path)
-        : name(name), adapter_path(adapter_path) {}
-    // Copy constructor
-    LoRAAdapter(const LoRAAdapter& other)
-        : name(other.name), adapter_path(other.adapter_path) {}
-  };
-
-  /// @brief Create SLMEngine, loads the model and adapters
-  /// @param model_path Path to ONNX GenAI Model Directory
-  /// @param adapters List of LoRA adapters in NN format
-  /// @param verbose When set to true, the LLM Generated output is displayed on stdout
-  /// @param status_msg Provides information about cause of failure to load model or
-  ///         adapters when applicable.
-  /// @return A new object or nullptr if unsuccessful. When unsuccessful, status_msg
-  ///         will contain information about the cause of failure.
-  static std::unique_ptr<SLMEngine> Create(
-      const char* model_path,
-      const std::vector<LoRAAdapter> adapters,
-      bool verbose,
-      Status& status_msg);
-
-  /// @brief  Get the current memory usage of the SLM Engine
-  /// @return Current memory usage in MB
-  static uint32_t GetMemoryUsage();
-
-  /// @brief Get the model family from the model path
-  /// @param model_path Path to the model file
-  /// @return Model family as a string
-  static std::string GetModelFamily(const std::string& model_path);
-
-  std::string get_model_path() const { return m_model_path; }
-  std::vector<LoRAAdapter> get_adapter_list();
-
-  /// @brief Generates a response to the user prompt using the GenAI Model
-  /// @param prompt User prompt to generate response for. The format for this
-  /// string is exactly the same as the OpenAI Text Completion API
-  /// @return Generated response JSON object as a string.
-  ///
-  /// The complete() function takes a user prompt and generates a response
-  /// using the GenAI Model. The function returns the generated response as a
-  /// string.
-  ///
-  ///  In SLM engine - the model is loaded at the create time. So we are re-purposing
-  ///  the OpenAI API "model" parameter to indicate name as the LoRA adapter if
-  ///  the adapter was loaded. If this parameter is not provided, the default
-  ///  then the base model without the apdapter is used.
-  ///
-  /// The user prompt should be in the following format:
-  /// {
-  ///     "model": "LoRA adapter name",
-  ///     "messages": [
-  ///         {
-  ///             "role": "system",
-  ///             "content": "System message"
-  ///         },
-  ///         {
-  ///             "role": "user",
-  ///             "content": "User message"
-  ///         }
-  ///     ],
-  ///     "temperature": 0.0,
-  ///     "stop": ["Stop token 1", "Stop token 2"],
-  ///     "max_tokens": 250
-  /// }
-  ///
-  /// Format of the response string when the call succeeds
-  /// {
-  ///     "status": "success",
-  ///     "response": {
-  ///         "answer": "Generated response",
-  ///         "kpi": {
-  ///             "prompt_toks": 10,
-  ///             "response_toks": 20,
-  ///             "ttft": 1000,
-  ///             "tok_rate": 10,
-  ///             "total_time": 10000,
-  ///             "memory_usage": 100
-  ///         }
-  ///     }
-  /// }
-  ///
-  /// Format of the response string when the call fails
-  /// {
-  ///     "status": "error",
-  ///     "message": "Error message"
-  /// }
-  ///
-  /// @note To support multi-turn conversations, the history should be
-  /// maintained by the caller and submitted just like how the OpenAI API
-  /// works
-  std::string complete(const char* prompt);
-
-  /// @brief Struct to hold the runtime performance metrics of the SLM Engine
-  /// @param PromptTokenCount Number of tokens in the prompt
-  /// @param TimeToFirstToken Time taken to generate the first token (milliseconds)
-  /// @param GeneratedTokenCount Number of tokens generated
-  /// @param TokenRate Number of tokens generated per second
-  /// @param TotalTime Total time taken to generate the response (milliseconds)
-  /// @param LoRAAdapterSwitchTime Time taken to "SetActiveAdapter" (milliseconds)
-  /// @param CurrentMemoryUsed Current memory used by the SLM Engine
-  struct RuntimePerf {
-    uint32_t PromptTokenCount;
-    uint32_t TimeToFirstToken;
-    uint32_t GeneratedTokenCount;
-    uint32_t TokenRate;
-    uint32_t TotalTime;
-    uint32_t GenerationTimePerToken;
-    uint32_t CurrentMemoryUsed;
-    RuntimePerf()
-        : PromptTokenCount(0),
-          TimeToFirstToken(0),
-          GeneratedTokenCount(0),
-          TokenRate(0),
-          TotalTime(0),
-          GenerationTimePerToken(0),
-          CurrentMemoryUsed(0) {}
-    RuntimePerf(const RuntimePerf& other)
-        : PromptTokenCount(other.PromptTokenCount),
-          TimeToFirstToken(other.TimeToFirstToken),
-          GeneratedTokenCount(other.GeneratedTokenCount),
-          TokenRate(other.TokenRate),
-          TotalTime(other.TotalTime),
-          GenerationTimePerToken(other.GenerationTimePerToken),
-          CurrentMemoryUsed(other.CurrentMemoryUsed) {}
-    RuntimePerf& operator=(const RuntimePerf& other) = delete;
-    RuntimePerf(RuntimePerf&& other) = delete;
-    RuntimePerf& operator=(RuntimePerf&& other) = delete;
-  };
-
-  /// @brief Struct to hold the generation options for the GenAI Model
-  /// @param MaxGeneratedTokens Maximum number of tokens to generate
-  /// @param TopK Top K sampling
-  /// @param TopP Top P sampling
-  /// @param Temperature Temperature for sampling
-  struct GenerationOptions {
-    uint32_t MaxGeneratedTokens;
-    uint32_t TopK;
-    float TopP;
-    float Temperature;
-    explicit GenerationOptions() {
-      MaxGeneratedTokens = 2048;
-      Temperature = 0.00000000000001f;
-      TopK = 50;
-      TopP = 0.1f;
-    }
-  };
-
-  /// @brief Asks the GenAI Model for a response
-  /// @param formatted_prompt Formatted prompt to generate response for
-  /// @param generation_options Generation options for the GenAI Model
-  /// @param response_str Generated response
-  /// @param kpi Runtime performance metrics of the SLM Engine
-  SLMEngine::Status generate(
-      const std::string& formatted_prompt,
-      const GenerationOptions& generation_options,
-      std::string& response_str,
-      RuntimePerf& kpi);
-
-  /// @brief Asks the GenAI Model for a response using the given LoRA adapter
-  /// @param adapter_name Name of the LoRA adapter to use
-  /// @param formatted_prompt Formatted prompt to generate response for
-  /// @param generation_options Generation options for the GenAI Model
-  /// @param response_str Generated response
-  /// @param kpi Runtime performance metrics of the SLM Engine
-  Status generate(
-      const std::string& adapter_name,
-      const std::string& formatted_prompt,
-      const GenerationOptions& generation_options,
-      std::string& response_str,
-      RuntimePerf& kpi);
-
-  /// @brief Generate response with function calling support
-  /// @param formatted_prompt Formatted prompt
-  /// @param generation_options Generation options
-  /// @param function_options Function calling options
-  /// @param response_str Generated response
-  /// @param function_result Function call result if any
-  /// @param kpi Runtime performance metrics
-  /// @return Status of the operation
-  Status generate_with_functions(
-      const std::string& formatted_prompt,
-      const GenerationOptions& generation_options,
-      const FunctionCallOptions& function_options,
-      std::string& response_str,
-      FunctionCallResult& function_result,
-      RuntimePerf& kpi);
-
-  /// @brief Generate response with function calling using adapter
-  /// @param adapter_name Name of the LoRA adapter
-  /// @param formatted_prompt Formatted prompt
-  /// @param generation_options Generation options
-  /// @param function_options Function calling options
-  /// @param response_str Generated response
-  /// @param function_result Function call result if any
-  /// @param kpi Runtime performance metrics
-  /// @return Status of the operation
-  Status generate_with_functions(
-      const std::string& adapter_name,
-      const std::string& formatted_prompt,
-      const GenerationOptions& generation_options,
-      const FunctionCallOptions& function_options,
-      std::string& response_str,
-      FunctionCallResult& function_result,
-      RuntimePerf& kpi);
-
-  /// @brief Given a system and an user prompt, formats the prompt by adding the
-  /// necessary control strings for the current LLM Model
-  /// @param system_prompt
-  /// @param user_prompt
-  /// @return
-  std::string format_prompt(
-      const std::string& system_prompt,
-      const std::string& user_prompt);
-
-  /// @brief Parse tools from JSON string
-  /// @param tools_json JSON string containing tools definition
-  /// @return Vector of FunctionTool objects
-  std::vector<FunctionTool> parse_tools_from_json(const std::string& tools_json);
-
-  SLMEngine(const SLMEngine&) = delete;
-  SLMEngine& operator=(const SLMEngine&) = delete;
-
-  /// @brief Destructor for the SLM Engine
-  ~SLMEngine();
-
- private:
-  SLMEngine(bool verbose) : m_verbose(verbose) {}
-
-  /// @brief
-  /// @param model_path
-  /// @return
-  bool load_model(const char* model_path);
-
-  /// @brief Given the user input parameters formats by adding the necessary
-  /// control strings for the current LLM Model (Phi3)
-  /// @param input_params Input parameters to use
-  /// @return Complete prompt to be fed to the LLM
-  std::string format_input(const InputDecoder::InputParams& input_params);
-
-  /// @brief Format input with tools for function calling
-  /// @param input_params Input parameters with tools
-  /// @return Complete prompt with tools information
-  std::string format_input_with_tools(const InputDecoder::InputParams& input_params);
-
-  // Define the Model related prompts
-  struct PromptFormat {
-    std::string prefix;
-    std::string suffix;
-  };
-
-  struct PromptFormatDictionary {
-    std::string llm_type;
-    std::map<InputDecoder::InputParams::Role, PromptFormat> prompt_format;
-  };
-
-  /// @brief Enum to define the supported model types
-  enum class SupportedModelType { PHI,
-                                  Llama,
-                                  Qwen,
-                                  CUSTOM,
-                                  UNKNOWN };
-
-  /// @param model_type String representation of the model type
-  /// @return SupportedModelType enum value
-  /// @note The string comparison is case-insensitive
-  static SupportedModelType StringToModelType(const std::string& model_type);
-
-  /// @brief  Converts SupportedModelType enum to string
-  /// @param model_type SupportedModelType enum value
-  /// @note The string representation is in lowercase
-  static std::string ModelTypeToString(SupportedModelType model_type);
-
-  bool parse_prompt_format_dict(SupportedModelType model_type,
-                                const std::string& json_dict,
-                                PromptFormatDictionary& prompt_format_dict);
-
-  std::unique_ptr<OgaGenerator> create_generator(
-      const std::string& formatted_prompt,
-      const GenerationOptions& generation_options,
-      uint32_t& time_to_prefill);
-
-  /// @brief Generate the response using the GenAI Model
-  /// @param formatted_prompt Formatted prompt to generate response for
-  /// @param generator OgaGenerator object to use for generation
-  /// @param generation_callback Callback function to use for generation
-  /// @param response_str Generated response
-  /// @param kpi Runtime performance metrics of the SLM Engine
-  /// @return Status of the operation
-  /// @note The generation_callback function (if provided) is called for each token generated
-  Status generate(
-      OgaGenerator* generator,
-      std::function<bool(const std::string&, OgaTensor* logits)> generation_callback,
-      std::string& response_str,
-      RuntimePerf& kpi);
-
-  /// @brief Create Lark grammar for function calling
-  /// @param tools List of function tools
-  /// @param prompt_tool_input Output parameter for tool input prompt
-  /// @param grammar_input Output parameter for grammar input
-  /// @return True if successful
-  bool create_lark_grammar(const std::vector<FunctionTool>& tools,
-                           std::string& prompt_tool_input,
-                           std::string& grammar_input);
-
-  /// @brief Convert tool to grammar input format
-  /// @param tool Function tool to convert
-  /// @return JSON schema for the tool
-  std::string convert_tool_to_grammar_input(const FunctionTool& tool);
-
-  /// @brief Create tool input prompt
-  /// @param tools List of function tools
-  /// @return Tool input prompt string
-  std::string create_prompt_tool_input(const std::vector<FunctionTool>& tools);
-
-  /// @brief Parse function call from generated text
-  /// @param generated_text Generated text to parse
-  /// @param function_result Output function call result
-  /// @return True if function call was found and parsed
-  bool parse_function_call(const std::string& generated_text,
-                           FunctionCallResult& function_result);
-
-  /// @brief Create generator with function calling support
-  /// @param formatted_prompt Formatted prompt
-  /// @param generation_options Generation options
-  /// @param function_options Function calling options
-  /// @param time_to_prefill Output time to prefill
-  /// @return Generator object
-  std::unique_ptr<OgaGenerator> create_function_generator(
-      const std::string& formatted_prompt,
-      const GenerationOptions& generation_options,
-      const FunctionCallOptions& function_options,
-      uint32_t& time_to_prefill);
-
-  std::unique_ptr<OgaModel> m_onnx_model;
-  std::unique_ptr<OgaAdapters> m_adapters;
-  std::unique_ptr<OgaTokenizer> m_tokenizer;
-  std::unique_ptr<OgaTokenizerStream> m_tokenizer_stream;
-  std::unique_ptr<InputDecoder> m_input_decoder;
-  PromptFormatDictionary m_prompt_format;
-
-  std::vector<LoRAAdapter> m_adapters_list;
-  std::string m_model_path;
-  std::string m_model_type;
-
-  bool m_verbose;
-  std::ofstream m_llm_input_dbg_stream;
-  std::ofstream m_llm_output_dbg_stream;
-
-  // Need a scoped mutex to ensure only one complete() call at a time
-  std::mutex m_mutex;
-};
-}  // namespace slm_engine
-}  // namespace microsoft
diff --git a/examples/slm_engine/src/cpp/slm_engine_test.cpp b/examples/slm_engine/src/cpp/slm_engine_test.cpp
deleted file mode 100644
index 0d1ada7eba..0000000000
--- a/examples/slm_engine/src/cpp/slm_engine_test.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-#include "gtest/gtest.h"
-
-#include "slm_engine.h"
-#include <nlohmann/json.hpp>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <fstream>
-#include <filesystem>
-#include <vector>
-
-#define MAGENTA "\033[35;1m"
-#define RED "\033[31;1m"
-#define BLUE "\033[34;1m"
-#define GREEN "\033[32;1m"
-#define CLEAR "\033[0m"
-
-using namespace std;
-
-// Define the path to the model file
-// This should be set in the environment variable MODEL_FILE_PATH
-const char* MODEL_FILE_PATH = getenv("MODEL_FILE_PATH");
-
-// Define the path to the model root directory
-// Al the model directories are expected to be under this directory
-const char* MODEL_ROOT_DIR = getenv("MODEL_ROOT_DIR");
-
-// Define the path to the model root directory
-// Al the model directories are expected to be under this directory
-const char* ADAPTER_ROOT_DIR = getenv("ADAPTER_ROOT_DIR");
-
-namespace microsoft {
-namespace slm_engine {
-namespace testing {
-
-struct ModelInfo {
-  string model_path;
-  string model_family;
-};
-
-const ModelInfo MODELS[] = {
-    {"Llama-3.2-1B-Instruct-ONNX/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/", "llama"},
-    {"Phi-4-mini-instruct-onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/", "phi3"},
-};
-
-void trim_left_inplace(std::string& s) {
-  s.erase(
-      s.begin(), std::find_if_not(s.begin(), s.end(),
-                                  [](unsigned char c) { return std::isspace(c); }));
-}
-
-void trim_right_inplace(std::string& s) {
-  s.erase(
-      std::find_if_not(s.rbegin(), s.rend(),
-                       [](unsigned char c) { return std::isspace(c); })
-          .base(),
-      s.end());
-}
-
-void trim_inplace(std::string& s) {
-  trim_left_inplace(s);
-  trim_right_inplace(s);
-}
-
-std::string trim(const std::string& s) {
-  std::string trimmed = s;
-  trim_inplace(trimmed);
-  return trimmed;
-}
-
-TEST(SLMEngineTest, TestModelFamily) {
-  ASSERT_TRUE(MODEL_ROOT_DIR != nullptr) << "MODEL_ROOT_DIR is not set";
-
-  for (const auto& model : MODELS) {
-    string model_path = string(MODEL_ROOT_DIR) + "/" + model.model_path;
-    string model_family = SLMEngine::GetModelFamily(model_path);
-    ASSERT_EQ(model_family, model.model_family);
-  }
-}
-
-TEST(SLMEngineTest, LoadUnloadModel) {
-  ASSERT_TRUE(MODEL_FILE_PATH != nullptr) << "MODEL_FILE_PATH is not set";
-
-  cout << "Initial Memory Usage: "
-       << microsoft::slm_engine::SLMEngine::GetMemoryUsage() << " MB"
-       << endl;
-
-  for (int i = 0; i < 5; i++) {
-    // Test loading a model
-    cout << "Before Engine Create Memory Usage: "
-         << microsoft::slm_engine::SLMEngine::GetMemoryUsage() << " MB"
-         << endl;
-    auto slm_engine = microsoft::slm_engine::SLMEngine::Create(
-        MODEL_FILE_PATH, false);
-
-    cout << "After Loading Model Memory Usage: "
-         << microsoft::slm_engine::SLMEngine::GetMemoryUsage() << " MB"
-         << endl;
-
-    ASSERT_NE(slm_engine, nullptr);
-
-    // Reset the engine to free up resources
-    slm_engine.reset();
-
-    cout << "After delete engine Memory Usage: "
-         << microsoft::slm_engine::SLMEngine::GetMemoryUsage() << " MB"
-         << endl;
-  }
-}
-
-// A few Test Prompts and their expected answers
-struct TestPrompt {
-  string system_prompt;
-  string user_prompt;
-  string expected_answer;
-};
-
-const char* SYS_PROMPT =
-    "You are a helpful assistant. "
-    "Analyze the sentiment of the statement and just answer in one word what it is. "
-    "You can chose from one of Positive, Neutral, or Negative. "
-    "Use a new line after the answer.";
-
-const TestPrompt TEST_PROMPTS[] = {
-    {SYS_PROMPT,
-     "I love the new design of your website!\nThe sentiment of the text is: ", "Positive"},
-    {SYS_PROMPT,
-     "The customer service was terrible and unhelpful.\nThe sentiment of the text is: ",
-     "Negative"},
-    {SYS_PROMPT,
-     "I'm not sure how I feel about the new update.\nThe sentiment of the text is: ", "Neutral"},
-    {SYS_PROMPT,
-     "The product quality has improved significantly.\nThe sentiment of the text is: ",
-     "Positive"},
-    {SYS_PROMPT,
-     "I had a bad experience with the delivery service.\nThe sentiment of the text is: ",
-     "Negative"},
-    {SYS_PROMPT,
-     "The instructions were clear and easy to follow.\nThe sentiment of the text is: ",
-     "Positive"},
-    {SYS_PROMPT,
-     "I'm disappointed with the recent changes.\nThe sentiment of the text is: ",
-     "Negative"},
-    {SYS_PROMPT,
-     "The event was well-organized and enjoyable.\nThe sentiment of the text is: ",
-     "Positive"},
-    {SYS_PROMPT,
-     "I have mixed feelings about the new policy.\nThe sentiment of the text is: ",
-     "Neutral"},
-    {SYS_PROMPT,
-     "The food was delicious and the service was excellent.\nThe sentiment of the text is: ",
-     "Positive"}};
-
-TEST(SLMEngineTest, TestGeneration) {
-  ASSERT_TRUE(MODEL_FILE_PATH != nullptr) << "MODEL_FILE_PATH is not set";
-
-  auto slm_engine = microsoft::slm_engine::SLMEngine::Create(
-      MODEL_FILE_PATH, false);
-  ASSERT_NE(slm_engine, nullptr);
-
-  cout << "After Loading Model Memory Usage: "
-       << microsoft::slm_engine::SLMEngine::GetMemoryUsage() << " MB"
-       << endl;
-
-  for (const auto& test_prompt : TEST_PROMPTS) {
-    SLMEngine::GenerationOptions generator_options;
-    generator_options.MaxGeneratedTokens = 250;
-    generator_options.Temperature = 0.000000001f;
-
-    string response;
-    SLMEngine::RuntimePerf kpi;
-
-    cout << "Question: " << test_prompt.user_prompt << endl;
-    slm_engine->generate(
-        test_prompt.system_prompt + test_prompt.user_prompt,
-        generator_options, response, kpi);
-
-    string stop_token("\n");
-    // We need to remove the stop token(s) from the response
-    auto stop_token_pos = response.find(stop_token);
-    if (stop_token_pos != std::string::npos) {
-      response = response.substr(0, stop_token_pos);
-    }
-
-    trim_inplace(response);
-
-    cout << "Response: " << response << endl;
-    cout << "Expected: " << test_prompt.expected_answer << endl;
-    cout << "TTFT: " << kpi.TimeToFirstToken << " TPS: " << kpi.TokenRate
-         << " Memory Usage: " << kpi.CurrentMemoryUsed << " MB "
-         << "Generated Tokens: " << kpi.GeneratedTokenCount
-         << " Prompt Tokens: " << kpi.PromptTokenCount << endl
-         << endl;
-
-    EXPECT_STREQ(response.c_str(), test_prompt.expected_answer.c_str())
-        << "Test failed for prompt: " << test_prompt.user_prompt
-        << " with response: " << response;
-  }
-}
-
-const char* TEST_INPUT_FILE = getenv("TEST_INPUT_FILE");
-
-TEST(SLMEngineTest, CaptureMemoryUsage) {
-  // This test captures the memory usage at various stages of the SLM  Engine lifecycle
-  // Produces a JSON file with performance metrics that can be used for analysis
-  ASSERT_TRUE(MODEL_FILE_PATH != nullptr) << "MODEL_FILE_PATH is not set";
-  ASSERT_TRUE(TEST_INPUT_FILE != nullptr) << "TEST_INPUT_FILE is not set";
-
-  nlohmann::json overall_status_json;
-  overall_status_json["model_path"] = MODEL_FILE_PATH;
-  overall_status_json["test_input_file"] = TEST_INPUT_FILE;
-
-  string ort_version, oga_version, slm_version;
-
-  SLMEngine::GetVersion(ort_version, oga_version, slm_version);
-  overall_status_json["ort_version"] = ort_version;
-  overall_status_json["oga_version"] = oga_version;
-  overall_status_json["slm_version"] = slm_version;
-
-  overall_status_json["memory_before_run"] = SLMEngine::GetMemoryUsage();
-
-  auto slm_engine = microsoft::slm_engine::SLMEngine::Create(
-      MODEL_FILE_PATH, false);
-  ASSERT_NE(slm_engine, nullptr) << "Failed to create SLMEngine";
-  overall_status_json["memory_after_load"] = SLMEngine::GetMemoryUsage();
-
-  // Now start the run
-  ASSERT_TRUE(filesystem::exists(TEST_INPUT_FILE))
-      << "Input file doesn't exist: " << TEST_INPUT_FILE;
-
-  ifstream file(TEST_INPUT_FILE);
-  ASSERT_TRUE(file.is_open()) << "Failed to open test input file";
-
-  auto per_prompt_stats_json_array = nlohmann::json::array();
-  string line;
-  while (getline(file, line)) {
-    try {
-      auto jsonObject = nlohmann::json::parse(line);  // Parse the JSON object
-      cout << "Question: " << jsonObject["messages"][1]["content"] << endl;
-
-      auto per_prompt_stats_json = nlohmann::json::object();
-      per_prompt_stats_json["memory_before_generate"] = SLMEngine::GetMemoryUsage();
-
-      SLMEngine::RuntimePerf kpi;
-      SLMEngine::GenerationOptions generator_options;
-      string response;
-
-      slm_engine->generate(line, generator_options, response, kpi);
-
-      // Capture the stats
-      per_prompt_stats_json["ttft"] = kpi.TimeToFirstToken;
-      per_prompt_stats_json["tok_rate"] = kpi.TokenRate;
-      per_prompt_stats_json["memory_usage"] = kpi.CurrentMemoryUsed;
-      per_prompt_stats_json["total_time"] = kpi.TotalTime;
-      per_prompt_stats_json["prompt_toks"] = kpi.PromptTokenCount;
-      per_prompt_stats_json["generated_toks"] = kpi.GeneratedTokenCount;
-
-      per_prompt_stats_json["memory_after_generate"] = SLMEngine::GetMemoryUsage();
-
-      cout << "Response: " << response << endl;
-      cout << "TTFT: " << kpi.TimeToFirstToken << " TPS: " << kpi.TokenRate
-           << " Memory Usage: " << kpi.CurrentMemoryUsed << " MB" << endl
-           << endl;
-
-      per_prompt_stats_json_array.push_back(per_prompt_stats_json);
-
-    } catch (const nlohmann::json::parse_error& e) {
-      FAIL() << "Failed to parse JSON: " << e.what();
-    }
-  }
-
-  // Destroy the engine
-  slm_engine.reset();
-  overall_status_json["memory_after_unload"] = SLMEngine::GetMemoryUsage();
-
-  // At the end - capture the memory usage
-  overall_status_json["memory_after_run"] = SLMEngine::GetMemoryUsage();
-
-  nlohmann::json test_output_json;
-  test_output_json["overall_stats"] = overall_status_json;
-  test_output_json["per_prompt_stats"] = per_prompt_stats_json_array;
-
-  // Write the JSON object to a file
-  ofstream output_file("test_output.json");
-  if (output_file.is_open()) {
-    output_file << test_output_json.dump(4);  // Pretty print with 4 spaces
-    output_file.close();
-  } else {
-    FAIL() << "Failed to open output file for writing";
-  }
-}
-
-TEST(SLMEngineTest, LoRAAdapterTest) {
-  ASSERT_TRUE(ADAPTER_ROOT_DIR != nullptr) << "ADAPTER_ROOT_DIR is not set";
-
-  auto adapters = vector<SLMEngine::LoRAAdapter>();
-  adapters.push_back(SLMEngine::LoRAAdapter(
-      "function_caller",
-      string(ADAPTER_ROOT_DIR) + "/function_calling.onnx_adapter"));
-
-  SLMEngine::Status status;
-  auto slm_engine = microsoft::slm_engine::SLMEngine::Create(
-      (string(ADAPTER_ROOT_DIR) + "/adapted_model").c_str(), adapters, false, status);
-
-  ASSERT_NE(slm_engine, nullptr) << "Failed to create SLMEngine with adapters: " << status.message;
-
-  adapters.clear();
-  adapters = slm_engine->get_adapter_list();
-  ASSERT_EQ(adapters.size(), 1) << "Adapter list size mismatch";
-  ASSERT_EQ(adapters[0].name, "function_caller") << "Adapter name mismatch";
-  ASSERT_EQ(adapters[0].adapter_path,
-            string(ADAPTER_ROOT_DIR) + "/function_calling.onnx_adapter")
-      << "Adapter path mismatch";
-
-  // Send some test data
-  const char* SYS_PROMPT =
-      "You are an in car virtual assistant that maps user's inputs to the "
-      "corresponding function call in the vehicle. You must respond with only "
-      "a JSON object matching the following schema: "
-      "{\"function_name\": <name of the function>, \"arguments\": <arguments of the function>}";
-
-  const TestPrompt TEST_INPUTS[] = {
-      {SYS_PROMPT,
-       "Can you please set the radio to 90.3?",
-       "{\"function_name\": \"tune_radio\", \"arguments\": {\"station\": 90.3}}"},
-      {SYS_PROMPT,
-       "Please text Dominik that I am running behind",
-       "{\"function_name\": \"text\", \"arguments\": {\"name\": \"Dominik\", \"message\": \"I am running behind\"}}"},
-      {SYS_PROMPT,
-       "Can you please set it to 74 degrees?",
-       "{\"function_name\": \"set_car_temperature_setpoint\", \"arguments\": {\"temperature\": 74}}"},
-      {SYS_PROMPT,
-       "Drive to 1020 South Figueroa Street.",
-       "{\"function_name\": \"navigate\", \"arguments\": {\"destination\": \"1020 South Figueroa Street\"}}"},
-  };
-
-  for (const auto& next_input : TEST_INPUTS) {
-    cout << "Question: " << next_input.user_prompt << endl;
-    auto formatted_prompt = slm_engine->format_prompt(
-        next_input.system_prompt, next_input.user_prompt);
-
-    // cout << "Formatted Prompt: " BLUE << formatted_prompt << CLEAR << endl;
-
-    SLMEngine::GenerationOptions generator_options;
-    generator_options.MaxGeneratedTokens = 500;
-    generator_options.Temperature = 0.000000001f;
-    string response;
-    SLMEngine::RuntimePerf kpi;
-    slm_engine->generate("function_caller", formatted_prompt, generator_options, response, kpi);
-    cout << "Response (LoRA): " << MAGENTA
-         << "Total Time: " << kpi.TotalTime << " TPS: " << kpi.TokenRate
-         << " Avg Generation Time: " << kpi.GenerationTimePerToken
-         << " Prompt Tokens: " << kpi.PromptTokenCount
-         << " TTFT: " << kpi.TimeToFirstToken
-         << " Generated Tokens: " << kpi.GeneratedTokenCount
-         << " Memory: " << kpi.CurrentMemoryUsed
-         << "\n"
-         << response << CLEAR << endl;
-
-    slm_engine->generate(formatted_prompt, generator_options, response, kpi);
-    cout << "Response: "
-         << GREEN
-         << "Total Time: " << kpi.TotalTime << " TPS: " << kpi.TokenRate
-         << " Avg Generation Time: " << kpi.GenerationTimePerToken
-         << " Prompt Tokens: " << kpi.PromptTokenCount
-         << " TTFT: " << kpi.TimeToFirstToken
-         << " Generated Tokens: " << kpi.GeneratedTokenCount
-         << " Memory: " << kpi.CurrentMemoryUsed
-         << "\n"
-         << response << CLEAR << endl;
-
-    // auto resp_json = nlohmann::json::parse(response);
-    // auto expected_json = nlohmann::json::parse(next_input.expected_answer);
-
-    // EXPECT_EQ(resp_json.dump(), expected_json.dump())
-    //     << "Test failed for prompt: " << next_input.user_prompt
-    //     << " \nwith response: " << resp_json.dump() << " \nexpected: " << expected_json.dump();
-  }
-}
-}  // namespace testing
-}  // namespace slm_engine
-}  // namespace microsoft
diff --git a/examples/slm_engine/src/cpp/slm_runner.cpp b/examples/slm_engine/src/cpp/slm_runner.cpp
deleted file mode 100644
index ac29633833..0000000000
--- a/examples/slm_engine/src/cpp/slm_runner.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-#include <argparse/argparse.hpp>
-#include <filesystem>
-#include <fstream>
-#include <iostream>
-#include <nlohmann/json.hpp>
-#include <regex>
-#include <sstream>
-#include <string>
-#include <thread>
-#include <vector>
-
-using json = nlohmann::json;
-
-#define MAGENTA_BOLD "\033[35;1m"
-#define MAGENTA "\033[35m"
-#define RED_BOLD "\033[31;1m"
-#define RED "\033[31m"
-#define BLUE_BOLD "\033[34;1m"
-#define BLUE "\033[34m"
-#define GREEN_BOLD "\033[32;1m"
-#define GREEN "\033[32m"
-#define CLEAR "\033[0m"
-
-#include "slm_engine.h"
-
-using namespace std;
-
-/// @brief Reading from the input JSONL file, get the LLM response and write to
-/// the output
-/// @param model_path Path to the ONNX Quantized GenAI model
-/// @param test_data_file JSONL file containing the question set to ask SLM
-/// @param output_file Path to the JSONL file to save the SLM response and stats
-/// @return 0 if successful, -1 otherwise
-int run_test(const string& model_path,
-             const std::vector<std::string>& adapter_files,
-             const string& test_data_file, const string& output_file,
-             bool verbose, int wait_between_requests) {
-  // Make sure that the files exist
-  if (!filesystem::exists(model_path)) {
-    cout << "Error! Model path doesn't exist: " << model_path << "\n";
-    return -1;
-  }
-
-  // Make sure that the files exist
-  if (!filesystem::exists(test_data_file)) {
-    cout << "Error! Test Data file doesn't exist: " << test_data_file
-         << "\n";
-    return -1;
-  }
-
-  cout << "Model: " << model_path << "\n"
-       << "Test File: " << test_data_file << "\n";
-
-  std::unique_ptr<microsoft::slm_engine::SLMEngine> slm_engine;
-
-  if (adapter_files.empty()) {
-    slm_engine = microsoft::slm_engine::SLMEngine::Create(
-        model_path.c_str(), verbose);
-  } else {
-    vector<microsoft::slm_engine::SLMEngine::LoRAAdapter> adapters;
-
-    for (const auto& adapter_file : adapter_files) {
-      // Check if the adapter path exists
-      if (!std::filesystem::exists(adapter_file)) {
-        cout << RED << "Adapter path does not exist: " << adapter_file
-             << CLEAR << endl;
-        return -1;
-      }
-
-      // Get the filename from the path
-      std::filesystem::path adapter_name(adapter_file);
-      microsoft::slm_engine::SLMEngine::LoRAAdapter adapter(
-          adapter_name.stem().string(),
-          adapter_file);
-      adapters.push_back(adapter);
-
-      cout << "Adapter: " << adapter_name.stem().string() << "\n";
-      cout << "Adapter Path: " << adapter_file << "\n";
-    }
-
-    microsoft::slm_engine::SLMEngine::Status status;
-    slm_engine = microsoft::slm_engine::SLMEngine::Create(
-        model_path.c_str(), adapters, verbose, status);
-  }
-
-  if (!slm_engine) {
-    cout << "Cannot create engine!\n";
-    return -1;
-  }
-
-  ofstream output(output_file);
-  string line;
-  ifstream test_data(test_data_file);
-  while (getline(test_data, line)) {
-    if (line.empty()) {
-      continue;
-    }
-
-    auto response = slm_engine->complete(line.c_str());
-    json output_json = json::parse(response);
-
-    if (!verbose) {
-      cout << BLUE << "Question: " << output_json["question"]
-           << CLEAR << endl;
-      cout << GREEN << "Answer: " << output_json["choices"][0]["message"]["content"]
-           << CLEAR << endl;
-    }
-    // Save to the file
-    output << output_json.dump() << endl;
-
-    cout << "Prompt Tokens: "
-         << output_json["kpi"]["prompt_toks"] << " "
-         << "TTFT: " << MAGENTA_BOLD
-         << output_json["kpi"]["ttft"].template get<float>() /
-                1000.0f
-         << " sec " << CLEAR << "Generated: "
-         << output_json["kpi"]["generated_toks"] << " "
-         << "Token Rate: " << MAGENTA_BOLD
-         << output_json["kpi"]["tok_rate"] << CLEAR << " "
-         << "Time: "
-         << output_json["kpi"]["total_time"]
-                    .template get<float>() /
-                1000.0f
-         << " sec "
-         << "Memory: " << MAGENTA_BOLD
-         << output_json["kpi"]["memory_usage"] << CLEAR << " MB"
-         << "\n";
-    flush(cout);
-    if (wait_between_requests > 0) {
-      cout << "Waiting for " << wait_between_requests << " ms\n";
-      this_thread::sleep_for(chrono::milliseconds(wait_between_requests));
-    }
-  }
-  return 0;
-}
-
-/// @brief Program entry point
-int main(int argc, char** argv) {
-  argparse::ArgumentParser program("slm_runner", "1.0",
-                                   argparse ::default_arguments::none);
-  string model_path;
-  program.add_argument("-m", "--model_path")
-      .required()
-      .help("Path to the model file")
-      .store_into(model_path);
-
-  program.add_argument("-a", "--adapters")
-      .help(
-          "List of LoRA adapter files to process. "
-          "We will use the filename part as the name of the LoRA adpater")
-      .nargs(argparse::nargs_pattern::any);
-
-  string test_data_file;
-  program.add_argument("-t", "--test_data_file")
-      .required()
-      .help("Path to the test data file (JSONL)")
-      .store_into(test_data_file);
-
-  string output_file;
-  program.add_argument("-o", "--output_file")
-      .required()
-      .help("Path to the output file (JSONL)")
-      .store_into(output_file);
-
-  int wait_between_requests = 0;
-  program.add_argument("-w", "--wait_between_requests")
-      .help("Wait time between requests in milliseconds")
-      .store_into(wait_between_requests);
-
-  program.add_argument("-v", "--verbose")
-      .default_value(false)
-      .implicit_value(true)
-      .help(
-          "If provided, more debugging information printed on standard "
-          "output");
-
-  std::string slm_ver, oga_ver, ort_ver;
-  microsoft::slm_engine::SLMEngine::GetVersion(slm_ver, oga_ver, ort_ver);
-
-  cout << "SLM Runner Version: "
-       << slm_ver << "\nORT GenAI Version: " << oga_ver << "\nORT Version: " << ort_ver
-       << endl;
-  try {
-    program.parse_args(argc, argv);
-  } catch (const std::exception& err) {
-    std::cerr << err.what() << std::endl;
-    std::cerr << program;
-    std::exit(-1);
-  }
-
-  bool verbose = false;
-  if (program["--verbose"] == true) {
-    verbose = true;
-  }
-
-  std::vector<std::string> adapters =
-      program.get<std::vector<std::string>>("adapters");
-  // Responsible for cleaning up the library during shutdown
-  // OgaHandle handle;
-
-  run_test(model_path, adapters, test_data_file, output_file, verbose,
-           wait_between_requests);
-
-  OgaShutdown();
-}
\ No newline at end of file
diff --git a/examples/slm_engine/src/cpp/slm_server.cpp b/examples/slm_engine/src/cpp/slm_server.cpp
deleted file mode 100644
index 8e2784234a..0000000000
--- a/examples/slm_engine/src/cpp/slm_server.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-#include <argparse/argparse.hpp>
-#include <filesystem>
-#include <fstream>
-#include <iostream>
-#include <nlohmann/json.hpp>
-#include <regex>
-#include <sstream>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include "httplib.h"
-#include "slm_engine.h"
-
-using json = nlohmann::json;
-
-#define MAGENTA_BOLD "\033[35;1m"
-#define MAGENTA "\033[35m"
-#define RED_BOLD "\033[31;1m"
-#define RED "\033[31m"
-#define BLUE_BOLD "\033[34;1m"
-#define BLUE "\033[34m"
-#define GREEN_BOLD "\033[32;1m"
-#define GREEN "\033[32m"
-#define CLEAR "\033[0m"
-
-#include "slm_engine.h"
-
-using namespace std;
-
-// Function to clean qwen model response formatting
-std::string cleanQwenResponse(const std::string& input) {
-  std::string result = input;
-
-  // Remove <think>\n\n</think>\n\n pattern
-  std::regex think_pattern(R"(<think>[\s\S]*?</think>\s*\n*)");
-  result = std::regex_replace(result, think_pattern, "");
-
-  // Replace markdown json code blocks ```json ``` with just the content
-  std::regex json_pattern(R"(```json\s*\n([\s\S]*?)\n\s*```)");
-  result = std::regex_replace(result, json_pattern, "$1");
-
-  return result;
-}
-
-int run_server(const string& model_path,
-               int port_number, bool verbose) {
-  // Create the SLM
-  auto slm_engine = microsoft::slm_engine::SLMEngine::Create(
-      model_path.c_str(), verbose);
-  if (!slm_engine) {
-    cout << "Cannot create engine!\n";
-    return -1;
-  }
-
-  httplib::Server svr;
-
-  svr.Get("/", [&](const httplib::Request& req, httplib::Response& res) {
-    json response_body;
-    response_body["status"] = "success";
-
-    std::string slm_ver, oga_ver, ort_ver;
-    microsoft::slm_engine::SLMEngine::GetVersion(slm_ver, oga_ver, ort_ver);
-    json engine_state = {
-        {"model", std::filesystem::path(model_path).filename().string()},
-        {"engine_version", {"slm_version", slm_ver, "oga_version", oga_ver, "ort_version", ort_ver}},
-        {"capabilities", {"text_completion", "function_calling"}}};
-    response_body["engine_state"] = engine_state;
-    json get_response;
-    get_response["response"] = response_body;
-    res.status = 200;
-    res.set_content(get_response.dump(), "application/json");
-  });
-
-  // POST /completions endpoint
-  svr.Post("/completions", [&](const httplib::Request& req,
-                               httplib::Response& res) {
-    try {
-      // Parse the request body to check for function calling
-      json request_json = json::parse(req.body);
-      bool has_tools = request_json.contains("tools") && !request_json["tools"].empty();
-
-      json response_json;
-
-      if (has_tools) {
-        // Handle function calling request
-        cout << GREEN_BOLD << "Processing function calling request..." << CLEAR << endl;
-
-        // Use the enhanced complete method that handles function calling internally
-        auto response = slm_engine->complete(req.body.c_str());
-        json output_json = json::parse(response);
-
-        cout << RED_BOLD << "Response from SLM Engine: "
-             << output_json.dump(2) << CLEAR << endl;
-
-        // Check if this is a function call response
-        // Function calls are detected by checking if the answer starts with '[' and contains JSON
-        bool is_function_call = false;
-        json function_calls_array;
-
-        if (output_json.contains("response") &&
-            output_json["response"].contains("answer")) {
-          std::string answer = output_json["response"]["answer"];
-
-          // Clean qwen model specific formatting
-          answer = cleanQwenResponse(answer);
-
-          // Trim whitespace
-          answer.erase(0, answer.find_first_not_of(" \t\n\r"));
-          answer.erase(answer.find_last_not_of(" \t\n\r") + 1);
-
-          // Update the cleaned answer back to the response
-          output_json["response"]["answer"] = answer;
-
-          // Check if answer starts with '[' and ends with ']' (JSON array format)
-          if (answer.length() > 0 && answer[0] == '[' && answer.back() == ']') {
-            try {
-              function_calls_array = json::parse(answer);
-              if (function_calls_array.is_array() && !function_calls_array.empty()) {
-                // Verify that each element has 'name' and 'arguments' fields
-                bool valid_function_calls = true;
-                for (const auto& call : function_calls_array) {
-                  if (!call.contains("name") || !call.contains("arguments")) {
-                    valid_function_calls = false;
-                    break;
-                  }
-                }
-                if (valid_function_calls) {
-                  is_function_call = true;
-                }
-              }
-            } catch (const json::exception& e) {
-              // Not valid JSON, treat as regular text response
-              is_function_call = false;
-            }
-          }
-        }
-
-        if (is_function_call) {
-          // Function call(s) detected - always use function_calls array format
-          cout << BLUE_BOLD << "Function call" << (function_calls_array.size() > 1 ? "s" : "")
-               << " detected (" << function_calls_array.size() << " call"
-               << (function_calls_array.size() > 1 ? "s" : "") << "):" << CLEAR << endl;
-
-          for (size_t i = 0; i < function_calls_array.size(); ++i) {
-            cout << BLUE_BOLD << "  " << (i + 1) << ". "
-                 << function_calls_array[i]["name"] << CLEAR << endl;
-          }
-
-          // Ensure the response has the unified function_calls array format
-          if (!output_json["response"].contains("function_calls")) {
-            // Convert function_calls_array to the proper format with string arguments
-            json unified_function_calls = json::array();
-            for (const auto& call : function_calls_array) {
-              unified_function_calls.push_back({{"name", call["name"]},
-                                                {"arguments", call["arguments"].dump()}});
-            }
-            output_json["response"]["function_calls"] = unified_function_calls;
-          }
-
-          // Print KPIs for function calling
-          if (output_json.contains("kpi")) {
-            cout << "Prompt Tokens: " << output_json["kpi"]["prompt_toks"] << " "
-                 << "TTFT: " << MAGENTA_BOLD
-                 << output_json["kpi"]["ttft"].template get<float>() / 1000.0f
-                 << " sec " << CLEAR << "Generated: "
-                 << output_json["kpi"]["generated_toks"] << " "
-                 << "Token Rate: " << MAGENTA_BOLD
-                 << output_json["kpi"]["tok_rate"] << CLEAR << " "
-                 << "Time: " << output_json["kpi"]["total_time"].template get<float>() / 1000.0f
-                 << " sec " << "Memory: " << MAGENTA_BOLD
-                 << output_json["kpi"]["memory_usage"] << CLEAR << " MB"
-                 << " [FUNCTION_CALL" << (function_calls_array.size() > 1 ? "S" : "") << "]" << endl;
-          }
-        } else {
-          // Regular text response with tools available
-          cout << "Text response generated (no function call)" << endl;
-
-          // Print KPIs for regular generation
-          if (output_json.contains("kpi")) {
-            cout << "Prompt Tokens: " << output_json["kpi"]["prompt_toks"] << " "
-                 << "TTFT: " << MAGENTA_BOLD
-                 << output_json["kpi"]["ttft"].template get<float>() / 1000.0f
-                 << " sec " << CLEAR << "Generated: "
-                 << output_json["kpi"]["generated_toks"] << " "
-                 << "Token Rate: " << MAGENTA_BOLD
-                 << output_json["kpi"]["tok_rate"] << CLEAR << " "
-                 << "Time: " << output_json["kpi"]["total_time"].template get<float>() / 1000.0f
-                 << " sec " << "Memory: " << MAGENTA_BOLD
-                 << output_json["kpi"]["memory_usage"] << CLEAR << " MB" << endl;
-          }
-        }
-
-        res.status = 200;
-        res.set_content(output_json.dump(), "application/json");
-
-      } else {
-        // Handle regular completion request (no tools)
-        auto response = slm_engine->complete(req.body.c_str());
-        json output_json = json::parse(response);
-
-        // Clean qwen model specific formatting for regular responses too
-        if (output_json.contains("response") &&
-            output_json["response"].contains("answer")) {
-          std::string answer = output_json["response"]["answer"];
-          answer = cleanQwenResponse(answer);
-          output_json["response"]["answer"] = answer;
-        }
-
-        // Print KPIs for regular completion
-        cout << "Prompt Tokens: "
-             << output_json["kpi"]["prompt_toks"] << " "
-             << "TTFT: " << MAGENTA_BOLD
-             << output_json["kpi"]["ttft"].template get<float>() / 1000.0f
-             << " sec " << CLEAR << "Generated: "
-             << output_json["kpi"]["generated_toks"] << " "
-             << "Token Rate: " << MAGENTA_BOLD
-             << output_json["kpi"]["tok_rate"] << CLEAR << " "
-             << "Time: " << output_json["kpi"]["total_time"].template get<float>() / 1000.0f
-             << " sec " << "Memory: " << MAGENTA_BOLD
-             << output_json["kpi"]["memory_usage"] << CLEAR << " MB" << "\n";
-        flush(cout);
-
-        res.status = 200;
-        res.set_content(output_json.dump(), "application/json");
-      }
-
-    } catch (const std::exception& e) {
-      // Handle JSON parsing errors or other exceptions
-      json error_response;
-      error_response["status"] = "error";
-      error_response["message"] = std::string("Request processing error: ") + e.what();
-
-      cout << RED_BOLD << "Error processing request: " << e.what() << CLEAR << endl;
-
-      res.status = 400;
-      res.set_content(error_response.dump(), "application/json");
-    }
-  });
-
-  cout << MAGENTA_BOLD << "Starting server on port: " << port_number << CLEAR << endl;
-  svr.listen("0.0.0.0", port_number);
-  return 0;
-}
-
-/// @brief Program entry point
-int main(int argc, char** argv) {
-  argparse::ArgumentParser program("slm_server", "1.0",
-                                   argparse ::default_arguments::none);
-  string model_path;
-  program.add_argument("-m", "--model_path")
-      .required()
-      .help("Path to the model file")
-      .store_into(model_path);
-
-  int port_number = 8080;
-  program.add_argument("-p", "--port_number")
-      .help("HTTP Port Number to use (default 8080)")
-      .store_into(port_number);
-
-  program.add_argument("-v", "--verbose")
-      .default_value(false)
-      .implicit_value(true)
-      .help(
-          "If provided, more debugging information printed on standard "
-          "output");
-
-  string slm_ver, oga_ver, ort_ver;
-  microsoft::slm_engine::SLMEngine::GetVersion(slm_ver, oga_ver, ort_ver);
-  cout << "SLM Runner Version: " << slm_ver << "\n"
-       << "ORT GenAI Version: " << oga_ver << "\n"
-       << "ORT Version: " << ort_ver
-       << endl;
-  try {
-    program.parse_args(argc, argv);
-  } catch (const std::exception& err) {
-    std::cerr << err.what() << std::endl;
-    std::cerr << program;
-    std::exit(-1);
-  }
-
-  bool verbose = false;
-  if (program["--verbose"] == true) {
-    verbose = true;
-  }
-
-  run_server(model_path, port_number, verbose);
-  OgaShutdown();
-}
\ No newline at end of file
diff --git a/examples/slm_engine/test/10-inputs-ground-truth.jsonl b/examples/slm_engine/test/10-inputs-ground-truth.jsonl
deleted file mode 100644
index 27c0de4992..0000000000
--- a/examples/slm_engine/test/10-inputs-ground-truth.jsonl
+++ /dev/null
@@ -1,10 +0,0 @@
-{  "question": "What makes a cloud form?", "answer": "Clouds form when moist air rises and cools, causing the water vapor to condense into tiny droplets or ice crystals."}
-{  "question": "Can drink and food taste different just by changing its color?",  "answer": "Yes, studies have shown that the color of food and drink can influence our perception of taste, even if the flavor remains the same."}
-{  "question": "What makes popcorn pop?",  "answer": "Popcorn pops because the water inside the kernel heats up and turns into steam, creating pressure that causes the kernel to explode."}
-{  "question": "Does temperature affect seed sprouting?",  "answer": "Yes, temperature can significantly affect seed sprouting. Seeds generally sprout faster in warmer temperatures, but extreme heat can inhibit germination."}
-{  "question": "Does the color of light affect photosynthesis?",  "answer": "Yes, different colors of light can affect the rate of photosynthesis. For example, blue and red light are most effective for photosynthesis, while green light is least effective."}
-{  "question": "Does exercise affect memory?",  "answer": "Yes, regular exercise has been shown to improve memory and cognitive function by increasing blood flow to the brain and promoting the growth of new brain cells."}
-{  "question": "Can fruits and vegetables generate electricity?",  "answer": "Yes, certain fruits and vegetables can generate electricity due to the presence of electrolytes, which can create a small electric current when combined with two different metals."}
-{  "question": "Can you protect a raw egg from a drop using common materials?",  "answer": "Yes, by using materials like bubble wrap, cotton, or a padded container, you can protect a raw egg from breaking when dropped from a height."}
-{  "question": "What happens when a raw egg is put into vinegar?",  "answer": "When a raw egg is placed in vinegar, the acetic acid in the vinegar reacts with the calcium carbonate in the eggshell, dissolving it and leaving behind the egg's membrane."}
-{  "question": "What drink causes more stains on teeth?",  "answer": "Drinks like coffee, tea, and red wine are known to cause more stains on teeth due to their high levels of chromogens, tannins, and acids."}
diff --git a/examples/slm_engine/test/10-inputs-to-slm.jsonl b/examples/slm_engine/test/10-inputs-to-slm.jsonl
deleted file mode 100644
index 997818e494..0000000000
--- a/examples/slm_engine/test/10-inputs-to-slm.jsonl
+++ /dev/null
@@ -1,10 +0,0 @@
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "What makes a cloud form?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "Can drink and food taste different just by changing its color?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "What makes popcorn pop?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "Does temperature affect seed sprouting?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "Does the color of light affect photosynthesis?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "Does exercise affect memory?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "Can fruits and vegetables generate electricity?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "Can you protect a raw egg from a drop using common materials?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "What happens when a raw egg is put into vinegar?"}], "max_tokens": 500}
-{"messages":[{"role": "system", "content": "You are a helpful assistant. Briefly answer the following: "}, {"role": "user", "content": "What drink causes more stains on teeth?"}], "max_tokens": 500}
diff --git a/examples/slm_engine/test/README_TOOL_CALLING.md b/examples/slm_engine/test/README_TOOL_CALLING.md
deleted file mode 100644
index 46da9ce973..0000000000
--- a/examples/slm_engine/test/README_TOOL_CALLING.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# SLM Engine Tool Calling Tests
-
-This directory contains test cases for the SLM (Small Language Model) engine with tool calling functionality.
-
-## Test Files
-
-### 1. `test-slm-server.sh` (Updated)
-- **Original test**: Basic chat completion test asking about San Diego travel recommendations
-- **New test**: Tool calling test for flight and hotel booking from Beijing to Paris
-
-### 2. `test-slm-server-tools.sh` (New)
-Comprehensive tool calling test suite with three scenarios:
-- **Test 1**: Combined flight and hotel booking (Beijing to Paris)
-- **Test 2**: Flight booking only (JFK to LHR) 
-- **Test 3**: Hotel booking only (Tokyo)
-
-### 3. `test_tool_calling.py` (New)
-Python version of the tool calling tests with better response formatting and error handling.
-
-## Tool Definitions
-
-The tests use two main tools:
-
-### `booking_flight_tickets`
-Parameters:
-- `origin_airport_code`: Departure airport code (string)
-- `destination_airport_code`: Destination airport code (string) 
-- `departure_date`: Outbound flight date (string)
-- `return_date`: Return flight date (string)
-
-### `booking_hotels`
-Parameters:
-- `destination`: City name (string)
-- `check_in_date`: Hotel check-in date (string)
-- `checkout_date`: Hotel check-out date (string)
-
-## Usage
-
-### Prerequisites
-Make sure the SLM server is running on `http://localhost:8080`
-
-### Running the Tests
-
-#### Bash Tests
-```bash
-# Run the updated original test (includes tool calling)
-./test-slm-server.sh
-
-# Run comprehensive tool calling tests
-./test-slm-server-tools.sh
-```
-
-#### Python Test
-```bash
-# Run Python version (requires requests library)
-python3 test_tool_calling.py
-
-# Or make it executable and run directly
-chmod +x test_tool_calling.py
-./test_tool_calling.py
-```
-
-### Installing Python Dependencies
-If running the Python test, make sure you have the `requests` library:
-```bash
-pip install requests
-```
-
-## Test Scenarios
-
-### Scenario 1: Beijing to Paris Trip
-- **Flight**: PEK (Beijing) → CDG (Paris), Dec 4-10, 2025
-- **Hotel**: Paris, Dec 4-10, 2025
-- **Parameters**: Very low temperature (0.00001), deterministic sampling
-
-### Scenario 2: New York to London Flight
-- **Flight**: JFK (New York) → LHR (London), Aug 15-22, 2025
-- **Parameters**: Low temperature (0.1), sampling enabled
-
-### Scenario 3: Tokyo Hotel Booking
-- **Hotel**: Tokyo, Sep 1-5, 2025
-- **Parameters**: Low temperature (0.2), sampling enabled
-
-## Expected Response Format
-
-The SLM should respond with tool calls in a structured format, typically including:
-- Tool name identification
-- Parameter extraction from user request
-- Proper airport code mapping (e.g., Beijing → PEK, Paris → CDG)
-- Date formatting and validation
-
-## Troubleshooting
-
-1. **Connection refused**: Ensure SLM server is running on port 8080
-2. **Tool not recognized**: Verify the SLM model supports tool calling
-3. **Parameter errors**: Check that all required parameters are provided in the tool definitions
-4. **Python import errors**: Install required dependencies with `pip install requests`
diff --git a/examples/slm_engine/test/batch-input.jsonl b/examples/slm_engine/test/batch-input.jsonl
deleted file mode 100644
index 2dc9c78823..0000000000
--- a/examples/slm_engine/test/batch-input.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-{"messages":[{"role": "system", "content": "You are a helpful AI Assistant that answers user's questions very accurately. Be very brief and use JSON formatting."}, {"role": "user", "content": "What are the top visitor attractions in Honolulu?"}], "max_tokens": 1200}
-{"messages":[{"role": "system", "content": "You are a helpful AI Assistant that answers user's questions very accurately."}, {"role": "user", "content": "Very briefly mention what are the top three destinations in Honolulu?"}], "max_tokens": 1200, "stop": ["\\n\\n"]}
-{"messages":[{"role": "system", "content": "You are a helpful AI Assistant that answers user's questions very accurately."}, {"role": "user", "content": "How far is Honolulu from San Diego?"}], "max_tokens": 1200, "stop": ["\\n\\n"]}
diff --git a/examples/slm_engine/test/chat_ui.py b/examples/slm_engine/test/chat_ui.py
deleted file mode 100644
index 6f63503279..0000000000
--- a/examples/slm_engine/test/chat_ui.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import json
-
-import gradio as gr
-import pandas as pd
-import requests
-
-SLM_ENDPOINT = "http://localhost:8080/completions"
-SYSTEM_PROMPT = (
-    "You are a helpful AI Assistant. "
-    "Please answer the questions very accurately. "
-    "Use emojis and markdown as appropriate"
-)
-
-# Global variable to store chat history
-chat_history = []
-context_length = 0
-
-
-def ask_slm_engine(prompt, history, max_tokens, slider_temp):
-    global chat_history
-    global context_length
-
-    if not chat_history or len(chat_history) == 0:
-        chat_history = [
-            {
-                "role": "system",
-                "content": f"{SYSTEM_PROMPT}",
-            }
-        ]
-
-    chat_history.append({"role": "user", "content": prompt})
-
-    # Format the message as required by your API
-    payload = {
-        "messages": chat_history,
-        "temperature": slider_temp,
-        "max_tokens": max_tokens,
-    }
-    headers = {"Content-Type": "application/json"}
-
-    # Send the request to your API endpoint
-    response = requests.post(SLM_ENDPOINT, json=payload, headers=headers)
-
-    # Extract the response content
-    response_content = response.json()
-
-    ai_response = response_content["choices"][0]["message"]
-    chat_history.append(ai_response)
-
-    # Print the Response - all of it
-    print(json.dumps(response_content, indent=4))
-
-    return ai_response["content"], pd.DataFrame([response_content["kpi"]])
-
-
-def reset_chat():
-    global chat_history
-    chat_history = []  # Clear the chat history
-    return "", pd.DataFrame(columns=["KPI", "Value"])  # Clear the chat and KPI grid
-
-
-with gr.Blocks() as demo:
-    kpi_grid = gr.Dataframe(headers=["KPI", "Value"], datatype=["str", "str"], render=False)
-    gr.Markdown("<center><h1>Chat with ONNX SLM Engine</h1></center>")
-    with gr.Row():
-        with gr.Column():
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("## Max Tokens")
-                    slider = gr.Slider(100, 4096, 1200, label="Max Tokens")
-                with gr.Column():
-                    gr.Markdown("## Temperature")
-                    slider_temp = gr.Slider(0, 1.0, 0.5, label="Temperature")
-                with gr.Column():
-                    gr.Markdown("## Reset Chat")
-                    reset_button = gr.Button("Reset")
-            chatbot = gr.Chatbot(height=200, render=False)
-            user_prompt = gr.Textbox(
-                placeholder="Ask me a question",
-                container=False,
-                scale=7,
-                render=False,
-            )
-            gr.ChatInterface(
-                ask_slm_engine,
-                type="messages",
-                additional_inputs=[slider, slider_temp],
-                chatbot=chatbot,
-                textbox=user_prompt,
-                additional_outputs=[kpi_grid],
-            )
-            reset_button.click(reset_chat, outputs=[chatbot, kpi_grid])
-
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("<left><h2>KPI Stats</h2></left>")
-            kpi_grid.render()
-
-demo.launch()
diff --git a/examples/slm_engine/test/sample-qNa-data.jsonl b/examples/slm_engine/test/sample-qNa-data.jsonl
deleted file mode 100644
index 4fd014be68..0000000000
--- a/examples/slm_engine/test/sample-qNa-data.jsonl
+++ /dev/null
@@ -1,10 +0,0 @@
-{  "question": "What makes a cloud form?",    "answer": "Clouds form when moist air rises and cools, causing the water vapor to condense into tiny droplets or ice crystals."}
-{  "question": "Can drink and food taste different just by changing its color?",  "answer": "Yes, studies have shown that the color of food and drink can influence our perception of taste, even if the flavor remains the same."}
-{  "question": "What makes popcorn pop?",  "answer": "Popcorn pops because the water inside the kernel heats up and turns into steam, creating pressure that causes the kernel to explode."}
-{  "question": "Does temperature affect seed sprouting?",  "answer": "Yes, temperature can significantly affect seed sprouting. Seeds generally sprout faster in warmer temperatures, but extreme heat can inhibit germination."}
-{  "question": "Does the color of light affect photosynthesis?",  "answer": "Yes, different colors of light can affect the rate of photosynthesis. For example, blue and red light are most effective for photosynthesis, while green light is least effective."}
-{  "question": "Does exercise affect memory?",  "answer": "Yes, regular exercise has been shown to improve memory and cognitive function by increasing blood flow to the brain and promoting the growth of new brain cells."}
-{  "question": "Can fruits and vegetables generate electricity?",  "answer": "Yes, certain fruits and vegetables can generate electricity due to the presence of electrolytes, which can create a small electric current when combined with two different metals."}
-{  "question": "Can you protect a raw egg from a drop using common materials?",  "answer": "Yes, by using materials like bubble wrap, cotton, or a padded container, you can protect a raw egg from breaking when dropped from a height."}
-{  "question": "What happens when a raw egg is put into vinegar?",  "answer": "When a raw egg is placed in vinegar, the acetic acid in the vinegar reacts with the calcium carbonate in the eggshell, dissolving it and leaving behind the egg's membrane."}
-{  "question": "What drink causes more stains on teeth?",  "answer": "Drinks like coffee, tea, and red wine are known to cause more stains on teeth due to their high levels of chromogens, tannins, and acids."}
diff --git a/examples/slm_engine/test/test-slm-server-tools.sh b/examples/slm_engine/test/test-slm-server-tools.sh
deleted file mode 100755
index 17e1db9b1e..0000000000
--- a/examples/slm_engine/test/test-slm-server-tools.sh
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/bin/bash
-
-# Test case for SLM server with tool calling functionality
-# This script tests the booking_flight_tickets and booking_hotels tools
-
-echo "Testing SLM server with tool calling - Flight and Hotel booking scenario"
-echo "================================================================="
-
-# Test 1: Flight and Hotel booking with tools
-echo "Test 1: Flight and Hotel booking from Beijing to Paris"
-curl http://localhost:8080/completions -H "Content-Type: application/json" \
- -d '{
-    "messages": [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant with these tools."
-        },
-        {
-            "role": "user",
-            "content": "book flight ticket from Beijing to Paris(using airport code) in 2025-12-04 to 2025-12-10 , then book hotel from 2025-12-04 to 2025-12-10 in Paris"
-        }
-    ],
-    "tools": [
-        {
-            "name": "booking_flight_tickets",
-            "description": "booking flights",
-            "parameters": {
-                "origin_airport_code": {
-                    "description": "The name of Departure airport code",
-                    "type": "string"
-                },
-                "destination_airport_code": {
-                    "description": "The name of Destination airport code", 
-                    "type": "string"
-                },
-                "departure_date": {
-                    "description": "The date of outbound flight",
-                    "type": "string"
-                },
-                "return_date": {
-                    "description": "The date of return flight",
-                    "type": "string"
-                }
-            }
-        },
-        {
-            "name": "booking_hotels",
-            "description": "booking hotel",
-            "parameters": {
-                "destination": {
-                    "description": "The name of the city",
-                    "type": "string"
-                },
-                "check_in_date": {
-                    "description": "The date of check in",
-                    "type": "string"
-                },
-                "checkout_date": {
-                    "description": "The date of check out",
-                    "type": "string"
-                }
-            }
-        }
-    ],
-    "temperature": 0.00001,
-    "max_tokens": 4096,
-    "top_p": 1.0,
-    "do_sample": false
-}' -v
-
-echo -e "\n\n"
-echo "================================================================="
-
-# Test 2: Simple tool calling test - Flight only
-echo "Test 2: Flight booking only from New York to London"
-curl http://localhost:8080/completions -H "Content-Type: application/json" \
- -d '{
-    "messages": [
-        {
-            "role": "system",
-            "content": "You are a helpful travel assistant."
-        },
-        {
-            "role": "user",
-            "content": "I need to book a flight from JFK to LHR on 2025-08-15, returning on 2025-08-22"
-        }
-    ],
-    "tools": [
-        {
-            "name": "booking_flight_tickets",
-            "description": "booking flights",
-            "parameters": {
-                "origin_airport_code": {
-                    "description": "The name of Departure airport code",
-                    "type": "string"
-                },
-                "destination_airport_code": {
-                    "description": "The name of Destination airport code", 
-                    "type": "string"
-                },
-                "departure_date": {
-                    "description": "The date of outbound flight",
-                    "type": "string"
-                },
-                "return_date": {
-                    "description": "The date of return flight",
-                    "type": "string"
-                }
-            }
-        }
-    ],
-    "temperature": 0.1,
-    "max_tokens": 2048,
-    "top_p": 0.9,
-    "do_sample": true
-}' -v
-
-echo -e "\n\n"
-echo "================================================================="
-
-# Test 3: Hotel booking only
-echo "Test 3: Hotel booking only in Tokyo"
-curl http://localhost:8080/completions -H "Content-Type: application/json" \
- -d '{
-    "messages": [
-        {
-            "role": "system",
-            "content": "You are a helpful hotel booking assistant."
-        },
-        {
-            "role": "user", 
-            "content": "I need to book a hotel in Tokyo from 2025-09-01 to 2025-09-05"
-        }
-    ],
-    "tools": [
-        {
-            "name": "booking_hotels",
-            "description": "booking hotel",
-            "parameters": {
-                "destination": {
-                    "description": "The name of the city",
-                    "type": "string"
-                },
-                "check_in_date": {
-                    "description": "The date of check in",
-                    "type": "string"
-                },
-                "checkout_date": {
-                    "description": "The date of check out",
-                    "type": "string"
-                }
-            }
-        }
-    ],
-    "temperature": 0.2,
-    "max_tokens": 1024,
-    "top_p": 0.95,
-    "do_sample": true
-}' -v
-
-echo -e "\n\n"
-echo "================================================================="
-echo "All tool calling tests completed!"
diff --git a/examples/slm_engine/test/test-slm-server.sh b/examples/slm_engine/test/test-slm-server.sh
deleted file mode 100755
index 1b48b38761..0000000000
--- a/examples/slm_engine/test/test-slm-server.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-
-# Original test case
-echo "Original test case - San Diego travel recommendations:"
-curl http://localhost:8080/completions -H "Content-Type: application/json" \
- -d '{"messages":[{"role": "system", "content": "You are a helpful AI Assistant. Please answer the questions very accurately. Use emojis and markdown as appropriate"}, {"role": "user", "content": "What are the top 5 places to visit in San Diego? Be brief."}], "max_tokens": 1200, "temperature": 0.7}' -vvv
-
-echo -e "\n\n================================================================="
-echo "Tool calling test case - Flight and Hotel booking:"
-
-# New test case with tool calling
-curl http://localhost:8080/completions -H "Content-Type: application/json" \
- -d '{
-    "messages": [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant with these tools."
-        },
-        {
-            "role": "user",
-            "content": "book flight ticket from Beijing to Paris(using airport code) in 2025-12-04 to 2025-12-10 , then book hotel from 2025-12-04 to 2025-12-10 in Paris"
-        }
-    ],
-    "tools": [
-        {
-            "name": "booking_flight_tickets",
-            "description": "booking flights",
-            "parameters": {
-                "origin_airport_code": {
-                    "description": "The name of Departure airport code",
-                    "type": "string"
-                },
-                "destination_airport_code": {
-                    "description": "The name of Destination airport code",
-                    "type": "string"
-                },
-                "departure_date": {
-                    "description": "The date of outbound flight",
-                    "type": "string"
-                },
-                "return_date": {
-                    "description": "The date of return flight",
-                    "type": "string"
-                }
-            }
-        },
-        {
-            "name": "booking_hotels",
-            "description": "booking hotel",
-            "parameters": {
-                "destination": {
-                    "description": "The name of the city",
-                    "type": "string"
-                },
-                "check_in_date": {
-                    "description": "The date of check in",
-                    "type": "string"
-                },
-                "checkout_date": {
-                    "description": "The date of check out",
-                    "type": "string"
-                }
-            }
-        }
-    ],
-    "temperature": 0.00001,
-    "max_tokens": 4096,
-    "top_p": 1.0,
-    "do_sample": false
-}' -v
diff --git a/examples/slm_engine/test/test_slm_server.py b/examples/slm_engine/test/test_slm_server.py
deleted file mode 100755
index 56e0b6770b..0000000000
--- a/examples/slm_engine/test/test_slm_server.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import json
-
-import requests
-
-BLUE = "\033[34m"
-GREEN = "\033[32m"
-MAGENTA = "\033[35m"
-RED = "\033[31m"
-CLEAR = "\033[0m"
-
-
-def launch_server(server_binary: str, model_path: str):
-    import subprocess
-
-    pid = subprocess.Popen(
-        [
-            str(server_binary),
-            "--model_path",
-            str(model_path),
-            "--port_number",
-            "8000",
-        ]
-    )
-
-    # Wait until the server starts to listen
-    started = False
-    timeout_countdown = 30
-    url = "http://localhost:8000"
-    while not started:
-        try:
-            response = requests.get(url)
-            json_response = json.loads(response.text)
-            if json_response["response"]["status"] == "success":
-                print(f"{MAGENTA}Engine State: {json_response['response']['engine_state']}{CLEAR}")
-                started = True
-        except Exception:
-            # Initially the server may not be ready to accept requests
-            # We want to ignore and retry
-            pass  # Ignore all exceptions
-
-        # Sleep for a bit
-        import time
-
-        time.sleep(1)
-        timeout_countdown = timeout_countdown - 1
-        if timeout_countdown == 0:
-            raise Exception("Server did not start in time")
-
-    return pid
-
-
-# This function tests the OpenAI API Interface
-def run_test(url: str):
-    # Test the API
-    print("Testing the API with a test message")
-    test_message = """
-        {   
-            "model": "",
-            "messages":
-            [
-                {"role": "system", "content": "You are a helpful assistant. Be very brief and precise"}, 
-                {"role": "user", "content": "How to make pizza in five steps?"}
-            ], 
-                "max_tokens": 1200
-        }
-    """
-    json_message = json.loads(test_message)
-    response = requests.post(url + "/completions", json=json_message)
-    if response.status_code != 200:
-        print(f"{RED}Error: {response.status_code}{CLEAR}")
-        raise Exception("Error in the API")
-
-    json_response = json.loads(response.text)
-    print(f"Question: {json_response['question']}")
-    print(f"Answer: {json_response['choices'][0]['message']['content']}")
-    print(f"{BLUE}KPI: {json_response['kpi']}{CLEAR}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Test SLM Engine Using HTTP API")
-
-    # Adding arguments
-    parser.add_argument(
-        "-s",
-        "--server_binary_path",
-        help="Path to the SLM Server binary",
-    )
-    parser.add_argument(
-        "-u",
-        "--url",
-        help="URL of the server to test",
-    )
-
-    parser.add_argument("-m", "--model_path", help="Path to the ONNX model")
-
-    args = parser.parse_args()
-    if args.server_binary_path is None:
-        if args.url is None:
-            raise Exception("Either server_binary_path or url must be provided")
-        # Run the test using existing server
-        run_test(args.url)
-    else:
-        # Launch the server and run the
-        launch_server(args.server_binary_path, args.model_path)
-        url = "http://localhost:8000"
-        run_test(url)
diff --git a/examples/slm_engine/test/test_tool_calling.py b/examples/slm_engine/test/test_tool_calling.py
deleted file mode 100755
index ea8afc165b..0000000000
--- a/examples/slm_engine/test/test_tool_calling.py
+++ /dev/null
@@ -1,165 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script for SLM server with tool calling functionality
-This script tests the booking_flight_tickets and booking_hotels tools
-"""
-
-import json
-
-import requests
-
-
-def test_tool_calling():
-    """Test tool calling functionality with flight and hotel booking"""
-
-    url = "http://localhost:8080/completions"
-    headers = {"Content-Type": "application/json"}
-
-    # Test case 1: Flight and Hotel booking
-    print("=" * 70)
-    print("Test 1: Flight and Hotel booking from Beijing to Paris")
-    print("=" * 70)
-
-    payload1 = {
-        "messages": [
-            {"role": "system", "content": "You are a helpful assistant with these tools."},
-            {
-                "role": "user",
-                "content": "book flight ticket from Beijing to Paris(using airport code) in 2025-12-04 to 2025-12-10 , then book hotel from 2025-12-04 to 2025-12-10 in Paris",
-            },
-        ],
-        "tools": [
-            {
-                "name": "booking_flight_tickets",
-                "description": "booking flights",
-                "parameters": {
-                    "origin_airport_code": {"description": "The name of Departure airport code", "type": "string"},
-                    "destination_airport_code": {
-                        "description": "The name of Destination airport code",
-                        "type": "string",
-                    },
-                    "departure_date": {"description": "The date of outbound flight", "type": "string"},
-                    "return_date": {"description": "The date of return flight", "type": "string"},
-                },
-            },
-            {
-                "name": "booking_hotels",
-                "description": "booking hotel",
-                "parameters": {
-                    "destination": {"description": "The name of the city", "type": "string"},
-                    "check_in_date": {"description": "The date of check in", "type": "string"},
-                    "checkout_date": {"description": "The date of check out", "type": "string"},
-                },
-            },
-        ],
-        "temperature": 0.00001,
-        "max_tokens": 4096,
-        "top_p": 1.0,
-        "do_sample": False,
-    }
-
-    try:
-        response1 = requests.post(url, headers=headers, json=payload1, timeout=30)
-        print(f"Status Code: {response1.status_code}")
-        if response1.status_code == 200:
-            result = response1.json()
-            print("Response:")
-            print(json.dumps(result, indent=2, ensure_ascii=False))
-        else:
-            print(f"Error: {response1.text}")
-    except requests.exceptions.RequestException as e:
-        print(f"Request failed: {e}")
-
-    print("\n" + "=" * 70)
-    print("Test 2: Flight booking only (JFK to LHR)")
-    print("=" * 70)
-
-    # Test case 2: Flight only
-    payload2 = {
-        "messages": [
-            {"role": "system", "content": "You are a helpful travel assistant."},
-            {
-                "role": "user",
-                "content": "I need to book a flight from JFK to LHR on 2025-08-15, returning on 2025-08-22",
-            },
-        ],
-        "tools": [
-            {
-                "name": "booking_flight_tickets",
-                "description": "booking flights",
-                "parameters": {
-                    "origin_airport_code": {"description": "The name of Departure airport code", "type": "string"},
-                    "destination_airport_code": {
-                        "description": "The name of Destination airport code",
-                        "type": "string",
-                    },
-                    "departure_date": {"description": "The date of outbound flight", "type": "string"},
-                    "return_date": {"description": "The date of return flight", "type": "string"},
-                },
-            }
-        ],
-        "temperature": 0.1,
-        "max_tokens": 2048,
-        "top_p": 0.9,
-        "do_sample": True,
-    }
-
-    try:
-        response2 = requests.post(url, headers=headers, json=payload2, timeout=30)
-        print(f"Status Code: {response2.status_code}")
-        if response2.status_code == 200:
-            result = response2.json()
-            print("Response:")
-            print(json.dumps(result, indent=2, ensure_ascii=False))
-        else:
-            print(f"Error: {response2.text}")
-    except requests.exceptions.RequestException as e:
-        print(f"Request failed: {e}")
-
-    print("\n" + "=" * 70)
-    print("Test 3: Hotel booking only (Tokyo)")
-    print("=" * 70)
-
-    # Test case 3: Hotel only
-    payload3 = {
-        "messages": [
-            {"role": "system", "content": "You are a helpful hotel booking assistant."},
-            {"role": "user", "content": "I need to book a hotel in Tokyo from 2025-09-01 to 2025-09-05"},
-        ],
-        "tools": [
-            {
-                "name": "booking_hotels",
-                "description": "booking hotel",
-                "parameters": {
-                    "destination": {"description": "The name of the city", "type": "string"},
-                    "check_in_date": {"description": "The date of check in", "type": "string"},
-                    "checkout_date": {"description": "The date of check out", "type": "string"},
-                },
-            }
-        ],
-        "temperature": 0.2,
-        "max_tokens": 1024,
-        "top_p": 0.95,
-        "do_sample": True,
-    }
-
-    try:
-        response3 = requests.post(url, headers=headers, json=payload3, timeout=30)
-        print(f"Status Code: {response3.status_code}")
-        if response3.status_code == 200:
-            result = response3.json()
-            print("Response:")
-            print(json.dumps(result, indent=2, ensure_ascii=False))
-        else:
-            print(f"Error: {response3.text}")
-    except requests.exceptions.RequestException as e:
-        print(f"Request failed: {e}")
-
-    print("\n" + "=" * 70)
-    print("All tool calling tests completed!")
-    print("=" * 70)
-
-
-if __name__ == "__main__":
-    print("Starting SLM Server Tool Calling Tests...")
-    test_tool_calling()
diff --git a/nuget.config b/nuget.config
index ac23ff5a30..eecb601929 100644
--- a/nuget.config
+++ b/nuget.config
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+﻿<?xml version="1.0" encoding="utf-8"?>
 <configuration>
   <packageSources>
     <clear />
diff --git a/src/csharp/Model.cs b/src/csharp/Model.cs
index 63fd7f96e4..5f76f67de6 100644
--- a/src/csharp/Model.cs
+++ b/src/csharp/Model.cs
@@ -23,6 +23,20 @@ public Model(Config config)
 
         internal IntPtr Handle { get { return _modelHandle; } }
 
+        public string GetModelType()
+        {
+            IntPtr outStr = IntPtr.Zero;
+            try
+            {
+                Result.VerifySuccess(NativeMethods.OgaModelGetType(_modelHandle, out outStr));
+                return StringUtils.FromUtf8(outStr);
+            }
+            finally
+            {
+                NativeMethods.OgaDestroyString(outStr);
+            }
+        }
+
         ~Model()
         {
             Dispose(false);
diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs
index 1ba9aac906..a1121c672c 100644
--- a/src/csharp/NativeMethods.cs
+++ b/src/csharp/NativeMethods.cs
@@ -90,6 +90,10 @@ internal class NativeLib
         public static extern IntPtr /* OgaResult* */ OgaCreateModelFromConfig(IntPtr /* const OgaConfig* */ config,
                                                                               out IntPtr /* OgaModel** */ model);
 
+        [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)]
+        public static extern IntPtr /* OgaResult* */ OgaModelGetType(IntPtr /* OgaModel* */ model,
+                                                                     out IntPtr /* const char** */ type);
+
         [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)]
         public static extern void OgaDestroyModel(IntPtr /* OgaModel* */ model);
 
diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp
index 330ff1b43c..cb7b020a0b 100644
--- a/test/c_api_tests.cpp
+++ b/test/c_api_tests.cpp
@@ -105,12 +105,12 @@ TEST(CAPITests, TokenizerCAPI) {
 
   // Stream Decode one at a time
   for (size_t i = 0; i < sequences->Count(); i++) {
-    auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+    auto stream = OgaTokenizerStream::Create(*tokenizer);
 
     auto* sequence = sequences->SequenceData(i);
     std::string stream_result;
     for (size_t j = 0; j < sequences->SequenceCount(i); j++) {
-      stream_result += tokenizer_stream->Decode(sequence[j]);
+      stream_result += stream->Decode(sequence[j]);
     }
     std::cout << "Stream decoded string:" << stream_result << std::endl;
     if (strcmp(input_strings[i], stream_result.c_str()) != 0)
@@ -167,12 +167,12 @@ TEST(CAPITests, TokenizerUpdateOptions) {
 
   // Stream Decode one at a time
   for (size_t i = 0; i < sequences->Count(); i++) {
-    auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+    auto stream = OgaTokenizerStream::Create(*tokenizer);
 
     auto* sequence = sequences->SequenceData(i);
     std::string stream_result;
     for (size_t j = 0; j < sequences->SequenceCount(i); j++) {
-      stream_result += tokenizer_stream->Decode(sequence[j]);
+      stream_result += stream->Decode(sequence[j]);
     }
     std::cout << "Stream decoded string:" << stream_result << std::endl;
     if (strcmp(input_strings[i], stream_result.c_str()) != 0)
@@ -381,7 +381,7 @@ TEST(CAPIEngineTests, EndToEndPhiBatch) {
 
   std::vector<std::unique_ptr<OgaRequest>> requests;
   std::vector<std::unique_ptr<OgaGeneratorParams>> params;
-  std::vector<std::unique_ptr<OgaTokenizerStream>> tokenizer_streams;
+  std::vector<std::unique_ptr<OgaTokenizerStream>> streams;
   std::array<std::vector<int32_t>, batch_size> generated_tokens;
   for (auto& string : input_strings) {
     auto input_sequences = OgaSequences::Create();
@@ -394,7 +394,7 @@ TEST(CAPIEngineTests, EndToEndPhiBatch) {
     requests.push_back(OgaRequest::Create(*params.back()));
     requests.back()->AddTokens(*input_sequences);
     requests.back()->SetOpaqueData(&generated_tokens[requests.size() - 1]);
-    tokenizer_streams.emplace_back(OgaTokenizerStream::Create(*tokenizer));
+    streams.emplace_back(OgaTokenizerStream::Create(*tokenizer));
 
     engine->Add(*requests.back());
   }
@@ -444,7 +444,7 @@ TEST(CAPIEngineTests, EndToEndPhiStaggeredBatch) {
 
   std::vector<std::unique_ptr<OgaRequest>> requests;
   std::vector<std::unique_ptr<OgaGeneratorParams>> params;
-  std::vector<std::unique_ptr<OgaTokenizerStream>> tokenizer_streams;
+  std::vector<std::unique_ptr<OgaTokenizerStream>> streams;
   std::array<std::vector<int32_t>, batch_size> generated_tokens;
   for (auto& string : input_strings) {
     auto input_sequences = OgaSequences::Create();
@@ -457,7 +457,7 @@ TEST(CAPIEngineTests, EndToEndPhiStaggeredBatch) {
     requests.push_back(OgaRequest::Create(*params.back()));
     requests.back()->AddTokens(*input_sequences);
     requests.back()->SetOpaqueData(&generated_tokens[requests.size() - 1]);
-    tokenizer_streams.emplace_back(OgaTokenizerStream::Create(*tokenizer));
+    streams.emplace_back(OgaTokenizerStream::Create(*tokenizer));
   }
 
   // Add the first request to the engine
@@ -891,7 +891,7 @@ TEST(CAPITests, SetTerminate) {
     generator->SetRuntimeOption("terminate_session", "1");
   };
 
-  auto GenerateOutput = [](OgaGenerator* generator, std::unique_ptr<OgaTokenizerStream> tokenizer_stream) {
+  auto GenerateOutput = [](OgaGenerator* generator, std::unique_ptr<OgaTokenizerStream> stream) {
     EXPECT_THROW({
       while (!generator->IsDone()) {
         generator->GenerateNextToken();
@@ -900,7 +900,7 @@ TEST(CAPITests, SetTerminate) {
 
   auto model = OgaModel::Create(PHI2_PATH);
   auto tokenizer = OgaTokenizer::Create(*model);
-  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+  auto stream = OgaTokenizerStream::Create(*tokenizer);
 
   const char* input_string = "She sells sea shells by the sea shore.";
   auto input_sequences = OgaSequences::Create();
@@ -912,7 +912,7 @@ TEST(CAPITests, SetTerminate) {
   generator->AppendTokenSequences(*input_sequences);
   EXPECT_EQ(generator->IsSessionTerminated(), false);
   std::vector<std::thread> threads;
-  threads.push_back(std::thread(GenerateOutput, generator.get(), std::move(tokenizer_stream)));
+  threads.push_back(std::thread(GenerateOutput, generator.get(), std::move(stream)));
   threads.push_back(std::thread(GeneratorSetTerminateCall, generator.get()));
 
   for (auto& th : threads) {
@@ -1335,7 +1335,7 @@ TEST(CAPITests, SetGuidance) {
 
   auto model = OgaModel::Create(PHI2_PATH);
   auto tokenizer = OgaTokenizer::Create(*model);
-  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+  auto stream = OgaTokenizerStream::Create(*tokenizer);
 
   const char* input_string = "who are you?";
   auto input_sequences = OgaSequences::Create();
diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs
index 4e6a05b5cd..3e4f216f88 100644
--- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs
+++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs
@@ -161,9 +161,9 @@ public void TestGreedySearch()
                             generator.AppendTokens(inputIDs);
 
                             Assert.False(generator.IsDone());
-                            Assert.Equal(generatorParams.GetSearchNumber("max_length"), maxLength);
-                            Assert.Equal(generatorParams.GetSearchBool("early_stopping"), true);
-                            Assert.Equal((int)generator.TokenCount(), generator.GetSequence(0).Length);
+                            Assert.Equal(maxLength, generatorParams.GetSearchNumber("max_length"));
+                            Assert.Equal(true, generatorParams.GetSearchBool("early_stopping"));
+                            Assert.Equal(generator.GetSequence(0).Length, (int)generator.TokenCount());
 
                             while (!generator.IsDone())
                             {
@@ -175,7 +175,7 @@ public void TestGreedySearch()
                                 var sequence = generator.GetSequence(i).ToArray();
                                 var expectedSequence = expectedOutput.Skip((int)i * (int)maxLength).Take((int)maxLength);
                                 Assert.Equal(expectedSequence, sequence);
-                                Assert.Equal((int)generator.TokenCount(), generator.GetSequence(i).Length);
+                                Assert.Equal(generator.GetSequence(i).Length, (int)generator.TokenCount());
                             }
                         }
                     }
diff --git a/test/python/_test_utils.py b/test/python/_test_utils.py
index 7621ba2b70..1931b2987d 100644
--- a/test/python/_test_utils.py
+++ b/test/python/_test_utils.py
@@ -60,13 +60,14 @@ def get_ci_data_path():
 def get_model_paths():
     # TODO: Uncomment the following models as needed in the CI pipeline.
 
+    # Format: model alias: (HF repo name, create only 1 layer)
     hf_paths = {
-        "phi-2": "microsoft/phi-2",
         # "olmo": "amd/AMD-OLMo-1B-SFT-DPO",
-        "qwen-2.5": "Qwen/Qwen2.5-0.5B",
         # "phi-3.5": "microsoft/Phi-3.5-mini-instruct",
         # "llama-3.2": "meta-llama/Llama-3.2-1B-instruct",
         # "granite-3.0": "ibm-granite/granite-3.0-2b-instruct",
+        "phi-4-mini": ("microsoft/Phi-4-mini-instruct", True),
+        "qwen-2.5-0.5b": ("Qwen/Qwen2.5-0.5B-Instruct", False),
     }
 
     ci_data_path = os.path.join(get_ci_data_path(), "pytorch")
@@ -75,24 +76,25 @@ def get_model_paths():
 
     # Note: If a model has over 4B parameters, please add a quantized version
     # to `ci_paths` instead of `hf_paths` to reduce file size and testing time.
+    # Format: model alias: (OS path, create only 1 layer)
     ci_paths = {
         # "llama-2": os.path.join(ci_data_path, "Llama-2-7B-Chat-GPTQ"),
         # "llama-3": os.path.join(ci_data_path, "Meta-Llama-3-8B-AWQ"),
         # "mistral-v0.2": os.path.join(ci_data_path, "Mistral-7B-Instruct-v0.2-GPTQ"),
-        "phi-2": os.path.join(ci_data_path, "phi2"),
+        "phi-2": (os.path.join(ci_data_path, "phi2"), True),
         # "gemma-2b": os.path.join(ci_data_path, "gemma-1.1-2b-it"),
         # "gemma-7b": os.path.join(ci_data_path, "gemma-7b-it-awq"),
         # "phi-3-mini": os.path.join(ci_data_path, "phi3-mini-128k-instruct"),
         # "gemma-2-2b": os.path.join(ci_data_path, "gemma-2-2b-it"),
         # "llama-3.2": os.path.join(ci_data_path, "llama-3.2b-1b-instruct"),
-        "qwen-2.5": os.path.join(ci_data_path, "qwen2.5-0.5b-instruct"),
+        # "qwen-2.5-0.5b": os.path.join(ci_data_path, "qwen2.5-0.5b-instruct"),
         # "nemotron-mini": os.path.join(ci_data_path, "nemotron-mini-4b"),
     }
 
     return ci_paths, hf_paths
 
 
-def download_model(model_name, input_path, output_path, precision, device, one_layer=True):
+def download_model(model_name, input_path, output_path, precision, device, one_layer):
     command = [
         sys.executable,
         "-m",
@@ -119,7 +121,7 @@ def download_model(model_name, input_path, output_path, precision, device, one_l
         device,
     ]
 
-    extra_options = ["--extra_options", "include_hidden_states=true"]
+    extra_options = ["--extra_options", "include_hidden_states=1", "hf_token=0", "hf_remote=0"]
     if device == "cpu" and precision == "int4":
         extra_options += ["int4_accuracy_level=4"]
     if one_layer:
@@ -139,15 +141,19 @@ def download_models(download_path, precision, device, log):
     log.debug(f"Downloading {len(ci_paths)} PyTorch models and {len(hf_paths)} Hugging Face models")
 
     # python -m onnxruntime_genai.models.builder -i <input_path> -o <output_path> -p <precision> -e <device>
-    for model_name, input_path in ci_paths.items():
-        output_path = os.path.join(download_path, model_name, precision, device)
-        log.debug(f"Downloading {model_name} from {input_path} to {output_path}")
-        if not os.path.exists(output_path):
-            download_model(None, input_path, output_path, precision, device)
-            output_paths.append(output_path)
+    for model_name, (input_path, one_layer) in ci_paths.items():
+        try:
+            output_path = os.path.join(download_path, model_name, precision, device)
+            log.debug(f"Downloading {model_name} from {input_path} to {output_path}")
+            if not os.path.exists(output_path):
+                download_model(None, input_path, output_path, precision, device, one_layer)
+                output_paths.append(output_path)
+        except Exception as e:
+            log.warning(f"Error: {e}. Skipping CI model.")
+            continue
 
     # python -m onnxruntime_genai.models.builder -m <model_name> -o <output_path> -p <precision> -e <device>
-    for model_name, hf_name in hf_paths.items():
+    for model_name, (hf_name, one_layer) in hf_paths.items():
         try:
             from huggingface_hub import model_info
 
@@ -163,7 +169,7 @@ def download_models(download_path, precision, device, log):
         log.debug(f"Downloading {model_name} from {hf_name} to {output_path}")
 
         if not os.path.exists(output_path):
-            download_model(hf_name, "", output_path, precision, device)
+            download_model(hf_name, "", output_path, precision, device, one_layer)
             output_paths.append(output_path)
 
     log.info(f"Successfully downloaded {len(output_paths)} models")
diff --git a/test/python/conftest.py b/test/python/conftest.py
index fac85ef11e..8e5fccd745 100644
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -43,6 +43,16 @@ def phi3_for(request):
     )
 
 
+@pytest.fixture
+def phi4_for(request):
+    return functools.partial(
+        get_path_for_model,
+        request.config.getoption("--test_models"),
+        "phi-4-mini",
+        "int4",
+    )
+
+
 @pytest.fixture
 def gemma_for(request):
     return functools.partial(
@@ -68,7 +78,7 @@ def qwen_for(request):
     return functools.partial(
         get_path_for_model,
         request.config.getoption("--test_models"),
-        "qwen-2.5",
+        "qwen-2.5-0.5b",
         "int4",
     )
 
diff --git a/test/python/special_tokens.py b/test/python/special_tokens.py
new file mode 100644
index 0000000000..24967b66f0
--- /dev/null
+++ b/test/python/special_tokens.py
@@ -0,0 +1,55 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#
+# Run this script to mark certain token ids as special.
+#
+# The script scans tokenizer.json line by line. Once a line containing either
+# tool call token string has been seen, the next line containing
+# '"special": false' has it rewritten to '"special": true'. Output goes to a
+# temp file next to tokenizer.json, which then replaces the original.
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-p", "--path", required=True, type=str, help="Path to tokenizer.json")
+parser.add_argument("-s", "--start_tool_call", required=True, type=str, help="String representation of starting tool call token")
+parser.add_argument("-e", "--end_tool_call", required=True, type=str, help="String representation of ending tool call token")
+
+args = parser.parse_args()
+
+# Validate with parser.error rather than assert: asserts are stripped under
+# `python -O`, and parser.error prints the usage text and exits non-zero.
+if not os.path.exists(args.path):
+    parser.error("Invalid path to tokenizer.json")
+if os.path.basename(args.path) != "tokenizer.json":
+    parser.error("Path is not to a tokenizer.json file")
+
+# Use raw bytes when making comparisons
+start_b = args.start_tool_call.encode("ascii", "strict")
+end_b = args.end_tool_call.encode("ascii", "strict")
+false_b = b'"special": false'
+true_b = b'"special": true'
+
+# Swap only the file name. A plain str.replace on the whole path would also
+# rewrite any parent directory whose name contains "tokenizer.json".
+temp_path = os.path.join(os.path.dirname(args.path), "temp.json")
+
+seen = False
+try:
+    with open(args.path, "rb") as in_file, open(temp_path, "wb") as out_file:
+        for line in in_file:
+            if start_b in line or end_b in line:
+                seen = True
+
+            if seen and false_b in line:
+                out_file.write(line.replace(false_b, true_b))
+                seen = False
+            else:
+                out_file.write(line)
+
+    os.replace(temp_path, args.path)
+except BaseException:
+    # Do not leave a partially written temp file behind on failure
+    if os.path.exists(temp_path):
+        os.remove(temp_path)
+    raise
diff --git a/test/python/test_onnxruntime_genai.py b/test/python/test_onnxruntime_genai.py
index 7286139fda..b46b815a99 100644
--- a/test/python/test_onnxruntime_genai.py
+++ b/test/python/test_onnxruntime_genai.py
@@ -76,13 +76,11 @@ def main():
     log.info("Running onnxruntime-genai tests pipeline")
 
     # Get INT4 ONNX models
-    output_paths = []
-    if not (sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8):
-        output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu", log)
-        if og.is_cuda_available():
-            output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda", log)
-        if og.is_dml_available():
-            output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml", log)
+    output_paths = download_models(os.path.abspath(args.test_models), "int4", "cpu", log)
+    if og.is_cuda_available():
+        output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda", log)
+    if og.is_dml_available():
+        output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml", log)
 
     # Run ONNX Runtime GenAI tests
     run_onnxruntime_genai_api_tests(os.path.abspath(args.cwd), log, os.path.abspath(args.test_models))
diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py
index 837bd4726a..2c85707bf5 100644
--- a/test/python/test_onnxruntime_genai_api.py
+++ b/test/python/test_onnxruntime_genai_api.py
@@ -351,10 +351,10 @@ def test_phi2_chat_template(device, phi2_for):
     reason="Model is not available on arm64.",
 )
 @pytest.mark.parametrize("device", devices)
-def test_tokenizer_stream(device, phi2_for):
+def test_stream(device, phi2_for):
     model = og.Model(phi2_for(device))
     tokenizer = og.Tokenizer(model)
-    tokenizer_stream = tokenizer.create_stream()
+    stream = tokenizer.create_stream()
 
     prompts = [
         "This is a test.",
@@ -366,7 +366,7 @@ def test_tokenizer_stream(device, phi2_for):
         sequence = tokenizer.encode(prompt)
         decoded_string = ""
         for token in sequence:
-            decoded_string += tokenizer_stream.decode(token)
+            decoded_string += stream.decode(token)
 
         assert decoded_string == prompt
 
diff --git a/test/python/test_onnxruntime_genai_e2e.py b/test/python/test_onnxruntime_genai_e2e.py
index f28d674e21..5e82e4fdca 100644
--- a/test/python/test_onnxruntime_genai_e2e.py
+++ b/test/python/test_onnxruntime_genai_e2e.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
-from __future__ import annotations
 
 import argparse
 import json
@@ -78,6 +77,87 @@ def run_whisper():
         run_subprocess(command, cwd=cwd, log=log).check_returncode()
 
 
+def run_tool_calling():
+    log.debug("Running tool calling Python E2E Tests")
+
+    cwd = os.path.dirname(os.path.abspath(__file__))
+    tool_call_models = [("qwen-2.5-0.5b", "<tool_call>", "</tool_call>")]
+
+    # Runtime settings
+    max_length = 256
+    user_prompt = "What is the weather in Redmond, WA?"
+    response_format = "lark_grammar"
+
+    for (model_name, tool_call_start, tool_call_end) in tool_call_models:
+        for (precision, execution_provider) in [("int4", "cpu")]: # TODO: add ("int4", "cuda"), ("int4", "dml") in CIs later
+            model_path = os.path.join(cwd, "..", "test_models", model_name, precision, execution_provider)
+            if not os.path.exists(model_path): continue
+
+            # Run special_tokens.py to mark tool call token ids as special
+            command = [
+                sys.executable,
+                os.path.join(cwd, "special_tokens.py"),
+                "-p",
+                os.path.join(model_path, "tokenizer.json"),
+                "-s",
+                tool_call_start,
+                "-e",
+                tool_call_end,
+            ]
+            run_subprocess(command, cwd=cwd, log=log).check_returncode()
+
+            # Run model-qa.py for inference
+            command = [
+                sys.executable,
+                os.path.join(cwd, "..", "..", "examples", "python", "model-qa.py"),
+                "-m",
+                model_path,
+                "-e",
+                execution_provider,
+                "--max_length",
+                str(max_length),
+                "--response_format",
+                response_format,
+                "--tools_file",
+                os.path.join(cwd, "..", "test_models", "tool-definitions", "weather.json"),
+                "--tool_call_start",
+                tool_call_start,
+                "--tool_call_end",
+                tool_call_end,
+                "--user_prompt",
+                user_prompt,
+                "--tool_output",
+                "--non_interactive",
+                "--verbose",
+            ]
+            run_subprocess(command, cwd=cwd, log=log).check_returncode()
+
+            # Run model_qa.cpp for inference
+            command = [
+                os.path.join(cwd, "..", "..", "examples", "c", "build", f"{'Release' if sys.platform.startswith('win') else ''}", f"model_qa{'.exe' if sys.platform.startswith('win') else ''}"),
+                "-m",
+                model_path,
+                "-e",
+                execution_provider,
+                "--max_length",
+                str(max_length),
+                "--response_format",
+                response_format,
+                "--tools_file",
+                os.path.join(cwd, "..", "test_models", "tool-definitions", "weather.json"),
+                "--tool_call_start",
+                tool_call_start,
+                "--tool_call_end",
+                tool_call_end,
+                "--user_prompt",
+                user_prompt,
+                "--tool_output",
+                "--non_interactive",
+                "--verbose",
+            ]
+            run_subprocess(command, cwd=cwd, log=log).check_returncode()
+
+
 def get_args():
     parser = argparse.ArgumentParser()
 
@@ -106,3 +186,6 @@ def get_args():
 
     # Run Whisper E2E tests
     run_whisper()
+
+    # Run tool calling E2E tests
+    run_tool_calling()
diff --git a/test/test_models/grammars/blog.sample.json b/test/test_models/grammars/blog.sample.json
deleted file mode 100644
index 592499a6e1..0000000000
--- a/test/test_models/grammars/blog.sample.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-  "title": "New Blog Post",
-  "content": "This is the content of the blog post...",
-  "publishedDate": "2023-08-25T15:00:00Z",
-  "author": {
-    "username": "authoruser",
-    "email": "author@example.com",
-    "fullName": "Author User",
-    "age": 30,
-    "location": "Earth",
-    "interests": [
-      "Technology",
-      "Foo"
-    ]
-  },
-  "tags": [
-    "Technology",
-    "Programming"
-  ]
-}
\ No newline at end of file
diff --git a/test/test_models/grammars/blog.schema.json b/test/test_models/grammars/blog.schema.json
deleted file mode 100644
index 11e042c29c..0000000000
--- a/test/test_models/grammars/blog.schema.json
+++ /dev/null
@@ -1,54 +0,0 @@
-{
-    "description": "A representation of a blog post",
-    "type": "object",
-    "required": [
-        "title",
-        "content",
-        "author"
-    ],
-    "additionalProperties": false,
-    "properties": {
-        "title": {
-            "type": "string"
-        },
-        "content": {
-            "type": "string"
-        },
-        "publishedDate": {
-            "type": "string"
-        },
-        "author": {
-            "type": "object",
-            "properties": {
-                "username": {
-                    "type": "string"
-                },
-                "email": {
-                    "type": "string"
-                },
-                "fullName": {
-                    "type": "string"
-                },
-                "age": {
-                    "type": "integer"
-                },
-                "location": {
-                    "type": "string"
-                },
-                "interests": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            },
-            "additionalProperties": false
-        },
-        "tags": {
-            "type": "array",
-            "items": {
-                "type": "string"
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/test/test_models/grammars/grammar_multiple_functions.txt b/test/test_models/grammars/grammar_multiple_functions.txt
deleted file mode 100644
index 42fd6c6342..0000000000
--- a/test/test_models/grammars/grammar_multiple_functions.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-start: TEXT | fun_call
-TEXT: /[^{](.|\n)*/
-fun_call: <|tool_call|> %json { "anyOf": [ {
-  "description": "How to get the weather for a city",
-  "type": "object",
-  "required": ["name", "parameters"],
-  "additionalProperties": false,
-  "properties": {
-    "name": { "const": "get_weather" },
-    "parameters": {
-      "type": "object",
-      "properties": {
-        "city": { "type": "string" }
-      },
-      "required": ["city"],
-      "additionalProperties": false
-    }
-  }
-  },
-  {
-  "description": "How to get the population for a city",
-  "type": "object",
-  "required": ["name", "parameters"],
-  "additionalProperties": false,
-  "properties": {
-    "name": { "const": "get_population" },
-    "parameters": {
-      "type": "object",
-      "properties": {
-        "city": { "type": "string" }
-      },
-      "required": ["city"],
-      "additionalProperties": false
-    }
-  }
-  }
-]
-}
\ No newline at end of file
diff --git a/test/test_models/grammars/grammar_multiple_inputs.txt b/test/test_models/grammars/grammar_multiple_inputs.txt
deleted file mode 100644
index 1065a4123b..0000000000
--- a/test/test_models/grammars/grammar_multiple_inputs.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-start: TEXT | fun_call
-TEXT: /[^{](.|\n)*/
-fun_call: <|tool_call|> %json {
-  "description": "Move or rename files and directories. Can move files between directories and rename them in a single operation. If the destination exists, the operation will fail. Works across different directories and can be used for simple renaming within the same directory. Both source and destination must be within allowed directories.",
-  "type": "object",
-  "required": ["name", "parameters"],
-  "additionalProperties": false,
-  "properties": {
-    "name": { "const": "move_file" },
-    "parameters": {
-      "type": "object",
-      "properties": {
-        "source": { "type": "string", "description": "source of the file"}, "destination": { "type": "string", "description": "destination of the file"}
-      },
-      "required": ["source", "destination"],
-      "additionalProperties": false
-    }
-  }
-}
\ No newline at end of file
diff --git a/test/test_models/grammars/grammar_read_files.txt b/test/test_models/grammars/grammar_read_files.txt
deleted file mode 100644
index f8b288e593..0000000000
--- a/test/test_models/grammars/grammar_read_files.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-start: TEXT | fun_call
-TEXT: /[^{](.|\\n)*/
-fun_call: <|tool_call|> %json {"anyOf": [{"description": "Read the complete contents of a file from the file system. Handles various text encodings and provides detailed error messages if the file cannot be read. Use this tool when you need to examine the contents of a single file. Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "read_file"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"}}, "required": ["path"], "additionalProperties": false}}},{"description": "Read the contents of multiple files simultaneously. This is more efficient than reading files one by one when you need to analyze or compare multiple files. Each file's content is returned with its path as a reference. Failed reads for individual files won't stop the entire operation. Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "read_multiple_files"}, "parameters": {"type": "object", "properties": {"paths":{"type":"array","items":{"type":"string"}}}, "required": ["paths"], "additionalProperties": false}}},{"description": "Create a new file or completely overwrite an existing file with new content. Use with caution as it will overwrite existing files without warning. Handles text content with proper encoding. Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "write_file"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"},"content":{"type":"string"}}, "required": ["path","content"], "additionalProperties": false}}},{"description": "Make line-based edits to a text file. Each edit replaces exact line sequences with new content. Returns a git-style diff showing the changes made. 
Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "edit_file"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"},"edits":{"type":"array","items":{"type":"object","properties":{"oldText":{"type":"string","description":"Text to search for - must match exactly"},"newText":{"type":"string","description":"Text to replace with"}},"required":["oldText","newText"],"additionalProperties":false}},"dryRun":{"type":"boolean","default":false,"description":"Preview changes using git-style diff format"}}, "required": ["path","edits"], "additionalProperties": false}}},{"description": "Create a new directory or ensure a directory exists. Can create multiple nested directories in one operation. If the directory already exists, this operation will succeed silently. Perfect for setting up directory structures for projects or ensuring required paths exist. Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "create_directory"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"}}, "required": ["path"], "additionalProperties": false}}},{"description": "Get a detailed listing of all files and directories in a specified path. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is essential for understanding directory structure and finding specific files within a directory. Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "list_directory"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"}}, "required": ["path"], "additionalProperties": false}}},{"description": "Get a recursive tree view of files and directories as a JSON structure. 
Each entry includes 'name', 'type' (file/directory), and 'children' for directories. Files have no children array, while directories always have a children array (which may be empty). The output is formatted with 2-space indentation for readability. Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "directory_tree"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"}}, "required": ["path"], "additionalProperties": false}}},{"description": "Move or rename files and directories. Can move files between directories and rename them in a single operation. If the destination exists, the operation will fail. Works across different directories and can be used for simple renaming within the same directory. Both source and destination must be within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "move_file"}, "parameters": {"type": "object", "properties": {"source":{"type":"string"},"destination":{"type":"string"}}, "required": ["source","destination"], "additionalProperties": false}}},{"description": "Recursively search for files and directories matching a pattern. Searches through all subdirectories from the starting path. The search is case-insensitive and matches partial names. Returns full paths to all matching items. Great for finding files when you don't know their exact location. 
Only searches within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "search_files"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"},"pattern":{"type":"string"},"excludePatterns":{"type":"array","items":{"type":"string"},"default":[]}}, "required": ["path","pattern"], "additionalProperties": false}}},{"description": "Retrieve detailed metadata about a file or directory. Returns comprehensive information including size, creation time, last modified time, permissions, and type. This tool is perfect for understanding file characteristics without reading the actual content. Only works within allowed directories.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "get_file_info"}, "parameters": {"type": "object", "properties": {"path":{"type":"string"}}, "required": ["path"], "additionalProperties": false}}},{"description": "Returns the list of directories that this server is allowed to access. Use this to understand which directories are available before trying to access files.", "type": "object", "required": ["name", "parameters"], "additionalProperties": false, "properties": {"name": {"const": "list_allowed_directories"}, "parameters": {"type": "object", "properties": {}, "required": [], "additionalProperties": false}}}]}
diff --git a/test/test_models/grammars/weather_grammar.txt b/test/test_models/grammars/weather_grammar.txt
deleted file mode 100644
index 53df8ad16d..0000000000
--- a/test/test_models/grammars/weather_grammar.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-start: TEXT | fun_call
-TEXT: /[^{](.|\n)*/
-fun_call: <|tool_call|> %json {
-  "description": "How to get the weather for a city",
-  "type": "object",
-  "required": ["name", "parameters"],
-  "additionalProperties": false,
-  "properties": {
-    "name": { "const": "get_weather" },
-    "parameters": {
-      "type": "object",
-      "properties": {
-        "city": { "type": "string" }
-      },
-      "required": ["city"],
-      "additionalProperties": false
-    }
-  }
-}
\ No newline at end of file
diff --git a/test/test_models/grammars/weather_population.json b/test/test_models/grammars/weather_population.json
deleted file mode 100644
index cfb89e3a12..0000000000
--- a/test/test_models/grammars/weather_population.json
+++ /dev/null
@@ -1,53 +0,0 @@
-{ "anyOf": [
-  {
-    "description": "How to get the statistics for a city",
-    "type": "object",
-    "required": ["name", "parameters"],
-    "additionalProperties": false,
-    "properties": {
-      "name": { "const": "get_statistics" },
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "city": { "type": "string" }
-        },
-        "required": ["city"],
-        "additionalProperties": false
-      }
-    }
-  },
-  {
-    "description": "How to get the weather for a city",
-    "type": "object",
-    "required": ["name", "parameters"],
-    "additionalProperties": false,
-    "properties": {
-      "name": { "const": "get_weather" },
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "city": { "type": "string" }
-        },
-        "required": ["city"],
-        "additionalProperties": false
-      }
-    }
-  },
-  {
-    "description": "How to get the population for a city",
-    "type": "object",
-    "required": ["name", "parameters"],
-    "additionalProperties": false,
-    "properties": {
-      "name": { "const": "get_population" },
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "city": { "type": "string" }
-        },
-        "required": ["city"],
-        "additionalProperties": false
-      }
-    }
-  }
-] }
diff --git a/test/test_models/grammars/weather_schema.json b/test/test_models/grammars/weather_schema.json
deleted file mode 100644
index e9e6f81730..0000000000
--- a/test/test_models/grammars/weather_schema.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "description": "How to get the weather for a city",
-  "type": "object",
-  "required": ["name", "parameters"],
-  "additionalProperties": false,
-  "properties": {
-    "name": { "const": "get_weather" },
-    "parameters": {
-      "type": "object",
-      "properties": {
-        "city": { "type": "string" }
-      },
-      "required": ["city"],
-      "additionalProperties": false
-    }
-  }
-}
diff --git a/test/test_models/tool-definitions/filesystem.json b/test/test_models/tool-definitions/filesystem.json
new file mode 100644
index 0000000000..c40179b939
--- /dev/null
+++ b/test/test_models/tool-definitions/filesystem.json
@@ -0,0 +1,254 @@
+[
+    {
+        "type": "function",
+        "function": {
+            "name": "read_file",
+            "description": "Read the complete contents of a file from the file system. Handles various text encodings and provides detailed error messages if the file cannot be read. Use this tool when you need to examine the contents of a single file. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "path"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "read_multiple_files",
+            "description": "Read the contents of multiple files simultaneously. This is more efficient than reading files one by one when you need to analyze or compare multiple files. Each file's content is returned with its path as a reference. Failed reads for individual files won't stop the entire operation. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "paths": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "required": [
+                    "paths"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "write_file",
+            "description": "Create a new file or completely overwrite an existing file with new content. Use with caution as it will overwrite existing files without warning. Handles text content with proper encoding. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    },
+                    "content": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "path",
+                    "content"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "edit_file",
+            "description": "Make line-based edits to a text file. Each edit replaces exact line sequences with new content. Returns a git-style diff showing the changes made. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    },
+                    "edits": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "oldText": {
+                                    "type": "string",
+                                    "description": "Text to search for - must match exactly"
+                                },
+                                "newText": {
+                                    "type": "string",
+                                    "description": "Text to replace with"
+                                }
+                            },
+                            "required": [
+                                "oldText",
+                                "newText"
+                            ],
+                            "additionalProperties": false
+                        }
+                    },
+                    "dryRun": {
+                        "type": "boolean",
+                        "default": false,
+                        "description": "Preview changes using git-style diff format"
+                    }
+                },
+                "required": [
+                    "path",
+                    "edits"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "create_directory",
+            "description": "Create a new directory or ensure a directory exists. Can create multiple nested directories in one operation. If the directory already exists, this operation will succeed silently. Perfect for setting up directory structures for projects or ensuring required paths exist. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "path"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "list_directory",
+            "description": "Get a detailed listing of all files and directories in a specified path. Results clearly distinguish between files and directories with [FILE] and [DIR] prefixes. This tool is essential for understanding directory structure and finding specific files within a directory. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "path"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "directory_tree",
+            "description": "Get a recursive tree view of files and directories as a JSON structure. Each entry includes 'name', 'type' (file/directory), and 'children' for directories. Files have no children array, while directories always have a children array (which may be empty). The output is formatted with 2-space indentation for readability. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "path"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "move_file",
+            "description": "Move or rename files and directories. Can move files between directories and rename them in a single operation. If the destination exists, the operation will fail. Works across different directories and can be used for simple renaming within the same directory. Both source and destination must be within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "source": {
+                        "type": "string"
+                    },
+                    "destination": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "source",
+                    "destination"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "search_files",
+            "description": "Recursively search for files and directories matching a pattern. Searches through all subdirectories from the starting path. The search is case-insensitive and matches partial names. Returns full paths to all matching items. Great for finding files when you don't know their exact location. Only searches within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    },
+                    "pattern": {
+                        "type": "string"
+                    },
+                    "excludePatterns": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                        "default": []
+                    }
+                },
+                "required": [
+                    "path",
+                    "pattern"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_file_info",
+            "description": "Retrieve detailed metadata about a file or directory. Returns comprehensive information including size, creation time, last modified time, permissions, and type. This tool is perfect for understanding file characteristics without reading the actual content. Only works within allowed directories.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "path"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "list_allowed_directories",
+            "description": "Returns the list of directories that this server is allowed to access. Use this to understand which directories are available before trying to access files.",
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+                "additionalProperties": false
+            }
+        }
+    }
+]
\ No newline at end of file
diff --git a/test/test_models/tool-definitions/ocr.json b/test/test_models/tool-definitions/ocr.json
new file mode 100644
index 0000000000..bd27978894
--- /dev/null
+++ b/test/test_models/tool-definitions/ocr.json
@@ -0,0 +1,85 @@
+[
+    {
+        "type": "function",
+        "function": {
+            "name": "perform_ocr",
+            "description": "Perform OCR on the provided input. Args: input_data: Can be one of: - File path to an image - URL to an image - Raw image bytes language: Tesseract language code (default: 'eng') config: Tesseract configuration options (default: '--oem 3 --psm 6') Returns: Extracted text from the image Usage: perform_ocr('/path/to/image.jpg') perform_ocr('https://example.com/image.jpg') perform_ocr(image_bytes) ",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "input_data": {
+                        "anyOf": [
+                            {
+                                "type": "string"
+                            },
+                            {
+                                "format": "binary",
+                                "type": "string"
+                            }
+                        ],
+                        "title": "Input Data"
+                    },
+                    "language": {
+                        "default": "eng",
+                        "title": "Language",
+                        "type": "string"
+                    },
+                    "config": {
+                        "default": "--oem 3 --psm 6",
+                        "title": "Config",
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "input_data"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_supported_languages",
+            "description": "Get list of supported OCR languages. Returns: List of supported language codes Usage: get_supported_languages() ",
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_output",
+            "description": "Returns the output for any information that is requested",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "text": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "text"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "exit",
+            "description": "Exit because the agent's task is complete",
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+                "additionalProperties": false
+            }
+        }
+    }
+]
\ No newline at end of file
diff --git a/test/test_models/tool-definitions/weather.json b/test/test_models/tool-definitions/weather.json
new file mode 100644
index 0000000000..14350bafda
--- /dev/null
+++ b/test/test_models/tool-definitions/weather.json
@@ -0,0 +1,59 @@
+[
+    {
+        "type": "function",
+        "function": {
+            "name": "get_statistics",
+            "description": "How to get the statistics for a city",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "city"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "How to get the weather for a city",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "city"
+                ],
+                "additionalProperties": false
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_population",
+            "description": "How to get the population for a city",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string"
+                    }
+                },
+                "required": [
+                    "city"
+                ],
+                "additionalProperties": false
+            }
+        }
+    }
+]
\ No newline at end of file