diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd39521d1d6..0610462aed9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -558,10 +558,6 @@ if(EXECUTORCH_BUILD_PYBIND)
     list(APPEND _dep_libs xnnpack_backend XNNPACK)
   endif()
 
-  if(EXECUTORCH_BUILD_CUSTOM)
-    list(APPEND _dep_libs custom_ops)
-  endif()
-
   if(EXECUTORCH_BUILD_QUANTIZED)
     target_link_options_shared_lib(quantized_ops_lib)
     list(APPEND _dep_libs quantized_kernels quantized_ops_lib)
@@ -571,6 +567,13 @@ if(EXECUTORCH_BUILD_PYBIND)
   if(EXECUTORCH_BUILD_CUSTOM_OPS_AOT AND NOT APPLE)
     list(APPEND _dep_libs custom_ops_aot_lib)
   endif()
+  # TODO(laryliu): Fix Linux duplicate registration problem. In GH CI worker
+  # libcustom_ops.a doesn't dedup with the one indirectly linked from
+  # libcustom_ops_aot_lib.a
+  if(EXECUTORCH_BUILD_CUSTOM AND APPLE)
+    target_link_options_shared_lib(custom_ops)
+    list(APPEND _dep_libs custom_ops)
+  endif()
   # compile options for pybind
 
   set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti
diff --git a/examples/models/llama2/custom_ops/TARGETS b/examples/models/llama2/custom_ops/TARGETS
index 199cbe363d0..195df3bb931 100644
--- a/examples/models/llama2/custom_ops/TARGETS
+++ b/examples/models/llama2/custom_ops/TARGETS
@@ -15,6 +15,7 @@ runtime.python_test(
     ],
     preload_deps = [
         ":custom_ops_aot_lib",
+        ":custom_ops_aot_py",
     ],
     deps = [
         "//caffe2:torch",
diff --git a/examples/models/llama2/custom_ops/test_sdpa_with_kv_cache.py b/examples/models/llama2/custom_ops/test_sdpa_with_kv_cache.py
index 949fdeab2c4..abf3abc0284 100644
--- a/examples/models/llama2/custom_ops/test_sdpa_with_kv_cache.py
+++ b/examples/models/llama2/custom_ops/test_sdpa_with_kv_cache.py
@@ -9,6 +9,8 @@
 import torch
 import torch.nn.functional as F
 
+from .sdpa_with_kv_cache import custom_ops_lib  # noqa
+
 
 class SDPATest(unittest.TestCase):
 
diff --git a/setup.py b/setup.py
index 6591123d5e6..56664281764 100644
--- a/setup.py
+++ b/setup.py
@@ -389,6 +389,7 @@ def run(self):
 
         if ShouldBuild.llama_custom_ops:
             cmake_args += [
+                "-DEXECUTORCH_BUILD_CUSTOM=ON",  # add llama sdpa ops to pybindings.
                 "-DEXECUTORCH_BUILD_CUSTOM_OPS_AOT=ON",
             ]
             build_args += ["--target", "custom_ops_aot_lib"]