diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 40a12ee9a..c5073a7bb 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -2312,18 +2312,9 @@ def __init__(
         self.lineinfo = lineinfo
         self.cc = cc
         self.arch = arch
-        if lto is False:
-            # WAR for apparent nvjitlink issue
-            lto = None
         self.lto = lto
         self.additional_flags = additional_flags
-
-        self.options = LinkerOptions(
-            max_register_count=self.max_registers,
-            lineinfo=lineinfo,
-            arch=arch,
-            link_time_optimization=lto,
-        )
+        self._has_ltoir = False
         self._complete = False
         self._object_codes = []
         self.linker = None  # need at least one program
@@ -2446,6 +2437,8 @@ def add_cu(self, cu, name="<cudapy-cu>"):
             print(obj.code)
 
         self._object_codes.append(obj)
+        if self.lto:
+            self._has_ltoir = True
 
     def add_cubin(self, cubin, name="<cudapy-cubin>"):
         obj = ObjectCode.from_cubin(cubin, name=name)
@@ -2454,6 +2447,7 @@ def add_cubin(self, cubin, name="<cudapy-cubin>"):
     def add_ltoir(self, ltoir, name="<cudapy-ltoir>"):
         obj = ObjectCode.from_ltoir(ltoir, name=name)
         self._object_codes.append(obj)
+        self._has_ltoir = True
 
     def add_fatbin(self, fatbin, name="<cudapy-fatbin>"):
         obj = ObjectCode.from_fatbin(fatbin, name=name)
@@ -2499,15 +2493,23 @@ def add_data(self, data, kind, name):
 
         fn(data, name)
 
-    def get_linked_ptx(self):
+    def _get_linker_options(self, ptx):
+        # Flags tied to LTO are only requested once LTOIR input was added.
+        # WAR for cuda-core < 0.4.0: disable with None, not False, because
+        # False still appended the flags (cuda-python PR #989, v0.4.0 fix).
+        lto_flag = True if self._has_ltoir else None
+        ptx_flag = True if (self._has_ltoir and ptx) else None
         options = LinkerOptions(
             max_register_count=self.max_registers,
             lineinfo=self.lineinfo,
             arch=self.arch,
-            link_time_optimization=True,
-            ptx=True,
+            link_time_optimization=lto_flag,
+            ptx=ptx_flag,
         )
+        return options
 
+    def get_linked_ptx(self):
+        options = self._get_linker_options(ptx=True)
         self.linker = Linker(*self._object_codes, options=options)
 
         result = self.linker.link("ptx")
@@ -2526,7 +2528,8 @@ def complete(self):
         cubin is a pointer to a internal buffer of cubin owned by the linker;
         thus, it should be loaded before the linker is destroyed.
         """
-        self.linker = Linker(*self._object_codes, options=self.options)
+        options = self._get_linker_options(ptx=False)
+        self.linker = Linker(*self._object_codes, options=options)
         result = self.linker.link("cubin")
         self.close()
         self._complete = True
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
index 23124ee06..2ca7493e4 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import numpy as np
+import os
 import pytest
 from numba.cuda.testing import unittest
 from numba.cuda.testing import (
@@ -109,6 +110,25 @@ def simple_lmem(A, B, dty):
         B[i] = C[i]
 
 
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+    # Declare the external function only when the LTO-IR binary can be
+    # located, so importing this module never raises NameError.
+    add_from_numba = cuda.declare_device(
+        "add_from_numba",
+        "int32(int32, int32)",
+        link=[test_device_functions_ltoir],
+    )
+
+
+def debuggable_kernel(result):
+    i = cuda.grid(1)
+    result[i] = add_from_numba(i, i)
+
+
 @skip_on_cudasim("Linking unsupported in the simulator")
 class TestLinker(CUDATestCase):
     @require_context
@@ -330,6 +350,12 @@ def test_get_local_mem_per_specialized(self):
         calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
         self.assertGreaterEqual(local_mem_size, calc_size)
 
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_debug_kernel_with_lto(self):
+        # Compiling a debug, unoptimized kernel that links LTO-IR device
+        # code must not raise; the kernel is compiled but never launched.
+        cuda.jit("void(int32[::1])", debug=True, opt=False)(debuggable_kernel)
+
     @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
     def test_link_for_different_cc(self):
         linker = _Linker(max_registers=0, cc=(7, 5), lto=True)