Skip to content

Conversation

@razvanlupusoru
Copy link
Contributor

In regions destined for GPU offload, computing an address_of yields the device address directly, so inserting a runtime call to get the address is unnecessary (and actually incorrect). This was already working for regions such as `gpu.launch`, and it now applies to acc regions as well.

In regions destined for GPU offload, computing an address_of means
getting device address directly - no need (and actually incorrect)
to insert a runtime call to get the address. This was already working
for regions such as `gpu.launch` - but now it applies to acc regions
as well.
@llvmbot llvmbot added the `flang` (Flang issues not falling into any other category) and `flang:fir-hlfir` labels Jan 9, 2026
@llvmbot
Copy link
Member

llvmbot commented Jan 9, 2026

@llvm/pr-subscribers-flang-fir-hlfir

Author: Razvan Lupusoru (razvanlupusoru)

Changes

In regions destined for GPU offload, computing an address_of yields the device address directly, so inserting a runtime call to get the address is unnecessary (and actually incorrect). This was already working for regions such as `gpu.launch`, and it now applies to acc regions as well.


Full diff: https://github.com/llvm/llvm-project/pull/175225.diff

2 Files Affected:

  • (modified) flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp (+6-2)
  • (modified) flang/test/Fir/CUDA/cuda-global-addr.mlir (+27)
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
index 424a8fd9d959b..352f8abde6093 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
@@ -29,6 +29,7 @@
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -49,9 +50,9 @@ namespace {
 static bool inDeviceContext(mlir::Operation *op) {
   if (op->getParentOfType<cuf::KernelOp>())
     return true;
-  if (auto funcOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
+  if (op->getParentOfType<mlir::acc::OffloadRegionOpInterface>())
     return true;
-  if (auto funcOp = op->getParentOfType<mlir::gpu::LaunchOp>())
+  if (auto funcOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
     return true;
   if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
     if (auto cudaProcAttr =
@@ -128,6 +129,9 @@ struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
     if (op.getResult().getUsers().empty())
       return success();
     if (auto addrOfOp = op.getMemref().getDefiningOp<fir::AddrOfOp>()) {
+      if (inDeviceContext(addrOfOp)) {
+        return failure();
+      }
       if (auto global = symTab.lookup<fir::GlobalOp>(
               addrOfOp.getSymbol().getRootReference().getValue())) {
         if (cuf::isRegisteredDeviceGlobal(global)) {
diff --git a/flang/test/Fir/CUDA/cuda-global-addr.mlir b/flang/test/Fir/CUDA/cuda-global-addr.mlir
index 6f7816c9163cb..ae88af3d3c16c 100644
--- a/flang/test/Fir/CUDA/cuda-global-addr.mlir
+++ b/flang/test/Fir/CUDA/cuda-global-addr.mlir
@@ -94,6 +94,33 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} {
 
 // -----
 
+// Check that we do not introduce call to _FortranACUFGetDeviceAddress when the
+// address_of is inside an acc.parallel region (OffloadRegionOpInterface).
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+fir.global @_QMmod1Eadev_acc {data_attr = #cuf.cuda<device>} : !fir.array<10xi32> {
+  %0 = fir.zero_bits !fir.array<10xi32>
+  fir.has_value %0 : !fir.array<10xi32>
+}
+func.func @_QQmain_acc() attributes {fir.bindc_name = "test_acc"} {
+  acc.parallel {
+    %c10 = arith.constant 10 : index
+    %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %3 = fir.address_of(@_QMmod1Eadev_acc) : !fir.ref<!fir.array<10xi32>>
+    %4 = fir.declare %3(%1) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Eadev_acc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+    acc.yield
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @_QQmain_acc()
+// CHECK: acc.parallel
+// CHECK-NOT: fir.call {{.*}}GetDeviceAddress
+
+}
+
+// -----
+
 // Check that we do not introduce call to _FortranACUFGetDeviceAddress when the
 // value has no user.
 

if (op->getParentOfType<cuf::KernelOp>())
return true;
if (auto funcOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
if (op->getParentOfType<mlir::acc::OffloadRegionOpInterface>())
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The diff looks a bit odd, so to explain what I did:

  • I added a new check for acc::OffloadRegionOpInterface, which covers both acc regions and gpu.launch.
  • I removed the separate gpu.launch case, since it is now covered by the new check.

Copy link
Contributor

@clementval clementval left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Thanks for the fix Razvan.

@razvanlupusoru razvanlupusoru merged commit 33c4e3e into llvm:main Jan 9, 2026
13 checks passed
Priyanshu3820 pushed a commit to Priyanshu3820/llvm-project that referenced this pull request Jan 18, 2026
llvm#175225)

In regions destined for GPU offload, computing an address_of means
getting device address directly - no need (and actually incorrect) to
insert a runtime call to get the address. This was already working for
regions such as `gpu.launch` - but now it applies to acc regions as
well.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

flang:fir-hlfir · flang (Flang issues not falling into any other category)

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants