-
-
Notifications
You must be signed in to change notification settings - Fork 14.4k
Add scalar support for offload #150288
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Add scalar support for offload #150288
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ use std::ffi::CString; | |
|
|
||
| use llvm::Linkage::*; | ||
| use rustc_abi::Align; | ||
| use rustc_codegen_ssa::common::TypeKind; | ||
| use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue}; | ||
| use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods}; | ||
| use rustc_middle::bug; | ||
|
|
@@ -357,7 +358,6 @@ pub(crate) fn add_global<'ll>( | |
| pub(crate) fn gen_define_handling<'ll>( | ||
| cx: &CodegenCx<'ll, '_>, | ||
| metadata: &[OffloadMetadata], | ||
| types: &[&'ll Type], | ||
| symbol: String, | ||
| offload_globals: &OffloadGlobals<'ll>, | ||
| ) -> OffloadKernelGlobals<'ll> { | ||
|
|
@@ -367,25 +367,18 @@ pub(crate) fn gen_define_handling<'ll>( | |
|
|
||
| let offload_entry_ty = offload_globals.offload_entry_ty; | ||
|
|
||
| // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or | ||
| // reference) types. | ||
| let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) { | ||
| rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta), | ||
| _ => None, | ||
| }); | ||
|
|
||
| // FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary | ||
| let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) = | ||
| ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip(); | ||
| let (sizes, transfer): (Vec<_>, Vec<_>) = | ||
| metadata.iter().map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip(); | ||
|
|
||
| let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes); | ||
| let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &sizes); | ||
| // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2), | ||
| // or both to and from the gpu (=3). Other values shouldn't affect us for now. | ||
| // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten | ||
| // will be 2. For now, everything is 3, until we have our frontend set up. | ||
| // 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later). | ||
| let memtransfer_types = | ||
| add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer); | ||
| add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &transfer); | ||
|
|
||
| // Next: For each function, generate these three entries. A weak constant, | ||
| // the llvm.rodata entry name, and the llvm_offload_entries value | ||
|
|
@@ -441,13 +434,25 @@ fn declare_offload_fn<'ll>( | |
| ) | ||
| } | ||
|
|
||
| /// Returns the width, in bits, of the scalar LLVM type `ty`. | ||
| /// | ||
| /// Floating-point kinds are measured with `float_width`, integers with `int_width`. | ||
| /// Any other type kind is treated as a compiler bug and reported through `bug!`. | ||
| pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 { | ||
| match cx.type_kind(ty) { | ||
| TypeKind::Half | ||
| | TypeKind::Float | ||
| | TypeKind::Double | ||
| | TypeKind::X86_FP80 | ||
| | TypeKind::FP128 | ||
| // The cast brings the float width to the `u64` this function returns. | ||
| | TypeKind::PPC_FP128 => cx.float_width(ty) as u64, | ||
| TypeKind::Integer => cx.int_width(ty), | ||
| // Pointers, aggregates, etc. are not scalars; callers must filter them out beforehand. | ||
| other => bug!("scalar_width was called on a non scalar type {other:?}"), | ||
| } | ||
| } | ||
|
|
||
| // For each kernel *call*, we now use some of our previous declared globals to move data to and from | ||
| // the gpu. For now, we only handle the data transfer part of it. | ||
| // If two consecutive kernels use the same memory, we still move it to the host and back to the gpu. | ||
| // Since in our frontend users (by default) don't have to specify data transfer, this is something | ||
| // we should optimize in the future! We also assume that everything should be copied back and forth, | ||
| // but sometimes we can directly zero-allocate on the device and only move back, or if something is | ||
| // immutable, we might only copy it to the device, but not back. | ||
| // we should optimize in the future! In some cases we can directly zero-allocate on the device and | ||
| // only move data back, or if something is immutable, we might only copy it to the device. | ||
| // | ||
| // Current steps: | ||
| // 0. Alloca some variables for the following steps | ||
|
|
@@ -534,8 +539,34 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>( | |
| let mut geps = vec![]; | ||
| let i32_0 = cx.get_const_i32(0); | ||
| for &v in args { | ||
| let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]); | ||
| vals.push(v); | ||
| let ty = cx.val_ty(v); | ||
| let ty_kind = cx.type_kind(ty); | ||
| let (base_val, gep_base) = match ty_kind { | ||
| TypeKind::Pointer => (v, v), | ||
| TypeKind::Half | TypeKind::Float | TypeKind::Double | TypeKind::Integer => { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a FIXME that we should later check for f128 support. At least newer NVIDIA cards should support it. |
||
| // FIXME(Sa4dUs): check for `f128` support, latest NVIDIA cards support it | ||
| let num_bits = scalar_width(cx, ty); | ||
|
|
||
| let bb = builder.llbb(); | ||
| unsafe { | ||
| llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, builder.llfn()); | ||
| } | ||
| let addr = builder.direct_alloca(cx.type_i64(), Align::EIGHT, "addr"); | ||
| unsafe { | ||
| llvm::LLVMPositionBuilderAtEnd(builder.llbuilder, bb); | ||
| } | ||
|
|
||
| let cast = builder.bitcast(v, cx.type_ix(num_bits)); | ||
| let value = builder.zext(cast, cx.type_i64()); | ||
| builder.store(value, addr, Align::EIGHT); | ||
| (value, addr) | ||
| } | ||
| other => bug!("offload does not support {other:?}"), | ||
| }; | ||
|
|
||
| let gep = builder.inbounds_gep(cx.type_f32(), gep_base, &[i32_0]); | ||
|
|
||
| vals.push(base_val); | ||
| geps.push(gep); | ||
| } | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| //@ add-minicore | ||
| //@ revisions: amdgpu nvptx | ||
| //@[nvptx] compile-flags: -Copt-level=0 -Zunstable-options -Zoffload=Device --target nvptx64-nvidia-cuda --crate-type=rlib | ||
| //@[nvptx] needs-llvm-components: nvptx | ||
| //@[amdgpu] compile-flags: -Copt-level=0 -Zunstable-options -Zoffload=Device --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 --crate-type=rlib | ||
| //@[amdgpu] needs-llvm-components: amdgpu | ||
| //@ no-prefer-dynamic | ||
| //@ needs-offload | ||
|
|
||
| // This test verifies that the offload intrinsic properly handles scalar args on the device: | ||
| // each scalar arg is received as an i64, then truncated and bitcast back to its original type. | ||
|
|
||
| #![feature(abi_gpu_kernel, rustc_attrs, no_core)] | ||
| #![no_core] | ||
|
|
||
| extern crate minicore; | ||
|
|
||
| // CHECK: ; Function Attrs | ||
| // nvptx-NEXT: define ptx_kernel void @foo(ptr %dyn_ptr, ptr %0, i64 %1) | ||
| // amdgpu-NEXT: define amdgpu_kernel void @foo(ptr %dyn_ptr, ptr %0, i64 %1) | ||
| // CHECK-NEXT: entry: | ||
| // CHECK-NEXT: %2 = trunc i64 %1 to i32 | ||
| // CHECK-NEXT: %3 = bitcast i32 %2 to float | ||
| // CHECK-NEXT: br label %start | ||
| // CHECK: start: | ||
| // CHECK-NEXT: store float %3, ptr %0, align 4 | ||
| // CHECK-NEXT: ret void | ||
| // CHECK-NEXT: } | ||
|
|
||
| // Kernel under test: stores the scalar `k` through the pointer `x`. The FileCheck directives | ||
| // above expect `k` to arrive as an `i64` that is truncated and bitcast back to `float`. | ||
| #[unsafe(no_mangle)] | ||
| #[rustc_offload_kernel] | ||
| pub unsafe extern "gpu-kernel" fn foo(x: *mut f32, k: f32) { | ||
| unsafe { | ||
| *x = k; | ||
| }; | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| //@ compile-flags: -Zoffload=Test -Zunstable-options -C opt-level=1 -Clto=fat | ||
| //@ no-prefer-dynamic | ||
| //@ needs-offload | ||
|
|
||
| // This test verifies that the offload intrinsic is properly handling scalar args, passing them to | ||
| // the kernel as i64 | ||
|
|
||
| #![feature(abi_gpu_kernel)] | ||
| #![feature(rustc_attrs)] | ||
| #![feature(core_intrinsics)] | ||
| #![no_main] | ||
|
|
||
| // CHECK: define{{( dso_local)?}} void @main() | ||
| // CHECK-NOT: define | ||
| // CHECK: %addr = alloca i64, align 8 | ||
| // CHECK: store double 4.200000e+01, ptr %0, align 8 | ||
| // CHECK: %_0.i = load double, ptr %0, align 8 | ||
| // CHECK: store double %_0.i, ptr %addr, align 8 | ||
| // CHECK: %1 = getelementptr inbounds nuw i8, ptr %.offload_baseptrs, i64 8 | ||
| // CHECK-NEXT: store double %_0.i, ptr %1, align 8 | ||
| // CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8 | ||
| // CHECK-NEXT: store ptr %addr, ptr %2, align 8 | ||
| // CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8 | ||
| // CHECK-NEXT: store i64 4, ptr %3, align 8 | ||
| // CHECK-NEXT: call void @__tgt_target_data_begin_mapper | ||
|
|
||
| // Host-side driver: hands `foo` to the offload intrinsic with one pointer argument (`&mut x`) | ||
| // and one scalar argument (`k`). | ||
| #[unsafe(no_mangle)] | ||
| fn main() { | ||
| let mut x = 0.0; | ||
| // NOTE(review): the literals are unsuffixed and the expected IR above uses `double`, while | ||
| // `foo` is declared with `f32` parameters — confirm the mismatch is intended. | ||
| let k = core::hint::black_box(42.0); | ||
|
|
||
| // NOTE(review): the two `[1, 1, 1]` arrays are presumably launch dimensions — confirm. | ||
| core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [1, 1, 1], (&mut x, k)); | ||
| } | ||
|
|
||
| // Declaration of the kernel symbol handed to `offload` in `main`; this file provides no body. | ||
| unsafe extern "C" { | ||
| pub fn foo(x: *mut f32, k: f32); | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.