Volume Rendering Crash (Summit + WarpX) #825

Closed · cyrush opened this issue Oct 12, 2021 · 10 comments

cyrush commented Oct 12, 2021

Crash stack trace (frames 12 and up):

12: ./warpx() [0x12694ad0]
    vtkm::cont::cuda::internal::CudaAllocator::Free(void*) at ??:?

13: ./warpx() [0x12698c30]
    (anonymous namespace)::CudaDelete(void*) at ??:?

14: ./warpx() [0x12023ec0]
    vtkm::cont::internal::BufferInfo::~BufferInfo() at ??:?

15: ./warpx() [0x12018708]
    std::_Rb_tree<vtkm::cont::DeviceAdapterId, std::pair<vtkm::cont::DeviceAdapterId const, (anonymous namespace)::BufferState>, std::_Select1st<std::pair<vtkm::cont::DeviceAdapterId const, (anonymous namespace)::BufferState> >, std::less<vtkm::cont::DeviceAdapterId>, std::allocator<std::pair<vtkm::cont::DeviceAdapterId const, (anonymous namespace)::BufferState> > >::_M_erase(std::_Rb_tree_node<std::pair<vtkm::cont::DeviceAdapterId const, (anonymous namespace)::BufferState> >*) at Buffer.cxx:?

16: ./warpx() [0x120189d8]
    std::_Sp_counted_ptr<vtkm::cont::internal::Buffer::InternalsStruct*, (__gnu_cxx::_Lock_policy)2>::_M_dispose() at ??:?

17: ./warpx() [0x12019294]
    vtkm::cont::internal::Buffer::~Buffer() at ??:?

18: ./warpx() [0x109ce36c]
    vtkm::rendering::raytracing::Ray<float>::~Ray() at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:98
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:108
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:136
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:206
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_vector.h:677
 (inlined by) ?? at /gpfs/alpine/csc340/world-shared/software/ascent/2021_09_01_gcc_9_3_0_warpx/summit/cuda/gnu/spack/opt/spack/linux-rhel8-power9le/gcc-9.3.0/vtk-m-1.6.0-zh4ljmhdg3xl7byw3uk6hui2urcvuncm/include/vtkm-1.6/vtkm/cont/ArrayHandle.h:376
 (inlined by) ?? at /gpfs/alpine/csc340/world-shared/software/ascent/2021_09_01_gcc_9_3_0_warpx/summit/cuda/gnu/spack/opt/spack/linux-rhel8-power9le/gcc-9.3.0/vtk-m-1.6.0-zh4ljmhdg3xl7byw3uk6hui2urcvuncm/include/vtkm-1.6/vtkm/rendering/raytracing/ChannelBuffer.h:45
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:98
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:108
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:136
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_construct.h:206
 (inlined by) ?? at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_vector.h:677
 (inlined by) vtkm::rendering::raytracing::Ray<float>::~Ray() at /gpfs/alpine/csc340/world-shared/software/ascent/2021_09_01_gcc_9_3_0_warpx/summit/cuda/gnu/spack/opt/spack/linux-rhel8-power9le/gcc-9.3.0/vtk-m-1.6.0-zh4ljmhdg3xl7byw3uk6hui2urcvuncm/include/vtkm-1.6/vtkm/rendering/raytracing/Ray.h:39

19: ./warpx() [0x10ff2a10]
    vtkm::rendering::MapperVolume::RenderCells(vtkm::cont::DynamicCellSetBase<vtkm::List<vtkm::cont::CellSetStructured<2>, vtkm::cont::CellSetStructured<3>, vtkm::cont::CellSetExplicit<vtkm::cont::StorageTagBasic, vtkm::cont::StorageTagBasic, vtkm::cont::StorageTagBasic>, vtkm::cont::CellSetSingleType<vtkm::cont::StorageTagBasic> > > const&, vtkm::cont::CoordinateSystem const&, vtkm::cont::Field const&, vtkm::cont::ColorTable const&, vtkm::rendering::Camera const&, vtkm::Range const&) at ??:?

20: ./warpx() [0x10a1aab0]
    vtkh::VolumeRenderer::RenderOneDomainPerRank() at ??:?

21: ./warpx() [0x10a1b2a4]
    vtkh::VolumeRenderer::Update() at ??:?

22: ./warpx() [0x10a12180]
    vtkh::Scene::Render() at ??:?

23: ./warpx() [0x1096c94c]
    ascent::runtime::filters::ExecScene::execute() at /tmp/cyrush/spack-stage/spack-stage-ascent-develop-we4tn3kj7xdbrwnsmaaf3kmncgbjufqe/spack-src/src/ascent/runtimes/flow_filters/ascent_runtime_rendering_filters.cpp:311
 (inlined by) ascent::runtime::filters::ExecScene::execute() at /tmp/cyrush/spack-stage/spack-stage-ascent-develop-we4tn3kj7xdbrwnsmaaf3kmncgbjufqe/spack-src/src/ascent/runtimes/flow_filters/ascent_runtime_rendering_filters.cpp:1572

24: ./warpx() [0x109b6b84]
    flow::Workspace::execute() at /tmp/cyrush/spack-stage/spack-stage-ascent-develop-we4tn3kj7xdbrwnsmaaf3kmncgbjufqe/spack-src/src/flow/flow_workspace.cpp:341

25: ./warpx() [0x10881ed8]
    ascent::AscentRuntime::Execute(conduit::Node const&) at /tmp/cyrush/spack-stage/spack-stage-ascent-develop-we4tn3kj7xdbrwnsmaaf3kmncgbjufqe/spack-src/src/ascent/runtimes/ascent_main_runtime.cpp:1659

26: ./warpx() [0x1086f578]
    ascent::Ascent::execute(conduit::Node const&) at /tmp/cyrush/spack-stage/spack-stage-ascent-develop-we4tn3kj7xdbrwnsmaaf3kmncgbjufqe/spack-src/src/ascent/ascent.cpp:448

27: ./warpx() [0x101b744c]
    FlushFormatAscent::WriteToFile(amrex::Vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, amrex::Vector<amrex::MultiFab, std::allocator<amrex::MultiFab> > const&, amrex::Vector<amrex::Geometry, std::allocator<amrex::Geometry> >&, amrex::Vector<int, std::allocator<int> >, double, amrex::Vector<ParticleDiag, std::allocator<ParticleDiag> > const&, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, int, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, int, amrex::Geometry const&, bool) const at ??:?

28: ./warpx() [0x10101c50]
    FullDiagnostics::Flush(int) at ??:?

29: ./warpx() [0x100feb04]
    Diagnostics::FilterComputePackFlush(int, bool) at ??:?

30: ./warpx() [0x10106750]
    MultiDiagnostics::FilterComputePackFlush(int, bool) at ??:?

31: ./warpx() [0x103009c4]
    WarpX::InitData() at ??:?

32: ./warpx() [0x10050088]
    main at ??:?

33: /lib64/power9/libc.so.6(+0x24078) [0x20000ad84078]

34: /lib64/power9/libc.so.6(__libc_start_main+0xb4) [0x20000ad84264]

cyrush commented Oct 12, 2021

WarpX Run path:
/gpfs/alpine/proj-shared/aph114/2021-09-16_axel-ascent-morenodes-doubleFreq-HD-vol-time/


cyrush commented Oct 21, 2021

I can replicate the crash with Replay, though only at scale.

I tried with the latest Ascent; best to try next with a newer VTK-m?

Error: TryExecute encountered an error: CUDA Error: an illegal memory access was encountered
cudaEventSynchronize(this->StopEvent) @ /tmp/cyrush/spack-stage/spack-stage-vtk-m-1.6.0-m62boczsoseadflojgh6f7dddwuacw6m/spack-src/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.cu:97
	- Failing functor: N4vtkm9rendering10raytracing24VolumeRendererStructured13RenderFunctorIfEE
	- Failing device: Cuda

Error: TryExecute encountered an error: CUDA Error: an illegal memory access was encountered
Unchecked asynchronous error @ /tmp/cyrush/spack-stage/spack-stage-vtk-m-1.6.0-m62boczsoseadflojgh6f7dddwuacw6m/spack-src/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.cu:25
	- Failing functor: N4vtkm9rendering10raytracing24VolumeRendererStructured13RenderFunctorIfEE
	- Failing device: Serial

terminate called after throwing an instance of 'vtkm::cont::cuda::ErrorCuda'
  what():  
[f35n17:776734] *** Process received signal ***
[f35n17:776734] Signal: Aborted (6)
[f35n17:776734] Signal code:  (-6)
[f35n17:776734] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[f35n17:776734] [ 1] /lib64/power9/libc.so.6(gsignal+0xd8)[0x200000bc3618]
[f35n17:776734] [ 2] /lib64/power9/libc.so.6(abort+0x164)[0x200000ba3a2c]
[f35n17:776734] [ 3] /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/lib64/libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0x158)[0x20000084ba28]
[f35n17:776734] [ 4] /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/lib64/libstdc++.so.6(+0xc7004)[0x200000847004]
[f35n17:776734] [ 5] /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/lib64/libstdc++.so.6(+0xc54a8)[0x2000008454a8]
[f35n17:776734] [ 6] /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/lib64/libstdc++.so.6(__gxx_personality_v0+0x440)[0x2000008465a0]
[f35n17:776734] [ 7] /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/lib64/libgcc_s.so.1(+0xbeec)[0x200000b4beec]
[f35n17:776734] [ 8] /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/lib64/libgcc_s.so.1(_Unwind_RaiseException+0x378)[0x200000b4c658]
[f35n17:776734] [ 9] /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/lib64/libstdc++.so.6(__cxa_throw+0x64)[0x200000847594]
[f35n17:776734] [10] ./ascent_replay_mpi_new[0x124c28f0]
[f35n17:776734] [11] ./ascent_replay_mpi_new[0x124c6a50]
[f35n17:776734] [12] ./ascent_replay_mpi_new[0x11e51a20]
[f35n17:776734] [13] ./ascent_replay_mpi_new[0x11e46268]
[f35n17:776734] [14] ./ascent_replay_mpi_new[0x11e46538]
[f35n17:776734] [15] ./ascent_replay_mpi_new[0x11e46df4]
[f35n17:776734] [16] ./ascent_replay_mpi_new[0x102e3c0c]
[f35n17:776734] [17] ./ascent_replay_mpi_new[0x10847b64]
[f35n17:776734] [18] ./ascent_replay_mpi_new[0x10845f94]
[f35n17:776734] [19] ./ascent_replay_mpi_new[0x1084bc08]
[f35n17:776734] [20] ./ascent_replay_mpi_new[0x10842860]
[f35n17:776734] [21] ./ascent_replay_mpi_new[0x101b83ec]
[f35n17:776734] [22] ./ascent_replay_mpi_new[0x1025f0d4]
[f35n17:776734] [23] ./ascent_replay_mpi_new[0x1006b6cc]
[f35n17:776734] [24] ./ascent_replay_mpi_new[0x10057e58]
[f35n17:776734] [25] ./ascent_replay_mpi_new[0x100283b8]
[f35n17:776734] [26] /lib64/power9/libc.so.6(+0x24078)[0x200000ba4078]
[f35n17:776734] [27] /lib64/power9/libc.so.6(__libc_start_main+0xb4)[0x200000ba4264]
[f35n17:776734] *** End of error message ***


cyrush commented Nov 18, 2021

Same issue with the newer VTK-m (1.7.0), at scale:

cudaEventSynchronize(this->StopEvent) @ /tmp/cyrush/spack-stage/spack-stage-vtk-m-1.7.0-jch355wjfl6ze7rqouvwldjxoyvip2rc/spack-src/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.cu:97
	- Failing functor: N4vtkm9rendering10raytracing24VolumeRendererStructured13RenderFunctorIfEE
	- Failing device: Cuda

Error: TryExecute encountered an error: CUDA Error: an illegal memory access was encountered
Unchecked asynchronous error @ /tmp/cyrush/spack-stage/spack-stage-vtk-m-1.7.0-jch355wjfl6ze7rqouvwldjxoyvip2rc/spack-src/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.cu:25
	- Failing functor: N4vtkm9rendering10raytracing24VolumeRendererStructured13RenderFunctorIfEE
	- Failing device: Serial

terminate called after throwing an instance of 'vtkm::cont::cuda::ErrorCuda'
  what():  CUDA Error: an illegal memory access was encountered
Unchecked asynchronous error @ /tmp/cyrush/spack-stage/spack-stage-vtk-m-1.7.0-jch355wjfl6ze7rqouvwldjxoyvip2rc/spack-src/vtkm/cont/cuda/internal/CudaAllocator.cu:187
(Stack trace unavailable)


cyrush commented Nov 18, 2021

The crash is reported in this functor:

https://gitlab.kitware.com/vtk/vtk-m/-/blob/master/vtkm/rendering/raytracing/VolumeRendererStructured.cxx#L747

The heavy lifting is clearly in:
https://gitlab.kitware.com/vtk/vtk-m/-/blob/master/vtkm/rendering/raytracing/VolumeRendererStructured.cxx#L786

We are using 80 nodes (480 GPUs).

There should be plenty of memory: the HDF5 size of this data (all domains) on disk is only 3.6 GB.
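
A minimal sanity check on those numbers, assuming h5py (the file name below is hypothetical): compare the logical array bytes against the bytes actually stored on disk, since compressed HDF5 storage can be much smaller than the in-memory size.

```python
import h5py

sizes = {"logical": 0, "stored": 0}

def tally(name, obj):
    # accumulate per-dataset sizes while walking the file
    if isinstance(obj, h5py.Dataset):
        sizes["logical"] += obj.size * obj.dtype.itemsize  # bytes once loaded
        sizes["stored"] += obj.id.get_storage_size()       # bytes on disk

with h5py.File("domain_000071.hdf5", "r") as f:  # hypothetical domain file
    f.visititems(tally)

print("logical: %.1f MB  stored: %.1f MB"
      % (sizes["logical"] / 2**20, sizes["stored"] / 2**20))
```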


cyrush commented Nov 19, 2021

Some more context: the min and max sizes of the input datasets, in megabytes of data described with Conduit:

min: 0.0258083343506
max: 649.828548431

The total across ranks in memory seems to be 98.8 GB.

(I suspect lots of zeros, so the data must compress well in HDF5 to get down to 3.6 GB on disk.)

Based on this, we shouldn't be hitting a memory problem?

I think I worked out the data sizes on the bad ranks as well:

(71, 'd06n10') --> data size mb 336.3104667663574
(74, 'd06n13') --> data size mb 336.3104667663574
(52, 'd05n09') --> data size mb 169.84807968139648
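
For reference, a sketch of how per-rank sizes like these can be gathered, assuming conduit's Python bindings and mpi4py (the per-rank file name is hypothetical):

```python
import conduit
import conduit.relay.io
from mpi4py import MPI

comm = MPI.COMM_WORLD

# load this rank's mesh (hypothetical per-rank file name)
mesh = conduit.Node()
conduit.relay.io.load(mesh, "domain_%06d.hdf5" % comm.rank)

# megabytes the node describes as a compact buffer
local_mb = mesh.total_bytes_compact() / (1024.0 * 1024.0)

min_mb = comm.allreduce(local_mb, op=MPI.MIN)
max_mb = comm.allreduce(local_mb, op=MPI.MAX)
total_gb = comm.allreduce(local_mb, op=MPI.SUM) / 1024.0

if comm.rank == 0:
    print("min: %g  max: %g  total: %.1f GB" % (min_mb, max_mb, total_gb))
```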


cyrush commented Nov 22, 2021

Replay exe, actions file, and Blueprint files:

/gpfs/alpine/world-shared/csc340/2021_11_warpx_vol_rend_issue

goodbadwolf commented

Thanks for all this great information. Did you try isolating domains 71, 74, or 52 and running with just one GPU? (I can do this, but I just wanted to make sure you hadn't done it already.)


cyrush commented Nov 30, 2021

Yes, I pulled those out into a standalone Blueprint HDF5 dataset and tried it with replay.
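
(A sketch of that kind of extraction, assuming conduit's Python bindings; the file names, one-file-per-domain layout, and exact save_mesh signature are assumptions:)

```python
import conduit
import conduit.relay.io
import conduit.relay.io.blueprint

suspects = [71, 74, 52]

# gather the suspect domains into one multi-domain node
subset = conduit.Node()
for dom in suspects:
    # assumes one HDF5 file per domain, named by domain id
    conduit.relay.io.load(subset.append(), "warpx_out/domain_%06d.hdf5" % dom)

# write a standalone Blueprint dataset (data files + root file) for replay
conduit.relay.io.blueprint.save_mesh(subset, "suspect_domains", "hdf5")
```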

cyrush added the bug label Dec 23, 2021

cyrush commented May 25, 2022

This was resolved by @goodbadwolf with fixes in VTK-m that are in the 1.7.1 release. Thanks again, @goodbadwolf!

cyrush closed this as completed May 25, 2022