Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
78 commits
Select commit Hold shift + click to select a range
69f3721
[fix] fix fail test when backend is mack
zhang-chenyi Sep 4, 2025
e45d324
[Metax] fix fail test when backend is mack
metax666 Sep 4, 2025
ef9d554
Merge branch 'PaddlePaddle:develop' into develop
metax666 Sep 4, 2025
a1530d2
[metax]change_cupti_and_fix_softmax (#7)
duqimeng Sep 9, 2025
352f02e
[Metax] fix dgc & mklml compile product path problem (#8)
StareAtYou Sep 9, 2025
8f13fae
[Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test…
StareAtYou Sep 11, 2025
8938293
[Metax] update metax_gpu CMakeLists.txt (#10)
StareAtYou Sep 11, 2025
f54187f
[metax] updata_qr_kernel (#11)
duqimeng Sep 11, 2025
7964c35
Merge branch 'PaddlePaddle:develop' into develop
metax666 Sep 12, 2025
1e04216
[Metax] fix illegal address access error in test_momentum_op (#12)
StareAtYou Sep 15, 2025
aca80a4
[Metax] fix cufft and fix some blas kernel apply (#13)
duqimeng Sep 15, 2025
1c54010
Merge branch 'PaddlePaddle:develop' into develop
metax666 Sep 15, 2025
fb547db
[metax] add warpctc_warprnn (#14)
duqimeng Sep 15, 2025
8e98198
[Metax] update metax CI (#15)
StareAtYou Sep 15, 2025
528ec55
[Metax] update metax CI CMakeLists (#16)
StareAtYou Sep 16, 2025
5b31405
[Metax] add github action (#18)
duqimeng Sep 16, 2025
b93c971
[metax] chang build (#19)
duqimeng Sep 16, 2025
6dbbe84
change_build (#20)
duqimeng Sep 16, 2025
ef1b28e
change_build (#21)
duqimeng Sep 16, 2025
3737e48
change_build (#22)
duqimeng Sep 16, 2025
16f3584
【metax】modify cmake for warpctc and warprnnt (#17)
jxwangmetax Sep 16, 2025
ce54693
[metax]modify library to static library (#24)
jxwangmetax Sep 16, 2025
4cda637
[Metax] organize documents (#25)
StareAtYou Sep 16, 2025
23fca59
[metax]fix_code style and index_elementwise_put_kernel (#27)
duqimeng Sep 17, 2025
a513aae
change_build_917 (#29)
duqimeng Sep 17, 2025
4eb455e
chang_build (#30)
duqimeng Sep 17, 2025
1773978
[metax]modify kernel (#31)
jxwangmetax Sep 17, 2025
69af381
change_metax_work (#32)
duqimeng Sep 17, 2025
7fe6f2d
change_build (#33)
duqimeng Sep 17, 2025
b22fc13
[metax] modify fused_bias_dropout_residual_layer_norm (#34)
jxwangmetax Sep 17, 2025
c3d1444
change_build (#35)
duqimeng Sep 17, 2025
569a867
change_build (#36)
duqimeng Sep 17, 2025
0edc6f6
change_warpctc.cmake (#38)
duqimeng Sep 18, 2025
2688c86
change_warpctc.cmake (#39)
duqimeng Sep 18, 2025
6f031fe
test (#40)
duqimeng Sep 18, 2025
e84d399
test_ut (#41)
duqimeng Sep 18, 2025
b5f2feb
tets (#43)
duqimeng Sep 18, 2025
e20eca7
test (#44)
duqimeng Sep 18, 2025
e37f633
[metax] modify compile (#42)
jxwangmetax Sep 19, 2025
1af5148
[Metax] add log analysis script (#46)
StareAtYou Sep 19, 2025
518bee8
add_generate_pb (#47)
duqimeng Sep 19, 2025
bc02549
modify blas (#51)
jxwangmetax Sep 22, 2025
1977ca8
[metax] modify tf32 (#52)
jxwangmetax Sep 22, 2025
1ae2618
[Metax] update metax backend CI test (#53)
StareAtYou Sep 22, 2025
76d5eb0
[Metax] fix log_analysis.py bug (#54)
StareAtYou Sep 23, 2025
9c17b6e
[Metax] update metax CI CMakeLists & scripts (#56)
StareAtYou Sep 23, 2025
51c98a2
[Metax] fix MatmulKernel problem (#57)
StareAtYou Sep 23, 2025
d113018
[metax]fix paddle bug" (#58)
duqimeng Sep 23, 2025
8991299
change—ut (#59)
duqimeng Sep 23, 2025
a770e6f
change_ut (#60)
duqimeng Sep 23, 2025
902112b
change_ut (#63)
duqimeng Sep 24, 2025
9a88a09
Merge branch 'PaddlePaddle:develop' into develop
metax666 Sep 24, 2025
4ae65f7
Merge branch 'PaddlePaddle:develop' into develop
metax666 Sep 24, 2025
61c32ba
[Metax] add keyword filter in CI CMakeLists.txt
StareAtYou Sep 24, 2025
642eb37
Merge branch 'metax666:develop' into develop
StareAtYou Sep 25, 2025
b2ddc81
[Metax] add ignore case list
StareAtYou Sep 25, 2025
cfe44ce
[Metax] add keyword filter in CI CMakeLists.txt (#64)
StareAtYou Sep 25, 2025
041e585
Merge branch 'metax666:develop' into develop
StareAtYou Sep 25, 2025
087a9c1
[Metax] fix phi::backends::gpu::DnnVersion() symbol not found
StareAtYou Sep 26, 2025
73710c5
Revert "[Metax] fix phi::backends::gpu::DnnVersion() symbol not found"
StareAtYou Sep 26, 2025
78946fd
[metax] modify kernels (#67)
jxwangmetax Sep 26, 2025
ac78af2
Fix part of the missing kernel issues (#66)
Theendlessofhell Sep 26, 2025
404ff3d
[Metax] fix index_elementwise_get kernel
StareAtYou Sep 26, 2025
4ce9fe6
[Metax] fix index_elementwise_get kernel (#68)
StareAtYou Sep 26, 2025
739c5c7
Merge branch 'metax666:develop' into develop
StareAtYou Sep 28, 2025
3c8d017
[metax]fix patch and fix missing kernel (#72)
duqimeng Sep 29, 2025
35a4e49
Merge branch 'metax666:develop' into develop
StareAtYou Sep 29, 2025
7303ae2
[metax] modify kernels (#73)
jxwangmetax Sep 29, 2025
8b184a3
[metax] modify kernels (#74)
jxwangmetax Sep 29, 2025
60f0ed6
[metax] link mccl and fix missing kernel (#76)
duqimeng Sep 30, 2025
cccf6b7
[metax] rename yaml file (#77)
duqimeng Sep 30, 2025
7a7a7a0
[metax] rm file (#78)
duqimeng Sep 30, 2025
70ef70d
Merge branch 'PaddlePaddle:develop' into develop
metax666 Sep 30, 2025
5a76d35
metax_fix_ci (#79)
duqimeng Sep 30, 2025
8f91b94
Merge branch 'metax666:develop' into develop
StareAtYou Oct 9, 2025
ceb55eb
[metax] add print tensor (#91)
jxwangmetax Oct 11, 2025
b533149
Merge branch 'metax666:develop' into develop
StareAtYou Oct 11, 2025
7750faa
Update Paddle submodule to latest develop
tianshuo78520a Oct 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/metax_work.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
branches: [develop, release/**]
paths:
- "**"
- "Paddle/**"
- "!backends/**"
- "backends/metax_gpu/**"

Expand Down
2 changes: 1 addition & 1 deletion Paddle
Submodule Paddle updated 1091 files
2 changes: 2 additions & 0 deletions backends/metax_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,7 @@ file(
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu
${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/gpu_info.cc
# ############################################################################
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
# kernels/kps
Expand Down Expand Up @@ -687,6 +688,7 @@ file(
RELATIVE ${CMAKE_SOURCE_DIR}
runtime/runtime.cc
passes/*.cc
common/*.cc
kernels/*.cc
kernels/*.cu
kernels/fusion/*.cc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,17 @@ PHI_DEFINE_EXPORTED_bool(
"faster but it may loss precision in most case. If true, the compute "
"type will be set to fp16. Default is false.");

// Comma-separated list of device ids this process may use (e.g. "0,1,2,3").
// Empty string means "use all visible devices".
// NOTE: adjacent string literals are concatenated by the preprocessor; each
// fragment must end with a space or words run together in the help output.
PHI_DEFINE_EXPORTED_string(
    selected_gpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication "
    "between GPU devices, use CUDA_VISIBLE_DEVICES can only use "
    "share-memory only.");

PHI_DEFINE_EXPORTED_bool(use_fast_math,
false,
"Whether to use fast math GPU functions.");
Expand Down
297 changes: 297 additions & 0 deletions backends/metax_gpu/common/utils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "common/utils.h"

#include "glog/logging.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/custom/custom_context.h"

namespace phi {
namespace {
// Asynchronously copies `size` bytes from host memory `src` to device memory
// `dst` on `device`, enqueued on `stream`.
// Returns C_SUCCESS on success (including the trivial size == 0 case) and
// C_ERROR on null pointers or any CUDA-API failure.
C_Status AsyncMemCpyH2D(const C_Device device,
                        C_Stream stream,
                        void* dst,
                        const void* src,
                        size_t size) {
  if (size == 0) {
    return C_SUCCESS;
  }

  if (dst == nullptr || src == nullptr) {
    return C_ERROR;
  }

  cudaError_t cudaErr = cudaSetDevice(device->id);
  if (cudaErr != cudaSuccess) {
    return C_ERROR;
  }

  // Fix: forward the caller's stream. The original call omitted the stream
  // argument, so the copy was enqueued on the legacy default stream and was
  // not ordered with work on the device context's stream.
  cudaErr = cudaMemcpyAsync(dst,
                            src,
                            size,
                            cudaMemcpyHostToDevice,
                            reinterpret_cast<cudaStream_t>(stream));
  if (cudaErr != cudaSuccess) {
    return C_ERROR;
  }

  return C_SUCCESS;
}

// Asynchronously copies `size` bytes from device memory `src` on `device` to
// host memory `dst`, enqueued on `stream`.
// Returns C_SUCCESS on success (including the trivial size == 0 case) and
// C_ERROR on null pointers or any CUDA-API failure.
C_Status AsyncMemCpyD2H(const C_Device device,
                        C_Stream stream,
                        void* dst,
                        const void* src,
                        size_t size) {
  if (size == 0) {
    return C_SUCCESS;
  }

  if (dst == nullptr || src == nullptr) {
    return C_ERROR;
  }

  cudaError_t cudaErr = cudaSetDevice(device->id);
  if (cudaErr != cudaSuccess) {
    return C_ERROR;
  }

  // Fix: forward the caller's stream. The original call omitted the stream
  // argument, so the copy was enqueued on the legacy default stream and was
  // not ordered with work on the device context's stream.
  cudaErr = cudaMemcpyAsync(dst,
                            src,
                            size,
                            cudaMemcpyDeviceToHost,
                            reinterpret_cast<cudaStream_t>(stream));
  if (cudaErr != cudaSuccess) {
    return C_ERROR;
  }

  return C_SUCCESS;
}

// Asynchronously copies `size` bytes between two device buffers on `device`,
// enqueued on `stream`.
// Returns C_SUCCESS on success (including the trivial size == 0 case) and
// C_ERROR on null pointers or any CUDA-API failure.
C_Status AsyncMemCpyD2D(const C_Device device,
                        C_Stream stream,
                        void* dst,
                        const void* src,
                        size_t size) {
  if (size == 0) {
    VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " "
            << size;  // NOLINT
    return C_SUCCESS;
  }

  if (dst == nullptr || src == nullptr) {
    return C_ERROR;
  }

  cudaError_t cudaErr = cudaSetDevice(device->id);
  if (cudaErr != cudaSuccess) {
    return C_ERROR;
  }

  // Fix: forward the caller's stream. The original call omitted the stream
  // argument, so the copy was enqueued on the legacy default stream and was
  // not ordered with work on the device context's stream.
  cudaErr = cudaMemcpyAsync(dst,
                            src,
                            size,
                            cudaMemcpyDeviceToDevice,
                            reinterpret_cast<cudaStream_t>(stream));
  if (cudaErr != cudaSuccess) {
    return C_ERROR;
  }
  VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " "
          << size;  // NOLINT
  return C_SUCCESS;
}

// Copies `src` into `dst`, routing through the right memcpy direction based on
// the source/destination allocation types (CPU <-> CUSTOM device).
// - `blocking`: when true, waits on the device context after enqueuing the copy.
// - `dst_place`: defaults to CustomPlace(); any non-CPU request is redirected
//   to dev_ctx's own place.
// Handles self-copy, aliased buffers, and resizing `dst` to match `src`.
template <typename Context>
inline void TensorCopy(const Context& dev_ctx,
                       const phi::DenseTensor& src,
                       bool blocking,
                       phi::DenseTensor* dst,
                       const phi::Place& dst_place = phi::CustomPlace()) {
  auto* src_ptr = src.data();
  const auto& src_place = src.place();
  if (src_ptr == nullptr) {
    return;
  }
  // Any non-CPU destination is normalized to the context's device place.
  auto dst_place_ = dst_place;
  if (dst_place_.GetType() != phi::AllocationType::CPU) {
    dst_place_ = dev_ctx.GetPlace();
  }

  if (&src == dst) {
    if (src_place == dst_place_) {
      VLOG(6) << "Skip copy the same data(" << src_ptr << ") from " << src_place
              << " to " << dst_place_;
    } else {
      VLOG(6) << "Src and dst are the same Tensor, in-place copy data("
              << src_ptr << ") from " << src_place << " to " << dst_place_;
      // Snapshot src so the recursive call does not read from the tensor it
      // is simultaneously overwriting.
      const phi::DenseTensor src_copy = src;
      TensorCopy(dev_ctx, src_copy, blocking, dst, dst_place_);
    }
    return;
  }

  auto dst_dims = dst->dims();
  dst->Resize(src.dims());
  void* dst_ptr = nullptr;
  if (dst_place_.GetType() != phi::AllocationType::CPU) {
    dst_ptr = dev_ctx.Alloc(dst, src.dtype());
  } else {
    dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
  }

  PADDLE_ENFORCE_EQ(
      dst->place(),
      dst_place_,
      phi::errors::Unavailable(
          "The Dst Tensor's place and dst_place do not match, Tensor's place "
          "place is %s, dst_place is %s.",
          dst->place(),
          dst_place_));

  // src and dst alias the same buffer in the same place.
  if (src_ptr == dst_ptr && src_place == dst_place_) {
    if ((dst_dims == src.dims()) || (src_place == phi::CPUPlace())) {
      VLOG(3) << "Skip copy the same data async from " << src_ptr << " in "
              << src_place << " to " << dst_ptr << " in " << dst_place_;
      return;
    } else {
      // scatter memory: shapes differ, so break the aliasing by giving dst a
      // fresh allocation before copying.
      phi::DenseTensor tmp_dst;
      tmp_dst.set_meta(dst->meta());
      tmp_dst.Resize(dst_dims);
      dst_ptr = dev_ctx.Alloc(&tmp_dst, tmp_dst.dtype());
      *dst = tmp_dst;
    }
  }
  VLOG(4) << "src:" << src_ptr << " place: " << src_place
          << " type:" << static_cast<int>(src_place.GetType())
          << ", dst:" << dst_ptr << " place: " << dst_place_
          << " type:" << static_cast<int>(dst_place_.GetType());

  C_Stream stream = reinterpret_cast<C_Stream>(dev_ctx.stream());

  // A 0-d tensor still carries one element.
  auto size =
      (src.dims().size() != 0 ? src.numel() : 1) * phi::SizeOf(src.dtype());
  // Fix: the hint previously read UNLIKELY(size) == 0, which applies the
  // branch prediction to `size` itself (marking the common nonzero case as
  // unlikely). Same truth value, correct hint placement:
  if (UNLIKELY(size == 0)) {
    return;
  }

  if (src_place.GetType() == phi::AllocationType::CPU &&
      dst_place_.GetType() == phi::AllocationType::CUSTOM) {
    VLOG(6) << "TensorCopy from cpu to cus";
    C_Device_st device;
    device.id = dst_place_.GetDeviceId();
    AsyncMemCpyH2D(&device, stream, dst_ptr, src_ptr, size);
    if (blocking) {
      dev_ctx.Wait();
    }
  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&
             dst_place_.GetType() == phi::AllocationType::CPU) {
    VLOG(6) << "TensorCopy from cus to cpu";
    C_Device_st device;
    device.id = src_place.GetDeviceId();
    AsyncMemCpyD2H(&device, stream, dst_ptr, src_ptr, size);
    if (blocking) {
      dev_ctx.Wait();
    }
  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&
             dst_place_.GetType() == phi::AllocationType::CUSTOM) {
    VLOG(6) << "TensorCopy from cus to cus";
    if (src_place.GetDeviceType() == dst_place_.GetDeviceType()) {
      if (src_place.GetDeviceId() == dst_place_.GetDeviceId()) {
        C_Device_st device;
        device.id = src_place.GetDeviceId();
        AsyncMemCpyD2D(&device, stream, dst_ptr, src_ptr, size);
        if (blocking) {
          dev_ctx.Wait();
        }
      } else {
        // Cross-device copy within the same device type is not implemented.
        PADDLE_THROW(
            phi::errors::Unimplemented("TensorCopy is not supported."));
      }
    } else {
      PADDLE_THROW(phi::errors::Unimplemented("TensorCopy is not supported."));
    }
  } else if (src_place.GetType() == phi::AllocationType::CPU &&
             dst_place_.GetType() == phi::AllocationType::CPU) {
    VLOG(6) << "TensorCopy from cpu to cpu";
    std::memcpy(dst_ptr, src_ptr, size);
  }
}

// Streams a human-readable dump of `tensor` to `os` as
// "DenseTensor<dtype, place, Shape(...), Strides(...), layout:..., data: [...]>",
// or "DenseTensor<NOT_INITED>" when the tensor is uninitialized.
// T must match tensor.dtype() — the caller (operator<< below) dispatches on
// the runtime dtype to pick T.
template <typename T = float>
std::ostream& PrintTensor(std::ostream& os, const phi::DenseTensor& tensor) {
  // Device-resident tensors are first copied to host so data<T>() is readable.
  phi::DenseTensor cpu_tensor;
  if (tensor.place().GetType() != phi::AllocationType::CPU) {
    auto dev_ctx = static_cast<const phi::CustomContext*>(
        phi::DeviceContextPool::Instance().Get(tensor.place()));
    TensorCopy(*dev_ctx, tensor, true, &cpu_tensor, phi::CPUPlace());
  } else {
    cpu_tensor = tensor;
  }
  os << "DenseTensor<";
  if (tensor.initialized()) {
    // Metadata comes from the original tensor; data comes from the host copy.
    os << phi::DataTypeToString(tensor.dtype()) << ", ";
    os << tensor.place() << ", ";
    os << "Shape(" << tensor.dims() << "), ";
    os << "Strides(" << tensor.strides() << "), ";
    os << "layout:" << tensor.layout() << ", ";
    os << "data: [";

    auto ptr = cpu_tensor.data<T>();
    auto element_num = cpu_tensor.numel();
    // Note: int8_t && uint8_t is typedef of char, ostream unable to print
    // properly — promote to int via signed() so values print as numbers,
    // not characters.
    if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
      if (element_num > 0) {
        os << signed(ptr[0]);
        for (int j = 1; j < element_num; ++j) {
          os << " " << signed(ptr[j]);
        }
      }
    } else {
      if (element_num > 0) {
        os << ptr[0];
        for (int j = 1; j < element_num; ++j) {
          os << " " << ptr[j];
        }
      }
    }
    os << "]";
  } else {
    os << "NOT_INITED";
  }
  os << ">";
  return os;
}
} // namespace

// X-macro list mapping each printable C++ type to its phi::DataType tag;
// used to generate one switch case per dtype in operator<< below.
#define FOR_EACH_DATA_TYPE_TO_PRINT(_) \
  _(bool, phi::DataType::BOOL)         \
  _(int8_t, phi::DataType::INT8)       \
  _(uint8_t, phi::DataType::UINT8)     \
  _(int16_t, phi::DataType::INT16)     \
  _(uint16_t, phi::DataType::UINT16)   \
  _(int32_t, phi::DataType::INT32)     \
  _(uint32_t, phi::DataType::UINT32)   \
  _(int64_t, phi::DataType::INT64)     \
  _(uint64_t, phi::DataType::UINT64)   \
  _(phi::bfloat16, phi::DataType::BFLOAT16) \
  _(phi::float16, phi::DataType::FLOAT16)   \
  _(float, phi::DataType::FLOAT32)          \
  _(double, phi::DataType::FLOAT64)

// Expands to one `case` that instantiates PrintTensor<cpp_type>.
#define CALL_PRINT_TENSOR(cpp_type, data_type) \
  case data_type:                              \
    PrintTensor<cpp_type>(os, t);              \
    break;

// Public entry point (declared in common/utils.h): dispatches on the runtime
// dtype to the matching PrintTensor<T> instantiation. Unrecognized dtypes are
// logged (VLOG(1)) and the stream is returned unchanged.
std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
  switch (t.dtype()) {
    FOR_EACH_DATA_TYPE_TO_PRINT(CALL_PRINT_TENSOR)
    default:
      VLOG(1) << "PrintTensor unrecognized data type:" << t.dtype();
  }
  return os;
}
#undef FOR_EACH_DATA_TYPE_TO_PRINT
#undef CALL_PRINT_TENSOR
} // namespace phi
28 changes: 28 additions & 0 deletions backends/metax_gpu/common/utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"

namespace phi {
// Pretty-prints a DenseTensor (dtype, place, shape, strides, layout, and
// element data) to `os`. Implemented in common/utils.cc; device-resident
// tensors are copied to host before their data is formatted.
std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t);
}