Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,10 @@ void ApplyCinnPass(::pir::Program* program,
.file_name("original_programs.py")
.dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims)
.SaveIfFlagEnabled();
ApplyShapeOptimizationPass(program, CreatePassManager);
ApplyPdToCinnPass(program, CreatePassManager);
// TODO(Hongqing-work): move ApplyShapeOptimizationPass before
// ApplyPdToCinnPass after fixing infer shape bug.
ApplyShapeOptimizationPass(program, CreatePassManager);
ApplyCinnPreprocessPass(program, CreatePassManager);
ApplyBuildGroupOpPass(program, CreatePassManager);
PirToPyCodeConverter(program)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "paddle/fluid/framework/feed_hook.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h"
#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h"
#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
Expand Down Expand Up @@ -75,23 +76,34 @@ void VisitFeedName(const pir::Program& program,
const DoEachFeadNameT& DoEachFeadName) {
auto module_op = program.module_op();
const auto& block = module_op.block();
const auto& IsDataOp = [](const pir::Operation& op) -> bool {
return op.isa<paddle::dialect::DataOp>();
};
const auto& GetDataOpName = [](const pir::Operation& op) -> std::string {
auto GetDataOpName =
[](const pir::Operation& op) -> std::optional<std::string> {
if (!op.isa<paddle::dialect::DataOp>()) return std::nullopt;
return op.attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
};
const auto& IsFeedOp = [](const pir::Operation& op) -> bool {
return op.isa<paddle::dialect::FeedOp>();
auto GetFeedOpName =
[](const pir::Operation& op) -> std::optional<std::string> {
if (!op.isa<paddle::dialect::FeedOp>()) return std::nullopt;
return op.attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
};
const auto& GetFeedOpName = [](const pir::Operation& op) -> std::string {
auto GetPhiFeedOpName =
[](const pir::Operation& op) -> std::optional<std::string> {
if (!op.isa<paddle::dialect::PhiKernelOp>()) return std::nullopt;
const auto& attributes = op.attributes();
const auto& op_name_it = attributes.find("op_name");
if (op_name_it == attributes.end()) return std::nullopt;
const auto& op_name =
op_name_it->second.dyn_cast<pir::StrAttribute>().AsString();
if (op_name != "pd_op.feed") return std::nullopt;
return op.attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
};
for (const auto& op : block) {
if (IsDataOp(op)) {
DoEachFeadName(GetDataOpName(op));
} else if (IsFeedOp(op)) {
DoEachFeadName(GetFeedOpName(op));
if (const auto& name = GetDataOpName(op)) {
DoEachFeadName(name.value());
} else if (const auto& name = GetFeedOpName(op)) {
DoEachFeadName(name.value());
} else if (const auto& name = GetPhiFeedOpName(op)) {
DoEachFeadName(name.value());
} else {
// Do nothing.
}
Expand Down Expand Up @@ -1431,34 +1443,48 @@ std::optional<pir::ShapeConstraintIRAnalysis*> GetNullShapeAnalysis(
return std::nullopt;
}

// Truncates the file at `file_path` at most once per process lifetime, so
// that subsequent append-mode logging starts from an empty file. Does
// nothing unless FLAGS_logging_trunc_pir_py_code is enabled.
// NOTE(review): "Loggin" looks like a typo for "Logging"; renaming would
// require updating the caller, so the name is left unchanged here.
void TryTruncateLogginFile(const std::string& file_path) {
if (!FLAGS_logging_trunc_pir_py_code) return;
// The mutex serializes access to the `once_flags` map; std::call_once then
// guarantees the truncation body runs at most once per distinct path.
static std::mutex mutex;
std::unique_lock<std::mutex> lock(mutex);
static std::unordered_map<std::string, std::once_flag> once_flags;
std::call_once(once_flags[file_path], [&] {
// Opening with std::ios::trunc discards any existing file content.
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::trunc);
ofs.close();
});
}

} // namespace

void PirToPyCodeConverter::SaveIfFlagEnabled() const {
if (program_ == nullptr) return;
if (file_name_.empty()) return;
if (FLAGS_logging_pir_py_code_dir == "") return;
if (FLAGS_logging_pir_py_code_dir.empty()) return;
const std::string file_path =
FLAGS_logging_pir_py_code_dir + "/" + file_name_;
ShapeAnalysisGetterT ShapeAnalysisGetter =
(dump_symbolic_shape_ ? GetShapeAnalysisFromManager
: GetNullShapeAnalysis);
PirToPyCodeConverterHelper converter_helper(program_, ShapeAnalysisGetter);
const std::string content = converter_helper.Convert();
static std::mutex mutex;
std::unique_lock<std::mutex> lock(mutex);
if (FLAGS_logging_trunc_pir_py_code) {
static std::unordered_map<std::string, std::once_flag> once_flags;
std::call_once(once_flags[file_path], [&] {
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::trunc);
ofs.close();
});
}
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::app);
if (!ofs.is_open()) return;
ofs << content << std::endl;
ofs.close();
TryTruncateLogginFile(file_path);
const auto MutOnceFlag = [&]() -> std::once_flag* {
static std::mutex mutex;
std::unique_lock<std::mutex> lock(mutex);
using FileName = std::string;
using FileName2OnceFlag = std::unordered_map<FileName, std::once_flag>;
using ProgramId = int64_t;
static std::unordered_map<ProgramId, FileName2OnceFlag> once_flags;
return &once_flags[program_->id()][file_name_];
};
std::call_once(*MutOnceFlag(), [&] {
ShapeAnalysisGetterT ShapeAnalysisGetter =
(dump_symbolic_shape_ ? GetShapeAnalysisFromManager
: GetNullShapeAnalysis);
PirToPyCodeConverterHelper converter_helper(program_, ShapeAnalysisGetter);
const std::string content = converter_helper.Convert();
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::app);
if (!ofs.is_open()) return;
ofs << content << std::endl;
ofs.close();
});
}

void DumpExecProgram(const pir::Program& program,
Expand Down
12 changes: 12 additions & 0 deletions paddle/common/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1758,6 +1758,18 @@ PHI_DEFINE_EXPORTED_string(
"If default, "
"dlopen will search mkl from LD_LIBRARY_PATH");

/**
 * Whether to apply a global algorithm search in blaslt FLAG
 * Name: enable_blaslt_global_search
 * Since Version: 3.0.0
 * Value Range: bool, default=false
 * Example: FLAGS_enable_blaslt_global_search=true enables the global search.
 * Note: If true, a global search is applied when selecting blaslt algorithms.
 */
PHI_DEFINE_EXPORTED_bool(enable_blaslt_global_search,
false,
"Whether to use global search in blaslt.");

PHI_DEFINE_EXPORTED_string(op_dir, // NOLINT
"",
"Specify path for loading user-defined op library.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,24 @@ inline bool IsInterpretercoreFastGCEnabled() {
// When using cuda graph, fast GC must be used. Because
// `EventQuery` method in event GC cannot be used in
// cuda graph.
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == true &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == true,
false,
platform::errors::InvalidArgument(
"StreamSafeAllocator and AsyncAllocator shouldn't be "
"True together."));
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == false &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == false &&
FLAGS_new_executor_use_cuda_graph,
false,
platform::errors::InvalidArgument(
"When FLAGS_new_executor_use_cuda_graph is true, "
"IsStreamSafeCUDAAllocatorUsed must be true, but "
"Either IsStreamSafeCUDAAllocatorUsed or "
"IsCUDAMallocAsyncAllocatorUsed must be true, but "
"got false."));
return (memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() &&
Expand Down
5 changes: 5 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
#include "paddle/pir/include/pass/pass_registry.h"

COMMON_DECLARE_bool(pir_apply_inplace_pass);
COMMON_DECLARE_bool(enable_pir_api);

namespace paddle {
namespace {
Expand Down Expand Up @@ -390,6 +391,10 @@ AnalysisPredictor::AnalysisPredictor(const AnalysisConfig &config)
if (config_.shape_range_info_collected()) {
config_.SwitchIrOptim(false);
}
if (FLAGS_enable_pir_api) {
config_.EnableNewExecutor(true);
config_.EnableNewIR(true);
}
if (config_.new_executor_enabled()) {
config_.EnableMemoryOptim(false);
if (config_.new_ir_enabled()) {
Expand Down
10 changes: 10 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,11 @@ class AllocatorFacadePrivate {
// application, treating it separately can avoid lots of overhead of
// acquiring default stream and applying read-write lock.
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA "
"managed memory."));
WrapCUDAMallocAsyncAllocatorForDefault();
is_cuda_malloc_async_allocator_used_ = true;
} else {
Expand Down Expand Up @@ -871,6 +876,11 @@ class AllocatorFacadePrivate {
"the allocator strategy %d is unsupported for multi-stream",
static_cast<int>(strategy_)));
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(
FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA managed memory."));
VLOG(8) << "[CUDAMallocAsyncAllocator] Init CUDA allocator for stream "
<< stream << " in place " << p;
InitCUDAMallocAsyncAllocator(p, stream);
Expand Down
17 changes: 17 additions & 0 deletions paddle/fluid/operators/save_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,23 @@ PD_REGISTER_KERNEL(save,
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}

// Registers the `save` kernel for the XPU backend over the listed dtypes.
// InputAt(0) is marked ALL_BACKEND so the tensor to be saved is accepted
// regardless of the backend it currently resides on.
#ifdef PADDLE_WITH_XPU
PD_REGISTER_KERNEL(save,
XPU,
ALL_LAYOUT,
ops::SaveKernel,
float,
double,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif

PD_REGISTER_KERNEL(save_sr,
CPU,
ALL_LAYOUT,
Expand Down
16 changes: 4 additions & 12 deletions paddle/fluid/pir/dialect/operator/ir/manual_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,27 +252,19 @@ bool AddNOpInferSymbolicShape(pir::Operation *op,
"should be larger than 0. But received X's dimensions %d.",
inputs_shape.size()));
symbol::TensorShapeOrDataDimExprs candidate_shape = inputs_shape.front();
size_t candidate_idx = 0;
for (size_t i = 1; i < inputs_shape.size(); ++i) {
// 0D tensor
if (inputs_shape[i].shape().size() == 0) {
continue;
}
if (candidate_shape.shape().size() == 0) {
candidate_shape = inputs_shape[i];
candidate_idx = i;
continue;
}
PADDLE_ENFORCE_EQ(candidate_shape,
inputs_shape[i],
common::errors::InvalidArgument(
"The input tensor X of AddNOp must"
" have same shape. But received X[%d]'s shape = "
"[%s], X[%d]'s shape = [%s].",
candidate_idx,
candidate_shape,
i,
inputs_shape[i]));
for (size_t j = 0; j < candidate_shape.shape().size(); ++j) {
infer_context->AddEqualCstr(candidate_shape.shape()[j],
inputs_shape[i].shape()[j]);
}
}
infer_context->SetShapeOrDataForValue(
op->result(0), symbol::ShapeOrDataDimExprs{candidate_shape});
Expand Down
9 changes: 9 additions & 0 deletions paddle/phi/backends/xpu/xpu2_op_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,15 @@ XPUOpMap& get_kl2_ops() {
{"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"save",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT64,
phi::DataType::INT32,
phi::DataType::UINT8,
phi::DataType::INT8,
phi::DataType::INT64,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16})},
{"scale",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
Expand Down
13 changes: 12 additions & 1 deletion paddle/phi/backends/xpu/xpu3_op_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,9 @@ XPUOpMap& get_kl3_ops() {
phi::DataType::BFLOAT16,
phi::DataType::FLOAT16})},
{"mean_grad",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16})},
{"mean",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
Expand Down Expand Up @@ -828,6 +830,15 @@ XPUOpMap& get_kl3_ops() {
{"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"save",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT64,
phi::DataType::INT32,
phi::DataType::UINT8,
phi::DataType::INT8,
phi::DataType::INT64,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16})},
{"scale",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
Expand Down
7 changes: 6 additions & 1 deletion paddle/phi/kernels/activation_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ DECLARE_ACTIVATION_KERNEL(Log)
DECLARE_ACTIVATION_KERNEL(Log2)
DECLARE_ACTIVATION_KERNEL(Log10)
DECLARE_ACTIVATION_KERNEL(Log1p)
DECLARE_ACTIVATION_KERNEL(Round)
DECLARE_ACTIVATION_KERNEL(Floor)
DECLARE_ACTIVATION_KERNEL(Ceil)
DECLARE_ACTIVATION_KERNEL(Negative)
Expand Down Expand Up @@ -98,6 +97,12 @@ void Relu6Kernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out);

// Rounds each element of `x` to `decimals` decimal places, writing the
// result to `out`. Declared here with an explicit `decimals` parameter
// instead of via DECLARE_ACTIVATION_KERNEL, which only supports
// attribute-free activations.
template <typename T, typename Context>
void RoundKernel(const Context& dev_ctx,
const DenseTensor& x,
const int decimals,
DenseTensor* out);

template <typename T, typename Context>
void SwishKernel(const Context& dev_ctx,
const DenseTensor& x,
Expand Down
14 changes: 13 additions & 1 deletion paddle/phi/kernels/cpu/activation_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Softsign, SoftsignFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Round, RoundFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Negative, NegativeFunctor)
Expand Down Expand Up @@ -161,6 +160,19 @@ void Relu6Kernel(const Context& dev_ctx,
ActivationImpl<T, T, Context, funcs::Relu6Functor<T>>(
dev_ctx, x, out, functor);
}

template <typename T, typename Context>
void RoundKernel(const Context& dev_ctx,
const DenseTensor& x,
const int decimals,
DenseTensor* out) {
funcs::RoundFunctor<T> functor;
auto attrs = functor.GetAttrs();
*(attrs[0].second) = decimals;
ActivationImpl<T, T, Context, funcs::RoundFunctor<T>>(
dev_ctx, x, out, functor);
}

} // namespace phi
PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}

Expand Down
Loading