diff --git a/lib/api/blas/clBLAS.cpp b/lib/api/blas/clBLAS.cpp index 843d2e846740..fa7935881346 100644 --- a/lib/api/blas/clBLAS.cpp +++ b/lib/api/blas/clBLAS.cpp @@ -92,6 +92,7 @@ extern "C" cl_uint numCommandQueues, cl_command_queue *commandQueues,\ cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)\ {\ + if(incx < 0) return clblasSuccess;\ sc::array x((sc::int_t)N, TYPE_ISAAC, sc::driver::Buffer(mx,false), (sc::int_t)offx, incx);\ execute(sc::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\ return clblasSuccess;\ @@ -143,6 +144,11 @@ extern "C" cl_mem /*scratchBuff*/, cl_uint numCommandQueues, cl_command_queue *commandQueues,\ cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)\ {\ + if(incx <= 0) {\ + sc::array sum((sc::int_t)offAsum + 1, TYPE_ISAAC, sc::driver::Buffer(asum, false), (sc::int_t)offAsum, 1);\ + sum[0] = 0;\ + return clblasSuccess;\ + }\ sc::array x((sc::int_t)N, TYPE_ISAAC, sc::driver::Buffer(mx, false), (sc::int_t)offx, incx);\ sc::scalar s(TYPE_ISAAC, sc::driver::Buffer(asum, false), (sc::int_t)offAsum);\ execute(sc::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\ @@ -167,7 +173,7 @@ extern "C" {\ if(order==clblasRowMajor){\ std::swap(M, N);\ - transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\ + transA = (transA==clblasTrans || transA==clblasConjTrans)?clblasNoTrans:clblasTrans;\ }\ sc::array A((sc::int_t)M, (sc::int_t)N, TYPE_ISAAC, sc::driver::Buffer(mA, false), (sc::int_t)offA, (sc::int_t)lda);\ \ @@ -177,7 +183,7 @@ extern "C" sc::array y(sy, TYPE_ISAAC, sc::driver::Buffer(my, false), (sc::int_t)offy, incy);\ \ sc::driver::Context const & context = A.context();\ - if(transA==clblasTrans)\ + if(transA==clblasTrans || transA==clblasConjTrans)\ execute(sc::assign(y, alpha*dot(A.T, x) + beta*y), context, numCommandQueues, commandQueues, 
numEventsInWaitList, eventWaitList, events);\ else\ execute(sc::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\ @@ -226,11 +232,11 @@ extern "C" sc::array C((sc::int_t)M, (sc::int_t)N, TYPE_ISAAC, sc::driver::Buffer(mC, false), (sc::int_t)offC, (sc::int_t)ldc);\ sc::driver::Context const & context = C.context();\ /*Operation*/\ - if((transA==clblasTrans) && (transB==clblasTrans))\ + if((transA==clblasTrans || transA==clblasConjTrans) && (transB==clblasTrans || transB==clblasConjTrans))\ execute(sc::assign(C, alpha*dot(A.T, B.T) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\ - else if((transA==clblasTrans) && (transB==clblasNoTrans))\ + else if((transA==clblasTrans || transA==clblasConjTrans) && (transB==clblasNoTrans))\ execute(sc::assign(C, alpha*dot(A.T, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\ - else if((transA==clblasNoTrans) && (transB==clblasTrans))\ + else if((transA==clblasNoTrans) && (transB==clblasTrans || transB==clblasConjTrans))\ execute(sc::assign(C, alpha*dot(A, B.T) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\ else\ execute(sc::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\ diff --git a/lib/jit/generation/elementwise_1d.cpp b/lib/jit/generation/elementwise_1d.cpp index aadc3df346d5..d58646ce3646 100644 --- a/lib/jit/generation/elementwise_1d.cpp +++ b/lib/jit/generation/elementwise_1d.cpp @@ -30,6 +30,7 @@ #include "tools/vector_types.hpp" #include "tools/arguments.hpp" + #include namespace isaac @@ -71,7 +72,7 @@ std::string elementwise_1d::generate_impl(std::string const & suffix, expression stream << "{" << std::endl; stream.inc_tab(); } - + stream << tools::join(negative_inc_process(device, symbols, tree), " ") << std::endl; 
element_wise_loop_1D(stream, vwidth_, "i", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", [&](unsigned int vwidth) { std::string dtype = append_width("#scalartype",vwidth); @@ -83,27 +84,25 @@ std::string elementwise_1d::generate_impl(std::string const & suffix, expression //Load to registers for(symbolic::leaf* sym: symbolic::extract(tree, symbols, assignments_rhs, false)) stream << sym->process(dtype + " #name = " + append_width("loadv", vwidth) + "(i);") << std::endl; - //Compute for(size_t idx: assignments) for(unsigned int s = 0 ; s < vwidth ; ++s) - stream << symbols.at(idx)->evaluate({{"leaf", access_vector_type("#name", s, vwidth)}}) << ";" << std::endl; + stream << symbols.at(idx)->evaluate({{"leaf", access_vector_type("#name", s, vwidth)}}) << ";" << std::endl; //Writes back for(symbolic::leaf* sym: symbolic::extract(tree, symbols, assignments_lhs, false)) for(unsigned int s = 0 ; s < vwidth ; ++s) - stream << sym->process("at(i+" + tools::to_string(s)+") = " + access_vector_type("#name", s, vwidth) + ";") << std::endl; + stream << sym->process("at(i+" + tools::to_string(s)+") = " + access_vector_type("#name", s, vwidth) + ";") << std::endl; }); //Close user-provided for-loops if(sfors.size()){ stream.dec_tab(); stream << "}" << std::endl; } - stream.dec_tab(); stream << "}" << std::endl; -// std::cout << stream.str() << std::endl; + // std::cout << stream.str() << std::endl; return stream.str(); } diff --git a/lib/jit/generation/elementwise_2d.cpp b/lib/jit/generation/elementwise_2d.cpp index 041b60b3fa27..6ad36b588c01 100644 --- a/lib/jit/generation/elementwise_2d.cpp +++ b/lib/jit/generation/elementwise_2d.cpp @@ -68,6 +68,7 @@ std::string elementwise_2d::generate_impl(std::string const & suffix, expression stream << "{" << std::endl; stream.inc_tab(); + stream << tools::join(negative_inc_process(device, symbols, tree), " ") << std::endl; element_wise_loop_1D(stream, 1, "i", "M", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", [&](unsigned int){ 
element_wise_loop_1D(stream, 1, "j", "N", "$GLOBAL_IDX_1", "$GLOBAL_SIZE_1", [&](unsigned int){ //Declares register to store results diff --git a/lib/jit/generation/reduce_1d.cpp b/lib/jit/generation/reduce_1d.cpp index 0052e9c24ee9..45f8a97963c7 100644 --- a/lib/jit/generation/reduce_1d.cpp +++ b/lib/jit/generation/reduce_1d.cpp @@ -143,6 +143,7 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree stream << rd->process("#scalartype #name_acc = " + neutral_element(rd->op(), backend, "#scalartype") + ";") << std::endl; } } + stream << tools::join(negative_inc_process(device, symbols, tree), " ") << std::endl; element_wise_loop_1D(stream, vwidth_, "i", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", [&](unsigned int vwidth) { std::string dtype = append_width("#scalartype",vwidth); @@ -195,6 +196,7 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree stream << "{" << std::endl; stream.inc_tab(); unroll_tmp(); + stream << tools::join(negative_inc_process(device, symbols, tree), " ") << std::endl; //Declarations stream << "unsigned int lid = $LOCAL_IDX_0;" << std::endl; stream << "unsigned int lsize = $LOCAL_SIZE_0;" << std::endl; @@ -210,7 +212,7 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree else { stream << rd->process("$LOCAL #scalartype #name_buf[" + tools::to_string(ls0_) + "];") << std::endl; - stream << rd->process("#scalartype #name_acc = " + neutral_element(rd->op(), backend, "#scalartype") + ";"); + stream << rd->process("#scalartype #name_acc = " + neutral_element(rd->op(), backend, "#scalartype") + ";"); } } //Private reduction diff --git a/lib/jit/generation/reduce_2d.cpp b/lib/jit/generation/reduce_2d.cpp index 6188415e2c79..c077e32c0c5b 100644 --- a/lib/jit/generation/reduce_2d.cpp +++ b/lib/jit/generation/reduce_2d.cpp @@ -114,6 +114,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree std::ostringstream upper; upper << "(M +" << 
ls1_ - 1 << ")/" << ls1_ << "*" << ls1_; + stream << tools::join(reduce_2d_negative_inc_process(device, symbols, tree), " ") << std::endl; element_wise_loop_1D(stream, (reduction_type_==REDUCE_ROWS)?1:1, "r", upper.str(), "$GLOBAL_IDX_1", "$GLOBAL_SIZE_1", [&](unsigned int cwidth) { //Declare Buffers @@ -212,6 +213,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree stream << "{" << std::endl; stream.inc_tab(); unroll_tmp(); + stream << tools::join(reduce_2d_negative_inc_process(device, symbols, tree), " ") << std::endl; for (symbolic::reduce_2d* rd : reductions) stream << rd->process("$LOCAL #scalartype #name_buf[" + to_string(ls1_*ldls) + "];") << std::endl; stream << "for($SIZE_T r = $GLOBAL_IDX_1; r < (M +" << ls1_ - 1 << ")/" << ls1_ << "*" << ls1_ << "; r += " << GlobalSize1(backend) << "){" << std::endl; @@ -265,7 +267,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree stream << "}" << std::endl; } -// std::cout << stream.str() << std::endl; + // std::cout << stream.str() << std::endl; return stream.str(); } diff --git a/lib/jit/generation/tools/arguments.hpp b/lib/jit/generation/tools/arguments.hpp index 9a171c652abf..ffbbe22b9185 100644 --- a/lib/jit/generation/tools/arguments.hpp +++ b/lib/jit/generation/tools/arguments.hpp @@ -44,7 +44,11 @@ inline std::vector kernel_arguments(driver::Device const &, symboli result.push_back(sym->process("#scalartype #name_value")); if(symbolic::buffer* sym = dynamic_cast(obj)) { - result.push_back("$GLOBAL " + sym->process("#scalartype* #pointer")); + std::string pointer_name = sym->process("#scalartype* #pointer"); + if(sym->hasattr("inc0") && !sym->hasattr("inc1")) + result.push_back("$GLOBAL " + pointer_name+"_bk"); + else + result.push_back("$GLOBAL " + pointer_name); if(sym->hasattr("off")) result.push_back("$SIZE_T " + sym->process("#off")); if(sym->hasattr("inc0")) result.push_back("$SIZE_T " + sym->process("#inc0")); if(sym->hasattr("inc1")) 
result.push_back("$SIZE_T " + sym->process("#inc1")); @@ -59,5 +63,62 @@ inline std::vector kernel_arguments(driver::Device const &, symboli }
+// Builds the kernel-source prologue for one buffer argument that may be walked
+// with a negative increment. `pointer_decl` is the expanded "#scalartype* #pointer"
+// text; it is split at its first space into element type and pointer name.
+// The emitted text declares a pointer initialised from the "<name>_bk" kernel
+// argument and, guarded by `inc0 < 0`, advances it by (1-extent)*inc0.
+// "$GLOBAL" is emitted rather than a literal "__global" so the declaration uses
+// the same qualifier keyword as the argument list built by kernel_arguments().
+// NOTE(review): assumes "$GLOBAL" is expanded in kernel bodies the same way as in
+// the argument list - confirm against the keyword table.
+inline std::string negative_inc_prologue(std::string const & pointer_decl, std::string const & inc0, std::string const & extent)
+{
+  std::string::size_type const split = pointer_decl.find(' ');
+  std::string const type = pointer_decl.substr(0, split);
+  // When no space is found, split+1 wraps to 0 and the whole text is used as the name.
+  std::string const name = pointer_decl.substr(split + 1);
+  return "$GLOBAL " + type + " " + name + " = " + name + "_bk;\n"
+         " if(" + inc0 + " < 0)\n"
+         " " + name + " += (1-" + extent + ") * " + inc0 + ";\n";
+}
+
+// Returns one prologue per buffer symbol of `expressions` that has an "inc0"
+// attribute but no "inc1" attribute; the extent used for the rebase is "N".
+inline std::vector<std::string> negative_inc_process(driver::Device const &, symbolic::symbols_table const & symbols, expression_tree const & expressions)
+{
+  std::vector<std::string> result;
+  for(symbolic::object* obj: symbolic::extract(expressions, symbols))
+  {
+    symbolic::buffer* sym = dynamic_cast<symbolic::buffer*>(obj);
+    if(!sym || !sym->hasattr("inc0") || sym->hasattr("inc1"))
+      continue;
+    std::string const pointer = sym->process("#scalartype* #pointer");
+    std::string const inc0 = sym->process("#inc0");
+    result.push_back(negative_inc_prologue(pointer, inc0, "N"));
+  }
+  return result;
+}
+
+// Same as negative_inc_process(), except that the extent is "M" unless the
+// expanded pointer text contains "obj3", in which case it is "N".
+// NOTE(review): the "obj3" substring test also matches names such as "obj30";
+// presumably it singles out one operand of the reduction - verify against callers.
+inline std::vector<std::string> reduce_2d_negative_inc_process(driver::Device const &, symbolic::symbols_table const & symbols, expression_tree const & expressions)
+{
+  std::vector<std::string> result;
+  for(symbolic::object* obj: symbolic::extract(expressions, symbols))
+  {
+    symbolic::buffer* sym = dynamic_cast<symbolic::buffer*>(obj);
+    if(!sym || !sym->hasattr("inc0") || sym->hasattr("inc1"))
+      continue;
+    std::string const pointer = sym->process("#scalartype* #pointer");
+    std::string const inc0 = sym->process("#inc0");
+    std::string const extent = (pointer.find("obj3") == std::string::npos) ? "M" : "N";
+    result.push_back(negative_inc_prologue(pointer, inc0, extent));
+  }
+  return result;
+}
+
 } } diff --git a/lib/jit/generation/tools/loop.hpp b/lib/jit/generation/tools/loop.hpp index 5a4945411946..137012e4f60a 100644 --- a/lib/jit/generation/tools/loop.hpp +++ b/lib/jit/generation/tools/loop.hpp @@ -38,7 +38,7 @@ inline void element_wise_loop_1D(kernel_generation_stream & stream, unsigned int std::string init = domain_id + "*" + svwidth; std::string lbound = bound + "/" + svwidth + "*" + svwidth; std::string inc = domain_size + "*" + svwidth; - stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << lbound << "; " << i << " += " << inc << ")" << std::endl; + stream << "for(int " << i << " = " << init << "; " << i << " < " << lbound << "; " << i << " += " << inc << ")" << std::endl; stream << "{" << std::endl; stream.inc_tab(); generate_body(vwidth); @@ -47,7 +47,7 @@ inline void element_wise_loop_1D(kernel_generation_stream & stream, unsigned int if (vwidth>1) { - stream << "for(unsigned int " << i << " = " << lbound << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl; + stream << "for(int " << i << " = " << lbound << " + " << domain_id << "; " << i << " < " << bound << "; "
<< i << " += " + domain_size + ")" << std::endl; stream << "{" << std::endl; stream.inc_tab(); generate_body(1); diff --git a/lib/jit/syntax/engine/object.cpp b/lib/jit/syntax/engine/object.cpp index 7b6239529ca3..640230516334 100644 --- a/lib/jit/syntax/engine/object.cpp +++ b/lib/jit/syntax/engine/object.cpp @@ -253,7 +253,10 @@ buffer::buffer(driver::Context const & context, std::string const & scalartype, macros_.insert(make_broadcast(shape)); add_base("buffer"); - add_load(strides[0]==1 && shape[0]>1); +// add_load(strides[0]==1 && shape[0]>1); +// stride==1 would result in "vloadn"'s use in kernel,if a kernel is generated with stride==1, +// it can't run samples whose stride<0,so vloadn can't be used in kernels. + add_load(false); } //