intel · bader · Apr 20, 2020 · Mar 26, 2020 · Apr 15, 2020 · Apr 15, 2020
diff --git a/sycl/test/inline-asm/asm_16_empty.cpp b/sycl/test/inline-asm/asm_16_empty.cpp
@@ -0,0 +1,48 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Kernel with a required sub-group size of 16 that stores 43 into every
+// element of the output buffer. When built with -DINLINE_ASM, the device code
+// additionally contains an empty inline-asm statement.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Command-group function: requests write access to the output buffer and
+  // enqueues the kernel over a 1-D range of getOutputBufferSize() work-items.
+  void operator()(cl::sycl::handler &cgh) {
+    constexpr auto WriteMode = cl::sycl::access::mode::write;
+    auto Out = this->getOutputBuffer().template get_access<WriteMode>(cgh);
+    const cl::sycl::range<1> NumWorkItems{this->getOutputBufferSize()};
+    cgh.parallel_for<KernelFunctor<T>>(
+        NumWorkItems,
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          Out[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+// Exits with 0 when launchInlineASMTest() reports false (nothing is verified
+// in that case) or when verify_all_the_same() accepts the output against 43;
+// exits with 1 otherwise.
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  return verify_all_the_same(functor.getOutputBufferData(), 43) ? 0 : 1;
+}
diff --git a/sycl/test/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/inline-asm/asm_16_matrix_mult.cpp
@@ -0,0 +1,56 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// NOTE(review): despite the file name, this kernel performs no matrix
+// multiplication; it only stores the constant 7 into each output element.
+// Confirm whether a different kernel body was intended here.
+//
+// Kernel with a required sub-group size of 16. With -DINLINE_ASM the device
+// code produces the value through an inline-asm `mov` of the immediate 0x7;
+// otherwise a plain assignment of 7 is used.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Command-group function: requests write access to the output buffer and
+  // enqueues the kernel over a 1-D range of getOutputBufferSize() work-items.
+  void operator()(cl::sycl::handler &cgh) {
+    constexpr auto WriteMode = cl::sycl::access::mode::write;
+    auto Out = this->getOutputBuffer().template get_access<WriteMode>(cgh);
+    const cl::sycl::range<1> NumWorkItems{this->getOutputBufferSize()};
+    cgh.parallel_for<KernelFunctor<T>>(
+        NumWorkItems,
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int value = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(value));
+#else
+          value = 7;
+#endif
+          Out[wiID] = value;
+        });
+  }
+};
+
+// Exits with 0 when launchInlineASMTest() reports false (nothing is verified
+// in that case) or when verify_all_the_same() accepts the output against 7;
+// exits with 1 otherwise.
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  return verify_all_the_same(functor.getOutputBufferData(), 7) ? 0 : 1;
+}
diff --git a/sycl/test/inline-asm/asm_16_no_input_int.cpp b/sycl/test/inline-asm/asm_16_no_input_int.cpp
@@ -0,0 +1,52 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Kernel with a required sub-group size of 16 and an inline-asm statement
+// that has an output operand but no inputs. With -DINLINE_ASM the device code
+// sets the value with a `mov` of the immediate 0x7; otherwise it assigns 7.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Command-group function: requests write access to the output buffer and
+  // enqueues the kernel over a 1-D range of getOutputBufferSize() work-items.
+  void operator()(cl::sycl::handler &cgh) {
+    constexpr auto WriteMode = cl::sycl::access::mode::write;
+    auto Out = this->getOutputBuffer().template get_access<WriteMode>(cgh);
+    const cl::sycl::range<1> NumWorkItems{this->getOutputBufferSize()};
+    cgh.parallel_for<KernelFunctor<T>>(
+        NumWorkItems,
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int value = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(value));
+#else
+          value = 7;
+#endif
+          Out[wiID] = value;
+        });
+  }
+};
+
+// Exits with 0 when launchInlineASMTest() reports false (nothing is verified
+// in that case) or when verify_all_the_same() accepts the output against 7;
+// exits with 1 otherwise.
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  return verify_all_the_same(functor.getOutputBufferData(), 7) ? 0 : 1;
+}
diff --git a/sycl/test/inline-asm/asm_16_no_opts.cpp b/sycl/test/inline-asm/asm_16_no_opts.cpp
@@ -0,0 +1,53 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Kernel with a required sub-group size of 16 that adds 0..9 to each output
+// element, one addition per loop iteration. With -DINLINE_ASM the device code
+// issues an inline-asm `fence_sw` (written without operands or `volatile`)
+// before every addition.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    // read_write rather than write: `+=` below reads the element before
+    // storing it, which a write-only accessor does not permit.
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::read_write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          for (int i = 0; i < 10; ++i) {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+            asm("fence_sw");
+#endif
+            C[wiID] += i;
+          }
+        });
+  }
+};
+
+// Exits with 0 when launchInlineASMTest() reports false (nothing is verified
+// in that case) or when verify_all_the_same() accepts the output against 45,
+// i.e. 0 + 1 + ... + 9; exits with 1 otherwise.
+// NOTE(review): 45 presumes the output elements start at 0 -- confirm in
+// include/asmhelper.h.
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 45))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/inline-asm/asm_8_empty.cpp b/sycl/test/inline-asm/asm_8_empty.cpp
@@ -0,0 +1,48 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Kernel with a required sub-group size of 8 that stores 43 into every
+// element of the output buffer. When built with -DINLINE_ASM, the device code
+// additionally contains an empty inline-asm statement.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Command-group function: requests write access to the output buffer and
+  // enqueues the kernel over a 1-D range of getOutputBufferSize() work-items.
+  void operator()(cl::sycl::handler &cgh) {
+    constexpr auto WriteMode = cl::sycl::access::mode::write;
+    auto Out = this->getOutputBuffer().template get_access<WriteMode>(cgh);
+    const cl::sycl::range<1> NumWorkItems{this->getOutputBufferSize()};
+    cgh.parallel_for<KernelFunctor<T>>(
+        NumWorkItems,
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          Out[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+// Exits with 0 when launchInlineASMTest() reports false (nothing is verified
+// in that case) or when verify_all_the_same() accepts the output against 43;
+// exits with 1 otherwise.
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  return verify_all_the_same(functor.getOutputBufferData(), 43) ? 0 : 1;
+}
diff --git a/sycl/test/inline-asm/asm_8_no_input_int.cpp b/sycl/test/inline-asm/asm_8_no_input_int.cpp
@@ -0,0 +1,52 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Kernel with a required sub-group size of 8 and an inline-asm statement
+// that has an output operand but no inputs. With -DINLINE_ASM the device code
+// sets the value with a `mov` of the immediate 0x7; otherwise it assigns 7.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Command-group function: requests write access to the output buffer and
+  // enqueues the kernel over a 1-D range of getOutputBufferSize() work-items.
+  void operator()(cl::sycl::handler &cgh) {
+    constexpr auto WriteMode = cl::sycl::access::mode::write;
+    auto Out = this->getOutputBuffer().template get_access<WriteMode>(cgh);
+    const cl::sycl::range<1> NumWorkItems{this->getOutputBufferSize()};
+    cgh.parallel_for<KernelFunctor<T>>(
+        NumWorkItems,
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int value = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
+                       : "=rw"(value));
+#else
+          value = 7;
+#endif
+          Out[wiID] = value;
+        });
+  }
+};
+
+// Exits with 0 when launchInlineASMTest() reports false (nothing is verified
+// in that case) or when verify_all_the_same() accepts the output against 7;
+// exits with 1 otherwise.
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  return verify_all_the_same(functor.getOutputBufferData(), 7) ? 0 : 1;
+}
diff --git a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
@@ -0,0 +1,80 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Kernel with a required sub-group size of 8 that computes A * B + C
+// element-wise. With -DINLINE_ASM the device code uses one inline-asm `mad`
+// whose operand numbers appear in the template as %0, %3, %1, %2, i.e. not in
+// the order in which the operands are listed in the constraint lists.
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2,
+                const std::vector<T> &input3)
+      : WithInputBuffers<T, 3>(input1, input2, input3),
+        WithOutputBuffer<T>(input1.size()) {}
+
+  // Command-group function: requests read access to the three inputs, write
+  // access to the output, and enqueues the kernel over the output size.
+  void operator()(cl::sycl::handler &cgh) {
+    constexpr auto ReadMode = cl::sycl::access::mode::read;
+    constexpr auto WriteMode = cl::sycl::access::mode::write;
+    auto InA = this->getInputBuffer(0).template get_access<ReadMode>(cgh);
+    auto InB = this->getInputBuffer(1).template get_access<ReadMode>(cgh);
+    auto InC = this->getInputBuffer(2).template get_access<ReadMode>(cgh);
+    auto Out = this->getOutputBuffer().template get_access<WriteMode>(cgh);
+
+    const cl::sycl::range<1> NumWorkItems{this->getOutputBufferSize()};
+    cgh.parallel_for<KernelFunctor<T>>(
+        NumWorkItems,
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(Out[wiID])
+              : "rw"(InB[wiID]), "rw"(InC[wiID]), "rw"(InA[wiID]));
+#else
+          Out[wiID] = InA[wiID] * InB[wiID] + InC[wiID];
+#endif
+        });
+  }
+};
+
+// Fills A[i] = B[i] = i and C[i] = DEFAULT_PROBLEM_SIZE - i * i, runs the
+// kernel and compares each output element with A * B + C computed on the
+// host. Exits with 1 on the first mismatch (after reporting it on stderr) and
+// with 0 otherwise, including when launchInlineASMTest() reports false.
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE);
+  std::vector<dataType> inputB(DEFAULT_PROBLEM_SIZE);
+  std::vector<dataType> inputC(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = i;
+    inputC[i] = DEFAULT_PROBLEM_SIZE - i * i;
+  }
+
+  KernelFunctor<> functor(inputA, inputB, inputC);
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  auto &result = functor.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    const auto expected = inputA[i] * inputB[i] + inputC[i];
+    if (result[i] == expected)
+      continue;
+
+    std::cerr << "At index: " << i << ". " << result[i] << " != " << expected
+              << "\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/sycl/test/inline-asm/asm_decl_in_scope.cpp b/sycl/test/inline-asm/asm_decl_in_scope.cpp
@@ -0,0 +1,76 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Kernel with a required sub-group size of 16 that computes C = A * B
+// element-wise. With -DINLINE_ASM the device code does this in inline asm
+// that declares a variable named `temp` twice: once inside a `{ ... }` scope
+// and once more after that scope has been closed.
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    // read_write rather than write: the "+rw" asm operand and the `*=` in the
+    // reference path both read C before storing to it.
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::read_write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()},
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          // declaration of temp within and outside the scope
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("{\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              "}\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              : "+rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID];
+          C[wiID] *= B[wiID];
+#endif
+        });
+  }
+};
+
+// Fills A[i] = i and B[i] = 2, runs the kernel and compares each output
+// element with A * B computed on the host. Exits with 1 on the first mismatch
+// and with 0 otherwise, including when launchInlineASMTest() reports false.
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = 2;
+  }
+
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (C[i] != inputA[i] * inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}