1. Changed CMakeLists.txt to require the user to specify

pthomadakis · pthomadakis · commit c862a838cdd4 · 2024-09-16T12:19:55.000-07:00
the device target compute capability and use this value as the default.
This can still be overridden with the flag --gpu-compute-capability.

2. Added semiring tests for GPU target
3. Changed GPU tests to only run when COMET is compiled with ENABLE_GPU_TARGET=ON
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -125,6 +125,13 @@ option(ENABLE_GPU_TARGET OFF)
 if(${ENABLE_GPU_TARGET})
   set(TRITON_PATH "" CACHE PATH "Path to Triton")
   set(TRITON_BUILD_PATH "${CMAKE_BINARY_DIR}/triton" CACHE INTERNAL "Path to Triton Build")
+  # The value becomes the integer default of --gpu-compute-capability in comet.cpp,
+  # so reject a missing, empty, or non-integer setting at configure time.
+  if(NOT DEFINED CUDA_COMPUTE_CAPABILITY OR NOT "${CUDA_COMPUTE_CAPABILITY}" MATCHES "^[0-9]+$")
+    message(FATAL_ERROR "ENABLE_GPU_TARGET requires an integer CUDA compute capability, e.g. -DCUDA_COMPUTE_CAPABILITY=80")
+  endif()
+  add_compile_definitions(CUDA_COMPUTE_CAPABILITY=${CUDA_COMPUTE_CAPABILITY})
+
   add_subdirectory(${TRITON_PATH} ${TRITON_BUILD_PATH})
   get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
   include_directories("${TRITON_PATH}")
diff --git a/frontends/comet_dsl/comet.cpp b/frontends/comet_dsl/comet.cpp
@@ -180,7 +180,7 @@ static cl::opt<TargetDevice> CodegenTarget("target", cl::init(CPU), cl::desc("Co
 static cl::opt<int> GPUBlockSizeX("gpu-block-x-size", cl::init(32), cl::desc("GPU Block size in X direction"));
 static cl::opt<int> GPUBlockSizeY("gpu-block-y-size", cl::init(8), cl::desc("GPU Block size in Y direction"));
 static cl::opt<int> GPUBlockSizeR("gpu-block-r-size", cl::init(32), cl::desc("GPU Block size in R direction"));
-static cl::opt<int> GPUComputeCapability("gpu-compute-capability", cl::init(80), cl::desc("GPU compute capability"));
+static cl::opt<int> GPUComputeCapability("gpu-compute-capability", cl::init(CUDA_COMPUTE_CAPABILITY), cl::desc("GPU compute capability"));
 static cl::opt<int> GPUNumWarps("gpu-num-warps", cl::init(4), cl::desc("GPU number of warps"));
 static cl::opt<int> GPUThreadsPerWarp("gpu-threads-per-warp", cl::init(32), cl::desc("GPU threads per warp"));
 static cl::opt<int> GPUNumCTAs("gpu-num-ctas", cl::init(1), cl::desc("GPU num CTAs"));
diff --git a/frontends/numpy-scipy/cometpy/MLIRGen/lowering.py b/frontends/numpy-scipy/cometpy/MLIRGen/lowering.py
@@ -716,6 +716,9 @@ def lower_dialect_with_jit(ta_dialect_rep, target: str, out_dims, compile_with_f
         if target.startswith("sm_") or target.startswith("compute_") or target.startswith("lto_"):
             scf_lower_flags += " " + " --convert-to-triton --target=GPU --gpu-compute-capability="+target.split("_")[1]
             mlir_lower_flags += " " + "--target=GPU"
+        elif target == "gpu":
+            scf_lower_flags += " " + " --convert-to-triton --target=GPU"
+            mlir_lower_flags += " " + "--target=GPU"
         else :
             raise "Expected target formats:\
                     cpu, compute_<version>, sm_<version>, lto_<version>"
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_eltwise_add_dense_matrix.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_eltwise_add_dense_matrix.py
@@ -8,7 +8,7 @@ def run_numpy(A,B):
 
 	return C
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A,B):
 	C = A+B 
 
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_eltwise_mult_DensexDense_oDense.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_eltwise_mult_DensexDense_oDense.py
@@ -8,7 +8,7 @@ def run_numpy(A,B):
 
 	return C
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A,B):
 	C = A * B 
 
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_eltwise_subtract_dense_matrix.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_eltwise_subtract_dense_matrix.py
@@ -8,7 +8,7 @@ def run_numpy(A,B):
 
 	return C
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A,B):
 	C = A - B 
 
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_mult_dense_ij-ikj-kj.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_mult_dense_ij-ikj-kj.py
@@ -8,7 +8,7 @@ def run_numpy(A,B):
 
 	return C
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A,B):
 	C = comet.einsum('ikj,kj->ij', A,B)
 
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_mult_dense_matrix.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_mult_dense_matrix.py
@@ -7,7 +7,7 @@ def run_numpy(A,B):
 
 	return C
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A,B):
 	C = comet.einsum('ij,jk->ik', A,B)
 
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_mult_dense_matrix_vector.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_mult_dense_matrix_vector.py
@@ -8,7 +8,7 @@ def run_numpy(A,B):
 
 	return C
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A,B):
 	C = comet.einsum('ij,j->i', A,B)
 
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_sum_dense_matrix.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_sum_dense_matrix.py
@@ -8,7 +8,7 @@ def run_numpy(A):
 
 	return var
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A):
 	var = A.sum()
 
diff --git a/frontends/numpy-scipy/integration_tests/ops/gpu/test_transpose_dense_matrix.py b/frontends/numpy-scipy/integration_tests/ops/gpu/test_transpose_dense_matrix.py
@@ -8,7 +8,7 @@ def run_numpy(A):
 
 	return B
 
-@comet.compile(flags=None, target="sm_70")
+@comet.compile(flags=None, target="gpu")
 def run_comet_with_jit(A):
 	B = A.transpose()
 
diff --git a/integration_test/CMakeLists.txt b/integration_test/CMakeLists.txt
@@ -19,6 +19,14 @@ message(STATUS "Using COMET_UTILITY_LIBRARIES: ${COMET_UTILITY_LIBRARY_DIR}")
 set(COMET_INTEGRATION_TEST_DATA_DIR  ${CMAKE_CURRENT_SOURCE_DIR}/data/)
 message(STATUS "Using COMET_INTEGRATION_TEST_DATA_DIR: ${COMET_INTEGRATION_TEST_DATA_DIR}")
 
+# Mirror ENABLE_GPU_TARGET as True/False; lit.site.cfg.py.in substitutes it into a Python `if`.
+if(ENABLE_GPU_TARGET)
+  set(COMET_ENABLE_GPU True)
+else()
+  set(COMET_ENABLE_GPU False)
+endif()
+message(STATUS "Using COMET_ENABLE_GPU: ${COMET_ENABLE_GPU}")
+
 configure_lit_site_cfg(
   ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
   ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
diff --git a/integration_test/compound_exps/gpu/Dense_chain_mult_matrix.ta b/integration_test/compound_exps/gpu/Dense_chain_mult_matrix.ta
@@ -0,0 +1,32 @@
+# GPU test: select the GPU code generation target explicitly; comet-opt defaults to the CPU target.
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> Dense_chain_mult_matrix.llvm
+# RUN: mlir-cpu-runner Dense_chain_mult_matrix.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
+
+
+def main() {
+	#IndexLabel Declarations
+	IndexLabel [i] = [2];
+	IndexLabel [j] = [2];  
+	IndexLabel [k] = [5];           
+	IndexLabel [l] = [2];           
+
+	#Tensor Declarations
+	Tensor<double> A([i, j], {Dense});	  
+	Tensor<double> B([j, k], {Dense});
+	Tensor<double> C([k, l], {Dense});
+	Tensor<double> D([i, l], {Dense});
+
+	#Tensor Fill Operation
+	A[i, j] = 2.2;
+	B[j, k] = 3.4;
+	C[k, l] = 1.0;
+	D[i, l] = 0.0;
+
+	D[i, l] = A[i, j] * B[j, k] * C[k,l];
+	print(D);
+}
+
+# Print the result for verification.
+# CHECK: data = 
+# CHECK-NEXT: 74.8,74.8,74.8,74.8,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/lit.site.cfg.py.in b/integration_test/lit.site.cfg.py.in
@@ -41,6 +41,9 @@ config.comet_integration_test_data_dir = "@COMET_INTEGRATION_TEST_DATA_DIR@"
 config.comet_shlib_dir = "@LLVM_LIBRARY_OUTPUT_INTDIR@"
 config.timeout = "@COMET_INTEGRATION_TIMEOUT@"
 
+if @COMET_ENABLE_GPU@:
+    config.available_features.add('gpu_target_enabled')
+
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
 try:
diff --git a/integration_test/ops/gpu/eltwise_add_dense_matrix.ta b/integration_test/ops/gpu/eltwise_add_dense_matrix.ta
@@ -1,4 +1,4 @@
-# RUN: comet-opt --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_add_dense_matrix.llvm
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_add_dense_matrix.llvm
 # RUN: mlir-cpu-runner eltwise_add_dense_matrix.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 
@@ -23,4 +23,5 @@ def main() {
 
 # Print the result for verification.
 # CHECK: data = 
-# CHECK-NEXT: 5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,
+# CHECK-NEXT: 5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,5.6,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/ops/gpu/eltwise_mult_DensexDense_oDense.ta b/integration_test/ops/gpu/eltwise_mult_DensexDense_oDense.ta
@@ -1,4 +1,4 @@
-# RUN: comet-opt --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_DensexDense_oDense.llvm
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_DensexDense_oDense.llvm
 # RUN: mlir-cpu-runner eltwise_DensexDense_oDense.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 
@@ -27,4 +27,5 @@ def main() {
 
 # Print the result for verification.
 # CHECK: data = 
-# CHECK-NEXT: 8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,
+# CHECK-NEXT: 8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,8.64,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/ops/gpu/eltwise_subtract_dense_matrix.ta b/integration_test/ops/gpu/eltwise_subtract_dense_matrix.ta
@@ -1,4 +1,4 @@
-# RUN: comet-opt --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_sub_dense_matrix.llvm
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_sub_dense_matrix.llvm
 # RUN: mlir-cpu-runner eltwise_sub_dense_matrix.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 
@@ -23,4 +23,5 @@ def main() {
 
 # Print the result for verification.
 # CHECK: data = 
-# CHECK-NEXT: 1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,
+# CHECK-NEXT: 1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/ops/gpu/mult_dense_ij-ikj-kj.ta b/integration_test/ops/gpu/mult_dense_ij-ikj-kj.ta
@@ -1,7 +1,7 @@
 # This example demostrates that the compiler can generate code for arbitrary tensor operations
 # No assumption that contraction indices should disapper in the output tensor. 
 
-# RUN: comet-opt --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> mult_dense_ij-ikj-kj.llvm
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> mult_dense_ij-ikj-kj.llvm
 # RUN: mlir-cpu-runner mult_dense_ij-ikj-kj.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 def main() {
@@ -27,4 +27,5 @@ def main() {
 
 # Print the result for verification.
 # CHECK: data = 
-# CHECK-NEXT: 21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,
+# CHECK-NEXT: 21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,21.76,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/ops/gpu/mult_dense_matrix.ta b/integration_test/ops/gpu/mult_dense_matrix.ta
@@ -1,4 +1,4 @@
-# RUN: comet-opt  --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm  %s &> mult_dense_matrix.llvm
+# RUN: comet-opt  --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm  %s &> mult_dense_matrix.llvm
 # RUN: mlir-cpu-runner mult_dense_matrix.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 
@@ -24,4 +24,5 @@ def main() {
 
 # Print the result for verification.
 # CHECK: data = 
-# CHECK-NEXT: 29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,
+# CHECK-NEXT: 29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,29.92,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/ops/gpu/mult_dense_matrix_vector.ta b/integration_test/ops/gpu/mult_dense_matrix_vector.ta
@@ -1,4 +1,4 @@
-# RUN: comet-opt --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> mult_dense_matrix_vector.llvm
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> mult_dense_matrix_vector.llvm
 # RUN: mlir-cpu-runner mult_dense_matrix_vector.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 def main() {
@@ -23,4 +23,4 @@ def main() {
 # Print the result for verification.
 # CHECK: data = 
 # CHECK-NEXT: 136.16,136.16,136.16,136.16,136.16,136.16,136.16,136.16,
-
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/ops/gpu/sum_dense_matrix.ta b/integration_test/ops/gpu/sum_dense_matrix.ta
@@ -1,4 +1,4 @@
-# RUN: comet-opt --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> sum_dense_matrix.llvm
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> sum_dense_matrix.llvm
 # RUN: mlir-cpu-runner sum_dense_matrix.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 def main() {
@@ -19,4 +19,5 @@ def main() {
 
 # Print the result for verification.
 # CHECK: data = 
-# CHECK-NEXT: 59.2,
+# CHECK-NEXT: 59.2,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/ops/gpu/transpose_dense_matrix.ta b/integration_test/ops/gpu/transpose_dense_matrix.ta
@@ -1,4 +1,4 @@
-# RUN: comet-opt --target=GPU --gpu-compute-capability=70 --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> transpose_dense_matrix.llvm
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> transpose_dense_matrix.llvm
 # RUN: mlir-cpu-runner transpose_dense_matrix.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
 
 #TODO(gkestor): read dense input from file
@@ -22,4 +22,5 @@ def main() {
 
 # Print the result for verification.
 # CHECK: data =
-# CHECK-NEXT: 3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,
+# CHECK-NEXT: 3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,3.2,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/semiring/gpu/eltwise_monoidMin_DensexDense_oDense.ta b/integration_test/semiring/gpu/eltwise_monoidMin_DensexDense_oDense.ta
@@ -0,0 +1,31 @@
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_monoidMin_DensexDense_oDense.llvm
+# RUN: mlir-cpu-runner eltwise_monoidMin_DensexDense_oDense.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
+
+
+def main() {
+	#IndexLabel Declarations
+	IndexLabel [a] = [4];
+	IndexLabel [b] = [4];
+
+	#Tensor Declarations
+	Tensor<double> A([a, b], {Dense});	  
+	Tensor<double> B([a, b], {Dense});
+	Tensor<double> C([a, b], {Dense});
+
+	#Tensor Fill Operation
+	A[a, b] = 2.7;
+	B[a, b] = 3.2;	
+
+	#If output tensor is dense, it needs to be initialized to 0
+	C[a, b] = 0.0;
+
+	#Elementwise monoid operation
+	C[a, b] = A[a, b] @(min) B[a, b];
+	
+	print(C);
+}
+
+# Print the result for verification.
+# CHECK: data = 
+# CHECK-NEXT: 2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/semiring/gpu/eltwise_monoidMinus_DensexDense_oDense.ta b/integration_test/semiring/gpu/eltwise_monoidMinus_DensexDense_oDense.ta
@@ -0,0 +1,31 @@
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_monoidMinus_DensexDense_oDense.llvm
+# RUN: mlir-cpu-runner eltwise_monoidMinus_DensexDense_oDense.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
+
+
+def main() {
+	#IndexLabel Declarations
+	IndexLabel [a] = [4];
+	IndexLabel [b] = [4];
+
+	#Tensor Declarations
+	Tensor<double> A([a, b], {Dense});	  
+	Tensor<double> B([a, b], {Dense});
+	Tensor<double> C([a, b], {Dense});
+
+	#Tensor Fill Operation
+	A[a, b] = 4.2;
+	B[a, b] = 2.7;	
+
+	#If output tensor is dense, it needs to be initialized to 0
+	C[a, b] = 0.0;
+
+	#Elementwise monoid operation
+	C[a, b] = A[a, b] @(-) B[a, b];
+	
+	print(C);
+}
+
+# Print the result for verification.
+# CHECK: data = 
+# CHECK-NEXT: 1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/semiring/gpu/eltwise_monoidPlus_DensexDense_oDense.ta b/integration_test/semiring/gpu/eltwise_monoidPlus_DensexDense_oDense.ta
@@ -0,0 +1,31 @@
+# RUN: comet-opt --target=GPU --convert-ta-to-it --convert-to-loops --convert-to-triton --convert-to-llvm %s &> eltwise_monoidPlus_DensexDense_oDense.llvm
+# RUN: mlir-cpu-runner eltwise_monoidPlus_DensexDense_oDense.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
+
+
+def main() {
+	#IndexLabel Declarations
+	IndexLabel [a] = [4];
+	IndexLabel [b] = [4];
+
+	#Tensor Declarations
+	Tensor<double> A([a, b], {Dense});	  
+	Tensor<double> B([a, b], {Dense});
+	Tensor<double> C([a, b], {Dense});
+
+	#Tensor Fill Operation
+	A[a, b] = 2.7;
+	B[a, b] = 3.2;	
+
+	#If output tensor is dense, it needs to be initialized to 0
+	C[a, b] = 0.0;
+
+	#Elementwise monoid operation
+	C[a, b] = A[a, b] @(+) B[a, b];
+	
+	print(C);
+}
+
+# Print the result for verification.
+# CHECK: data = 
+# CHECK-NEXT: 5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,5.9,
+# REQUIRES: gpu_target_enabled
diff --git a/integration_test/semiring/gpu/eltwise_monoidTimes_DensexDense_oDense.ta b/integration_test/semiring/gpu/eltwise_monoidTimes_DensexDense_oDense.ta
diff --git a/integration_test/semiring/gpu/mm_SemiringPlusTimes_DensexDense_oDense.ta b/integration_test/semiring/gpu/mm_SemiringPlusTimes_DensexDense_oDense.ta
diff --git a/integration_test/semiring/gpu/mv_SemiringPlusTimes_DensexDense_oDense.ta b/integration_test/semiring/gpu/mv_SemiringPlusTimes_DensexDense_oDense.ta
diff --git a/integration_test/semiring/gpu/try.mlir b/integration_test/semiring/gpu/try.mlir