Commit 9ad568f

Added block info gathering support to IndexTreeToSCF #20
1 parent 9a1a276 commit 9ad568f

5 files changed, +203 -28 lines changed

first.mlir

+107 -22
@@ -1,30 +1,115 @@
 module {
 func.func @main() {
+%cst = arith.constant 1.000000e+00 : f64
+%cst_0 = arith.constant 0.000000e+00 : f64
+%c10 = arith.constant 10 : index
+%c9 = arith.constant 9 : index
+%c8 = arith.constant 8 : index
+%c7 = arith.constant 7 : index
+%c6 = arith.constant 6 : index
+%c5 = arith.constant 5 : index
+%c1_i32 = arith.constant 1 : i32
+%c0_i32 = arith.constant 0 : i32
+%c3 = arith.constant 3 : index
+%c2 = arith.constant 2 : index
 %c0 = arith.constant 0 : index
 %c1 = arith.constant 1 : index
 %c4 = arith.constant 4 : index
-%0 = "ta.static_index_label"(%c0, %c4, %c1) : (index, index, index) -> !ta.range
-%c0_0 = arith.constant 0 : index
-%c1_1 = arith.constant 1 : index
-%1 = "ta.dynamic_index_label"(%c0_0, %c1_1) : (index, index) -> !ta.range
-%c0_2 = arith.constant 0 : index
-%c1_3 = arith.constant 1 : index
-%2 = "ta.dynamic_index_label"(%c0_2, %c1_3) : (index, index) -> !ta.range
-%3 = "ta.sparse_tensor_decl"(%1, %2) {format = "CSR", temporal_tensor = false} : (!ta.range, !ta.range) -> tensor<?x?xf64>
-%4 = "ta.dense_tensor_decl"(%0, %1) {format = "Dense"} : (!ta.range, !ta.range) -> tensor<4x?xf64>
-%5 = "ta.dense_tensor_decl"(%0, %2) {format = "Dense"} : (!ta.range, !ta.range) -> tensor<4x?xf64>
-"ta.fill"(%4) {value = 1.000000e+00 : f64} : (tensor<4x?xf64>) -> ()
-"ta.fill_from_file"(%3) {filename = "SPARSE_FILE_NAME0", readMode = 1 : i32} : (tensor<?x?xf64>) -> ()
-"ta.fill"(%5) {value = 0.000000e+00 : f64} : (tensor<4x?xf64>) -> ()
-%6 = "it.ComputeRHS"(%4, %3) {allBlocks = [["UNK", "UNK"], ["UNK", "UNK"]], allFormats = [["D", "D"], ["D", "CU"]], allPerms = [[0, 1], [1, 2]]} : (tensor<4x?xf64>, tensor<?x?xf64>) -> tensor<*xf64>
-%7 = "it.ComputeLHS"(%5) {allBlocks = [["UNK", "UNK"]], allFormats = [["D", "D"]], allPerms = [[0, 2]]} : (tensor<4x?xf64>) -> tensor<*xf64>
-%8 = "it.Compute"(%6, %7) {MaskType = "none", comp_worksp_opt = false, semiring = "plusxy_times"} : (tensor<*xf64>, tensor<*xf64>) -> i64
-%9 = "it.Indices"(%8) {indices = [2]} : (i64) -> i64
-%10 = "it.Indices"(%9) {indices = [1]} : (i64) -> i64
-%11 = "it.Indices"(%10) {indices = [0]} : (i64) -> i64
-%12 = "it.itree"(%11) : (i64) -> i64
-"ta.print"(%5) : (tensor<4x?xf64>) -> ()
-"ta.print"(%3) : (tensor<?x?xf64>) -> ()
+%alloc = memref.alloc() : memref<13xindex>
+%cast = memref.cast %alloc : memref<13xindex> to memref<*xindex>
+call @read_input_sizes_2D_f64(%c0_i32, %c0, %c0, %c2, %c0, %cast, %c1_i32) {filename = "SPARSE_FILE_NAME0"} : (i32, index, index, index, index, memref<*xindex>, i32) -> ()
+%0 = memref.load %alloc[%c0] : memref<13xindex>
+%1 = memref.load %alloc[%c1] : memref<13xindex>
+%2 = memref.load %alloc[%c2] : memref<13xindex>
+%3 = memref.load %alloc[%c3] : memref<13xindex>
+%4 = memref.load %alloc[%c4] : memref<13xindex>
+%5 = memref.load %alloc[%c5] : memref<13xindex>
+%6 = memref.load %alloc[%c6] : memref<13xindex>
+%7 = memref.load %alloc[%c7] : memref<13xindex>
+%8 = memref.load %alloc[%c8] : memref<13xindex>
+%9 = memref.load %alloc[%c9] : memref<13xindex>
+%10 = memref.load %alloc[%c10] : memref<13xindex>
+%alloc_1 = memref.alloc(%0) : memref<?xindex>
+scf.for %arg0 = %c0 to %0 step %c1 {
+memref.store %c0, %alloc_1[%arg0] : memref<?xindex>
+}
+%cast_2 = memref.cast %alloc_1 : memref<?xindex> to memref<*xindex>
+%alloc_3 = memref.alloc(%1) : memref<?xindex>
+scf.for %arg0 = %c0 to %1 step %c1 {
+memref.store %c0, %alloc_3[%arg0] : memref<?xindex>
+}
+%cast_4 = memref.cast %alloc_3 : memref<?xindex> to memref<*xindex>
+%alloc_5 = memref.alloc(%2) : memref<?xindex>
+scf.for %arg0 = %c0 to %2 step %c1 {
+memref.store %c0, %alloc_5[%arg0] : memref<?xindex>
+}
+%cast_6 = memref.cast %alloc_5 : memref<?xindex> to memref<*xindex>
+%alloc_7 = memref.alloc(%3) : memref<?xindex>
+scf.for %arg0 = %c0 to %3 step %c1 {
+memref.store %c0, %alloc_7[%arg0] : memref<?xindex>
+}
+%cast_8 = memref.cast %alloc_7 : memref<?xindex> to memref<*xindex>
+%alloc_9 = memref.alloc(%4) : memref<?xindex>
+scf.for %arg0 = %c0 to %4 step %c1 {
+memref.store %c0, %alloc_9[%arg0] : memref<?xindex>
+}
+%cast_10 = memref.cast %alloc_9 : memref<?xindex> to memref<*xindex>
+%alloc_11 = memref.alloc(%5) : memref<?xindex>
+scf.for %arg0 = %c0 to %5 step %c1 {
+memref.store %c0, %alloc_11[%arg0] : memref<?xindex>
+}
+%cast_12 = memref.cast %alloc_11 : memref<?xindex> to memref<*xindex>
+%alloc_13 = memref.alloc(%6) : memref<?xindex>
+scf.for %arg0 = %c0 to %6 step %c1 {
+memref.store %c0, %alloc_13[%arg0] : memref<?xindex>
+}
+%cast_14 = memref.cast %alloc_13 : memref<?xindex> to memref<*xindex>
+%alloc_15 = memref.alloc(%7) : memref<?xindex>
+scf.for %arg0 = %c0 to %7 step %c1 {
+memref.store %c0, %alloc_15[%arg0] : memref<?xindex>
+}
+%cast_16 = memref.cast %alloc_15 : memref<?xindex> to memref<*xindex>
+%alloc_17 = memref.alloc(%8) : memref<?xf64>
+scf.for %arg0 = %c0 to %8 step %c1 {
+memref.store %cst_0, %alloc_17[%arg0] : memref<?xf64>
+}
+%cast_18 = memref.cast %alloc_17 : memref<?xf64> to memref<*xf64>
+call @read_input_2D_f64(%c0_i32, %c0, %c0, %c2, %c0, %cast_2, %cast_4, %cast_6, %cast_8, %cast_10, %cast_12, %cast_14, %cast_16, %cast_18, %c1_i32) {filename = "SPARSE_FILE_NAME0"} : (i32, index, index, index, index, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xf64>, i32) -> ()
+%alloc_19 = memref.alloc(%9) {alignment = 32 : i64} : memref<4x?xf64>
+%alloc_20 = memref.alloc(%10) {alignment = 32 : i64} : memref<4x?xf64>
+linalg.fill ins(%cst : f64) outs(%alloc_19 : memref<4x?xf64>)
+linalg.fill ins(%cst_0 : f64) outs(%alloc_20 : memref<4x?xf64>)
+scf.for %arg0 = %c0 to %c4 step %c1 {
+scf.for %arg1 = %c0 to %9 step %c1 {
+%11 = memref.load %alloc_9[%c0] : memref<?xindex>
+%12 = memref.load %alloc_9[%c1] : memref<?xindex>
+scf.for %arg2 = %11 to %12 step %c1 {
+%13 = memref.load %alloc_11[%arg2] : memref<?xindex>
+%14 = memref.load %alloc_19[%arg0, %arg1] : memref<4x?xf64>
+%15 = memref.load %alloc_17[%arg2] : memref<?xf64>
+%16 = memref.load %alloc_20[%arg0, %13] : memref<4x?xf64>
+%17 = arith.mulf %14, %15 : f64
+%18 = arith.addf %16, %17 : f64
+memref.store %18, %alloc_20[%arg0, %13] : memref<4x?xf64>
+}
+}
+}
+%cast_21 = memref.cast %alloc_20 : memref<4x?xf64> to memref<*xf64>
+call @comet_print_memref_f64(%cast_21) : (memref<*xf64>) -> ()
+call @comet_print_memref_i64(%cast_2) : (memref<*xindex>) -> ()
+call @comet_print_memref_i64(%cast_4) : (memref<*xindex>) -> ()
+call @comet_print_memref_i64(%cast_6) : (memref<*xindex>) -> ()
+call @comet_print_memref_i64(%cast_8) : (memref<*xindex>) -> ()
+call @comet_print_memref_i64(%cast_10) : (memref<*xindex>) -> ()
+call @comet_print_memref_i64(%cast_12) : (memref<*xindex>) -> ()
+call @comet_print_memref_i64(%cast_14) : (memref<*xindex>) -> ()
+call @comet_print_memref_i64(%cast_16) : (memref<*xindex>) -> ()
+call @comet_print_memref_f64(%cast_18) : (memref<*xf64>) -> ()
 return
 }
+func.func private @read_input_2D_f64(i32, index, index, index, index, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xindex>, memref<*xf64>, i32)
+func.func private @read_input_sizes_2D_f64(i32, index, index, index, index, memref<*xindex>, i32)
+func.func private @comet_sort_index(memref<*xindex>, index, index)
+func.func private @comet_print_memref_f64(memref<*xf64>)
+func.func private @comet_print_memref_i64(memref<*xindex>)
 }

first.ta

+1 -1
@@ -5,7 +5,7 @@ def main() {
 IndexLabel [c] = [?];

 #Tensor Declarations
-Tensor<double> B([b, c], {CSR}); #sparse tensor declarations should be before dense tensor declarations
+Tensor<double> B([b, c], {BCSR}); #sparse tensor declarations should be before dense tensor declarations
 Tensor<double> A([a, b], {Dense});
 Tensor<double> C([a, c], {Dense});

include/comet/Dialect/Utils/Utils.h

+7 -1
@@ -211,6 +211,11 @@ namespace mlir
 void getFormatsOfComputeOp(Value computeOp, std::vector<std::vector<std::string>> &opFormats);
 void getRHSFormatsOfComputeOp(Value computeOp, std::vector<std::vector<std::string>> &opFormats);
 void getLHSFormatsOfComputeOp(Value computeOp, std::vector<std::vector<std::string>> &opFormats);
+
+/// TODO(patrick): Do we want to merge these with the getFormat functions above?
+void getBlocksOfComputeOp(Value computeOp, std::vector<std::vector<std::string>> &opFormats);
+void getRHSBlocksOfComputeOp(Value computeOp, std::vector<std::vector<std::string>> &opFormats);
+void getLHSBlocksOfComputeOp(Value computeOp, std::vector<std::vector<std::string>> &opFormats);

 void getFormatsPermsOfComputeOp(Value computeOp,
 std::vector<std::vector<std::string>> &opFormats,
@@ -223,7 +228,8 @@ namespace mlir
 std::vector<Value> &leafs,
 std::vector<Value> &tensors,
 std::vector<unsigned int> &ids,
-std::vector<std::string> &formats);
+std::vector<std::string> &formats,
+std::vector<std::string> &blocks);

 void replaceOperands(Operation *itComputeOp, std::vector<Value> newComputeOps);
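
Note: the matching definitions in Utils.cpp are not part of the hunks shown in this commit view. As a rough sketch only (the body below is an assumption for illustration, not code from this commit), getBlocksOfComputeOp could mirror the existing getFormatsOfComputeOp helpers by reading the allBlocks attribute that first.mlir carries on the it.ComputeRHS / it.ComputeLHS ops:

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include <string>
#include <vector>

// Hypothetical sketch, not the committed implementation: walk the nested
// "allBlocks" ArrayAttr (one inner array per tensor operand, one string per
// dimension, e.g. "UNK") and copy it into a vector of vectors of strings.
void getBlocksOfComputeOp(mlir::Value computeOp,
                          std::vector<std::vector<std::string>> &opBlocks)
{
  // Defining op of the compute value, assumed to carry the allBlocks attribute.
  mlir::Operation *op = computeOp.getDefiningOp();
  auto allBlocksAttr = op->getAttrOfType<mlir::ArrayAttr>("allBlocks");
  if (!allBlocksAttr)
    return; // no block info attached to this compute op
  for (mlir::Attribute perTensor : allBlocksAttr)
  {
    std::vector<std::string> blocks;
    for (mlir::Attribute dim : perTensor.cast<mlir::ArrayAttr>())
      blocks.push_back(dim.cast<mlir::StringAttr>().getValue().str());
    opBlocks.push_back(blocks);
  }
}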

lib/Conversion/IndexTreeToSCF/IndexTreeToSCF.cpp

+36 -3
@@ -1044,6 +1044,7 @@ namespace
 void genForOps(std::vector<Value> &tensors,
 std::vector<unsigned int> &ids,
 std::vector<std::string> &formats,
+std::vector<std::string> &blocks,
 indexTree::IndexTreeOp rootOp,
 OpBuilder &builder,
 OpsTree *opstree,
@@ -2383,6 +2384,7 @@ namespace
 std::vector<std::vector<Value>> &tensors_lhs_Allocs /* output */,
 std::vector<std::vector<Value>> &tensors_rhs_Allocs /* output */,
 std::vector<std::vector<std::string>> &allFormats /*output*/,
+std::vector<std::vector<std::string>> &allBlocks /*output*/,
 std::vector<std::vector<int>> &allPerms /* output */,
 std::vector<std::vector<int>> &allPerms_rhs /* output */,
 std::vector<Value> &main_tensors_all /* output */,
@@ -2452,6 +2454,18 @@ namespace
 }
 comet_debug() << "\n";
 }
+
+getBlocksOfComputeOp(cur_op.getOperation()->getResult(0), allBlocks);
+comet_debug() << " allBlocks: \n";
+for (auto m : allBlocks)
+{
+comet_debug() << " ";
+for (auto n : m)
+{
+comet_debug() << n << " ";
+}
+comet_debug() << "\n";
+}

 comet_debug() << " ";
 comet_vdump(cur_op);
@@ -2510,6 +2524,7 @@ namespace
 int main_tensor_nums,
 std::vector<std::vector<int>> &allPerms,
 std::vector<std::vector<std::string>> &allFormats,
+std::vector<std::vector<std::string>> &allBlocks,
 std::vector<Value> &main_tensors_all,
 std::vector<scf::ForOp> &nested_forops,
 std::vector<Value> &nested_AccessIdx,
@@ -2529,6 +2544,7 @@ namespace
 comet_debug() << " index_loc " << index_loc << "\n";
 comet_debug() << " Perm: " << allPerms[i][j] << "\n";
 comet_debug() << " Format: " << allFormats[i][j] << "\n";
+comet_debug() << " Block: " << allBlocks[i][j] << "\n";
 assert(index_loc < nested_forops.size() &&
 "index_loc < nested_forops.size(), i.e. the index not exist in nested for loop\n");
 allLoopsArg[i].push_back(nested_forops[index_loc].getInductionVar());
@@ -3705,6 +3721,7 @@ namespace
 std::vector<std::vector<Value>> tensors_lhs_Allocs;
 std::vector<std::vector<Value>> tensors_rhs_Allocs;
 std::vector<std::vector<std::string>> allFormats;
+std::vector<std::vector<std::string>> allBlocks;
 std::vector<std::vector<int>> allPerms;
 std::vector<std::vector<int>> allPerms_rhs;
 std::vector<Value> main_tensors_all; /// main_tensors_all has first RHS tensors then LHS tensors
@@ -3714,6 +3731,7 @@ namespace
 tensors_lhs_Allocs /* output */,
 tensors_rhs_Allocs /* output */,
 allFormats /* output */,
+allBlocks /* output */,
 allPerms /* output */,
 allPerms_rhs /* output */,
 main_tensors_all /* output */,
@@ -3743,6 +3761,7 @@ namespace
 main_tensor_nums,
 allPerms,
 allFormats,
+allBlocks,
 main_tensors_all,
 nested_forops,
 nested_AccessIdx,
@@ -3776,6 +3795,7 @@ namespace
 main_tensor_nums,
 allPerms,
 allFormats,
+allBlocks,
 main_tensors_all,
 symbolic_nested_forops,
 symbolic_nested_AccessIdx,
@@ -4390,6 +4410,7 @@ void LowerIndexTreeToSCFPass::doLoweringIndexTreeToSCF(indexTree::IndexTreeOp &r
 std::vector<Value> tensors;
 std::vector<unsigned int> ids;
 std::vector<std::string> formats;
+std::vector<std::string> blocks;

 comet_vdump(cur_op);

@@ -4398,17 +4419,29 @@ void LowerIndexTreeToSCFPass::doLoweringIndexTreeToSCF(indexTree::IndexTreeOp &r
 leafs,
 tensors /* output */,
 ids /* output */,
-formats /* output */);
+formats /* output */,
+blocks /* output */);

 comet_debug() << " indices.size(): " << indices.size() << " tensors.size(): " << tensors.size() << "\n";
 for (unsigned int m = 0; m < tensors.size(); m++)
 {
-comet_debug() << " Formats:" << formats[m] << " " << ids[m] << " ";
+comet_debug() << " Formats:" << formats[m] << " " << ids[m] << " \n";
+comet_debug() << " Blocks:" << blocks[m] << " " << ids[m] << " \n";
 comet_vdump(tensors[m]);
+comet_debug() << "\n";
 }
+comet_debug() << "---------------\n";
+
+//debug
+//for (auto fmt : formats) {
+// std::cout << "FMT: " << fmt << std::endl;
+//}
+//for (auto block : blocks) {
+// std::cout << "BLOCK: " << block << std::endl;
+//}

 comet_debug() << " call genForOps, i = " << i << "\n";
-genForOps(tensors, ids, formats, rootOp, builder, opstree_vec[i], symbolicInfo);
+genForOps(tensors, ids, formats, blocks, rootOp, builder, opstree_vec[i], symbolicInfo);
 {
 comet_pdump(rootOp->getParentOfType<ModuleOp>());
 }
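
For the running first.mlir example every gathered block entry is still the placeholder "UNK", and allBlocks stays parallel to allFormats: the new "Block:" debug line indexes both with the same [i][j]. A small usage illustration (the computeRHS value name is assumed here for the example, not taken from the commit):

std::vector<std::vector<std::string>> allFormats; // gathered per-dimension format strings
std::vector<std::vector<std::string>> allBlocks;  // gathered per-dimension block strings
getRHSFormatsOfComputeOp(computeRHS, allFormats); // e.g. {{"D", "D"}, {"D", "CU"}}
getRHSBlocksOfComputeOp(computeRHS, allBlocks);   // e.g. {{"UNK", "UNK"}, {"UNK", "UNK"}}
// Same shape by construction, so allBlocks[i][j] names the block size of the
// dimension whose storage format is allFormats[i][j].
assert(allFormats.size() == allBlocks.size());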
