Merged

52 commits
1672069
[OPTIMIZER] simplified pipeline pass (#1582)
ptillet Apr 26, 2023
8f47bdc
[OPTIMIZER] Added kWidth attribute to DotOperandEncoding (#1584)
ptillet Apr 27, 2023
e326ff7
[TEST] Fix test cache (#1588)
Jokeren Apr 28, 2023
65fb36e
[BACKEND] Updated slice layout semantics, updated vectorization logic…
zahimoud Apr 28, 2023
ee86404
[FRONTEND][BACKEND] Add the `noinline` annotation for `triton.jit` (#…
Jokeren Apr 28, 2023
4b07251
[FRONTEND] add architecture to hash to avoid invalid image on cubin l…
david-macleod Apr 29, 2023
3aff010
[FRONTEND] Fix calling local variables’ attribute functions in the if…
Jokeren Apr 30, 2023
39e751b
[OPTIMIZER][BACKEND] Enabled elementwise ops (including casts) betwee…
ptillet May 1, 2023
9d5354d
[RUNTIME] Ensure we hold the GIL before calling into CPython API in c…
albanD May 1, 2023
26d80f0
Merge branch `llvm-head` (#1600)
chsigg May 1, 2023
3449a9d
Zahi/slice reduce rebased (#1594)
zahimoud May 2, 2023
33174cc
[OPTIMIZER] Fix crash in loop pipelining. (#1602)
bchetioui May 2, 2023
d196302
[FRONTEND] make torch optional (#1604)
pommedeterresautee May 3, 2023
19e7238
[OPTIMIZER] Clean-up Utility.cpp and fixed bug in RematerializeForwar…
ptillet May 3, 2023
f387a6c
[BACKEND] Fixed up ConvertLayout for slices (#1616)
ptillet May 4, 2023
deb2c71
[FRONTEND] Add `tl.expand_dims` (#1614)
peterbell10 May 4, 2023
e2ae2c6
[BACKEND] Modified store op thread masking (#1605)
zahimoud May 5, 2023
aaeba98
[CI] no longer runs CI job on macos-10.15 (#1624)
ptillet May 5, 2023
fd381e2
[BACKEND] Allow noinline functions to return multiple values of primi…
Jokeren May 5, 2023
9d7980f
[BACKEND] Updated predicate for atomic ops (#1619)
zahimoud May 5, 2023
125d9d1
[TEST] Added convert layout test from/to sliced blocked/mma (#1620)
zahimoud May 6, 2023
7d20a86
[BACKEND] fix typo in Membar class about WAR description and refine s…
lipracer May 6, 2023
d338521
[SETUP] Removing `torch` as a test dependency (#1632)
ptillet May 7, 2023
132fe1b
[DOCS] Fix docstrings for sphinx docs (#1635)
grimoire May 8, 2023
858a2f0
[FRONTEND] Added interpreter mode (#1573)
pommedeterresautee May 8, 2023
319af1f
[CI] Build wheels for musllinux (#1638)
pganssle May 9, 2023
b19b274
[FRONTEND] Fix return op related control flow issues (#1637)
Jokeren May 9, 2023
0cd8f05
[CI] Upload CUDA test artifacts (#1645)
zahimoud May 10, 2023
6b1af5f
[FRONTEND] Add support for scalar conditions in `device_assert` (#1641)
lezcano May 10, 2023
147ec43
[FRONTEND] Hotfix for `contains_return_op` (#1651)
Jokeren May 10, 2023
fb40bf1
[TEST] Fixed and re-enabled reduce test (#1644)
zahimoud May 10, 2023
0daee68
[FRONTEND] Don't call set_device in tl.dot (#1646)
May 11, 2023
115964b
[TESTS] Add regression test for issue #1601. (#1611)
bchetioui May 11, 2023
35b27e1
[BUILD] Move canonicalization patterns of Load/Store to Ops.cpp. (NFC…
ingomueller-net May 11, 2023
674f9bf
[FRONTEND] Better error messages for noinline functions (#1657)
Jokeren May 11, 2023
b2a757d
[BUILD] Add missing CMake link-time dependencies. (#1654)
ingomueller-net May 12, 2023
47af6ba
[BACKEND] Move isSharedEncoding to TritonGPUIR. (#1655)
ingomueller-net May 13, 2023
3249d7a
[FRONTEND] Do not use exceptions to guide control flow in compilation…
cheshire May 13, 2023
9820899
[FRONTEND] Assert that for loop bounds must be ints (#1664)
sophiawisdom May 13, 2023
e5e961f
[OPTIMIZER] Fix-up reduction cloning
ptillet May 16, 2023
0c4de8a
[DEPENDENCIES] Update LLVM to 17.0.0 (c5dede880d17) and port changes.…
ingomueller-net May 16, 2023
95a932e
[OPTIMIZER] adjusted selection heuristics for when `mmaLayout.warpsPe…
ptillet May 16, 2023
323843c
[BUILD] stop depending on dlfcn-win32 by implementing `dladdr` native…
cloudhan May 16, 2023
177b46b
[BUILD] minor fixes (#1676)
chsigg May 16, 2023
3baab48
[FRONTEND] Differentiate between bool and int in the frontend (#1678)
Jokeren May 16, 2023
e5ae37f
[BUILD] Add deduction guide for `Interval` (#1680)
chsigg May 16, 2023
17eb982
[OPS] Remove duplicated function already defined in `triton` module. …
dfukalov May 17, 2023
4c4e42e
Merge remote-tracking branch 'openai/main' into IFU-230517
jayfurmanek May 17, 2023
78c6074
IFU 230517 Resolve merge conflicts
jayfurmanek May 17, 2023
cbb15e1
[ROCM] Fix hardcoded warpsize in getMask
jayfurmanek May 18, 2023
0c95e2c
[ROCM] Fix is_hip() checks
jayfurmanek May 22, 2023
db8e5fc
[ROCM] Fix noinline LIT test
jayfurmanek May 22, 2023
16 changes: 14 additions & 2 deletions .github/workflows/integration-tests.yml
@@ -25,9 +25,9 @@ jobs:
id: set-matrix
run: |
if [ x"${{ github.repository }}" == x"openai/triton" ]; then
echo '::set-output name=matrix::[["self-hosted", "A100"], ["self-hosted", "V100"], ["self-hosted", "gfx908"], "macos-10.15"]'
echo '::set-output name=matrix::[["self-hosted", "A100"], ["self-hosted", "V100"], ["self-hosted", "gfx908"]]'
else
echo '::set-output name=matrix::["ubuntu-latest", "macos-10.15"]'
echo '::set-output name=matrix::["ubuntu-latest"]'
fi

Integration-Tests:
@@ -101,6 +101,18 @@ jobs:
cd python/test/unit
python3 -m pytest

- name: Create artifacts archive
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100')}}
run: |
tar -czvf artifacts.tar.gz ~/.triton/cache

- name: Upload artifacts archive
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100')}}
uses: actions/upload-artifact@v2
with:
name: artifacts
path: artifacts.tar.gz

- name: Run CXX unittests
if: ${{ env.BACKEND != 'ROCM'}}
run: |
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
@@ -30,7 +30,7 @@ jobs:
#export CIBW_MANYLINUX_PYPY_X86_64_IMAGE="quay.io/pypa/manylinux2014_x86_64:latest"
export CIBW_BEFORE_BUILD="pip install cmake;"
export CIBW_SKIP="{cp,pp}35-*"
export CIBW_BUILD="{cp,pp}3*-manylinux_x86_64"
export CIBW_BUILD="{cp,pp}3*-manylinux_x86_64 cp3*-musllinux_x86_64"
python3 -m cibuildwheel python --output-dir wheelhouse


3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

6 changes: 0 additions & 6 deletions CMakeLists.txt
@@ -49,12 +49,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
# Third-party
include_directories(${PYBIND11_INCLUDE_DIR})

if(WIN32)
SET(BUILD_SHARED_LIBS OFF)
find_package(dlfcn-win32 REQUIRED)
set(CMAKE_DL_LIBS dlfcn-win32::dl)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS -fPIC -std=gnu++17 -fvisibility=hidden -fvisibility-inlines-hidden")

if (TRITON_USE_ROCM)
1 change: 1 addition & 0 deletions docs/python-api/triton.language.rst
@@ -34,6 +34,7 @@ Shape Manipulation Ops
:nosignatures:

broadcast_to
expand_dims
reshape
ravel

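The docs entry above registers the newly added `tl.expand_dims` (#1614) among the shape-manipulation ops. As a rough illustration of the shape semantics it exposes — inserting a size-1 axis at a given position — the NumPy equivalent behaves as follows; NumPy is only a host-side stand-in here, not the device op itself:

```python
import numpy as np

# expand_dims inserts a new size-1 axis at the given position.
x = np.arange(8)            # shape (8,)
row = np.expand_dims(x, 0)  # shape (1, 8)
col = np.expand_dims(x, 1)  # shape (8, 1)

# Combined with broadcasting, this is the usual way to build
# 2-D index grids from 1-D ranges: grid[i, j] == j + i * 8.
grid = row + col * 8        # shape (8, 8)
```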
97 changes: 77 additions & 20 deletions include/triton/Analysis/Allocation.h
@@ -1,6 +1,7 @@
#ifndef TRITON_ANALYSIS_ALLOCATION_H
#define TRITON_ANALYSIS_ALLOCATION_H

#include "triton/Analysis/Utility.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
@@ -49,18 +50,25 @@ template <typename T> class Interval {
T End = std::numeric_limits<T>::max();
};

template <class T> Interval(T, T) -> Interval<T>;

class Allocation {
public:
/// A unique identifier for shared memory buffers
using BufferId = size_t;
using BufferIdSetT = DenseSet<BufferId>;
using FuncAllocMapT = CallGraph<Allocation>::FuncDataMapT;

static constexpr BufferId InvalidBufferId =
std::numeric_limits<BufferId>::max();

Allocation() = default;
/// Creates a new Allocation analysis that computes the shared memory
/// information for all associated shared memory values.
Allocation(Operation *operation) : operation(operation) { run(); }
explicit Allocation(Operation *operation) : operation(operation) {}

/// Runs allocation analysis on the given top-level operation.
void run(FuncAllocMapT &funcAllocMap);

/// Returns the operation this analysis was constructed from.
Operation *getOperation() const { return operation; }
@@ -75,6 +83,12 @@ class Allocation {
return bufferSet.at(bufferId).size;
}

/// Returns the allocated interval of the given buffer.
Interval<size_t> getAllocatedInterval(BufferId bufferId) const {
auto &buffer = bufferSet.at(bufferId);
return Interval<size_t>(buffer.offset, buffer.offset + buffer.size);
}

/// Returns the buffer id of the given value.
/// This interface only returns the allocated buffer id.
/// If you want to get all the buffer ids that are associated with the given
@@ -104,26 +118,28 @@
BufferId getBufferId(Operation *operation) const {
if (opScratch.count(operation)) {
return opScratch.lookup(operation)->id;
} else if (opVirtual.count(operation)) {
return opVirtual.lookup(operation)->id;
} else {
return InvalidBufferId;
}
}

/// Returns whether the given buffer is a virtual buffer.
bool isVirtualBuffer(BufferId bufferId) const {
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Virtual;
}

/// Returns the size of total shared memory allocated
size_t getSharedMemorySize() const { return sharedMemorySize; }

bool isIntersected(BufferId lhsId, BufferId rhsId) const {
if (lhsId == InvalidBufferId || rhsId == InvalidBufferId)
return false;
auto lhsBuffer = bufferSet.at(lhsId);
auto rhsBuffer = bufferSet.at(rhsId);
return lhsBuffer.intersects(rhsBuffer);
}

private:
/// A class that represents a shared memory buffer
struct BufferT {
enum class BufferKind { Explicit, Scratch };
/// Explicit: triton_gpu.alloc_tensor
/// Scratch: triton_gpu.convert_layout
/// Virtual: triton.call
enum class BufferKind { Explicit, Scratch, Virtual };

/// MT: thread-safe
inline static std::atomic<BufferId> nextId = 0;
@@ -142,12 +158,6 @@ class Allocation {
BufferT(BufferKind kind, size_t size) : BufferT(kind, size, 0) {}
BufferT(BufferKind kind, size_t size, size_t offset)
: kind(kind), id(nextId++), size(size), offset(offset) {}

bool intersects(const BufferT &other) const {
return Interval<size_t>(offset, offset + size)
.intersects(
Interval<size_t>(other.offset, other.offset + other.size));
}
};

/// Op -> Scratch Buffer
@@ -158,8 +168,6 @@ class Allocation {
using AliasBufferMapT = llvm::MapVector<Value, llvm::SetVector<BufferT *>>;
/// BufferId -> Buffer
using BufferSetT = std::map<BufferId, BufferT>;
/// Runs allocation analysis on the given top-level operation.
void run();

private:
template <BufferT::BufferKind Kind, typename KeyType, typename... Args>
@@ -168,6 +176,8 @@
bufferSet[buffer.id] = std::move(buffer);
if constexpr (Kind == BufferT::BufferKind::Explicit) {
valueBuffer[key] = &bufferSet[buffer.id];
} else if constexpr (Kind == BufferT::BufferKind::Virtual) {
opVirtual[key] = &bufferSet[buffer.id];
} else {
opScratch[key] = &bufferSet[buffer.id];
}
@@ -178,8 +188,9 @@
}

private:
Operation *operation;
Operation *operation = nullptr;
OpScratchMapT opScratch;
OpScratchMapT opVirtual;
ValueBufferMapT valueBuffer;
AliasBufferMapT aliasBuffer;
BufferSetT bufferSet;
@@ -188,7 +199,53 @@
friend class triton::AllocationAnalysis;
};

template <typename T> Interval(T, T) -> Interval<T>;
/// Static analysis that computes the allocation of shared memory buffers
/// of the entire call graph.
/// The allocation is performed in a post-order walk of the call graph.
/// Each call op is treated like convert_layout that allocates a scratch buffer.
/// At each call, we compute the start offset of the scratch buffer and pass it
/// as an argument to the callee.
class ModuleAllocation : public CallGraph<Allocation> {
public:
using FuncOffsetMapT = DenseMap<FunctionOpInterface, Value>;

explicit ModuleAllocation(ModuleOp moduleOp)
: CallGraph<Allocation>(moduleOp) {
walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
// Pre-order edge walk callback
[](CallOpInterface callOp, FunctionOpInterface funcOp) {},
// Post-order node walk callback
[&](FunctionOpInterface funcOp) {
auto [iter, inserted] = funcMap.try_emplace(funcOp, funcOp);
if (inserted)
iter->second.run(funcMap);
});
}

size_t getSharedMemorySize() {
size_t size = 0;
for (auto funcOp : getRoots()) {
auto *alloc = getFuncData(funcOp);
size = std::max(size, alloc->getSharedMemorySize());
}
return size;
}

size_t getSharedMemorySize(FunctionOpInterface funcOp) {
return getFuncData(funcOp)->getSharedMemorySize();
}

void setFunctionSharedMemoryValue(FunctionOpInterface funcOp, Value value) {
sharedMemoryValue[funcOp] = value;
}

Value getFunctionSharedMemoryBase(FunctionOpInterface funcOp) {
return sharedMemoryValue[funcOp];
}

private:
FuncOffsetMapT sharedMemoryValue;
};

} // namespace mlir

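The `ModuleAllocation` doc comment above describes the scheme: the call graph is walked in post-order, and each call op is treated like a scratch-buffer allocation sized to hold the callee's footprint at the caller's current offset. A minimal sketch of that bottom-up sizing, using an invented toy graph and a flat "own size + largest callee" model rather than Triton's actual packing algorithm:

```python
from functools import lru_cache

# Hypothetical call graph: per-function shared-memory footprint for
# its own buffers, plus which functions it calls.
own_size = {"root": 128, "helper": 64, "leaf": 32}
callees = {"root": ["helper"], "helper": ["leaf"], "leaf": []}

@lru_cache(maxsize=None)
def footprint(func):
    # Post-order: size callees first, then reserve a "virtual" region
    # after this function's own buffers large enough for the biggest
    # callee (concurrent calls reuse the same region).
    callee_max = max((footprint(c) for c in callees[func]), default=0)
    return own_size[func] + callee_max

# Module-level shared memory is the max over root functions;
# here there is a single root: 128 + 64 + 32 = 224.
total = footprint("root")
```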
55 changes: 55 additions & 0 deletions include/triton/Analysis/AxisInfo.h
@@ -286,16 +286,71 @@ class AxisInfoAnalysis
AxisInfoAnalysis(DataFlowSolver &solver);
using dataflow::SparseDataFlowAnalysis<
dataflow::Lattice<AxisInfo>>::getLatticeElement;
using FuncAxisInfoMapT = DenseMap<FunctionOpInterface, AxisInfo>;

void visitOperation(Operation *op,
ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
ArrayRef<dataflow::Lattice<AxisInfo> *> results) override;
};

/// Module level axis info analysis based on the call graph, assuming that we
/// do not have recursive functions.
/// Since each function will be called multiple times, we need to
/// calculate the axis info based on the axis info of all the callers.
/// In the future, we can perform optimization using function cloning so that
/// each call site will have unique axis info.
using AxisInfoMapT = DenseMap<Value, AxisInfo>;
class ModuleAxisInfoAnalysis : public CallGraph<AxisInfoMapT> {
public:
explicit ModuleAxisInfoAnalysis(ModuleOp moduleOp)
: CallGraph<AxisInfoMapT>(moduleOp) {
SmallVector<FunctionOpInterface> funcs;
for (auto root : getRoots()) {
walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
// Pre-order edge walk callback
[](CallOpInterface callOp, FunctionOpInterface funcOp) {},
// Post-order node walk callback
[&](FunctionOpInterface funcOp) {
funcs.push_back(funcOp);
funcMap.try_emplace(funcOp, AxisInfoMapT{});
});
}
SetVector<FunctionOpInterface> sortedFuncs(funcs.begin(), funcs.end());
SymbolTableCollection symbolTable;
for (auto funcOp : llvm::reverse(sortedFuncs)) {
initialize(funcOp);
funcOp.walk([&](CallOpInterface callOp) {
auto callee =
dyn_cast<FunctionOpInterface>(callOp.resolveCallable(&symbolTable));
update(callOp, callee);
});
}
}

AxisInfo *getAxisInfo(Value value) {
auto funcOp =
value.getParentRegion()->getParentOfType<FunctionOpInterface>();
auto *axisInfoMap = getFuncData(funcOp);
if (!axisInfoMap) {
return nullptr;
}
auto it = axisInfoMap->find(value);
if (it == axisInfoMap->end()) {
return nullptr;
}
return &(it->second);
}

unsigned getPtrContiguity(Value ptr);

unsigned getPtrAlignment(Value ptr);

unsigned getMaskAlignment(Value mask);

private:
void initialize(FunctionOpInterface funcOp);

void update(CallOpInterface callOp, FunctionOpInterface funcOp);
};

} // namespace mlir
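`ModuleAxisInfoAnalysis` above walks functions in reverse topological order — callers before callees — so a callee's argument info is the meet over every call site that reaches it. A minimal sketch of that meet, using gcd over divisibility as an illustrative stand-in for Triton's richer `AxisInfo` lattice (contiguity, divisibility, constancy); the call-site names and values are invented:

```python
from math import gcd

# Divisibility of the pointer each (hypothetical) caller passes in.
call_site_divisibility = {"kernel_a": 16, "kernel_b": 8}

def callee_divisibility(sites):
    # The callee must assume the meet over all callers; for plain
    # divisibility the meet is the gcd. 0 is the gcd identity.
    result = 0
    for div in sites.values():
        result = gcd(result, div)
    return result

meet = callee_divisibility(call_site_divisibility)  # gcd(16, 8) == 8
```

With function cloning (mentioned as future work in the comment), each call site would keep its own info instead of collapsing to this meet.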