From 1aaa9646df02769a492965caf41f87fae127305a Mon Sep 17 00:00:00 2001
From: Tim Holy
Date: Sun, 8 Mar 2020 06:00:30 -0500
Subject: [PATCH 1/3] Small tweaks to the devdocs

---
 docs/Project.toml                         |  3 ++
 docs/make.jl                              |  2 +-
 docs/src/devdocs/constructing_loopsets.md | 51 ++++++++++++-----------
 docs/src/devdocs/evaluating_loops.md      |  7 ++++
 docs/src/devdocs/loopset_structure.md     | 11 ++---
 5 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index dfa65cd10..1b9ab1f81 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,2 +1,5 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+
+[compat]
+Documenter = "0.24"
diff --git a/docs/make.jl b/docs/make.jl
index 74a005944..1a22f2bee 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -2,7 +2,7 @@ using Documenter, LoopVectorization
 
 makedocs(;
     modules=[LoopVectorization],
-    format=Documenter.HTML(),
+    format=Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"),
     pages=[
         "Home" => "index.md",
         "Getting Started" => "getting_started.md",
diff --git a/docs/src/devdocs/constructing_loopsets.md b/docs/src/devdocs/constructing_loopsets.md
index b562e0e6e..4c4756fb2 100644
--- a/docs/src/devdocs/constructing_loopsets.md
+++ b/docs/src/devdocs/constructing_loopsets.md
@@ -1,28 +1,6 @@
 # Constructing LoopSets
-When applying the `@avx` macro to a broadcast expression, the LoopSet object is constructed by recursively evaluating [add_broadcast!](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields. The function and involved operations are their relationships are straightforward to infer from the structure of nested broadcasts.
-```julia
-julia> Meta.@lower @. f(g(a,b) + c) / d
-:($(Expr(:thunk, CodeInfo(
-    @ none within `top-level scope'
-1 ─ %1 = Base.broadcasted(g, a, b)
-│   %2 = Base.broadcasted(+, %1, c)
-│   %3 = Base.broadcasted(f, %2)
-│   %4 = Base.broadcasted(/, %3, d)
-│   %5 = Base.materialize(%4)
-└── return %5
-))))
-
-julia> @macroexpand @avx @. f(g(a,b) + c) / d
-quote
-    var"##262" = Base.broadcasted(g, a, b)
-    var"##263" = Base.broadcasted(+, var"##262", c)
-    var"##264" = Base.broadcasted(f, var"##263")
-    var"##265" = Base.broadcasted(/, var"##264", d)
-    var"##266" = LoopVectorization.vmaterialize(var"##265", Val{:Main}())
-end
-```
-These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of the objects provides the information on the associated loop dependencies.
+## Loop expressions
 When applying `@avx` to a loop expression, it creates a `LoopSet` without awareness of type information, and then [condenses the information](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary which is passed as type information to a generated function.
 ```julia
@@ -41,7 +19,8 @@ quote
 end
 end
 ```
-This summary is then [reconstruced](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been tranposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This is why
+When the corresponding method gets compiled for specific types of `A`, `B`, and `C`, the call to the `@generated` function `_avx_!` gets compiled.
+This causes the summary to be [reconstructed](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been transposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This kind of information cannot be extracted from the raw expression, which is why these decisions are made when the method gets compiled for specific types via the `@generated` function `_avx_!`.
+
 The three chief components of the summaries are the definitions of operations, e.g.:
 ```julia
 :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03)
 ```
 and the set of loop bounds:
 ```julia
 (LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K))
 ```
+## Broadcasting
+When applying the `@avx` macro to a broadcast expression, there are no explicit loops, and even the dimensionality of the operation is unknown. Consequently, the `LoopSet` object must be constructed at compile time. The functions and operations involved, and their relationships, are straightforward to infer from the structure of nested broadcasts:
+```julia
+julia> Meta.@lower @. f(g(a,b) + c) / d
+:($(Expr(:thunk, CodeInfo(
+    @ none within `top-level scope'
+1 ─ %1 = Base.broadcasted(g, a, b)
+│   %2 = Base.broadcasted(+, %1, c)
+│   %3 = Base.broadcasted(f, %2)
+│   %4 = Base.broadcasted(/, %3, d)
+│   %5 = Base.materialize(%4)
+└── return %5
+))))
-
+julia> @macroexpand @avx @. f(g(a,b) + c) / d
+quote
+    var"##262" = Base.broadcasted(g, a, b)
+    var"##263" = Base.broadcasted(+, var"##262", c)
+    var"##264" = Base.broadcasted(f, var"##263")
+    var"##265" = Base.broadcasted(/, var"##264", d)
+    var"##266" = LoopVectorization.vmaterialize(var"##265", Val{:Main}())
+end
+```
+These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of the objects provides the information on the associated loop dependencies, but again this information is available only when the method is compiled for specific types. The `@generated` function `vmaterialize` constructs the `LoopSet` by recursively evaluating [add_broadcast!](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields.
diff --git a/docs/src/devdocs/evaluating_loops.md b/docs/src/devdocs/evaluating_loops.md
index fc0f57321..17c91a20c 100644
--- a/docs/src/devdocs/evaluating_loops.md
+++ b/docs/src/devdocs/evaluating_loops.md
@@ -3,3 +3,10 @@
 The heart of the optimizations performed by LoopVectorization is given in the [determinestrategy.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/determinestrategy.jl) file, utilizing instruction costs specified in [costs.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/costs.jl). Essentially, it estimates the cost of different means of evaluating the loops. It iterates through the different possible loop orders, as well as considering which loops to unroll, and which to vectorize. It will consider unrolling 1 or 2 loops (but it could settle on unrolling by a factor of 1, i.e. not unrolling), and vectorizing 1.
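To give a feel for how small this strategy space is for a three-loop nest, here is a rough, hedged sketch in plain Julia (the names are ours; the real enumeration in determinestrategy.jl is more careful about which combinations are worth scoring):
```julia
# Enumerate the 3! = 6 possible orders of the loops (m, n, k); the real
# optimizer scores each order combined with a choice of which loop to
# vectorize and which one or two loops to unroll, keeping the cheapest.
loops = (:m, :n, :k)
orders = [(a, b, c) for a in loops, b in loops, c in loops
                    if a != b && b != c && a != c]
@assert length(orders) == 6
```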
+The cost estimate is based on the costs of individual instructions and the number of times each one needs to be executed for the given strategy. The instruction cost can be broken into several components:
+
+- The `scalar latency` is the minimum delay, in clock cycles, associated with the instruction. Think of it as the delay from turning on the water to when water starts coming out of the hose.
+- The `reciprocal throughput` is similar to the latency, but it measures the number of cycles per operation when many of the same operation are repeated in sequence. Continuing our hose analogy, think of it as the inverse of the flow rate at steady state. It is typically ≤ the `scalar latency`.
+- The `register pressure` measures the register consumption by the operation.
+
+Data on individual instructions for specific architectures can be found on [Agner Fog's website](https://agner.org/optimize/instruction_tables.pdf).
diff --git a/docs/src/devdocs/loopset_structure.md b/docs/src/devdocs/loopset_structure.md
index d83af6ea4..3b198449d 100644
--- a/docs/src/devdocs/loopset_structure.md
+++ b/docs/src/devdocs/loopset_structure.md
@@ -1,7 +1,7 @@
 # LoopSet Structure
 
-The loopsets define loops as a set of operations that depend on one another, and also on loops. Cycles are not allowed, making it a directed acyclic graph. Currently, only single return values are supported.
-Lets use a set of nested loops performing matrix multiplication as an example. We can create a naive `LoopSet` from an expression (naive due to being created without access to any type information):
+The loopsets define loops as a set of operations that depend on one another, and also on loops. Cycles are not allowed, making it a directed acyclic graph.
+Let's use a set of nested loops performing matrix multiplication as an example. We can create a naive `LoopSet` from an expression (naive due to being created without access to any type information):
 ```julia
 julia> using LoopVectorization
 
@@ -50,8 +50,8 @@ julia> LoopVectorization.parents(ans)
  var"##tempload#258" = A[m, k]
  var"##tempload#259" = B[k, n]
  var"##reduction#260" = var"##reductzero#261"
- ```
-References to arrays are represtened with an `ArrayReferenceMeta` data structure:
+```
+References to arrays are represented with an `ArrayReferenceMeta` data structure:
 ```julia
 julia> LoopVectorization.operations(lsAmulB)[3].ref
 LoopVectorization.ArrayReferenceMeta(LoopVectorization.ArrayReference(:A, [:m, :k], Int8[0, 0]), Bool[1, 1], Symbol("##vptr##_A"))
 ```
 It contains the name of the parent array (`:A`), the indices `[:m,:k]`, and a boolean vector (`Bool[1, 1]`) indicating whether these indices are loop iterables. Note that the optimizer assumes arrays are column-major, and thus that it is efficient to read contiguous elements from the first index. In lower-level terms, it means that [high-throughput vmov](https://www.felixcloutier.com/x86/movupd) instructions can be used rather than [low-throughput](https://www.felixcloutier.com/x86/vgatherdpd:vgatherqpd) [gathers](https://www.felixcloutier.com/x86/vgatherqps:vgatherqpd). The story is similar for storing elements. When no axis has unit stride, the first given index will be the dummy `Symbol("##DISCONTIGUOUSSUBARRAY##")`.
-
+
+!!! warning
+    Currently, only single return values are supported (tuple destructuring is not supported in assignments).
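As an illustration of the column-major point above, here is a hedged sketch in plain Julia (the function name is ours, not part of the package):
```julia
# In a column-major matrix, A[i, j] and A[i+1, j] are adjacent in memory.
# With `i` as the inner loop, each load reads the element next to the
# previous one, so packed vector loads apply; looping over `j` innermost
# would jump size(A, 1) elements per step, requiring gathers instead.
function colmajor_sum(A::AbstractMatrix)
    s = zero(eltype(A))
    @inbounds for j in axes(A, 2), i in axes(A, 1)
        s += A[i, j]
    end
    return s
end
```
A transposed or permuted array flips which index is contiguous, which is precisely the kind of fact recovered from the types when the `LoopSet` is reconstructed.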
From 6030e21170042300ccfdf42302d577574da8bda5 Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Sun, 8 Mar 2020 06:41:16 -0500 Subject: [PATCH 2/3] Fix typo --- src/costs.jl | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/costs.jl b/src/costs.jl index 9eca32327..9d9696973 100644 --- a/src/costs.jl +++ b/src/costs.jl @@ -35,21 +35,21 @@ Base.isequal(ins1::Instruction, ins2::Instruction) = (ins1.instr === ins2.instr) const LOOPCONSTANT = Instruction(gensym()) struct InstructionCost - scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprical throughput - scalar_reciprical_throughput::Float64 + scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprocal throughput + scalar_reciprocal_throughput::Float64 scalar_latency::Int register_pressure::Int end InstructionCost(sl::Int, srt::Float64, scaling::Float64 = -3.0) = InstructionCost(scaling, srt, sl, 0) -nocost(c::InstructionCost) = c.scalar_reciprical_throughput == 0.0 +nocost(c::InstructionCost) = c.scalar_reciprocal_throughput == 0.0 flatcost(c::InstructionCost) = c.scaling == -3.0 offsetscaling(c::InstructionCost) = c.scaling == -2.0 linearscaling(c::InstructionCost) = c.scaling == -1.0 function scalar_cost(ic::InstructionCost)#, ::Type{T} = Float64) where {T} - @unpack scalar_reciprical_throughput, scalar_latency, register_pressure = ic - scalar_reciprical_throughput, scalar_latency, register_pressure + @unpack scalar_reciprocal_throughput, scalar_latency, register_pressure = ic + scalar_reciprocal_throughput, scalar_latency, register_pressure end function vector_cost(ic::InstructionCost, Wshift, sizeof_T) srt, sl, srp = scalar_cost(ic) @@ -68,7 +68,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T) else # we assume custom cost, and that latency == recip_throughput scaling = ic.scaling sl, srt = round(Int,scaling), scaling - end + end srt, sl, srp end # instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION) @@ -278,7 +278,7 @@ function reduction_combine_to(x::Float64) x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.") end reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x)) -function reduction_zero(x::Float64) +function reduction_zero(x::Float64) # x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.") x == 1.0 ? :zero : x == 2.0 ? :one : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.") end @@ -373,4 +373,3 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}( typeof(ifelse) => :vifelse, typeof(vifelse) => :vifelse ) - From b47c95f2ef5e9fb25791484301e80759c3fc7586 Mon Sep 17 00:00:00 2001 From: Chris Elrod Date: Sun, 8 Mar 2020 12:54:23 -0400 Subject: [PATCH 3/3] Update Manifest.toml and expand a little on evaluating_loops. 
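As an aside on the preceding `costs.jl` patch, the renamed fields can be exercised directly; a hedged REPL sketch (the cost values are invented for illustration, not real instruction data):
```julia
julia> using LoopVectorization

julia> ic = LoopVectorization.InstructionCost(4, 0.5)  # scalar latency of 4 cycles, reciprocal throughput of 0.5
LoopVectorization.InstructionCost(-3.0, 0.5, 4, 0)

julia> LoopVectorization.scalar_cost(ic)  # (scalar_reciprocal_throughput, scalar_latency, register_pressure)
(0.5, 4, 0)
```
The `-3.0` in the first field is the constructor's default `scaling`, the sentinel meaning the cost does not scale with vector width.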
--- docs/Manifest.toml | 31 ++++++++++++++++++++++------ docs/src/devdocs/evaluating_loops.md | 10 ++++++++- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/docs/Manifest.toml b/docs/Manifest.toml index bf8ff841f..43d5c30ca 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -13,23 +13,33 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[DocStringExtensions]] deps = ["LibGit2", "Markdown", "Pkg", "Test"] -git-tree-sha1 = "1df01539a1c952cef21f2d2d1c092c2bcf0177d7" +git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.6.0" +version = "0.8.1" [[Documenter]] -deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "LibGit2", "Logging", "Markdown", "Pkg", "REPL", "Random", "Test", "Unicode"] -git-tree-sha1 = "a6db1c69925cdc53aafb38caec4446be26e0c617" +deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "d497bcc45bb98a1fbe19445a774cfafeabc6c6df" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.21.0" +version = "0.24.5" [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + [[LibGit2]] +deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -37,8 +47,17 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.12" + [[Pkg]] -deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] diff --git a/docs/src/devdocs/evaluating_loops.md b/docs/src/devdocs/evaluating_loops.md index 17c91a20c..47a3a5d11 100644 --- a/docs/src/devdocs/evaluating_loops.md +++ b/docs/src/devdocs/evaluating_loops.md @@ -9,4 +9,12 @@ The cost estimate is based on the costs of individual instructions and the numbe - The `reciprocal throughput` is similar to the latency, but it measures the number of cycles per operation when many of the same operation are repeated in sequence. Continuing our hose analogy, think of it as the inverse of the flow rate at steady-state. It is typically ≤ the `scalar latency`. - The `register pressure` measures the register consumption by the operation -Data on individual instructions for specific architectures can be found on [Agner Fog's website](https://agner.org/optimize/instruction_tables.pdf). +Data on individual instructions for specific architectures can be found on [Agner Fog's website](https://agner.org/optimize/instruction_tables.pdf). Most of the costs used were those for the Skylake-X architecture. + +Examples of how these come into play: +- Vectorizing a loop will result in each instruction evaluating multiple iterations, but the costs of loads and stores will change based on the memory layouts of the accessed arrays. 
+- Unrolling can help reduce the number of times an operation must be performed, for example by allowing us to reuse a loaded value multiple times rather than reloading it every time it is needed.
+- When there is a reduction, such as performing a sum, there is a dependency chain: each `+` has to wait for the previous `+` to finish executing before it can begin, so execution time is bounded by latency rather than by the minimum of the throughputs of the `+` and load operations. By unrolling the loop, we can create multiple independent dependency chains; the sketch below illustrates this.
+
+
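To make the dependency-chain bullet concrete, here is a hedged plain-Julia sketch (our own function names; scalar code standing in for what the macro does with vector registers):
```julia
# One long dependency chain: each `+=` must wait for the previous one,
# so the loop runs at roughly one addition per `+` latency.
function sum_chain(x)
    s = zero(eltype(x))
    @inbounds for i in eachindex(x)
        s += x[i]
    end
    return s
end

# Unrolled by 4 with independent accumulators: the four chains can be in
# flight at once, so throughput rather than latency becomes the bound.
function sum_unrolled4(x)
    s1 = s2 = s3 = s4 = zero(eltype(x))
    i, n = firstindex(x), lastindex(x)
    @inbounds while i + 3 <= n
        s1 += x[i]; s2 += x[i+1]; s3 += x[i+2]; s4 += x[i+3]
        i += 4
    end
    @inbounds while i <= n  # remainder loop for lengths not divisible by 4
        s1 += x[i]
        i += 1
    end
    return (s1 + s2) + (s3 + s4)
end
```
Reassociating a floating-point reduction this way can change the result in the final bits, which is part of the freedom the macro must be granted to reorder reductions.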