Skip to content

Commit

Permalink
Parallel chunks do not use for loops anymore
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Nov 3, 2018
1 parent d1ba625 commit dbd483c
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@ import ../laser/openmp
import math, random, sequtils

proc doOp(s: var seq[float32]) {.noInline.}=
var nb_chunks: Natural
omp_parallel_chunks_default(s.len, nb_chunks, chunk_offset, chunk_size):
omp_parallel_chunks_default(s.len, chunk_offset, chunk_size):
# Create a block range for each thread
    # Each thread can work on its own range
for idx in chunk_offset ..< chunk_offset + chunk_size:
Expand Down
55 changes: 22 additions & 33 deletions laser/hpc_kernels/reduction_sum_min_max.nim
Original file line number Diff line number Diff line change
Expand Up @@ -48,41 +48,30 @@ proc sum_kernel*(data: ptr UncheckedArray[float32], len: Natural): float32 =
return sum_sse3(data, len)
return sum_fallback(data, len)
else:
# To prevent false sharing we use a padding solution.
# Once https://github.com/nim-lang/Nim/pull/9493 is merged
# we can use `#pragma omp critical` instead which doesn't allocate
# but uses mutexes/locks. TODO: is allocation or locks more expensive?
# TODO: Fastest between a padded seq, a critical section, OMP atomics or CPU atomics?
let
max_threads = omp_get_max_threads()
omp_condition = OMP_MEMORY_BOUND_GRAIN_SIZE * max_threads < len
sse3 = cpuinfo_has_x86_sse3()

let cache_line_size = cpuinfo_get_l1d_caches().line_size
let padding = int min(cache_line_size div float32.sizeof.uint32, 1)

var
partial_sums = newSeq[float32](omp_get_max_threads() * padding)
nb_chunks: Natural # Actual number of chunks used in the loop
# under a certain threshold we don't parallelize
# due to overhead
# i.e. we have a wasted seq alloc

template fallback(){.dirty.} =
omp_parallel_chunks_default(
len, nb_chunks, chunk_offset, chunk_size):
{.emit: "#pragma omp parallel if (`omp_condition`)".}
block:
let
nb_chunks = omp_get_num_threads()
whole_chunk_size = len div nb_chunks
thread_id = omp_get_thread_num()
`chunk_offset`{.inject.} = whole_chunk_size * thread_id
`chunk_size`{.inject.} = if thread_id < nb_chunks - 1: whole_chunk_size
else: len - chunk_offset
block:
let p_chunk{.restrict.} = cast[ptr UncheckedArray[float32]](
data[chunk_offset].addr
)
partial_sums[omp_get_thread_num() * padding] = sum_fallback(p_chunk, chunk_size)

when defined(i386) or defined(amd_64):
if cpuinfo_has_x86_sse3():
omp_parallel_chunks_default(
len, nb_chunks, chunk_offset, chunk_size):
let p_chunk{.restrict.} = cast[ptr UncheckedArray[float32]](
data[chunk_offset].addr
)
partial_sums[omp_get_thread_num() * padding] = sum_sse3(p_chunk, chunk_size)
else:
fallback()
else:
fallback()
        when defined(i386) or defined(amd64):
let local_sum = if sse3: sum_sse3(p_chunk, chunk_size)
else: sum_fallback(p_chunk, chunk_size)
else:
let local_sum = sum_fallback(p_chunk, chunk_size)

for i in 0 ..< nb_chunks:
result += partial_sums[i * padding]
{.emit: "#pragma omp atomic".}
{.emit: "`result` += `local_sum`;".}
41 changes: 14 additions & 27 deletions laser/openmp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ template omp_parallel_for*(
let
omp_size = length # make sure if length is computed it's only done once
omp_condition {.exportc: "omp_condition_" &
omp_suffix(genNew = false).} = omp_grain_size * omp_get_max_threads() < omp_size
omp_suffix().} = omp_grain_size * omp_get_max_threads() < omp_size

const omp_annotation = (when use_simd:"simd " else: "") &
"if(" & $omp_condition_csym & ")"
Expand Down Expand Up @@ -184,10 +184,9 @@ template omp_parallel_for_default*(
body)

template omp_parallel_chunks*(
length: Natural, nb_chunks: var Natural,
length: Natural,
chunk_offset, chunk_size: untyped,
omp_grain_size: static Natural,
use_simd: static bool,
body: untyped): untyped =
## Create a chunk for each thread. You can use:
## `for index in chunk_offset ..< chunk_size:` or
Expand All @@ -208,37 +207,28 @@ template omp_parallel_chunks*(
## sequences, strings, or reference types.
## Those should be thread-local temporaries.
when not defined(openmp):
nb_chunks = 1
const `chunk_offset`{.inject.} = 0
let `chunk_size`{.inject.} = length
block: body
else:
const # Workaround to expose an unique symbol in C.
omp_condition_csym = "omp_condition_" & omp_suffix(genNew = true)

let
omp_size = length # make sure if length is computed it's only done once
max_threads = omp_get_max_threads()
omp_condition {.exportc: "omp_condition_" &
omp_suffix(genNew = false).} = omp_grain_size * max_threads < omp_size

if omp_condition:
nb_chunks = max_threads
else:
nb_chunks = 1
let whole_chunk_size = omp_size div nb_chunks

const omp_annotation = (when use_simd:"simd " else: "") &
"if(" & $omp_condition_csym & ")"
omp_condition = omp_grain_size * max_threads < omp_size

for chunk_id in `||`(0, nb_chunks - 1, omp_annotation):
let `chunk_offset`{.inject.} = whole_chunk_size * chunk_id
let `chunk_size`{.inject.} = if chunk_id < nb_chunks - 1: whole_chunk_size
{.emit: "#pragma omp parallel if (`omp_condition`)".}
block:
let
nb_chunks = omp_get_num_threads()
whole_chunk_size = omp_size div nb_chunks
thread_id = omp_get_thread_num()
`chunk_offset`{.inject.} = whole_chunk_size * thread_id
`chunk_size`{.inject.} = if thread_id < nb_chunks - 1: whole_chunk_size
                                  else: omp_size - chunk_offset
block: body

template omp_parallel_chunks_default*(
length: Natural, nb_chunks: var Natural,
length: Natural,
chunk_offset, chunk_size: untyped,
body: untyped): untyped =
## This will be renamed omp_parallel_chunks once
Expand All @@ -249,12 +239,10 @@ template omp_parallel_chunks_default*(
## contiguous copy or add operations. It's 1024 and can be changed
## by passing `-d:OMP_MEMORY_BOUND_GRAIN_SIZE=123456` during compilation.
## A value of 1 will always parallelize the loop.
## - simd is used by default
omp_parallel_chunks(
length, nb_chunks,
length,
chunk_offset, chunk_size,
omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,
use_simd = true,
body
)

Expand All @@ -265,8 +253,7 @@ template omp_parallel*(body: untyped): untyped =
## sequences, strings, or reference types.
## Those should be thread-local temporaries.
{.emit: "#pragma omp parallel".}
block:
body
block: body

template omp_critical*(body: untyped): untyped =
{.emit: "#pragma omp critical".}
Expand Down
12 changes: 4 additions & 8 deletions laser/tensor/initialization.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,8 @@ proc deepCopy*[T](dst: var Tensor[T], src: Tensor[T]) =
# We use memcpy, due to SIMD optimizations in memcpy,
# we require higher parallelization thresholds
if src.is_C_contiguous:
var nb_chunks: Natural
omp_parallel_chunks(
size, nb_chunks, chunk_offset, chunk_size,
size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand Down Expand Up @@ -90,9 +89,8 @@ proc copyFrom*[T](dst: var Tensor[T], src: Tensor[T]) =
# we require higher parallelization thresholds
if src.is_C_contiguous:
assert dst.shape == src.shape
var nb_chunks: Natural
omp_parallel_chunks(
src.size, nb_chunks, chunk_offset, chunk_size,
src.size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand All @@ -116,9 +114,8 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr T, len: Natural) =
withCompilerOptimHints()
doAssert dst.size == len, "Tensor size and buffer length should be the same"
let buf{.restrict.} = cast[ptr UncheckedArray[T]](buffer)
var nb_chunks: Natural
omp_parallel_chunks(
len, nb_chunks, chunk_offset, chunk_size,
len, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand Down Expand Up @@ -147,9 +144,8 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
when not T.supportsCopyMem:
t.storage.raw_data.reset()
else:
var nb_chunks: Natural
omp_parallel_chunks(
t.size, nb_chunks, chunk_offset, chunk_size,
t.size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
zeroMem(
Expand Down

0 comments on commit dbd483c

Please sign in to comment.