Skip to content

Commit

Permalink
Parallel chunks do not use for loops anymore
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Nov 3, 2018
1 parent d1ba625 commit dbd483c
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@ import ../laser/openmp
import math, random, sequtils

proc doOp(s: var seq[float32]) {.noInline.}=
var nb_chunks: Natural
omp_parallel_chunks_default(s.len, nb_chunks, chunk_offset, chunk_size):
omp_parallel_chunks_default(s.len, chunk_offset, chunk_size):
# Create a block range for each thread
    # Each thread can work on its own range
for idx in chunk_offset ..< chunk_offset + chunk_size:
Expand Down
55 changes: 22 additions & 33 deletions laser/hpc_kernels/reduction_sum_min_max.nim
Original file line number Diff line number Diff line change
Expand Up @@ -48,41 +48,30 @@ proc sum_kernel*(data: ptr UncheckedArray[float32], len: Natural): float32 =
return sum_sse3(data, len)
return sum_fallback(data, len)
else:
# To prevent false sharing we use a padding solution.
# Once https://github.com/nim-lang/Nim/pull/9493 is merged
# we can use `#pragma omp critical` instead which doesn't allocate
# but uses mutexes/locks. TODO: is allocation or locks more expensive?
# TODO: Fastest between a padded seq, a critical section, OMP atomics or CPU atomics?
let
max_threads = omp_get_max_threads()
omp_condition = OMP_MEMORY_BOUND_GRAIN_SIZE * max_threads < len
sse3 = cpuinfo_has_x86_sse3()

let cache_line_size = cpuinfo_get_l1d_caches().line_size
let padding = int min(cache_line_size div float32.sizeof.uint32, 1)

var
partial_sums = newSeq[float32](omp_get_max_threads() * padding)
nb_chunks: Natural # Actual number of chunks used in the loop
# under a certain threshold we don't parallelize
# due to overhead
# i.e. we have a wasted seq alloc

template fallback(){.dirty.} =
omp_parallel_chunks_default(
len, nb_chunks, chunk_offset, chunk_size):
{.emit: "#pragma omp parallel if (`omp_condition`)".}
block:
let
nb_chunks = omp_get_num_threads()
whole_chunk_size = len div nb_chunks
thread_id = omp_get_thread_num()
`chunk_offset`{.inject.} = whole_chunk_size * thread_id
`chunk_size`{.inject.} = if thread_id < nb_chunks - 1: whole_chunk_size
else: len - chunk_offset
block:
let p_chunk{.restrict.} = cast[ptr UncheckedArray[float32]](
data[chunk_offset].addr
)
partial_sums[omp_get_thread_num() * padding] = sum_fallback(p_chunk, chunk_size)

when defined(i386) or defined(amd_64):
if cpuinfo_has_x86_sse3():
omp_parallel_chunks_default(
len, nb_chunks, chunk_offset, chunk_size):
let p_chunk{.restrict.} = cast[ptr UncheckedArray[float32]](
data[chunk_offset].addr
)
partial_sums[omp_get_thread_num() * padding] = sum_sse3(p_chunk, chunk_size)
else:
fallback()
else:
fallback()
        when defined(i386) or defined(amd64):
let local_sum = if sse3: sum_sse3(p_chunk, chunk_size)
else: sum_fallback(p_chunk, chunk_size)
else:
let local_sum = sum_fallback(p_chunk, chunk_size)

for i in 0 ..< nb_chunks:
result += partial_sums[i * padding]
{.emit: "#pragma omp atomic".}
{.emit: "`result` += `local_sum`;".}
41 changes: 14 additions & 27 deletions laser/openmp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ template omp_parallel_for*(
let
omp_size = length # make sure if length is computed it's only done once
omp_condition {.exportc: "omp_condition_" &
omp_suffix(genNew = false).} = omp_grain_size * omp_get_max_threads() < omp_size
omp_suffix().} = omp_grain_size * omp_get_max_threads() < omp_size

const omp_annotation = (when use_simd:"simd " else: "") &
"if(" & $omp_condition_csym & ")"
Expand Down Expand Up @@ -184,10 +184,9 @@ template omp_parallel_for_default*(
body)

template omp_parallel_chunks*(
length: Natural, nb_chunks: var Natural,
length: Natural,
chunk_offset, chunk_size: untyped,
omp_grain_size: static Natural,
use_simd: static bool,
body: untyped): untyped =
## Create a chunk for each thread. You can use:
## `for index in chunk_offset ..< chunk_size:` or
Expand All @@ -208,37 +207,28 @@ template omp_parallel_chunks*(
## sequences, strings, or reference types.
## Those should be thread-local temporaries.
when not defined(openmp):
nb_chunks = 1
const `chunk_offset`{.inject.} = 0
let `chunk_size`{.inject.} = length
block: body
else:
const # Workaround to expose an unique symbol in C.
omp_condition_csym = "omp_condition_" & omp_suffix(genNew = true)

let
omp_size = length # make sure if length is computed it's only done once
max_threads = omp_get_max_threads()
omp_condition {.exportc: "omp_condition_" &
omp_suffix(genNew = false).} = omp_grain_size * max_threads < omp_size

if omp_condition:
nb_chunks = max_threads
else:
nb_chunks = 1
let whole_chunk_size = omp_size div nb_chunks

const omp_annotation = (when use_simd:"simd " else: "") &
"if(" & $omp_condition_csym & ")"
omp_condition = omp_grain_size * max_threads < omp_size

for chunk_id in `||`(0, nb_chunks - 1, omp_annotation):
let `chunk_offset`{.inject.} = whole_chunk_size * chunk_id
let `chunk_size`{.inject.} = if chunk_id < nb_chunks - 1: whole_chunk_size
{.emit: "#pragma omp parallel if (`omp_condition`)".}
block:
let
nb_chunks = omp_get_num_threads()
whole_chunk_size = omp_size div nb_chunks
thread_id = omp_get_thread_num()
`chunk_offset`{.inject.} = whole_chunk_size * thread_id
`chunk_size`{.inject.} = if thread_id < nb_chunks - 1: whole_chunk_size
                                  else: omp_size - chunk_offset
block: body

template omp_parallel_chunks_default*(
length: Natural, nb_chunks: var Natural,
length: Natural,
chunk_offset, chunk_size: untyped,
body: untyped): untyped =
## This will be renamed omp_parallel_chunks once
Expand All @@ -249,12 +239,10 @@ template omp_parallel_chunks_default*(
## contiguous copy or add operations. It's 1024 and can be changed
## by passing `-d:OMP_MEMORY_BOUND_GRAIN_SIZE=123456` during compilation.
## A value of 1 will always parallelize the loop.
## - simd is used by default
omp_parallel_chunks(
length, nb_chunks,
length,
chunk_offset, chunk_size,
omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,
use_simd = true,
body
)

Expand All @@ -265,8 +253,7 @@ template omp_parallel*(body: untyped): untyped =
## sequences, strings, or reference types.
## Those should be thread-local temporaries.
{.emit: "#pragma omp parallel".}
block:
body
block: body

template omp_critical*(body: untyped): untyped =
{.emit: "#pragma omp critical".}
Expand Down
12 changes: 4 additions & 8 deletions laser/tensor/initialization.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,8 @@ proc deepCopy*[T](dst: var Tensor[T], src: Tensor[T]) =
# We use memcpy, due to SIMD optimizations in memcpy,
# we require higher parallelization thresholds
if src.is_C_contiguous:
var nb_chunks: Natural
omp_parallel_chunks(
size, nb_chunks, chunk_offset, chunk_size,
size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand Down Expand Up @@ -90,9 +89,8 @@ proc copyFrom*[T](dst: var Tensor[T], src: Tensor[T]) =
# we require higher parallelization thresholds
if src.is_C_contiguous:
assert dst.shape == src.shape
var nb_chunks: Natural
omp_parallel_chunks(
src.size, nb_chunks, chunk_offset, chunk_size,
src.size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand All @@ -116,9 +114,8 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr T, len: Natural) =
withCompilerOptimHints()
doAssert dst.size == len, "Tensor size and buffer length should be the same"
let buf{.restrict.} = cast[ptr UncheckedArray[T]](buffer)
var nb_chunks: Natural
omp_parallel_chunks(
len, nb_chunks, chunk_offset, chunk_size,
len, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
copyMem(
Expand Down Expand Up @@ -147,9 +144,8 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
when not T.supportsCopyMem:
t.storage.raw_data.reset()
else:
var nb_chunks: Natural
omp_parallel_chunks(
t.size, nb_chunks, chunk_offset, chunk_size,
t.size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4,
use_simd = false):
zeroMem(
Expand Down

0 comments on commit dbd483c

Please sign in to comment.