Openmp parallel iterator improvements #9493

Merged (3 commits into nim-lang:devel, Oct 25, 2018)
Conversation

mratsim (Collaborator) commented Oct 24, 2018

Tagging @jcosborn as I think he is also a heavy user of OpenMP with QEX.

This fixes #9490.

  • The first commit is the actual fix; a minimal sketch of what it enables follows this list.
  • The second one is to use static instead of a custom check in semexprs. I'm not that familiar with the compiler internals so please review that carefully.
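
Roughly, the change allows the following (a minimal sketch distilled from the test case below, assuming the -d:openmp / -fopenmp setup shown there; the schedule(static) clause is only illustrative and not part of this PR):

# 1. The annotation is a static (compile-time) string and may be built from a const expression.
const annotation = "parallel for " & "schedule(static)"
for i in `||`(0, 9, annotation):
  discard

# 2. The annotation no longer needs to contain "parallel for"; a bare
#    worksharing "for" works, e.g. nested inside an omp_parallel block.
for i in `||`(0, 9, "for"):
  discard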

Test case:

# Compile with
# `nim c --stackTraces:off -r -d:openmp omp_reduction.nim`
#
# On macOS, the default Clang doesn't support OpenMP; use
# `nim c --stackTraces:off --cc:gcc --gcc.exe:"/usr/local/bin/gcc-7" --gcc.linkerexe:"/usr/local/bin/gcc-7" -r -d:openmp omp_reduction.nim`

import sequtils

when defined(openmp):
  {.passC: "-fopenmp".}
  {.passL: "-fopenmp".}

  {.pragma: omp, header:"omp.h".}

  proc omp_set_num_threads*(x: cint) {.omp.}
  proc omp_get_num_threads*(): cint {.omp.}
  proc omp_get_max_threads*(): cint {.omp.} # This takes hyperthreading into account
  proc omp_get_thread_num*(): cint {.omp.}

else:
  template omp_set_num_threads*(x: cint) = discard
  template omp_get_num_threads*(): cint = 1
  template omp_get_max_threads*(): cint = 1
  template omp_get_thread_num*(): cint = 0

template omp_parallel*(body: untyped): untyped =
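  # The emitted pragma applies to the C block generated by `block:` below,
  # so the body runs inside `#pragma omp parallel { ... }` (see the generated C).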
  {.emit: "#pragma omp parallel".}
  block:
    body

template omp_critical*(body: untyped): untyped =
  {.emit: "#pragma omp critical".}
  block:
    body

template omp_master*(body: untyped): untyped =
  {.emit: "#pragma omp master".}
  block:
    body

# #########################################
# OMP mangling, workaround for https://github.com/nim-lang/Nim/issues/9365
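# Each exported variable needs a unique, compile-time-known C name so that the
# "reduction(+:sum_<suffix>)" annotation string below can refer to the same
# symbol that `exportc` produces.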

import random
from strutils import toHex

var mangling_rng {.compileTime.} = initRand(0x1337DEADBEEF)
var current_suffix {.compileTime.} = ""

proc omp_suffix(genNew: static bool = false): static string =
  if genNew:
    current_suffix = mangling_rng.rand(high(uint32)).toHex
  result = current_suffix

# #########################################

func reduction_serial(s: seq[int]): int =
  for val in s:
    result += val

proc reduction_reduction_clause(s: seq[int]): int =
  var sum{.exportc: "sum_" & omp_suffix(genNew = true).} = 0
  const omp_annotation = "parallel for reduction(+:sum_" & omp_suffix(genNew = false) & ')'
  # very restrictive in terms of op supported (add, mul, min, max)

  for i in `||`(0, s.len - 1, omp_annotation):
    let si = s[i]
    {.emit: "`sum` += `si`;".}
  return sum


proc reduction_padding(s: seq[int]): int =
  const
    CacheLineSize = 64 # To avoid false sharing, partial_sums must be padded by a CPU cache line size
    Padding = max(1, CacheLineSize div sizeof(int))

  var partial_sums = newSeq[int](omp_get_max_threads() * Padding)
  # For sum, 0 is a neutral element but if we wanted to do "product"
  # the seq must be initialized with ones in case OpenMP scheduled less
  # threads than omp_get_max_threads

  doAssert omp_get_num_threads() == 1 # Outside of a parallel section
  for i in 0||(s.len-1):
    # There is no way to get the number of threads used
    # except in the inner loop that will be repeated `s.len` times
    partial_sums[omp_get_thread_num() * Padding] += s[i]

  for i in 0 ..< omp_get_max_threads(): # We assume all threads were used but it might not be true
    result += partial_sums[i * Padding]

proc reduction_localvar(s: seq[int]): int =

  # var num_threads_used: int

  omp_parallel:
    ### initialization
    # # We have a proper way to get the number of threads used
    # # without setting them repeatedly in the for loop
    # omp_master:
    #   num_threads_used = omp_get_num_threads()

    var local_sum = 0 # Variables declared in a parallel region are automatically private to each thread
                      # http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-data.html#Default

    ### for loop
    for i in `||`(0, s.len-1, "for"):
      local_sum += s[i]

    ### Finalization
    omp_critical: # This will use a mutex and can accept any C code (omp atomic would not work with Nim)
      result += local_sum

let s = toSeq(1..100)
let expected = 100 * (100+1) div 2

echo "Expected: ", expected # 5050
echo "Serial: ", s.reduction_serial # 5050
echo "OMP reduction clause: ", s.reduction_reduction_clause # 5050
echo "OMP padding: ", s.reduction_padding # 5050
echo "OMP localvar: ", s.reduction_localvar # 5050, used to give 20200 on a dual-core with hyper-threading if "omp parallel for" instead of "omp for"

Generated C:

N_LIB_PRIVATE N_NIMCALL(NI, reduction_serial_CHGNrM5OLOUd6oaeqbjTsw)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	result = (NI)0;
	{
		NI val;
		NI i;
		NI L;
		NI T2_;
		val = (NI)0;
		i = ((NI) 0);
		T2_ = (s ? s->Sup.len : 0);
		L = T2_;
		{
			while (1) {
				if (!(i < L)) goto LA4;
				val = s->data[i];
				result += val;
				i += ((NI) 1);
			} LA4: ;
		}
	}
	return result;
}

N_LIB_PRIVATE N_NIMCALL(NI, reduction_reduction_clause_JEfpoSz9aVctQlq3wX1sSqQ)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	NI sum_00000000CB2E7BAE;
	NI i;
	NI T1_;
{	result = (NI)0;
	sum_00000000CB2E7BAE = ((NI) 0);
	T1_ = (s ? s->Sup.len : 0);
	#pragma omp parallel for reduction(+:sum_00000000CB2E7BAE)
for (i = ((NI) 0); i <= (NI)(T1_ - ((NI) 1)); ++i)	{
		NI si;
		si = s->data[i];
		sum_00000000CB2E7BAE += si;
	}
	result = sum_00000000CB2E7BAE;
	goto BeforeRet_;
	}BeforeRet_: ;
	return result;
}

N_LIB_PRIVATE N_NIMCALL(NI, reduction_padding_JEfpoSz9aVctQlq3wX1sSqQ_2)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	tySequence_qwqHTkRvwhrRyENtudHQ7g* partial_sums;
	int T1_;
	NI i;
	NI T7_;
	result = (NI)0;
	T1_ = (int)0;
	T1_ = omp_get_max_threads();
	partial_sums = newSeq_vg7LhJDfsuQYl9cZfOCH1Fw(((NI) ((NI32)(T1_ * ((NI32) 1)))));
	{
		int T4_;
		T4_ = (int)0;
		T4_ = omp_get_num_threads();
		if (!!((T4_ == ((NI32) 1)))) goto LA5_;
		failedAssertImpl_aDmpBTs9cPuXp0Mp9cfiNeyA(((NimStringDesc*) &TM_yYAv1zZ5tQshMpSZd9bY4fA_8));
	}
	LA5_: ;
	T7_ = (s ? s->Sup.len : 0);
	#pragma omp parallel for
for (i = ((NI) 0); i <= (NI)(T7_ - ((NI) 1)); ++i)	{
		int T9_;
		T9_ = (int)0;
		T9_ = omp_get_thread_num();
		partial_sums->data[(NI32)(T9_ * ((NI32) 1))] += s->data[i];
	}
	{
		NI i_2;
		NI colontmp_;
		int T11_;
		NI i_3;
		i_2 = (NI)0;
		colontmp_ = (NI)0;
		T11_ = (int)0;
		T11_ = omp_get_max_threads();
		colontmp_ = ((NI) (T11_));
		i_3 = ((NI) 0);
		{
			while (1) {
				if (!(i_3 < colontmp_)) goto LA13;
				i_2 = i_3;
				result += partial_sums->data[(NI)(i_2 * ((NI) 1))];
				i_3 += ((NI) 1);
			} LA13: ;
		}
	}
	return result;
}

N_LIB_PRIVATE N_NIMCALL(NI, reduction_localvar_JEfpoSz9aVctQlq3wX1sSqQ_3)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	result = (NI)0;
	#pragma omp parallel
	{
		NI local_sum;
		NI i;
		NI T2_;
		local_sum = ((NI) 0);
		T2_ = (s ? s->Sup.len : 0);
		#pragma omp for
for (i = ((NI) 0); i <= (NI)(T2_ - ((NI) 1)); ++i)		{
			local_sum += s->data[i];
		}
		#pragma omp critical
		{
			result += local_sum;
		}
	}
	return result;
}

And the results are all correct:

Expected: 5050
Serial: 5050
OMP reduction clause: 5050
OMP padding: 5050
OMP localvar: 5050

mratsim (Collaborator, Author) commented Oct 24, 2018

Don't pull right away; I need to update the changelog as well.

mratsim (Collaborator, Author) commented Oct 24, 2018

The CI failure is unrelated; the current devel branch doesn't pass tests following commit 963292f.

Araq (Member) commented Oct 25, 2018

But AppVeyor produced a more mysterious error, so please rebase.

mratsim (Collaborator, Author) commented Oct 25, 2018

Rebased and green.

@Araq Araq merged commit 5b97762 into nim-lang:devel Oct 25, 2018
narimiran pushed a commit to narimiran/Nim that referenced this pull request Oct 31, 2018
* More flexibility in OpenMP pragma
* Use static to constrain to compile-time annotation string
* Update changelog with OpenMP change
mratsim added a commit to mratsim/laser that referenced this pull request Nov 4, 2018
Merging this pull request closes issue #9490: OpenMP: allow "#pragma omp" without "parallel for".