Openmp parallel iterator improvements #9493

Merged (3 commits into nim-lang:devel, Oct 25, 2018)
Conversation

mratsim (Collaborator) commented Oct 24, 2018

Tagging @jcosborn as I think he is also a heavy user of OpenMP with QEX.

This fixes #9490.

  • The first commit is the actual fix; a minimal sketch of what it enables follows this list.
  • The second one is to use static instead of a custom check in semexprs. I'm not that familiar with the compiler internals so please review that carefully.
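
Roughly, the change allows the following (a minimal sketch distilled from the test case below, assuming the -d:openmp / -fopenmp setup shown there; the schedule(static) clause is only illustrative and not part of this PR):

# 1. The annotation is a static (compile-time) string and may be built from a const expression.
const annotation = "parallel for " & "schedule(static)"
for i in `||`(0, 9, annotation):
  discard

# 2. The annotation no longer needs to contain "parallel for"; a bare
#    worksharing "for" works, e.g. nested inside an omp_parallel block.
for i in `||`(0, 9, "for"):
  discard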

Test case:

# Compile with
# `nim c --stackTraces:off -r -d:openmp omp_reduction.nim`
#
# On macOS, the default Clang doesn't support OpenMP; use
# `nim c --stackTraces:off --cc:gcc --gcc.exe:"/usr/local/bin/gcc-7" --gcc.linkerexe:"/usr/local/bin/gcc-7" -r -d:openmp omp_reduction.nim`

import sequtils

when defined(openmp):
  {.passC: "-fopenmp".}
  {.passL: "-fopenmp".}

  {.pragma: omp, header:"omp.h".}

  proc omp_set_num_threads*(x: cint) {.omp.}
  proc omp_get_num_threads*(): cint {.omp.}
  proc omp_get_max_threads*(): cint {.omp.} # This takes hyperthreading into account
  proc omp_get_thread_num*(): cint {.omp.}

else:
  template omp_set_num_threads*(x: cint) = discard
  template omp_get_num_threads*(): cint = 1
  template omp_get_max_threads*(): cint = 1
  template omp_get_thread_num*(): cint = 0

template omp_parallel*(body: untyped): untyped =
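  # The emitted pragma applies to the C block generated by `block:` below,
  # so the body runs inside `#pragma omp parallel { ... }` (see the generated C).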
  {.emit: "#pragma omp parallel".}
  block:
    body

template omp_critical*(body: untyped): untyped =
  {.emit: "#pragma omp critical".}
  block:
    body

template omp_master*(body: untyped): untyped =
  {.emit: "#pragma omp master".}
  block:
    body

# #########################################
# OMP mangling, workaround for https://github.com/nim-lang/Nim/issues/9365
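# Each exported variable needs a unique, compile-time-known C name so that the
# "reduction(+:sum_<suffix>)" annotation string below can refer to the same
# symbol that `exportc` produces.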

import random
from strutils import toHex

var mangling_rng {.compileTime.} = initRand(0x1337DEADBEEF)
var current_suffix {.compileTime.} = ""

proc omp_suffix(genNew: static bool = false): static string =
  if genNew:
    current_suffix = mangling_rng.rand(high(uint32)).toHex
  result = current_suffix

# #########################################

func reduction_serial(s: seq[int]): int =
  for val in s:
    result += val

proc reduction_reduction_clause(s: seq[int]): int =
  var sum{.exportc: "sum_" & omp_suffix(genNew = true).} = 0
  const omp_annotation = "parallel for reduction(+:sum_" & omp_suffix(genNew = false) & ')'
  # very restrictive in terms of op supported (add, mul, min, max)

  for i in `||`(0, s.len - 1, omp_annotation):
    let si = s[i]
    {.emit: "`sum` += `si`;".}
  return sum


proc reduction_padding(s: seq[int]): int =
  const
    CacheLineSize = 64 # To avoid false sharing, partial_sums must be padded by a CPU cache line size
    Padding = max(1, CacheLineSize div sizeof(int))

  var partial_sums = newSeq[int](omp_get_max_threads() * Padding)
  # For sum, 0 is a neutral element but if we wanted to do "product"
  # the seq must be initialized with ones in case OpenMP scheduled less
  # threads than omp_get_max_threads

  doAssert omp_get_num_threads() == 1 # Outside of a parallel section
  for i in 0||(s.len-1):
    # There is no way to get the number of threads used
    # except in the inner loop that will be repeated `s.len` times
    partial_sums[omp_get_thread_num() * Padding] += s[i]

  for i in 0 ..< omp_get_max_threads(): # We assume all threads were used but it might not be true
    result += partial_sums[i * Padding]

proc reduction_localvar(s: seq[int]): int =

  # var num_threads_used: int

  omp_parallel:
    ### initialization
    # # We have a proper way to get the number of threads used
    # # without setting them repeatedly in the for loop
    # omp_master:
    #   num_threads_used = omp_get_num_threads()

    var local_sum = 0 # Variables declared in a parallel region are automatically private to each thread
                      # http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-data.html#Default

    ### for loop
    for i in `||`(0, s.len-1, "for"):
      local_sum += s[i]

    ### Finalization
    omp_critical: # This will use a mutex and can accept any C code (omp atomic would not work with Nim)
      result += local_sum

let s = toSeq(1..100)
let expected = 100 * (100+1) div 2

echo "Expected: ", expected # 5050
echo "Serial: ", s.reduction_serial # 5050
echo "OMP reduction clause: ", s.reduction_reduction_clause # 5050
echo "OMP padding: ", s.reduction_padding # 5050
echo "OMP localvar: ", s.reduction_localvar # 5050, used to give 20200 on a dual-core with hyper-threading if "omp parallel for" instead of "omp for"

Generated C:

N_LIB_PRIVATE N_NIMCALL(NI, reduction_serial_CHGNrM5OLOUd6oaeqbjTsw)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	result = (NI)0;
	{
		NI val;
		NI i;
		NI L;
		NI T2_;
		val = (NI)0;
		i = ((NI) 0);
		T2_ = (s ? s->Sup.len : 0);
		L = T2_;
		{
			while (1) {
				if (!(i < L)) goto LA4;
				val = s->data[i];
				result += val;
				i += ((NI) 1);
			} LA4: ;
		}
	}
	return result;
}

N_LIB_PRIVATE N_NIMCALL(NI, reduction_reduction_clause_JEfpoSz9aVctQlq3wX1sSqQ)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	NI sum_00000000CB2E7BAE;
	NI i;
	NI T1_;
{	result = (NI)0;
	sum_00000000CB2E7BAE = ((NI) 0);
	T1_ = (s ? s->Sup.len : 0);
	#pragma omp parallel for reduction(+:sum_00000000CB2E7BAE)
for (i = ((NI) 0); i <= (NI)(T1_ - ((NI) 1)); ++i)	{
		NI si;
		si = s->data[i];
		sum_00000000CB2E7BAE += si;
	}
	result = sum_00000000CB2E7BAE;
	goto BeforeRet_;
	}BeforeRet_: ;
	return result;
}

N_LIB_PRIVATE N_NIMCALL(NI, reduction_padding_JEfpoSz9aVctQlq3wX1sSqQ_2)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	tySequence_qwqHTkRvwhrRyENtudHQ7g* partial_sums;
	int T1_;
	NI i;
	NI T7_;
	result = (NI)0;
	T1_ = (int)0;
	T1_ = omp_get_max_threads();
	partial_sums = newSeq_vg7LhJDfsuQYl9cZfOCH1Fw(((NI) ((NI32)(T1_ * ((NI32) 1)))));
	{
		int T4_;
		T4_ = (int)0;
		T4_ = omp_get_num_threads();
		if (!!((T4_ == ((NI32) 1)))) goto LA5_;
		failedAssertImpl_aDmpBTs9cPuXp0Mp9cfiNeyA(((NimStringDesc*) &TM_yYAv1zZ5tQshMpSZd9bY4fA_8));
	}
	LA5_: ;
	T7_ = (s ? s->Sup.len : 0);
	#pragma omp parallel for
for (i = ((NI) 0); i <= (NI)(T7_ - ((NI) 1)); ++i)	{
		int T9_;
		T9_ = (int)0;
		T9_ = omp_get_thread_num();
		partial_sums->data[(NI32)(T9_ * ((NI32) 1))] += s->data[i];
	}
	{
		NI i_2;
		NI colontmp_;
		int T11_;
		NI i_3;
		i_2 = (NI)0;
		colontmp_ = (NI)0;
		T11_ = (int)0;
		T11_ = omp_get_max_threads();
		colontmp_ = ((NI) (T11_));
		i_3 = ((NI) 0);
		{
			while (1) {
				if (!(i_3 < colontmp_)) goto LA13;
				i_2 = i_3;
				result += partial_sums->data[(NI)(i_2 * ((NI) 1))];
				i_3 += ((NI) 1);
			} LA13: ;
		}
	}
	return result;
}

N_LIB_PRIVATE N_NIMCALL(NI, reduction_localvar_JEfpoSz9aVctQlq3wX1sSqQ_3)(tySequence_qwqHTkRvwhrRyENtudHQ7g* s) {
	NI result;
	result = (NI)0;
	#pragma omp parallel
	{
		NI local_sum;
		NI i;
		NI T2_;
		local_sum = ((NI) 0);
		T2_ = (s ? s->Sup.len : 0);
		#pragma omp for
for (i = ((NI) 0); i <= (NI)(T2_ - ((NI) 1)); ++i)		{
			local_sum += s->data[i];
		}
		#pragma omp critical
		{
			result += local_sum;
		}
	}
	return result;
}

And the results are all correct:

Expected: 5050
Serial: 5050
OMP reduction clause: 5050
OMP padding: 5050
OMP localvar: 5050

mratsim (Collaborator, Author) commented Oct 24, 2018

Don't pull right away; I need to update the changelog as well.

mratsim (Collaborator, Author) commented Oct 24, 2018

The CI failure is unrelated; the current devel branch doesn't pass tests following commit 963292f.

Araq (Member) commented Oct 25, 2018

But AppVeyor produced a more mysterious error, so please rebase.

mratsim (Collaborator, Author) commented Oct 25, 2018

Rebased and green.

@Araq Araq merged commit 5b97762 into nim-lang:devel Oct 25, 2018
narimiran pushed a commit to narimiran/Nim that referenced this pull request Oct 31, 2018
* More flexibility in OpenMP pragma
* Use static to constrain to compile-time annotation string
* Update changelog with OpenMP change
mratsim added a commit to mratsim/laser that referenced this pull request Nov 4, 2018
Merging this pull request closes issue #9490: OpenMP: allow "#pragma omp" without "parallel for".