wongalvis14 · wongalvis14 · Jun 29, 2019 · Jul 4, 2019 · Jul 16, 2019 · Jul 17, 2019
diff --git a/modules/packages/LinearAlgebra.chpl b/modules/packages/LinearAlgebra.chpl
@@ -631,15 +631,76 @@ private proc _matmatMult(A: [?Adom] ?eltType, B: [?Bdom] eltType)
   return C;
 }
 
+pragma "no doc"
+/* Returns ``true`` if the domain of ``a`` is mapped by a distribution
+   other than ``DefaultDist``; computed at compile time (``param``). */
+private proc isDistributed(a) param {
+  return !isSubtype(a.domain.dist.type, DefaultDist);
+}
 
 /* Inner product of 2 vectors. */
-proc inner(A: [?Adom], B: [?Bdom]) {
+proc inner(const ref A: [?Adom] ?eltType, const ref B: [?Bdom]) {
   if Adom.rank != 1 || Bdom.rank != 1 then
     compilerError("Rank sizes are not 1");
   if Adom.size != Bdom.size then
     halt("Mismatched size in inner multiplication");
+
+  var result: eltType = 0;
+
+  // Arrays without a distribution are summed with a plain reduction
+  if !isDistributed(A) || !isDistributed(B) {
+    result = + reduce (A*B);
+  }
+  else if Adom != Bdom {
+    // The per-locale path below reads B at A's indices, so both arrays must
+    // share one index set; otherwise pair the elements by position instead.
+    result = + reduce (A*B);
+  }
+  else {
+    // Replaces `+ reduce (A*B)` for improved distributed performance
+
+    var localResults: [Locales.domain] eltType = 0;
+
+    coforall l in Locales do on l {
+      const localDomain = A.localSubdomain();
+      var localResult: eltType = 0;
+
+      if localDomain.stride != 1 || B.localSubdomain() != localDomain {
+        // Contiguous chunking and `B.localAccess` are only valid when this
+        // locale owns the same unit-stride block of both arrays; otherwise
+        // fall back to ordinary (possibly remote) element accesses.
+        localResult = + reduce (A[localDomain] * B[localDomain]);
+      }
+      else {
+        const maxThreads = if dataParTasksPerLocale==0
+                           then here.maxTaskPar else dataParTasksPerLocale;
+        const iterPerThread = divceil(localDomain.size, maxThreads);
+        var threadResults: [0..#maxThreads] eltType = 0;
+
+        // Each task sums one contiguous chunk of this locale's indices
+        coforall tid in 0..#maxThreads {
+          const startid = localDomain.low + tid * iterPerThread;
+          const endid = min(localDomain.high, startid + iterPerThread - 1);
+          var myResult: eltType = 0;
+          for ind in startid..endid {
+            myResult += A.localAccess(ind) * B.localAccess(ind);
+          }
+          threadResults[tid] = myResult;
+        }
 
-  return + reduce(A[..]*B[..]);
+        for tr in threadResults {
+          localResult += tr;
+        }
+      }
+      localResults[here.id] = localResult;
+    }
+
+    for r in localResults {
+      result += r;
+    }
+  }
+
+  return result;
 }
 
 

diff --git a/test/library/packages/LinearAlgebra/performance/dot-perf.chpl b/test/library/packages/LinearAlgebra/performance/dot-perf.chpl
@@ -0,0 +1,89 @@
+/*
+Dense dot product performance testing
+
+Times ``dot`` on a Block-distributed vector and, when ``reference`` is set,
+the equivalent ``+ reduce`` expression. With ``correctness`` set, it only
+reports whether the two results agree (PASSED or FAILED).
+*/
+
+use LinearAlgebra;
+use Time;
+use BlockDist;
+use Math;
+
+config const n=1000000,
+             iters=10,
+             /* Tolerance of the correctness check, scaled by the result */
+             thresh=1.0e-10,
+             /* Also time the plain `+ reduce` expression for comparison */
+             reference=false,
+             /* Omit timing output */
+             correctness=false;
+
+config type eltType = real;
+
+const nbytes = numBytes(eltType);
+
+proc main() {
+  const Space = {1..n};
+  const BlockSpace = Space dmapped Block(boundingBox=Space);
+
+  // Divide by a real so the quotient is not truncated by integer division
+  var BA : [BlockSpace] eltType = [i in BlockSpace] (i / 1000000.0): eltType;
+
+  var t: Timer;
+
+  if !correctness {
+    writeln('============================');
+    writeln('Dense Dot Performance Test');
+    writeln('============================');
+    writeln('iters : ', iters);
+    writeln('n     : ', n);
+    writeln('numLocales     : ', numLocales);
+    writeln('MB    : ', (nbytes*n) / 1.0e6);
+    writeln();
+  }
+
+  var d: eltType;
+
+  // The timer is not cleared between iterations; the mean is printed below
+  for 1..iters {
+    t.start();
+    d = dot(BA, BA);
+    t.stop();
+  }
+
+  if correctness {
+    var d_reduce = + reduce (BA * BA);
+    const diff = abs(d - d_reduce);
+    // The two versions add the terms in different orders, so accept rounding
+    // differences in proportion to the magnitude of the result
+    if diff > thresh * max(1.0, abs(d_reduce)) {
+      writeln("FAILED ", diff);
+    }
+    else {
+      writeln("PASSED");
+    }
+  }
+
+  if !correctness then
+    writeln('LinearAlgebra.dot: ', t.elapsed() / iters);
+  t.clear();
+
+  if reference {
+    for 1..iters {
+      t.start();
+      d = + reduce (BA * BA);
+      t.stop();
+    }
+
+    if !correctness then
+      writeln('reduction: ', t.elapsed() / iters);
+    t.clear();
+  } else {
+    // Still print the `reduction:` line; -1 marks it as not measured
+    if !correctness then
+      writeln('reduction: -1');
+  }
+}
+
diff --git a/test/library/packages/LinearAlgebra/performance/dot-perf.execopts b/test/library/packages/LinearAlgebra/performance/dot-perf.execopts
@@ -0,0 +1 @@
+--n=100000 --iters=1 -nl 4 --correctness=true
diff --git a/test/library/packages/LinearAlgebra/performance/dot-perf.good b/test/library/packages/LinearAlgebra/performance/dot-perf.good
@@ -0,0 +1 @@
+PASSED
diff --git a/test/library/packages/LinearAlgebra/performance/dot-perf.perfexecopts b/test/library/packages/LinearAlgebra/performance/dot-perf.perfexecopts
@@ -0,0 +1,7 @@
+--n=1000000 --iters=100 -nl 8 --reference=true #n1000000-8
+--n=1000000000 --iters=10 -nl 8 --reference=true #n1000000000-8
+--n=100000000000 --iters=2 -nl 8 --reference=true #n100000000000-8
+--n=1000000 --iters=20 -nl 4 --reference=true #n1000000-4
+--n=1000000000 --iters=2 -nl 4 --reference=true #n1000000000-4
+--n=1000000 --iters=20 -nl 2 --reference=true #n1000000-2
+--n=1000000 --iters=10 -nl 1 --reference=true #n1000000-1
diff --git a/test/library/packages/LinearAlgebra/performance/dot-perf.perfkeys b/test/library/packages/LinearAlgebra/performance/dot-perf.perfkeys
@@ -0,0 +1,2 @@
+LinearAlgebra.dot: 
+reduction: