Implement {float}::asin_acos(), {float}::asin(), and {float}::acos() (#…

…48) * f64x2::asin_acos, f64x2::asin, f64x2::acos * f32x4::asin_acos, f32x4::asin, f32x4::acos * tests for asin and acos * fix polynomial_4, make test thresholds tighter * impl asin_acos, asin, acos for avx/256bit types
Lokathor · Aug 28, 2020 · 55b870c · 55b870c
1 parent e2c2283
commit 55b870c
Show file tree

Hide file tree

Showing 8 changed files with 1,081 additions and 0 deletions.
diff --git a/src/f32x4_.rs b/src/f32x4_.rs
@@ -448,6 +448,119 @@ impl f32x4 {
     self ^ (signs & Self::from(-0.0))
   }
 
+  #[inline]
+  #[must_use]
+  #[allow(non_upper_case_globals)]
+  pub fn asin_acos(self) -> (Self, Self) {
+    // Based on the Agner Fog "vector class library":
+    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
+    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
+    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
+    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
+    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
+    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
+
+    let xa = self.abs();
+    let big = xa.cmp_ge(f32x4::splat(0.5));
+
+    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
+    let x2 = xa * xa;
+    let x3 = big.blend(x1, x2);
+
+    let xb = x1.sqrt();
+
+    let x4 = big.blend(xb, xa);
+
+    let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+    let z = z.mul_add(x3 * x4, x4);
+
+    let z1 = z + z;
+
+    // acos
+    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
+    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
+    let acos = big.blend(z3, z4);
+
+    // asin
+    let z3 = f32x4::FRAC_PI_2 - z1;
+    let asin = big.blend(z3, z);
+    let asin = asin.flip_signs(self);
+
+    (asin, acos)
+  }
+
+  #[inline]
+  #[must_use]
+  #[allow(non_upper_case_globals)]
+  pub fn asin(self) -> Self {
+    // Based on the Agner Fog "vector class library":
+    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
+    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
+    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
+    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
+    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
+    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
+
+    let xa = self.abs();
+    let big = xa.cmp_ge(f32x4::splat(0.5));
+
+    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
+    let x2 = xa * xa;
+    let x3 = big.blend(x1, x2);
+
+    let xb = x1.sqrt();
+
+    let x4 = big.blend(xb, xa);
+
+    let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+    let z = z.mul_add(x3 * x4, x4);
+
+    let z1 = z + z;
+
+    // asin
+    let z3 = f32x4::FRAC_PI_2 - z1;
+    let asin = big.blend(z3, z);
+    let asin = asin.flip_signs(self);
+
+    asin
+  }
+
+  #[inline]
+  #[must_use]
+  #[allow(non_upper_case_globals)]
+  pub fn acos(self) -> Self {
+    // Based on the Agner Fog "vector class library":
+    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
+    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
+    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
+    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
+    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
+    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
+
+    let xa = self.abs();
+    let big = xa.cmp_ge(f32x4::splat(0.5));
+
+    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
+    let x2 = xa * xa;
+    let x3 = big.blend(x1, x2);
+
+    let xb = x1.sqrt();
+
+    let x4 = big.blend(xb, xa);
+
+    let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+    let z = z.mul_add(x3 * x4, x4);
+
+    let z1 = z + z;
+
+    // acos
+    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
+    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
+    let acos = big.blend(z3, z4);
+
+    acos
+  }
+
   #[inline]
   #[must_use]
   #[allow(non_upper_case_globals)]
@@ -691,3 +804,11 @@ impl f32x4 {
     Self::ln(self) * Self::LOG10_E
   }
 }
+
+#[must_use]
+#[inline]
+fn polynomial_4(x: f32x4, c0: f32x4, c1: f32x4, c2: f32x4, c3: f32x4, c4: f32x4) -> f32x4 {
+  let x2 = x * x;
+  let x4 = x2 * x2;
+  c3.mul_add(x, c2).mul_add(x2, c1.mul_add(x, c0) + c4 * x4)
+}
diff --git a/src/f32x8_.rs b/src/f32x8_.rs
@@ -567,6 +567,119 @@ impl f32x8 {
     self ^ (signs & Self::from(-0.0))
   }
 
+  #[inline]
+  #[must_use]
+  #[allow(non_upper_case_globals)]
+  pub fn asin_acos(self) -> (Self, Self) {
+    // Based on the Agner Fog "vector class library":
+    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
+    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
+    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
+    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
+    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
+    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);
+
+    let xa = self.abs();
+    let big = xa.cmp_ge(f32x8::splat(0.5));
+
+    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
+    let x2 = xa * xa;
+    let x3 = big.blend(x1, x2);
+
+    let xb = x1.sqrt();
+
+    let x4 = big.blend(xb, xa);
+
+    let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+    let z = z.mul_add(x3 * x4, x4);
+
+    let z1 = z + z;
+
+    // acos
+    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
+    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
+    let acos = big.blend(z3, z4);
+
+    // asin
+    let z3 = f32x8::FRAC_PI_2 - z1;
+    let asin = big.blend(z3, z);
+    let asin = asin.flip_signs(self);
+
+    (asin, acos)
+  }
+
+  #[inline]
+  #[must_use]
+  #[allow(non_upper_case_globals)]
+  pub fn asin(self) -> Self {
+    // Based on the Agner Fog "vector class library":
+    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
+    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
+    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
+    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
+    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
+    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);
+
+    let xa = self.abs();
+    let big = xa.cmp_ge(f32x8::splat(0.5));
+
+    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
+    let x2 = xa * xa;
+    let x3 = big.blend(x1, x2);
+
+    let xb = x1.sqrt();
+
+    let x4 = big.blend(xb, xa);
+
+    let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+    let z = z.mul_add(x3 * x4, x4);
+
+    let z1 = z + z;
+
+    // asin
+    let z3 = f32x8::FRAC_PI_2 - z1;
+    let asin = big.blend(z3, z);
+    let asin = asin.flip_signs(self);
+
+    asin
+  }
+
+  #[inline]
+  #[must_use]
+  #[allow(non_upper_case_globals)]
+  pub fn acos(self) -> Self {
+    // Based on the Agner Fog "vector class library":
+    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
+    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
+    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
+    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
+    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
+    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);
+
+    let xa = self.abs();
+    let big = xa.cmp_ge(f32x8::splat(0.5));
+
+    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
+    let x2 = xa * xa;
+    let x3 = big.blend(x1, x2);
+
+    let xb = x1.sqrt();
+
+    let x4 = big.blend(xb, xa);
+
+    let z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+    let z = z.mul_add(x3 * x4, x4);
+
+    let z1 = z + z;
+
+    // acos
+    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
+    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
+    let acos = big.blend(z3, z4);
+
+    acos
+  }
+
   #[inline]
   #[must_use]
   #[allow(non_upper_case_globals)]
@@ -863,3 +976,11 @@ impl Not for f32x8 {
     }
   }
 }
+
+#[must_use]
+#[inline]
+fn polynomial_4(x: f32x8, c0: f32x8, c1: f32x8, c2: f32x8, c3: f32x8, c4: f32x8) -> f32x8 {
+  let x2 = x * x;
+  let x4 = x2 * x2;
+  c3.mul_add(x, c2).mul_add(x2, c1.mul_add(x, c0) + c4 * x4)
+}