Skip to content

Commit 7f70bc6

Browse files
committed
kram - simd - bring over more fast math calls
1 parent 673c026 commit 7f70bc6

File tree

2 files changed

+171
-7
lines changed

2 files changed

+171
-7
lines changed

Diff for: libkram/vectormath/vectormath++.cpp

+152-6
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727
// These days I use a variant of the RTNE/RN version that also preserves NaN payload bits,
2828
// which is slightly more ops but matches hardware conversions exactly for every input, including all NaNs.
2929
//
30-
// TODO: bring over fast inverses (RTS, RTU, etc)
30+
// DONE: bring over fast inverses (RTS, RTU, etc)
31+
// DONE: need translation, rotation, scale
32+
// TODO: need fast post-translation, post-rotation, post-scale
3133
//
3234
// TODO: saturating conversions would be useful to, and prevent overflow
3335
// see the conversion.h code, bit select to clamp values.
@@ -49,13 +51,22 @@
4951
// c02
5052
// c03
5153
//
54+
// col: TRS * TRS * v cameraToWorldTfm * worldToModelTfm * ..
55+
// row: v * SRT * SRT modelToWorldTfm * worldToCameraTfm * ...
56+
5257
// TODO: need natvis and lldb formatting of math classes.
5358

5459
//-----------------
5560

5661

5762
namespace SIMD_NAMESPACE {
5863

64+
// Which cmath had this
65+
inline void sincosf(float angleInRadians, float& s, float& c) {
66+
s = sinf(angleInRadians);
67+
c = cosf(angleInRadians);
68+
}
69+
5970
// These aren't embedded in function, so may have pre-init ordering issues.
6071
// or could add pre-init order to skip using functions.
6172
// Expose these through function calls as const&
@@ -655,9 +666,8 @@ static const float4 kConvertToMatrix = {0.0f,1.0f,2.0f,-2.0f};
655666

656667
quatf::quatf(float3 axis, float angleInRadians)
657668
{
658-
float c = ::cosf(angleInRadians * 0.5f);
659-
float s = ::sinf(angleInRadians * 0.5f);
660-
669+
float s, c;
670+
sincosf(angleInRadians * 0.5f, s, c);
661671
v = float4m(s * axis, c);
662672
}
663673

@@ -763,13 +773,15 @@ inline void quat_bezier_cp_impl(quatf q0, quatf q1, quatf q2,
763773
quatf& a1, quatf& b1)
764774
{
765775
// TODO: find out of these were quat or vec mul?
766-
a1.v = 2.0f * dot(q0.v,q1.v) * q1.v - q0.v; // Double(q0, q1);
776+
// Double(q0, q1);
777+
a1.v = 2.0f * dot(q0.v,q1.v) * q1.v - q0.v;
767778

768779
// Bisect(a1, q2);
769780
a1.v = (a1.v + q2.v);
770781
a1.v = normalize(a1.v);
771782

772-
b1.v = 2.0f * dot(a1.v,q1.v) * q1.v - a1.v; // Double(a1, q1);
783+
// Double(a1, q1);
784+
b1.v = 2.0f * dot(a1.v,q1.v) * q1.v - a1.v;
773785
}
774786

775787

@@ -808,6 +820,140 @@ quatf quat_bezer_lerp(quatf a, quatf b, quatf c, quatf d, float t)
808820
#endif // SIMD_FLOAT
809821

810822

823+
#if SIMD_FLOAT
824+
825+
void transpose_affine(float4x4& m)
826+
{
827+
// avoid copy and one shuffle
828+
float4 tmp3, tmp2, tmp1, tmp0;
829+
830+
// TOOD: use platform shuffles on Neon
831+
// this is using sse2neon
832+
tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44);
833+
tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE);
834+
835+
tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44);
836+
tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE);
837+
838+
m[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
839+
m[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
840+
m[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
841+
842+
// skips m[3] - known 0001
843+
}
844+
845+
float4x4 inverse_tr(const float4x4& mtx)
846+
{
847+
float4x4 inverse(mtx);
848+
inverse[3] = float4_negw(); // will be flipped by matrix mul
849+
transpose_affine(inverse); // handle rotation (R)inv = (R)T
850+
851+
inverse[3] = inverse * (-mtx[3]); // 1 mul, 3 mads
852+
853+
return inverse;
854+
}
855+
856+
// invert a row vector matrix
857+
float4x4 inverse_tru(const float4x4& mtx)
858+
{
859+
bool success = false;
860+
861+
float scaleX = length_squared(mtx[0]);
862+
863+
float4x4 inverse(mtx);
864+
if (scaleX > 1e-6f) {
865+
inverse[3] = float4_negw();
866+
867+
transpose_affine(inverse);
868+
869+
// all rows/columns in orthogonal tfm have same magnitude with uniform scale
870+
float4 invScaleX = float4m(1.0f / scaleX); // inverse squared
871+
872+
// scale the rotation matrix by scaling inverse
873+
inverse[0] *= invScaleX;
874+
inverse[1] *= invScaleX;
875+
inverse[2] *= invScaleX;
876+
877+
// handle the translation
878+
inverse[3] = inverse * (-mtx[3]);
879+
880+
success = true;
881+
}
882+
883+
return inverse;
884+
}
885+
886+
float4x4 inverse_trs(const float4x4& mtx)
887+
{
888+
bool success = false;
889+
890+
float4x4 inverse(mtx);
891+
892+
// TODO: fix handling of failure
893+
// compute the scaling terms (4 dots)
894+
// float3 scale = calcScaleSquaredRowTfm(m);
895+
// if (all(scale > float3(1e-6f)) {
896+
inverse[3] = float4_negw(); // neccessary for simple inverse to hold
897+
898+
transpose_affine(inverse);
899+
900+
// this is cheaper than 3 dot's above, just mul/add
901+
float4 invScale = recip(inverse[0]*inverse[0] +
902+
inverse[1]*inverse[1] +
903+
inverse[2]*inverse[2]);
904+
905+
// scale the rotation matrix by scaling inverse
906+
inverse[0] *= invScale;
907+
inverse[1] *= invScale;
908+
inverse[2] *= invScale;
909+
inverse[3] = inverse * (-mtx[3]);
910+
911+
success = true;
912+
//}
913+
914+
return inverse;
915+
}
916+
917+
// TODO: make ctor to avoid returning large matrix
918+
float4x4 float4x4m(char axis, float angleInRadians)
919+
{
920+
float sinTheta, cosTheta;
921+
sincosf(angleInRadians, sinTheta, cosTheta);
922+
923+
float4x4 m;
924+
m[3] = float4_posw();
925+
926+
switch(axis) {
927+
case 'x':
928+
{
929+
m[0] = float4_posx();
930+
m[1] = float4m(0.0f, cosTheta, sinTheta, 0.0f);
931+
m[2] = float4m(0.0f, -sinTheta, cosTheta, 0.0f);
932+
break;
933+
}
934+
935+
case 'y':
936+
{
937+
m[0] = float4m(cosTheta, 0.0f, -sinTheta, 0.0f);
938+
m[1] = float4_posy();
939+
m[2] = float4m(sinTheta, 0.0f, cosTheta, 0.0f);
940+
break;
941+
}
942+
943+
case 'z':
944+
{
945+
m[0] = float4m( cosTheta, sinTheta, 0.0f, 0.0f);
946+
m[1] = float4m(-sinTheta, cosTheta, 0.0f, 0.0f);
947+
m[2] = float4_posz();
948+
break;
949+
}
950+
}
951+
return m;
952+
}
953+
954+
955+
#endif // SIMD_FLOAT
956+
811957
} // namespace SIMD_NAMESPACE
812958

813959
#endif

Diff for: libkram/vectormath/vectormath++.h

+19-1
Original file line numberDiff line numberDiff line change
@@ -1601,7 +1601,25 @@ quatf quat_bezer_slerp(quatf a, quatf b, quatf c, quatf d, float t);
16011601

16021602
quatf inverse(quatf q);
16031603

1604-
#endif
1604+
#endif // SIMD_FLOAT
1605+
1606+
#if SIMD_FLOAT
1607+
1608+
// in-place affine transose
1609+
void transpose_affine(float4x4& m);
1610+
1611+
// fast inverses for translate, rotate, scale
1612+
float4x4 inverse_tr(const float4x4& mtx);
1613+
float4x4 inverse_tru(const float4x4& mtx);
1614+
float4x4 inverse_trs(const float4x4& mtx);
1615+
1616+
float4x4 float4x4m(char axis, float angleInRadians);
1617+
1618+
SIMD_CALL float4x4 float4x4m(float3 axis, float angleInRadians) {
1619+
return float4x4m(quatf(axis, angleInRadians));
1620+
}
1621+
1622+
#endif // SIMD_FLOAT
16051623

16061624

16071625
using namespace STL_NAMESPACE;

0 commit comments

Comments
 (0)