#ifndef __EFFEKSEER_SIMD_BASE_H__
#define __EFFEKSEER_SIMD_BASE_H__

#include <cstdint>
#include <cmath>
#include <cstdlib>
#include <cstring>

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// ARM NEON

#define EFK_SIMD_NEON

#if defined(_M_ARM64) || defined(__aarch64__)
#define EFK_SIMD_NEON_ARM64
#endif

#include <arm_neon.h>

#elif (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__SSE2__)
// x86 SSE2 or later

#define EFK_SIMD_SSE2

#if defined(__AVX2__)
#define EFK_SIMD_AVX2
#endif

#if defined(__AVX__) || defined(EFK_SIMD_AVX2)
#define EFK_SIMD_AVX
#endif

#if defined(__SSE4_2__) || defined(EFK_SIMD_AVX)
#define EFK_SIMD_SSE4_2
#endif

#if defined(__SSE4_1__) || defined(EFK_SIMD_SSE4_2)
#define EFK_SIMD_SSE4_1
#endif

#if defined(__SSSE3__) || defined(EFK_SIMD_SSE4_1)
#define EFK_SIMD_SSSE3
#endif

#if defined(__SSE3__) || defined(EFK_SIMD_SSSE3)
#define EFK_SIMD_SSE3
#endif

#if defined(EFK_SIMD_AVX) || defined(EFK_SIMD_AVX2)
#include <immintrin.h>
#elif defined(EFK_SIMD_SSE4_2)
#include <nmmintrin.h>
#elif defined(EFK_SIMD_SSE4_1)
#include <smmintrin.h>
#elif defined(EFK_SIMD_SSSE3)
#include <tmmintrin.h>
#elif defined(EFK_SIMD_SSE3)
#include <pmmintrin.h>
#elif defined(EFK_SIMD_SSE2)
#include <emmintrin.h>
#endif

#else
// No SIMD support detected: fall back to the generic (scalar) implementation.
#define EFK_SIMD_GEN
#endif

const float DefaultEpsilon = 1e-6f;
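// Illustrative note (not part of the original header): DefaultEpsilon is the
// tolerance used by the NearEqual/IsZero helpers below. Two values within
// 1e-6 of each other compare as "near equal", e.g.
//   Float4::NearEqual(Float4(1.0f), Float4(1.0000005f)); // all lanes true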
#endif // __EFFEKSEER_SIMD_BASE_H__
#ifndef __EFFEKSEER_SIMD_FLOAT4_GEN_H__
#define __EFFEKSEER_SIMD_FLOAT4_GEN_H__

#if defined(EFK_SIMD_GEN)

#include <algorithm>

inline float Sqrt(float x)
{
    return std::sqrt(x);
}

inline float Rsqrt(float x)
{
    return 1.0f / std::sqrt(x);
}

struct Int4;

// SIMD Float4 for the generic (scalar fallback) path.
struct alignas(16) Float4
{
    union {
        float vf[4];
        int32_t vi[4];
        uint32_t vu[4];
    };

    Float4() = default;
    Float4(const Float4& rhs) = default;
    Float4(float x, float y, float z, float w) { vf[0] = x; vf[1] = y; vf[2] = z; vf[3] = w; }
    Float4(float i) { vf[0] = i; vf[1] = i; vf[2] = i; vf[3] = i; }

    float GetX() const { return vf[0]; }
    float GetY() const { return vf[1]; }
    float GetZ() const { return vf[2]; }
    float GetW() const { return vf[3]; }

    void SetX(float o) { vf[0] = o; }
    void SetY(float o) { vf[1] = o; }
    void SetZ(float o) { vf[2] = o; }
    void SetW(float o) { vf[3] = o; }

    template <size_t LANE>
    Float4 Dup() { return Float4(vf[LANE], vf[LANE], vf[LANE], vf[LANE]); }

    Int4 Convert4i() const;

    Float4& operator+=(const Float4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vf[i] += rhs.vf[i];
        }
        return *this;
    }

    Float4& operator-=(const Float4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vf[i] -= rhs.vf[i];
        }
        return *this;
    }

    Float4& operator*=(const Float4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vf[i] *= rhs.vf[i];
        }
        return *this;
    }

    Float4& operator*=(float rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vf[i] *= rhs;
        }
        return *this;
    }

    Float4& operator/=(const Float4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vf[i] /= rhs.vf[i];
        }
        return *this;
    }

    Float4& operator/=(float rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vf[i] /= rhs;
        }
        return *this;
    }
    static Float4 Load2(const void* mem);
    static void Store2(void* mem, const Float4& i);
    static Float4 Load3(const void* mem);
    static void Store3(void* mem, const Float4& i);
    static Float4 Load4(const void* mem);
    static void Store4(void* mem, const Float4& i);

    static Float4 SetZero();
    static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
    static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
    static Float4 Sqrt(const Float4& in);
    static Float4 Rsqrt(const Float4& in);
    static Float4 Abs(const Float4& in);
    static Float4 Min(const Float4& lhs, const Float4& rhs);
    static Float4 Max(const Float4& lhs, const Float4& rhs);
    static Float4 Floor(const Float4& in);
    static Float4 Ceil(const Float4& in);
    static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
    static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);

    template <size_t LANE>
    static Float4 MulLane(const Float4& lhs, const Float4& rhs);
    template <size_t LANE>
    static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
    template <size_t LANE>
    static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
    template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
    static Float4 Swizzle(const Float4& in);

    static Float4 Dot3(const Float4& lhs, const Float4& rhs);
    static Float4 Cross3(const Float4& lhs, const Float4& rhs);

    template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
    static Float4 Mask();
    static uint32_t MoveMask(const Float4& in);
    static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
    static Float4 Equal(const Float4& lhs, const Float4& rhs);
    static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
    static Float4 LessThan(const Float4& lhs, const Float4& rhs);
    static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
    static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
    static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
    static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
    static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
    static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);
};
inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = lhs.vf[i] + rhs.vf[i];
    }
    return ret;
}

inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = lhs.vf[i] - rhs.vf[i];
    }
    return ret;
}

inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = lhs.vf[i] * rhs.vf[i];
    }
    return ret;
}

inline Float4 operator*(const Float4& lhs, float rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = lhs.vf[i] * rhs;
    }
    return ret;
}

inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = lhs.vf[i] / rhs.vf[i];
    }
    return ret;
}

inline Float4 operator/(const Float4& lhs, float rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = lhs.vf[i] / rhs;
    }
    return ret;
}

inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] & rhs.vu[i];
    }
    return ret;
}

inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] | rhs.vu[i];
    }
    return ret;
}

inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] ^ rhs.vu[i];
    }
    return ret;
}

inline bool operator==(const Float4& lhs, const Float4& rhs)
{
    bool ret = true;
    for (size_t i = 0; i < 4; i++)
    {
        ret &= lhs.vf[i] == rhs.vf[i];
    }
    return ret;
}

inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
    bool ret = true;
    for (size_t i = 0; i < 4; i++)
    {
        ret &= lhs.vf[i] == rhs.vf[i];
    }
    return !ret;
}
inline Float4 Float4::Load2(const void* mem)
{
    Float4 ret;
    memcpy(ret.vf, mem, sizeof(float) * 2);
    ret.vf[2] = 0.0f;
    ret.vf[3] = 0.0f;
    return ret;
}

inline void Float4::Store2(void* mem, const Float4& i)
{
    memcpy(mem, i.vf, sizeof(float) * 2);
}

inline Float4 Float4::Load3(const void* mem)
{
    Float4 ret;
    memcpy(ret.vf, mem, sizeof(float) * 3);
    ret.vf[3] = 0.0f;
    return ret;
}

inline void Float4::Store3(void* mem, const Float4& i)
{
    memcpy(mem, i.vf, sizeof(float) * 3);
}

inline Float4 Float4::Load4(const void* mem)
{
    Float4 ret;
    memcpy(ret.vf, mem, sizeof(float) * 4);
    return ret;
}

inline void Float4::Store4(void* mem, const Float4& i)
{
    memcpy(mem, i.vf, sizeof(float) * 4);
}

inline Float4 Float4::SetZero()
{
    Float4 ret;
    ret.vf[0] = 0.0f;
    ret.vf[1] = 0.0f;
    ret.vf[2] = 0.0f;
    ret.vf[3] = 0.0f;
    return ret;
}

inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
    Float4 ret;
    ret.vu[0] = (uint32_t)x;
    ret.vu[1] = (uint32_t)y;
    ret.vu[2] = (uint32_t)z;
    ret.vu[3] = (uint32_t)w;
    return ret;
}

inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
    Float4 ret;
    ret.vu[0] = x;
    ret.vu[1] = y;
    ret.vu[2] = z;
    ret.vu[3] = w;
    return ret;
}
inline Float4 Float4::Sqrt(const Float4& in)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = std::sqrt(in.vf[i]);
    }
    return ret;
}

inline Float4 Float4::Rsqrt(const Float4& in)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = 1.0f / std::sqrt(in.vf[i]);
    }
    return ret;
}

inline Float4 Float4::Abs(const Float4& in)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = std::abs(in.vf[i]);
    }
    return ret;
}

inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = std::fmin(lhs.vf[i], rhs.vf[i]);
    }
    return ret;
}

inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = std::fmax(lhs.vf[i], rhs.vf[i]);
    }
    return ret;
}

inline Float4 Float4::Floor(const Float4& in)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = std::floor(in.vf[i]);
    }
    return ret;
}

inline Float4 Float4::Ceil(const Float4& in)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = std::ceil(in.vf[i]);
    }
    return ret;
}

inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = a.vf[i] + b.vf[i] * c.vf[i];
    }
    return ret;
}

inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vf[i] = a.vf[i] - b.vf[i] * c.vf[i];
    }
    return ret;
}

inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
    Float4 muled = lhs * rhs;
    return Float4{muled.vf[0] + muled.vf[1] + muled.vf[2], 0.0f, 0.0f, 0.0f};
}

inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
    return Float4::Swizzle<1,2,0,3>(lhs) * Float4::Swizzle<2,0,1,3>(rhs) -
           Float4::Swizzle<2,0,1,3>(lhs) * Float4::Swizzle<1,2,0,3>(rhs);
}
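// Usage sketch (illustrative only): Dot3/Cross3 operate on the XYZ lanes and
// ignore W, so they work directly for 3D vector math:
//   Float4 n = Float4::Cross3(Float4(1, 0, 0, 0), Float4(0, 1, 0, 0)); // (0,0,1,0)
//   Float4 d = Float4::Dot3(n, n); // the X lane holds the dot product, 1.0f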
template <size_t LANE>
Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    return lhs * rhs.vf[LANE];
}

template <size_t LANE>
Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    return a + b * c.vf[LANE];
}

template <size_t LANE>
Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    return a - b * c.vf[LANE];
}

template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Float4 Float4::Swizzle(const Float4& in)
{
    static_assert(indexX < 4, "indexX must be less than 4.");
    static_assert(indexY < 4, "indexY must be less than 4.");
    static_assert(indexZ < 4, "indexZ must be less than 4.");
    static_assert(indexW < 4, "indexW must be less than 4.");
    return Float4{in.vf[indexX], in.vf[indexY], in.vf[indexZ], in.vf[indexW]};
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
Float4 Float4::Mask()
{
    static_assert(X <= 1, "X must be 0 or 1.");
    static_assert(Y <= 1, "Y must be 0 or 1.");
    static_assert(Z <= 1, "Z must be 0 or 1.");
    static_assert(W <= 1, "W must be 0 or 1.");
    Float4 ret;
    ret.vu[0] = 0xffffffff * X;
    ret.vu[1] = 0xffffffff * Y;
    ret.vu[2] = 0xffffffff * Z;
    ret.vu[3] = 0xffffffff * W;
    return ret;
}

inline uint32_t Float4::MoveMask(const Float4& in)
{
    return (in.vu[0] & 0x1) | (in.vu[1] & 0x2) | (in.vu[2] & 0x4) | (in.vu[3] & 0x8);
}
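// Illustrative note: MoveMask packs the mask bit of each lane into the low
// four bits of the result (X -> bit 0 ... W -> bit 3). For example, a
// comparison that is true only in the X and Z lanes yields 0b0101 == 0x5.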
inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (mask.vu[i] & sel1.vu[i]) | (~mask.vu[i] & sel2.vu[i]);
    }
    return ret;
}

inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vf[i] == rhs.vf[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vf[i] != rhs.vf[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vf[i] < rhs.vf[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vf[i] <= rhs.vf[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vf[i] > rhs.vf[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vf[i] >= rhs.vf[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (std::abs(lhs.vf[i] - rhs.vf[i]) <= epsilon) ? 0xffffffff : 0;
    }
    return ret;
}

inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
    Float4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (std::abs(in.vf[i]) <= epsilon) ? 0xffffffff : 0;
    }
    return ret;
}

inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
    std::swap(s0.vf[1], s1.vf[0]);
    std::swap(s0.vf[2], s2.vf[0]);
    std::swap(s0.vf[3], s3.vf[0]);
    std::swap(s1.vf[2], s2.vf[1]);
    std::swap(s2.vf[3], s3.vf[2]);
    std::swap(s1.vf[3], s3.vf[1]);
}
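// Usage sketch (illustrative only): Transpose turns four row vectors into four
// column vectors in place, e.g. for switching between array-of-structs and
// struct-of-arrays layouts:
//   Float4 r0, r1, r2, r3;             // rows of a 4x4 matrix
//   Float4::Transpose(r0, r1, r2, r3); // r0 now holds the original X column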
#endif // defined(EFK_SIMD_GEN)

#endif // __EFFEKSEER_SIMD_FLOAT4_GEN_H__
#ifndef __EFFEKSEER_SIMD_FLOAT4_NEON_H__
#define __EFFEKSEER_SIMD_FLOAT4_NEON_H__

#if defined(EFK_SIMD_NEON)

inline float Sqrt(float x)
{
    return std::sqrt(x);
}

inline float Rsqrt(float x)
{
    return 1.0f / std::sqrt(x);
}

struct Int4;

// SIMD Float4 for ARM NEON.
struct alignas(16) Float4
{
    float32x4_t s;

    Float4() = default;
    Float4(const Float4& rhs) = default;
    Float4(float32x4_t rhs) { s = rhs; }
    Float4(uint32x4_t rhs) { s = vreinterpretq_f32_u32(rhs); }
    Float4(float x, float y, float z, float w) { const float f[4] = {x, y, z, w}; s = vld1q_f32(f); }
    Float4(float i) { s = vdupq_n_f32(i); }

    float GetX() const { return vgetq_lane_f32(s, 0); }
    float GetY() const { return vgetq_lane_f32(s, 1); }
    float GetZ() const { return vgetq_lane_f32(s, 2); }
    float GetW() const { return vgetq_lane_f32(s, 3); }

    void SetX(float i) { s = vsetq_lane_f32(i, s, 0); }
    void SetY(float i) { s = vsetq_lane_f32(i, s, 1); }
    void SetZ(float i) { s = vsetq_lane_f32(i, s, 2); }
    void SetW(float i) { s = vsetq_lane_f32(i, s, 3); }

    template <size_t LANE>
    Float4 Dup();

    Int4 Convert4i() const;

    Float4& operator+=(const Float4& rhs);
    Float4& operator-=(const Float4& rhs);
    Float4& operator*=(const Float4& rhs);
    Float4& operator*=(float rhs);
    Float4& operator/=(const Float4& rhs);
    Float4& operator/=(float rhs);

    static Float4 Load2(const void* mem);
    static void Store2(void* mem, const Float4& i);
    static Float4 Load3(const void* mem);
    static void Store3(void* mem, const Float4& i);
    static Float4 Load4(const void* mem);
    static void Store4(void* mem, const Float4& i);

    static Float4 SetZero();
    static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
    static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
    static Float4 Sqrt(const Float4& in);
    static Float4 Rsqrt(const Float4& in);
    static Float4 Abs(const Float4& in);
    static Float4 Min(const Float4& lhs, const Float4& rhs);
    static Float4 Max(const Float4& lhs, const Float4& rhs);
    static Float4 Floor(const Float4& in);
    static Float4 Ceil(const Float4& in);
    static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
    static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);

    template <size_t LANE>
    static Float4 MulLane(const Float4& lhs, const Float4& rhs);
    template <size_t LANE>
    static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
    template <size_t LANE>
    static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
    template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
    static Float4 Swizzle(const Float4& v);

    static Float4 Dot3(const Float4& lhs, const Float4& rhs);
    static Float4 Cross3(const Float4& lhs, const Float4& rhs);

    template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
    static Float4 Mask();
    static uint32_t MoveMask(const Float4& in);
    static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
    static Float4 Equal(const Float4& lhs, const Float4& rhs);
    static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
    static Float4 LessThan(const Float4& lhs, const Float4& rhs);
    static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
    static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
    static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
    static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
    static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
    static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);

    static Float4 SwizzleYZX(const Float4& in);
    static Float4 SwizzleZXY(const Float4& in);
};
template <size_t LANE>
Float4 Float4::Dup()
{
    return (LANE < 2)
               ? vdupq_lane_f32(vget_low_f32(s), LANE & 1)
               : vdupq_lane_f32(vget_high_f32(s), LANE & 1);
}

inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
    return vaddq_f32(lhs.s, rhs.s);
}

inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
    return vsubq_f32(lhs.s, rhs.s);
}

inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
    return vmulq_f32(lhs.s, rhs.s);
}

inline Float4 operator*(const Float4& lhs, float rhs)
{
    return vmulq_n_f32(lhs.s, rhs);
}

inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
#if defined(_M_ARM64) || defined(__aarch64__)
    return vdivq_f32(lhs.s, rhs.s);
#else
    // ARMv7 NEON has no divide: start from a reciprocal estimate and refine it
    // with two Newton-Raphson steps before multiplying.
    float32x4_t recp = vrecpeq_f32(rhs.s);
    float32x4_t s = vrecpsq_f32(recp, rhs.s);
    recp = vmulq_f32(s, recp);
    s = vrecpsq_f32(recp, rhs.s);
    recp = vmulq_f32(s, recp);
    return vmulq_f32(lhs.s, recp);
#endif
}

inline Float4 operator/(const Float4& lhs, float rhs)
{
    return lhs * (1.0f / rhs);
}

inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
    uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
    uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
    return vreinterpretq_f32_u32(vandq_u32(lhsi, rhsi));
}

inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
    uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
    uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
    return vreinterpretq_f32_u32(vorrq_u32(lhsi, rhsi));
}

inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
    uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
    uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
    return vreinterpretq_f32_u32(veorq_u32(lhsi, rhsi));
}

inline bool operator==(const Float4& lhs, const Float4& rhs)
{
    return Float4::MoveMask(Float4::Equal(lhs, rhs)) == 0xf;
}

inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
    return Float4::MoveMask(Float4::Equal(lhs, rhs)) != 0xf;
}
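// Illustrative note: MoveMask(Equal(lhs, rhs)) sets one bit per lane, so 0xf
// means all four lanes compared equal. operator== is therefore a whole-vector
// equality test, not a lane-wise one; use Equal() for the lane-wise mask.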
inline Float4& Float4::operator+=(const Float4& rhs) { return *this = *this + rhs; }
inline Float4& Float4::operator-=(const Float4& rhs) { return *this = *this - rhs; }
inline Float4& Float4::operator*=(const Float4& rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator*=(float rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator/=(const Float4& rhs) { return *this = *this / rhs; }
inline Float4& Float4::operator/=(float rhs) { return *this = *this / rhs; }

inline Float4 Float4::Load2(const void* mem)
{
    float32x2_t low = vld1_f32((const float*)mem);
    float32x2_t high = vdup_n_f32(0.0f);
    return vcombine_f32(low, high);
}

inline void Float4::Store2(void* mem, const Float4& i)
{
    vst1_f32((float*)mem, vget_low_f32(i.s));
}

inline Float4 Float4::Load3(const void* mem)
{
    float32x2_t low = vld1_f32((const float*)mem);
    float32x2_t high = vld1_lane_f32((const float*)mem + 2, vdup_n_f32(0.0f), 0);
    return vcombine_f32(low, high);
}

inline void Float4::Store3(void* mem, const Float4& i)
{
    vst1_f32((float*)mem, vget_low_f32(i.s));
    vst1q_lane_f32((float*)mem + 2, i.s, 2);
}

inline Float4 Float4::Load4(const void* mem)
{
    return vld1q_f32((const float*)mem);
}

inline void Float4::Store4(void* mem, const Float4& i)
{
    vst1q_f32((float*)mem, i.s);
}

inline Float4 Float4::SetZero()
{
    return vdupq_n_f32(0.0f);
}

inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
    const int32_t i[4] = {x, y, z, w};
    return vreinterpretq_f32_s32(vld1q_s32(i));
}

inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
    const uint32_t i[4] = {x, y, z, w};
    return vreinterpretq_f32_u32(vld1q_u32(i));
}

inline Float4 Float4::Sqrt(const Float4& in)
{
#if defined(_M_ARM64) || defined(__aarch64__)
    return vsqrtq_f32(in.s);
#else
    return Float4(1.0f) / Float4::Rsqrt(in);
#endif
}

inline Float4 Float4::Rsqrt(const Float4& in)
{
    // Reciprocal square-root estimate refined with one Newton-Raphson step.
    float32x4_t s0 = vrsqrteq_f32(in.s);
    float32x4_t p0 = vmulq_f32(in.s, s0);
    float32x4_t r0 = vrsqrtsq_f32(p0, s0);
    float32x4_t s1 = vmulq_f32(s0, r0);
    return s1;
}
inline Float4 Float4::Abs(const Float4& in)
{
    return vabsq_f32(in.s);
}

inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
    return vminq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
    return vmaxq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::Floor(const Float4& in)
{
#if defined(_M_ARM64) || defined(__aarch64__)
    return vrndmq_f32(in.s);
#else
    // Truncate toward zero, then subtract one wherever truncation rounded up.
    // The all-ones comparison mask reads as -1 when converted from int32.
    int32x4_t in_i = vcvtq_s32_f32(in.s);
    float32x4_t result = vcvtq_f32_s32(in_i);
    uint32x4_t larger = vcgtq_f32(result, in.s);
    return vaddq_f32(result, vcvtq_f32_s32(vreinterpretq_s32_u32(larger)));
#endif
}

inline Float4 Float4::Ceil(const Float4& in)
{
#if defined(_M_ARM64) || defined(__aarch64__)
    return vrndpq_f32(in.s);
#else
    // Truncate toward zero, then add one wherever truncation rounded down.
    int32x4_t in_i = vcvtq_s32_f32(in.s);
    float32x4_t result = vcvtq_f32_s32(in_i);
    uint32x4_t smaller = vcltq_f32(result, in.s);
    return vsubq_f32(result, vcvtq_f32_s32(vreinterpretq_s32_u32(smaller)));
#endif
}

inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
    return vmlaq_f32(a.s, b.s, c.s);
}

inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
    return vmlsq_f32(a.s, b.s, c.s);
}

template <size_t LANE>
inline Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    float32x2_t rhs2 = (LANE < 2) ? vget_low_f32(rhs.s) : vget_high_f32(rhs.s);
    return vmulq_lane_f32(lhs.s, rhs2, LANE & 1);
}

template <size_t LANE>
inline Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    float32x2_t c2 = (LANE < 2) ? vget_low_f32(c.s) : vget_high_f32(c.s);
    return vmlaq_lane_f32(a.s, b.s, c2, LANE & 1);
}

template <size_t LANE>
inline Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    float32x2_t c2 = (LANE < 2) ? vget_low_f32(c.s) : vget_high_f32(c.s);
    return vmlsq_lane_f32(a.s, b.s, c2, LANE & 1);
}

inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
    // Multiply lane-wise, sum X+Y with a pairwise add, then fold in Z.
    // The dot product lands in the X lane.
    float32x4_t mul = vmulq_f32(lhs.s, rhs.s);
    float32x2_t xy = vpadd_f32(vget_low_f32(mul), vget_low_f32(mul));
    float32x2_t dot = vadd_f32(xy, vget_high_f32(mul));
    return vcombine_f32(dot, vdup_n_f32(0.0f));
}
inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
    return MulSub(SwizzleYZX(lhs.s) * SwizzleZXY(rhs.s), SwizzleZXY(lhs.s), SwizzleYZX(rhs.s));
}

template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Float4 Float4::Mask()
{
    static_assert(X <= 1, "X must be 0 or 1.");
    static_assert(Y <= 1, "Y must be 0 or 1.");
    static_assert(Z <= 1, "Z must be 0 or 1.");
    static_assert(W <= 1, "W must be 0 or 1.");
    const uint32_t in[4] = {0xffffffff * X, 0xffffffff * Y, 0xffffffff * Z, 0xffffffff * W};
    return vreinterpretq_f32_u32(vld1q_u32(in));
}

inline uint32_t Float4::MoveMask(const Float4& in)
{
    uint16x4_t u16x4 = vmovn_u32(vreinterpretq_u32_f32(in.s));
    uint16_t u16[4];
    vst1_u16(u16, u16x4);
    return (u16[0] & 1) | (u16[1] & 2) | (u16[2] & 4) | (u16[3] & 8);
}

inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
    uint32x4_t maski = vreinterpretq_u32_f32(mask.s);
    return vbslq_f32(maski, sel1.s, sel2.s);
}

inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
    return vceqq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
    return vmvnq_u32(vceqq_f32(lhs.s, rhs.s));
}

inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
    return vcltq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
    return vcleq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
    return vcgtq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
    return vcgeq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
    return LessEqual(Abs(lhs - rhs), Float4(epsilon));
}

inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
    return LessEqual(Abs(in), Float4(epsilon));
}

inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
    // Two rounds of zips produce the transposed rows.
    float32x4x2_t t0 = vzipq_f32(s0.s, s2.s);
    float32x4x2_t t1 = vzipq_f32(s1.s, s3.s);
    float32x4x2_t t2 = vzipq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vzipq_f32(t0.val[1], t1.val[1]);
    s0 = t2.val[0];
    s1 = t2.val[1];
    s2 = t3.val[0];
    s3 = t3.val[1];
}

inline Float4 Float4::SwizzleYZX(const Float4& in)
{
    // Rotate lanes left by one (y,z,w,x), then copy x into lane 2, giving
    // (y,z,x,*); the W lane is a don't-care for Cross3.
    float32x4_t ex = vextq_f32(in.s, in.s, 1);
    return vsetq_lane_f32(vgetq_lane_f32(ex, 3), ex, 2);
}

inline Float4 Float4::SwizzleZXY(const Float4& in)
{
    // Rotate lanes left by three (w,x,y,z), then copy z into lane 0, giving
    // (z,x,y,*); the W lane is a don't-care for Cross3.
    float32x4_t ex = vextq_f32(in.s, in.s, 3);
    return vsetq_lane_f32(vgetq_lane_f32(ex, 3), ex, 0);
}
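// Illustrative note: SwizzleYZX/SwizzleZXY only guarantee the XYZ lanes, which
// is all Cross3 needs. Read lane by lane, Cross3 expands to the textbook
// cross product: cross(a, b) = a.yzx * b.zxy - a.zxy * b.yzx.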
#endif // defined(EFK_SIMD_NEON)

#endif // __EFFEKSEER_SIMD_FLOAT4_NEON_H__
#ifndef __EFFEKSEER_SIMD_FLOAT4_SSE_H__
#define __EFFEKSEER_SIMD_FLOAT4_SSE_H__

#if defined(EFK_SIMD_SSE2)

inline float Sqrt(float x)
{
    _mm_store_ss(&x, _mm_sqrt_ss(_mm_load_ss(&x)));
    return x;
}

inline float Rsqrt(float x)
{
    _mm_store_ss(&x, _mm_rsqrt_ss(_mm_load_ss(&x)));
    return x;
}

struct Int4;

// SIMD Float4 for SSE.
struct alignas(16) Float4
{
    __m128 s;

    Float4() = default;
    Float4(const Float4& rhs) = default;
    Float4(__m128 rhs) { s = rhs; }
    Float4(__m128i rhs) { s = _mm_castsi128_ps(rhs); }
    Float4(float x, float y, float z, float w) { s = _mm_setr_ps(x, y, z, w); }
    Float4(float i) { s = _mm_set_ps1(i); }

    float GetX() const { return _mm_cvtss_f32(s); }
    float GetY() const { return _mm_cvtss_f32(Swizzle<1,1,1,1>(s).s); }
    float GetZ() const { return _mm_cvtss_f32(Swizzle<2,2,2,2>(s).s); }
    float GetW() const { return _mm_cvtss_f32(Swizzle<3,3,3,3>(s).s); }

    void SetX(float i) { s = _mm_move_ss(s, _mm_set_ss(i)); }
    void SetY(float i) { s = Swizzle<1,0,2,3>(_mm_move_ss(Swizzle<1,0,2,3>(s).s, _mm_set_ss(i))).s; }
    void SetZ(float i) { s = Swizzle<2,1,0,3>(_mm_move_ss(Swizzle<2,1,0,3>(s).s, _mm_set_ss(i))).s; }
    void SetW(float i) { s = Swizzle<3,1,2,0>(_mm_move_ss(Swizzle<3,1,2,0>(s).s, _mm_set_ss(i))).s; }

    template <size_t LANE>
    Float4 Dup() { return Swizzle<LANE,LANE,LANE,LANE>(s); }

    Int4 Convert4i() const;
    Int4 Cast4i() const;

    Float4& operator+=(const Float4& rhs);
    Float4& operator-=(const Float4& rhs);
    Float4& operator*=(const Float4& rhs);
    Float4& operator*=(float rhs);
    Float4& operator/=(const Float4& rhs);
    Float4& operator/=(float rhs);

    static Float4 Load2(const void* mem);
    static void Store2(void* mem, const Float4& i);
    static Float4 Load3(const void* mem);
    static void Store3(void* mem, const Float4& i);
    static Float4 Load4(const void* mem);
    static void Store4(void* mem, const Float4& i);

    static Float4 SetZero();
    static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
    static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
    static Float4 Sqrt(const Float4& in);
    static Float4 Rsqrt(const Float4& in);
    static Float4 Abs(const Float4& in);
    static Float4 Min(const Float4& lhs, const Float4& rhs);
    static Float4 Max(const Float4& lhs, const Float4& rhs);
    static Float4 Floor(const Float4& in);
    static Float4 Ceil(const Float4& in);
    static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
    static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);

    template <size_t LANE>
    static Float4 MulLane(const Float4& lhs, const Float4& rhs);
    template <size_t LANE>
    static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
    template <size_t LANE>
    static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
    template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
    static Float4 Swizzle(const Float4& v);

    static Float4 Dot3(const Float4& lhs, const Float4& rhs);
    static Float4 Cross3(const Float4& lhs, const Float4& rhs);

    template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
    static Float4 Mask();
    static uint32_t MoveMask(const Float4& in);
    static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
    static Float4 Equal(const Float4& lhs, const Float4& rhs);
    static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
    static Float4 LessThan(const Float4& lhs, const Float4& rhs);
    static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
    static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
    static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
    static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
    static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
    static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);
};
inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_add_ps(lhs.s, rhs.s)};
}

inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_sub_ps(lhs.s, rhs.s)};
}

inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_mul_ps(lhs.s, rhs.s)};
}

inline Float4 operator*(const Float4& lhs, float rhs)
{
    return Float4{_mm_mul_ps(lhs.s, _mm_set1_ps(rhs))};
}

inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_div_ps(lhs.s, rhs.s)};
}

inline Float4 operator/(const Float4& lhs, float rhs)
{
    return Float4{_mm_div_ps(lhs.s, _mm_set1_ps(rhs))};
}

inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_and_ps(lhs.s, rhs.s)};
}

inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_or_ps(lhs.s, rhs.s)};
}

inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_xor_ps(lhs.s, rhs.s)};
}

inline bool operator==(const Float4& lhs, const Float4& rhs)
{
    return Float4::MoveMask(Float4::Equal(lhs, rhs)) == 0xf;
}

inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
    return Float4::MoveMask(Float4::Equal(lhs, rhs)) != 0xf;
}

inline Float4& Float4::operator+=(const Float4& rhs) { return *this = *this + rhs; }
inline Float4& Float4::operator-=(const Float4& rhs) { return *this = *this - rhs; }
inline Float4& Float4::operator*=(const Float4& rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator*=(float rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator/=(const Float4& rhs) { return *this = *this / rhs; }
inline Float4& Float4::operator/=(float rhs) { return *this = *this / rhs; }
inline Float4 Float4::Load2(const void* mem)
{
    __m128 x = _mm_load_ss((const float*)mem + 0);
    __m128 y = _mm_load_ss((const float*)mem + 1);
    return _mm_unpacklo_ps(x, y);
}

inline void Float4::Store2(void* mem, const Float4& i)
{
    Float4 t1 = Swizzle<1,1,1,1>(i.s);
    _mm_store_ss((float*)mem + 0, i.s);
    _mm_store_ss((float*)mem + 1, t1.s);
}

inline Float4 Float4::Load3(const void* mem)
{
    __m128 x = _mm_load_ss((const float*)mem + 0);
    __m128 y = _mm_load_ss((const float*)mem + 1);
    __m128 z = _mm_load_ss((const float*)mem + 2);
    __m128 xy = _mm_unpacklo_ps(x, y);
    return _mm_movelh_ps(xy, z);
}

inline void Float4::Store3(void* mem, const Float4& i)
{
    Float4 t1 = Swizzle<1,1,1,1>(i.s);
    Float4 t2 = Swizzle<2,2,2,2>(i.s);
    _mm_store_ss((float*)mem + 0, i.s);
    _mm_store_ss((float*)mem + 1, t1.s);
    _mm_store_ss((float*)mem + 2, t2.s);
}

inline Float4 Float4::Load4(const void* mem)
{
    return _mm_loadu_ps((const float*)mem);
}

inline void Float4::Store4(void* mem, const Float4& i)
{
    _mm_storeu_ps((float*)mem, i.s);
}

inline Float4 Float4::SetZero()
{
    return _mm_setzero_ps();
}

inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
    return Float4{_mm_setr_epi32((int)x, (int)y, (int)z, (int)w)};
}

inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
    return Float4{_mm_setr_epi32((int)x, (int)y, (int)z, (int)w)};
}
inline Float4 Float4::Sqrt(const Float4& in)
{
    return Float4{_mm_sqrt_ps(in.s)};
}

inline Float4 Float4::Rsqrt(const Float4& in)
{
    return Float4{_mm_rsqrt_ps(in.s)};
}

inline Float4 Float4::Abs(const Float4& in)
{
    // Clear the sign bit of every lane.
    return _mm_andnot_ps(_mm_set1_ps(-0.0f), in.s);
}

inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_min_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_max_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::Floor(const Float4& in)
{
#if defined(EFK_SIMD_SSE4_2)
    return _mm_floor_ps(in.s);
#else
    // Truncate toward zero, then subtract one wherever truncation rounded up
    // (the all-ones comparison mask converts to -1.0f).
    __m128i in_i = _mm_cvttps_epi32(in.s);
    __m128 result = _mm_cvtepi32_ps(in_i);
    __m128 larger = _mm_cmpgt_ps(result, in.s);
    larger = _mm_cvtepi32_ps(_mm_castps_si128(larger));
    return _mm_add_ps(result, larger);
#endif
}

inline Float4 Float4::Ceil(const Float4& in)
{
#if defined(EFK_SIMD_SSE4_2)
    return _mm_ceil_ps(in.s);
#else
    // Truncate toward zero, then add one wherever truncation rounded down.
    __m128i in_i = _mm_cvttps_epi32(in.s);
    __m128 result = _mm_cvtepi32_ps(in_i);
    __m128 smaller = _mm_cmplt_ps(result, in.s);
    smaller = _mm_cvtepi32_ps(_mm_castps_si128(smaller));
    return _mm_sub_ps(result, smaller);
#endif
}

inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
#if defined(EFK_SIMD_AVX2)
    return Float4{_mm_fmadd_ps(b.s, c.s, a.s)};
#else
    return Float4{_mm_add_ps(a.s, _mm_mul_ps(b.s, c.s))};
#endif
}

inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
#if defined(EFK_SIMD_AVX2)
    return Float4{_mm_fnmadd_ps(b.s, c.s, a.s)};
#else
    return Float4{_mm_sub_ps(a.s, _mm_mul_ps(b.s, c.s))};
#endif
}
template <size_t LANE>
Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    return _mm_mul_ps(lhs.s, Swizzle<LANE,LANE,LANE,LANE>(rhs).s);
}

template <size_t LANE>
Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
#if defined(EFK_SIMD_AVX2)
    return _mm_fmadd_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s, a.s);
#else
    return _mm_add_ps(a.s, _mm_mul_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s));
#endif
}

template <size_t LANE>
Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
#if defined(EFK_SIMD_AVX2)
    return _mm_fnmadd_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s, a.s);
#else
    return _mm_sub_ps(a.s, _mm_mul_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s));
#endif
}

template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Float4 Float4::Swizzle(const Float4& v)
{
    static_assert(indexX < 4, "indexX must be less than 4.");
    static_assert(indexY < 4, "indexY must be less than 4.");
    static_assert(indexZ < 4, "indexZ must be less than 4.");
    static_assert(indexW < 4, "indexW must be less than 4.");

#if defined(EFK_SIMD_AVX)
    return _mm_permute_ps(v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX));
#else
    return _mm_shuffle_ps(v.s, v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX));
#endif
}

inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
    // The dot product lands in the X lane; the other lanes are don't-cares.
    Float4 muled = lhs * rhs;
    return _mm_add_ss(_mm_add_ss(muled.s, Float4::Swizzle<1,1,1,1>(muled).s), Float4::Swizzle<2,2,2,2>(muled).s);
}

inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
    return Float4::Swizzle<1,2,0,3>(lhs) * Float4::Swizzle<2,0,1,3>(rhs) -
           Float4::Swizzle<2,0,1,3>(lhs) * Float4::Swizzle<1,2,0,3>(rhs);
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Float4 Float4::Mask()
{
    static_assert(X <= 1, "X must be 0 or 1.");
    static_assert(Y <= 1, "Y must be 0 or 1.");
    static_assert(Z <= 1, "Z must be 0 or 1.");
    static_assert(W <= 1, "W must be 0 or 1.");
    return _mm_setr_epi32(
        (int)(0xffffffff * X),
        (int)(0xffffffff * Y),
        (int)(0xffffffff * Z),
        (int)(0xffffffff * W));
}

inline uint32_t Float4::MoveMask(const Float4& in)
{
    return (uint32_t)_mm_movemask_ps(in.s);
}

inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
    // Branchless blend: (mask & sel1) | (~mask & sel2).
    return _mm_or_ps(_mm_and_ps(mask.s, sel1.s), _mm_andnot_ps(mask.s, sel2.s));
}
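// Usage sketch (illustrative only): Select composes with the comparison masks,
// e.g. a lane-wise clamp of negative values to zero:
//   Float4 clamped = Float4::Select(Float4::LessThan(v, Float4::SetZero()),
//                                   Float4::SetZero(), v);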
inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_cmpeq_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_cmpneq_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_cmplt_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_cmple_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_cmpgt_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
    return Float4{_mm_cmpge_ps(lhs.s, rhs.s)};
}

inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
    return LessEqual(Abs(lhs - rhs), Float4(epsilon));
}

inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
    return LessEqual(Abs(in), Float4(epsilon));
}
inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
    _MM_TRANSPOSE4_PS(s0.s, s1.s, s2.s, s3.s);
}
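// _MM_TRANSPOSE4_PS is the stock SSE transpose macro from <xmmintrin.h>; it
// rewrites its four arguments in place, matching the generic path above.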
#endif // defined(EFK_SIMD_SSE2)

#endif // __EFFEKSEER_SIMD_FLOAT4_SSE_H__
#ifndef __EFFEKSEER_SIMD_INT4_GEN_H__
#define __EFFEKSEER_SIMD_INT4_GEN_H__

#if defined(EFK_SIMD_GEN)

#include <algorithm>

struct Float4;

// SIMD Int4 for the generic (scalar fallback) path.
struct alignas(16) Int4
{
    union {
        float vf[4];
        int32_t vi[4];
        uint32_t vu[4];
    };

    Int4() = default;
    Int4(const Int4& rhs) = default;
    Int4(int32_t x, int32_t y, int32_t z, int32_t w) { vi[0] = x; vi[1] = y; vi[2] = z; vi[3] = w; }
    Int4(int32_t i) { vi[0] = i; vi[1] = i; vi[2] = i; vi[3] = i; }

    int32_t GetX() const { return vi[0]; }
    int32_t GetY() const { return vi[1]; }
    int32_t GetZ() const { return vi[2]; }
    int32_t GetW() const { return vi[3]; }

    void SetX(int32_t o) { vi[0] = o; }
    void SetY(int32_t o) { vi[1] = o; }
    void SetZ(int32_t o) { vi[2] = o; }
    void SetW(int32_t o) { vi[3] = o; }

    Float4 Convert4f() const;

    Int4& operator+=(const Int4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vi[i] += rhs.vi[i];
        }
        return *this;
    }

    Int4& operator-=(const Int4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vi[i] -= rhs.vi[i];
        }
        return *this;
    }

    Int4& operator*=(const Int4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vi[i] *= rhs.vi[i];
        }
        return *this;
    }

    Int4& operator*=(int32_t rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vi[i] *= rhs;
        }
        return *this;
    }

    Int4& operator/=(const Int4& rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vi[i] /= rhs.vi[i];
        }
        return *this;
    }

    Int4& operator/=(int32_t rhs)
    {
        for (size_t i = 0; i < 4; i++)
        {
            vi[i] /= rhs;
        }
        return *this;
    }

    static Int4 Load2(const void* mem);
    static void Store2(void* mem, const Int4& i);
    static Int4 Load3(const void* mem);
    static void Store3(void* mem, const Int4& i);
    static Int4 Load4(const void* mem);
    static void Store4(void* mem, const Int4& i);

    static Int4 SetZero();
    static Int4 Abs(const Int4& in);
    static Int4 Min(const Int4& lhs, const Int4& rhs);
    static Int4 Max(const Int4& lhs, const Int4& rhs);
    static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
    static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);

    template <size_t LANE>
    static Int4 MulLane(const Int4& lhs, const Int4& rhs);
    template <size_t LANE>
    static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
    template <size_t LANE>
    static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
    template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
    static Int4 Swizzle(const Int4& in);

    template <int COUNT>
    static Int4 ShiftL(const Int4& in);
    template <int COUNT>
    static Int4 ShiftR(const Int4& in);
    template <int COUNT>
    static Int4 ShiftRA(const Int4& in);

    template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
    static Int4 Mask();
    static uint32_t MoveMask(const Int4& in);
    static Int4 Equal(const Int4& lhs, const Int4& rhs);
    static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
    static Int4 LessThan(const Int4& lhs, const Int4& rhs);
    static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
    static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
    static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
    static Int4 NearEqual(const Int4& lhs, const Int4& rhs, float epsilon = DefaultEpsilon);
    static Int4 IsZero(const Int4& in, float epsilon = DefaultEpsilon);
    static void Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3);
};
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = lhs.vi[i] + rhs.vi[i];
    }
    return ret;
}

inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = lhs.vi[i] - rhs.vi[i];
    }
    return ret;
}

inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = lhs.vi[i] * rhs.vi[i];
    }
    return ret;
}

inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = lhs.vi[i] * rhs;
    }
    return ret;
}

inline Int4 operator/(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = lhs.vi[i] / rhs.vi[i];
    }
    return ret;
}

inline Int4 operator/(const Int4& lhs, int32_t rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = lhs.vi[i] / rhs;
    }
    return ret;
}

inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] & rhs.vu[i];
    }
    return ret;
}

inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] | rhs.vu[i];
    }
    return ret;
}

inline Int4 operator^(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] ^ rhs.vu[i];
    }
    return ret;
}

inline bool operator==(const Int4& lhs, const Int4& rhs)
{
    bool ret = true;
    for (size_t i = 0; i < 4; i++)
    {
        ret &= lhs.vi[i] == rhs.vi[i];
    }
    return ret;
}

inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
    bool ret = true;
    for (size_t i = 0; i < 4; i++)
    {
        ret &= lhs.vi[i] == rhs.vi[i];
    }
    return !ret;
}

inline Int4 Int4::Load2(const void* mem)
{
    Int4 ret;
    memcpy(ret.vi, mem, sizeof(int32_t) * 2);
    ret.vi[2] = 0;
    ret.vi[3] = 0;
    return ret;
}

inline void Int4::Store2(void* mem, const Int4& i)
{
    memcpy(mem, i.vi, sizeof(int32_t) * 2);
}

inline Int4 Int4::Load3(const void* mem)
{
    Int4 ret;
    memcpy(ret.vi, mem, sizeof(int32_t) * 3);
    ret.vi[3] = 0;
    return ret;
}

inline void Int4::Store3(void* mem, const Int4& i)
{
    memcpy(mem, i.vi, sizeof(int32_t) * 3);
}

inline Int4 Int4::Load4(const void* mem)
{
    Int4 ret;
    memcpy(ret.vi, mem, sizeof(int32_t) * 4);
    return ret;
}

inline void Int4::Store4(void* mem, const Int4& i)
{
    memcpy(mem, i.vi, sizeof(int32_t) * 4);
}
inline Int4 Int4::SetZero()
{
    Int4 ret;
    ret.vi[0] = 0;
    ret.vi[1] = 0;
    ret.vi[2] = 0;
    ret.vi[3] = 0;
    return ret;
}

inline Int4 Int4::Abs(const Int4& in)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = std::abs(in.vi[i]);
    }
    return ret;
}

inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = (lhs.vi[i] < rhs.vi[i]) ? lhs.vi[i] : rhs.vi[i];
    }
    return ret;
}

inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = (lhs.vi[i] > rhs.vi[i]) ? lhs.vi[i] : rhs.vi[i];
    }
    return ret;
}

inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = a.vi[i] + b.vi[i] * c.vi[i];
    }
    return ret;
}

inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = a.vi[i] - b.vi[i] * c.vi[i];
    }
    return ret;
}

template <size_t LANE>
Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    return lhs * rhs.vi[LANE];
}

template <size_t LANE>
Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    return a + b * c.vi[LANE];
}

template <size_t LANE>
Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    return a - b * c.vi[LANE];
}

template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Int4 Int4::Swizzle(const Int4& in)
{
    static_assert(indexX < 4, "indexX must be less than 4.");
    static_assert(indexY < 4, "indexY must be less than 4.");
    static_assert(indexZ < 4, "indexZ must be less than 4.");
    static_assert(indexW < 4, "indexW must be less than 4.");
    return Int4{in.vi[indexX], in.vi[indexY], in.vi[indexZ], in.vi[indexW]};
}
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] << COUNT;
    }
    return ret;
}

template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = lhs.vu[i] >> COUNT;
    }
    return ret;
}

template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vi[i] = lhs.vi[i] >> COUNT;
    }
    return ret;
}

template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Int4 Int4::Mask()
{
    static_assert(X <= 1, "X must be 0 or 1.");
    static_assert(Y <= 1, "Y must be 0 or 1.");
    static_assert(Z <= 1, "Z must be 0 or 1.");
    static_assert(W <= 1, "W must be 0 or 1.");
    Int4 ret;
    ret.vu[0] = 0xffffffff * X;
    ret.vu[1] = 0xffffffff * Y;
    ret.vu[2] = 0xffffffff * Z;
    ret.vu[3] = 0xffffffff * W;
    return ret;
}

inline uint32_t Int4::MoveMask(const Int4& in)
{
    return (in.vu[0] & 0x1) | (in.vu[1] & 0x2) | (in.vu[2] & 0x4) | (in.vu[3] & 0x8);
}
inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vi[i] == rhs.vi[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vi[i] != rhs.vi[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vi[i] < rhs.vi[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vi[i] <= rhs.vi[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vi[i] > rhs.vi[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (lhs.vi[i] >= rhs.vi[i]) ? 0xffffffff : 0;
    }
    return ret;
}

inline Int4 Int4::NearEqual(const Int4& lhs, const Int4& rhs, float epsilon)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (std::abs(lhs.vi[i] - rhs.vi[i]) <= epsilon) ? 0xffffffff : 0;
    }
    return ret;
}

inline Int4 Int4::IsZero(const Int4& in, float epsilon)
{
    Int4 ret;
    for (size_t i = 0; i < 4; i++)
    {
        ret.vu[i] = (std::abs(in.vi[i]) <= epsilon) ? 0xffffffff : 0;
    }
    return ret;
}

inline void Int4::Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3)
{
    std::swap(s0.vi[1], s1.vi[0]);
    std::swap(s0.vi[2], s2.vi[0]);
    std::swap(s0.vi[3], s3.vi[0]);
    std::swap(s1.vi[2], s2.vi[1]);
    std::swap(s2.vi[3], s3.vi[2]);
    std::swap(s1.vi[3], s3.vi[1]);
}
#endif // defined(EFK_SIMD_GEN)

#endif // __EFFEKSEER_SIMD_INT4_GEN_H__
#ifndef __EFFEKSEER_SIMD_INT4_NEON_H__
#define __EFFEKSEER_SIMD_INT4_NEON_H__

#if defined(EFK_SIMD_NEON)

struct Float4;

// SIMD Int4 for ARM NEON.
struct alignas(16) Int4
{
    int32x4_t s;

    Int4() = default;
    Int4(const Int4& rhs) = default;
    Int4(int32x4_t rhs) { s = rhs; }
    Int4(int32_t x, int32_t y, int32_t z, int32_t w) { const int32_t v[4] = {x, y, z, w}; s = vld1q_s32(v); }
    Int4(int32_t i) { s = vdupq_n_s32(i); }

    int32_t GetX() const { return vgetq_lane_s32(s, 0); }
    int32_t GetY() const { return vgetq_lane_s32(s, 1); }
    int32_t GetZ() const { return vgetq_lane_s32(s, 2); }
    int32_t GetW() const { return vgetq_lane_s32(s, 3); }

    void SetX(int32_t i) { s = vsetq_lane_s32(i, s, 0); }
    void SetY(int32_t i) { s = vsetq_lane_s32(i, s, 1); }
    void SetZ(int32_t i) { s = vsetq_lane_s32(i, s, 2); }
    void SetW(int32_t i) { s = vsetq_lane_s32(i, s, 3); }

    Float4 Convert4f() const;
    Float4 Cast4f() const;

    Int4& operator+=(const Int4& rhs);
    Int4& operator-=(const Int4& rhs);
    Int4& operator*=(const Int4& rhs);
    Int4& operator*=(int32_t rhs);
    Int4& operator/=(const Int4& rhs);
    Int4& operator/=(int32_t rhs);

    static Int4 Load2(const void* mem);
    static void Store2(void* mem, const Int4& i);
    static Int4 Load3(const void* mem);
    static void Store3(void* mem, const Int4& i);
    static Int4 Load4(const void* mem);
    static void Store4(void* mem, const Int4& i);

    static Int4 SetZero();
    static Int4 Abs(const Int4& in);
    static Int4 Min(const Int4& lhs, const Int4& rhs);
    static Int4 Max(const Int4& lhs, const Int4& rhs);
    static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
    static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);

    template <size_t LANE>
    static Int4 MulLane(const Int4& lhs, const Int4& rhs);
    template <size_t LANE>
    static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
    template <size_t LANE>
    static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
    template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
    static Int4 Swizzle(const Int4& v);

    template <int COUNT>
    static Int4 ShiftL(const Int4& in);
    template <int COUNT>
    static Int4 ShiftR(const Int4& in);
    template <int COUNT>
    static Int4 ShiftRA(const Int4& in);

    template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
    static Int4 Mask();
    static uint32_t MoveMask(const Int4& in);
    static Int4 Equal(const Int4& lhs, const Int4& rhs);
    static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
    static Int4 LessThan(const Int4& lhs, const Int4& rhs);
    static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
    static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
    static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
    static Int4 NearEqual(const Int4& lhs, const Int4& rhs, int32_t epsilon = DefaultEpsilon);
    static Int4 IsZero(const Int4& in, int32_t epsilon = DefaultEpsilon);
    static void Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3);

    static Int4 SwizzleYZX(const Int4& in);
    static Int4 SwizzleZXY(const Int4& in);
};
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
    return vaddq_s32(lhs.s, rhs.s);
}

inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
    return vsubq_s32(lhs.s, rhs.s);
}

inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
    return vmulq_s32(lhs.s, rhs.s);
}

inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
    return vmulq_n_s32(lhs.s, rhs);
}

inline Int4 operator/(const Int4& lhs, const Int4& rhs)
{
    // NEON has no integer vector divide, so divide lane by lane.
    return Int4(
        lhs.GetX() / rhs.GetX(),
        lhs.GetY() / rhs.GetY(),
        lhs.GetZ() / rhs.GetZ(),
        lhs.GetW() / rhs.GetW());
}

inline Int4 operator/(const Int4& lhs, int32_t rhs)
{
    return lhs / Int4(rhs);
}

inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
    uint32x4_t lhsi = vreinterpretq_u32_s32(lhs.s);
    uint32x4_t rhsi = vreinterpretq_u32_s32(rhs.s);
    return vreinterpretq_s32_u32(vandq_u32(lhsi, rhsi));
}

inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
    uint32x4_t lhsi = vreinterpretq_u32_s32(lhs.s);
    uint32x4_t rhsi = vreinterpretq_u32_s32(rhs.s);
    return vreinterpretq_s32_u32(vorrq_u32(lhsi, rhsi));
}

inline bool operator==(const Int4& lhs, const Int4& rhs)
{
    return Int4::MoveMask(Int4::Equal(lhs, rhs)) == 0xf;
}

inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
    return Int4::MoveMask(Int4::Equal(lhs, rhs)) != 0xf;
}

inline Int4& Int4::operator+=(const Int4& rhs) { return *this = *this + rhs; }
inline Int4& Int4::operator-=(const Int4& rhs) { return *this = *this - rhs; }
inline Int4& Int4::operator*=(const Int4& rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator*=(int32_t rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator/=(const Int4& rhs) { return *this = *this / rhs; }
inline Int4& Int4::operator/=(int32_t rhs) { return *this = *this / rhs; }
inline Int4 Int4::Load2(const void* mem)
{
    int32x2_t low = vld1_s32((const int32_t*)mem);
    int32x2_t high = vdup_n_s32(0);
    return vcombine_s32(low, high);
}

inline void Int4::Store2(void* mem, const Int4& i)
{
    vst1_s32((int32_t*)mem, vget_low_s32(i.s));
}

inline Int4 Int4::Load3(const void* mem)
{
    int32x2_t low = vld1_s32((const int32_t*)mem);
    int32x2_t high = vld1_lane_s32((const int32_t*)mem + 2, vdup_n_s32(0), 0);
    return vcombine_s32(low, high);
}

inline void Int4::Store3(void* mem, const Int4& i)
{
    vst1_s32((int32_t*)mem, vget_low_s32(i.s));
    vst1q_lane_s32((int32_t*)mem + 2, i.s, 2);
}

inline Int4 Int4::Load4(const void* mem)
{
    return vld1q_s32((const int32_t*)mem);
}

inline void Int4::Store4(void* mem, const Int4& i)
{
    vst1q_s32((int32_t*)mem, i.s);
}

inline Int4 Int4::SetZero()
{
    return vdupq_n_s32(0);
}

inline Int4 Int4::Abs(const Int4& in)
{
    return vabsq_s32(in.s);
}

inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
    return vminq_s32(lhs.s, rhs.s);
}

inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
    return vmaxq_s32(lhs.s, rhs.s);
}

inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
    return vmlaq_s32(a.s, b.s, c.s);
}

inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
    return vmlsq_s32(a.s, b.s, c.s);
}
template <size_t LANE>
inline Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    int32x2_t rhs2 = (LANE < 2) ? vget_low_s32(rhs.s) : vget_high_s32(rhs.s);
    return vmulq_lane_s32(lhs.s, rhs2, LANE & 1);
}

template <size_t LANE>
inline Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    int32x2_t c2 = (LANE < 2) ? vget_low_s32(c.s) : vget_high_s32(c.s);
    return vmlaq_lane_s32(a.s, b.s, c2, LANE & 1);
}

template <size_t LANE>
inline Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
    static_assert(LANE < 4, "LANE must be less than 4.");
    int32x2_t c2 = (LANE < 2) ? vget_low_s32(c.s) : vget_high_s32(c.s);
    return vmlsq_lane_s32(a.s, b.s, c2, LANE & 1);
}

template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
    return vreinterpretq_s32_u32(vshlq_n_u32(vreinterpretq_u32_s32(lhs.s), COUNT));
}

template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
    return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(lhs.s), COUNT));
}

template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
    return vshrq_n_s32(lhs.s, COUNT);
}
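// Illustrative note: ShiftR is a logical shift (zero-fill, performed on the
// uint32 view), while ShiftRA is an arithmetic shift that preserves the sign:
//   Int4::ShiftR<1>(Int4(-2)).GetX();  // 0x7FFFFFFF
//   Int4::ShiftRA<1>(Int4(-2)).GetX(); // -1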
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Int4 Int4::Mask()
{
    static_assert(X <= 1, "X must be 0 or 1.");
    static_assert(Y <= 1, "Y must be 0 or 1.");
    static_assert(Z <= 1, "Z must be 0 or 1.");
    static_assert(W <= 1, "W must be 0 or 1.");
    const uint32_t in[4] = {0xffffffff * X, 0xffffffff * Y, 0xffffffff * Z, 0xffffffff * W};
    return vreinterpretq_s32_u32(vld1q_u32(in));
}

inline uint32_t Int4::MoveMask(const Int4& in)
{
    uint16x4_t u16x4 = vmovn_u32(vreinterpretq_u32_s32(in.s));
    uint16_t u16[4];
    vst1_u16(u16, u16x4);
    return (u16[0] & 1) | (u16[1] & 2) | (u16[2] & 4) | (u16[3] & 8);
}

inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
    return vreinterpretq_s32_u32(vceqq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
    return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(lhs.s, rhs.s)));
}

inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
    return vreinterpretq_s32_u32(vcltq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
    return vreinterpretq_s32_u32(vcleq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
    return vreinterpretq_s32_u32(vcgtq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
    return vreinterpretq_s32_u32(vcgeq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::NearEqual(const Int4& lhs, const Int4& rhs, int32_t epsilon)
{
    return LessEqual(Abs(lhs - rhs), Int4(epsilon));
}

inline Int4 Int4::IsZero(const Int4& in, int32_t epsilon)
{
    return LessEqual(Abs(in), Int4(epsilon));
}

inline void Int4::Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3)
{
    // Two rounds of zips produce the transposed rows.
    int32x4x2_t t0 = vzipq_s32(s0.s, s2.s);
    int32x4x2_t t1 = vzipq_s32(s1.s, s3.s);
    int32x4x2_t t2 = vzipq_s32(t0.val[0], t1.val[0]);
    int32x4x2_t t3 = vzipq_s32(t0.val[1], t1.val[1]);
    s0 = t2.val[0];
    s1 = t2.val[1];
    s2 = t3.val[0];
    s3 = t3.val[1];
}

inline Int4 Int4::SwizzleYZX(const Int4& in)
{
    // Rotate lanes left by one, then copy x into lane 2: (y,z,x,*).
    int32x4_t ex = vextq_s32(in.s, in.s, 1);
    return vsetq_lane_s32(vgetq_lane_s32(ex, 3), ex, 2);
}

inline Int4 Int4::SwizzleZXY(const Int4& in)
{
    // Rotate lanes left by three, then copy z into lane 0: (z,x,y,*).
    int32x4_t ex = vextq_s32(in.s, in.s, 3);
    return vsetq_lane_s32(vgetq_lane_s32(ex, 3), ex, 0);
}
#endif // defined(EFK_SIMD_NEON)

#endif // __EFFEKSEER_SIMD_INT4_NEON_H__
2513 #ifndef __EFFEKSEER_SIMD_INT4_SSE_H__
2514 #define __EFFEKSEER_SIMD_INT4_SSE_H__
2517 #if defined(EFK_SIMD_SSE2)
2531 struct alignas(16) Int4
2536 Int4(
const Int4& rhs) =
default;
2537 Int4(__m128i rhs) { s = rhs; }
2538 Int4(__m128 rhs) { s = _mm_castps_si128(rhs); }
2539 Int4(int32_t x, int32_t y, int32_t z, int32_t w) { s = _mm_setr_epi32((
int)x, (
int)y, (
int)z, (
int)w); }
2540 Int4(int32_t i) { s = _mm_set1_epi32((
int)i); }
	int32_t GetX() const { return _mm_cvtsi128_si32(s); }
	int32_t GetY() const { return _mm_cvtsi128_si32(Swizzle<1,1,1,1>(s).s); }
	int32_t GetZ() const { return _mm_cvtsi128_si32(Swizzle<2,2,2,2>(s).s); }
	int32_t GetW() const { return _mm_cvtsi128_si32(Swizzle<3,3,3,3>(s).s); }

	void SetX(int32_t i) { s = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s), _mm_castsi128_ps(_mm_cvtsi32_si128(i)))); }
	void SetY(int32_t i) { s = Swizzle<1,0,2,3>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<1,0,2,3>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
	void SetZ(int32_t i) { s = Swizzle<2,1,0,3>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<2,1,0,3>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
	void SetW(int32_t i) { s = Swizzle<3,1,2,0>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<3,1,2,0>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
	Float4 Convert4f() const;
	Float4 Cast4f() const;

	Int4& operator+=(const Int4& rhs);
	Int4& operator-=(const Int4& rhs);
	Int4& operator*=(const Int4& rhs);
	Int4& operator*=(int32_t rhs);
	Int4& operator/=(const Int4& rhs);
	Int4& operator/=(int32_t rhs);

	static Int4 Load2(const void* mem);
	static void Store2(void* mem, const Int4& i);
	static Int4 Load3(const void* mem);
	static void Store3(void* mem, const Int4& i);
	static Int4 Load4(const void* mem);
	static void Store4(void* mem, const Int4& i);

	static Int4 SetZero();
	static Int4 Abs(const Int4& in);
	static Int4 Min(const Int4& lhs, const Int4& rhs);
	static Int4 Max(const Int4& lhs, const Int4& rhs);
	static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
	static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);

	template <size_t LANE>
	static Int4 MulLane(const Int4& lhs, const Int4& rhs);
	template <size_t LANE>
	static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
	template <size_t LANE>
	static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
	template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
	static Int4 Swizzle(const Int4& v);

	template <int COUNT>
	static Int4 ShiftL(const Int4& in);
	template <int COUNT>
	static Int4 ShiftR(const Int4& in);
	template <int COUNT>
	static Int4 ShiftRA(const Int4& in);
	template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
	static Int4 Mask();
	static uint32_t MoveMask(const Int4& in);
	static Int4 Equal(const Int4& lhs, const Int4& rhs);
	static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
	static Int4 LessThan(const Int4& lhs, const Int4& rhs);
	static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
};
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_add_epi32(lhs.s, rhs.s)};
}

inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_sub_epi32(lhs.s, rhs.s)};
}
inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_mullo_epi32(lhs.s, rhs.s);
#else
	__m128i tmp1 = _mm_mul_epu32(lhs.s, rhs.s);
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(lhs.s, 4), _mm_srli_si128(rhs.s, 4));
	return _mm_unpacklo_epi32(
		_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
		_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
#endif
}
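// Illustrative note (not in the original source): SSE2 has no 32-bit
// lane-wise multiply, so the fallback uses _mm_mul_epu32, which multiplies
// lanes 0 and 2 into two 64-bit products. Shifting the inputs right by
// 4 bytes does the same for lanes 1 and 3, and the shuffle/unpack pair
// gathers the low 32 bits of all four products back into lane order.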
inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_mullo_epi32(lhs.s, _mm_set1_epi32(rhs));
#else
	__m128i tmp1 = _mm_mul_epu32(lhs.s, _mm_set1_epi32(rhs));
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(lhs.s, 4), _mm_set1_epi32(rhs));
	return _mm_unpacklo_epi32(
		_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
		_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
#endif
}
inline Int4 operator/(const Int4& lhs, const Int4& rhs)
{
	return Int4(
		lhs.GetX() / rhs.GetX(),
		lhs.GetY() / rhs.GetY(),
		lhs.GetZ() / rhs.GetZ(),
		lhs.GetW() / rhs.GetW());
}

inline Int4 operator/(const Int4& lhs, int32_t rhs)
{
	return Int4(
		lhs.GetX() / rhs,
		lhs.GetY() / rhs,
		lhs.GetZ() / rhs,
		lhs.GetW() / rhs);
}
inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_and_si128(lhs.s, rhs.s)};
}

inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_or_si128(lhs.s, rhs.s)};
}

inline Int4 operator^(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_xor_si128(lhs.s, rhs.s)};
}

inline bool operator==(const Int4& lhs, const Int4& rhs)
{
	return Int4::MoveMask(Int4::Equal(lhs, rhs)) == 0xf;
}

inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
	return Int4::MoveMask(Int4::Equal(lhs, rhs)) != 0xf;
}
inline Int4& Int4::operator+=(const Int4& rhs) { return *this = *this + rhs; }
inline Int4& Int4::operator-=(const Int4& rhs) { return *this = *this - rhs; }
inline Int4& Int4::operator*=(const Int4& rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator*=(int32_t rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator/=(const Int4& rhs) { return *this = *this / rhs; }
inline Int4& Int4::operator/=(int32_t rhs) { return *this = *this / rhs; }
inline Int4 Int4::Load2(const void* mem)
{
	__m128 x = _mm_load_ss((const float*)mem + 0);
	__m128 y = _mm_load_ss((const float*)mem + 1);
	return _mm_castps_si128(_mm_unpacklo_ps(x, y));
}

inline void Int4::Store2(void* mem, const Int4& i)
{
	Int4 t1 = Swizzle<1,1,1,1>(i);
	_mm_store_ss((float*)mem + 0, _mm_castsi128_ps(i.s));
	_mm_store_ss((float*)mem + 1, _mm_castsi128_ps(t1.s));
}

inline Int4 Int4::Load3(const void* mem)
{
	__m128 x = _mm_load_ss((const float*)mem + 0);
	__m128 y = _mm_load_ss((const float*)mem + 1);
	__m128 z = _mm_load_ss((const float*)mem + 2);
	__m128 xy = _mm_unpacklo_ps(x, y);
	return _mm_castps_si128(_mm_movelh_ps(xy, z));
}

inline void Int4::Store3(void* mem, const Int4& i)
{
	Int4 t1 = Swizzle<1,1,1,1>(i);
	Int4 t2 = Swizzle<2,2,2,2>(i);
	_mm_store_ss((float*)mem + 0, _mm_castsi128_ps(i.s));
	_mm_store_ss((float*)mem + 1, _mm_castsi128_ps(t1.s));
	_mm_store_ss((float*)mem + 2, _mm_castsi128_ps(t2.s));
}

inline Int4 Int4::Load4(const void* mem)
{
	return _mm_loadu_si128((const __m128i*)mem);
}

inline void Int4::Store4(void* mem, const Int4& i)
{
	_mm_storeu_si128((__m128i*)mem, i.s);
}
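// Usage sketch (illustrative, not part of the original header): Load4 and
// Store4 use unaligned intrinsics in this path, so arbitrary int32_t
// arrays round-trip safely.
//   int32_t src[4] = {1, 2, 3, 4};
//   Int4 v = Int4::Load4(src);
//   int32_t dst[4];
//   Int4::Store4(dst, v); // dst == {1, 2, 3, 4}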
inline Int4 Int4::SetZero()
{
	return _mm_setzero_si128();
}

inline Int4 Int4::Abs(const Int4& in)
{
#if defined(EFK_SIMD_SSSE3)
	return _mm_abs_epi32(in.s);
#else
	__m128i sign = _mm_srai_epi32(in.s, 31);
	return _mm_sub_epi32(_mm_xor_si128(in.s, sign), sign);
#endif
}
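// Illustrative note (not in the original source): the SSE2 fallback uses
// the two's-complement identity abs(x) == (x ^ (x >> 31)) - (x >> 31),
// where the arithmetic shift broadcasts the sign bit across the whole lane.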
inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_min_epi32(lhs.s, rhs.s);
#else
	__m128i mask = _mm_cmplt_epi32(lhs.s, rhs.s);
	return _mm_or_si128(_mm_and_si128(mask, lhs.s), _mm_andnot_si128(mask, rhs.s));
#endif
}

inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_max_epi32(lhs.s, rhs.s);
#else
	__m128i mask = _mm_cmpgt_epi32(lhs.s, rhs.s);
	return _mm_or_si128(_mm_and_si128(mask, lhs.s), _mm_andnot_si128(mask, rhs.s));
#endif
}
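// Illustrative note (not in the original source): before SSE4.1's
// _mm_min_epi32/_mm_max_epi32, the fallback builds an all-ones mask per
// lane from the comparison and blends the operands with
// (mask & lhs) | (~mask & rhs).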
inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
	return a + b * c;
}

inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
	return a - b * c;
}
template <size_t LANE>
Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
	static_assert(LANE < 4, "LANE must be less than 4.");
	return lhs * Int4::Swizzle<LANE,LANE,LANE,LANE>(rhs);
}

template <size_t LANE>
Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE must be less than 4.");
	return a + b * Int4::Swizzle<LANE,LANE,LANE,LANE>(c);
}

template <size_t LANE>
Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE must be less than 4.");
	return a - b * Int4::Swizzle<LANE,LANE,LANE,LANE>(c);
}
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Int4 Int4::Swizzle(const Int4& v)
{
	static_assert(indexX < 4, "indexX must be less than 4.");
	static_assert(indexY < 4, "indexY must be less than 4.");
	static_assert(indexZ < 4, "indexZ must be less than 4.");
	static_assert(indexW < 4, "indexW must be less than 4.");
	return Int4{_mm_shuffle_epi32(v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX))};
}
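// Usage sketch (illustrative, not part of the original header):
// broadcasting lane 2 of a vector.
//   Int4 v(10, 20, 30, 40);
//   Int4 zzzz = Int4::Swizzle<2, 2, 2, 2>(v); // {30, 30, 30, 30}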
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
	return _mm_slli_epi32(lhs.s, COUNT);
}

template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
	return _mm_srli_epi32(lhs.s, COUNT);
}

template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
	return _mm_srai_epi32(lhs.s, COUNT);
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Int4 Int4::Mask()
{
	static_assert(X < 2, "X must be 0 or 1.");
	static_assert(Y < 2, "Y must be 0 or 1.");
	static_assert(Z < 2, "Z must be 0 or 1.");
	static_assert(W < 2, "W must be 0 or 1.");
	return _mm_setr_epi32(
		(int)(0xffffffff * X),
		(int)(0xffffffff * Y),
		(int)(0xffffffff * Z),
		(int)(0xffffffff * W));
}
inline uint32_t Int4::MoveMask(const Int4& in)
{
	return (uint32_t)_mm_movemask_ps(_mm_castsi128_ps(in.s));
}

inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_cmpeq_epi32(lhs.s, rhs.s)};
}

inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_andnot_si128(_mm_cmpeq_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}

inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_cmplt_epi32(lhs.s, rhs.s)};
}

inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_andnot_si128(_mm_cmpgt_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}

inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_cmpgt_epi32(lhs.s, rhs.s)};
}

inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_andnot_si128(_mm_cmplt_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}
#endif

#endif // __EFFEKSEER_SIMD_INT4_SSE_H__
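// Usage sketch (illustrative, not part of the original header): the Int4
// API is identical across the GEN, NEON and SSE backends.
//   Int4 a(1, 2, 3, 4);
//   Int4 b = a * 2 + Int4(1); // {3, 5, 7, 9}
//   bool all = Int4::MoveMask(Int4::GreaterThan(b, a)) == 0xf; // true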
#ifndef __EFFEKSEER_SIMD_BRIDGE_GEN_H__
#define __EFFEKSEER_SIMD_BRIDGE_GEN_H__

#if defined(EFK_SIMD_GEN)

inline Int4 Float4::Convert4i() const { return Int4((int32_t)vf[0], (int32_t)vf[1], (int32_t)vf[2], (int32_t)vf[3]); }

inline Int4 Float4::Cast4i() const { return Int4(vu[0], vu[1], vu[2], vu[3]); }

inline Float4 Int4::Convert4f() const { return Float4((float)vi[0], (float)vi[1], (float)vi[2], (float)vi[3]); }

inline Float4 Int4::Cast4f() const { return Float4(vf[0], vf[1], vf[2], vf[3]); }

#endif

#endif // __EFFEKSEER_SIMD_BRIDGE_GEN_H__
#ifndef __EFFEKSEER_SIMD_BRIDGE_NEON_H__
#define __EFFEKSEER_SIMD_BRIDGE_NEON_H__

#if defined(EFK_SIMD_NEON)

inline Int4 Float4::Convert4i() const { return vcvtq_s32_f32(s); }

inline Int4 Float4::Cast4i() const { return vreinterpretq_s32_f32(s); }

inline Float4 Int4::Convert4f() const { return vcvtq_f32_s32(s); }

inline Float4 Int4::Cast4f() const { return vreinterpretq_f32_s32(s); }

#endif

#endif // __EFFEKSEER_SIMD_BRIDGE_NEON_H__
#ifndef __EFFEKSEER_SIMD_BRIDGE_SSE_H__
#define __EFFEKSEER_SIMD_BRIDGE_SSE_H__

#if defined(EFK_SIMD_SSE2)

inline Int4 Float4::Convert4i() const { return _mm_cvttps_epi32(s); }

inline Int4 Float4::Cast4i() const { return _mm_castps_si128(s); }

inline Float4 Int4::Convert4f() const { return _mm_cvtepi32_ps(s); }

inline Float4 Int4::Cast4f() const { return _mm_castsi128_ps(s); }

#endif

#endif // __EFFEKSEER_SIMD_BRIDGE_SSE_H__
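// Illustrative note (not in the original source): Convert4i/Convert4f
// change the numeric value (_mm_cvttps_epi32 truncates toward zero), while
// Cast4i/Cast4f only reinterpret the 128-bit register.
//   Float4 f(1.9f);
//   Int4 i = f.Convert4i(); // {1, 1, 1, 1}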
#ifndef __EFFEKSEER_SIMD_VEC2F_H__
#define __EFFEKSEER_SIMD_VEC2F_H__

struct Vec2f
{
	Float4 s;

	explicit Vec2f() = default;
	Vec2f(const Vec2f& vec) = default;
	Vec2f(float x, float y) : s(x, y, 0.0f, 1.0f) {}
	Vec2f(const Float4& v) : s(v) {}
	Vec2f(const std::array<float, 2>& v) : s(v[0], v[1], 0.0f, 1.0f) {}
	Vec2f(const Vector2D& vec);
	Vec2f(const vector2d& vec);

	float GetX() const { return s.GetX(); }
	float GetY() const { return s.GetY(); }

	void SetX(float o) { s.SetX(o); }
	void SetY(float o) { s.SetY(o); }

	Vec2f& operator+=(const Vec2f& o) { s += o.s; return *this; }
	Vec2f& operator-=(const Vec2f& o) { s -= o.s; return *this; }
	Vec2f& operator*=(const Vec2f& o) { s *= o.s; return *this; }
	Vec2f& operator*=(float o) { s *= o; return *this; }
	Vec2f& operator/=(const Vec2f& o) { s /= o.s; return *this; }
	Vec2f& operator/=(float o) { s /= o; return *this; }

	float LengthSq() const;
	float Length() const;
	bool IsZero(float range = DefaultEpsilon) const;
	Vec2f Normalize() const;

	static Vec2f Load(const void* mem);
	static void Store(void* mem, const Vec2f& i);

	static Vec2f Sqrt(const Vec2f& i);
	static Vec2f Rsqrt(const Vec2f& i);
	static Vec2f Abs(const Vec2f& i);
	static Vec2f Min(const Vec2f& lhs, const Vec2f& rhs);
	static Vec2f Max(const Vec2f& lhs, const Vec2f& rhs);
	static bool Equal(const Vec2f& lhs, const Vec2f& rhs, float epsilon);
};

inline Vec2f operator+(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{lhs.s + rhs.s};
}
inline Vec2f operator-(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{lhs.s - rhs.s};
}

inline Vec2f operator*(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{lhs.s * rhs.s};
}

inline Vec2f operator*(const Vec2f& lhs, float rhs)
{
	return Vec2f{lhs.s * rhs};
}

inline Vec2f operator/(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{lhs.s / rhs.s};
}

inline Vec2f operator/(const Vec2f& lhs, float rhs)
{
	return Vec2f{lhs.s / rhs};
}

inline bool operator==(const Vec2f& lhs, const Vec2f& rhs)
{
	return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x03) == 0x3;
}

inline bool operator!=(const Vec2f& lhs, const Vec2f& rhs)
{
	return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x03) != 0x3;
}
inline Vec2f Vec2f::Load(const void* mem)
{
	return Float4::Load2(mem);
}

inline void Vec2f::Store(void* mem, const Vec2f& i)
{
	Float4::Store2(mem, i.s);
}

inline Vec2f Vec2f::Sqrt(const Vec2f& i)
{
	return Vec2f{Float4::Sqrt(i.s)};
}

inline Vec2f Vec2f::Rsqrt(const Vec2f& i)
{
	return Vec2f{Float4::Rsqrt(i.s)};
}

inline Vec2f Vec2f::Abs(const Vec2f& i)
{
	return Vec2f{Float4::Abs(i.s)};
}

inline Vec2f Vec2f::Min(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{Float4::Min(lhs.s, rhs.s)};
}

inline Vec2f Vec2f::Max(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{Float4::Max(lhs.s, rhs.s)};
}

inline bool Vec2f::Equal(const Vec2f& lhs, const Vec2f& rhs, float epsilon)
{
	return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0x3) == 0x3;
}
inline float Vec2f::LengthSq() const
{
	Float4 o = s * s;
	return o.GetX() + o.GetY();
}

inline float Vec2f::Length() const
{
	return Effekseer::SIMD::Sqrt(LengthSq());
}

inline bool Vec2f::IsZero(float range) const
{
	return LengthSq() < range * range;
}

inline Vec2f Vec2f::Normalize() const
{
	return *this * Effekseer::SIMD::Rsqrt(LengthSq());
}

#endif // __EFFEKSEER_SIMD_VEC2F_H__
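// Usage sketch (illustrative, not part of the original header):
//   Vec2f v(3.0f, 4.0f);
//   float len = v.Length();  // 5.0f
//   Vec2f n = v.Normalize(); // approximately {0.6f, 0.8f}; Rsqrt may be approximate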
#ifndef __EFFEKSEER_SIMD_VEC3F_H__
#define __EFFEKSEER_SIMD_VEC3F_H__

#include <functional>

struct Mat43f;
struct Mat44f;

struct Vec3f
{
	Float4 s;

	explicit Vec3f() = default;
	Vec3f(const Vec3f& vec) = default;
	Vec3f(float x, float y, float z) : s(x, y, z, 1.0f) {}
	Vec3f(const Float4& vec) : s(vec) {}
	Vec3f(const Vector3D& vec);
	Vec3f(const vector3d& vec);
	Vec3f(const std::array<float, 3>& vec);

	float GetX() const { return s.GetX(); }
	float GetY() const { return s.GetY(); }
	float GetZ() const { return s.GetZ(); }

	void SetX(float o) { s.SetX(o); }
	void SetY(float o) { s.SetY(o); }
	void SetZ(float o) { s.SetZ(o); }

	Vec3f& operator+=(const Vec3f& o) { s += o.s; return *this; }
	Vec3f& operator-=(const Vec3f& o) { s -= o.s; return *this; }
	Vec3f& operator*=(const Vec3f& o) { s *= o.s; return *this; }
	Vec3f& operator*=(float o) { s *= o; return *this; }
	Vec3f& operator/=(const Vec3f& o) { s /= o.s; return *this; }
	Vec3f& operator/=(float o) { s /= o; return *this; }

	float GetSquaredLength() const;
	float GetLength() const;
	bool IsZero(float epsilon = DefaultEpsilon) const;
	Vec3f Normalize() const;
	Vec3f NormalizePrecisely() const;
	Vec3f NormalizeFast() const;

	static Vec3f Load(const void* mem);
	static void Store(void* mem, const Vec3f& i);

	static Vec3f Sqrt(const Vec3f& i);
	static Vec3f Rsqrt(const Vec3f& i);
	static Vec3f Abs(const Vec3f& i);
	static Vec3f Min(const Vec3f& lhs, const Vec3f& rhs);
	static Vec3f Max(const Vec3f& lhs, const Vec3f& rhs);
	static float Dot(const Vec3f& lhs, const Vec3f& rhs);
	static Vec3f Cross(const Vec3f& lhs, const Vec3f& rhs);
	static bool Equal(const Vec3f& lhs, const Vec3f& rhs, float epsilon = DefaultEpsilon);
	static Vec3f Transform(const Vec3f& lhs, const Mat43f& rhs);
	static Vec3f Transform(const Vec3f& lhs, const Mat44f& rhs);
};

inline Vec3f operator-(const Vec3f& i)
{
	return Vec3f(-i.GetX(), -i.GetY(), -i.GetZ());
}

inline Vec3f operator+(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{lhs.s + rhs.s};
}

inline Vec3f operator-(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{lhs.s - rhs.s};
}

inline Vec3f operator*(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{lhs.s * rhs.s};
}

inline Vec3f operator*(const Vec3f& lhs, float rhs)
{
	return Vec3f{lhs.s * rhs};
}

inline Vec3f operator/(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{lhs.s / rhs.s};
}

inline Vec3f operator/(const Vec3f& lhs, float rhs)
{
	return Vec3f{lhs.s / rhs};
}

inline bool operator==(const Vec3f& lhs, const Vec3f& rhs)
{
	return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x07) == 0x7;
}

inline bool operator!=(const Vec3f& lhs, const Vec3f& rhs)
{
	return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x07) != 0x7;
}
inline Vec3f Vec3f::Load(const void* mem)
{
	return Float4::Load3(mem);
}

inline void Vec3f::Store(void* mem, const Vec3f& i)
{
	Float4::Store3(mem, i.s);
}

inline Vec3f Vec3f::Sqrt(const Vec3f& i)
{
	return Vec3f{Float4::Sqrt(i.s)};
}

inline Vec3f Vec3f::Rsqrt(const Vec3f& i)
{
	return Vec3f{Float4::Rsqrt(i.s)};
}

inline Vec3f Vec3f::Abs(const Vec3f& i)
{
	return Vec3f{Float4::Abs(i.s)};
}

inline Vec3f Vec3f::Min(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{Float4::Min(lhs.s, rhs.s)};
}

inline Vec3f Vec3f::Max(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{Float4::Max(lhs.s, rhs.s)};
}

inline float Vec3f::Dot(const Vec3f& lhs, const Vec3f& rhs)
{
	return Float4::Dot3(lhs.s, rhs.s).GetX();
}

inline Vec3f Vec3f::Cross(const Vec3f& lhs, const Vec3f& rhs)
{
	return Float4::Cross3(lhs.s, rhs.s);
}

inline bool Vec3f::Equal(const Vec3f& lhs, const Vec3f& rhs, float epsilon)
{
	return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0x7) == 0x7;
}
inline float Vec3f::GetSquaredLength() const
{
	Float4 o = s * s;
	return o.GetX() + o.GetY() + o.GetZ();
}

inline float Vec3f::GetLength() const
{
	return Effekseer::SIMD::Sqrt(GetSquaredLength());
}

inline bool Vec3f::IsZero(float epsilon) const
{
	return (Float4::MoveMask(Float4::IsZero(s, epsilon)) & 0x7) == 0x7;
}

inline Vec3f Vec3f::Normalize() const
{
	return *this * Effekseer::SIMD::Rsqrt(GetSquaredLength());
}

inline Vec3f Vec3f::NormalizePrecisely() const
{
	return *this / Effekseer::SIMD::Sqrt(GetSquaredLength());
}

inline Vec3f Vec3f::NormalizeFast() const
{
	return *this * Effekseer::SIMD::Rsqrt(GetSquaredLength());
}

namespace std
{

template <>
struct hash<Effekseer::SIMD::Vec3f>
{
	size_t operator()(const Effekseer::SIMD::Vec3f& _Keyval) const noexcept
	{
		return std::hash<float>()(_Keyval.GetX()) + std::hash<float>()(_Keyval.GetY()) + std::hash<float>()(_Keyval.GetZ());
	}
};

} // namespace std

#endif // __EFFEKSEER_SIMD_VEC3F_H__
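// Usage sketch (illustrative, not part of the original header):
//   Vec3f a(1.0f, 0.0f, 0.0f);
//   Vec3f b(0.0f, 1.0f, 0.0f);
//   float d = Vec3f::Dot(a, b);   // 0.0f
//   Vec3f c = Vec3f::Cross(a, b); // {0, 0, 1}
//   bool z = a.IsZero();          // false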
#ifndef __EFFEKSEER_SIMD_VEC4F_H__
#define __EFFEKSEER_SIMD_VEC4F_H__

struct Mat43f;
struct Mat44f;

struct Vec4f
{
	Float4 s;

	Vec4f() = default;
	Vec4f(const Vec4f& vec) = default;
	Vec4f(const Float4& vec) : s(vec) {}

	float GetX() const { return s.GetX(); }
	float GetY() const { return s.GetY(); }
	float GetZ() const { return s.GetZ(); }
	float GetW() const { return s.GetW(); }

	void SetX(float o) { s.SetX(o); }
	void SetY(float o) { s.SetY(o); }
	void SetZ(float o) { s.SetZ(o); }
	void SetW(float o) { s.SetW(o); }

	Vec4f& operator+=(const Vec4f& o)
	{
		this->s = this->s + o.s;
		return *this;
	}

	Vec4f& operator-=(const Vec4f& o)
	{
		this->s = this->s - o.s;
		return *this;
	}

	Vec4f& operator*=(const Vec4f& o)
	{
		this->s = this->s * o.s;
		return *this;
	}

	Vec4f& operator/=(const Vec4f& o)
	{
		this->s = this->s / o.s;
		return *this;
	}

	static Vec4f Sqrt(const Vec4f& i);
	static Vec4f Rsqrt(const Vec4f& i);
	static Vec4f Abs(const Vec4f& i);
	static Vec4f Min(const Vec4f& lhs, const Vec4f& rhs);
	static Vec4f Max(const Vec4f& lhs, const Vec4f& rhs);
	static bool Equal(const Vec4f& lhs, const Vec4f& rhs, float epsilon);
	static Vec4f Transform(const Vec4f& lhs, const Mat43f& rhs);
	static Vec4f Transform(const Vec4f& lhs, const Mat44f& rhs);
};

inline Vec4f operator-(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s - rhs.s}; }

inline Vec4f operator*(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s * rhs.s}; }

inline Vec4f operator/(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s / rhs.s}; }

inline bool operator==(const Vec4f& lhs, const Vec4f& rhs)
{
	return Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) == 0xf;
}

inline bool operator!=(const Vec4f& lhs, const Vec4f& rhs)
{
	return Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) != 0xf;
}
inline Vec4f Vec4f::Sqrt(const Vec4f& i)
{
	return Vec4f{Float4::Sqrt(i.s)};
}

inline Vec4f Vec4f::Rsqrt(const Vec4f& i)
{
	return Vec4f{Float4::Rsqrt(i.s)};
}

inline Vec4f Vec4f::Abs(const Vec4f& i)
{
	return Vec4f{Float4::Abs(i.s)};
}

inline Vec4f Vec4f::Min(const Vec4f& lhs, const Vec4f& rhs)
{
	return Vec4f{Float4::Min(lhs.s, rhs.s)};
}

inline Vec4f Vec4f::Max(const Vec4f& lhs, const Vec4f& rhs)
{
	return Vec4f{Float4::Max(lhs.s, rhs.s)};
}

inline bool Vec4f::Equal(const Vec4f& lhs, const Vec4f& rhs, float epsilon)
{
	return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0xf) == 0xf;
}

#endif // __EFFEKSEER_SIMD_VEC4F_H__
#ifndef __EFFEKSEER_SIMD_MAT43F_H__
#define __EFFEKSEER_SIMD_MAT43F_H__

struct Mat43f
{
	Float4 X;
	Float4 Y;
	Float4 Z;

	Mat43f() = default;
	Mat43f(const Mat43f& rhs) = default;
	Mat43f(float m11, float m12, float m13,
		   float m21, float m22, float m23,
		   float m31, float m32, float m33,
		   float m41, float m42, float m43);
	Mat43f(const Matrix43& mat);

	bool IsValid() const;

	Mat43f Get3x3SubMatrix() const;

	Vec3f GetScale() const;

	Mat43f GetRotation() const;

	Vec3f GetTranslation() const;

	void SetTranslation(const Vec3f& t);

	Mat43f& operator*=(const Mat43f& rhs);
	Mat43f& operator*=(float rhs);

	static const Mat43f Identity;

	static bool Equal(const Mat43f& lhs, const Mat43f& rhs, float epsilon = DefaultEpsilon);

	static Mat43f Scaling(float x, float y, float z);

	static Mat43f RotationX(float angle);

	static Mat43f RotationY(float angle);

	static Mat43f RotationZ(float angle);

	static Mat43f RotationXYZ(float rx, float ry, float rz);

	static Mat43f RotationZXY(float rz, float rx, float ry);

	static Mat43f RotationAxis(const Vec3f& axis, float angle);

	static Mat43f RotationAxis(const Vec3f& axis, float s, float c);

	static Mat43f Translation(float x, float y, float z);
};
inline Mat43f::Mat43f(
	float m11, float m12, float m13,
	float m21, float m22, float m23,
	float m31, float m32, float m33,
	float m41, float m42, float m43)
	: X(m11, m21, m31, m41)
	, Y(m12, m22, m32, m42)
	, Z(m13, m23, m33, m43)
{
}
inline bool operator==(const Mat43f& lhs, const Mat43f& rhs)
{
	return lhs.X == rhs.X && lhs.Y == rhs.Y && lhs.Z == rhs.Z;
}

inline bool operator!=(const Mat43f& lhs, const Mat43f& rhs)
{
	return lhs.X != rhs.X || lhs.Y != rhs.Y || lhs.Z != rhs.Z;
}
inline Mat43f operator*(const Mat43f& lhs, const Mat43f& rhs)
{
	const Float4 mask = Float4::SetUInt(0, 0, 0, 0xffffffff);

	Mat43f res;
	res.X = mask & rhs.X;
	res.X = Float4::MulAddLane<0>(res.X, lhs.X, rhs.X);
	res.X = Float4::MulAddLane<1>(res.X, lhs.Y, rhs.X);
	res.X = Float4::MulAddLane<2>(res.X, lhs.Z, rhs.X);

	res.Y = mask & rhs.Y;
	res.Y = Float4::MulAddLane<0>(res.Y, lhs.X, rhs.Y);
	res.Y = Float4::MulAddLane<1>(res.Y, lhs.Y, rhs.Y);
	res.Y = Float4::MulAddLane<2>(res.Y, lhs.Z, rhs.Y);

	res.Z = mask & rhs.Z;
	res.Z = Float4::MulAddLane<0>(res.Z, lhs.X, rhs.Z);
	res.Z = Float4::MulAddLane<1>(res.Z, lhs.Y, rhs.Z);
	res.Z = Float4::MulAddLane<2>(res.Z, lhs.Z, rhs.Z);

	return res;
}
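// Illustrative note (not in the original source): X, Y and Z each pack one
// component of all four rows (three basis rows plus the translation row),
// so the affine product is accumulated column by column with MulAddLane;
// the mask seeds each result column with the translation lane of rhs,
// which plays the role of the implicit (0, 0, 0, 1) fourth column.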
inline Vec3f Vec3f::Transform(const Vec3f& lhs, const Mat43f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = Float4::SetZero();
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulAddLane<0>(s3, s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	return Vec3f{res};
}
inline Vec4f Vec4f::Transform(const Vec4f& lhs, const Mat43f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = Float4(0.0f, 0.0f, 0.0f, 1.0f);
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulLane<0>(s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	res = Float4::MulAddLane<3>(res, s3, lhs.s);
	return Vec4f{res};
}
inline Mat43f& Mat43f::operator*=(const Mat43f& rhs)
{
	*this = *this * rhs;
	return *this;
}

inline Mat43f& Mat43f::operator*=(float rhs)
{
	X *= rhs;
	Y *= rhs;
	Z *= rhs;
	return *this;
}

#endif // __EFFEKSEER_SIMD_MAT43F_H__
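// Usage sketch (illustrative, not part of the original header): composing
// transforms and applying them to a point.
//   Mat43f m = Mat43f::RotationZ(1.5708f) * Mat43f::Translation(1.0f, 0.0f, 0.0f);
//   Vec3f p = Vec3f::Transform(Vec3f(1.0f, 2.0f, 3.0f), m);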
#ifndef __EFFEKSEER_SIMD_MAT44F_H__
#define __EFFEKSEER_SIMD_MAT44F_H__

struct Mat44f
{
	Float4 X;
	Float4 Y;
	Float4 Z;
	Float4 W;

	Mat44f() = default;
	Mat44f(const Mat44f& rhs) = default;
	Mat44f(float m11, float m12, float m13, float m14,
		   float m21, float m22, float m23, float m24,
		   float m31, float m32, float m33, float m34,
		   float m41, float m42, float m43, float m44);
	Mat44f(const Mat43f& mat);
	Mat44f(const Matrix44& mat);

	bool IsValid() const;

	Vec3f GetScale() const;

	Mat44f GetRotation() const;

	Vec3f GetTranslation() const;

	void SetTranslation(const Vec3f& t);

	Mat44f Transpose() const;

	Mat44f& operator*=(const Mat44f& rhs);
	Mat44f& operator*=(float rhs);

	static const Mat44f Identity;

	static bool Equal(const Mat44f& lhs, const Mat44f& rhs, float epsilon = DefaultEpsilon);

	static Mat44f Scaling(float x, float y, float z);

	static Mat44f RotationX(float angle);

	static Mat44f RotationY(float angle);

	static Mat44f RotationZ(float angle);

	static Mat44f RotationXYZ(float rx, float ry, float rz);

	static Mat44f RotationZXY(float rz, float rx, float ry);

	static Mat44f RotationAxis(const Vec3f& axis, float angle);

	static Mat44f RotationAxis(const Vec3f& axis, float s, float c);

	static Mat44f Translation(float x, float y, float z);
};
inline Mat44f::Mat44f(
	float m11, float m12, float m13, float m14,
	float m21, float m22, float m23, float m24,
	float m31, float m32, float m33, float m34,
	float m41, float m42, float m43, float m44)
	: X(m11, m21, m31, m41)
	, Y(m12, m22, m32, m42)
	, Z(m13, m23, m33, m43)
	, W(m14, m24, m34, m44)
{
}

inline Mat44f::Mat44f(const Mat43f& mat)
	: X(mat.X)
	, Y(mat.Y)
	, Z(mat.Z)
	, W(0.0f, 0.0f, 0.0f, 1.0f)
{
}
inline bool operator==(const Mat44f& lhs, const Mat44f& rhs)
{
	return lhs.X == rhs.X && lhs.Y == rhs.Y && lhs.Z == rhs.Z && lhs.W == rhs.W;
}

inline bool operator!=(const Mat44f& lhs, const Mat44f& rhs)
{
	return lhs.X != rhs.X || lhs.Y != rhs.Y || lhs.Z != rhs.Z || lhs.W != rhs.W;
}
inline Mat44f operator*(const Mat44f& lhs, const Mat44f& rhs)
{
	Mat44f res;
	res.X = Float4::MulLane<0>(lhs.X, rhs.X);
	res.X = Float4::MulAddLane<1>(res.X, lhs.Y, rhs.X);
	res.X = Float4::MulAddLane<2>(res.X, lhs.Z, rhs.X);
	res.X = Float4::MulAddLane<3>(res.X, lhs.W, rhs.X);

	res.Y = Float4::MulLane<0>(lhs.X, rhs.Y);
	res.Y = Float4::MulAddLane<1>(res.Y, lhs.Y, rhs.Y);
	res.Y = Float4::MulAddLane<2>(res.Y, lhs.Z, rhs.Y);
	res.Y = Float4::MulAddLane<3>(res.Y, lhs.W, rhs.Y);

	res.Z = Float4::MulLane<0>(lhs.X, rhs.Z);
	res.Z = Float4::MulAddLane<1>(res.Z, lhs.Y, rhs.Z);
	res.Z = Float4::MulAddLane<2>(res.Z, lhs.Z, rhs.Z);
	res.Z = Float4::MulAddLane<3>(res.Z, lhs.W, rhs.Z);

	res.W = Float4::MulLane<0>(lhs.X, rhs.W);
	res.W = Float4::MulAddLane<1>(res.W, lhs.Y, rhs.W);
	res.W = Float4::MulAddLane<2>(res.W, lhs.Z, rhs.W);
	res.W = Float4::MulAddLane<3>(res.W, lhs.W, rhs.W);

	return res;
}
inline Vec3f Vec3f::Transform(const Vec3f& lhs, const Mat44f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = rhs.W;
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulAddLane<0>(s3, s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	return Vec3f{res};
}
inline Vec4f Vec4f::Transform(const Vec4f& lhs, const Mat44f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = rhs.W;
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulLane<0>(s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	res = Float4::MulAddLane<3>(res, s3, lhs.s);
	return Vec4f{res};
}
inline Mat44f& Mat44f::operator*=(const Mat44f& rhs)
{
	*this = *this * rhs;
	return *this;
}

inline Mat44f& Mat44f::operator*=(float rhs)
{
	X *= rhs;
	Y *= rhs;
	Z *= rhs;
	W *= rhs;
	return *this;
}

#endif // __EFFEKSEER_SIMD_MAT44F_H__
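// Illustrative note (not in the original source): both Transform overloads
// transpose the column registers into rows on every call. When many vectors
// are transformed by the same matrix, hoisting that transpose out of the
// loop is the usual optimization.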
#ifndef __EFFEKSEER_SIMD_QUATERNIONF_H__
#define __EFFEKSEER_SIMD_QUATERNIONF_H__

struct Quaternionf
{
	Float4 s;

	Quaternionf() = default;
	Quaternionf(float x, float y, float z, float w) : s(x, y, z, w) {}
	Quaternionf(const Float4& vec) : s(vec) {}

	float GetX() const { return s.GetX(); }
	float GetY() const { return s.GetY(); }
	float GetZ() const { return s.GetZ(); }
	float GetW() const { return s.GetW(); }

	Quaternionf Inverse() const
	{
		return Quaternionf{-GetX(), -GetY(), -GetZ(), GetW()};
	}
	static Quaternionf FromMatrix(const Mat43f& mat)
	{
		const auto tr = mat.X.GetX() + mat.Y.GetY() + mat.Z.GetZ();

		if (tr > 0)
		{
			const auto qw = sqrtf(tr + 1.0f) / 2.0f;
			const auto qx = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qw);
			const auto qy = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qw);
			const auto qz = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qw);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.X.GetX() > mat.Y.GetY() && mat.X.GetX() > mat.Z.GetZ())
		{
			const auto qx = sqrtf(mat.X.GetX() - mat.Y.GetY() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qx);
			const auto qy = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qx);
			const auto qz = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qx);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.Y.GetY() > mat.Z.GetZ())
		{
			const auto qy = sqrtf(mat.Y.GetY() - mat.X.GetX() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qy);
			const auto qx = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qy);
			const auto qz = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qy);
			return Quaternionf{qx, qy, qz, qw};
		}
		else
		{
			const auto qz = sqrtf(mat.Z.GetZ() - mat.X.GetX() - mat.Y.GetY() + 1.0f) / 2.0f;
			const auto qw = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qz);
			const auto qx = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qz);
			const auto qy = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qz);
			return Quaternionf{qx, qy, qz, qw};
		}
	}
	static Quaternionf FromMatrix(const Mat44f& mat)
	{
		const auto tr = mat.X.GetX() + mat.Y.GetY() + mat.Z.GetZ();

		if (tr > 0)
		{
			const auto qw = sqrtf(tr + 1.0f) / 2.0f;
			const auto qx = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qw);
			const auto qy = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qw);
			const auto qz = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qw);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.X.GetX() > mat.Y.GetY() && mat.X.GetX() > mat.Z.GetZ())
		{
			const auto qx = sqrtf(mat.X.GetX() - mat.Y.GetY() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qx);
			const auto qy = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qx);
			const auto qz = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qx);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.Y.GetY() > mat.Z.GetZ())
		{
			const auto qy = sqrtf(mat.Y.GetY() - mat.X.GetX() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qy);
			const auto qx = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qy);
			const auto qz = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qy);
			return Quaternionf{qx, qy, qz, qw};
		}
		else
		{
			const auto qz = sqrtf(mat.Z.GetZ() - mat.X.GetX() - mat.Y.GetY() + 1.0f) / 2.0f;
			const auto qw = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qz);
			const auto qx = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qz);
			const auto qy = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qz);
			return Quaternionf{qx, qy, qz, qw};
		}
	}
	Mat43f ToMatrix() const
	{
		const auto qx = GetX();
		const auto qy = GetY();
		const auto qz = GetZ();
		const auto qw = GetW();

		const auto qxx = qx * qx;
		const auto qyy = qy * qy;
		const auto qzz = qz * qz;
		const auto qww = qw * qw;

		const auto qxy = qx * qy;
		const auto qxz = qx * qz;
		const auto qyz = qy * qz;

		const auto qxw = qx * qw;
		const auto qyw = qy * qw;
		const auto qzw = qz * qw;

		SIMD::Mat43f ret;
		ret.X = SIMD::Float4{(qxx - qyy - qzz + qww), 2.0f * (qxy - qzw), 2.0f * (qxz + qyw), 0};
		ret.Y = SIMD::Float4{2.0f * (qxy + qzw), (-qxx + qyy - qzz + qww), 2.0f * (qyz - qxw), 0};
		ret.Z = SIMD::Float4{2.0f * (qxz - qyw), 2.0f * (qyz + qxw), (-qxx - qyy + qzz + qww), 0};
		return ret;
	}
	static Quaternionf Slerp(const Quaternionf& q1, const Quaternionf& q2, float t)
	{
		const auto qq = q1.s * q2.s;
		auto cosa = qq.GetX() + qq.GetY() + qq.GetZ() + qq.GetW();

		if (cosa < 0.0f)
		{
			return Slerp(q1, Quaternionf{-q2.GetX(), -q2.GetY(), -q2.GetZ(), -q2.GetW()}, t);
		}

		cosa = Min(1.0f, cosa);

		const auto alpha = acos(cosa);
		const auto smallValue = 0.00001f;
		if (alpha < smallValue)
		{
			return q2;
		}

		return Quaternionf{q1.s * sin((1.0f - t) * alpha) / sin(alpha) + q2.s * sin(t * alpha) / sin(alpha)};
	}
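	// Illustrative note (not in the original source): the 4D dot product of
	// two unit quaternions is the cosine of half the angle between the
	// rotations; flipping one operand's sign when it is negative makes
	// Slerp follow the shorter arc.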
	static Vec3f Transform(const Vec3f& v, const Quaternionf& q)
	{
		const auto qx = q.GetX();
		const auto qy = q.GetY();
		const auto qz = q.GetZ();
		const auto qw = q.GetW();

		const auto qxx = qx * qx;
		const auto qyy = qy * qy;
		const auto qzz = qz * qz;
		const auto qww = qw * qw;

		const auto qxy = qx * qy;
		const auto qxz = qx * qz;
		const auto qyz = qy * qz;

		const auto qxw = qx * qw;
		const auto qyw = qy * qw;
		const auto qzw = qz * qw;

		const auto x = (qxx - qyy - qzz + qww) * v.GetX() + 2.0f * (qxy - qzw) * v.GetY() + 2.0f * (qxz + qyw) * v.GetZ();
		const auto y = 2.0f * (qxy + qzw) * v.GetX() + (-qxx + qyy - qzz + qww) * v.GetY() + 2.0f * (qyz - qxw) * v.GetZ();
		const auto z = 2.0f * (qxz - qyw) * v.GetX() + 2.0f * (qyz + qxw) * v.GetY() + (-qxx - qyy + qzz + qww) * v.GetZ();

		return Vec3f{x, y, z};
	}
};
inline Quaternionf operator*(const Quaternionf& lhs, const Quaternionf& rhs)
{
	auto x = lhs.GetW() * rhs.GetX() - lhs.GetZ() * rhs.GetY() + lhs.GetY() * rhs.GetZ() + lhs.GetX() * rhs.GetW();
	auto y = lhs.GetZ() * rhs.GetX() + lhs.GetW() * rhs.GetY() - lhs.GetX() * rhs.GetZ() + lhs.GetY() * rhs.GetW();
	auto z = -lhs.GetY() * rhs.GetX() + lhs.GetX() * rhs.GetY() + lhs.GetW() * rhs.GetZ() + lhs.GetZ() * rhs.GetW();
	auto w = -lhs.GetX() * rhs.GetX() - lhs.GetY() * rhs.GetY() - lhs.GetZ() * rhs.GetZ() + lhs.GetW() * rhs.GetW();
	return Quaternionf{x, y, z, w};
}

#endif // __EFFEKSEER_SIMD_QUATERNIONF_H__
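// Illustrative note (not in the original source): this is the Hamilton
// product; w accumulates w1*w2 - dot(v1, v2) and the vector part is
// w1*v2 + w2*v1 + cross(v1, v2).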
#ifndef __EFFEKSEER_SIMD_UTILS_H__
#define __EFFEKSEER_SIMD_UTILS_H__

#include <stdlib.h>

template <size_t align>
struct AlignedAllocationPolicy
{
	static void* operator new(size_t size)
	{
#if defined(__EMSCRIPTEN__) && __EMSCRIPTEN_minor__ < 38
		return malloc(size);
#elif defined(_WIN32)
		return _mm_malloc(size, align);
#else
		void* ptr = nullptr;
		posix_memalign(&ptr, align, size);
		return ptr;
#endif
	}
	static void operator delete(void* ptr)
	{
#if defined(__EMSCRIPTEN__) && __EMSCRIPTEN_minor__ < 38
		free(ptr);
#elif defined(_WIN32)
		_mm_free(ptr);
#else
		free(ptr);
#endif
	}
};
inline Vector2D ToStruct(const Vec2f& o)
{
	Vector2D ret;
	Vec2f::Store(&ret, o);
	return ret;
}

inline Vector3D ToStruct(const Vec3f& o)
{
	Vector3D ret;
	Vec3f::Store(&ret, o);
	return ret;
}

inline Matrix43 ToStruct(const Mat43f& o)
{
	Float4 tx = o.X;
	Float4 ty = o.Y;
	Float4 tz = o.Z;
	Float4 tw = Float4::SetZero();
	Float4::Transpose(tx, ty, tz, tw);

	Matrix43 ret;
	Float4::Store3(ret.Value[0], tx);
	Float4::Store3(ret.Value[1], ty);
	Float4::Store3(ret.Value[2], tz);
	Float4::Store3(ret.Value[3], tw);
	return ret;
}

inline Matrix44 ToStruct(const Mat44f& o)
{
	Float4 tx = o.X;
	Float4 ty = o.Y;
	Float4 tz = o.Z;
	Float4 tw = o.W;
	Float4::Transpose(tx, ty, tz, tw);

	Matrix44 ret;
	Float4::Store4(ret.Values[0], tx);
	Float4::Store4(ret.Values[1], ty);
	Float4::Store4(ret.Values[2], tz);
	Float4::Store4(ret.Values[3], tw);
	return ret;
}

#endif // __EFFEKSEER_SIMD_UTILS_H__
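// Usage sketch (illustrative; AlignedAllocationPolicy is the name used in
// the reconstruction above): deriving from the policy routes new/delete
// through aligned allocation, which types holding alignas(16) SIMD members
// need when heap-allocated.
//   struct Particle : AlignedAllocationPolicy<16>
//   {
//       Effekseer::SIMD::Vec3f Position;
//   };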