// Effekseer
// Effekseer.SIMD.h
1 #pragma once
2 #include "Effekseer.h"
3 
4 
5 #ifndef __EFFEKSEER_SIMD_BASE_H__
6 #define __EFFEKSEER_SIMD_BASE_H__
7 
8 #include <cstdint>
9 #include <cmath>
10 
11 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
12 // ARMv7/ARM64 NEON
13 
14 #define EFK_SIMD_NEON
15 
16 #if defined(_M_ARM64) || defined(__aarch64__)
17 #define EFK_SIMD_NEON_ARM64
18 #endif
19 
20 #include <arm_neon.h>
21 
22 #elif (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__SSE2__)
23 // x86/x86-64 SSE2/AVX2
24 
25 #define EFK_SIMD_SSE2
26 
27 #if defined(__AVX2__)
28 #define EFK_SIMD_AVX2
29 #endif
30 #if defined(__AVX__) || defined(EFK_SIMD_AVX2)
31 #define EFK_SIMD_AVX
32 #endif
33 #if defined(__SSE4_2__) || defined(EFK_SIMD_AVX)
34 #define EFK_SIMD_SSE4_2
35 #endif
36 #if defined(__SSE4_1__) || defined(EFK_SIMD_SSE4_2)
37 #define EFK_SIMD_SSE4_1
38 #endif
39 #if defined(__SSSE3__) || defined(EFK_SIMD_SSE4_1)
40 #define EFK_SIMD_SSSE3
41 #endif
42 #if defined(__SSE3__) || defined(EFK_SIMD_SSSE3)
43 #define EFK_SIMD_SSE3
44 #endif
45 
46 #if defined(EFK_SIMD_AVX) || defined(EFK_SIMD_AVX2)
47 #include <immintrin.h>
48 #elif defined(EFK_SIMD_SSE4_2)
49 #include <nmmintrin.h>
50 #elif defined(EFK_SIMD_SSE4_1)
51 #include <smmintrin.h>
52 #elif defined(EFK_SIMD_SSSE3)
53 #include <tmmintrin.h>
54 #elif defined(EFK_SIMD_SSE3)
55 #include <pmmintrin.h>
56 #elif defined(EFK_SIMD_SSE2)
57 #include <emmintrin.h>
58 #endif
59 
60 #else
61 // C++ Generic Implementation (Pseudo SIMD)
62 
63 #define EFK_SIMD_GEN
64 
65 #endif
66 
// Shared absolute tolerance used by Float4::NearEqual / Float4::IsZero.
// constexpr so it can participate in constant expressions; internal linkage
// per translation unit, same as the previous `const float`.
constexpr float DefaultEpsilon = 1e-6f;
68 
69 #endif // __EFFEKSEER_SIMD_BASE_H__
70 
71 #ifndef __EFFEKSEER_SIMD_FLOAT4_GEN_H__
72 #define __EFFEKSEER_SIMD_FLOAT4_GEN_H__
73 
74 
75 #if defined(EFK_SIMD_GEN)
76 
77 #include <cstring>
78 #include <algorithm>
79 
80 namespace Effekseer
81 {
82 
83 namespace SIMD
84 {
85 
// Scalar square root (generic fallback path).
inline float Sqrt(float value)
{
	return std::sqrt(value);
}
// Scalar reciprocal square root (generic fallback path).
inline float Rsqrt(float value)
{
	return 1.0f / std::sqrt(value);
}
94 
95 struct Int4;
96 
// Generic (scalar) 4-lane float vector, used when no SIMD instruction set is
// detected. The union exposes the same 16 bytes as float, signed int and
// unsigned int lanes; the integer views are used by the mask/compare helpers
// to manipulate raw lane bit patterns.
struct alignas(16) Float4
{
	union {
		float vf[4];
		int32_t vi[4];
		uint32_t vu[4];
	};

	Float4() = default;
	Float4(const Float4& rhs) = default;
	Float4(float x, float y, float z, float w) { vf[0] = x; vf[1] = y; vf[2] = z; vf[3] = w; }
	// Broadcast constructor: replicates one value into all four lanes.
	Float4(float i) { vf[0] = i; vf[1] = i; vf[2] = i; vf[3] = i; }

	float GetX() const { return vf[0]; }
	float GetY() const { return vf[1]; }
	float GetZ() const { return vf[2]; }
	float GetW() const { return vf[3]; }

	void SetX(float o) { vf[0] = o; }
	void SetY(float o) { vf[1] = o; }
	void SetZ(float o) { vf[2] = o; }
	void SetW(float o) { vf[3] = o; }

	// Broadcasts lane LANE into all four lanes.
	template <size_t LANE>
	Float4 Dup() { return Float4(vf[LANE], vf[LANE], vf[LANE], vf[LANE]); }

	// Defined after Int4 is declared (conversion vs. raw bit reinterpretation).
	Int4 Convert4i() const;
	Int4 Cast4i() const;

	// Lane-wise compound assignment operators.
	Float4& operator+=(const Float4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vf[i] += rhs.vf[i];
		}
		return *this;
	}

	Float4& operator-=(const Float4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vf[i] -= rhs.vf[i];
		}
		return *this;
	}

	Float4& operator*=(const Float4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vf[i] *= rhs.vf[i];
		}
		return *this;
	}

	Float4& operator*=(float rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vf[i] *= rhs;
		}
		return *this;
	}

	Float4& operator/=(const Float4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vf[i] /= rhs.vf[i];
		}
		return *this;
	}

	Float4& operator/=(float rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vf[i] /= rhs;
		}
		return *this;
	}

	// Partial loads/stores read or write exactly 2/3/4 floats at `mem`
	// (no alignment requirement on `mem`).
	static Float4 Load2(const void* mem);
	static void Store2(void* mem, const Float4& i);
	static Float4 Load3(const void* mem);
	static void Store3(void* mem, const Float4& i);
	static Float4 Load4(const void* mem);
	static void Store4(void* mem, const Float4& i);

	static Float4 SetZero();
	// SetInt/SetUInt store raw integer bit patterns into the lanes (no
	// int-to-float conversion).
	static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
	static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
	static Float4 Sqrt(const Float4& in);
	static Float4 Rsqrt(const Float4& in);
	static Float4 Abs(const Float4& in);
	static Float4 Min(const Float4& lhs, const Float4& rhs);
	static Float4 Max(const Float4& lhs, const Float4& rhs);
	static Float4 Floor(const Float4& in);
	static Float4 Ceil(const Float4& in);
	// MulAdd returns a + b * c; MulSub returns a - b * c (per lane).
	static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
	static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);

	// Lane-broadcast variants: the LANE-th element of the last operand is
	// broadcast before the multiply.
	template<size_t LANE>
	static Float4 MulLane(const Float4& lhs, const Float4& rhs);
	template<size_t LANE>
	static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
	template<size_t LANE>
	static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
	template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
	static Float4 Swizzle(const Float4& in);

	static Float4 Dot3(const Float4& lhs, const Float4& rhs);
	static Float4 Cross3(const Float4& lhs, const Float4& rhs);

	// Comparison helpers return all-ones (0xffffffff) or all-zero lane masks,
	// consumed by Select/MoveMask.
	template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
	static Float4 Mask();
	static uint32_t MoveMask(const Float4& in);
	static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
	static Float4 Equal(const Float4& lhs, const Float4& rhs);
	static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
	static Float4 LessThan(const Float4& lhs, const Float4& rhs);
	static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
	static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
	static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
	static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
	static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
	static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);
};
229 
230 inline Float4 operator+(const Float4& lhs, const Float4& rhs)
231 {
232  Float4 ret;
233  for (size_t i = 0; i < 4; i++)
234  {
235  ret.vf[i] = lhs.vf[i] + rhs.vf[i];
236  }
237  return ret;
238 }
239 
240 inline Float4 operator-(const Float4& lhs, const Float4& rhs)
241 {
242  Float4 ret;
243  for (size_t i = 0; i < 4; i++)
244  {
245  ret.vf[i] = lhs.vf[i] - rhs.vf[i];
246  }
247  return ret;
248 }
249 
250 inline Float4 operator*(const Float4& lhs, const Float4& rhs)
251 {
252  Float4 ret;
253  for (size_t i = 0; i < 4; i++)
254  {
255  ret.vf[i] = lhs.vf[i] * rhs.vf[i];
256  }
257  return ret;
258 }
259 
260 inline Float4 operator*(const Float4& lhs, float rhs)
261 {
262  Float4 ret;
263  for (size_t i = 0; i < 4; i++)
264  {
265  ret.vf[i] = lhs.vf[i] * rhs;
266  }
267  return ret;
268 }
269 
270 inline Float4 operator/(const Float4& lhs, const Float4& rhs)
271 {
272  Float4 ret;
273  for (size_t i = 0; i < 4; i++)
274  {
275  ret.vf[i] = lhs.vf[i] / rhs.vf[i];
276  }
277  return ret;
278 }
279 
280 inline Float4 operator/(const Float4& lhs, float rhs)
281 {
282  Float4 ret;
283  for (size_t i = 0; i < 4; i++)
284  {
285  ret.vf[i] = lhs.vf[i] / rhs;
286  }
287  return ret;
288 }
289 
290 inline Float4 operator&(const Float4& lhs, const Float4& rhs)
291 {
292  Float4 ret;
293  for (size_t i = 0; i < 4; i++)
294  {
295  ret.vu[i] = lhs.vu[i] & rhs.vu[i];
296  }
297  return ret;
298 }
299 
300 inline Float4 operator|(const Float4& lhs, const Float4& rhs)
301 {
302  Float4 ret;
303  for (size_t i = 0; i < 4; i++)
304  {
305  ret.vu[i] = lhs.vu[i] | rhs.vu[i];
306  }
307  return ret;
308 }
309 
310 inline Float4 operator^(const Float4& lhs, const Float4& rhs)
311 {
312  Float4 ret;
313  for (size_t i = 0; i < 4; i++)
314  {
315  ret.vu[i] = lhs.vu[i] ^ rhs.vu[i];
316  }
317  return ret;
318 }
319 
320 inline bool operator==(const Float4& lhs, const Float4& rhs)
321 {
322  bool ret = true;
323  for (size_t i = 0; i < 4; i++)
324  {
325  ret &= lhs.vf[i] == rhs.vf[i];
326  }
327  return ret;
328 }
329 
330 inline bool operator!=(const Float4& lhs, const Float4& rhs)
331 {
332  bool ret = true;
333  for (size_t i = 0; i < 4; i++)
334  {
335  ret &= lhs.vf[i] == rhs.vf[i];
336  }
337  return !ret;
338 }
339 
// Partial loads/stores. memcpy is used deliberately instead of pointer
// dereferencing (see the retained notes about asmjs/emscripten miscompiles);
// it also makes the accesses alignment-safe. Lanes beyond the loaded count
// are left uninitialized.
inline Float4 Float4::Load2(const void* mem)
{
	Float4 ret;
	memcpy(ret.vf, mem, sizeof(float) * 2);
	// This code causes bugs in asmjs
	// ret.vf[0] = *((float*)mem + 0);
	// ret.vf[1] = *((float*)mem + 1);
	return ret;
}

inline void Float4::Store2(void* mem, const Float4& i)
{
	memcpy(mem, i.vf, sizeof(float) * 2);
	// This code causes bugs in asmjs
	// *((float*)mem + 0) = i.vf[0];
	// *((float*)mem + 1) = i.vf[1];
}

inline Float4 Float4::Load3(const void* mem)
{
	Float4 ret;
	memcpy(ret.vf, mem, sizeof(float) * 3);
	// This code causes bugs in asmjs
	// ret.vf[0] = *((float*)mem + 0);
	// ret.vf[1] = *((float*)mem + 1);
	// ret.vf[2] = *((float*)mem + 2);
	return ret;
}

inline void Float4::Store3(void* mem, const Float4& i)
{
	memcpy(mem, i.vf, sizeof(float) * 3);
	// This code causes bugs in asmjs
	// *((float*)mem + 0) = i.vf[0];
	// *((float*)mem + 1) = i.vf[1];
	// *((float*)mem + 2) = i.vf[2];
}

inline Float4 Float4::Load4(const void* mem)
{
	Float4 ret;
	memcpy(ret.vf, mem, sizeof(float) * 4);
	// This code causes bugs in emscripten
	// ret.vf[0] = *((float*)mem + 0);
	// ret.vf[1] = *((float*)mem + 1);
	// ret.vf[2] = *((float*)mem + 2);
	// ret.vf[3] = *((float*)mem + 3);
	return ret;
}

inline void Float4::Store4(void* mem, const Float4& i)
{
	memcpy(mem, i.vf, sizeof(float) * 4);
	// This code causes bugs in asmjs
	// *((float*)mem + 0) = i.vf[0];
	// *((float*)mem + 1) = i.vf[1];
	// *((float*)mem + 2) = i.vf[2];
	// *((float*)mem + 3) = i.vf[3];
}
399 
400 inline Float4 Float4::SetZero()
401 {
402  Float4 ret;
403  ret.vf[0] = 0.0f;
404  ret.vf[1] = 0.0f;
405  ret.vf[2] = 0.0f;
406  ret.vf[3] = 0.0f;
407  return ret;
408 }
409 
410 inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
411 {
412  Float4 ret;
413  ret.vu[0] = (uint32_t)x;
414  ret.vu[1] = (uint32_t)y;
415  ret.vu[2] = (uint32_t)z;
416  ret.vu[3] = (uint32_t)w;
417  return ret;
418 }
419 
420 inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
421 {
422  Float4 ret;
423  ret.vu[0] = (uint32_t)x;
424  ret.vu[1] = (uint32_t)y;
425  ret.vu[2] = (uint32_t)z;
426  ret.vu[3] = (uint32_t)w;
427  return ret;
428 }
429 
430 inline Float4 Float4::Sqrt(const Float4& in)
431 {
432  Float4 ret;
433  for (size_t i = 0; i < 4; i++)
434  {
435  ret.vf[i] = std::sqrt(in.vf[i]);
436  }
437  return ret;
438 }
439 
440 inline Float4 Float4::Rsqrt(const Float4& in)
441 {
442  Float4 ret;
443  for (size_t i = 0; i < 4; i++)
444  {
445  ret.vf[i] = 1.0f / std::sqrt(in.vf[i]);
446  }
447  return ret;
448 }
449 
450 inline Float4 Float4::Abs(const Float4& in)
451 {
452  Float4 ret;
453  for (size_t i = 0; i < 4; i++)
454  {
455  ret.vf[i] = std::abs(in.vf[i]);
456  }
457  return ret;
458 }
459 
460 inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
461 {
462  Float4 ret;
463  for (size_t i = 0; i < 4; i++)
464  {
465  ret.vf[i] = std::fmin(lhs.vf[i], rhs.vf[i]);
466  }
467  return ret;
468 }
469 
470 inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
471 {
472  Float4 ret;
473  for (size_t i = 0; i < 4; i++)
474  {
475  ret.vf[i] = std::fmax(lhs.vf[i], rhs.vf[i]);
476  }
477  return ret;
478 }
479 
480 inline Float4 Float4::Floor(const Float4& in)
481 {
482  Float4 ret;
483  for (size_t i = 0; i < 4; i++)
484  {
485  ret.vf[i] = std::floor(in.vf[i]);
486  }
487  return ret;
488 }
489 
490 inline Float4 Float4::Ceil(const Float4& in)
491 {
492  Float4 ret;
493  for (size_t i = 0; i < 4; i++)
494  {
495  ret.vf[i] = std::ceil(in.vf[i]);
496  }
497  return ret;
498 }
499 
500 inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
501 {
502  Float4 ret;
503  for (size_t i = 0; i < 4; i++)
504  {
505  ret.vf[i] = a.vf[i] + b.vf[i] * c.vf[i];
506 }
507  return ret;
508 }
509 
510 inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
511 {
512  Float4 ret;
513  for (size_t i = 0; i < 4; i++)
514  {
515  ret.vf[i] = a.vf[i] - b.vf[i] * c.vf[i];
516 }
517  return ret;
518 }
519 
// 3-component dot product; the sum lands in lane 0, lanes 1-3 are zero.
inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
	Float4 muled = lhs * rhs;
	return Float4{muled.vf[0] + muled.vf[1] + muled.vf[2], 0.0f, 0.0f, 0.0f};
}

// 3-component cross product via the (y,z,x)/(z,x,y) swizzle identity.
// Lane 3 evaluates to lhs.w*rhs.w - lhs.w*rhs.w, i.e. 0 for finite w.
inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
	return Float4::Swizzle<1,2,0,3>(lhs) * Float4::Swizzle<2,0,1,3>(rhs) -
	       Float4::Swizzle<2,0,1,3>(lhs) * Float4::Swizzle<1,2,0,3>(rhs);
}
531 
// lhs * broadcast(rhs[LANE]) per lane.
template<size_t LANE>
Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return lhs * rhs.vf[LANE];
}

// a + b * broadcast(c[LANE]) per lane.
template<size_t LANE>
Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return a + b * c.vf[LANE];
}

// a - b * broadcast(c[LANE]) per lane.
template<size_t LANE>
Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return a - b * c.vf[LANE];
}

// Arbitrary lane permutation: result = (in[indexX], in[indexY], in[indexZ], in[indexW]).
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Float4 Float4::Swizzle(const Float4& in)
{
	static_assert(indexX < 4, "indexX is must be less than 4.");
	static_assert(indexY < 4, "indexY is must be less than 4.");
	static_assert(indexZ < 4, "indexZ is must be less than 4.");
	static_assert(indexW < 4, "indexW is must be less than 4.");
	return Float4{in.vf[indexX], in.vf[indexY], in.vf[indexZ], in.vf[indexW]};
}
562 
563 
564 template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
565 Float4 Float4::Mask()
566 {
567  static_assert(X >= 2, "indexX is must be set 0 or 1.");
568  static_assert(Y >= 2, "indexY is must be set 0 or 1.");
569  static_assert(Z >= 2, "indexZ is must be set 0 or 1.");
570  static_assert(W >= 2, "indexW is must be set 0 or 1.");
571  Float4 ret;
572  ret.vu[0] = 0xffffffff * X;
573  ret.vu[1] = 0xffffffff * Y;
574  ret.vu[2] = 0xffffffff * Z;
575  ret.vu[3] = 0xffffffff * W;
576  return ret;
577 }
578 
579 inline uint32_t Float4::MoveMask(const Float4& in)
580 {
581  return (in.vu[0] & 0x1) | (in.vu[1] & 0x2) | (in.vu[2] & 0x4) | (in.vu[3] & 0x8);
582 }
583 
584 inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
585 {
586  Float4 ret;
587  for (size_t i = 0; i < 4; i++)
588  {
589  ret.vu[i] = (mask.vu[i] & sel1.vu[i]) | (~mask.vu[i] & sel2.vu[i]);
590  }
591  return ret;
592 }
593 
594 inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
595 {
596  Float4 ret;
597  for (size_t i = 0; i < 4; i++)
598  {
599  ret.vu[i] = (lhs.vf[i] == rhs.vf[i]) ? 0xffffffff : 0;
600  }
601  return ret;
602 }
603 
604 inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
605 {
606  Float4 ret;
607  for (size_t i = 0; i < 4; i++)
608  {
609  ret.vu[i] = (lhs.vf[i] != rhs.vf[i]) ? 0xffffffff : 0;
610  }
611  return ret;
612 }
613 
614 inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
615 {
616  Float4 ret;
617  for (size_t i = 0; i < 4; i++)
618  {
619  ret.vu[i] = (lhs.vf[i] < rhs.vf[i]) ? 0xffffffff : 0;
620  }
621  return ret;
622 }
623 
624 inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
625 {
626  Float4 ret;
627  for (size_t i = 0; i < 4; i++)
628  {
629  ret.vu[i] = (lhs.vf[i] <= rhs.vf[i]) ? 0xffffffff : 0;
630  }
631  return ret;
632 }
633 
634 inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
635 {
636  Float4 ret;
637  for (size_t i = 0; i < 4; i++)
638  {
639  ret.vu[i] = (lhs.vf[i] > rhs.vf[i]) ? 0xffffffff : 0;
640  }
641  return ret;
642 }
643 
644 inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
645 {
646  Float4 ret;
647  for (size_t i = 0; i < 4; i++)
648  {
649  ret.vu[i] = (lhs.vf[i] >= rhs.vf[i]) ? 0xffffffff : 0;
650  }
651  return ret;
652 }
653 
654 inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
655 {
656  Float4 ret;
657  for (size_t i = 0; i < 4; i++)
658  {
659  ret.vu[i] = (std::abs(lhs.vf[i] - rhs.vf[i]) <= epsilon) ? 0xffffffff : 0;
660  }
661  return ret;
662 }
663 
664 inline Float4 Float4::IsZero(const Float4& in, float epsilon)
665 {
666  Float4 ret;
667  for (size_t i = 0; i < 4; i++)
668  {
669  ret.vu[i] = (std::abs(in.vf[i]) <= epsilon) ? 0xffffffff : 0;
670  }
671  return ret;
672 }
673 
// In-place 4x4 transpose of the four row vectors: each off-diagonal element
// pair (r,c)/(c,r) is swapped exactly once; the swaps touch disjoint elements.
inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
	std::swap(s0.vf[1], s1.vf[0]);
	std::swap(s0.vf[2], s2.vf[0]);
	std::swap(s0.vf[3], s3.vf[0]);
	std::swap(s1.vf[2], s2.vf[1]);
	std::swap(s2.vf[3], s3.vf[2]);
	std::swap(s1.vf[3], s3.vf[1]);
}
683 
684 } // namespace SIMD
685 
686 } // namespace Effekseer
687 
688 #endif // defined(EFK_SIMD_GEN)
689 
690 #endif // __EFFEKSEER_SIMD_FLOAT4_GEN_H__
691 
692 #ifndef __EFFEKSEER_SIMD_FLOAT4_NEON_H__
693 #define __EFFEKSEER_SIMD_FLOAT4_NEON_H__
694 
695 
696 #if defined(EFK_SIMD_NEON)
697 
698 namespace Effekseer
699 {
700 
701 namespace SIMD
702 {
703 
// Scalar square root (NEON path).
// Fix: use std::sqrt instead of unqualified sqrt so the float overload is
// selected (avoids the C double round trip) and the NEON path matches the
// generic implementation; <cmath> is included by the base section above.
inline float Sqrt(float x)
{
	return std::sqrt(x);
}

// Scalar reciprocal square root (NEON path).
inline float Rsqrt(float x)
{
	return 1.0f / std::sqrt(x);
}
713 
714 struct Int4;
715 
// NEON-backed 4-lane float vector. Wraps a single float32x4_t; integer/mask
// views are produced with vreinterpret, never through memory.
struct alignas(16) Float4
{
	float32x4_t s;

	Float4() = default;
	Float4(const Float4& rhs) = default;
	Float4(float32x4_t rhs) { s = rhs; }
	// Reinterprets a lane-mask vector as float lanes (bit pattern preserved).
	Float4(uint32x4_t rhs) { s = vreinterpretq_f32_u32(rhs); }
	Float4(float x, float y, float z, float w) { const float f[4] = {x, y, z, w}; s = vld1q_f32(f); }
	// Broadcast constructor.
	Float4(float i) { s = vdupq_n_f32(i); }

	float GetX() const { return vgetq_lane_f32(s, 0); }
	float GetY() const { return vgetq_lane_f32(s, 1); }
	float GetZ() const { return vgetq_lane_f32(s, 2); }
	float GetW() const { return vgetq_lane_f32(s, 3); }

	void SetX(float i) { s = vsetq_lane_f32(i, s, 0); }
	void SetY(float i) { s = vsetq_lane_f32(i, s, 1); }
	void SetZ(float i) { s = vsetq_lane_f32(i, s, 2); }
	void SetW(float i) { s = vsetq_lane_f32(i, s, 3); }

	// Broadcasts lane LANE into all four lanes.
	template <size_t LANE>
	Float4 Dup();

	Int4 Convert4i() const;
	Int4 Cast4i() const;

	Float4& operator+=(const Float4& rhs);
	Float4& operator-=(const Float4& rhs);
	Float4& operator*=(const Float4& rhs);
	Float4& operator*=(float rhs);
	Float4& operator/=(const Float4& rhs);
	Float4& operator/=(float rhs);

	// Partial loads/stores of 2/3/4 floats; see the generic back end for the
	// shared contract.
	static Float4 Load2(const void* mem);
	static void Store2(void* mem, const Float4& i);
	static Float4 Load3(const void* mem);
	static void Store3(void* mem, const Float4& i);
	static Float4 Load4(const void* mem);
	static void Store4(void* mem, const Float4& i);

	static Float4 SetZero();
	static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
	static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
	static Float4 Sqrt(const Float4& in);
	static Float4 Rsqrt(const Float4& in);
	static Float4 Abs(const Float4& in);
	static Float4 Min(const Float4& lhs, const Float4& rhs);
	static Float4 Max(const Float4& lhs, const Float4& rhs);
	static Float4 Floor(const Float4& in);
	static Float4 Ceil(const Float4& in);
	// MulAdd returns a + b * c; MulSub returns a - b * c (per lane).
	static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
	static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);

	template<size_t LANE>
	static Float4 MulLane(const Float4& lhs, const Float4& rhs);
	template<size_t LANE>
	static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
	template<size_t LANE>
	static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
	template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
	static Float4 Swizzle(const Float4& v);

	static Float4 Dot3(const Float4& lhs, const Float4& rhs);
	static Float4 Cross3(const Float4& lhs, const Float4& rhs);

	template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
	static Float4 Mask();
	static uint32_t MoveMask(const Float4& in);
	static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
	static Float4 Equal(const Float4& lhs, const Float4& rhs);
	static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
	static Float4 LessThan(const Float4& lhs, const Float4& rhs);
	static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
	static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
	static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
	static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
	static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
	static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);

private:
	// Cross3 helpers; see their definitions for the exact lane outputs.
	static Float4 SwizzleYZX(const Float4& in);
	static Float4 SwizzleZXY(const Float4& in);
};
804 
// Broadcasts lane LANE: picks the low or high 64-bit half, then duplicates
// lane (LANE & 1) of that half. LANE is a compile-time constant, so the
// selected branch and lane index are fixed at instantiation.
template <size_t LANE>
Float4 Float4::Dup()
{
	return (LANE < 2) ?
		vdupq_lane_f32(vget_low_f32(s), LANE & 1) :
		vdupq_lane_f32(vget_high_f32(s), LANE & 1);
}
812 
// Lane-wise addition.
inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
	return vaddq_f32(lhs.s, rhs.s);
}

// Lane-wise subtraction.
inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
	return vsubq_f32(lhs.s, rhs.s);
}

// Lane-wise multiplication.
inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
	return vmulq_f32(lhs.s, rhs.s);
}

// Multiplication by a broadcast scalar.
inline Float4 operator*(const Float4& lhs, float rhs)
{
	return vmulq_n_f32(lhs.s, rhs);
}
832 
// Lane-wise division.
inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
#if defined(_M_ARM64) || __aarch64__
	// AArch64 provides a true vector divide.
	return vdivq_f32(lhs.s, rhs.s);
#else
	// ARMv7 NEON has no vector divide: start from the hardware reciprocal
	// estimate and apply two Newton-Raphson refinement steps
	// (vrecpsq computes 2 - recp*rhs), then multiply. The statement order is
	// load-bearing; the result is an approximation, not IEEE division.
	float32x4_t recp = vrecpeq_f32(rhs.s);
	float32x4_t s = vrecpsq_f32(recp, rhs.s);
	recp = vmulq_f32(s, recp);
	s = vrecpsq_f32(recp, rhs.s);
	recp = vmulq_f32(s, recp);
	return vmulq_f32(lhs.s, recp);
#endif
}

// Division by a scalar via multiplication with its (exactly rounded)
// reciprocal — note this differs from the vector path above in precision.
inline Float4 operator/(const Float4& lhs, float rhs)
{
	return lhs * (1.0f / rhs);
}
851 
// Bitwise AND on the raw lane bit patterns (mask arithmetic).
inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
	uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
	uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
	return vreinterpretq_f32_u32(vandq_u32(lhsi, rhsi));
}

// Bitwise OR on the raw lane bit patterns.
inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
	uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
	uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
	return vreinterpretq_f32_u32(vorrq_u32(lhsi, rhsi));
}

// Bitwise XOR on the raw lane bit patterns.
inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
	uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
	uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
	return vreinterpretq_f32_u32(veorq_u32(lhsi, rhsi));
}
872 
// True only when all four lanes compare equal (mask 0xf from MoveMask);
// matches the generic back end's semantics, including NaN lanes yielding false.
inline bool operator==(const Float4& lhs, const Float4& rhs)
{
	return Float4::MoveMask(Float4::Equal(lhs, rhs)) == 0xf;
}

// True when at least one lane differs.
inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
	return Float4::MoveMask(Float4::Equal(lhs, rhs)) != 0xf;
}

// Compound assignments are defined in terms of the binary operators above.
inline Float4& Float4::operator+=(const Float4& rhs) { return *this = *this + rhs; }
inline Float4& Float4::operator-=(const Float4& rhs) { return *this = *this - rhs; }
inline Float4& Float4::operator*=(const Float4& rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator*=(float rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator/=(const Float4& rhs) { return *this = *this / rhs; }
inline Float4& Float4::operator/=(float rhs) { return *this = *this / rhs; }
889 
// Loads two floats into the low half; the high half is zeroed.
inline Float4 Float4::Load2(const void* mem)
{
	float32x2_t low = vld1_f32((const float*)mem);
	float32x2_t high = vdup_n_f32(0.0f);
	return vcombine_f32(low, high);
}

// Stores only the two low lanes.
inline void Float4::Store2(void* mem, const Float4& i)
{
	vst1_f32((float*)mem, vget_low_f32(i.s));
}

// Loads three floats; lane 3 is zeroed (single-lane load into a zero vector).
inline Float4 Float4::Load3(const void* mem)
{
	float32x2_t low = vld1_f32((const float*)mem);
	float32x2_t high = vld1_lane_f32((const float*)mem + 2, vdup_n_f32(0.0f), 0);
	return vcombine_f32(low, high);
}

// Stores the three low lanes (lane 3 is not written).
inline void Float4::Store3(void* mem, const Float4& i)
{
	vst1_f32((float*)mem, vget_low_f32(i.s));
	vst1q_lane_f32((float*)mem + 2, i.s, 2);
}

// Full 4-float load (no alignment requirement beyond float).
inline Float4 Float4::Load4(const void* mem)
{
	return vld1q_f32((const float*)mem);
}

// Full 4-float store.
inline void Float4::Store4(void* mem, const Float4& i)
{
	vst1q_f32((float*)mem, i.s);
}
924 
// All lanes zero.
inline Float4 Float4::SetZero()
{
	return vdupq_n_f32(0.0f);
}

// Stores four signed integers as raw lane bit patterns (reinterpret, not
// numeric conversion).
inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
	const int32_t i[4] = {x, y, z, w};
	return vreinterpretq_f32_s32(vld1q_s32(i));
}

// Stores four unsigned integers as raw lane bit patterns.
inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
	const uint32_t i[4] = {x, y, z, w};
	return vreinterpretq_f32_u32(vld1q_u32(i));
}
941 
// Lane-wise square root.
inline Float4 Float4::Sqrt(const Float4& in)
{
#if defined(_M_ARM64) || __aarch64__
	// AArch64 has a hardware vector sqrt.
	return vsqrtq_f32(in.s);
#else
	// ARMv7 fallback: sqrt(x) = 1 / rsqrt(x), both approximations.
	return Float4(1.0f) / Float4::Rsqrt(in);
#endif
}

// Lane-wise reciprocal square root: hardware estimate refined by one
// Newton-Raphson step (vrsqrtsq computes (3 - p0*s0) / 2). Approximate, not
// exactly 1/sqrt(x).
inline Float4 Float4::Rsqrt(const Float4& in)
{
	float32x4_t s0 = vrsqrteq_f32(in.s);
	float32x4_t p0 = vmulq_f32(in.s, s0);
	float32x4_t r0 = vrsqrtsq_f32(p0, s0);
	float32x4_t s1 = vmulq_f32(s0, r0);
	return s1;
}
959 
// Lane-wise absolute value.
inline Float4 Float4::Abs(const Float4& in)
{
	return vabsq_f32(in.s);
}

// Lane-wise minimum.
inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
	return vminq_f32(lhs.s, rhs.s);
}

// Lane-wise maximum.
inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
	return vmaxq_f32(lhs.s, rhs.s);
}

// Lane-wise round toward negative infinity.
inline Float4 Float4::Floor(const Float4& in)
{
#if defined(_M_ARM64) || __aarch64__
	return vrndmq_f32(in.s);
#else
	// ARMv7 fallback: truncate via int conversion, then subtract 1 where the
	// truncated value exceeds the input (the all-ones compare mask converts to
	// -1.0f). Only valid for inputs representable in int32.
	// NOTE(review): the uint32x4_t -> float32x4_t/int32x4_t conversions here
	// rely on lax vector conversions; strict compilers may need vreinterpret.
	int32x4_t in_i = vcvtq_s32_f32(in.s);
	float32x4_t result = vcvtq_f32_s32(in_i);
	float32x4_t larger = vcgtq_f32(result, in.s);
	larger = vcvtq_f32_s32(larger);
	return vaddq_f32(result, larger);
#endif
}

// Lane-wise round toward positive infinity.
inline Float4 Float4::Ceil(const Float4& in)
{
#if defined(_M_ARM64) || __aarch64__
	return vrndpq_f32(in.s);
#else
	// ARMv7 fallback, mirror of Floor: add 1 where the truncated value is
	// below the input (subtracting the -1.0f mask). Same int32-range caveat
	// and lax-conversion note as Floor.
	int32x4_t in_i = vcvtq_s32_f32(in.s);
	float32x4_t result = vcvtq_f32_s32(in_i);
	float32x4_t smaller = vcltq_f32(result, in.s);
	smaller = vcvtq_f32_s32(smaller);
	return vsubq_f32(result, smaller);
#endif
}
1000 
// Returns a + b * c per lane.
inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
	return vmlaq_f32(a.s, b.s, c.s);
}

// Returns a - b * c per lane.
inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
	return vmlsq_f32(a.s, b.s, c.s);
}

// lhs * broadcast(rhs[LANE]): the 64-bit half holding LANE is extracted so the
// *_lane intrinsics get a compile-time index in [0, 1].
template<size_t LANE>
inline Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	float32x2_t rhs2 = (LANE < 2) ? vget_low_f32(rhs.s) : vget_high_f32(rhs.s);
	return vmulq_lane_f32(lhs.s, rhs2, LANE & 1);
}

// a + b * broadcast(c[LANE]).
template<size_t LANE>
inline Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	float32x2_t c2 = (LANE < 2) ? vget_low_f32(c.s) : vget_high_f32(c.s);
	return vmlaq_lane_f32(a.s, b.s, c2, LANE & 1);
}

// a - b * broadcast(c[LANE]).
template<size_t LANE>
inline Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	float32x2_t c2 = (LANE < 2) ? vget_low_f32(c.s) : vget_high_f32(c.s);
	return vmlsq_lane_f32(a.s, b.s, c2, LANE & 1);
}
1034 
1035 //template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
1036 //inline Float4 Float4::Swizzle(const Float4& v)
1037 //{
1038 // static_assert(indexX < 4, "indexX is must be less than 4.");
1039 // static_assert(indexY < 4, "indexY is must be less than 4.");
1040 // static_assert(indexZ < 4, "indexZ is must be less than 4.");
1041 // static_assert(indexW < 4, "indexW is must be less than 4.");
1042 //}
1043 
// 3-component dot product. Lane 0 holds x*x' + y*y' + z*z'.
// NOTE(review): lane 1 ends up holding x*x' + y*y' + w*w' (vpadd duplicates
// the low-pair sum, then the high half {z,w} is added), unlike the generic
// back end which zeroes lanes 1-3 — callers should only rely on lane 0.
inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
	float32x4_t mul = vmulq_f32(lhs.s, rhs.s);
	float32x2_t xy = vpadd_f32(vget_low_f32(mul), vget_low_f32(mul));
	float32x2_t dot = vadd_f32(xy, vget_high_f32(mul));
	return vcombine_f32(dot, vdup_n_f32(0.0f));
}

// 3-component cross product: yzx(l)*zxy(r) - zxy(l)*yzx(r).
// NOTE(review): lane 3 is not zeroed here (the swizzle helpers leave
// don't-care values in it) — it may differ from the generic back end.
inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
	return MulSub(SwizzleYZX(lhs.s) * SwizzleZXY(rhs.s), SwizzleZXY(lhs.s), SwizzleYZX(rhs.s));
}
1056 
1057 template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
1058 inline Float4 Float4::Mask()
1059 {
1060  static_assert(X >= 2, "indexX is must be set 0 or 1.");
1061  static_assert(Y >= 2, "indexY is must be set 0 or 1.");
1062  static_assert(Z >= 2, "indexZ is must be set 0 or 1.");
1063  static_assert(W >= 2, "indexW is must be set 0 or 1.");
1064  const uint32_t in[4] = {0xffffffff * X, 0xffffffff * Y, 0xffffffff * Z, 0xffffffff * W};
1065  return vld1q_f32((const float*)in);
1066 }
1067 
// Packs the four lane masks into a 4-bit integer (bit i = lane i): each
// 32-bit lane mask is narrowed to 16 bits, spilled to memory, and one bit is
// harvested per lane. Assumes lanes are full masks (0 or all-ones).
inline uint32_t Float4::MoveMask(const Float4& in)
{
	uint16x4_t u16x4 = vmovn_u32(vreinterpretq_u32_f32(in.s));
	uint16_t u16[4];
	vst1_u16(u16, u16x4);
	return (u16[0] & 1) | (u16[1] & 2) | (u16[2] & 4) | (u16[3] & 8);
}

// Bit select: takes bits from sel1 where mask is set, sel2 elsewhere.
inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
	uint32x4_t maski = vreinterpretq_u32_f32(mask.s);
	return vbslq_f32(maski, sel1.s, sel2.s);
}
1081 
// Comparisons return per-lane masks (0xffffffff where the predicate holds).
inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
	return vceqq_f32(lhs.s, rhs.s);
}

// Complement of Equal (NEON has no direct float "not equal").
inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
	return vmvnq_u32(vceqq_f32(lhs.s, rhs.s));
}

inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
	return vcltq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
	return vcleq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
	return vcgtq_f32(lhs.s, rhs.s);
}

inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
	return vcgeq_f32(lhs.s, rhs.s);
}

// Mask set where |lhs - rhs| <= epsilon.
inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
	return LessEqual(Abs(lhs - rhs), Float4(epsilon));
}

// Mask set where |in| <= epsilon.
inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
	return LessEqual(Abs(in), Float4(epsilon));
}
1121 
// 4x4 transpose of the four row vectors using two zip (interleave) stages;
// the (s0,s2)/(s1,s3) pairing in the first stage puts elements in the right
// order for the second stage.
inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
	float32x4x2_t t0 = vzipq_f32(s0.s, s2.s);
	float32x4x2_t t1 = vzipq_f32(s1.s, s3.s);
	float32x4x2_t t2 = vzipq_f32(t0.val[0], t1.val[0]);
	float32x4x2_t t3 = vzipq_f32(t0.val[1], t1.val[1]);

	s0 = t2.val[0];
	s1 = t2.val[1];
	s2 = t3.val[0];
	s3 = t3.val[1];
}
1134 
inline Float4 Float4::SwizzleYZX(const Float4& in)
{
	// (x,y,z,w) -> (y,z,x,x): rotate left by one lane, then overwrite lane 2
	// with x. The w lane is not preserved — intended for 3-component use only.
	float32x4_t ex = vextq_f32(in.s, in.s, 1);
	return vsetq_lane_f32(vgetq_lane_f32(ex, 3), ex, 2);
}
1140 
inline Float4 Float4::SwizzleZXY(const Float4& in)
{
	// (x,y,z,w) -> (z,x,y,z): rotate left by three lanes, then overwrite lane 0
	// with z. The w lane is not preserved — intended for 3-component use only.
	float32x4_t ex = vextq_f32(in.s, in.s, 3);
	return vsetq_lane_f32(vgetq_lane_f32(ex, 3), ex, 0);
}
1146 
1147 } // namespace SIMD
1148 
1149 } // namespace Effekseer
1150 
1151 #endif
1152 #endif // __EFFEKSEER_SIMD_FLOAT4_NEON_H__
1153 
1154 #ifndef __EFFEKSEER_SIMD_FLOAT4_SSE_H__
1155 #define __EFFEKSEER_SIMD_FLOAT4_SSE_H__
1156 
1157 
1158 #if defined(EFK_SIMD_SSE2)
1159 
1160 namespace Effekseer
1161 {
1162 
1163 namespace SIMD
1164 {
1165 
// Scalar square root computed with the SSE sqrtss instruction.
inline float Sqrt(float x)
{
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(x)));
}
// Fast approximate reciprocal square root via rsqrtss
// (relative error bounded by about 1.5 * 2^-12).
inline float Rsqrt(float x)
{
	return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
}
1176 
1177 struct Int4;
1178 
/**
	@brief	4-lane float vector backed by an SSE __m128 register.

	Thin value-type wrapper: constructors, lane accessors, arithmetic
	(via free operators and compound assignments), loads/stores, and
	lane-wise math/comparison helpers. Comparison results are bit masks
	(all-ones / all-zero per lane) usable with Select()/MoveMask().
*/
struct alignas(16) Float4
{
	__m128 s;

	// Constructors: raw register, raw integer register (bit reinterpretation,
	// not numeric conversion), per-lane values, or a broadcast scalar.
	Float4() = default;
	Float4(const Float4& rhs) = default;
	Float4(__m128 rhs) { s = rhs; }
	Float4(__m128i rhs) { s = _mm_castsi128_ps(rhs); }
	Float4(float x, float y, float z, float w) { s = _mm_setr_ps(x, y, z, w); }
	Float4(float i) { s = _mm_set_ps1(i); }

	// Lane accessors (lane is splatted before extraction for Y/Z/W).
	float GetX() const { return _mm_cvtss_f32(s); }
	float GetY() const { return _mm_cvtss_f32(Swizzle<1,1,1,1>(s).s); }
	float GetZ() const { return _mm_cvtss_f32(Swizzle<2,2,2,2>(s).s); }
	float GetW() const { return _mm_cvtss_f32(Swizzle<3,3,3,3>(s).s); }

	// Lane setters: rotate the target lane to position 0, move_ss, rotate back.
	void SetX(float i) { s = _mm_move_ss(s, _mm_set_ss(i)); }
	void SetY(float i) { s = Swizzle<1,0,2,3>(_mm_move_ss(Swizzle<1,0,2,3>(s).s, _mm_set_ss(i))).s; }
	void SetZ(float i) { s = Swizzle<2,1,0,3>(_mm_move_ss(Swizzle<2,1,0,3>(s).s, _mm_set_ss(i))).s; }
	void SetW(float i) { s = Swizzle<3,1,2,0>(_mm_move_ss(Swizzle<3,1,2,0>(s).s, _mm_set_ss(i))).s; }

	// Broadcast a single lane to all four lanes.
	template <size_t LANE>
	Float4 Dup() { return Swizzle<LANE,LANE,LANE,LANE>(s); }

	// Convert4i: numeric float->int conversion; Cast4i: bit reinterpretation.
	Int4 Convert4i() const;
	Int4 Cast4i() const;

	Float4& operator+=(const Float4& rhs);
	Float4& operator-=(const Float4& rhs);
	Float4& operator*=(const Float4& rhs);
	Float4& operator*=(float rhs);
	Float4& operator/=(const Float4& rhs);
	Float4& operator/=(float rhs);

	// Unaligned loads/stores of 2, 3 or 4 floats (unused lanes read as zero).
	static Float4 Load2(const void* mem);
	static void Store2(void* mem, const Float4& i);
	static Float4 Load3(const void* mem);
	static void Store3(void* mem, const Float4& i);
	static Float4 Load4(const void* mem);
	static void Store4(void* mem, const Float4& i);

	static Float4 SetZero();
	static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
	static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
	static Float4 Sqrt(const Float4& in);
	static Float4 Rsqrt(const Float4& in);
	static Float4 Abs(const Float4& in);
	static Float4 Min(const Float4& lhs, const Float4& rhs);
	static Float4 Max(const Float4& lhs, const Float4& rhs);
	static Float4 Floor(const Float4& in);
	static Float4 Ceil(const Float4& in);
	// MulAdd = a + b * c, MulSub = a - b * c (FMA when AVX2 is available).
	static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
	static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);

	// *Lane variants broadcast lane LANE of the last operand first.
	template<size_t LANE>
	static Float4 MulLane(const Float4& lhs, const Float4& rhs);
	template<size_t LANE>
	static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
	template<size_t LANE>
	static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
	template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
	static Float4 Swizzle(const Float4& v);

	static Float4 Dot3(const Float4& lhs, const Float4& rhs);
	static Float4 Cross3(const Float4& lhs, const Float4& rhs);

	template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
	static Float4 Mask();
	static uint32_t MoveMask(const Float4& in);
	static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
	static Float4 Equal(const Float4& lhs, const Float4& rhs);
	static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
	static Float4 LessThan(const Float4& lhs, const Float4& rhs);
	static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
	static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
	static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
	static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
	static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
	static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);
};
1263 
// Lane-wise addition.
inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_add_ps(lhs.s, rhs.s)};
}
1268 
// Lane-wise subtraction.
inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_sub_ps(lhs.s, rhs.s)};
}
1273 
// Lane-wise multiplication.
inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_mul_ps(lhs.s, rhs.s)};
}
1278 
// Multiply every lane by a broadcast scalar.
inline Float4 operator*(const Float4& lhs, float rhs)
{
	return Float4{_mm_mul_ps(lhs.s, _mm_set1_ps(rhs))};
}
1283 
// Lane-wise division.
inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_div_ps(lhs.s, rhs.s)};
}
1288 
// Divide every lane by a broadcast scalar (true division, not reciprocal approximation).
inline Float4 operator/(const Float4& lhs, float rhs)
{
	return Float4{_mm_div_ps(lhs.s, _mm_set1_ps(rhs))};
}
1293 
// Bitwise AND of the raw lane bits (mainly for combining comparison masks).
inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_and_ps(lhs.s, rhs.s)};
}
1298 
// Bitwise OR of the raw lane bits (mainly for combining comparison masks).
inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_or_ps(lhs.s, rhs.s)};
}
1303 
// Bitwise XOR of the raw lane bits (e.g. sign flips, mask combination).
inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_xor_ps(lhs.s, rhs.s)};
}
1308 
// True only when all four lanes compare equal (movemask of 0xf).
inline bool operator==(const Float4& lhs, const Float4& rhs)
{
	return Float4::MoveMask(Float4::Equal(lhs, rhs)) == 0xf;
}
1313 
// True when any lane differs (not all lanes equal).
inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
	return Float4::MoveMask(Float4::Equal(lhs, rhs)) != 0xf;
}
1318 
// Compound assignments simply forward to the corresponding binary operator.
inline Float4& Float4::operator+=(const Float4& rhs) { return *this = *this + rhs; }
inline Float4& Float4::operator-=(const Float4& rhs) { return *this = *this - rhs; }
inline Float4& Float4::operator*=(const Float4& rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator*=(float rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator/=(const Float4& rhs) { return *this = *this / rhs; }
inline Float4& Float4::operator/=(float rhs) { return *this = *this / rhs; }
1325 
inline Float4 Float4::Load2(const void* mem)
{
	// Load two unaligned floats into lanes x,y; z,w become zero
	// (load_ss zeroes the upper lanes, unpacklo interleaves x and y).
	__m128 x = _mm_load_ss((const float*)mem + 0);
	__m128 y = _mm_load_ss((const float*)mem + 1);
	return _mm_unpacklo_ps(x, y);
}
1332 
inline void Float4::Store2(void* mem, const Float4& i)
{
	// Store lanes x,y to unaligned memory, one scalar store each.
	Float4 t1 = Swizzle<1,1,1,1>(i.s);
	_mm_store_ss((float*)mem + 0, i.s);
	_mm_store_ss((float*)mem + 1, t1.s);
}
1339 
inline Float4 Float4::Load3(const void* mem)
{
	// Load three unaligned floats into lanes x,y,z; w becomes zero.
	__m128 x = _mm_load_ss((const float*)mem + 0);
	__m128 y = _mm_load_ss((const float*)mem + 1);
	__m128 z = _mm_load_ss((const float*)mem + 2);
	__m128 xy = _mm_unpacklo_ps(x, y);
	return _mm_movelh_ps(xy, z);
}
1348 
inline void Float4::Store3(void* mem, const Float4& i)
{
	// Store lanes x,y,z to unaligned memory, one scalar store each.
	Float4 t1 = Swizzle<1,1,1,1>(i.s);
	Float4 t2 = Swizzle<2,2,2,2>(i.s);
	_mm_store_ss((float*)mem + 0, i.s);
	_mm_store_ss((float*)mem + 1, t1.s);
	_mm_store_ss((float*)mem + 2, t2.s);
}
1357 
// Unaligned 4-float load.
inline Float4 Float4::Load4(const void* mem)
{
	return _mm_loadu_ps((const float*)mem);
}
1362 
// Unaligned 4-float store.
inline void Float4::Store4(void* mem, const Float4& i)
{
	_mm_storeu_ps((float*)mem, i.s);
}
1367 
// All lanes set to +0.0f.
inline Float4 Float4::SetZero()
{
	return _mm_setzero_ps();
}
1372 
// Fill lanes with raw 32-bit integer patterns, reinterpreted (not converted)
// as float bits via the Float4(__m128i) constructor.
inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
	return Float4{_mm_setr_epi32((int)x, (int)y, (int)z, (int)w)};
}
1377 
// Fill lanes with raw 32-bit unsigned patterns, reinterpreted (not converted)
// as float bits — useful for building masks.
inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
	return Float4{_mm_setr_epi32((int)x, (int)y, (int)z, (int)w)};
}
1382 
// Lane-wise exact square root.
inline Float4 Float4::Sqrt(const Float4& in)
{
	return Float4{_mm_sqrt_ps(in.s)};
}
1387 
// Lane-wise approximate reciprocal square root (rsqrtps, ~12-bit accuracy).
inline Float4 Float4::Rsqrt(const Float4& in)
{
	return Float4{_mm_rsqrt_ps(in.s)};
}
1392 
// Lane-wise absolute value: clear the sign bit by ANDing out the bits of -0.0f.
inline Float4 Float4::Abs(const Float4& in)
{
	return _mm_andnot_ps(_mm_set1_ps(-0.0f), in.s);
}
1397 
// Lane-wise minimum (minps semantics).
inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_min_ps(lhs.s, rhs.s)};
}
1402 
// Lane-wise maximum (maxps semantics).
inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_max_ps(lhs.s, rhs.s)};
}
1407 
1408 inline Float4 Float4::Floor(const Float4& in)
1409 {
1410 #if defined(EFK_SIMD_SSE4_2)
1411  return _mm_floor_ps(in.s);
1412 #else
1413  __m128i in_i = _mm_cvttps_epi32(in.s);
1414  __m128 result = _mm_cvtepi32_ps(in_i);
1415  __m128 larger = _mm_cmpgt_ps(result, in.s);
1416  larger = _mm_cvtepi32_ps(_mm_castps_si128(larger));
1417  return _mm_add_ps(result, larger);
1418 #endif
1419 }
1420 
1421 inline Float4 Float4::Ceil(const Float4& in)
1422 {
1423 #if defined(EFK_SIMD_SSE4_2)
1424  return _mm_ceil_ps(in.s);
1425 #else
1426  __m128i in_i = _mm_cvttps_epi32(in.s);
1427  __m128 result = _mm_cvtepi32_ps(in_i);
1428  __m128 smaller = _mm_cmplt_ps(result, in.s);
1429  smaller = _mm_cvtepi32_ps(_mm_castps_si128(smaller));
1430  return _mm_sub_ps(result, smaller);
1431 #endif
1432 }
1433 
// Returns a + b * c per lane (single fused instruction when AVX2/FMA is on).
inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
#if defined(EFK_SIMD_AVX2)
	return Float4{_mm_fmadd_ps(b.s, c.s, a.s)};
#else
	return Float4{_mm_add_ps(a.s, _mm_mul_ps(b.s, c.s))};
#endif
}
1442 
// Returns a - b * c per lane (fnmadd computes -(b*c) + a, which is the same).
inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
#if defined(EFK_SIMD_AVX2)
	return Float4{_mm_fnmadd_ps(b.s, c.s, a.s)};
#else
	return Float4{_mm_sub_ps(a.s, _mm_mul_ps(b.s, c.s))};
#endif
}
1451 
// Multiply lhs by lane LANE of rhs broadcast to all lanes.
template<size_t LANE>
Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return _mm_mul_ps(lhs.s, Swizzle<LANE,LANE,LANE,LANE>(rhs).s);
}
1458 
// Returns a + b * c[LANE] per lane (lane LANE of c broadcast first).
template<size_t LANE>
Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
#if defined(EFK_SIMD_AVX2)
	return _mm_fmadd_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s, a.s);
#else
	return _mm_add_ps(a.s, _mm_mul_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s));
#endif
}
1469 
// Returns a - b * c[LANE] per lane (lane LANE of c broadcast first).
template<size_t LANE>
Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
#if defined(EFK_SIMD_AVX2)
	return _mm_fnmadd_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s, a.s);
#else
	return _mm_sub_ps(a.s, _mm_mul_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s));
#endif
}
1480 
// Arbitrary lane permutation: result lane i takes source lane index{X,Y,Z,W}.
// Uses the AVX non-destructive permute when available, shufps otherwise.
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Float4 Float4::Swizzle(const Float4& v)
{
	static_assert(indexX < 4, "indexX is must be less than 4.");
	static_assert(indexY < 4, "indexY is must be less than 4.");
	static_assert(indexZ < 4, "indexZ is must be less than 4.");
	static_assert(indexW < 4, "indexW is must be less than 4.");

#if defined(EFK_SIMD_AVX)
	return _mm_permute_ps(v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX));
#else
	return _mm_shuffle_ps(v.s, v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX));
#endif
}
1495 
inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
	// 3-component dot product. Only lane X of the result is meaningful
	// (add_ss accumulates into lane 0; the other lanes keep products).
	Float4 muled = lhs * rhs;
	return _mm_add_ss(_mm_add_ss(muled.s, Float4::Swizzle<1,1,1,1>(muled).s), Float4::Swizzle<2,2,2,2>(muled).s);
}
1501 
// 3-component cross product via the classic yzx/zxy shuffle formulation;
// the W lane evaluates to lhs.w*rhs.w - lhs.w*rhs.w == 0.
inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
	return Float4::Swizzle<1,2,0,3>(lhs) * Float4::Swizzle<2,0,1,3>(rhs) -
		Float4::Swizzle<2,0,1,3>(lhs) * Float4::Swizzle<1,2,0,3>(rhs);
}
1507 
1508 template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
1509 inline Float4 Float4::Mask()
1510 {
1511  static_assert(X >= 2, "indexX is must be set 0 or 1.");
1512  static_assert(Y >= 2, "indexY is must be set 0 or 1.");
1513  static_assert(Z >= 2, "indexZ is must be set 0 or 1.");
1514  static_assert(W >= 2, "indexW is must be set 0 or 1.");
1515  return _mm_setr_epi32(
1516  (int)(0xffffffff * X),
1517  (int)(0xffffffff * Y),
1518  (int)(0xffffffff * Z),
1519  (int)(0xffffffff * W));
1520 }
1521 
// Collect the sign bit of each lane into a 4-bit value (bit i = lane i).
inline uint32_t Float4::MoveMask(const Float4& in)
{
	return (uint32_t)_mm_movemask_ps(in.s);
}
1526 
// Bitwise select: bits set in mask come from sel1, clear bits from sel2.
inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
	return _mm_or_ps(_mm_and_ps(mask.s, sel1.s), _mm_andnot_ps(mask.s, sel2.s));
}
1531 
// Lane-wise ==; each lane becomes all-ones (true) or all-zero (false).
inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_cmpeq_ps(lhs.s, rhs.s)};
}
1536 
// Lane-wise != mask (cmpneq: unordered operands compare as not-equal).
inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_cmpneq_ps(lhs.s, rhs.s)};
}
1541 
// Lane-wise < mask.
inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_cmplt_ps(lhs.s, rhs.s)};
}
1546 
// Lane-wise <= mask.
inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_cmple_ps(lhs.s, rhs.s)};
}
1551 
// Lane-wise > mask.
inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_cmpgt_ps(lhs.s, rhs.s)};
}
1556 
// Lane-wise >= mask.
inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
	return Float4{_mm_cmpge_ps(lhs.s, rhs.s)};
}
1561 
// Lane-wise approximate equality: |lhs - rhs| <= epsilon per lane.
inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
	return LessEqual(Abs(lhs - rhs), Float4(epsilon));
}
1566 
// Lane-wise zero test with tolerance: |in| <= epsilon per lane.
inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
	return LessEqual(Abs(in), Float4(epsilon));
}
1571 
// In-place 4x4 transpose using the standard SSE shuffle macro.
inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
	_MM_TRANSPOSE4_PS(s0.s, s1.s, s2.s, s3.s);
}
1576 
1577 } // namespace SIMD
1578 
1579 } // namespace Effekseer
1580 
1581 #endif
1582 
1583 #endif // __EFFEKSEER_SIMD_FLOAT4_SSE_H__
1584 
1585 #ifndef __EFFEKSEER_SIMD_INT4_GEN_H__
1586 #define __EFFEKSEER_SIMD_INT4_GEN_H__
1587 
1588 
1589 #if defined(EFK_SIMD_GEN)
1590 
1591 #include <cstring>
1592 #include <algorithm>
1593 
1594 namespace Effekseer
1595 {
1596 
1597 namespace SIMD
1598 {
1599 
1600 struct Float4;
1601 
/**
	@brief	4-lane 32-bit integer vector — portable (non-SIMD) fallback.

	The union exposes the same 16 bytes as float (vf), signed (vi) and
	unsigned (vu) lanes so bit-level operations can pick the right view.
	Comparison helpers produce per-lane masks (0xffffffff / 0) to mirror
	the SSE/NEON implementations.
*/
struct alignas(16) Int4
{
	union {
		float vf[4];
		int32_t vi[4];
		uint32_t vu[4];
	};

	Int4() = default;
	Int4(const Int4& rhs) = default;
	Int4(int32_t x, int32_t y, int32_t z, int32_t w) { vi[0] = x; vi[1] = y; vi[2] = z; vi[3] = w; }
	// Broadcast constructor: all four lanes take the same value.
	Int4(int32_t i) { vi[0] = i; vi[1] = i; vi[2] = i; vi[3] = i; }

	int32_t GetX() const { return vi[0]; }
	int32_t GetY() const { return vi[1]; }
	int32_t GetZ() const { return vi[2]; }
	int32_t GetW() const { return vi[3]; }

	void SetX(int32_t o) { vi[0] = o; }
	void SetY(int32_t o) { vi[1] = o; }
	void SetZ(int32_t o) { vi[2] = o; }
	void SetW(int32_t o) { vi[3] = o; }

	// Convert4f: numeric int->float conversion; Cast4f: bit reinterpretation.
	Float4 Convert4f() const;
	Float4 Cast4f() const;

	// Compound assignments operate lane by lane on the signed view.
	Int4& operator+=(const Int4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vi[i] += rhs.vi[i];
		}
		return *this;
	}

	Int4& operator-=(const Int4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vi[i] -= rhs.vi[i];
		}
		return *this;
	}

	Int4& operator*=(const Int4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vi[i] *= rhs.vi[i];
		}
		return *this;
	}

	Int4& operator*=(int32_t rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vi[i] *= rhs;
		}
		return *this;
	}

	Int4& operator/=(const Int4& rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vi[i] /= rhs.vi[i];
		}
		return *this;
	}

	Int4& operator/=(int32_t rhs)
	{
		for (size_t i = 0; i < 4; i++)
		{
			vi[i] /= rhs;
		}
		return *this;
	}

	// Loads/stores of 2, 3 or 4 lanes (unused lanes left uninitialized).
	static Int4 Load2(const void* mem);
	static void Store2(void* mem, const Int4& i);
	static Int4 Load3(const void* mem);
	static void Store3(void* mem, const Int4& i);
	static Int4 Load4(const void* mem);
	static void Store4(void* mem, const Int4& i);

	static Int4 SetZero();
	static Int4 Abs(const Int4& in);
	static Int4 Min(const Int4& lhs, const Int4& rhs);
	static Int4 Max(const Int4& lhs, const Int4& rhs);
	// MulAdd = a + b * c, MulSub = a - b * c (lane-wise).
	static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
	static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);

	// *Lane variants broadcast lane LANE of the last operand first.
	template<size_t LANE>
	static Int4 MulLane(const Int4& lhs, const Int4& rhs);
	template<size_t LANE>
	static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
	template<size_t LANE>
	static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
	template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
	static Int4 Swizzle(const Int4& in);

	// ShiftL/ShiftR are logical shifts (unsigned view); ShiftRA is arithmetic.
	template <int COUNT>
	static Int4 ShiftL(const Int4& in);
	template <int COUNT>
	static Int4 ShiftR(const Int4& in);
	template <int COUNT>
	static Int4 ShiftRA(const Int4& in);

	template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
	static Int4 Mask();
	static uint32_t MoveMask(const Int4& in);
	static Int4 Equal(const Int4& lhs, const Int4& rhs);
	static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
	static Int4 LessThan(const Int4& lhs, const Int4& rhs);
	static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
	// NOTE(review): integer lanes compared against a float epsilon — with the
	// default 1e-6f this effectively means exact equality; confirm intended.
	static Int4 NearEqual(const Int4& lhs, const Int4& rhs, float epsilon = DefaultEpsilon);
	static Int4 IsZero(const Int4& in, float epsilon = DefaultEpsilon);
	static void Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3);
};
1728 
1729 inline Int4 operator+(const Int4& lhs, const Int4& rhs)
1730 {
1731  Int4 ret;
1732  for (size_t i = 0; i < 4; i++)
1733  {
1734  ret.vi[i] = lhs.vi[i] + rhs.vi[i];
1735  }
1736  return ret;
1737 }
1738 
1739 inline Int4 operator-(const Int4& lhs, const Int4& rhs)
1740 {
1741  Int4 ret;
1742  for (size_t i = 0; i < 4; i++)
1743  {
1744  ret.vi[i] = lhs.vi[i] - rhs.vi[i];
1745  }
1746  return ret;
1747 }
1748 
1749 inline Int4 operator*(const Int4& lhs, const Int4& rhs)
1750 {
1751  Int4 ret;
1752  for (size_t i = 0; i < 4; i++)
1753  {
1754  ret.vi[i] = lhs.vi[i] * rhs.vi[i];
1755  }
1756  return ret;
1757 }
1758 
1759 inline Int4 operator*(const Int4& lhs, int32_t rhs)
1760 {
1761  Int4 ret;
1762  for (size_t i = 0; i < 4; i++)
1763  {
1764  ret.vi[i] = lhs.vi[i] * rhs;
1765  }
1766  return ret;
1767 }
1768 
1769 inline Int4 operator/(const Int4& lhs, const Int4& rhs)
1770 {
1771  Int4 ret;
1772  for (size_t i = 0; i < 4; i++)
1773  {
1774  ret.vi[i] = lhs.vi[i] / rhs.vi[i];
1775  }
1776  return ret;
1777 }
1778 
1779 inline Int4 operator/(const Int4& lhs, int32_t rhs)
1780 {
1781  Int4 ret;
1782  for (size_t i = 0; i < 4; i++)
1783  {
1784  ret.vi[i] = lhs.vi[i] / rhs;
1785  }
1786  return ret;
1787 }
1788 
1789 inline Int4 operator&(const Int4& lhs, const Int4& rhs)
1790 {
1791  Int4 ret;
1792  for (size_t i = 0; i < 4; i++)
1793  {
1794  ret.vu[i] = lhs.vu[i] & rhs.vu[i];
1795  }
1796  return ret;
1797 }
1798 
1799 inline Int4 operator|(const Int4& lhs, const Int4& rhs)
1800 {
1801  Int4 ret;
1802  for (size_t i = 0; i < 4; i++)
1803  {
1804  ret.vu[i] = lhs.vu[i] | rhs.vu[i];
1805  }
1806  return ret;
1807 }
1808 
1809 inline Int4 operator^(const Int4& lhs, const Int4& rhs)
1810 {
1811  Int4 ret;
1812  for (size_t i = 0; i < 4; i++)
1813  {
1814  ret.vu[i] = lhs.vu[i] ^ rhs.vu[i];
1815  }
1816  return ret;
1817 }
1818 
1819 inline bool operator==(const Int4& lhs, const Int4& rhs)
1820 {
1821  bool ret = true;
1822  for (size_t i = 0; i < 4; i++)
1823  {
1824  ret &= lhs.vi[i] == rhs.vi[i];
1825  }
1826  return ret;
1827 }
1828 
1829 inline bool operator!=(const Int4& lhs, const Int4& rhs)
1830 {
1831  bool ret = true;
1832  for (size_t i = 0; i < 4; i++)
1833  {
1834  ret &= lhs.vi[i] == rhs.vi[i];
1835  }
1836  return !ret;
1837 }
1838 
inline Int4 Int4::Load2(const void* mem)
{
	// Copy two 32-bit lanes from unaligned memory; lanes z,w stay uninitialized.
	Int4 ret;
	memcpy(ret.vi, mem, sizeof(float) * 2);
	// This code causes bugs in asmjs
	// ret.vi[0] = *((float*)mem + 0);
	// ret.vi[1] = *((float*)mem + 1);
	return ret;
}
1848 
inline void Int4::Store2(void* mem, const Int4& i)
{
	// Copy lanes x,y to unaligned memory (memcpy avoids aliasing/alignment UB).
	memcpy(mem, i.vi, sizeof(float) * 2);
	// This code causes bugs in asmjs
	// *((float*)mem + 0) = i.vi[0];
	// *((float*)mem + 1) = i.vi[1];
}
1856 
inline Int4 Int4::Load3(const void* mem)
{
	// Copy three 32-bit lanes from unaligned memory; lane w stays uninitialized.
	Int4 ret;
	memcpy(ret.vi, mem, sizeof(float) * 3);
	// This code causes bugs in asmjs
	// ret.vi[0] = *((float*)mem + 0);
	// ret.vi[1] = *((float*)mem + 1);
	// ret.vi[2] = *((float*)mem + 2);
	return ret;
}
1867 
inline void Int4::Store3(void* mem, const Int4& i)
{
	// Copy lanes x,y,z to unaligned memory.
	memcpy(mem, i.vi, sizeof(float) * 3);
	// This code causes bugs in asmjs
	// *((float*)mem + 0) = i.vi[0];
	// *((float*)mem + 1) = i.vi[1];
	// *((float*)mem + 2) = i.vi[2];
}
1876 
inline Int4 Int4::Load4(const void* mem)
{
	// Copy all four 32-bit lanes from unaligned memory.
	Int4 ret;
	memcpy(ret.vi, mem, sizeof(float) * 4);
	// This code causes bugs in emscripten
	// ret.vi[0] = *((float*)mem + 0);
	// ret.vi[1] = *((float*)mem + 1);
	// ret.vi[2] = *((float*)mem + 2);
	// ret.vi[3] = *((float*)mem + 3);
	return ret;
}
1888 
inline void Int4::Store4(void* mem, const Int4& i)
{
	// Copy all four lanes to unaligned memory.
	memcpy(mem, i.vi, sizeof(float) * 4);
	// This code causes bugs in asmjs
	// *((float*)mem + 0) = i.vi[0];
	// *((float*)mem + 1) = i.vi[1];
	// *((float*)mem + 2) = i.vi[2];
	// *((float*)mem + 3) = i.vi[3];
}
1898 
1899 inline Int4 Int4::SetZero()
1900 {
1901  Int4 ret;
1902  ret.vi[0] = 0;
1903  ret.vi[1] = 0;
1904  ret.vi[2] = 0;
1905  ret.vi[3] = 0;
1906  return ret;
1907 }
1908 
1909 inline Int4 Int4::Abs(const Int4& in)
1910 {
1911  Int4 ret;
1912  for (size_t i = 0; i < 4; i++)
1913  {
1914  ret.vi[i] = std::abs(in.vi[i]);
1915  }
1916  return ret;
1917 }
1918 
1919 inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
1920 {
1921  Int4 ret;
1922  for (size_t i = 0; i < 4; i++)
1923  {
1924  ret.vi[i] = (lhs.vi[i] < rhs.vi[i]) ? lhs.vi[i] : rhs.vi[i];
1925  }
1926  return ret;
1927 }
1928 
1929 inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
1930 {
1931  Int4 ret;
1932  for (size_t i = 0; i < 4; i++)
1933  {
1934  ret.vi[i] = (lhs.vi[i] > rhs.vi[i]) ? lhs.vi[i] : rhs.vi[i];
1935  }
1936  return ret;
1937 }
1938 
1939 inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
1940 {
1941  Int4 ret;
1942  for (size_t i = 0; i < 4; i++)
1943  {
1944  ret.vi[i] = a.vi[i] + b.vi[i] * c.vi[i];
1945 }
1946  return ret;
1947 }
1948 
1949 inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
1950 {
1951  Int4 ret;
1952  for (size_t i = 0; i < 4; i++)
1953  {
1954  ret.vi[i] = a.vi[i] - b.vi[i] * c.vi[i];
1955 }
1956  return ret;
1957 }
1958 
// Multiply lhs by lane LANE of rhs broadcast to all lanes.
template<size_t LANE>
Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return lhs * rhs.vi[LANE];
}
1965 
// Returns a + b * c[LANE] per lane.
template<size_t LANE>
Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return a + b * c.vi[LANE];
}
1972 
// Returns a - b * c[LANE] per lane.
template<size_t LANE>
Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return a - b * c.vi[LANE];
}
1979 
// Arbitrary lane permutation: result lane i takes source lane index{X,Y,Z,W}.
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Int4 Int4::Swizzle(const Int4& in)
{
	static_assert(indexX < 4, "indexX is must be less than 4.");
	static_assert(indexY < 4, "indexY is must be less than 4.");
	static_assert(indexZ < 4, "indexZ is must be less than 4.");
	static_assert(indexW < 4, "indexW is must be less than 4.");
	return Int4{in.vi[indexX], in.vi[indexY], in.vi[indexZ], in.vi[indexW]};
}
1989 
// Logical left shift of every lane by COUNT bits (unsigned view; the
// parameter is named `lhs` here but `in` in the declaration — same entity).
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = lhs.vu[i] << COUNT;
	}
	return ret;
}
2000 
// Logical (zero-filling) right shift of every lane by COUNT bits.
template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = lhs.vu[i] >> COUNT;
	}
	return ret;
}
2011 
// Arithmetic (sign-preserving) right shift of every lane by COUNT bits.
template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vi[i] = lhs.vi[i] >> COUNT;
	}
	return ret;
}
2022 
2023 template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
2024 Int4 Int4::Mask()
2025 {
2026  static_assert(X >= 2, "indexX is must be set 0 or 1.");
2027  static_assert(Y >= 2, "indexY is must be set 0 or 1.");
2028  static_assert(Z >= 2, "indexZ is must be set 0 or 1.");
2029  static_assert(W >= 2, "indexW is must be set 0 or 1.");
2030  Int4 ret;
2031  ret.vu[0] = 0xffffffff * X;
2032  ret.vu[1] = 0xffffffff * Y;
2033  ret.vu[2] = 0xffffffff * Z;
2034  ret.vu[3] = 0xffffffff * W;
2035  return ret;
2036 }
2037 
// Packs one bit per lane into a 4-bit value (bit i = lane i); assumes the
// input lanes are comparison masks (all-ones or all-zero).
inline uint32_t Int4::MoveMask(const Int4& in)
{
	return (in.vu[0] & 0x1) | (in.vu[1] & 0x2) | (in.vu[2] & 0x4) | (in.vu[3] & 0x8);
}
2042 
// Lane-wise ==; each lane becomes 0xffffffff (true) or 0 (false).
inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (lhs.vi[i] == rhs.vi[i]) ? 0xffffffff : 0;
	}
	return ret;
}
2052 
// Lane-wise != mask.
inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (lhs.vi[i] != rhs.vi[i]) ? 0xffffffff : 0;
	}
	return ret;
}
2062 
// Lane-wise signed < mask.
inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (lhs.vi[i] < rhs.vi[i]) ? 0xffffffff : 0;
	}
	return ret;
}
2072 
// Lane-wise signed <= mask.
inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (lhs.vi[i] <= rhs.vi[i]) ? 0xffffffff : 0;
	}
	return ret;
}
2082 
// Lane-wise signed > mask.
inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (lhs.vi[i] > rhs.vi[i]) ? 0xffffffff : 0;
	}
	return ret;
}
2092 
// Lane-wise signed >= mask.
inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (lhs.vi[i] >= rhs.vi[i]) ? 0xffffffff : 0;
	}
	return ret;
}
2102 
// Lane-wise |lhs - rhs| <= epsilon mask. NOTE(review): integer lanes against a
// float epsilon — with the default 1e-6f this is effectively exact equality,
// and the subtraction can overflow for extreme lane values; confirm intended.
inline Int4 Int4::NearEqual(const Int4& lhs, const Int4& rhs, float epsilon)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (std::abs(lhs.vi[i] - rhs.vi[i]) <= epsilon) ? 0xffffffff : 0;
	}
	return ret;
}
2112 
// Lane-wise |in| <= epsilon mask (see NearEqual note about the float epsilon).
inline Int4 Int4::IsZero(const Int4& in, float epsilon)
{
	Int4 ret;
	for (size_t i = 0; i < 4; i++)
	{
		ret.vu[i] = (std::abs(in.vi[i]) <= epsilon) ? 0xffffffff : 0;
	}
	return ret;
}
2122 
inline void Int4::Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3)
{
	// 4x4 transpose by swapping the six off-diagonal element pairs in place.
	std::swap(s0.vi[1], s1.vi[0]);
	std::swap(s0.vi[2], s2.vi[0]);
	std::swap(s0.vi[3], s3.vi[0]);
	std::swap(s1.vi[2], s2.vi[1]);
	std::swap(s2.vi[3], s3.vi[2]);
	std::swap(s1.vi[3], s3.vi[1]);
}
2132 
2133 } // namespace SIMD
2134 
2135 } // namespace Effekseer
2136 
2137 #endif
2138 
2139 #endif // __EFFEKSEER_SIMD_INT4_GEN_H__
2140 
2141 #ifndef __EFFEKSEER_SIMD_INT4_NEON_H__
2142 #define __EFFEKSEER_SIMD_INT4_NEON_H__
2143 
2144 
2145 #if defined(EFK_SIMD_NEON)
2146 
2147 namespace Effekseer
2148 {
2149 
2150 namespace SIMD
2151 {
2152 
2153 struct Float4;
2154 
/**
	@brief	4-lane 32-bit integer vector backed by a NEON int32x4_t register.

	Mirrors the SSE/generic Int4 interface. Comparison helpers produce
	per-lane bit masks (all-ones / all-zero).
*/
struct alignas(16) Int4
{
	int32x4_t s;

	Int4() = default;
	Int4(const Int4& rhs) = default;
	Int4(int32x4_t rhs) { s = rhs; }
	Int4(int32_t x, int32_t y, int32_t z, int32_t w) { const int32_t v[4] = {x, y, z, w}; s = vld1q_s32(v); }
	// Broadcast constructor: all four lanes take the same value.
	Int4(int32_t i) { s = vdupq_n_s32(i); }

	int32_t GetX() const { return vgetq_lane_s32(s, 0); }
	int32_t GetY() const { return vgetq_lane_s32(s, 1); }
	int32_t GetZ() const { return vgetq_lane_s32(s, 2); }
	int32_t GetW() const { return vgetq_lane_s32(s, 3); }

	void SetX(int32_t i) { s = vsetq_lane_s32(i, s, 0); }
	void SetY(int32_t i) { s = vsetq_lane_s32(i, s, 1); }
	void SetZ(int32_t i) { s = vsetq_lane_s32(i, s, 2); }
	void SetW(int32_t i) { s = vsetq_lane_s32(i, s, 3); }

	// Convert4f: numeric int->float conversion; Cast4f: bit reinterpretation.
	Float4 Convert4f() const;
	Float4 Cast4f() const;

	Int4& operator+=(const Int4& rhs);
	Int4& operator-=(const Int4& rhs);
	Int4& operator*=(const Int4& rhs);
	Int4& operator*=(int32_t rhs);
	Int4& operator/=(const Int4& rhs);
	Int4& operator/=(int32_t rhs);

	static Int4 Load2(const void* mem);
	static void Store2(void* mem, const Int4& i);
	static Int4 Load3(const void* mem);
	static void Store3(void* mem, const Int4& i);
	static Int4 Load4(const void* mem);
	static void Store4(void* mem, const Int4& i);

	static Int4 SetZero();
	static Int4 Abs(const Int4& in);
	static Int4 Min(const Int4& lhs, const Int4& rhs);
	static Int4 Max(const Int4& lhs, const Int4& rhs);
	// MulAdd = a + b * c, MulSub = a - b * c (lane-wise).
	static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
	static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);

	template<size_t LANE>
	static Int4 MulLane(const Int4& lhs, const Int4& rhs);
	template<size_t LANE>
	static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
	template<size_t LANE>
	static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
	template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
	static Int4 Swizzle(const Int4& v);

	// ShiftL/ShiftR are logical shifts; ShiftRA is arithmetic (sign-preserving).
	template <int COUNT>
	static Int4 ShiftL(const Int4& in);
	template <int COUNT>
	static Int4 ShiftR(const Int4& in);
	template <int COUNT>
	static Int4 ShiftRA(const Int4& in);

	template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
	static Int4 Mask();
	static uint32_t MoveMask(const Int4& in);
	static Int4 Equal(const Int4& lhs, const Int4& rhs);
	static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
	static Int4 LessThan(const Int4& lhs, const Int4& rhs);
	static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
	// NOTE(review): epsilon is int32_t but defaults to DefaultEpsilon (1e-6f),
	// which truncates to 0 — confirm the intended default tolerance.
	static Int4 NearEqual(const Int4& lhs, const Int4& rhs, int32_t epsilon = DefaultEpsilon);
	static Int4 IsZero(const Int4& in, int32_t epsilon = DefaultEpsilon);
	static void Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3);

private:
	static Int4 SwizzleYZX(const Int4& in);
	static Int4 SwizzleZXY(const Int4& in);
};
2236 
// Lane-wise addition of two Int4 vectors.
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
	return vaddq_s32(lhs.s, rhs.s);
}

// Lane-wise subtraction.
inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
	return vsubq_s32(lhs.s, rhs.s);
}

// Lane-wise multiplication.
inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
	return vmulq_s32(lhs.s, rhs.s);
}

// Multiplies every lane by the scalar rhs.
inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
	return vmulq_n_s32(lhs.s, rhs);
}
2256 
// Lane-wise integer division.
// NOTE: NEON has no integer vector-divide instruction (vdivq_* exists only
// for floating point on AArch64), so the division is always performed lane
// by lane on the scalar unit. The previous code guarded a non-existent
// vdivq_s32 behind the misspelled macro EFK_NEON_ARM64 (the real macro is
// EFK_SIMD_NEON_ARM64); fixing the spelling alone would have broken ARM64
// builds, so the dead, invalid branch is removed instead.
inline Int4 operator/(const Int4& lhs, const Int4& rhs)
{
	return Int4(
		lhs.GetX() / rhs.GetX(),
		lhs.GetY() / rhs.GetY(),
		lhs.GetZ() / rhs.GetZ(),
		lhs.GetW() / rhs.GetW());
}
2269 
// Divides every lane by the scalar rhs using true integer division.
// The previous implementation returned lhs * (1.0f / rhs); the float
// reciprocal was implicitly truncated to int32_t by operator*(Int4, int32_t),
// so any divisor with |rhs| > 1 produced an all-zero result.
inline Int4 operator/(const Int4& lhs, int32_t rhs)
{
	return Int4(
		lhs.GetX() / rhs,
		lhs.GetY() / rhs,
		lhs.GetZ() / rhs,
		lhs.GetW() / rhs);
}
2274 
// Bitwise AND of all 128 bits.
inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
	uint32x4_t lhsi = vreinterpretq_u32_s32(lhs.s);
	uint32x4_t rhsi = vreinterpretq_u32_s32(rhs.s);
	return vreinterpretq_s32_u32(vandq_u32(lhsi, rhsi));
}

// Bitwise OR of all 128 bits.
inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
	uint32x4_t lhsi = vreinterpretq_u32_s32(lhs.s);
	uint32x4_t rhsi = vreinterpretq_u32_s32(rhs.s);
	return vreinterpretq_s32_u32(vorrq_u32(lhsi, rhsi));
}

// Bitwise XOR of all 128 bits. Added for parity with the SSE2 backend,
// which already provides operator^; without it, code that compiles on x86
// fails on NEON builds.
inline Int4 operator^(const Int4& lhs, const Int4& rhs)
{
	uint32x4_t lhsi = vreinterpretq_u32_s32(lhs.s);
	uint32x4_t rhsi = vreinterpretq_u32_s32(rhs.s);
	return vreinterpretq_s32_u32(veorq_u32(lhsi, rhsi));
}

// True only when all four lanes compare equal (MoveMask yields 0xf).
inline bool operator==(const Int4& lhs, const Int4& rhs)
{
	return Int4::MoveMask(Int4::Equal(lhs, rhs)) == 0xf;
}

// True when at least one lane differs.
inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
	return Int4::MoveMask(Int4::Equal(lhs, rhs)) != 0xf;
}

// Compound assignments forward to the binary operators above.
inline Int4& Int4::operator+=(const Int4& rhs) { return *this = *this + rhs; }
inline Int4& Int4::operator-=(const Int4& rhs) { return *this = *this - rhs; }
inline Int4& Int4::operator*=(const Int4& rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator*=(int32_t rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator/=(const Int4& rhs) { return *this = *this / rhs; }
inline Int4& Int4::operator/=(int32_t rhs) { return *this = *this / rhs; }
2305 
// Loads two int32 values from mem into lanes X,Y; lanes Z,W are zeroed.
inline Int4 Int4::Load2(const void* mem)
{
	int32x2_t low = vld1_s32((const int32_t*)mem);
	// Fixed: was vdup_n_s32(0.0f) — a float literal passed to an integer intrinsic.
	int32x2_t high = vdup_n_s32(0);
	return vcombine_s32(low, high);
}
2312 
// Stores lanes X,Y (the low 64 bits) to mem.
inline void Int4::Store2(void* mem, const Int4& i)
{
	vst1_s32((int32_t*)mem, vget_low_s32(i.s));
}
2317 
// Loads three int32 values from mem into lanes X,Y,Z; lane W is zeroed.
inline Int4 Int4::Load3(const void* mem)
{
	int32x2_t low = vld1_s32((const int32_t*)mem);
	// Fixed: was vdup_n_s32(0.0f) — a float literal passed to an integer intrinsic.
	int32x2_t high = vld1_lane_s32((const int32_t*)mem + 2, vdup_n_s32(0), 0);
	return vcombine_s32(low, high);
}
2324 
// Stores lanes X,Y then lane Z (three int32 values) to mem.
inline void Int4::Store3(void* mem, const Int4& i)
{
	vst1_s32((int32_t*)mem, vget_low_s32(i.s));
	vst1q_lane_s32((int32_t*)mem + 2, i.s, 2);
}

// Loads four int32 values (unaligned allowed) from mem.
inline Int4 Int4::Load4(const void* mem)
{
	return vld1q_s32((const int32_t*)mem);
}

// Stores all four lanes (unaligned allowed) to mem.
inline void Int4::Store4(void* mem, const Int4& i)
{
	vst1q_s32((int32_t*)mem, i.s);
}
2340 
// Returns a vector with all four lanes set to 0.
inline Int4 Int4::SetZero()
{
	// Fixed: was vdupq_n_s32(0.0f) — a float literal passed to an integer intrinsic.
	return vdupq_n_s32(0);
}
2345 
// Lane-wise absolute value.
inline Int4 Int4::Abs(const Int4& in)
{
	return vabsq_s32(in.s);
}

// Lane-wise minimum.
inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
	return vminq_s32(lhs.s, rhs.s);
}

// Lane-wise maximum.
inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
	return vmaxq_s32(lhs.s, rhs.s);
}

// Returns a + b * c per lane (single vmla instruction).
inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
	return vmlaq_s32(a.s, b.s, c.s);
}

// Returns a - b * c per lane (single vmls instruction).
inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
	return vmlsq_s32(a.s, b.s, c.s);
}
2370 
// Multiplies every lane of lhs by lane LANE of rhs.
// The lane is first narrowed to a 64-bit half (vmulq_lane_s32 only accepts
// a 2-lane vector), then indexed with LANE & 1 within that half.
template<size_t LANE>
inline Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	int32x2_t rhs2 = (LANE < 2) ? vget_low_s32(rhs.s) : vget_high_s32(rhs.s);
	return vmulq_lane_s32(lhs.s, rhs2, LANE & 1);
}

// Returns a + b * c[LANE] per lane; same half/lane trick as MulLane.
template<size_t LANE>
inline Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	int32x2_t c2 = (LANE < 2) ? vget_low_s32(c.s) : vget_high_s32(c.s);
	return vmlaq_lane_s32(a.s, b.s, c2, LANE & 1);
}

// Returns a - b * c[LANE] per lane.
template<size_t LANE>
inline Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	int32x2_t c2 = (LANE < 2) ? vget_low_s32(c.s) : vget_high_s32(c.s);
	return vmlsq_lane_s32(a.s, b.s, c2, LANE & 1);
}
2394 
2395 //template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
2396 //inline Int4 Int4::Swizzle(const Int4& v)
2397 //{
2398 // static_assert(indexX < 4, "indexX is must be less than 4.");
2399 // static_assert(indexY < 4, "indexY is must be less than 4.");
2400 // static_assert(indexZ < 4, "indexZ is must be less than 4.");
2401 // static_assert(indexW < 4, "indexW is must be less than 4.");
2402 //}
2403 
// Logical left shift of every lane by COUNT bits.
// NOTE(review): the parameter is named `lhs` here but `in` in the struct
// declaration; harmless, but worth unifying.
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
	return vreinterpretq_s32_u32(vshlq_n_u32(vreinterpretq_u32_s32(lhs.s), COUNT));
}

// Logical (zero-filling) right shift of every lane by COUNT bits.
template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
	return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(lhs.s), COUNT));
}

// Arithmetic (sign-extending) right shift of every lane by COUNT bits.
template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
	return vshrq_n_s32(lhs.s, COUNT);
}
2421 
// Builds a lane mask: each template argument must be 0 (lane cleared)
// or 1 (lane set to 0xffffffff).
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Int4 Int4::Mask()
{
	// Fixed: the asserts previously demanded X >= 2 etc., which contradicts
	// their own message and rejects exactly the legal arguments (0 and 1).
	static_assert(X < 2, "indexX is must be set 0 or 1.");
	static_assert(Y < 2, "indexY is must be set 0 or 1.");
	static_assert(Z < 2, "indexZ is must be set 0 or 1.");
	static_assert(W < 2, "indexW is must be set 0 or 1.");
	const uint32_t in[4] = {0xffffffff * X, 0xffffffff * Y, 0xffffffff * Z, 0xffffffff * W};
	// vld1q_u32 yields uint32x4_t; reinterpret explicitly to the int32x4_t
	// stored by Int4 (NEON vector types do not convert implicitly on all
	// compilers).
	return vreinterpretq_s32_u32(vld1q_u32(in));
}
2432 
// Packs the sign/mask state of the four lanes into the low 4 bits of the
// result (bit n set when lane n is non-zero). Assumes each lane is a
// comparison result, i.e. all-ones (0xffffffff) or all-zeros: vmovn narrows
// each lane to 16 bits, then bit n of lane n is extracted via (u16[n] & 2^n).
inline uint32_t Int4::MoveMask(const Int4& in)
{
	uint16x4_t u16x4 = vmovn_u32(vreinterpretq_u32_s32(in.s));
	uint16_t u16[4];
	vst1_u16(u16, u16x4);
	return (u16[0] & 1) | (u16[1] & 2) | (u16[2] & 4) | (u16[3] & 8);
}
2440 
// All comparison results are per-lane masks: 0xffffffff where the predicate
// holds, 0 otherwise (feed them to MoveMask or bitwise ops).

inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
	return vreinterpretq_s32_u32(vceqq_s32(lhs.s, rhs.s));
}

// Complement of Equal (vmvn inverts the equality mask).
inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
	return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(lhs.s, rhs.s)));
}

inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
	return vreinterpretq_s32_u32(vcltq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
	return vreinterpretq_s32_u32(vcleq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
	return vreinterpretq_s32_u32(vcgtq_s32(lhs.s, rhs.s));
}

inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
	return vreinterpretq_s32_u32(vcgeq_s32(lhs.s, rhs.s));
}

// Mask of lanes where |lhs - rhs| <= epsilon.
// NOTE(review): the declared default (DefaultEpsilon = 1e-6f) truncates to
// 0 for this int32_t parameter, making the default identical to Equal —
// confirm this is intended.
inline Int4 Int4::NearEqual(const Int4& lhs, const Int4& rhs, int32_t epsilon)
{
	return LessEqual(Abs(lhs - rhs), Int4(epsilon));
}

// Mask of lanes where |in| <= epsilon (same default-truncation caveat).
inline Int4 Int4::IsZero(const Int4& in, int32_t epsilon)
{
	return LessEqual(Abs(in), Int4(epsilon));
}
2480 
// In-place 4x4 transpose of the matrix whose rows are s0..s3.
// Two rounds of vzipq interleave: first rows 0/2 and 1/3, then the halves
// of those results, yielding the columns of the original matrix.
inline void Int4::Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3)
{
	int32x4x2_t t0 = vzipq_s32(s0.s, s2.s);
	int32x4x2_t t1 = vzipq_s32(s1.s, s3.s);
	int32x4x2_t t2 = vzipq_s32(t0.val[0], t1.val[0]);
	int32x4x2_t t3 = vzipq_s32(t0.val[1], t1.val[1]);

	s0 = t2.val[0];
	s1 = t2.val[1];
	s2 = t3.val[0];
	s3 = t3.val[1];
}
2493 
// Returns (y, z, x, w): rotate left by one lane via vext, then restore the
// original w (which vext moved into lane 2) — wait, vext rotates in lane 3's
// old value; the set copies lane 3 (x after rotation... ) — see below.
// Concretely: vextq_s32(in, in, 1) = (y, z, w, x); overwriting lane 2 with
// lane 3 gives (y, z, x, x). Lane W ends up duplicating X.
inline Int4 Int4::SwizzleYZX(const Int4& in)
{
	int32x4_t ex = vextq_s32(in.s, in.s, 1);
	return vsetq_lane_s32(vgetq_lane_s32(ex, 3), ex, 2);
}

// Concretely: vextq_s32(in, in, 3) = (w, x, y, z); overwriting lane 0 with
// lane 3 gives (z, x, y, z), i.e. (z, x, y) in the XYZ lanes.
inline Int4 Int4::SwizzleZXY(const Int4& in)
{
	int32x4_t ex = vextq_s32(in.s, in.s, 3);
	return vsetq_lane_s32(vgetq_lane_s32(ex, 3), ex, 0);
}
2505 
2506 } // namespace SIMD
2507 
2508 } // namespace Effekseer
2509 
2510 #endif
2511 #endif // __EFFEKSEER_SIMD_INT4_NEON_H__
2512 
2513 #ifndef __EFFEKSEER_SIMD_INT4_SSE_H__
2514 #define __EFFEKSEER_SIMD_INT4_SSE_H__
2515 
2516 
2517 #if defined(EFK_SIMD_SSE2)
2518 
2519 namespace Effekseer
2520 {
2521 
2522 namespace SIMD
2523 {
2524 
2525 struct Float4;
2526 
// 16-byte-aligned 4-lane 32-bit integer vector backed by an SSE2 __m128i.
// Mirrors the NEON and generic Int4 implementations so the rest of the
// library can target a single interface.
struct alignas(16) Int4
{
	__m128i s;

	Int4() = default;
	Int4(const Int4& rhs) = default;
	Int4(__m128i rhs) { s = rhs; }
	// Bit-level reinterpretation of a float vector (no numeric conversion).
	Int4(__m128 rhs) { s = _mm_castps_si128(rhs); }
	Int4(int32_t x, int32_t y, int32_t z, int32_t w) { s = _mm_setr_epi32((int)x, (int)y, (int)z, (int)w); }
	// Broadcasts i into all four lanes.
	Int4(int32_t i) { s = _mm_set1_epi32((int)i); }

	// Lane accessors; Y/Z/W broadcast the wanted lane to lane 0 first.
	int32_t GetX() const { return _mm_cvtsi128_si32(s); }
	int32_t GetY() const { return _mm_cvtsi128_si32(Swizzle<1,1,1,1>(s).s); }
	int32_t GetZ() const { return _mm_cvtsi128_si32(Swizzle<2,2,2,2>(s).s); }
	int32_t GetW() const { return _mm_cvtsi128_si32(Swizzle<3,3,3,3>(s).s); }

	// Lane setters: swap the target lane into position 0, move_ss the new
	// value in, then swap back.
	void SetX(int32_t i) { s = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s), _mm_castsi128_ps(_mm_cvtsi32_si128(i)))); }
	void SetY(int32_t i) { s = Swizzle<1,0,2,3>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<1,0,2,3>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
	void SetZ(int32_t i) { s = Swizzle<2,1,0,3>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<2,1,0,3>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
	void SetW(int32_t i) { s = Swizzle<3,1,2,0>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<3,1,2,0>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }

	// Numeric conversion / bit reinterpretation to Float4 (defined in the
	// bridge section below).
	Float4 Convert4f() const;
	Float4 Cast4f() const;

	Int4& operator+=(const Int4& rhs);
	Int4& operator-=(const Int4& rhs);
	Int4& operator*=(const Int4& rhs);
	Int4& operator*=(int32_t rhs);
	Int4& operator/=(const Int4& rhs);
	Int4& operator/=(int32_t rhs);

	// Partial/full unaligned loads and stores (LoadN reads N int32 values).
	static Int4 Load2(const void* mem);
	static void Store2(void* mem, const Int4& i);
	static Int4 Load3(const void* mem);
	static void Store3(void* mem, const Int4& i);
	static Int4 Load4(const void* mem);
	static void Store4(void* mem, const Int4& i);

	static Int4 SetZero();
	static Int4 Abs(const Int4& in);
	static Int4 Min(const Int4& lhs, const Int4& rhs);
	static Int4 Max(const Int4& lhs, const Int4& rhs);
	static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
	static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);

	// Lane-broadcast variants of multiply / multiply-accumulate.
	template<size_t LANE>
	static Int4 MulLane(const Int4& lhs, const Int4& rhs);
	template<size_t LANE>
	static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
	template<size_t LANE>
	static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
	template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
	static Int4 Swizzle(const Int4& v);

	// Per-lane bit shifts by a compile-time count (L/R logical, RA arithmetic).
	template <int COUNT>
	static Int4 ShiftL(const Int4& in);
	template <int COUNT>
	static Int4 ShiftR(const Int4& in);
	template <int COUNT>
	static Int4 ShiftRA(const Int4& in);

	// Lane masks and comparisons (results are all-ones/all-zeros per lane).
	// NOTE(review): unlike the NEON backend, this struct declares no
	// NearEqual/IsZero/Transpose — confirm whether parity is expected.
	template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
	static Int4 Mask();
	static uint32_t MoveMask(const Int4& in);
	static Int4 Equal(const Int4& lhs, const Int4& rhs);
	static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
	static Int4 LessThan(const Int4& lhs, const Int4& rhs);
	static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
	static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
};
2602 
// Lane-wise addition.
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_add_epi32(lhs.s, rhs.s)};
}

// Lane-wise subtraction.
inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_sub_epi32(lhs.s, rhs.s)};
}

// Lane-wise multiplication. SSE2 lacks a 32-bit mullo, so the fallback
// multiplies even lanes and odd lanes separately with _mm_mul_epu32
// (which keeps only the low 32 bits we need) and re-interleaves them.
inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_mullo_epi32(lhs.s, rhs.s);
#else
	__m128i tmp1 = _mm_mul_epu32(lhs.s, rhs.s);
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(lhs.s, 4), _mm_srli_si128(rhs.s, 4));
	return _mm_unpacklo_epi32(
		_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
		_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
#endif
}

// Multiplies every lane by the scalar rhs (same SSE2 fallback scheme).
inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_mullo_epi32(lhs.s, _mm_set1_epi32(rhs));
#else
	__m128i tmp1 = _mm_mul_epu32(lhs.s, _mm_set1_epi32(rhs));
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(lhs.s, 4), _mm_set1_epi32(rhs));
	return _mm_unpacklo_epi32(
		_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
		_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
#endif
}
2638 
2639 inline Int4 operator/(const Int4& lhs, const Int4& rhs)
2640 {
2641  return Int4(
2642  lhs.GetX() * rhs.GetX(),
2643  lhs.GetY() * rhs.GetY(),
2644  lhs.GetZ() * rhs.GetZ(),
2645  lhs.GetW() * rhs.GetW());
2646 }
2647 
2648 inline Int4 operator/(const Int4& lhs, int32_t rhs)
2649 {
2650  return Int4(
2651  lhs.GetX() * rhs,
2652  lhs.GetY() * rhs,
2653  lhs.GetZ() * rhs,
2654  lhs.GetW() * rhs);
2655 }
2656 
// Bitwise AND of all 128 bits.
inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_and_si128(lhs.s, rhs.s)};
}

// Bitwise OR of all 128 bits.
inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_or_si128(lhs.s, rhs.s)};
}

// Bitwise XOR of all 128 bits.
inline Int4 operator^(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_xor_si128(lhs.s, rhs.s)};
}

// True only when all four lanes compare equal (MoveMask yields 0xf).
inline bool operator==(const Int4& lhs, const Int4& rhs)
{
	return Int4::MoveMask(Int4::Equal(lhs, rhs)) == 0xf;
}

// True when at least one lane differs.
inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
	return Int4::MoveMask(Int4::Equal(lhs, rhs)) != 0xf;
}

// Compound assignments forward to the binary operators above.
inline Int4& Int4::operator+=(const Int4& rhs) { return *this = *this + rhs; }
inline Int4& Int4::operator-=(const Int4& rhs) { return *this = *this - rhs; }
inline Int4& Int4::operator*=(const Int4& rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator*=(int32_t rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator/=(const Int4& rhs) { return *this = *this / rhs; }
inline Int4& Int4::operator/=(int32_t rhs) { return *this = *this / rhs; }
2688 
// Loads two int32 values into lanes X,Y; lanes Z,W become 0.
// Float load/unpack intrinsics are used purely as 32-bit moves — the bit
// patterns are never interpreted as floats.
inline Int4 Int4::Load2(const void* mem)
{
	__m128 x = _mm_load_ss((const float*)mem + 0);
	__m128 y = _mm_load_ss((const float*)mem + 1);
	return _mm_castps_si128(_mm_unpacklo_ps(x, y));
}

// Stores lanes X,Y (two int32 values) to mem.
inline void Int4::Store2(void* mem, const Int4& i)
{
	Int4 t1 = Swizzle<1,1,1,1>(i);
	_mm_store_ss((float*)mem + 0, _mm_castsi128_ps(i.s));
	_mm_store_ss((float*)mem + 1, _mm_castsi128_ps(t1.s));
}

// Loads three int32 values into lanes X,Y,Z; lane W becomes 0.
inline Int4 Int4::Load3(const void* mem)
{
	__m128 x = _mm_load_ss((const float*)mem + 0);
	__m128 y = _mm_load_ss((const float*)mem + 1);
	__m128 z = _mm_load_ss((const float*)mem + 2);
	__m128 xy = _mm_unpacklo_ps(x, y);
	return _mm_castps_si128(_mm_movelh_ps(xy, z));
}

// Stores lanes X,Y,Z (three int32 values) to mem.
inline void Int4::Store3(void* mem, const Int4& i)
{
	Int4 t1 = Swizzle<1,1,1,1>(i);
	Int4 t2 = Swizzle<2,2,2,2>(i);
	_mm_store_ss((float*)mem + 0, _mm_castsi128_ps(i.s));
	_mm_store_ss((float*)mem + 1, _mm_castsi128_ps(t1.s));
	_mm_store_ss((float*)mem + 2, _mm_castsi128_ps(t2.s));
}

// Unaligned 4-lane load.
inline Int4 Int4::Load4(const void* mem)
{
	return _mm_loadu_si128((const __m128i*)mem);
}

// Unaligned 4-lane store.
inline void Int4::Store4(void* mem, const Int4& i)
{
	_mm_storeu_si128((__m128i*)mem, i.s);
}

// Returns a vector with all lanes set to 0.
inline Int4 Int4::SetZero()
{
	return _mm_setzero_si128();
}
2735 
// Lane-wise absolute value. The SSE2 fallback uses the classic
// (x ^ sign) - sign trick, where sign is x >> 31 (all-ones for negatives).
inline Int4 Int4::Abs(const Int4& in)
{
#if defined(EFK_SIMD_SSSE3)
	return _mm_abs_epi32(in.s);
#else
	__m128i sign = _mm_srai_epi32(in.s, 31);
	return _mm_sub_epi32(_mm_xor_si128(in.s, sign), sign);
#endif
}

// Lane-wise minimum; the SSE2 fallback selects via a compare mask.
inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_min_epi32(lhs.s, rhs.s);
#else
	__m128i mask = _mm_cmplt_epi32(lhs.s, rhs.s);
	return _mm_or_si128(_mm_and_si128(mask, lhs.s), _mm_andnot_si128(mask, rhs.s));
#endif
}

// Lane-wise maximum; same masked-select fallback.
inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
	return _mm_max_epi32(lhs.s, rhs.s);
#else
	__m128i mask = _mm_cmpgt_epi32(lhs.s, rhs.s);
	return _mm_or_si128(_mm_and_si128(mask, lhs.s), _mm_andnot_si128(mask, rhs.s));
#endif
}
2765 
// Returns a + b * c per lane (no integer FMA in SSE; composed from ops above).
inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
	return a + b * c;
}

// Returns a - b * c per lane.
inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
	return a - b * c;
}

// Multiplies every lane of lhs by lane LANE of rhs (broadcast via Swizzle).
template<size_t LANE>
Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return lhs * Int4::Swizzle<LANE,LANE,LANE,LANE>(rhs);
}

// Returns a + b * c[LANE] per lane.
template<size_t LANE>
Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return a + b * Int4::Swizzle<LANE,LANE,LANE,LANE>(c);
}

// Returns a - b * c[LANE] per lane.
template<size_t LANE>
Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
	static_assert(LANE < 4, "LANE is must be less than 4.");
	return a - b * Int4::Swizzle<LANE,LANE,LANE,LANE>(c);
}

// Arbitrary lane permutation; result lane N takes source lane indexN.
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Int4 Int4::Swizzle(const Int4& v)
{
	static_assert(indexX < 4, "indexX is must be less than 4.");
	static_assert(indexY < 4, "indexY is must be less than 4.");
	static_assert(indexZ < 4, "indexZ is must be less than 4.");
	static_assert(indexW < 4, "indexW is must be less than 4.");
	return Int4{_mm_shuffle_epi32(v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX))};
}
2806 
// Logical left shift of every lane by COUNT bits.
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
	return _mm_slli_epi32(lhs.s, COUNT);
}

// Logical (zero-filling) right shift of every lane by COUNT bits.
template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
	return _mm_srli_epi32(lhs.s, COUNT);
}

// Arithmetic (sign-extending) right shift of every lane by COUNT bits.
template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
	return _mm_srai_epi32(lhs.s, COUNT);
}
2824 
2825 template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
2826 inline Int4 Int4::Mask()
2827 {
2828  static_assert(X >= 2, "indexX is must be set 0 or 1.");
2829  static_assert(Y >= 2, "indexY is must be set 0 or 1.");
2830  static_assert(Z >= 2, "indexZ is must be set 0 or 1.");
2831  static_assert(W >= 2, "indexW is must be set 0 or 1.");
2832  return _mm_setr_epi32(
2833  (int)(0xffffffff * X),
2834  (int)(0xffffffff * Y),
2835  (int)(0xffffffff * Z),
2836  (int)(0xffffffff * W));
2837 }
2838 
// Packs the sign bit of each lane into bits 0..3 of the result.
inline uint32_t Int4::MoveMask(const Int4& in)
{
	return (uint32_t)_mm_movemask_ps(_mm_castsi128_ps(in.s));
}

// All comparison results are per-lane masks: 0xffffffff where the predicate
// holds, 0 otherwise. SSE2 only provides eq/lt/gt, so the remaining
// predicates are built by inverting with andnot against all-ones.

inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_cmpeq_epi32(lhs.s, rhs.s)};
}

inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_andnot_si128(_mm_cmpeq_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}

inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_cmplt_epi32(lhs.s, rhs.s)};
}

inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_andnot_si128(_mm_cmpgt_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}

inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_cmpgt_epi32(lhs.s, rhs.s)};
}

inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
	return Int4{_mm_andnot_si128(_mm_cmplt_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}
2873 
2874 } // namespace SIMD
2875 
2876 } // namespace Effekseer
2877 
2878 #endif
2879 
2880 #endif // __EFFEKSEER_SIMD_INT4_SSE_H__
2881 
2882 #ifndef __EFFEKSEER_SIMD_BRIDGE_GEN_H__
2883 #define __EFFEKSEER_SIMD_BRIDGE_GEN_H__
2884 
2885 
2886 #if defined(EFK_SIMD_GEN)
2887 
2888 namespace Effekseer
2889 {
2890 
2891 namespace SIMD
2892 {
2893 
// Generic (non-SIMD) Float4 <-> Int4 bridges.
// Convert4i/Convert4f perform numeric conversion per lane;
// Cast4i/Cast4f reinterpret the raw bits via the vf/vu/vi union members.

inline Int4 Float4::Convert4i() const { return Int4((int32_t)vf[0], (int32_t)vf[1], (int32_t)vf[2], (int32_t)vf[3]); }

inline Int4 Float4::Cast4i() const { return Int4(vu[0], vu[1], vu[2], vu[3]); }

inline Float4 Int4::Convert4f() const { return Float4((float)vi[0], (float)vi[1], (float)vi[2], (float)vi[3]); }

inline Float4 Int4::Cast4f() const { return Float4(vf[0], vf[1], vf[2], vf[3]); }
2901 
2902 } // namespace SIMD
2903 
2904 } // namespace Effekseer
2905 
2906 #endif
2907 
2908 #endif // __EFFEKSEER_SIMD_BRIDGE_GEN_H__
2909 
2910 #ifndef __EFFEKSEER_SIMD_BRIDGE_NEON_H__
2911 #define __EFFEKSEER_SIMD_BRIDGE_NEON_H__
2912 
2913 
2914 #if defined(EFK_SIMD_NEON)
2915 
2916 namespace Effekseer
2917 {
2918 
2919 namespace SIMD
2920 {
2921 
// NEON Float4 <-> Int4 bridges: vcvtq converts numerically (float->int
// truncates toward zero), vreinterpretq reinterprets the raw bits.

inline Int4 Float4::Convert4i() const { return vcvtq_s32_f32(s); }

inline Int4 Float4::Cast4i() const { return vreinterpretq_s32_f32(s); }

inline Float4 Int4::Convert4f() const { return vcvtq_f32_s32(s); }

inline Float4 Int4::Cast4f() const { return vreinterpretq_f32_s32(s); }
2929 
2930 } // namespace SIMD
2931 
2932 } // namespace Effekseer
2933 
2934 #endif
2935 #endif // __EFFEKSEER_SIMD_BRIDGE_NEON_H__
2936 
2937 #ifndef __EFFEKSEER_SIMD_BRIDGE_SSE_H__
2938 #define __EFFEKSEER_SIMD_BRIDGE_SSE_H__
2939 
2940 
2941 #if defined(EFK_SIMD_SSE2)
2942 
2943 namespace Effekseer
2944 {
2945 
2946 namespace SIMD
2947 {
2948 
// SSE Float4 <-> Int4 bridges: _mm_cvttps_epi32 converts with truncation
// toward zero (matching C casts), _mm_cast* reinterprets the raw bits.

inline Int4 Float4::Convert4i() const { return _mm_cvttps_epi32(s); }

inline Int4 Float4::Cast4i() const { return _mm_castps_si128(s); }

inline Float4 Int4::Convert4f() const { return _mm_cvtepi32_ps(s); }

inline Float4 Int4::Cast4f() const { return _mm_castsi128_ps(s); }
2956 
2957 } // namespace SIMD
2958 
2959 } // namespace Effekseer
2960 
2961 #endif
2962 
2963 #endif // __EFFEKSEER_SIMD_BRIDGE_SSE_H__
2964 
2965 #ifndef __EFFEKSEER_SIMD_VEC2F_H__
2966 #define __EFFEKSEER_SIMD_VEC2F_H__
2967 
2968 
2969 namespace Effekseer
2970 {
2971 
2972 struct Vector2D;
2973 struct vector2d;
2974 
2975 namespace SIMD
2976 {
2977 
// 2D float vector stored in a 4-lane Float4; lane Z is kept at 0 and lane W
// at 1 by the value constructors (the extra lanes are ignored by the 2D
// operations, which mask comparisons down to the X/Y bits).
struct Vec2f
{
	Float4 s;

	explicit Vec2f() = default;
	Vec2f(const Vec2f& vec) = default;
	Vec2f(float x, float y): s(x, y, 0.0f, 1.0f) {}
	Vec2f(const std::array<float, 2>& v): s(v[0], v[1], 0.0f, 1.0f) {}
	// Wraps an existing Float4 without normalizing its Z/W lanes.
	Vec2f(const Float4& vec): s(vec) {}
	// Conversions from the public Effekseer vector types (defined elsewhere).
	Vec2f(const Vector2D& vec);
	Vec2f(const vector2d& vec);

	float GetX() const { return s.GetX(); }
	float GetY() const { return s.GetY(); }

	void SetX(float o) { s.SetX(o); }
	void SetY(float o) { s.SetY(o); }

	// Compound ops operate on all four lanes of the underlying Float4.
	Vec2f& operator+=(const Vec2f& o) { s += o.s; return *this; }
	Vec2f& operator-=(const Vec2f& o) { s -= o.s; return *this; }
	Vec2f& operator*=(const Vec2f& o) { s *= o.s; return *this; }
	Vec2f& operator*=(float o) { s *= o; return *this; }
	Vec2f& operator/=(const Vec2f& o) { s /= o.s; return *this; }
	Vec2f& operator/=(float o) { s /= o; return *this; }

	// Squared length / length of the X,Y components only.
	float LengthSq() const;
	float Length() const;
	// True when the length is below range.
	bool IsZero(float range = DefaultEpsilon) const;
	// Returns this vector scaled by the reciprocal square root of LengthSq().
	Vec2f Normalize() const;

	static Vec2f Load(const void* mem);
	static void Store(void* mem, const Vec2f& i);

	// Element-wise helpers forwarding to Float4 (act on all lanes).
	static Vec2f Sqrt(const Vec2f& i);
	static Vec2f Rsqrt(const Vec2f& i);
	static Vec2f Abs(const Vec2f& i);
	static Vec2f Min(const Vec2f& lhs, const Vec2f& rhs);
	static Vec2f Max(const Vec2f& lhs, const Vec2f& rhs);
	// Approximate equality of X and Y within epsilon.
	static bool Equal(const Vec2f& lhs, const Vec2f& rhs, float epsilon);
};
3018 
3019 inline Vec2f operator+(const Vec2f& lhs, const Vec2f& rhs)
3020 {
3021  return Vec2f{lhs.s + rhs.s};
3022 }
3023 
3024 inline Vec2f operator-(const Vec2f& lhs, const Vec2f& rhs)
3025 {
3026  return Vec2f{lhs.s - rhs.s};
3027 }
3028 
3029 inline Vec2f operator*(const Vec2f& lhs, const Vec2f& rhs)
3030 {
3031  return Vec2f{lhs.s * rhs.s};
3032 }
3033 
3034 inline Vec2f operator*(const Vec2f& lhs, float rhs)
3035 {
3036  return Vec2f{lhs.s * rhs};
3037 }
3038 
3039 inline Vec2f operator/(const Vec2f& lhs, const Vec2f& rhs)
3040 {
3041  return Vec2f{lhs.s / rhs.s};
3042 }
3043 
3044 inline Vec2f operator/(const Vec2f& lhs, float rhs)
3045 {
3046  return Vec2f{lhs.s / rhs};
3047 }
3048 
3049 inline bool operator==(const Vec2f& lhs, const Vec2f& rhs)
3050 {
3051  return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x03) == 0x3;
3052 }
3053 
3054 inline bool operator!=(const Vec2f& lhs, const Vec2f& rhs)
3055 {
3056  return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x03) != 0x3;
3057 }
3058 
// Reads two floats from mem (remaining lanes zeroed by Float4::Load2).
inline Vec2f Vec2f::Load(const void* mem)
{
	return Float4::Load2(mem);
}

// Writes the X,Y components (two floats) to mem.
inline void Vec2f::Store(void* mem, const Vec2f& i)
{
	Float4::Store2(mem, i.s);
}

// Element-wise square root (applied to all underlying lanes).
inline Vec2f Vec2f::Sqrt(const Vec2f& i)
{
	return Vec2f{Float4::Sqrt(i.s)};
}

// Element-wise reciprocal square root.
inline Vec2f Vec2f::Rsqrt(const Vec2f& i)
{
	return Vec2f{Float4::Rsqrt(i.s)};
}

// Element-wise absolute value.
inline Vec2f Vec2f::Abs(const Vec2f& i)
{
	return Vec2f{Float4::Abs(i.s)};
}

// Element-wise minimum.
inline Vec2f Vec2f::Min(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{Float4::Min(lhs.s, rhs.s)};
}

// Element-wise maximum.
inline Vec2f Vec2f::Max(const Vec2f& lhs, const Vec2f& rhs)
{
	return Vec2f{Float4::Max(lhs.s, rhs.s)};
}

// True when X and Y are each within epsilon of the other vector's.
inline bool Vec2f::Equal(const Vec2f& lhs, const Vec2f& rhs, float epsilon)
{
	return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0x3) == 0x3;
}
3098 
3099 inline float Vec2f::LengthSq() const
3100 {
3101  auto o = s * s;
3102  return o.GetX() + o.GetY();
3103 }
3104 
3105 inline float Vec2f::Length() const
3106 {
3107  return Effekseer::SIMD::Sqrt(LengthSq());
3108 }
3109 
3110 inline bool Vec2f::IsZero(float range) const
3111 {
3112  return LengthSq() < range * range;
3113 }
3114 
3115 inline Vec2f Vec2f::Normalize() const
3116 {
3117  return *this * Effekseer::SIMD::Rsqrt(LengthSq());
3118 }
3119 
3120 } // namespace SIMD
3121 
3122 } // namespace Effekseer
3123 
3124 #endif // __EFFEKSEER_VEC2F_H__
3125 
3126 #ifndef __EFFEKSEER_SIMD_VEC3F_H__
3127 #define __EFFEKSEER_SIMD_VEC3F_H__
3128 
3129 #include <functional>
3130 
3131 namespace Effekseer
3132 {
3133 
3134 struct Vector3D;
3135 struct vector3d;
3136 
3137 namespace SIMD
3138 {
3139 
3140 struct Mat43f;
3141 struct Mat44f;
3142 
// 3D float vector stored in a 4-lane Float4; the value constructor sets
// lane W to 1. The 3D operations mask comparisons down to the X/Y/Z bits,
// so W is ignored by equality and length computations.
struct Vec3f
{
	Float4 s;

	explicit Vec3f() = default;
	Vec3f(const Vec3f& vec) = default;
	Vec3f(float x, float y, float z)
		: s(x, y, z, 1.0f)
	{
	}
	// Wraps an existing Float4 without normalizing its W lane.
	Vec3f(const Float4& vec)
		: s(vec)
	{
	}
	// Conversions from the public Effekseer vector types (defined elsewhere).
	Vec3f(const Vector3D& vec);
	Vec3f(const vector3d& vec);
	Vec3f(const std::array<float, 3>& vec);

	float GetX() const
	{
		return s.GetX();
	}
	float GetY() const
	{
		return s.GetY();
	}
	float GetZ() const
	{
		return s.GetZ();
	}

	void SetX(float o)
	{
		s.SetX(o);
	}
	void SetY(float o)
	{
		s.SetY(o);
	}
	void SetZ(float o)
	{
		s.SetZ(o);
	}

	// Compound ops operate on all four lanes of the underlying Float4.
	Vec3f& operator+=(const Vec3f& o)
	{
		s += o.s;
		return *this;
	}
	Vec3f& operator-=(const Vec3f& o)
	{
		s -= o.s;
		return *this;
	}
	Vec3f& operator*=(const Vec3f& o)
	{
		s *= o.s;
		return *this;
	}
	Vec3f& operator*=(float o)
	{
		s *= o;
		return *this;
	}
	Vec3f& operator/=(const Vec3f& o)
	{
		s /= o.s;
		return *this;
	}
	Vec3f& operator/=(float o)
	{
		s /= o;
		return *this;
	}

	// Squared length / length of the X,Y,Z components.
	float GetSquaredLength() const;
	float GetLength() const;
	// True when all of X,Y,Z are within epsiron of zero.
	bool IsZero(float epsiron = DefaultEpsilon) const;
	// Normalize uses the fast reciprocal square root; NormalizePrecisely
	// divides by the exact square root.
	Vec3f Normalize() const;
	Vec3f NormalizePrecisely() const;
	Vec3f NormalizeFast() const;

	static Vec3f Load(const void* mem);
	static void Store(void* mem, const Vec3f& i);

	// Element-wise helpers forwarding to Float4 (act on all lanes).
	static Vec3f Sqrt(const Vec3f& i);
	static Vec3f Rsqrt(const Vec3f& i);
	static Vec3f Abs(const Vec3f& i);
	static Vec3f Min(const Vec3f& lhs, const Vec3f& rhs);
	static Vec3f Max(const Vec3f& lhs, const Vec3f& rhs);
	static float Dot(const Vec3f& lhs, const Vec3f& rhs);
	static Vec3f Cross(const Vec3f& lhs, const Vec3f& rhs);
	static bool Equal(const Vec3f& lhs, const Vec3f& rhs, float epsilon = DefaultEpsilon);
	// Matrix transforms (matrices defined elsewhere in this file).
	static Vec3f Transform(const Vec3f& lhs, const Mat43f& rhs);
	static Vec3f Transform(const Vec3f& lhs, const Mat44f& rhs);
};
3239 
3240 inline Vec3f operator-(const Vec3f& i)
3241 {
3242  return Vec3f(-i.GetX(), -i.GetY(), -i.GetZ());
3243 }
3244 
3245 inline Vec3f operator+(const Vec3f& lhs, const Vec3f& rhs)
3246 {
3247  return Vec3f{lhs.s + rhs.s};
3248 }
3249 
3250 inline Vec3f operator-(const Vec3f& lhs, const Vec3f& rhs)
3251 {
3252  return Vec3f{lhs.s - rhs.s};
3253 }
3254 
3255 inline Vec3f operator*(const Vec3f& lhs, const Vec3f& rhs)
3256 {
3257  return Vec3f{lhs.s * rhs.s};
3258 }
3259 
3260 inline Vec3f operator*(const Vec3f& lhs, float rhs)
3261 {
3262  return Vec3f{lhs.s * rhs};
3263 }
3264 
3265 inline Vec3f operator/(const Vec3f& lhs, const Vec3f& rhs)
3266 {
3267  return Vec3f{lhs.s / rhs.s};
3268 }
3269 
3270 inline Vec3f operator/(const Vec3f& lhs, float rhs)
3271 {
3272  return Vec3f{lhs.s / rhs};
3273 }
3274 
3275 inline bool operator==(const Vec3f& lhs, const Vec3f& rhs)
3276 {
3277  return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x07) == 0x7;
3278 }
3279 
3280 inline bool operator!=(const Vec3f& lhs, const Vec3f& rhs)
3281 {
3282  return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x07) != 0x7;
3283 }
3284 
// Reads three floats from mem (lane W zeroed by Float4::Load3).
inline Vec3f Vec3f::Load(const void* mem)
{
	return Float4::Load3(mem);
}

// Writes the X,Y,Z components (three floats) to mem.
inline void Vec3f::Store(void* mem, const Vec3f& i)
{
	Float4::Store3(mem, i.s);
}

// Element-wise square root (applied to all underlying lanes).
inline Vec3f Vec3f::Sqrt(const Vec3f& i)
{
	return Vec3f{Float4::Sqrt(i.s)};
}

// Element-wise reciprocal square root.
inline Vec3f Vec3f::Rsqrt(const Vec3f& i)
{
	return Vec3f{Float4::Rsqrt(i.s)};
}

// Element-wise absolute value.
inline Vec3f Vec3f::Abs(const Vec3f& i)
{
	return Vec3f{Float4::Abs(i.s)};
}

// Element-wise minimum.
inline Vec3f Vec3f::Min(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{Float4::Min(lhs.s, rhs.s)};
}

// Element-wise maximum.
inline Vec3f Vec3f::Max(const Vec3f& lhs, const Vec3f& rhs)
{
	return Vec3f{Float4::Max(lhs.s, rhs.s)};
}

// 3-component dot product (delegates to Float4::Dot3, reads lane X).
inline float Vec3f::Dot(const Vec3f& lhs, const Vec3f& rhs)
{
	return Float4::Dot3(lhs.s, rhs.s).GetX();
}

// 3-component cross product.
inline Vec3f Vec3f::Cross(const Vec3f& lhs, const Vec3f& rhs)
{
	return Float4::Cross3(lhs.s, rhs.s);
}

// True when X,Y,Z are each within epsilon of the other vector's.
inline bool Vec3f::Equal(const Vec3f& lhs, const Vec3f& rhs, float epsilon)
{
	return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0x7) == 0x7;
}

// Squared length of the X,Y,Z components.
inline float Vec3f::GetSquaredLength() const
{
	auto o = s * s;
	return o.GetX() + o.GetY() + o.GetZ();
}

// Euclidean length.
inline float Vec3f::GetLength() const
{
	return Effekseer::SIMD::Sqrt(GetSquaredLength());
}

// True when X,Y,Z are each within epsiron of zero (per-component test,
// not a length test — contrast with Vec2f::IsZero).
inline bool Vec3f::IsZero(float epsiron) const
{
	return (Float4::MoveMask(Float4::IsZero(s, epsiron)) & 0x7) == 0x7;
}

// Approximate normalization via reciprocal square root.
inline Vec3f Vec3f::Normalize() const
{
	return *this * Effekseer::SIMD::Rsqrt(GetSquaredLength());
}

// Exact normalization via true square root and division.
inline Vec3f Vec3f::NormalizePrecisely() const
{
	return *this / Effekseer::SIMD::Sqrt(GetSquaredLength());
}

// Same as Normalize (fast reciprocal-square-root path).
inline Vec3f Vec3f::NormalizeFast() const
{
	return *this * Effekseer::SIMD::Rsqrt(GetSquaredLength());
}
3365 
3366 } // namespace SIMD
3367 
3368 } // namespace Effekseer
3369 
3370 namespace std
3371 {
3372 
3373 template <>
3374 struct hash<Effekseer::SIMD::Vec3f>
3375 {
3376  size_t operator()(const Effekseer::SIMD::Vec3f& _Keyval) const noexcept
3377  {
3378  return std::hash<float>()(_Keyval.GetX()) + std::hash<float>()(_Keyval.GetY()) + std::hash<float>()(_Keyval.GetZ());
3379  }
3380 };
3381 
3382 } // namespace std
3383 
3384 #endif // __EFFEKSEER_SIMD_VEC3F_H__
3385 
3386 #ifndef __EFFEKSEER_SIMD_VEC4F_H__
3387 #define __EFFEKSEER_SIMD_VEC4F_H__
3388 
3389 
3390 namespace Effekseer
3391 {
3392 
3393 namespace SIMD
3394 {
3395 
3396 struct Vec4f
3397 {
3398  Float4 s;
3399 
3400  Vec4f() = default;
3401  Vec4f(const Vec4f& vec) = default;
3402  Vec4f(const Float4& vec): s(vec) {}
3403 
3404  float GetX() const { return s.GetX(); }
3405  float GetY() const { return s.GetY(); }
3406  float GetZ() const { return s.GetZ(); }
3407  float GetW() const { return s.GetW(); }
3408 
3409  void SetX(float o) { s.SetX(o); }
3410  void SetY(float o) { s.SetY(o); }
3411  void SetZ(float o) { s.SetZ(o); }
3412  void SetW(float o) { s.SetW(o); }
3413 
3414  Vec4f& operator+=(const Vec4f& o)
3415  {
3416  this->s = this->s + o.s;
3417  return *this;
3418  }
3419 
3420  Vec4f& operator-=(const Vec4f& o)
3421  {
3422  this->s = this->s - o.s;
3423  return *this;
3424  }
3425 
3426  Vec4f& operator*=(const Vec4f& o)
3427  {
3428  this->s = this->s * o.s;
3429  return *this;
3430  }
3431 
3432  Vec4f& operator/=(const Vec4f& o)
3433  {
3434  this->s = this->s / o.s;
3435  return *this;
3436  }
3437 
3438  static Vec4f Sqrt(const Vec4f& i);
3439  static Vec4f Rsqrt(const Vec4f& i);
3440  static Vec4f Abs(const Vec4f& i);
3441  static Vec4f Min(const Vec4f& lhs, const Vec4f& rhs);
3442  static Vec4f Max(const Vec4f& lhs, const Vec4f& rhs);
3443  static bool Equal(const Vec4f& lhs, const Vec4f& rhs, float epsilon);
3444  static Vec4f Transform(const Vec4f& lhs, const Mat43f& rhs);
3445  static Vec4f Transform(const Vec4f& lhs, const Mat44f& rhs);
3446 };
3447 
3448 inline Vec4f operator+(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s + rhs.s}; }
3449 
3450 inline Vec4f operator-(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s - rhs.s}; }
3451 
3452 inline Vec4f operator*(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s * rhs.s}; }
3453 
3454 inline Vec4f operator/(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s / rhs.s}; }
3455 
3456 inline bool operator==(const Vec4f& lhs, const Vec4f& rhs)
3457 {
3458  return Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) == 0xf;
3459 }
3460 
3461 inline bool operator!=(const Vec4f& lhs, const Vec4f& rhs)
3462 {
3463  return Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) != 0xf;
3464 }
3465 
3466 inline Vec4f Vec4f::Sqrt(const Vec4f& i)
3467 {
3468  return Vec4f{Float4::Sqrt(i.s)};
3469 }
3470 
3471 inline Vec4f Vec4f::Rsqrt(const Vec4f& i)
3472 {
3473  return Vec4f{Float4::Rsqrt(i.s)};
3474 }
3475 
3476 inline Vec4f Vec4f::Abs(const Vec4f& i)
3477 {
3478  return Vec4f{Float4::Abs(i.s)};
3479 }
3480 
3481 inline Vec4f Vec4f::Min(const Vec4f& lhs, const Vec4f& rhs)
3482 {
3483  return Vec4f{Float4::Min(lhs.s, rhs.s)};
3484 }
3485 
3486 inline Vec4f Vec4f::Max(const Vec4f& lhs, const Vec4f& rhs)
3487 {
3488  return Vec4f{Float4::Max(lhs.s, rhs.s)};
3489 }
3490 
3491 inline bool Vec4f::Equal(const Vec4f& lhs, const Vec4f& rhs, float epsilon)
3492 {
3493  return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0xf) == 0xf;
3494 }
3495 
3496 } // namespace SIMD
3497 
3498 } // namespace Effekseer
3499 
3500 #endif // __EFFEKSEER_SIMD_VEC4F_H__
3501 
3502 #ifndef __EFFEKSEER_SIMD_MAT43F_H__
3503 #define __EFFEKSEER_SIMD_MAT43F_H__
3504 
3505 
3506 namespace Effekseer
3507 {
3508 
3509 struct Matrix43;
3510 
3511 namespace SIMD
3512 {
3513 
// 4x3 matrix for affine transforms (rotation/scale in the upper 3x3 plus a
// translation row). Storage is column-wise across three SIMD registers:
// as set up by the scalar constructor, X holds column 1 (m11,m21,m31,m41),
// Y column 2 and Z column 3. The implicit 4th column is (0,0,0,1).
struct Mat43f
{
	Float4 X;
	Float4 Y;
	Float4 Z;

	Mat43f() = default;
	Mat43f(const Mat43f& rhs) = default;
	// Builds from row-major scalars m(row)(col); see the inline definition below.
	Mat43f(float m11, float m12, float m13,
		float m21, float m22, float m23,
		float m31, float m32, float m33,
		float m41, float m42, float m43);
	// Conversion from the non-SIMD Matrix43 (declared above, defined elsewhere).
	Mat43f(const Matrix43& mat);

	// Validity check — defined elsewhere; criterion not visible here.
	bool IsValid() const;

	// Upper 3x3 part (rotation/scale) — defined elsewhere.
	Mat43f Get3x3SubMatrix() const;

	// Scale / rotation / translation decomposition helpers — defined elsewhere.
	Vec3f GetScale() const;

	Mat43f GetRotation() const;

	Vec3f GetTranslation() const;

	void GetSRT(Vec3f& s, Mat43f& r, Vec3f& t) const;

	void SetTranslation(const Vec3f& t);

	// Matrix product / uniform element scale; defined inline below.
	Mat43f& operator*=(const Mat43f& rhs);

	Mat43f& operator*=(float rhs);

	static const Mat43f Identity;

	// Approximate element-wise comparison — defined elsewhere.
	static bool Equal(const Mat43f& lhs, const Mat43f& rhs, float epsilon = DefaultEpsilon);

	// Factory functions for common transforms — defined elsewhere.
	static Mat43f SRT(const Vec3f& s, const Mat43f& r, const Vec3f& t);

	static Mat43f Scaling(float x, float y, float z);

	static Mat43f Scaling(const Vec3f& scale);

	static Mat43f RotationX(float angle);

	static Mat43f RotationY(float angle);

	static Mat43f RotationZ(float angle);

	static Mat43f RotationXYZ(float rx, float ry, float rz);

	static Mat43f RotationZXY(float rz, float rx, float ry);

	static Mat43f RotationAxis(const Vec3f& axis, float angle);

	static Mat43f RotationAxis(const Vec3f& axis, float s, float c);

	static Mat43f Translation(float x, float y, float z);

	static Mat43f Translation(const Vec3f& pos);
};
3574 
// Builds the matrix from row-major scalars. Note the column-wise storage:
// register X receives column 1 (m11,m21,m31,m41), Y column 2, Z column 3.
inline Mat43f::Mat43f(
	float m11, float m12, float m13,
	float m21, float m22, float m23,
	float m31, float m32, float m33,
	float m41, float m42, float m43)
	: X(m11, m21, m31, m41)
	, Y(m12, m22, m32, m42)
	, Z(m13, m23, m33, m43)
{
}
3585 
3586 inline bool operator==(const Mat43f& lhs, const Mat43f& rhs)
3587 {
3588  return lhs.X == rhs.X && lhs.Y == rhs.Y && lhs.Z == rhs.Z;
3589 }
3590 
3591 inline bool operator!=(const Mat43f& lhs, const Mat43f& rhs)
3592 {
3593  return lhs.X != rhs.X && lhs.Y != rhs.Y && lhs.Z != rhs.Z;
3594 }
3595 
// Matrix product lhs * rhs for column-stored 4x3 matrices.
// For each column of rhs: the mask keeps only its 4th lane (so the implicit
// (0,0,0,1) column of lhs contributes the translation term unchanged), then
// MulAddLane<i> accumulates lhs column i scaled by lane i of the rhs column.
inline Mat43f operator*(const Mat43f& lhs, const Mat43f& rhs)
{
	const Float4 mask = Float4::SetUInt(0, 0, 0, 0xffffffff);

	Mat43f res;
	res.X = mask & rhs.X;
	res.X = Float4::MulAddLane<0>(res.X, lhs.X, rhs.X);
	res.X = Float4::MulAddLane<1>(res.X, lhs.Y, rhs.X);
	res.X = Float4::MulAddLane<2>(res.X, lhs.Z, rhs.X);

	res.Y = mask & rhs.Y;
	res.Y = Float4::MulAddLane<0>(res.Y, lhs.X, rhs.Y);
	res.Y = Float4::MulAddLane<1>(res.Y, lhs.Y, rhs.Y);
	res.Y = Float4::MulAddLane<2>(res.Y, lhs.Z, rhs.Y);

	res.Z = mask & rhs.Z;
	res.Z = Float4::MulAddLane<0>(res.Z, lhs.X, rhs.Z);
	res.Z = Float4::MulAddLane<1>(res.Z, lhs.Y, rhs.Z);
	res.Z = Float4::MulAddLane<2>(res.Z, lhs.Z, rhs.Z);
	return res;
}
3617 
// Transforms point lhs by the affine matrix rhs, translation included.
// The transpose converts the three column registers (plus a zero register)
// into row registers: s0..s2 become the first three rows and s3 receives the
// translation row (m41,m42,m43,0). The result starts from the translation
// and accumulates each row scaled by the matching component of lhs.
inline Vec3f Vec3f::Transform(const Vec3f& lhs, const Mat43f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = Float4::SetZero();
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulAddLane<0>(s3, s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	return Vec3f{res};
}
3631 
// Transforms a 4-component vector by the affine matrix rhs.
// s3 is seeded with (0,0,0,1) so that after the transpose it becomes the
// translation row (m41,m42,m43,1); lhs.w then scales the translation, i.e.
// w = 1 gives a point transform and w = 0 a direction transform.
inline Vec4f Vec4f::Transform(const Vec4f& lhs, const Mat43f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = Float4(0.0f, 0.0f, 0.0f, 1.0f);
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulLane<0>(s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	res = Float4::MulAddLane<3>(res, s3, lhs.s);
	return res;
}
3646 
3647 inline Mat43f& Mat43f::operator*=(const Mat43f& rhs)
3648 {
3649  *this = *this * rhs;
3650  return *this;
3651 }
3652 
3653 inline Mat43f& Mat43f::operator*=(float rhs)
3654 {
3655  X *= rhs;
3656  Y *= rhs;
3657  Z *= rhs;
3658  return *this;
3659 }
3660 
3661 } // namespace SIMD
3662 
3663 } // namespace Effekseer
3664 
3665 #endif // __EFFEKSEER_SIMD_MAT43F_H__
3666 
3667 #ifndef __EFFEKSEER_SIMD_MAT44F_H__
3668 #define __EFFEKSEER_SIMD_MAT44F_H__
3669 
3670 
3671 namespace Effekseer
3672 {
3673 
3674 struct Matrix44;
3675 
3676 namespace SIMD
3677 {
3678 
// 4x4 matrix stored column-wise across four SIMD registers: as set up by the
// scalar constructor, X holds column 1 (m11,m21,m31,m41), Y column 2,
// Z column 3 and W column 4.
struct Mat44f
{
	Float4 X;
	Float4 Y;
	Float4 Z;
	Float4 W;

	Mat44f() = default;
	Mat44f(const Mat44f& rhs) = default;
	// Builds from row-major scalars m(row)(col); see the inline definition below.
	Mat44f(float m11, float m12, float m13, float m14,
		float m21, float m22, float m23, float m24,
		float m31, float m32, float m33, float m34,
		float m41, float m42, float m43, float m44);
	// Promotes a 4x3 affine matrix by appending the (0,0,0,1) column.
	Mat44f(const Mat43f& mat);
	// Conversion from the non-SIMD Matrix44 (declared above, defined elsewhere).
	Mat44f(const Matrix44& mat);

	// Validity check — defined elsewhere; criterion not visible here.
	bool IsValid() const;

	// Scale / rotation / translation decomposition helpers — defined elsewhere.
	Vec3f GetScale() const;

	Mat44f GetRotation() const;

	Vec3f GetTranslation() const;

	void GetSRT(Vec3f& s, Mat44f& r, Vec3f& t) const;

	void SetTranslation(const Vec3f& t);

	Mat44f Transpose() const;

	// Matrix product / uniform element scale; defined inline below.
	Mat44f& operator*=(const Mat44f& rhs);

	Mat44f& operator*=(float rhs);

	static const Mat44f Identity;

	// Approximate element-wise comparison — defined elsewhere.
	static bool Equal(const Mat44f& lhs, const Mat44f& rhs, float epsilon = DefaultEpsilon);

	// Factory functions for common transforms — defined elsewhere.
	static Mat44f SRT(const Vec3f& s, const Mat44f& r, const Vec3f& t);

	static Mat44f Scaling(float x, float y, float z);

	static Mat44f Scaling(const Vec3f& scale);

	static Mat44f RotationX(float angle);

	static Mat44f RotationY(float angle);

	static Mat44f RotationZ(float angle);

	static Mat44f RotationXYZ(float rx, float ry, float rz);

	static Mat44f RotationZXY(float rz, float rx, float ry);

	static Mat44f RotationAxis(const Vec3f& axis, float angle);

	static Mat44f RotationAxis(const Vec3f& axis, float s, float c);

	static Mat44f Translation(float x, float y, float z);

	static Mat44f Translation(const Vec3f& pos);
};
3741 
// Builds the matrix from row-major scalars. Note the column-wise storage:
// register X receives column 1 (m11,m21,m31,m41), Y column 2, Z column 3,
// W column 4.
inline Mat44f::Mat44f(
	float m11, float m12, float m13, float m14,
	float m21, float m22, float m23, float m24,
	float m31, float m32, float m33, float m34,
	float m41, float m42, float m43, float m44)
	: X(m11, m21, m31, m41)
	, Y(m12, m22, m32, m42)
	, Z(m13, m23, m33, m43)
	, W(m14, m24, m34, m44)
{
}

// Promotes a 4x3 affine matrix: copies its three columns and supplies the
// implicit (0,0,0,1) fourth column explicitly.
inline Mat44f::Mat44f(const Mat43f& mat)
	: X(mat.X)
	, Y(mat.Y)
	, Z(mat.Z)
	, W(0.0f, 0.0f, 0.0f, 1.0f)
{
}
3761 
3762 inline bool operator==(const Mat44f& lhs, const Mat44f& rhs)
3763 {
3764  return lhs.X == rhs.X && lhs.Y == rhs.Y && lhs.Z == rhs.Z && lhs.W == rhs.W;
3765 }
3766 
3767 inline bool operator!=(const Mat44f& lhs, const Mat44f& rhs)
3768 {
3769  return lhs.X != rhs.X && lhs.Y != rhs.Y && lhs.Z != rhs.Z && lhs.W != rhs.W;
3770 }
3771 
// Matrix product lhs * rhs for column-stored 4x4 matrices.
// Each result column is a linear combination of lhs's columns, weighted by
// the lanes of the corresponding rhs column: MulAddLane<i> adds
// lhs column i scaled by lane i of the rhs column.
inline Mat44f operator*(const Mat44f& lhs, const Mat44f& rhs)
{
	Mat44f res;
	res.X = Float4::MulLane<0>(lhs.X, rhs.X);
	res.X = Float4::MulAddLane<1>(res.X, lhs.Y, rhs.X);
	res.X = Float4::MulAddLane<2>(res.X, lhs.Z, rhs.X);
	res.X = Float4::MulAddLane<3>(res.X, lhs.W, rhs.X);

	res.Y = Float4::MulLane<0>(lhs.X, rhs.Y);
	res.Y = Float4::MulAddLane<1>(res.Y, lhs.Y, rhs.Y);
	res.Y = Float4::MulAddLane<2>(res.Y, lhs.Z, rhs.Y);
	res.Y = Float4::MulAddLane<3>(res.Y, lhs.W, rhs.Y);

	res.Z = Float4::MulLane<0>(lhs.X, rhs.Z);
	res.Z = Float4::MulAddLane<1>(res.Z, lhs.Y, rhs.Z);
	res.Z = Float4::MulAddLane<2>(res.Z, lhs.Z, rhs.Z);
	res.Z = Float4::MulAddLane<3>(res.Z, lhs.W, rhs.Z);

	res.W = Float4::MulLane<0>(lhs.X, rhs.W);
	res.W = Float4::MulAddLane<1>(res.W, lhs.Y, rhs.W);
	res.W = Float4::MulAddLane<2>(res.W, lhs.Z, rhs.W);
	res.W = Float4::MulAddLane<3>(res.W, lhs.W, rhs.W);
	return res;
}
3796 
// Transforms point lhs by the 4x4 matrix rhs with an implied w = 1:
// after transposing the columns into rows, s3 holds row 4 and is used as the
// starting accumulator, i.e. the translation is always applied. The w lane
// of the result is discarded by returning a Vec3f (no perspective divide).
inline Vec3f Vec3f::Transform(const Vec3f& lhs, const Mat44f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = rhs.W;
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulAddLane<0>(s3, s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	return Vec3f{res};
}
3810 
// Full 4-component transform of lhs by the 4x4 matrix rhs.
// The columns are transposed into rows s0..s3, then each row is scaled by
// the matching component of lhs and accumulated.
inline Vec4f Vec4f::Transform(const Vec4f& lhs, const Mat44f& rhs)
{
	Float4 s0 = rhs.X;
	Float4 s1 = rhs.Y;
	Float4 s2 = rhs.Z;
	Float4 s3 = rhs.W;
	Float4::Transpose(s0, s1, s2, s3);

	Float4 res = Float4::MulLane<0>(s0, lhs.s);
	res = Float4::MulAddLane<1>(res, s1, lhs.s);
	res = Float4::MulAddLane<2>(res, s2, lhs.s);
	res = Float4::MulAddLane<3>(res, s3, lhs.s);
	return res;
}
3825 
3826 inline Mat44f& Mat44f::operator*=(const Mat44f& rhs)
3827 {
3828  *this = *this * rhs;
3829  return *this;
3830 }
3831 
3832 inline Mat44f& Mat44f::operator*=(float rhs)
3833 {
3834  X *= rhs;
3835  Y *= rhs;
3836  Z *= rhs;
3837  W *= rhs;
3838  return *this;
3839 }
3840 
3841 } // namespace SIMD
3842 
3843 } // namespace Effekseer
3844 
3845 #endif // __EFFEKSEER_VEC4F_H__
3846 
3847 #ifndef __EFFEKSEER_SIMD_QUATERNIONF_H__
3848 #define __EFFEKSEER_SIMD_QUATERNIONF_H__
3849 
3850 
3851 namespace Effekseer
3852 {
3853 namespace SIMD
3854 {
3855 
3857 {
3858  Float4 s;
3859 
	Quaternionf() = default;

	// Constructs from explicit components; w is the scalar part.
	Quaternionf(float x, float y, float z, float w)
		: s(x, y, z, w)
	{
	}

	// Wraps an existing SIMD register as a quaternion.
	Quaternionf(Float4 s)
		: s(s)
	{
	}

	// Component accessors, forwarded to the register lanes.
	float GetX() const
	{
		return s.GetX();
	}
	float GetY() const
	{
		return s.GetY();
	}
	float GetZ() const
	{
		return s.GetZ();
	}
	float GetW() const
	{
		return s.GetW();
	}

	void SetX(float o)
	{
		s.SetX(o);
	}
	void SetY(float o)
	{
		s.SetY(o);
	}
	void SetZ(float o)
	{
		s.SetZ(o);
	}
	void SetW(float o)
	{
		s.SetW(o);
	}
3905 
	// Returns the conjugate (vector part negated). This equals the true
	// inverse only for unit-length quaternions — no normalization is done
	// here, so callers are presumably expected to keep q normalized (confirm).
	Quaternionf Inverse() const
	{
		return Quaternionf{-GetX(), -GetY(), -GetZ(), GetW()};
	}
3910 
	// Extracts a rotation quaternion from the upper 3x3 of mat using the
	// classic branch-on-largest-diagonal method (Shepperd): the branch picks
	// whichever of the trace / diagonal elements is largest so the sqrt
	// argument is largest and the divisions stay well-conditioned.
	// Assumes mat is a pure rotation (orthonormal 3x3) — TODO confirm callers.
	static Quaternionf FromMatrix(const Mat44f& mat)
	{
		// Trace of the 3x3 rotation part.
		const auto tr = mat.X.GetX() + mat.Y.GetY() + mat.Z.GetZ();

		if (tr > 0)
		{
			const auto qw = sqrtf(tr + 1.0f) / 2.0f;
			const auto qx = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qw);
			const auto qy = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qw);
			const auto qz = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qw);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.X.GetX() > mat.Y.GetY() && mat.X.GetX() > mat.Z.GetZ())
		{
			// First diagonal element dominates: solve for qx first.
			const auto qx = sqrtf(mat.X.GetX() - mat.Y.GetY() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qx);
			const auto qy = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qx);
			const auto qz = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qx);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.Y.GetY() > mat.Z.GetZ())
		{
			// Second diagonal element dominates: solve for qy first.
			const auto qy = sqrtf(mat.Y.GetY() - mat.X.GetX() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qy);
			const auto qx = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qy);
			const auto qz = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qy);
			return Quaternionf{qx, qy, qz, qw};
		}
		else
		{
			// Third diagonal element dominates: solve for qz first.
			const auto qz = sqrtf(mat.Z.GetZ() - mat.X.GetX() - mat.Y.GetY() + 1.0f) / 2.0f;
			const auto qw = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qz);
			const auto qx = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qz);
			const auto qy = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qz);
			return Quaternionf{qx, qy, qz, qw};
		}
	}
3948 
	// Mat43f overload of FromMatrix; identical algorithm to the Mat44f
	// version above (branch-on-largest-diagonal extraction of a rotation
	// quaternion from the 3x3 part). Assumes a pure rotation — TODO confirm.
	static Quaternionf FromMatrix(const Mat43f& mat)
	{
		// Trace of the 3x3 rotation part.
		const auto tr = mat.X.GetX() + mat.Y.GetY() + mat.Z.GetZ();

		if (tr > 0)
		{
			const auto qw = sqrtf(tr + 1.0f) / 2.0f;
			const auto qx = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qw);
			const auto qy = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qw);
			const auto qz = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qw);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.X.GetX() > mat.Y.GetY() && mat.X.GetX() > mat.Z.GetZ())
		{
			// First diagonal element dominates: solve for qx first.
			const auto qx = sqrtf(mat.X.GetX() - mat.Y.GetY() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qx);
			const auto qy = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qx);
			const auto qz = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qx);
			return Quaternionf{qx, qy, qz, qw};
		}
		else if (mat.Y.GetY() > mat.Z.GetZ())
		{
			// Second diagonal element dominates: solve for qy first.
			const auto qy = sqrtf(mat.Y.GetY() - mat.X.GetX() - mat.Z.GetZ() + 1.0f) / 2.0f;
			const auto qw = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qy);
			const auto qx = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qy);
			const auto qz = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qy);
			return Quaternionf{qx, qy, qz, qw};
		}
		else
		{
			// Third diagonal element dominates: solve for qz first.
			const auto qz = sqrtf(mat.Z.GetZ() - mat.X.GetX() - mat.Y.GetY() + 1.0f) / 2.0f;
			const auto qw = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qz);
			const auto qx = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qz);
			const auto qy = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qz);
			return Quaternionf{qx, qy, qz, qw};
		}
	}
3986 
	// Expands the quaternion into an equivalent 3x3 rotation matrix stored in
	// a Mat43f (translation row implicitly zero via the register layout).
	// Uses the standard quadratic expansion; assumes the quaternion is unit
	// length (no normalization is applied here) — TODO confirm callers.
	Mat43f ToMatrix() const
	{
		const auto qx = GetX();
		const auto qy = GetY();
		const auto qz = GetZ();
		const auto qw = GetW();

		// Squared components.
		const auto qxx = qx * qx;
		const auto qyy = qy * qy;
		const auto qzz = qz * qz;
		const auto qww = qw * qw;

		// Cross terms between vector components.
		const auto qxy = qx * qy;
		const auto qxz = qx * qz;
		const auto qyz = qy * qz;

		// Cross terms with the scalar part.
		const auto qxw = qx * qw;
		const auto qyw = qy * qw;
		const auto qzw = qz * qw;

		Mat43f ret;

		ret.X = SIMD::Float4{(qxx - qyy - qzz + qww), 2.0f * (qxy - qzw), 2.0f * (qxz + qyw), 0};
		ret.Y = SIMD::Float4{2.0f * (qxy + qzw), (-qxx + qyy - qzz + qww), 2.0f * (qyz - qxw), 0};
		ret.Z = SIMD::Float4{2.0f * (qxz - qyw), 2.0f * (qyz + qxw), (-qxx - qyy + qzz + qww), 0};

		return ret;
	}
4015 
	// Spherical linear interpolation from q1 to q2 by factor t.
	static Quaternionf Slerp(const Quaternionf& q1, const Quaternionf& q2, float t)
	{
		// 4D dot product of the two quaternions (cosine of the angle between them).
		const auto qq = q1.s * q2.s;
		auto cosa = qq.GetX() + qq.GetY() + qq.GetZ() + qq.GetW();

		// Negative dot product: flip q2 (same rotation) so the interpolation
		// follows the shorter arc.
		if (cosa < 0.0f)
		{
			return Slerp(q1, Quaternionf{-q2.GetX(), -q2.GetY(), -q2.GetZ(), -q2.GetW()}, t);
		}

		// Clamp so rounding above 1 cannot push acos out of its domain.
		cosa = Min(1.0f, cosa);

		const auto alpha = acos(cosa);
		const auto smallValue = 0.00001f;
		// Nearly identical orientations: sin(alpha) ~ 0 would make the
		// weights below divide by almost zero, so just return q1.
		if (alpha < smallValue)
		{
			return q1;
		}

		// Standard slerp weights: sin((1-t)a)/sin(a) for q1, sin(ta)/sin(a) for q2.
		return Quaternionf{q1.s * sin((1.0f - t) * alpha) / sin(alpha) + q2.s * sin(t * alpha) / sin(alpha)};
	}
4037 
	// Rotates vector v by quaternion q by expanding q into its rotation-matrix
	// form (same expansion as ToMatrix) and applying it to v. Assumes q is
	// unit length — no normalization is performed here (TODO confirm callers).
	static Vec3f Transform(const Vec3f& v, const Quaternionf& q)
	{
		const auto qx = q.GetX();
		const auto qy = q.GetY();
		const auto qz = q.GetZ();
		const auto qw = q.GetW();

		// Squared components.
		const auto qxx = qx * qx;
		const auto qyy = qy * qy;
		const auto qzz = qz * qz;
		const auto qww = qw * qw;

		// Cross terms between vector components.
		const auto qxy = qx * qy;
		const auto qxz = qx * qz;
		const auto qyz = qy * qz;

		// Cross terms with the scalar part.
		const auto qxw = qx * qw;
		const auto qyw = qy * qw;
		const auto qzw = qz * qw;

		// Rows of the equivalent rotation matrix dotted with v.
		const auto x = (qxx - qyy - qzz + qww) * v.GetX() + 2.0f * (qxy - qzw) * v.GetY() + 2.0f * (qxz + qyw) * v.GetZ();
		const auto y = 2.0f * (qxy + qzw) * v.GetX() + (-qxx + qyy - qzz + qww) * v.GetY() + 2.0f * (qyz - qxw) * v.GetZ();
		const auto z = 2.0f * (qxz - qyw) * v.GetX() + 2.0f * (qyz + qxw) * v.GetY() + (-qxx - qyy + qzz + qww) * v.GetZ();

		return Vec3f{x, y, z};
	}
4064 };
4065 
4066 inline Quaternionf operator*(const Quaternionf& lhs, const Quaternionf& rhs)
4067 {
4068  // TODO optimize
4069  auto x = lhs.GetW() * rhs.GetX() - lhs.GetZ() * rhs.GetY() + lhs.GetY() * rhs.GetZ() + lhs.GetX() * rhs.GetW();
4070  auto y = lhs.GetZ() * rhs.GetX() + lhs.GetW() * rhs.GetY() - lhs.GetX() * rhs.GetZ() + lhs.GetY() * rhs.GetW();
4071  auto z = -lhs.GetY() * rhs.GetX() + lhs.GetX() * rhs.GetY() + lhs.GetW() * rhs.GetZ() + lhs.GetZ() * rhs.GetW();
4072  auto w = -lhs.GetX() * rhs.GetX() - lhs.GetY() * rhs.GetY() - lhs.GetZ() * rhs.GetZ() + lhs.GetW() * rhs.GetW();
4073  return Quaternionf{x, y, z, w};
4074 }
4075 
4076 } // namespace SIMD
4077 } // namespace Effekseer
4078 
4079 #endif
4080 
4081 #ifndef __EFFEKSEER_SIMD_UTILS_H__
4082 #define __EFFEKSEER_SIMD_UTILS_H__
4083 
4084 #include <stdlib.h>
4085 
4086 namespace Effekseer
4087 {
4088 
4089 namespace SIMD
4090 {
4091 
4092 template <size_t align>
4094 public:
	// Allocates storage aligned to the `align` template parameter, choosing
	// the platform's aligned allocator.
	// NOTE(review): failures are not translated to std::bad_alloc — on the
	// posix_memalign path a failed allocation leaves ptr null and returns it,
	// and the old-Emscripten path falls back to plain malloc with no alignment
	// guarantee beyond malloc's own. Confirm callers tolerate this.
	static void* operator new(size_t size) {
#if defined(__EMSCRIPTEN__) && __EMSCRIPTEN_minor__ < 38
		return malloc(size);
#elif defined(_WIN32)
		return _mm_malloc(size, align);
#else
		void *ptr = nullptr;
		posix_memalign(&ptr, align, size);
		return ptr;
#endif
	}
	// Releases storage obtained from the matching operator new above; the
	// preprocessor branches mirror the allocation paths so the allocate/free
	// pair always agrees per platform.
	static void operator delete(void* ptr) {
#if defined(__EMSCRIPTEN__) && __EMSCRIPTEN_minor__ < 38
		free(ptr);
#elif defined(_WIN32)
		_mm_free(ptr);
#else
		return free(ptr);
#endif
	}
4115 };
4116 
4117 inline Vector2D ToStruct(const Vec2f& o)
4118 {
4119  Vector2D ret;
4120  Vec2f::Store(&ret, o);
4121  return ret;
4122 }
4123 
4124 inline Vector3D ToStruct(const Vec3f& o)
4125 {
4126  Vector3D ret;
4127  Vec3f::Store(&ret, o);
4128  return ret;
4129 }
4130 
// Converts a column-stored Mat43f back to the row-major Matrix43 POD.
// The three columns (plus a zero register) are transposed into rows, then
// each row's first three lanes are written with Store3 (the 4th lane the
// transpose produces is discarded).
inline Matrix43 ToStruct(const Mat43f& o)
{
	Float4 tx = o.X;
	Float4 ty = o.Y;
	Float4 tz = o.Z;
	Float4 tw = Float4::SetZero();
	Float4::Transpose(tx, ty, tz, tw);

	Matrix43 ret;
	Float4::Store3(ret.Value[0], tx);
	Float4::Store3(ret.Value[1], ty);
	Float4::Store3(ret.Value[2], tz);
	Float4::Store3(ret.Value[3], tw);
	return ret;
}
4146 
// Converts a column-stored Mat44f back to the row-major Matrix44 POD.
// The four columns are transposed into rows and each full row is written
// with Store4.
inline Matrix44 ToStruct(const Mat44f& o)
{
	Float4 tx = o.X;
	Float4 ty = o.Y;
	Float4 tz = o.Z;
	Float4 tw = o.W;
	Float4::Transpose(tx, ty, tz, tw);

	Matrix44 ret;
	Float4::Store4(ret.Values[0], tx);
	Float4::Store4(ret.Values[1], ty);
	Float4::Store4(ret.Values[2], tz);
	Float4::Store4(ret.Values[3], tw);
	return ret;
}
4162 
4163 } // namespace SIMD
4164 
4165 } // namespace Effekseer
4166 
4167 #endif // __EFFEKSEER_SIMD_UTILS_H__
Effekseer::SIMD::AlignedAllocationPolicy
Definition: Effekseer.SIMD.h:4093
Effekseer::SIMD::Vec3f
Definition: Effekseer.SIMD.h:3143
Effekseer::SIMD::Int4
simd class for generic
Definition: Effekseer.SIMD.h:1605
Effekseer::SIMD::Mat44f
Definition: Effekseer.SIMD.h:3679
Effekseer::SIMD::Vec2f
Definition: Effekseer.SIMD.h:2978
Effekseer::SIMD::Mat43f
Definition: Effekseer.SIMD.h:3514
Effekseer::SIMD::Vec4f
Definition: Effekseer.SIMD.h:3396
Effekseer::SIMD::Quaternionf
Definition: Effekseer.SIMD.h:3856
Effekseer::SIMD::Float4
simd class for generic
Definition: Effekseer.SIMD.h:100