aboutsummaryrefslogtreecommitdiffstats
path: root/src/include/glm/simd/neon.h
blob: b2c2e8d7b81675d221acb2c7ba1fc89d6364e9d3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/// @ref simd_neon
/// @file glm/simd/neon.h

#pragma once

#if GLM_ARCH & GLM_ARCH_NEON_BIT
#include <arm_neon.h>

namespace glm {
	namespace neon {
		static float32x4_t dupq_lane(float32x4_t vsrc, int lane) {
			switch(lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
				case 0: return vdupq_laneq_f32(vsrc, 0);
				case 1: return vdupq_laneq_f32(vsrc, 1);
				case 2: return vdupq_laneq_f32(vsrc, 2);
				case 3: return vdupq_laneq_f32(vsrc, 3);
#else
				case 0: return vdupq_n_f32(vgetq_lane_f32(vsrc, 0));
				case 1: return vdupq_n_f32(vgetq_lane_f32(vsrc, 1));
				case 2: return vdupq_n_f32(vgetq_lane_f32(vsrc, 2));
				case 3: return vdupq_n_f32(vgetq_lane_f32(vsrc, 3));
#endif
			}
			assert(!"Unreachable code executed!");
			return vdupq_n_f32(0.0f);
		}

		static float32x2_t dup_lane(float32x4_t vsrc, int lane) {
			switch(lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
				case 0: return vdup_laneq_f32(vsrc, 0);
				case 1: return vdup_laneq_f32(vsrc, 1);
				case 2: return vdup_laneq_f32(vsrc, 2);
				case 3: return vdup_laneq_f32(vsrc, 3);
#else
				case 0: return vdup_n_f32(vgetq_lane_f32(vsrc, 0));
				case 1: return vdup_n_f32(vgetq_lane_f32(vsrc, 1));
				case 2: return vdup_n_f32(vgetq_lane_f32(vsrc, 2));
				case 3: return vdup_n_f32(vgetq_lane_f32(vsrc, 3));
#endif
			}
			assert(!"Unreachable code executed!");
			return vdup_n_f32(0.0f);
		}

		static float32x4_t copy_lane(float32x4_t vdst, int dlane, float32x4_t vsrc, int slane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
			switch(dlane) {
				case 0:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 0, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 0, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 0, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 0, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
				case 1:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 1, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 1, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 1, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 1, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
				case 2:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 2, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 2, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 2, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 2, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
				case 3:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 3, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 3, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 3, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 3, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
			}
#else

			float l;
			switch(slane) {
				case 0: l = vgetq_lane_f32(vsrc, 0); break;
				case 1: l = vgetq_lane_f32(vsrc, 1); break;
				case 2: l = vgetq_lane_f32(vsrc, 2); break;
				case 3: l = vgetq_lane_f32(vsrc, 3); break;
				default: 
					assert(!"Unreachable code executed!");
			}
			switch(dlane) {
				case 0: return vsetq_lane_f32(l, vdst, 0);
				case 1: return vsetq_lane_f32(l, vdst, 1);
				case 2: return vsetq_lane_f32(l, vdst, 2);
				case 3: return vsetq_lane_f32(l, vdst, 3);
			}
#endif
			assert(!"Unreachable code executed!");
			return vdupq_n_f32(0.0f);
		}

		static float32x4_t mul_lane(float32x4_t v, float32x4_t vlane, int lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
			switch(lane) { 
				case 0: return vmulq_laneq_f32(v, vlane, 0); break;
				case 1: return vmulq_laneq_f32(v, vlane, 1); break;
				case 2: return vmulq_laneq_f32(v, vlane, 2); break;
				case 3: return vmulq_laneq_f32(v, vlane, 3); break;
				default: 
					assert(!"Unreachable code executed!");
			}
			assert(!"Unreachable code executed!");
			return vdupq_n_f32(0.0f);
#else
			return vmulq_f32(v, dupq_lane(vlane, lane));
#endif
		}

		static float32x4_t madd_lane(float32x4_t acc, float32x4_t v, float32x4_t vlane, int lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
#ifdef GLM_CONFIG_FORCE_FMA
#	define FMADD_LANE(acc, x, y, L) do { asm volatile ("fmla %0.4s, %1.4s, %2.4s" : "+w"(acc) : "w"(x), "w"(dup_lane(y, L))); } while(0)
#else
#	define FMADD_LANE(acc, x, y, L) do { acc = vmlaq_laneq_f32(acc, x, y, L); } while(0)
#endif

			switch(lane) { 
				case 0: 
					FMADD_LANE(acc, v, vlane, 0);
					return acc;
				case 1:
					FMADD_LANE(acc, v, vlane, 1);
					return acc;
				case 2:
					FMADD_LANE(acc, v, vlane, 2);
					return acc;
				case 3:
					FMADD_LANE(acc, v, vlane, 3);
					return acc;
				default: 
					assert(!"Unreachable code executed!");
			}
			assert(!"Unreachable code executed!");
			return vdupq_n_f32(0.0f);
#	undef FMADD_LANE
#else
			return vaddq_f32(acc, vmulq_f32(v, dupq_lane(vlane, lane)));
#endif
		}
	} //namespace neon
} // namespace glm
#endif // GLM_ARCH & GLM_ARCH_NEON_BIT