/// @ref simd
/// @file glm/simd/common.h

#pragma once

#include "platform.h"

#if GLM_ARCH & GLM_ARCH_SSE2_BIT

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_add(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_add_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_add(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_add_ss(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_sub(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_sub_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_sub(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_sub_ss(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_mul(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_mul_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_mul(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_mul_ss(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_div(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_div_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_div(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_div_ss(a, b);
}

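// Low-precision division: multiplies by the reciprocal approximation from _mm_rcp_ps
// (roughly 12 bits of precision) instead of issuing a full-precision divide.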
GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_div_lowp(glm_f32vec4 a, glm_f32vec4 b)
{
	return glm_vec4_mul(a, _mm_rcp_ps(b));
}

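// Note: _MM_SHUFFLE(3, 2, 1, 0) is the identity permutation, so this returns the
// components in their original x, y, z, w order.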
GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_swizzle_xyzw(glm_f32vec4 a)
{
#	if GLM_ARCH & GLM_ARCH_AVX2_BIT
		return _mm_permute_ps(a, _MM_SHUFFLE(3, 2, 1, 0));
#	else
		return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 1, 0));
#	endif
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f32vec4 c)
{
#	if (GLM_ARCH & GLM_ARCH_AVX2_BIT) && !(GLM_COMPILER & GLM_COMPILER_CLANG)
		return _mm_fmadd_ss(a, b, c);
#	else
		return _mm_add_ss(_mm_mul_ss(a, b), c);
#	endif
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f32vec4 c)
{
#	if (GLM_ARCH & GLM_ARCH_AVX2_BIT) && !(GLM_COMPILER & GLM_COMPILER_CLANG)
		return _mm_fmadd_ps(a, b, c);
#	else
		return glm_vec4_add(glm_vec4_mul(a, b), c);
#	endif
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_abs(glm_f32vec4 x)
{
	return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
}

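// Branchless integer abs for the pre-SSSE3 path: with s = x >> 31 (arithmetic shift,
// all ones in negative lanes), abs(x) == (x ^ s) - s.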
GLM_FUNC_QUALIFIER glm_ivec4 glm_ivec4_abs(glm_ivec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSSE3_BIT
		return _mm_sign_epi32(x, x);
#	else
		glm_ivec4 const sgn0 = _mm_srai_epi32(x, 31);
		glm_ivec4 const inv0 = _mm_xor_si128(x, sgn0);
		glm_ivec4 const sub0 = _mm_sub_epi32(inv0, sgn0);
		return sub0;
#	endif
}

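// Per-component sign: -1.0f where x < 0, +1.0f where x > 0 and 0.0f otherwise;
// the two comparison masks are disjoint, so OR-ing the masked constants combines them.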
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_sign(glm_vec4 x)
{
	glm_vec4 const zro0 = _mm_setzero_ps();
	glm_vec4 const cmp0 = _mm_cmplt_ps(x, zro0);
	glm_vec4 const cmp1 = _mm_cmpgt_ps(x, zro0);
	glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(-1.0f));
	glm_vec4 const and1 = _mm_and_ps(cmp1, _mm_set1_ps(1.0f));
	glm_vec4 const or0 = _mm_or_ps(and0, and1);
	return or0;
}

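// The non-SSE4.1 fallback below uses the classic magic-number trick: adding 2^23
// (8388608.0f, with the sign of x copied onto it) pushes the fractional bits out of
// the mantissa so the FPU discards them, and subtracting the constant back yields the
// rounded value.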
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_round(glm_vec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
		return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
#	else
		glm_vec4 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)));
		glm_vec4 const and0 = _mm_and_ps(sgn0, x);
		glm_vec4 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f));
		glm_vec4 const add0 = glm_vec4_add(x, or0);
		glm_vec4 const sub0 = glm_vec4_sub(add0, or0);
		return sub0;
#	endif
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_floor(glm_vec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
		return _mm_floor_ps(x);
#	else
		glm_vec4 const rnd0 = glm_vec4_round(x);
		glm_vec4 const cmp0 = _mm_cmplt_ps(x, rnd0);
		glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f));
		glm_vec4 const sub0 = glm_vec4_sub(rnd0, and0);
		return sub0;
#	endif
}

/* trunc TODO
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_trunc(glm_vec4 x)
{
	return glm_vec4();
}
*/
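
// A possible implementation of the missing trunc, sketched here for reference rather
// than taken from the original file: round toward zero with SSE4.1 when available,
// otherwise reuse glm_vec4_abs / glm_vec4_floor and reapply the sign bit.
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_trunc(glm_vec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
		return _mm_round_ps(x, _MM_FROUND_TO_ZERO);
#	else
		glm_vec4 const sgn0 = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000))));	// isolate the sign bits
		glm_vec4 const flr0 = glm_vec4_floor(glm_vec4_abs(x));										// floor(|x|) truncates a non-negative value
		return _mm_or_ps(flr0, sgn0);																// restore the original signs
#	endif
}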

// roundEven: same magic-number trick as the glm_vec4_round fallback; the FPU's default
// round-to-nearest-even mode resolves ties toward the even integer.
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_roundEven(glm_vec4 x)
{
	glm_vec4 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)));
	glm_vec4 const and0 = _mm_and_ps(sgn0, x);
	glm_vec4 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f));
	glm_vec4 const add0 = glm_vec4_add(x, or0);
	glm_vec4 const sub0 = glm_vec4_sub(add0, or0);
	return sub0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_ceil(glm_vec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
		return _mm_ceil_ps(x);
#	else
		glm_vec4 const rnd0 = glm_vec4_round(x);
		glm_vec4 const cmp0 = _mm_cmpgt_ps(x, rnd0);
		glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f));
		glm_vec4 const add0 = glm_vec4_add(rnd0, and0);
		return add0;
#	endif
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_fract(glm_vec4 x)
{
	glm_vec4 const flr0 = glm_vec4_floor(x);
	glm_vec4 const sub0 = glm_vec4_sub(x, flr0);
	return sub0;
}

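// GLSL mod(): x - y * floor(x / y), computed per component (floored modulo, not fmod).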
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_mod(glm_vec4 x, glm_vec4 y)
{
	glm_vec4 const div0 = glm_vec4_div(x, y);
	glm_vec4 const flr0 = glm_vec4_floor(div0);
	glm_vec4 const mul0 = glm_vec4_mul(y, flr0);
	glm_vec4 const sub0 = glm_vec4_sub(x, mul0);
	return sub0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_clamp(glm_vec4 v, glm_vec4 minVal, glm_vec4 maxVal)
{
	glm_vec4 const min0 = _mm_min_ps(v, maxVal);
	glm_vec4 const max0 = _mm_max_ps(min0, minVal);
	return max0;
}

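// Linear blend: v1 * (1 - a) + v2 * a, with the second product folded into an fma.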
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_mix(glm_vec4 v1, glm_vec4 v2, glm_vec4 a)
{
	glm_vec4 const sub0 = glm_vec4_sub(_mm_set1_ps(1.0f), a);
	glm_vec4 const mul0 = glm_vec4_mul(v1, sub0);
	glm_vec4 const mad0 = glm_vec4_fma(v2, a, mul0);
	return mad0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_step(glm_vec4 edge, glm_vec4 x)
{
	glm_vec4 const cmp0 = _mm_cmple_ps(edge, x);				// per component: all ones where x >= edge
	glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f));	// 1.0f where x >= edge, 0.0f where x < edge
	return and0;
}

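// GLSL smoothstep: clamp((x - edge0) / (edge1 - edge0), 0, 1), then apply the Hermite
// polynomial t * t * (3 - 2 * t).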
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_smoothstep(glm_vec4 edge0, glm_vec4 edge1, glm_vec4 x)
{
	glm_vec4 const sub0 = glm_vec4_sub(x, edge0);
	glm_vec4 const sub1 = glm_vec4_sub(edge1, edge0);
	glm_vec4 const div0 = glm_vec4_div(sub0, sub1);	// (x - edge0) / (edge1 - edge0)
	glm_vec4 const clp0 = glm_vec4_clamp(div0, _mm_setzero_ps(), _mm_set1_ps(1.0f));
	glm_vec4 const mul0 = glm_vec4_mul(_mm_set1_ps(2.0f), clp0);
	glm_vec4 const sub2 = glm_vec4_sub(_mm_set1_ps(3.0f), mul0);
	glm_vec4 const mul1 = glm_vec4_mul(clp0, clp0);
	glm_vec4 const mul2 = glm_vec4_mul(mul1, sub2);
	return mul2;
}

// Agner Fog method
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_nan(glm_vec4 x)
{
	glm_ivec4 const t1 = _mm_castps_si128(x);						// reinterpret as 32-bit integer
	glm_ivec4 const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1));	// shift out sign bit
	glm_ivec4 const t3 = _mm_set1_epi32(int(0xFF000000));			// exponent mask
	glm_ivec4 const t4 = _mm_and_si128(t2, t3);						// exponent
	glm_ivec4 const t5 = _mm_andnot_si128(t3, t2);					// fraction
	glm_ivec4 const Equal = _mm_cmpeq_epi32(t3, t4);				// exponent == all 1s
	glm_ivec4 const FractionZero = _mm_cmpeq_epi32(t5, _mm_setzero_si128());	// fraction == 0
	glm_ivec4 const And = _mm_andnot_si128(FractionZero, Equal);	// exponent = all 1s and fraction != 0
	return _mm_castsi128_ps(And);									// true only for NaN lanes
}

// Agner Fog method
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_inf(glm_vec4 x)
{
	glm_ivec4 const t1 = _mm_castps_si128(x);										// reinterpret as 32-bit integer
	glm_ivec4 const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1));					// shift out sign bit
	return _mm_castsi128_ps(_mm_cmpeq_epi32(t2, _mm_set1_epi32(int(0xFF000000))));		// exponent is all 1s, fraction is 0
}
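
// Illustrative usage (not part of the original header): blends two float quadruplets
// with glm_vec4_mix. The function name glm_vec4_mix_example is purely hypothetical and
// exists only to demonstrate how the helpers above compose.
GLM_FUNC_QUALIFIER void glm_vec4_mix_example(float const* a, float const* b, float t, float* out)
{
	glm_vec4 const v1 = _mm_loadu_ps(a);						// load four unaligned floats
	glm_vec4 const v2 = _mm_loadu_ps(b);
	glm_vec4 const r = glm_vec4_mix(v1, v2, _mm_set1_ps(t));	// v1 * (1 - t) + v2 * t
	_mm_storeu_ps(out, r);										// store the blended result
}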

#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT