/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "VectorMath.h"

#if USE(ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86_SSE2)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <math.h>

namespace WebCore {

namespace VectorMath {

#if USE(ACCELERATE)
// On the Mac we use the highly optimized versions in Accelerate.framework

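// vsmul: multiply each source sample by a single gain value, dest[i] = *scale * source[i].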
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
}

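// vadd: element-wise sum, dest[i] = source1[i] + source2[i].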
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
}

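// vmul: element-wise product, dest[i] = source1[i] * source2[i].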
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
}

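// zvmul: element-wise complex product on split real/imaginary arrays; the final '1' passed to
// vDSP_zvmul selects the normal (non-conjugate) multiplication.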
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
}

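// vsma: scalar multiply-add, dest[i] += *scale * source[i]; destP is passed to vDSP_vsma as both the addend and the output vector.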
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

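// vmaxmgv: maximum magnitude, *maxP = max(|source[i]|) over all frames.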
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

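// vsvesq: sum of squares, *sumP = sum(source[i] * source[i]) over all frames.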
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

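// vclip: clamp each sample to the range [*lowThresholdP, *highThresholdP].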
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

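// Portable implementations. Most functions take an SSE2 or NEON fast path in the common stride-1 case
// and fall back to a scalar loop for arbitrary strides and any leftover frames.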
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

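        // Multiply-add four frames per iteration: dest += source * scale. loadInstr/storeInstr select
        // aligned (load/store) or unaligned (loadu/storeu) access for destP; sourceP is already 16-byte aligned here.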
#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; start applying SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<size_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#if CPU(X86_SSE2)
    }
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start applying SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#if CPU(X86_SSE2)
    }
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start applying SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

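        // Multiply four frames per iteration: dest = source1 * source2. loadInstr selects aligned or
        // unaligned loads of source2P, storeInstr aligned or unaligned stores to destP; source1P is already 16-byte aligned here.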
#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
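    // Complex multiplication with split real/imaginary arrays:
    //   realDest[i] = real1[i] * real2[i] - imag1[i] * imag2[i]
    //   imagDest[i] = real1[i] * imag2[i] + imag1[i] * real2[i]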
    unsigned i = 0;
#if CPU(X86_SSE2)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Compute both results before storing them, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#if CPU(X86_SSE2)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Sum the four lanes of the SSE accumulator.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#if CPU(X86_SSE2)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        // Mask with every bit set except the sign bit, used to compute absolute values below.
        // Built with an integer-to-float bit cast to avoid type-punning through a float pointer.
        __m128 mMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by ANDing the source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Take the max across the four SSE lanes.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // USE(ACCELERATE)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)