/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "VectorMath.h"

#if USE(ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86_SSE2)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <math.h>

namespace WebCore {

namespace VectorMath {

#if USE(ACCELERATE)
// On the Mac we use the highly optimized versions in Accelerate.framework

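// vsmul: multiply each source sample by a single gain value, dest[i] = *scale * source[i].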
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
}

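// vadd: element-wise sum, dest[i] = source1[i] + source2[i].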
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
}

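// vmul: element-wise product, dest[i] = source1[i] * source2[i].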
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
}

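// zvmul: element-wise complex product on split real/imaginary arrays; the final '1' passed to
// vDSP_zvmul selects the normal (non-conjugate) multiplication.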
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
}

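// vsma: scalar multiply-add, dest[i] += *scale * source[i]; destP is passed to vDSP_vsma as both the addend and the output vector.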
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

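// vmaxmgv: maximum magnitude, *maxP = max(|source[i]|) over all frames.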
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

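// vsvesq: sum of squares, *sumP = sum(source[i] * source[i]) over all frames.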
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

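// vclip: clamp each sample to the range [*lowThresholdP, *highThresholdP].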
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

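// Portable implementations. Most functions take an SSE2 or NEON fast path in the common stride-1 case
// and fall back to a scalar loop for arbitrary strides and any leftover frames.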
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

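        // Multiply-add four frames per iteration: dest += source * scale. loadInstr/storeInstr select
        // aligned (load/store) or unaligned (loadu/storeu) access for destP; sourceP is already 16-byte aligned here.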
#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; start applying SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<size_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#if CPU(X86_SSE2)
    }
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start applying SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#if CPU(X86_SSE2)
    }
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86_SSE2)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start applying SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

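        // Multiply four frames per iteration: dest = source1 * source2. loadInstr selects aligned or
        // unaligned loads of source2P, storeInstr aligned or unaligned stores to destP; source1P is already 16-byte aligned here.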
#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
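    // Complex multiplication with split real/imaginary arrays:
    //   realDest[i] = real1[i] * real2[i] - imag1[i] * imag2[i]
    //   imagDest[i] = real1[i] * imag2[i] + imag1[i] * real2[i]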
    unsigned i = 0;
#if CPU(X86_SSE2)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Compute both results before storing them, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#if CPU(X86_SSE2)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Sum the four lanes of the SSE accumulator.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#if CPU(X86_SSE2)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        // Mask with every bit set except the sign bit, used to compute absolute values below.
        // Built with an integer-to-float bit cast to avoid type-punning through a float pointer.
        __m128 mMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by ANDing the source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Take the max across the four SSE lanes.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // USE(ACCELERATE)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)