1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of Apple Inc. ("Apple") nor the names of
14 * its contributors may be used to endorse or promote products derived
15 * from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "config.h"
30
31#if ENABLE(WEB_AUDIO)
32
33#include "SincResampler.h"
34
35#include "AudioBus.h"
36#include <wtf/MathExtras.h>
37
38#if CPU(X86_SSE2)
39#include <emmintrin.h>
40#endif
41
// Input buffer layout, dividing the total buffer into regions (r0 - r5):
//
// |----------------|----------------------------------------------------------------|----------------|
//
//                                               blockSize + kernelSize / 2
//                   <-------------------------------------------------------------------------------->
//                                                           r0
//
//  kernelSize / 2    kernelSize / 2                                   kernelSize / 2    kernelSize / 2
// <---------------> <--------------->                              <---------------> <--------------->
//        r1                r2                                              r3                r4
//
//                                                                blockSize
//                                     <-------------------------------------------------------------->
//                                                                    r5
57
58// The Algorithm:
59//
60// 1) Consume input frames into r0 (r1 is zero-initialized).
// 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4,
//    or until we've finished generating all the output frames.
63// 3) Copy r3 to r1 and r4 to r2.
64// 4) Consume input frames into r5 (zero-pad if we run out of input).
65// 5) Goto (2) until all of input is consumed.
66//
67// note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
68
69namespace WebCore {
70
71SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets)
72 : m_scaleFactor(scaleFactor)
73 , m_kernelSize(kernelSize)
74 , m_numberOfKernelOffsets(numberOfKernelOffsets)
75 , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1))
76 , m_virtualSourceIndex(0)
77 , m_blockSize(512)
78 , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above.
79 , m_source(0)
80 , m_sourceFramesAvailable(0)
81 , m_sourceProvider(0)
82 , m_isBufferPrimed(false)
83{
84 initializeKernel();
85}
86
87void SincResampler::initializeKernel()
88{
89 // Blackman window parameters.
90 double alpha = 0.16;
91 double a0 = 0.5 * (1.0 - alpha);
92 double a1 = 0.5;
93 double a2 = 0.5 * alpha;
94
95 // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
96 double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
97
98 // The sinc function is an idealized brick-wall filter, but since we're windowing it the
99 // transition from pass to stop does not happen right away. So we should adjust the
100 // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
101 // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize.
102 sincScaleFactor *= 0.9;
103
104 int n = m_kernelSize;
105 int halfSize = n / 2;
106
107 // Generates a set of windowed sinc() kernels.
108 // We generate a range of sub-sample offsets from 0.0 to 1.0.
109 for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) {
110 double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets;
111
112 for (int i = 0; i < n; ++i) {
113 // Compute the sinc() with offset.
114 double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
115 double sinc = !s ? 1.0 : sin(s) / s;
116 sinc *= sincScaleFactor;
117
118 // Compute Blackman window, matching the offset of the sinc().
119 double x = (i - subsampleOffset) / n;
120 double window = a0 - a1 * cos(2.0 * piDouble * x) + a2 * cos(4.0 * piDouble * x);
121
122 // Window the sinc() function and store at the correct offset.
123 m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window;
124 }
125 }
126}
127
128void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames)
129{
130 ASSERT(m_sourceProvider);
131 if (!m_sourceProvider)
132 return;
133
134 // Wrap the provided buffer by an AudioBus for use by the source provider.
135 auto bus = AudioBus::create(1, numberOfSourceFrames, false);
136
137 // FIXME: Find a way to make the following const-correct:
138 bus->setChannelMemory(0, buffer, numberOfSourceFrames);
139
140 m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames);
141}
142
143namespace {
144
145// BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
146
147class BufferSourceProvider : public AudioSourceProvider {
148public:
149 BufferSourceProvider(const float* source, size_t numberOfSourceFrames)
150 : m_source(source)
151 , m_sourceFramesAvailable(numberOfSourceFrames)
152 {
153 }
154
155 // Consumes samples from the in-memory buffer.
156 void provideInput(AudioBus* bus, size_t framesToProcess) override
157 {
158 ASSERT(m_source && bus);
159 if (!m_source || !bus)
160 return;
161
162 float* buffer = bus->channel(0)->mutableData();
163
164 // Clamp to number of frames available and zero-pad.
165 size_t framesToCopy = std::min(m_sourceFramesAvailable, framesToProcess);
166 memcpy(buffer, m_source, sizeof(float) * framesToCopy);
167
168 // Zero-pad if necessary.
169 if (framesToCopy < framesToProcess)
170 memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
171
172 m_sourceFramesAvailable -= framesToCopy;
173 m_source += framesToCopy;
174 }
175
176private:
177 const float* m_source;
178 size_t m_sourceFramesAvailable;
179};
180
181} // namespace
182
183void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames)
184{
185 // Resample an in-memory buffer using an AudioSourceProvider.
186 BufferSourceProvider sourceProvider(source, numberOfSourceFrames);
187
188 unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor);
189 unsigned remaining = numberOfDestinationFrames;
190
191 while (remaining) {
192 unsigned framesThisTime = std::min(remaining, m_blockSize);
193 process(&sourceProvider, destination, framesThisTime);
194
195 destination += framesThisTime;
196 remaining -= framesThisTime;
197 }
198}
199
200void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess)
201{
202 bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2);
203 ASSERT(isGood);
204 if (!isGood)
205 return;
206
207 m_sourceProvider = sourceProvider;
208
209 unsigned numberOfDestinationFrames = framesToProcess;
210
211 // Setup various region pointers in the buffer (see diagram above).
212 float* r0 = m_inputBuffer.data() + m_kernelSize / 2;
213 float* r1 = m_inputBuffer.data();
214 float* r2 = r0;
215 float* r3 = r0 + m_blockSize - m_kernelSize / 2;
216 float* r4 = r0 + m_blockSize;
217 float* r5 = r0 + m_kernelSize / 2;
218
219 // Step (1)
220 // Prime the input buffer at the start of the input stream.
221 if (!m_isBufferPrimed) {
222 consumeSource(r0, m_blockSize + m_kernelSize / 2);
223 m_isBufferPrimed = true;
224 }
225
226 // Step (2)
227
228 while (numberOfDestinationFrames) {
229 while (m_virtualSourceIndex < m_blockSize) {
230 // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
231 int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
232 double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;
233
234 double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets;
235 int offsetIndex = static_cast<int>(virtualOffsetIndex);
236
237 float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize;
238 float* k2 = k1 + m_kernelSize;
239
240 // Initialize input pointer based on quantized m_virtualSourceIndex.
241 float* inputP = r1 + sourceIndexI;
242
243 // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex
244 float sum1 = 0;
245 float sum2 = 0;
246
247 // Figure out how much to weight each kernel's "convolution".
248 double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;
249
250 // Generate a single output sample.
251 int n = m_kernelSize;
252
253#define CONVOLVE_ONE_SAMPLE \
254 input = *inputP++; \
255 sum1 += input * *k1; \
256 sum2 += input * *k2; \
257 ++k1; \
258 ++k2;
259
260 {
261 float input;
262
263#if CPU(X86_SSE2)
264 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately.
265 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
266 CONVOLVE_ONE_SAMPLE
267 n--;
268 }
269
270 // Now the inputP is aligned and start to apply SSE.
271 float* endP = inputP + n - n % 4;
272 __m128 mInput;
273 __m128 mK1;
274 __m128 mK2;
275 __m128 mul1;
276 __m128 mul2;
277
278 __m128 sums1 = _mm_setzero_ps();
279 __m128 sums2 = _mm_setzero_ps();
280 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
281 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
282
283#define LOAD_DATA(l1, l2) \
284 mInput = _mm_load_ps(inputP); \
285 mK1 = _mm_##l1##_ps(k1); \
286 mK2 = _mm_##l2##_ps(k2);
287
288#define CONVOLVE_4_SAMPLES \
289 mul1 = _mm_mul_ps(mInput, mK1); \
290 mul2 = _mm_mul_ps(mInput, mK2); \
291 sums1 = _mm_add_ps(sums1, mul1); \
292 sums2 = _mm_add_ps(sums2, mul2); \
293 inputP += 4; \
294 k1 += 4; \
295 k2 += 4;
296
297 if (k1Aligned && k2Aligned) { // both aligned
298 while (inputP < endP) {
299 LOAD_DATA(load, load)
300 CONVOLVE_4_SAMPLES
301 }
302 } else if (!k1Aligned && k2Aligned) { // only k2 aligned
303 while (inputP < endP) {
304 LOAD_DATA(loadu, load)
305 CONVOLVE_4_SAMPLES
306 }
307 } else if (k1Aligned && !k2Aligned) { // only k1 aligned
308 while (inputP < endP) {
309 LOAD_DATA(load, loadu)
310 CONVOLVE_4_SAMPLES
311 }
312 } else { // both non-aligned
313 while (inputP < endP) {
314 LOAD_DATA(loadu, loadu)
315 CONVOLVE_4_SAMPLES
316 }
317 }
318
319 // Summarize the SSE results to sum1 and sum2.
320 float* groupSumP = reinterpret_cast<float*>(&sums1);
321 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
322 groupSumP = reinterpret_cast<float*>(&sums2);
323 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
324
325 n %= 4;
326 while (n) {
327 CONVOLVE_ONE_SAMPLE
328 n--;
329 }
330#else
331 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.
332
333 // Optimize size 32 and size 64 kernels by unrolling the while loop.
334 // A 20 - 30% speed improvement was measured in some cases by using this approach.
335
336 if (n == 32) {
337 CONVOLVE_ONE_SAMPLE // 1
338 CONVOLVE_ONE_SAMPLE // 2
339 CONVOLVE_ONE_SAMPLE // 3
340 CONVOLVE_ONE_SAMPLE // 4
341 CONVOLVE_ONE_SAMPLE // 5
342 CONVOLVE_ONE_SAMPLE // 6
343 CONVOLVE_ONE_SAMPLE // 7
344 CONVOLVE_ONE_SAMPLE // 8
345 CONVOLVE_ONE_SAMPLE // 9
346 CONVOLVE_ONE_SAMPLE // 10
347 CONVOLVE_ONE_SAMPLE // 11
348 CONVOLVE_ONE_SAMPLE // 12
349 CONVOLVE_ONE_SAMPLE // 13
350 CONVOLVE_ONE_SAMPLE // 14
351 CONVOLVE_ONE_SAMPLE // 15
352 CONVOLVE_ONE_SAMPLE // 16
353 CONVOLVE_ONE_SAMPLE // 17
354 CONVOLVE_ONE_SAMPLE // 18
355 CONVOLVE_ONE_SAMPLE // 19
356 CONVOLVE_ONE_SAMPLE // 20
357 CONVOLVE_ONE_SAMPLE // 21
358 CONVOLVE_ONE_SAMPLE // 22
359 CONVOLVE_ONE_SAMPLE // 23
360 CONVOLVE_ONE_SAMPLE // 24
361 CONVOLVE_ONE_SAMPLE // 25
362 CONVOLVE_ONE_SAMPLE // 26
363 CONVOLVE_ONE_SAMPLE // 27
364 CONVOLVE_ONE_SAMPLE // 28
365 CONVOLVE_ONE_SAMPLE // 29
366 CONVOLVE_ONE_SAMPLE // 30
367 CONVOLVE_ONE_SAMPLE // 31
368 CONVOLVE_ONE_SAMPLE // 32
369 } else if (n == 64) {
370 CONVOLVE_ONE_SAMPLE // 1
371 CONVOLVE_ONE_SAMPLE // 2
372 CONVOLVE_ONE_SAMPLE // 3
373 CONVOLVE_ONE_SAMPLE // 4
374 CONVOLVE_ONE_SAMPLE // 5
375 CONVOLVE_ONE_SAMPLE // 6
376 CONVOLVE_ONE_SAMPLE // 7
377 CONVOLVE_ONE_SAMPLE // 8
378 CONVOLVE_ONE_SAMPLE // 9
379 CONVOLVE_ONE_SAMPLE // 10
380 CONVOLVE_ONE_SAMPLE // 11
381 CONVOLVE_ONE_SAMPLE // 12
382 CONVOLVE_ONE_SAMPLE // 13
383 CONVOLVE_ONE_SAMPLE // 14
384 CONVOLVE_ONE_SAMPLE // 15
385 CONVOLVE_ONE_SAMPLE // 16
386 CONVOLVE_ONE_SAMPLE // 17
387 CONVOLVE_ONE_SAMPLE // 18
388 CONVOLVE_ONE_SAMPLE // 19
389 CONVOLVE_ONE_SAMPLE // 20
390 CONVOLVE_ONE_SAMPLE // 21
391 CONVOLVE_ONE_SAMPLE // 22
392 CONVOLVE_ONE_SAMPLE // 23
393 CONVOLVE_ONE_SAMPLE // 24
394 CONVOLVE_ONE_SAMPLE // 25
395 CONVOLVE_ONE_SAMPLE // 26
396 CONVOLVE_ONE_SAMPLE // 27
397 CONVOLVE_ONE_SAMPLE // 28
398 CONVOLVE_ONE_SAMPLE // 29
399 CONVOLVE_ONE_SAMPLE // 30
400 CONVOLVE_ONE_SAMPLE // 31
401 CONVOLVE_ONE_SAMPLE // 32
402 CONVOLVE_ONE_SAMPLE // 33
403 CONVOLVE_ONE_SAMPLE // 34
404 CONVOLVE_ONE_SAMPLE // 35
405 CONVOLVE_ONE_SAMPLE // 36
406 CONVOLVE_ONE_SAMPLE // 37
407 CONVOLVE_ONE_SAMPLE // 38
408 CONVOLVE_ONE_SAMPLE // 39
409 CONVOLVE_ONE_SAMPLE // 40
410 CONVOLVE_ONE_SAMPLE // 41
411 CONVOLVE_ONE_SAMPLE // 42
412 CONVOLVE_ONE_SAMPLE // 43
413 CONVOLVE_ONE_SAMPLE // 44
414 CONVOLVE_ONE_SAMPLE // 45
415 CONVOLVE_ONE_SAMPLE // 46
416 CONVOLVE_ONE_SAMPLE // 47
417 CONVOLVE_ONE_SAMPLE // 48
418 CONVOLVE_ONE_SAMPLE // 49
419 CONVOLVE_ONE_SAMPLE // 50
420 CONVOLVE_ONE_SAMPLE // 51
421 CONVOLVE_ONE_SAMPLE // 52
422 CONVOLVE_ONE_SAMPLE // 53
423 CONVOLVE_ONE_SAMPLE // 54
424 CONVOLVE_ONE_SAMPLE // 55
425 CONVOLVE_ONE_SAMPLE // 56
426 CONVOLVE_ONE_SAMPLE // 57
427 CONVOLVE_ONE_SAMPLE // 58
428 CONVOLVE_ONE_SAMPLE // 59
429 CONVOLVE_ONE_SAMPLE // 60
430 CONVOLVE_ONE_SAMPLE // 61
431 CONVOLVE_ONE_SAMPLE // 62
432 CONVOLVE_ONE_SAMPLE // 63
433 CONVOLVE_ONE_SAMPLE // 64
434 } else {
435 while (n--) {
436 // Non-optimized using actual while loop.
437 CONVOLVE_ONE_SAMPLE
438 }
439 }
440#endif
441 }
442
443 // Linearly interpolate the two "convolutions".
444 double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
445
446 *destination++ = result;
447
448 // Advance the virtual index.
449 m_virtualSourceIndex += m_scaleFactor;
450
451 --numberOfDestinationFrames;
452 if (!numberOfDestinationFrames)
453 return;
454 }
455
456 // Wrap back around to the start.
457 m_virtualSourceIndex -= m_blockSize;
458
459 // Step (3) Copy r3 to r1 and r4 to r2.
460 // This wraps the last input frames back to the start of the buffer.
461 memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2));
462 memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2));
463
464 // Step (4)
465 // Refresh the buffer with more input.
466 consumeSource(r5, m_blockSize);
467 }
468}
469
470} // namespace WebCore
471
472#endif // ENABLE(WEB_AUDIO)
473