1 | /* |
2 | * Copyright (C) 2011 Google Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * 3. Neither the name of Apple Inc. ("Apple") nor the names of |
14 | * its contributors may be used to endorse or promote products derived |
15 | * from this software without specific prior written permission. |
16 | * |
17 | * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
18 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
19 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
20 | * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
21 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
22 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
23 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
24 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "config.h" |
30 | |
31 | #if ENABLE(WEB_AUDIO) |
32 | |
33 | #include "SincResampler.h" |
34 | |
35 | #include "AudioBus.h" |
36 | #include <wtf/MathExtras.h> |
37 | |
38 | #if CPU(X86_SSE2) |
39 | #include <emmintrin.h> |
40 | #endif |
41 | |
// Input buffer layout, dividing the total buffer into regions (r0 - r5):
//
// |----------------|----------------------------------------------------------------|----------------|
//
//                                              blockSize + kernelSize / 2
//                   <-------------------------------------------------------------------------------->
//                                                           r0
//
//  kernelSize / 2    kernelSize / 2                                 kernelSize / 2    kernelSize / 2
// <---------------> <--------------->                              <---------------> <--------------->
//        r1                r2                                             r3                r4
//
//                                                                blockSize
//                                     <-------------------------------------------------------------->
//                                                                    r5
57 | |
58 | // The Algorithm: |
59 | // |
60 | // 1) Consume input frames into r0 (r1 is zero-initialized). |
// 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4,
//    or until we've finished generating all the output frames.
63 | // 3) Copy r3 to r1 and r4 to r2. |
64 | // 4) Consume input frames into r5 (zero-pad if we run out of input). |
65 | // 5) Goto (2) until all of input is consumed. |
66 | // |
67 | // note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc. |
68 | |
69 | namespace WebCore { |
70 | |
71 | SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets) |
72 | : m_scaleFactor(scaleFactor) |
73 | , m_kernelSize(kernelSize) |
74 | , m_numberOfKernelOffsets(numberOfKernelOffsets) |
75 | , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1)) |
76 | , m_virtualSourceIndex(0) |
77 | , m_blockSize(512) |
78 | , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above. |
79 | , m_source(0) |
80 | , m_sourceFramesAvailable(0) |
81 | , m_sourceProvider(0) |
82 | , m_isBufferPrimed(false) |
83 | { |
84 | initializeKernel(); |
85 | } |
86 | |
87 | void SincResampler::initializeKernel() |
88 | { |
89 | // Blackman window parameters. |
90 | double alpha = 0.16; |
91 | double a0 = 0.5 * (1.0 - alpha); |
92 | double a1 = 0.5; |
93 | double a2 = 0.5 * alpha; |
94 | |
95 | // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter. |
96 | double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0; |
97 | |
98 | // The sinc function is an idealized brick-wall filter, but since we're windowing it the |
99 | // transition from pass to stop does not happen right away. So we should adjust the |
100 | // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end. |
101 | // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize. |
102 | sincScaleFactor *= 0.9; |
103 | |
104 | int n = m_kernelSize; |
105 | int halfSize = n / 2; |
106 | |
107 | // Generates a set of windowed sinc() kernels. |
108 | // We generate a range of sub-sample offsets from 0.0 to 1.0. |
109 | for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) { |
110 | double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets; |
111 | |
112 | for (int i = 0; i < n; ++i) { |
113 | // Compute the sinc() with offset. |
114 | double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset); |
115 | double sinc = !s ? 1.0 : sin(s) / s; |
116 | sinc *= sincScaleFactor; |
117 | |
118 | // Compute Blackman window, matching the offset of the sinc(). |
119 | double x = (i - subsampleOffset) / n; |
120 | double window = a0 - a1 * cos(2.0 * piDouble * x) + a2 * cos(4.0 * piDouble * x); |
121 | |
122 | // Window the sinc() function and store at the correct offset. |
123 | m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window; |
124 | } |
125 | } |
126 | } |
127 | |
128 | void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames) |
129 | { |
130 | ASSERT(m_sourceProvider); |
131 | if (!m_sourceProvider) |
132 | return; |
133 | |
134 | // Wrap the provided buffer by an AudioBus for use by the source provider. |
135 | auto bus = AudioBus::create(1, numberOfSourceFrames, false); |
136 | |
137 | // FIXME: Find a way to make the following const-correct: |
138 | bus->setChannelMemory(0, buffer, numberOfSourceFrames); |
139 | |
140 | m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames); |
141 | } |
142 | |
143 | namespace { |
144 | |
145 | // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer. |
146 | |
147 | class BufferSourceProvider : public AudioSourceProvider { |
148 | public: |
149 | BufferSourceProvider(const float* source, size_t numberOfSourceFrames) |
150 | : m_source(source) |
151 | , m_sourceFramesAvailable(numberOfSourceFrames) |
152 | { |
153 | } |
154 | |
155 | // Consumes samples from the in-memory buffer. |
156 | void provideInput(AudioBus* bus, size_t framesToProcess) override |
157 | { |
158 | ASSERT(m_source && bus); |
159 | if (!m_source || !bus) |
160 | return; |
161 | |
162 | float* buffer = bus->channel(0)->mutableData(); |
163 | |
164 | // Clamp to number of frames available and zero-pad. |
165 | size_t framesToCopy = std::min(m_sourceFramesAvailable, framesToProcess); |
166 | memcpy(buffer, m_source, sizeof(float) * framesToCopy); |
167 | |
168 | // Zero-pad if necessary. |
169 | if (framesToCopy < framesToProcess) |
170 | memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy)); |
171 | |
172 | m_sourceFramesAvailable -= framesToCopy; |
173 | m_source += framesToCopy; |
174 | } |
175 | |
176 | private: |
177 | const float* m_source; |
178 | size_t m_sourceFramesAvailable; |
179 | }; |
180 | |
181 | } // namespace |
182 | |
183 | void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames) |
184 | { |
185 | // Resample an in-memory buffer using an AudioSourceProvider. |
186 | BufferSourceProvider sourceProvider(source, numberOfSourceFrames); |
187 | |
188 | unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor); |
189 | unsigned remaining = numberOfDestinationFrames; |
190 | |
191 | while (remaining) { |
192 | unsigned framesThisTime = std::min(remaining, m_blockSize); |
193 | process(&sourceProvider, destination, framesThisTime); |
194 | |
195 | destination += framesThisTime; |
196 | remaining -= framesThisTime; |
197 | } |
198 | } |
199 | |
200 | void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess) |
201 | { |
202 | bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2); |
203 | ASSERT(isGood); |
204 | if (!isGood) |
205 | return; |
206 | |
207 | m_sourceProvider = sourceProvider; |
208 | |
209 | unsigned numberOfDestinationFrames = framesToProcess; |
210 | |
211 | // Setup various region pointers in the buffer (see diagram above). |
212 | float* r0 = m_inputBuffer.data() + m_kernelSize / 2; |
213 | float* r1 = m_inputBuffer.data(); |
214 | float* r2 = r0; |
215 | float* r3 = r0 + m_blockSize - m_kernelSize / 2; |
216 | float* r4 = r0 + m_blockSize; |
217 | float* r5 = r0 + m_kernelSize / 2; |
218 | |
219 | // Step (1) |
220 | // Prime the input buffer at the start of the input stream. |
221 | if (!m_isBufferPrimed) { |
222 | consumeSource(r0, m_blockSize + m_kernelSize / 2); |
223 | m_isBufferPrimed = true; |
224 | } |
225 | |
226 | // Step (2) |
227 | |
228 | while (numberOfDestinationFrames) { |
229 | while (m_virtualSourceIndex < m_blockSize) { |
230 | // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are. |
231 | int sourceIndexI = static_cast<int>(m_virtualSourceIndex); |
232 | double subsampleRemainder = m_virtualSourceIndex - sourceIndexI; |
233 | |
234 | double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets; |
235 | int offsetIndex = static_cast<int>(virtualOffsetIndex); |
236 | |
237 | float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize; |
238 | float* k2 = k1 + m_kernelSize; |
239 | |
240 | // Initialize input pointer based on quantized m_virtualSourceIndex. |
241 | float* inputP = r1 + sourceIndexI; |
242 | |
243 | // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex |
244 | float sum1 = 0; |
245 | float sum2 = 0; |
246 | |
247 | // Figure out how much to weight each kernel's "convolution". |
248 | double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex; |
249 | |
250 | // Generate a single output sample. |
251 | int n = m_kernelSize; |
252 | |
253 | #define CONVOLVE_ONE_SAMPLE \ |
254 | input = *inputP++; \ |
255 | sum1 += input * *k1; \ |
256 | sum2 += input * *k2; \ |
257 | ++k1; \ |
258 | ++k2; |
259 | |
260 | { |
261 | float input; |
262 | |
263 | #if CPU(X86_SSE2) |
264 | // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately. |
265 | while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) { |
266 | CONVOLVE_ONE_SAMPLE |
267 | n--; |
268 | } |
269 | |
270 | // Now the inputP is aligned and start to apply SSE. |
271 | float* endP = inputP + n - n % 4; |
272 | __m128 mInput; |
273 | __m128 mK1; |
274 | __m128 mK2; |
275 | __m128 mul1; |
276 | __m128 mul2; |
277 | |
278 | __m128 sums1 = _mm_setzero_ps(); |
279 | __m128 sums2 = _mm_setzero_ps(); |
280 | bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F); |
281 | bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F); |
282 | |
283 | #define LOAD_DATA(l1, l2) \ |
284 | mInput = _mm_load_ps(inputP); \ |
285 | mK1 = _mm_##l1##_ps(k1); \ |
286 | mK2 = _mm_##l2##_ps(k2); |
287 | |
288 | #define CONVOLVE_4_SAMPLES \ |
289 | mul1 = _mm_mul_ps(mInput, mK1); \ |
290 | mul2 = _mm_mul_ps(mInput, mK2); \ |
291 | sums1 = _mm_add_ps(sums1, mul1); \ |
292 | sums2 = _mm_add_ps(sums2, mul2); \ |
293 | inputP += 4; \ |
294 | k1 += 4; \ |
295 | k2 += 4; |
296 | |
297 | if (k1Aligned && k2Aligned) { // both aligned |
298 | while (inputP < endP) { |
299 | LOAD_DATA(load, load) |
300 | CONVOLVE_4_SAMPLES |
301 | } |
302 | } else if (!k1Aligned && k2Aligned) { // only k2 aligned |
303 | while (inputP < endP) { |
304 | LOAD_DATA(loadu, load) |
305 | CONVOLVE_4_SAMPLES |
306 | } |
307 | } else if (k1Aligned && !k2Aligned) { // only k1 aligned |
308 | while (inputP < endP) { |
309 | LOAD_DATA(load, loadu) |
310 | CONVOLVE_4_SAMPLES |
311 | } |
312 | } else { // both non-aligned |
313 | while (inputP < endP) { |
314 | LOAD_DATA(loadu, loadu) |
315 | CONVOLVE_4_SAMPLES |
316 | } |
317 | } |
318 | |
319 | // Summarize the SSE results to sum1 and sum2. |
320 | float* groupSumP = reinterpret_cast<float*>(&sums1); |
321 | sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
322 | groupSumP = reinterpret_cast<float*>(&sums2); |
323 | sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
324 | |
325 | n %= 4; |
326 | while (n) { |
327 | CONVOLVE_ONE_SAMPLE |
328 | n--; |
329 | } |
330 | #else |
331 | // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better. |
332 | |
333 | // Optimize size 32 and size 64 kernels by unrolling the while loop. |
334 | // A 20 - 30% speed improvement was measured in some cases by using this approach. |
335 | |
336 | if (n == 32) { |
337 | CONVOLVE_ONE_SAMPLE // 1 |
338 | CONVOLVE_ONE_SAMPLE // 2 |
339 | CONVOLVE_ONE_SAMPLE // 3 |
340 | CONVOLVE_ONE_SAMPLE // 4 |
341 | CONVOLVE_ONE_SAMPLE // 5 |
342 | CONVOLVE_ONE_SAMPLE // 6 |
343 | CONVOLVE_ONE_SAMPLE // 7 |
344 | CONVOLVE_ONE_SAMPLE // 8 |
345 | CONVOLVE_ONE_SAMPLE // 9 |
346 | CONVOLVE_ONE_SAMPLE // 10 |
347 | CONVOLVE_ONE_SAMPLE // 11 |
348 | CONVOLVE_ONE_SAMPLE // 12 |
349 | CONVOLVE_ONE_SAMPLE // 13 |
350 | CONVOLVE_ONE_SAMPLE // 14 |
351 | CONVOLVE_ONE_SAMPLE // 15 |
352 | CONVOLVE_ONE_SAMPLE // 16 |
353 | CONVOLVE_ONE_SAMPLE // 17 |
354 | CONVOLVE_ONE_SAMPLE // 18 |
355 | CONVOLVE_ONE_SAMPLE // 19 |
356 | CONVOLVE_ONE_SAMPLE // 20 |
357 | CONVOLVE_ONE_SAMPLE // 21 |
358 | CONVOLVE_ONE_SAMPLE // 22 |
359 | CONVOLVE_ONE_SAMPLE // 23 |
360 | CONVOLVE_ONE_SAMPLE // 24 |
361 | CONVOLVE_ONE_SAMPLE // 25 |
362 | CONVOLVE_ONE_SAMPLE // 26 |
363 | CONVOLVE_ONE_SAMPLE // 27 |
364 | CONVOLVE_ONE_SAMPLE // 28 |
365 | CONVOLVE_ONE_SAMPLE // 29 |
366 | CONVOLVE_ONE_SAMPLE // 30 |
367 | CONVOLVE_ONE_SAMPLE // 31 |
368 | CONVOLVE_ONE_SAMPLE // 32 |
369 | } else if (n == 64) { |
370 | CONVOLVE_ONE_SAMPLE // 1 |
371 | CONVOLVE_ONE_SAMPLE // 2 |
372 | CONVOLVE_ONE_SAMPLE // 3 |
373 | CONVOLVE_ONE_SAMPLE // 4 |
374 | CONVOLVE_ONE_SAMPLE // 5 |
375 | CONVOLVE_ONE_SAMPLE // 6 |
376 | CONVOLVE_ONE_SAMPLE // 7 |
377 | CONVOLVE_ONE_SAMPLE // 8 |
378 | CONVOLVE_ONE_SAMPLE // 9 |
379 | CONVOLVE_ONE_SAMPLE // 10 |
380 | CONVOLVE_ONE_SAMPLE // 11 |
381 | CONVOLVE_ONE_SAMPLE // 12 |
382 | CONVOLVE_ONE_SAMPLE // 13 |
383 | CONVOLVE_ONE_SAMPLE // 14 |
384 | CONVOLVE_ONE_SAMPLE // 15 |
385 | CONVOLVE_ONE_SAMPLE // 16 |
386 | CONVOLVE_ONE_SAMPLE // 17 |
387 | CONVOLVE_ONE_SAMPLE // 18 |
388 | CONVOLVE_ONE_SAMPLE // 19 |
389 | CONVOLVE_ONE_SAMPLE // 20 |
390 | CONVOLVE_ONE_SAMPLE // 21 |
391 | CONVOLVE_ONE_SAMPLE // 22 |
392 | CONVOLVE_ONE_SAMPLE // 23 |
393 | CONVOLVE_ONE_SAMPLE // 24 |
394 | CONVOLVE_ONE_SAMPLE // 25 |
395 | CONVOLVE_ONE_SAMPLE // 26 |
396 | CONVOLVE_ONE_SAMPLE // 27 |
397 | CONVOLVE_ONE_SAMPLE // 28 |
398 | CONVOLVE_ONE_SAMPLE // 29 |
399 | CONVOLVE_ONE_SAMPLE // 30 |
400 | CONVOLVE_ONE_SAMPLE // 31 |
401 | CONVOLVE_ONE_SAMPLE // 32 |
402 | CONVOLVE_ONE_SAMPLE // 33 |
403 | CONVOLVE_ONE_SAMPLE // 34 |
404 | CONVOLVE_ONE_SAMPLE // 35 |
405 | CONVOLVE_ONE_SAMPLE // 36 |
406 | CONVOLVE_ONE_SAMPLE // 37 |
407 | CONVOLVE_ONE_SAMPLE // 38 |
408 | CONVOLVE_ONE_SAMPLE // 39 |
409 | CONVOLVE_ONE_SAMPLE // 40 |
410 | CONVOLVE_ONE_SAMPLE // 41 |
411 | CONVOLVE_ONE_SAMPLE // 42 |
412 | CONVOLVE_ONE_SAMPLE // 43 |
413 | CONVOLVE_ONE_SAMPLE // 44 |
414 | CONVOLVE_ONE_SAMPLE // 45 |
415 | CONVOLVE_ONE_SAMPLE // 46 |
416 | CONVOLVE_ONE_SAMPLE // 47 |
417 | CONVOLVE_ONE_SAMPLE // 48 |
418 | CONVOLVE_ONE_SAMPLE // 49 |
419 | CONVOLVE_ONE_SAMPLE // 50 |
420 | CONVOLVE_ONE_SAMPLE // 51 |
421 | CONVOLVE_ONE_SAMPLE // 52 |
422 | CONVOLVE_ONE_SAMPLE // 53 |
423 | CONVOLVE_ONE_SAMPLE // 54 |
424 | CONVOLVE_ONE_SAMPLE // 55 |
425 | CONVOLVE_ONE_SAMPLE // 56 |
426 | CONVOLVE_ONE_SAMPLE // 57 |
427 | CONVOLVE_ONE_SAMPLE // 58 |
428 | CONVOLVE_ONE_SAMPLE // 59 |
429 | CONVOLVE_ONE_SAMPLE // 60 |
430 | CONVOLVE_ONE_SAMPLE // 61 |
431 | CONVOLVE_ONE_SAMPLE // 62 |
432 | CONVOLVE_ONE_SAMPLE // 63 |
433 | CONVOLVE_ONE_SAMPLE // 64 |
434 | } else { |
435 | while (n--) { |
436 | // Non-optimized using actual while loop. |
437 | CONVOLVE_ONE_SAMPLE |
438 | } |
439 | } |
440 | #endif |
441 | } |
442 | |
443 | // Linearly interpolate the two "convolutions". |
444 | double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2; |
445 | |
446 | *destination++ = result; |
447 | |
448 | // Advance the virtual index. |
449 | m_virtualSourceIndex += m_scaleFactor; |
450 | |
451 | --numberOfDestinationFrames; |
452 | if (!numberOfDestinationFrames) |
453 | return; |
454 | } |
455 | |
456 | // Wrap back around to the start. |
457 | m_virtualSourceIndex -= m_blockSize; |
458 | |
459 | // Step (3) Copy r3 to r1 and r4 to r2. |
460 | // This wraps the last input frames back to the start of the buffer. |
461 | memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2)); |
462 | memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2)); |
463 | |
464 | // Step (4) |
465 | // Refresh the buffer with more input. |
466 | consumeSource(r5, m_blockSize); |
467 | } |
468 | } |
469 | |
470 | } // namespace WebCore |
471 | |
472 | #endif // ENABLE(WEB_AUDIO) |
473 | |