| 1 | /* |
| 2 | * Copyright (C) 2013-2019 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions |
| 6 | * are met: |
| 7 | * 1. Redistributions of source code must retain the above copyright |
| 8 | * notice, this list of conditions and the following disclaimer. |
| 9 | * 2. Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * |
| 13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| 17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| 21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 24 | */ |
| 25 | |
| 26 | #include "config.h" |
| 27 | #include "BinarySwitch.h" |
| 28 | |
| 29 | #if ENABLE(JIT) |
| 30 | |
| 31 | #include "JSCInlines.h" |
| 32 | #include <wtf/ListDump.h> |
| 33 | |
| 34 | namespace JSC { |
| 35 | |
| 36 | namespace BinarySwitchInternal { |
| 37 | static const bool verbose = false; |
| 38 | } |
| 39 | |
| 40 | static unsigned globalCounter; // We use a different seed every time we are invoked. |
| 41 | |
| 42 | BinarySwitch::BinarySwitch(GPRReg value, const Vector<int64_t>& cases, Type type) |
| 43 | : m_type(type) |
| 44 | , m_value(value) |
| 45 | , m_weakRandom(globalCounter++) |
| 46 | , m_index(0) |
| 47 | , m_caseIndex(UINT_MAX) |
| 48 | { |
| 49 | if (cases.isEmpty()) |
| 50 | return; |
| 51 | |
| 52 | if (BinarySwitchInternal::verbose) |
| 53 | dataLog("Original cases: " , listDump(cases), "\n" ); |
| 54 | |
| 55 | for (unsigned i = 0; i < cases.size(); ++i) |
| 56 | m_cases.append(Case(cases[i], i)); |
| 57 | |
| 58 | std::sort(m_cases.begin(), m_cases.end()); |
| 59 | |
| 60 | if (BinarySwitchInternal::verbose) |
| 61 | dataLog("Sorted cases: " , listDump(m_cases), "\n" ); |
| 62 | |
| 63 | #if !ASSERT_DISABLED |
| 64 | for (unsigned i = 1; i < m_cases.size(); ++i) |
| 65 | ASSERT(m_cases[i - 1] < m_cases[i], i, m_cases.size(), m_cases[i].value, m_cases[i].index); |
| 66 | #endif |
| 67 | |
| 68 | build(0, false, m_cases.size()); |
| 69 | } |
| 70 | |
| 71 | BinarySwitch::~BinarySwitch() |
| 72 | { |
| 73 | } |
| 74 | |
| 75 | bool BinarySwitch::advance(MacroAssembler& jit) |
| 76 | { |
| 77 | if (m_cases.isEmpty()) { |
| 78 | m_fallThrough.append(jit.jump()); |
| 79 | return false; |
| 80 | } |
| 81 | |
| 82 | if (m_index == m_branches.size()) { |
| 83 | RELEASE_ASSERT(m_jumpStack.isEmpty()); |
| 84 | return false; |
| 85 | } |
| 86 | |
| 87 | for (;;) { |
| 88 | const BranchCode& code = m_branches[m_index++]; |
| 89 | switch (code.kind) { |
| 90 | case NotEqualToFallThrough: |
| 91 | switch (m_type) { |
| 92 | case Int32: |
| 93 | m_fallThrough.append(jit.branch32( |
| 94 | MacroAssembler::NotEqual, m_value, |
| 95 | MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value)))); |
| 96 | break; |
| 97 | case IntPtr: |
| 98 | m_fallThrough.append(jit.branchPtr( |
| 99 | MacroAssembler::NotEqual, m_value, |
| 100 | MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value))))); |
| 101 | break; |
| 102 | } |
| 103 | break; |
| 104 | case NotEqualToPush: |
| 105 | switch (m_type) { |
| 106 | case Int32: |
| 107 | m_jumpStack.append(jit.branch32( |
| 108 | MacroAssembler::NotEqual, m_value, |
| 109 | MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value)))); |
| 110 | break; |
| 111 | case IntPtr: |
| 112 | m_jumpStack.append(jit.branchPtr( |
| 113 | MacroAssembler::NotEqual, m_value, |
| 114 | MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value))))); |
| 115 | break; |
| 116 | } |
| 117 | break; |
| 118 | case LessThanToPush: |
| 119 | switch (m_type) { |
| 120 | case Int32: |
| 121 | m_jumpStack.append(jit.branch32( |
| 122 | MacroAssembler::LessThan, m_value, |
| 123 | MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value)))); |
| 124 | break; |
| 125 | case IntPtr: |
| 126 | m_jumpStack.append(jit.branchPtr( |
| 127 | MacroAssembler::LessThan, m_value, |
| 128 | MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value))))); |
| 129 | break; |
| 130 | } |
| 131 | break; |
| 132 | case Pop: |
| 133 | m_jumpStack.takeLast().link(&jit); |
| 134 | break; |
| 135 | case ExecuteCase: |
| 136 | m_caseIndex = code.index; |
| 137 | return true; |
| 138 | } |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | class RandomNumberGenerator { |
| 143 | public: |
| 144 | using result_type = uint32_t; |
| 145 | |
| 146 | RandomNumberGenerator(WeakRandom& weakRandom) |
| 147 | : m_weakRandom(weakRandom) |
| 148 | { |
| 149 | } |
| 150 | |
| 151 | uint32_t operator()() |
| 152 | { |
| 153 | return m_weakRandom.getUint32(); |
| 154 | } |
| 155 | |
| 156 | static constexpr uint32_t min() { return std::numeric_limits<uint32_t>::min(); } |
| 157 | static constexpr uint32_t max() { return std::numeric_limits<uint32_t>::max(); } |
| 158 | |
| 159 | private: |
| 160 | WeakRandom& m_weakRandom; |
| 161 | }; |
| 162 | |
| 163 | void BinarySwitch::build(unsigned start, bool hardStart, unsigned end) |
| 164 | { |
| 165 | if (BinarySwitchInternal::verbose) |
| 166 | dataLog("Building with start = " , start, ", hardStart = " , hardStart, ", end = " , end, "\n" ); |
| 167 | |
| 168 | auto append = [&] (const BranchCode& code) { |
| 169 | if (BinarySwitchInternal::verbose) |
| 170 | dataLog("==> " , code, "\n" ); |
| 171 | m_branches.append(code); |
| 172 | }; |
| 173 | |
| 174 | unsigned size = end - start; |
| 175 | |
| 176 | RELEASE_ASSERT(size); |
| 177 | |
| 178 | // This code uses some random numbers to keep things balanced. It's important to keep in mind |
| 179 | // that this does not improve average-case throughput under the assumption that all cases fire |
| 180 | // with equal probability. It just ensures that there will not be some switch structure that |
| 181 | // when combined with some input will always produce pathologically good or pathologically bad |
| 182 | // performance. |
| 183 | |
| 184 | const unsigned leafThreshold = 3; |
| 185 | |
| 186 | if (size <= leafThreshold) { |
| 187 | if (BinarySwitchInternal::verbose) |
| 188 | dataLog("It's a leaf.\n" ); |
| 189 | |
| 190 | // It turns out that for exactly three cases or less, it's better to just compare each |
| 191 | // case individually. This saves 1/6 of a branch on average, and up to 1/3 of a branch in |
| 192 | // extreme cases where the divide-and-conquer bottoms out in a lot of 3-case subswitches. |
| 193 | // |
| 194 | // This assumes that we care about the cost of hitting some case more than we care about |
| 195 | // bottoming out in a default case. I believe that in most places where we use switch |
| 196 | // statements, we are more likely to hit one of the cases than we are to fall through to |
| 197 | // default. Intuitively, if we wanted to improve the performance of default, we would |
| 198 | // reduce the value of leafThreshold to 2 or even to 1. See below for a deeper discussion. |
| 199 | |
| 200 | bool allConsecutive = false; |
| 201 | |
| 202 | if ((hardStart || (start && m_cases[start - 1].value == m_cases[start].value - 1)) |
| 203 | && start + size < m_cases.size() |
| 204 | && m_cases[start + size - 1].value == m_cases[start + size].value - 1) { |
| 205 | allConsecutive = true; |
| 206 | for (unsigned i = 0; i < size - 1; ++i) { |
| 207 | if (m_cases[start + i].value + 1 != m_cases[start + i + 1].value) { |
| 208 | allConsecutive = false; |
| 209 | break; |
| 210 | } |
| 211 | } |
| 212 | } |
| 213 | |
| 214 | if (BinarySwitchInternal::verbose) |
| 215 | dataLog("allConsecutive = " , allConsecutive, "\n" ); |
| 216 | |
| 217 | Vector<unsigned, 3> localCaseIndices; |
| 218 | for (unsigned i = 0; i < size; ++i) |
| 219 | localCaseIndices.append(start + i); |
| 220 | |
| 221 | std::shuffle( |
| 222 | localCaseIndices.begin(), localCaseIndices.end(), |
| 223 | RandomNumberGenerator(m_weakRandom)); |
| 224 | |
| 225 | for (unsigned i = 0; i < size - 1; ++i) { |
| 226 | append(BranchCode(NotEqualToPush, localCaseIndices[i])); |
| 227 | append(BranchCode(ExecuteCase, localCaseIndices[i])); |
| 228 | append(BranchCode(Pop)); |
| 229 | } |
| 230 | |
| 231 | if (!allConsecutive) |
| 232 | append(BranchCode(NotEqualToFallThrough, localCaseIndices.last())); |
| 233 | |
| 234 | append(BranchCode(ExecuteCase, localCaseIndices.last())); |
| 235 | return; |
| 236 | } |
| 237 | |
| 238 | if (BinarySwitchInternal::verbose) |
| 239 | dataLog("It's not a leaf.\n" ); |
| 240 | |
| 241 | // There are two different strategies we could consider here: |
| 242 | // |
| 243 | // Isolate median and split: pick a median and check if the comparison value is equal to it; |
| 244 | // if so, execute the median case. Otherwise check if the value is less than the median, and |
| 245 | // recurse left or right based on this. This has two subvariants: we could either first test |
| 246 | // equality for the median and then do the less-than, or we could first do the less-than and |
| 247 | // then check equality on the not-less-than path. |
| 248 | // |
| 249 | // Ignore median and split: do a less-than comparison on a value that splits the cases in two |
| 250 | // equal-sized halves. Recurse left or right based on the comparison. Do not test for equality |
| 251 | // against the median (or anything else); let the recursion handle those equality comparisons |
| 252 | // once we bottom out in a list that case 3 cases or less (see above). |
| 253 | // |
| 254 | // I'll refer to these strategies as Isolate and Ignore. I initially believed that Isolate |
| 255 | // would be faster since it leads to less branching for some lucky cases. It turns out that |
| 256 | // Isolate is almost a total fail in the average, assuming all cases are equally likely. How |
| 257 | // bad Isolate is depends on whether you believe that doing two consecutive branches based on |
| 258 | // the same comparison is cheaper than doing the compare/branches separately. This is |
| 259 | // difficult to evaluate. For small immediates that aren't blinded, we just care about |
| 260 | // avoiding a second compare instruction. For large immediates or when blinding is in play, we |
| 261 | // also care about the instructions used to materialize the immediate a second time. Isolate |
| 262 | // can help with both costs since it involves first doing a < compare+branch on some value, |
| 263 | // followed by a == compare+branch on the same exact value (or vice-versa). Ignore will do a < |
| 264 | // compare+branch on some value, and then the == compare+branch on that same value will happen |
| 265 | // much later. |
| 266 | // |
| 267 | // To evaluate these costs, I wrote the recurrence relation for Isolate and Ignore, assuming |
| 268 | // that ComparisonCost is the cost of a compare+branch and ChainedComparisonCost is the cost |
| 269 | // of a compare+branch on some value that you've just done another compare+branch for. These |
| 270 | // recurrence relations compute the total cost incurred if you executed the switch statement |
| 271 | // on each matching value. So the average cost of hitting some case can be computed as |
| 272 | // Isolate[n]/n or Ignore[n]/n, respectively for the two relations. |
| 273 | // |
| 274 | // Isolate[1] = ComparisonCost |
| 275 | // Isolate[2] = (2 + 1) * ComparisonCost |
| 276 | // Isolate[3] = (3 + 2 + 1) * ComparisonCost |
| 277 | // Isolate[n_] := With[ |
| 278 | // {medianIndex = Floor[n/2] + If[EvenQ[n], RandomInteger[], 1]}, |
| 279 | // ComparisonCost + ChainedComparisonCost + |
| 280 | // (ComparisonCost * (medianIndex - 1) + Isolate[medianIndex - 1]) + |
| 281 | // (2 * ComparisonCost * (n - medianIndex) + Isolate[n - medianIndex])] |
| 282 | // |
| 283 | // Ignore[1] = ComparisonCost |
| 284 | // Ignore[2] = (2 + 1) * ComparisonCost |
| 285 | // Ignore[3] = (3 + 2 + 1) * ComparisonCost |
| 286 | // Ignore[n_] := With[ |
| 287 | // {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]}, |
| 288 | // (medianIndex * ComparisonCost + Ignore[medianIndex]) + |
| 289 | // ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])] |
| 290 | // |
| 291 | // This does not account for the average cost of hitting the default case. See further below |
| 292 | // for a discussion of that. |
| 293 | // |
| 294 | // It turns out that for ComparisonCost = 1 and ChainedComparisonCost = 1, Ignore is always |
| 295 | // better than Isolate. If we assume that ChainedComparisonCost = 0, then Isolate wins for |
| 296 | // switch statements that have 20 cases or fewer, though the margin of victory is never large |
| 297 | // - it might sometimes save an average of 0.3 ComparisonCost. For larger switch statements, |
| 298 | // we see divergence between the two with Ignore winning. This is of course rather |
| 299 | // unrealistic since the chained comparison is never free. For ChainedComparisonCost = 0.5, we |
| 300 | // see Isolate winning for 10 cases or fewer, by maybe 0.2 ComparisonCost. Again we see |
| 301 | // divergence for large switches with Ignore winning, for example if a switch statement has |
| 302 | // 100 cases then Ignore saves one branch on average. |
| 303 | // |
| 304 | // Our current JIT backends don't provide for optimization for chained comparisons, except for |
| 305 | // reducing the code for materializing the immediate if the immediates are large or blinding |
| 306 | // comes into play. Probably our JIT backends live somewhere north of |
| 307 | // ChainedComparisonCost = 0.5. |
| 308 | // |
| 309 | // This implies that using the Ignore strategy is likely better. If we wanted to incorporate |
| 310 | // the Isolate strategy, we'd want to determine the switch size threshold at which the two |
| 311 | // cross over and then use Isolate for switches that are smaller than that size. |
| 312 | // |
| 313 | // The average cost of hitting the default case is similar, but involves a different cost for |
| 314 | // the base cases: you have to assume that you will always fail each branch. For the Ignore |
| 315 | // strategy we would get this recurrence relation; the same kind of thing happens to the |
| 316 | // Isolate strategy: |
| 317 | // |
| 318 | // Ignore[1] = ComparisonCost |
| 319 | // Ignore[2] = (2 + 2) * ComparisonCost |
| 320 | // Ignore[3] = (3 + 3 + 3) * ComparisonCost |
| 321 | // Ignore[n_] := With[ |
| 322 | // {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]}, |
| 323 | // (medianIndex * ComparisonCost + Ignore[medianIndex]) + |
| 324 | // ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])] |
| 325 | // |
| 326 | // This means that if we cared about the default case more, we would likely reduce |
| 327 | // leafThreshold. Reducing it to 2 would reduce the average cost of the default case by 1/3 |
| 328 | // in the most extreme cases (num switch cases = 3, 6, 12, 24, ...). But it would also |
| 329 | // increase the average cost of taking one of the non-default cases by 1/3. Typically the |
| 330 | // difference is 1/6 in either direction. This makes it a very simple trade-off: if we believe |
| 331 | // that the default case is more important then we would want leafThreshold to be 2, and the |
| 332 | // default case would become 1/6 faster on average. But we believe that most switch statements |
| 333 | // are more likely to take one of the cases than the default, so we use leafThreshold = 3 |
| 334 | // and get a 1/6 speed-up on average for taking an explicit case. |
| 335 | |
| 336 | unsigned medianIndex = (start + end) / 2; |
| 337 | |
| 338 | if (BinarySwitchInternal::verbose) |
| 339 | dataLog("medianIndex = " , medianIndex, "\n" ); |
| 340 | |
| 341 | // We want medianIndex to point to the thing we will do a less-than compare against. We want |
| 342 | // this less-than compare to split the current sublist into equal-sized sublists, or |
| 343 | // nearly-equal-sized with some randomness if we're in the odd case. With the above |
| 344 | // calculation, in the odd case we will have medianIndex pointing at either the element we |
| 345 | // want or the element to the left of the one we want. Consider the case of five elements: |
| 346 | // |
| 347 | // 0 1 2 3 4 |
| 348 | // |
| 349 | // start will be 0, end will be 5. The average is 2.5, which rounds down to 2. If we do |
| 350 | // value < 2, then we will split the list into 2 elements on the left and three on the right. |
| 351 | // That's pretty good, but in this odd case we'd like to at random choose 3 instead to ensure |
| 352 | // that we don't become unbalanced on the right. This does not improve throughput since one |
| 353 | // side will always get shafted, and that side might still be odd, in which case it will also |
| 354 | // have two sides and one of them will get shafted - and so on. We just want to avoid |
| 355 | // deterministic pathologies. |
| 356 | // |
| 357 | // In the even case, we will always end up pointing at the element we want: |
| 358 | // |
| 359 | // 0 1 2 3 |
| 360 | // |
| 361 | // start will be 0, end will be 4. So, the average is 2, which is what we'd like. |
| 362 | if (size & 1) { |
| 363 | RELEASE_ASSERT(medianIndex - start + 1 == end - medianIndex); |
| 364 | medianIndex += m_weakRandom.getUint32() & 1; |
| 365 | } else |
| 366 | RELEASE_ASSERT(medianIndex - start == end - medianIndex); |
| 367 | |
| 368 | RELEASE_ASSERT(medianIndex > start); |
| 369 | RELEASE_ASSERT(medianIndex + 1 < end); |
| 370 | |
| 371 | if (BinarySwitchInternal::verbose) |
| 372 | dataLog("fixed medianIndex = " , medianIndex, "\n" ); |
| 373 | |
| 374 | append(BranchCode(LessThanToPush, medianIndex)); |
| 375 | build(medianIndex, true, end); |
| 376 | append(BranchCode(Pop)); |
| 377 | build(start, hardStart, medianIndex); |
| 378 | } |
| 379 | |
| 380 | void BinarySwitch::Case::dump(PrintStream& out) const |
| 381 | { |
| 382 | out.print("<value: " , value, ", index: " , index, ">" ); |
| 383 | } |
| 384 | |
| 385 | void BinarySwitch::BranchCode::dump(PrintStream& out) const |
| 386 | { |
| 387 | switch (kind) { |
| 388 | case NotEqualToFallThrough: |
| 389 | out.print("NotEqualToFallThrough" ); |
| 390 | break; |
| 391 | case NotEqualToPush: |
| 392 | out.print("NotEqualToPush" ); |
| 393 | break; |
| 394 | case LessThanToPush: |
| 395 | out.print("LessThanToPush" ); |
| 396 | break; |
| 397 | case Pop: |
| 398 | out.print("Pop" ); |
| 399 | break; |
| 400 | case ExecuteCase: |
| 401 | out.print("ExecuteCase" ); |
| 402 | break; |
| 403 | } |
| 404 | |
| 405 | if (index != UINT_MAX) |
| 406 | out.print("(" , index, ")" ); |
| 407 | } |
| 408 | |
| 409 | } // namespace JSC |
| 410 | |
| 411 | #endif // ENABLE(JIT) |
| 412 | |
| 413 | |