Finding peaks with SIMD

Categories: code, julia
Author: Allen Hill
Published: March 2, 2026

A couple of years ago, I added a benchmarks page to the documentation for my Peaks.jl package (a Julia package for finding peaks in vector data). When I compared the performance of my Peaks.jl functions against other obvious peak finding implementations, I was shocked to discover that some of the other functions were faster![1] 🙀 This completely unacceptable situation was just the nerd-snipe I needed to develop the new fastest™ peak finding function.[2] Considering the relative simplicity of the existing peak finding algorithm, any meaningfully faster solution would have to involve explicit SIMD vectorization.[3]

SIMD stands for single-instruction, multiple-data. CPUs run software by executing one instruction at a time.[4] Normal CPU instructions operate on scalar values (e.g. add one number to another), but many CPUs have SIMD extensions, which add specialized instructions that perform common operations (addition, multiplication, comparison, etc.) on vector values (e.g. add 4 pairs of numbers). This means the CPU does more work at once, which can be very handy for speeding up software! Unfortunately, SIMD forces a different, less flexible style of programming: branching (think if statements) is harder or impossible, and SIMD instructions operate on a fixed-width vector (e.g. 4 pairs of numbers are always added, and if fewer than 4 pairs are available, special handling or a non-SIMD fallback is needed).

Conveniently, I’ve wanted to learn SIMD for a while, and I just needed a personally relevant, real problem to experiment on. Some features can’t be supported in a SIMD algorithm (e.g. no Missing support),[5] but it was non-negotiable that both the SIMD and non-SIMD functions maintain the same basic definition of a peak:

A peak MUST be preceded by lesser element(s), MAY be followed by equal element(s), and MUST be ultimately followed by lesser element(s).

A peak where the “largest” element(s) occurs more than once is called a “plateau”. The location of a plateau is defined as the first largest element; this definition allows appending data without altering the location of an existing peak.

I’m using Julia in this post (since I’m describing the algorithm/code from my Peaks.jl package), but the concepts should transfer to any language that allows explicit use of SIMD. We’ll use this signal to work out examples:

[Figure: an example signal containing a normal peak, a “false” plateau, and a true plateau]

Visually, we can recognize a normal peak at index 2. Next, there is a “false” plateau at index 4, where the signal rises, plateaus, but ultimately rises again (i.e. doesn’t meet the definition for a plateau peak). Finally, there is a true plateau at index 6.

Based on the definition of a peak, we need 3 comparisons for each element. The first part of the function/SIMD loop needs to set up the offsets, load the data, and perform the SIMD comparisons, like this:

offset₋₁ = VecRange{8}(-1)    # (1)
offset   = VecRange{8}( 0)
offset₊₁ = VecRange{8}(+1)

for i in firstindex(x)+1:8:lastindex(x)-1
  xᵢ₋₁ = x[i + offset₋₁]      # (2)
  xᵢ   = x[i + offset]
  xᵢ₊₁ = x[i + offset₊₁]

  rise = bitmask(xᵢ₋₁  < xᵢ)  # (3)
  fall = bitmask(xᵢ₊₁  < xᵢ)
  flat = bitmask(xᵢ₊₁ == xᵢ)

  # peak identification happens here
end

(1) Define SIMD ranges to use as offsets when loading signal elements into SIMD registers.[6] These are SIMD-specific equivalents of normal ranges (e.g. VecRange{8}(-1) is functionally equivalent to -1:6).
(2) Load 8 signal elements from a range of indices into special SIMD vector registers (e.g. during the first loop iteration, xᵢ₋₁ will load elements 1:8 from x, i.e. xᵢ₋₁ == x[2 .+ (-1:6)]).[7]
(3) The comparisons produce SIMD vectors holding 8 Bools, each holding the result (true or false) of a single comparison. The bitmask function converts each Bool in the vector into a single bit (1 or 0) in a byte (or a larger/wider bitset, i.e. a set of bits). Since many CPUs have native hardware support for 64- or 32-bit values, we can later process many more elements/peaks at a time using bitwise and other basic math operators.

Bitwise operators are functions (compiled to single CPU instructions) that are focused on manipulating the bits in a value/bitset; most bitwise operators apply logical operations to every bit in a bitset at once. There are bitwise operators for all common logic operations: AND (operator symbol is &), OR (|), NOT (~), etc. Some other operators are the left (<<) and right (>>) shifts, which insert a number of 0 bits at the least-significant-bit (LSB) end or the most-significant-bit (MSB) end, respectively, and drop the bits that fall off the other end; e.g. 0b00010101 >> 1 shifts 0b00010101 right by one bit, resulting in 0b00001010.
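For a concrete feel, here are the basic operators at work on two 8-bit values (ordinary Julia operators on UInt8 literals, nothing Peaks.jl-specific):

```julia
a = 0b00010101
b = 0b10000001

a & b    # 0b00000001  AND: set only where both bits are set
a | b    # 0b10010101  OR: set where either bit is set
~a       # 0b11101010  NOT: every bit flipped
a << 1   # 0b00101010  left shift: a 0 enters at the LSB end
a >> 1   # 0b00001010  right shift: a 0 enters at the MSB end
```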

Bitwise operations normally affect every bit the same way, but sometimes this is undesirable. A common technique is the use of “bitmasks” to control which bit(s) in a set should or shouldn’t be changed by the code. The only difference between a bitmask and a bitset is purpose/intent.

Using the above code on our example signal produces the following bitsets for the three comparisons:

# results shown as bitstrings
julia> rise = bitmask(xᵢ₋₁ < xᵢ)
0b00010101

julia> fall = bitmask(xᵢ₊₁ < xᵢ)
0b10000001

julia> flat = bitmask(xᵢ₊₁ == xᵢ)
0b01110100

Bit ordering in bitsets follows index ordering, such that the least-significant bit (LSB) in a bitset corresponds to the lowest index in that slice of data. Bitstrings are printed most-significant bit (MSB) to LSB, from left-to-right, so the LSB is the rightmost digit.
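In Julia terms (bitstring is a Base function), using the flat bitset from above:

```julia
flat = 0b01110100
bitstring(flat)          # "01110100": printed MSB to LSB, so the LSB is rightmost
flat & 0b00000100 != 0   # true: bit 2 (counting from the LSB, starting at 0) is
                         # set, which corresponds to the flat at signal index 4
```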

We know from looking at the signal plot that there are peaks at indices 2 and 6, which means that by the end of our SIMD loop, we should have a bitset with set (1/true) bits at 1 and 5: 0b00010001.[8] Study the three bitsets, and you might notice that different approaches are needed to identify peaks and plateaus.

For normal peaks, every given bit position in rise and fall corresponds to a matched scalar comparison, xᵢ₋₁ < xᵢ and xᵢ₊₁ < xᵢ, for a given i. Normal peaks must have the same set bits in rise AND fall to indicate a peak (i.e. the previous and next elements are less than the current element). Using a bitwise & will produce set bits only if both bits in the pair are set (1/true). So, the simple equation rise & fall will keep only those matched bits, and all other bits will be cleared; for our signal, bit 1 is the only set bit in both rise and fall, resulting in 0b00000001. One peak identified, one peak to go.
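As a quick sanity check in plain scalar Julia, using the example bitsets and mapping the surviving bit back to a signal index (trailing_zeros is a Base function):

```julia
rise = 0b00010101
fall = 0b10000001

peaks = rise & fall    # 0b00000001: only matched rise/fall bits survive
trailing_zeros(peaks)  # 0: the lowest set bit is bit position 0, which maps
                       # to signal index 2 (the loop's first chunk starts at i = 2)
```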

Unfortunately, identifying plateaus is trickier. After the normal peaks are identified with rise & fall, fall can be right shifted to align the signal dips with the ends of plateaus. But the set bits in rise and fall for the beginning and end of plateaus will still not be aligned (because the flat part of a plateau might run for multiple elements), so a simple combination like rise & flat & fall won’t work: the same bits might be set in rise and flat, but the set bit in fall could be later.

Every plateau will have a bit run (one or more consecutively set bits, if the plateau has multiple equal values) where the run’s LSB is set in rise and the run’s MSB is set in fall. Multiple plateaus might be present in a flat bitset, and the LSB of each bit run is that peak’s location (bit 5 in our example flat).

We can filter “false” plateaus from flat by first clearing any bit runs that don’t have a set bit in rise matching the start (LSB) bit of the run. Next, clear any bit runs that don’t have a set bit in fall matching the end (MSB) bit of the run. Finally, clear all the remaining bits in flat except for each run’s LSB. We’ll need a few bitwise focused functions to filter the flat bitset as described:

  1. Functions to clear bit runs that don’t begin or end with an LSB/MSB mask
    • lsb_mask(0b01110100, 0b00010101) == 0b01110100
    • msb_mask(0b01110100, 0b01000000) == 0b01110000
  2. A function to clear all bits except for the (potentially multiple) local LSB of bit runs (e.g. run_lsb(0b01110) == 0b00010)

Masking bit runs with the run LSB

This function can clear any bit runs in flat that don’t begin with a set bit in rise: bits & ~(bits + mask). The premise is that adding the LSB of any bit run will (by carrying) invert all previously set bits in the run. So, by adding the LSB of only the bit runs we want and then inverting the bitset, we get a bitmask that can be used with an AND op to clear all unwanted bit runs.
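A minimal scalar sketch (the name follows the algorithm listing below; the real Peaks.jl helper may differ in details):

```julia
# Clear any bit runs in `bits` whose lowest (LSB) bit is not set in `mask`.
# Adding a run's LSB carries through and flips every bit of that run, so
# after inverting the sum, an AND keeps exactly the runs we asked for.
runs_lsb_mask(bits, mask) = bits & ~(bits + mask)

runs_lsb_mask(0b01110100, 0b00010101)  # 0b01110100: both runs start on a mask bit
runs_lsb_mask(0b01110100, 0b00010000)  # 0b01110000: the single-bit run is cleared
```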

(In the original interactive post, you can click on the bits in the bits and mask bitstrings, or change the hex for the byte, and watch how the function clears bit runs that don’t have their LSB set in the mask.)

Masking bit runs with the run MSB

I originally didn’t have a complete understanding of the problem space, and missed the importance of carrying (particularly carry direction) for generalizing the LSB mask function to MSB masks. I made do with bitreverse operations,[9] but the downside was 3 relatively[10] expensive bitreverse calls. Off and on for several years, I thought about how to solve this better, until I recently connected the dots that carrying, and carry direction, were the core of the problem. With the proper language to describe what I needed, some research led me to adder algorithms. With a little prompting, Claude produced a reverse-carry adder based on the Kogge-Stone algorithm, but using right shifts.[11]

That accomplished, we can substitute the reverse-carry adder into the same equation used for the LSB mask: bits & ~(bits ⊞ mask), where I’m using ⊞ as the symbol for the reverse-carry adder. The new reverse-carry function is ~2x faster than the original bitreverse-based function![12] This new function is a direct counterpart of the last one: it can clear any bit runs in flat that don’t end with a set bit in fall.
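Here’s one way such a reverse-carry adder can look. This is my reconstruction from the description above, assuming a Kogge-Stone carry-lookahead structure with the shifts flipped; the name reverse_add is hypothetical, and the actual Claude-written code may differ:

```julia
# Reverse-carry addition: a Kogge-Stone carry-lookahead adder, but with
# every left shift replaced by a right shift, so carries propagate toward
# the least-significant bit instead of the most-significant bit.
function reverse_add(a::T, b::T) where {T<:Unsigned}
    g = a & b    # generate: positions that create a carry
    p = a ⊻ b    # propagate: positions that pass a carry along
    shift = 1
    while shift < 8 * sizeof(T)
        g |= p & (g >> shift)  # carries travel rightward
        p &= p >> shift
        shift <<= 1
    end
    return (a ⊻ b) ⊻ (g >> 1)
end

# The MSB twin of the LSB mask: clear bit runs whose highest (MSB) bit
# is not set in `mask`.
runs_msb_mask(bits, mask) = bits & ~reverse_add(bits, mask)

runs_msb_mask(0b01110100, 0b01000000)  # 0b01110000: only the run ending at bit 6 survives
```

As a sanity check, reverse_add(a, b) should agree with bitreverse(bitreverse(a) + bitreverse(b)) for unsigned values, without the three bitreverse calls.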

Calculating the LSB of bit runs

This function can be achieved with x & ~(x << 1). It is based on the fact that the LSB of every bit run is preceded by an unset bit (or will be after a left shift). Zeros become ones when inverted, so after shifting and inverting, the only set bits in both the original and manipulated bitset are the LSB of each bit run. Calling this function on a flat bitmask (that we’ve already filtered to only contain real plateaus) gives us the plateau locations.
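As a sketch, with the examples from this post:

```julia
# Keep only the lowest set bit (LSB) of each run of consecutive set bits.
run_lsb(x) = x & ~(x << 1)

run_lsb(0b01110)     # 0b00010: only the run's LSB survives
run_lsb(0b01110000)  # 0b00010000: e.g. the filtered flat bitset from our example
```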

Full algorithm

With the new bit-twiddling functions, the complete (ish) SIMD algorithm looks something like:

I’m glossing over the fussy auxiliary code to handle signal lengths that don’t work with the 8 element chunking, and plateaus that span multiple bytes/words.
offset₋₁ = VecRange{8}(-1)
offset   = VecRange{8}( 0)
offset₊₁ = VecRange{8}(+1)

for i in firstindex(x)+1:8:lastindex(x)-1
  xᵢ₋₁ = x[i + offset₋₁]
  xᵢ   = x[i + offset]
  xᵢ₊₁ = x[i + offset₊₁]

  rise = bitmask(xᵢ₋₁  < xᵢ)
  fall = bitmask(xᵢ₊₁  < xᵢ)
  flat = bitmask(xᵢ₊₁ == xᵢ)

  peaks = rise & fall               # (1)

  fall >>= 1                        # (2)
  flat = runs_lsb_mask(flat, rise)  # (3)
  flat = runs_msb_mask(flat, fall)

  peaks |= run_lsb(flat)            # (4)
end

(1) Capture normal peaks.
(2) Right shift the fall bitmask to align signal dips with the ends of plateaus.
(3) Clear bit runs that don’t begin or end with set bits in rise and fall.
(4) Add plateau locations to peaks.
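To see the whole pipeline work end-to-end, here is the body of the loop applied to the example signal’s bitsets in plain scalar Julia. The helper definitions are my own sketches of the functions described in this post (in particular, reverse_add is an assumed name for the reverse-carry adder):

```julia
runs_lsb_mask(bits, mask) = bits & ~(bits + mask)  # keep runs starting on a set mask bit

function reverse_add(a::T, b::T) where {T<:Unsigned}  # right-shifting Kogge-Stone adder
    g, p, shift = a & b, a ⊻ b, 1
    while shift < 8 * sizeof(T)
        g |= p & (g >> shift)
        p &= p >> shift
        shift <<= 1
    end
    return (a ⊻ b) ⊻ (g >> 1)
end
runs_msb_mask(bits, mask) = bits & ~reverse_add(bits, mask)  # keep runs ending on a set mask bit
run_lsb(x) = x & ~(x << 1)  # keep only each run's lowest bit

# Comparison bitsets for the example signal (from earlier)
rise = 0b00010101
fall = 0b10000001
flat = 0b01110100

peaks = rise & fall               # 0b00000001: the normal peak at index 2
fall >>= 1                        # 0b01000000: dips now align with plateau ends
flat = runs_lsb_mask(flat, rise)  # 0b01110100: both plateau runs begin with a rise
flat = runs_msb_mask(flat, fall)  # 0b01110000: the "false" plateau is cleared
peaks |= run_lsb(flat)            # 0b00010001: peaks at signal indices 2 and 6
```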

Ironically, the explicit SIMD was the most straightforward part of the code! Instead, the bitwise manipulation after the SIMD comparisons was the trickiest part. In the full SIMD algorithm, I use a pre-allocated BitVector to store the peaks bitsets. Using a BitVector has the double benefit of taking the peaks bitsets directly[13] (without needing to convert their representation) and avoiding allocations in the core loop.

Benchmarks

How much faster is the SIMD simplemaxima compared to the scalar argmaxima? Using random (normal) data, here’s the average time per peak for each function for different signal lengths. Check out the benchmarks page for more benchmark results.

[Figure: benchmark plot showing simplemaxima is ~7x faster than argmaxima across vector lengths]
🤖 AI disclosure

The algorithms and writing in this post were produced without AI. An LLM (Claude) helped write the code for the reverse-carry adder, the interactive bitwise math widgets, and the benchmark plot (which is adapted from the plot on the benchmarks page).

Footnotes

  1. These comparisons aren’t completely apples-to-apples, and technically disfavor Peaks.jl functions, which have a larger feature set. In particular, none of the compared functions detect plateau peaks.

  2. Even though I have never needed faster peak finding, or used peaks in a real-time/performance sensitive context 🙄

  3. Many compilers, including Julia’s, can auto-vectorize simple/recognizable code patterns. More complex code must be intentionally and manually refactored to explicitly use SIMD.

  4. I realize that’s not strictly true with superscalar processors; don’t @ me.

  5. I decided the strict keyword argument and larger window sizes were too difficult and/or not possible to support.

  6. Registers are where CPUs store data for instructions to immediately use.

  7. Not all hardware will support a so-called vector “width” of e.g. 8 double-precision floats (which would be 512 bits wide). SIMD.jl and LLVM are very helpfully papering over differences between the code and what is actually supported by the hardware.

  8. Given the definition of a peak, the first possible peak location is at index 2, so the first bit corresponds to that index.

  9. bitreverse reverses the order of a bitset, e.g. 0b01110100 becomes 0b00101110. By reversing the bits and mask, we can use the runs_lsb_mask function to achieve the desired behavior:

    bitreverse(
      runs_lsb_mask(
        bitreverse(bits),
        bitreverse(mask)
      )
    )

  10. The bitreversing took all of 7% of the SIMD function runtime!!

  11. I also realized that the lack of this operation (a reverse-carry adder) was the only major function/algorithmic gap blocking support for larger window widths.

  12. Otherwise there’s not much point. The perfect™ abstraction doesn’t matter if it’s slow or inconvenient.

  13. Unfortunately this does require accessing/using BitVector internals.