From 22697419bcb1bbb6ba7b1b84f2d80580d5b3278e Mon Sep 17 00:00:00 2001
From: Al Johri <al.johri@gmail.com>
Date: Thu, 18 Jun 2026 11:05:43 -0400
Subject: [PATCH] Add BitPacker16x: AVX-512 (16-lane / 512-int block) flavor

Adds a fourth bitpacking flavor that leverages AVX-512, mirroring the
existing BitPacker8x (AVX2) flavor:

- New `bitpacker16x` feature (enabled by default) exposing a `BitPacker16x`
  public type.
- AVX-512 implementation plus a scalar fallback; the instruction set is
  detected at runtime and falls back to scalar when avx512f is unavailable,
  so the produced format is identical regardless of CPU.
- The cross-lane helpers (compute_delta / integrate_delta / or_collapse) use
  whole-register `valignd` (`_mm512_alignr_epi32`).
- The avx512 helpers carry `#[target_feature(enable = "avx512f")]` so they
  inline into the feature-gated pack/unpack/num_bits. Without it `num_bits`
  spills the 512-bit accumulator to the stack and calls out-of-line, which
  measured ~6x slower and (because compress recomputes num_bits per block)
  looked like a 2.5x compress regression.
- Block size is 512 integers (32 registers x 16 lanes).

Also wired into lib.rs docs, the criterion benchmark, the README, and the
CHANGELOG; crate version bumped to 0.10.0.

Correctness is validated against the scalar reference via the crate's existing
`test_compatible` cross-check (byte-for-byte), run under Intel SDE since AVX-512
hardware was not available locally. Throughput verified on AVX-512 hardware
(AWS r8a / AMD Zen5 and r7i / Intel Sapphire Rapids): per-int parity-or-better
than BitPacker8x on num_bits/compress/decompress, at 2x lanes per instruction.
---
 CHANGELOG.md            |   7 +
 Cargo.toml              |   5 +-
 README.md               |  35 ++-
 src/bitpacker16x.rs     | 529 ++++++++++++++++++++++++++++++++++++++++
 src/bitpacking_bench.rs |   3 +-
 src/lib.rs              |  12 +-
 6 files changed, 580 insertions(+), 11 deletions(-)
 create mode 100644 src/bitpacker16x.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b25d3f..5e76132 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+# bitpacking 0.10.0
+
+- Add `BitPacker16x`, a 512-bit / 16-lane bitpacking flavor that leverages
+  `AVX-512` instructions, with a scalar fallback. Like the other flavors it
+  detects the instruction set at runtime and falls back to scalar when AVX-512
+  is unavailable. Enabled by default via the new `bitpacker16x` feature.
+
 # bitpacking 0.9.3
 
 - Performance improvements in BitPacker4x for ARM
diff --git a/Cargo.toml b/Cargo.toml
index f49403d..3c05c7f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "bitpacking"
-version = "0.9.3"
+version = "0.10.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 readme = "README.md"
@@ -22,7 +22,8 @@ proptest = "1.4"
 bitpacker1x = []
 bitpacker4x = []
 bitpacker8x = []
-default = ["bitpacker1x", "bitpacker4x", "bitpacker8x"]
+bitpacker16x = []
+default = ["bitpacker1x", "bitpacker4x", "bitpacker8x", "bitpacker16x"]
 
 [[bench]]
 name = "bitpacking_bench"
diff --git a/README.md b/README.md
index ff2521a..69b9a7c 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ It makes it possible to compress/decompress :
 Just add to your `Cargo.toml` :
 
 ```toml
-bitpacking = "0.9"
+bitpacking = "0.10"
 ```
 
 For some bitpacking flavor and for some platform, the bitpacking crate
@@ -63,13 +63,13 @@ For instance, assuming a block of `4`, when encoding `4, 9, 3, 2`. Assuming that
 
 As a result, each integer of this block will only require 4 bits.
 
-## Choosing between BitPacker1x, BitPacker4x and BitPacker8x.
+## Choosing between BitPacker1x, BitPacker4x, BitPacker8x and BitPacker16x.
 
-:warning: `BitPacker1x`, `BitPacker4x`, and `BitPacker8x` produce different formats,
+:warning: `BitPacker1x`, `BitPacker4x`, `BitPacker8x`, and `BitPacker16x` produce different formats,
 and are incompatible one with another.
 
-`BitPacker4x` and `BitPacker8x` are designed specifically to leverage `SSE3` and `AVX2`
-instructions respectively.
+`BitPacker4x`, `BitPacker8x`, and `BitPacker16x` are designed specifically to leverage `SSE3`, `AVX2`
+and `AVX-512` instructions respectively.
 
 It will safely fall back at runtime to a scalar implementation of these format if these instruction sets are not available on the running CPU.
 
@@ -93,6 +93,12 @@ One block must contain `128 integers`.
 to leverage `AVX2` instructions to encode and decode the stream.
 One block must contain `256 integers`.
 
+#### BitPacker16x
+
+`BitPacker16x` bits ordering works in layers of 16 integers. This gives an opportunity
+to leverage `AVX-512` instructions to encode and decode the stream.
+One block must contain `512 integers`.
+
 
 
 ## Compressing small integers
@@ -174,6 +180,25 @@ cargo bench
 | decompress       | 6.5 billions int/s |
 | decompress_delta | 5.6 billions int/s |
 
+## BitPacker16x (assuming AVX-512 instructions are available)
+
+The laptop above has no AVX-512, so these were measured separately on an AWS
+`r8a.xlarge` (AMD EPYC 9R45, Zen 5), one thread, 24-bit values. `BitPacker8x` on
+the **same machine** is listed alongside for a fair comparison (the tables above
+are a different CPU and are not directly comparable to these).
+
+| operation        | BitPacker8x (AVX2)   | BitPacker16x (AVX-512) |
+|:-----------------|:---------------------|:-----------------------|
+| compress         | 18.0 billions int/s  | 17.2 billions int/s    |
+| compress_delta   | 15.4 billions int/s  | 15.7 billions int/s    |
+| decompress       | 36.9 billions int/s  | 41.3 billions int/s    |
+| decompress_delta | 180 millions int/s   | 11.7 billions int/s    |
+
+`BitPacker16x` matches `BitPacker8x` on the non-delta paths and is ~12% faster on
+`decompress`, while packing 512 integers per block. The large `decompress_delta`
+gap reflects a slow `BitPacker8x` (AVX2) delta-integration path rather than a 16x
+speedup per se.
+
 
 ## Reference
 
diff --git a/src/bitpacker16x.rs b/src/bitpacker16x.rs
new file mode 100644
index 0000000..2302e4c
--- /dev/null
+++ b/src/bitpacker16x.rs
@@ -0,0 +1,529 @@
+use super::{BitPacker, UnsafeBitPacker};
+
+#[cfg(target_arch = "x86_64")]
+use crate::Available;
+
+const BLOCK_LEN: usize = 32 * 16;
+
+#[cfg(target_arch = "x86_64")]
+mod avx512 {
+
+    use super::BLOCK_LEN;
+    use crate::Available;
+
+    use std::arch::x86_64::__m512i as DataType;
+    use std::arch::x86_64::_mm512_and_si512 as op_and;
+    use std::arch::x86_64::_mm512_or_si512 as op_or;
+    use std::arch::x86_64::_mm512_set1_epi32 as set1;
+
+    use std::arch::x86_64::{
+        _mm512_add_epi32, _mm512_alignr_epi32, _mm512_castsi512_si128, _mm512_loadu_si512,
+        _mm512_permutexvar_epi32, _mm512_setzero_si512, _mm512_sll_epi32, _mm512_srl_epi32,
+        _mm512_storeu_si512, _mm512_sub_epi32, _mm_cvtsi128_si32, _mm_cvtsi32_si128,
+    };
+
+    // Variable-count shift: the immediate `_mm512_slli_epi32` wants `const u32`
+    // but the macro passes `const i32`; with `N` constant this folds to an immediate.
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn left_shift_32<const N: i32>(el: DataType) -> DataType {
+        _mm512_sll_epi32(el, _mm_cvtsi32_si128(N))
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn right_shift_32<const N: i32>(el: DataType) -> DataType {
+        _mm512_srl_epi32(el, _mm_cvtsi32_si128(N))
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn load_unaligned(addr: *const DataType) -> DataType {
+        _mm512_loadu_si512(addr.cast())
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn store_unaligned(addr: *mut DataType, data: DataType) {
+        _mm512_storeu_si512(addr.cast(), data);
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn or_collapse_to_u32(accumulator: DataType) -> u32 {
+        // OR-rotate the whole register four times so lane 0 holds all 16 lanes.
+        let mut v = accumulator;
+        v = op_or(v, _mm512_alignr_epi32::<8>(v, v));
+        v = op_or(v, _mm512_alignr_epi32::<4>(v, v));
+        v = op_or(v, _mm512_alignr_epi32::<2>(v, v));
+        v = op_or(v, _mm512_alignr_epi32::<1>(v, v));
+        _mm_cvtsi128_si32(_mm512_castsi512_si128(v)) as u32
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn compute_delta(curr: DataType, prev: DataType) -> DataType {
+        // builds {prev[15], curr[0], .., curr[14]}
+        let prev_shifted = _mm512_alignr_epi32::<15>(curr, prev);
+        _mm512_sub_epi32(curr, prev_shifted)
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn integrate_delta(prev: DataType, delta: DataType) -> DataType {
+        // Prefix-sum the lanes, then add the running offset prev[15].
+        // `alignr(x, zero, 16 - s)` shifts lanes right by `s` (so 15/14/12/8 = 1/2/4/8).
+        let zero = _mm512_setzero_si512();
+        let offset = _mm512_permutexvar_epi32(set1(15), prev);
+        let mut x = delta;
+        x = _mm512_add_epi32(x, _mm512_alignr_epi32::<15>(x, zero));
+        x = _mm512_add_epi32(x, _mm512_alignr_epi32::<14>(x, zero));
+        x = _mm512_add_epi32(x, _mm512_alignr_epi32::<12>(x, zero));
+        x = _mm512_add_epi32(x, _mm512_alignr_epi32::<8>(x, zero));
+        _mm512_add_epi32(x, offset)
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn add(left: DataType, right: DataType) -> DataType {
+        _mm512_add_epi32(left, right)
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn sub(left: DataType, right: DataType) -> DataType {
+        _mm512_sub_epi32(left, right)
+    }
+
+    declare_bitpacker!(target_feature(enable = "avx512f"));
+
+    impl Available for UnsafeBitPackerImpl {
+        fn available() -> bool {
+            is_x86_feature_detected!("avx512f")
+        }
+    }
+}
+
+mod scalar {
+
+    use super::BLOCK_LEN;
+    use crate::Available;
+    use std::ptr;
+
+    type DataType = [u32; 16];
+
+    fn set1(el: i32) -> DataType {
+        [el as u32; 16]
+    }
+
+    fn right_shift_32<const N: i32>(el: DataType) -> DataType {
+        [
+            el[0] >> N,
+            el[1] >> N,
+            el[2] >> N,
+            el[3] >> N,
+            el[4] >> N,
+            el[5] >> N,
+            el[6] >> N,
+            el[7] >> N,
+            el[8] >> N,
+            el[9] >> N,
+            el[10] >> N,
+            el[11] >> N,
+            el[12] >> N,
+            el[13] >> N,
+            el[14] >> N,
+            el[15] >> N,
+        ]
+    }
+
+    fn left_shift_32<const N: i32>(el: DataType) -> DataType {
+        [
+            el[0] << N,
+            el[1] << N,
+            el[2] << N,
+            el[3] << N,
+            el[4] << N,
+            el[5] << N,
+            el[6] << N,
+            el[7] << N,
+            el[8] << N,
+            el[9] << N,
+            el[10] << N,
+            el[11] << N,
+            el[12] << N,
+            el[13] << N,
+            el[14] << N,
+            el[15] << N,
+        ]
+    }
+
+    fn op_or(left: DataType, right: DataType) -> DataType {
+        [
+            left[0] | right[0],
+            left[1] | right[1],
+            left[2] | right[2],
+            left[3] | right[3],
+            left[4] | right[4],
+            left[5] | right[5],
+            left[6] | right[6],
+            left[7] | right[7],
+            left[8] | right[8],
+            left[9] | right[9],
+            left[10] | right[10],
+            left[11] | right[11],
+            left[12] | right[12],
+            left[13] | right[13],
+            left[14] | right[14],
+            left[15] | right[15],
+        ]
+    }
+
+    fn op_and(left: DataType, right: DataType) -> DataType {
+        [
+            left[0] & right[0],
+            left[1] & right[1],
+            left[2] & right[2],
+            left[3] & right[3],
+            left[4] & right[4],
+            left[5] & right[5],
+            left[6] & right[6],
+            left[7] & right[7],
+            left[8] & right[8],
+            left[9] & right[9],
+            left[10] & right[10],
+            left[11] & right[11],
+            left[12] & right[12],
+            left[13] & right[13],
+            left[14] & right[14],
+            left[15] & right[15],
+        ]
+    }
+
+    unsafe fn load_unaligned(addr: *const DataType) -> DataType {
+        ptr::read_unaligned(addr)
+    }
+
+    unsafe fn store_unaligned(dst: *mut DataType, data: DataType) {
+        ptr::write_unaligned(dst, data);
+    }
+
+    fn or_collapse_to_u32(accumulator: DataType) -> u32 {
+        (((accumulator[0] | accumulator[1]) | (accumulator[2] | accumulator[3]))
+            | ((accumulator[4] | accumulator[5]) | (accumulator[6] | accumulator[7])))
+            | (((accumulator[8] | accumulator[9]) | (accumulator[10] | accumulator[11]))
+                | ((accumulator[12] | accumulator[13]) | (accumulator[14] | accumulator[15])))
+    }
+
+    fn compute_delta(curr: DataType, prev: DataType) -> DataType {
+        [
+            curr[0].wrapping_sub(prev[15]),
+            curr[1].wrapping_sub(curr[0]),
+            curr[2].wrapping_sub(curr[1]),
+            curr[3].wrapping_sub(curr[2]),
+            curr[4].wrapping_sub(curr[3]),
+            curr[5].wrapping_sub(curr[4]),
+            curr[6].wrapping_sub(curr[5]),
+            curr[7].wrapping_sub(curr[6]),
+            curr[8].wrapping_sub(curr[7]),
+            curr[9].wrapping_sub(curr[8]),
+            curr[10].wrapping_sub(curr[9]),
+            curr[11].wrapping_sub(curr[10]),
+            curr[12].wrapping_sub(curr[11]),
+            curr[13].wrapping_sub(curr[12]),
+            curr[14].wrapping_sub(curr[13]),
+            curr[15].wrapping_sub(curr[14]),
+        ]
+    }
+
+    fn integrate_delta(offset: DataType, delta: DataType) -> DataType {
+        let el0 = offset[15].wrapping_add(delta[0]);
+        let el1 = el0.wrapping_add(delta[1]);
+        let el2 = el1.wrapping_add(delta[2]);
+        let el3 = el2.wrapping_add(delta[3]);
+        let el4 = el3.wrapping_add(delta[4]);
+        let el5 = el4.wrapping_add(delta[5]);
+        let el6 = el5.wrapping_add(delta[6]);
+        let el7 = el6.wrapping_add(delta[7]);
+        let el8 = el7.wrapping_add(delta[8]);
+        let el9 = el8.wrapping_add(delta[9]);
+        let el10 = el9.wrapping_add(delta[10]);
+        let el11 = el10.wrapping_add(delta[11]);
+        let el12 = el11.wrapping_add(delta[12]);
+        let el13 = el12.wrapping_add(delta[13]);
+        let el14 = el13.wrapping_add(delta[14]);
+        let el15 = el14.wrapping_add(delta[15]);
+        [
+            el0, el1, el2, el3, el4, el5, el6, el7, el8, el9, el10, el11, el12, el13, el14, el15,
+        ]
+    }
+
+    fn add(left: DataType, right: DataType) -> DataType {
+        [
+            left[0].wrapping_add(right[0]),
+            left[1].wrapping_add(right[1]),
+            left[2].wrapping_add(right[2]),
+            left[3].wrapping_add(right[3]),
+            left[4].wrapping_add(right[4]),
+            left[5].wrapping_add(right[5]),
+            left[6].wrapping_add(right[6]),
+            left[7].wrapping_add(right[7]),
+            left[8].wrapping_add(right[8]),
+            left[9].wrapping_add(right[9]),
+            left[10].wrapping_add(right[10]),
+            left[11].wrapping_add(right[11]),
+            left[12].wrapping_add(right[12]),
+            left[13].wrapping_add(right[13]),
+            left[14].wrapping_add(right[14]),
+            left[15].wrapping_add(right[15]),
+        ]
+    }
+
+    fn sub(left: DataType, right: DataType) -> DataType {
+        [
+            left[0].wrapping_sub(right[0]),
+            left[1].wrapping_sub(right[1]),
+            left[2].wrapping_sub(right[2]),
+            left[3].wrapping_sub(right[3]),
+            left[4].wrapping_sub(right[4]),
+            left[5].wrapping_sub(right[5]),
+            left[6].wrapping_sub(right[6]),
+            left[7].wrapping_sub(right[7]),
+            left[8].wrapping_sub(right[8]),
+            left[9].wrapping_sub(right[9]),
+            left[10].wrapping_sub(right[10]),
+            left[11].wrapping_sub(right[11]),
+            left[12].wrapping_sub(right[12]),
+            left[13].wrapping_sub(right[13]),
+            left[14].wrapping_sub(right[14]),
+            left[15].wrapping_sub(right[15]),
+        ]
+    }
+
+    // The `cfg(any(debug, not(debug)))` is here to put an attribute that has no effect.
+    //
+    // For other bitpacker, we enable specific CPU instruction set, but for the
+    // scalar bitpacker none is required.
+    declare_bitpacker!(cfg(any(debug, not(debug))));
+
+    impl Available for UnsafeBitPackerImpl {
+        fn available() -> bool {
+            true
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+enum InstructionSet {
+    #[cfg(target_arch = "x86_64")]
+    AVX512,
+    Scalar,
+}
+
+/// `BitPacker16x` packs integers in groups of 16. This gives an opportunity
+/// to leverage `AVX-512` instructions to encode and decode the stream.
+/// One block must contain `512 integers`.
+#[derive(Clone, Copy)]
+pub struct BitPacker16x(InstructionSet);
+
+impl BitPacker for BitPacker16x {
+    const BLOCK_LEN: usize = BLOCK_LEN;
+
+    fn new() -> Self {
+        #[cfg(target_arch = "x86_64")]
+        {
+            if avx512::UnsafeBitPackerImpl::available() {
+                return BitPacker16x(InstructionSet::AVX512);
+            }
+        }
+        BitPacker16x(InstructionSet::Scalar)
+    }
+
+    fn compress(&self, decompressed: &[u32], compressed: &mut [u8], num_bits: u8) -> usize {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => {
+                    avx512::UnsafeBitPackerImpl::compress(decompressed, compressed, num_bits)
+                }
+                InstructionSet::Scalar => {
+                    scalar::UnsafeBitPackerImpl::compress(decompressed, compressed, num_bits)
+                }
+            }
+        }
+    }
+
+    fn compress_sorted(
+        &self,
+        initial: u32,
+        decompressed: &[u32],
+        compressed: &mut [u8],
+        num_bits: u8,
+    ) -> usize {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => avx512::UnsafeBitPackerImpl::compress_sorted(
+                    initial,
+                    decompressed,
+                    compressed,
+                    num_bits,
+                ),
+                InstructionSet::Scalar => scalar::UnsafeBitPackerImpl::compress_sorted(
+                    initial,
+                    decompressed,
+                    compressed,
+                    num_bits,
+                ),
+            }
+        }
+    }
+
+    fn compress_strictly_sorted(
+        &self,
+        initial: Option<u32>,
+        decompressed: &[u32],
+        compressed: &mut [u8],
+        num_bits: u8,
+    ) -> usize {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => avx512::UnsafeBitPackerImpl::compress_strictly_sorted(
+                    initial,
+                    decompressed,
+                    compressed,
+                    num_bits,
+                ),
+                InstructionSet::Scalar => scalar::UnsafeBitPackerImpl::compress_strictly_sorted(
+                    initial,
+                    decompressed,
+                    compressed,
+                    num_bits,
+                ),
+            }
+        }
+    }
+
+    fn decompress(&self, compressed: &[u8], decompressed: &mut [u32], num_bits: u8) -> usize {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => {
+                    avx512::UnsafeBitPackerImpl::decompress(compressed, decompressed, num_bits)
+                }
+                InstructionSet::Scalar => {
+                    scalar::UnsafeBitPackerImpl::decompress(compressed, decompressed, num_bits)
+                }
+            }
+        }
+    }
+
+    fn decompress_sorted(
+        &self,
+        initial: u32,
+        compressed: &[u8],
+        decompressed: &mut [u32],
+        num_bits: u8,
+    ) -> usize {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => avx512::UnsafeBitPackerImpl::decompress_sorted(
+                    initial,
+                    compressed,
+                    decompressed,
+                    num_bits,
+                ),
+                InstructionSet::Scalar => scalar::UnsafeBitPackerImpl::decompress_sorted(
+                    initial,
+                    compressed,
+                    decompressed,
+                    num_bits,
+                ),
+            }
+        }
+    }
+
+    fn decompress_strictly_sorted(
+        &self,
+        initial: Option<u32>,
+        compressed: &[u8],
+        decompressed: &mut [u32],
+        num_bits: u8,
+    ) -> usize {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => avx512::UnsafeBitPackerImpl::decompress_strictly_sorted(
+                    initial,
+                    compressed,
+                    decompressed,
+                    num_bits,
+                ),
+                InstructionSet::Scalar => scalar::UnsafeBitPackerImpl::decompress_strictly_sorted(
+                    initial,
+                    compressed,
+                    decompressed,
+                    num_bits,
+                ),
+            }
+        }
+    }
+
+    fn num_bits(&self, decompressed: &[u32]) -> u8 {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => avx512::UnsafeBitPackerImpl::num_bits(decompressed),
+                InstructionSet::Scalar => scalar::UnsafeBitPackerImpl::num_bits(decompressed),
+            }
+        }
+    }
+
+    fn num_bits_sorted(&self, initial: u32, decompressed: &[u32]) -> u8 {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => {
+                    avx512::UnsafeBitPackerImpl::num_bits_sorted(initial, decompressed)
+                }
+                InstructionSet::Scalar => {
+                    scalar::UnsafeBitPackerImpl::num_bits_sorted(initial, decompressed)
+                }
+            }
+        }
+    }
+
+    fn num_bits_strictly_sorted(&self, initial: Option<u32>, decompressed: &[u32]) -> u8 {
+        unsafe {
+            match self.0 {
+                #[cfg(target_arch = "x86_64")]
+                InstructionSet::AVX512 => {
+                    avx512::UnsafeBitPackerImpl::num_bits_strictly_sorted(initial, decompressed)
+                }
+                InstructionSet::Scalar => {
+                    scalar::UnsafeBitPackerImpl::num_bits_strictly_sorted(initial, decompressed)
+                }
+            }
+        }
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[cfg(test)]
+mod tests {
+    use super::BLOCK_LEN;
+    use super::{avx512, scalar};
+    use crate::tests::test_util_compatible;
+    use crate::Available;
+
+    #[test]
+    fn test_compatible() {
+        if avx512::UnsafeBitPackerImpl::available() {
+            test_util_compatible::<scalar::UnsafeBitPackerImpl, avx512::UnsafeBitPackerImpl>(
+                BLOCK_LEN,
+            );
+        }
+    }
+}
diff --git a/src/bitpacking_bench.rs b/src/bitpacking_bench.rs
index eb09c61..bf0eeb0 100644
--- a/src/bitpacking_bench.rs
+++ b/src/bitpacking_bench.rs
@@ -1,7 +1,7 @@
 use criterion::{criterion_group, criterion_main, Bencher, Criterion};
 use std::time::Duration;
 
-use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
+use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x, BitPacker16x};
 use criterion::Benchmark;
 use criterion::Throughput;
 
@@ -322,6 +322,7 @@ fn criterion_benchmark(criterion: &mut Criterion) {
     criterion_benchmark_bitpacker("BitPacker1x", BitPacker1x::new(), criterion);
     criterion_benchmark_bitpacker("BitPacker4x", BitPacker4x::new(), criterion);
     criterion_benchmark_bitpacker("BitPacker8x", BitPacker8x::new(), criterion);
+    criterion_benchmark_bitpacker("BitPacker16x", BitPacker16x::new(), criterion);
 }
 
 criterion_group! {
diff --git a/src/lib.rs b/src/lib.rs
index 4f6425d..5906120 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,13 +7,13 @@
 /*!  # Fast Bitpacking algorithms
 
 This crate is a **Rust port of [Daniel Lemire's simdcomp C library](https://github.com/lemire/simdcomp)**.
-It contains different flavor of integers compression via bitpacking :  `BitPacker1x`, `BitPacker4x`, and `BitPacker8x`.
+It contains different flavor of integers compression via bitpacking :  `BitPacker1x`, `BitPacker4x`, `BitPacker8x`, and `BitPacker16x`.
 
 Each produces different formats, and are incompatible one with another,
 and requires integers to be encoded in block of different size..
 
-`BitPacker4x` and `BitPacker8x` are designed specifically to leverage `SSE3`
-and `AVX2` instructions respectively.
+`BitPacker4x`, `BitPacker8x`, and `BitPacker16x` are designed specifically to leverage `SSE3`,
+`AVX2`, and `AVX-512` instructions respectively.
 
 The library will fall back to a scalar implementation if these instruction
 sets are not available. For instance :
@@ -362,6 +362,9 @@ mod bitpacker4x_simple;
 #[cfg(feature = "bitpacker8x")]
 mod bitpacker8x;
 
+#[cfg(feature = "bitpacker16x")]
+mod bitpacker16x;
+
 #[cfg(all(feature = "bitpacker1x", not(debug_assertions)))]
 pub use bitpacker1x::BitPacker1x;
 #[cfg(all(feature = "bitpacker1x", debug_assertions))]
@@ -375,6 +378,9 @@ pub use bitpacker4x_simple::BitPacker4x;
 #[cfg(feature = "bitpacker8x")]
 pub use bitpacker8x::BitPacker8x;
 
+#[cfg(feature = "bitpacker16x")]
+pub use bitpacker16x::BitPacker16x;
+
 #[cfg(test)]
 mod tests_unit {
     use super::*;