From 159aa0762cc24e395df69d93d5ce954994f4ba9a Mon Sep 17 00:00:00 2001
From: Brian Smith
Date: Sun, 16 Feb 2025 11:34:47 -0800
Subject: [PATCH] aes-gcm: Enable AVX-512 implementation.

---
 build.rs                             |  6 ++
 .../aes/asm/aes-gcm-avx10-x86_64.pl  | 83 +----------------
 src/aead/aes_gcm.rs                  | 23 +++++
 src/aead/aes_gcm/vaesclmulavx512.rs  | 93 +++++++++++++++++++
 src/aead/gcm.rs                      | 10 ++
 src/aead/gcm/vclmulavx2.rs           |  1 +
 src/aead/gcm/vclmulavx512.rs         | 49 ++++++++++
 src/cpu.rs                           | 14 +++
 src/cpu/intel.rs                     | 39 ++++++++
 9 files changed, 236 insertions(+), 82 deletions(-)
 create mode 100644 src/aead/aes_gcm/vaesclmulavx512.rs
 create mode 100644 src/aead/gcm/vclmulavx512.rs

diff --git a/build.rs b/build.rs
index 9843ad8aa5..2296c0c3e4 100644
--- a/build.rs
+++ b/build.rs
@@ -76,6 +76,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
     (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
+    (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
@@ -888,8 +889,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "LIMB_shr",
         "OPENSSL_cpuid_setup",
         "aes_gcm_dec_kernel",
+        "aes_gcm_dec_update_vaes_avx10_512",
         "aes_gcm_dec_update_vaes_avx2",
         "aes_gcm_enc_kernel",
+        "aes_gcm_enc_update_vaes_avx10_512",
         "aes_gcm_enc_update_vaes_avx2",
         "aes_hw_ctr32_encrypt_blocks",
         "aes_hw_set_encrypt_key",
@@ -949,12 +952,15 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "gcm_ghash_avx",
         "gcm_ghash_clmul",
         "gcm_ghash_neon",
+        "gcm_ghash_vpclmulqdq_avx10_512_1",
         "gcm_ghash_vpclmulqdq_avx2_1",
         "gcm_gmult_clmul",
         "gcm_gmult_neon",
+        "gcm_gmult_vpclmulqdq_avx2",
         "gcm_init_avx",
         "gcm_init_clmul",
         "gcm_init_neon",
+        "gcm_init_vpclmulqdq_avx10_512",
         "gcm_init_vpclmulqdq_avx2",
         "k25519Precomp",
         "limbs_mul_add_limb",
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
index 8a099d3c30..8b0ff89207 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -737,59 +737,6 @@ sub _ghash_update {
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
 
-    # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_blockbyblock$local_label_suffix
-
-    # AADLEN >= VL, so we'll operate on full vectors. Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
-
-    cmp             \$4*$VL-1, $AADLEN
-    jbe             .Laad_loop_1x$local_label_suffix
-
-    # AADLEN >= 4*VL. Load the higher key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
-    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
-    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
-
-    # Update GHASH with 4*VL bytes of AAD at a time.
-.Laad_loop_4x$local_label_suffix:
-    vmovdqu8        0*$VL($AAD), $GHASHDATA0
-    vmovdqu8        1*$VL($AAD), $GHASHDATA1
-    vmovdqu8        2*$VL($AAD), $GHASHDATA2
-    vmovdqu8        3*$VL($AAD), $GHASHDATA3
-    @{[ _ghash_4x ]}
-    sub             \$-4*$VL, $AAD  # shorter than 'add 4*VL' when VL=32
-    add             \$-4*$VL, $AADLEN
-    cmp             \$4*$VL-1, $AADLEN
-    ja              .Laad_loop_4x$local_label_suffix
-
-    # Update GHASH with VL bytes of AAD at a time.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_large_done$local_label_suffix
-.Laad_loop_1x$local_label_suffix:
-    vmovdqu8        ($AAD), $GHASHDATA0
-    vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                    $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$$VL, $AAD
-    sub             \$$VL, $AADLEN
-    cmp             \$$VL, $AADLEN
-    jae             .Laad_loop_1x$local_label_suffix
-
-.Laad_large_done$local_label_suffix:
-    # Issue the vzeroupper that is needed after using ymm or zmm registers.
-    # Do it here instead of at the end, to minimize overhead for small AADLEN.
-    vzeroupper
-
     # GHASH the remaining data 16 bytes at a time, using xmm registers only.
 .Laad_blockbyblock$local_label_suffix:
     test            $AADLEN, $AADLEN
@@ -801,9 +748,6 @@ sub _ghash_update {
     vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     @{[ _ghash_mul  $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
                     $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$16, $AAD
-    sub             \$16, $AADLEN
-    jnz             .Laad_loop_blockbyblock$local_label_suffix
 
 .Laad_done$local_label_suffix:
     # Store the updated GHASH accumulator back to memory.
@@ -1303,31 +1247,6 @@ sub _aes_gcm_update {
     return $code;
 }
 
-# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
-{
-    my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
-    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
-        map( "%xmm$_", ( 0 .. 6 ) );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6) ]}
-    .seh_endprologue
-
-    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
-    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
-    vmovdqu         $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
-    vmovdqu         .Lgfpoly(%rip), $GFPOLY
-    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
-
-    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
-___
-}
-$code .= _end_func;
-
 # Disabled until significant deployment of AVX10/256 is seen. The separate
 # *_vaes_avx2 implementation provides the only 256-bit support for now.
 #
@@ -1353,7 +1272,7 @@ sub _aes_gcm_update {
 $code .= _aes_gcm_init;
 $code .= _end_func;
 
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512_1", 1;
 $code .= _ghash_update;
 $code .= _end_func;
 
diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs
index d9e08a3116..a842f32341 100644
--- a/src/aead/aes_gcm.rs
+++ b/src/aead/aes_gcm.rs
@@ -36,6 +36,7 @@ use cpu::GetFeature as _;
 mod aarch64;
 mod aeshwclmulmovbe;
 mod vaesclmulavx2;
+mod vaesclmulavx512;
 
 #[derive(Clone)]
 pub(super) struct Key(DynKey);
@@ -51,6 +52,9 @@ impl Key {
 
 #[derive(Clone)]
 enum DynKey {
+    #[cfg(target_arch = "x86_64")]
+    VAesClMulAvx512(Combo),
+
     #[cfg(target_arch = "x86_64")]
     VAesClMulAvx2(Combo),
 
@@ -86,6 +90,9 @@ impl DynKey {
         let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?;
         let gcm_key_value = derive_gcm_key_value(&aes_key);
         let combo = if let Some(cpu) = cpu.get_feature() {
+            let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
+            Self::VAesClMulAvx512(Combo { aes_key, gcm_key })
+        } else if let Some(cpu) = cpu.get_feature() {
             let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
             Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
         } else if let Some(cpu) = cpu.get_feature() {
@@ -190,6 +197,11 @@ pub(super) fn seal(
             seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        DynKey::VAesClMulAvx512(c) => {
+            seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
        DynKey::VAesClMulAvx2(c) => seal_whole_partial(
             c,
@@ -317,6 +329,17 @@ pub(super) fn open(
             open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        DynKey::VAesClMulAvx512(c) => open_whole_partial(
+            c,
+            aad,
+            in_out_slice,
+            src,
+            ctr,
+            tag_iv,
+            vaesclmulavx512::open_whole,
+        ),
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => open_whole_partial(
             c,
diff --git a/src/aead/aes_gcm/vaesclmulavx512.rs b/src/aead/aes_gcm/vaesclmulavx512.rs
new file mode 100644
index 0000000000..4033223301
--- /dev/null
+++ b/src/aead/aes_gcm/vaesclmulavx512.rs
@@ -0,0 +1,93 @@
+// Copyright 2015-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+
+use super::{aes, gcm, Counter, BLOCK_LEN};
+use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut};
+use core::num::{NonZeroU32, NonZeroUsize};
+
+pub(super) fn seal_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context,
+    ctr: &mut Counter,
+    mut in_out: AsChunksMut,
+) {
+    prefixed_extern! {
+        fn aes_gcm_enc_update_vaes_avx10_512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::AES_KEY,
+            ivec: &Counter,
+            Htable: &gcm::HTable,
+            Xi: &mut gcm::Xi);
+    }
+
+    let in_out = in_out.as_flattened_mut();
+
+    // Precondition: Since we have a `gcm::Context` then the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    if let Some(len) = NonZeroUsize::new(in_out.len()) {
+        let aes_key = aes_key.inner_less_safe();
+        let (htable, xi) = auth.inner();
+        let input = in_out.as_ptr();
+        let output = in_out.as_mut_ptr();
+        unsafe { aes_gcm_enc_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi) };
+        let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+            unreachable!() // Due to previous checks.
+        });
+        ctr.increment_by_less_safe(blocks);
+    }
+}
+
+pub(super) fn open_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context,
+    in_out: Overlapping,
+    ctr: &mut Counter,
+) {
+    prefixed_extern! {
+        fn aes_gcm_dec_update_vaes_avx10_512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::AES_KEY,
+            ivec: &mut Counter,
+            Htable: &gcm::HTable,
+            Xi: &mut gcm::Xi);
+    }
+
+    // Precondition. TODO: Create an overlapping::AsChunks for this.
+    assert_eq!(in_out.len() % BLOCK_LEN, 0);
+    // Precondition: Since we have a `gcm::Context` then the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    in_out.with_input_output_len(|input, output, len| {
+        if let Some(len) = NonZeroUsize::new(len) {
+            let aes_key = aes_key.inner_less_safe();
+            let (htable, xi) = auth.inner();
+            unsafe {
+                aes_gcm_dec_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi)
+            };
+            let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+                unreachable!() // Due to previous checks.
+            });
+            ctr.increment_by_less_safe(blocks);
+        }
+    })
+}
diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs
index 443c19e16b..b95ff8c70e 100644
--- a/src/aead/gcm.rs
+++ b/src/aead/gcm.rs
@@ -39,6 +39,7 @@ pub(super) mod clmulavxmovbe;
 pub(super) mod fallback;
 pub(super) mod neon;
 pub(super) mod vclmulavx2;
+pub(super) mod vclmulavx512;
 
 pub(super) struct Context<'key, K> {
     Xi: Xi,
@@ -128,6 +129,15 @@ impl Context<'_, vclmulavx2::Key> {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
+impl Context<'_, vclmulavx512::Key> {
+    /// Access to `inner` for the integrated AES-GCM implementations only.
+    #[inline]
+    pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) {
+        (self.key.inner(), &mut self.Xi)
+    }
+}
+
 impl Context<'_, K> {
     #[inline(always)]
     pub fn update_blocks(&mut self, input: AsChunks) {
diff --git a/src/aead/gcm/vclmulavx2.rs b/src/aead/gcm/vclmulavx2.rs
index ebf4e76ad4..edb7aa9220 100644
--- a/src/aead/gcm/vclmulavx2.rs
+++ b/src/aead/gcm/vclmulavx2.rs
@@ -27,6 +27,7 @@ pub struct Key {
 }
 
 impl Key {
+    #[inline(never)]
     pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self {
         Self {
             h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) },
diff --git a/src/aead/gcm/vclmulavx512.rs b/src/aead/gcm/vclmulavx512.rs
new file mode 100644
index 0000000000..e0eb14907f
--- /dev/null
+++ b/src/aead/gcm/vclmulavx512.rs
@@ -0,0 +1,49 @@
+// Copyright 2018-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+
+use super::{ffi::KeyValue, HTable, UpdateBlock, Xi};
+use crate::{
+    aead::gcm::ffi::BLOCK_LEN,
+    cpu::intel::{Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul},
+    polyfill::slice::AsChunks,
+};
+
+#[derive(Clone)]
+pub struct Key {
+    h_table: HTable,
+}
+
+impl Key {
+    pub(in super::super) fn new(
+        value: KeyValue,
+        _cpu: (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul),
+    ) -> Self {
+        Self {
+            h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx10_512, value) },
+        }
+    }
+
+    pub(super) fn inner(&self) -> &HTable {
+        &self.h_table
+    }
+}
+
+impl UpdateBlock for Key {
+    fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
+        let input: AsChunks = (&a).into();
+        unsafe { ghash!(gcm_ghash_vpclmulqdq_avx10_512_1, xi, &self.h_table, input) }
+    }
+}
diff --git a/src/cpu.rs b/src/cpu.rs
index 293e1b5355..cc73e32fb6 100644
--- a/src/cpu.rs
+++ b/src/cpu.rs
@@ -113,6 +113,20 @@ where
     }
 }
 
+impl GetFeature<(A, B, C, D)> for features::Values
+where
+    features::Values: GetFeature<(A, B)>,
+    features::Values: GetFeature<(C, D)>,
+{
+    #[inline(always)]
+    fn get_feature(&self) -> Option<(A, B, C, D)> {
+        match (self.get_feature(), self.get_feature()) {
+            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
+            _ => None,
+        }
+    }
+}
+
 impl GetFeature for Features
 where
     features::Values: GetFeature,
diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs
index f45052fe7f..d479ecd365 100644
--- a/src/cpu/intel.rs
+++ b/src/cpu/intel.rs
@@ -139,6 +139,11 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 {
     #[cfg(target_arch = "x86_64")]
     let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]);
 
+    // `OPENSSL_cpuid_setup` synthesizes this bit when it detects an Intel
+    // CPU family that is known to downclock when ZMM registers are used.
+    #[cfg(target_arch = "x86_64")]
+    let avoid_zmm = check(cpuid[2], 14);
+
     let mut caps = 0;
 
     // AMD: "Collectively the SSE1, [...] are referred to as the legacy SSE
@@ -250,6 +255,35 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 {
         // calling into the C code.
         let flag = unsafe { &avx2_available };
         flag.store(1, core::sync::atomic::Ordering::Relaxed);
+
+        // AVX-512.
+        // Initial releases of macOS 12 had a serious bug w.r.t. AVX-512
+        // support; see https://go-review.googlesource.com/c/sys/+/620256.
+        // Given that, plus Apple's transition to ARM, AVX-512 isn't worth
+        // supporting for their targets.
+        #[cfg(not(target_vendor = "apple"))]
+        {
+            // Intel: "15.3 DETECTION OF 512-BIT INSTRUCTION GROUPS OF THE INTEL
+            // AVX-512 FAMILY".
+            // `OPENSSL_cpuid_setup` clears these bits when XCR0[7:5] isn't 0b111,
+            // i.e. when the OS doesn't support AVX-512 state.
+            let f = check(extended_features_ebx, 16);
+            let bw = check(extended_features_ebx, 30);
+
+            // Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS
+            // OPERATING AT 256 AND 128-BIT VECTOR LENGTHS"
+            let vl = check(extended_features_ebx, 31);
+
+            // Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS
+            // OPERATING AT 256 AND 128-BIT VECTOR LENGTHS."
+            if !avoid_zmm && f {
+                // Intel: "Table 15-2. Feature Flag Collection Required of
+                // 256/128 Bit Vector Lengths for Each Instruction Group."
+                if bw && vl {
+                    set(&mut caps, Shift::Avx512_BW_VL_ZMM)
+                }
+            }
+        }
     }
 
     // Intel: "12.13.4 Checking for Intel AES-NI Support"
@@ -340,6 +374,11 @@ impl_get_feature! {
     { ("x86", "x86_64") => Aes },
     { ("x86", "x86_64") => Avx },
     { ("x86_64") => Bmi1 },
+
+    // AVX512BW + AVX512VL + AND using ZMM registers isn't expected to cause
+    // downclocking.
+    { ("x86_64") => Avx512_BW_VL_ZMM },
+
    { ("x86_64") => Avx2 },
     { ("x86_64") => Bmi2 },
     { ("x86_64") => Adx },
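
Reviewer note on the src/cpu/intel.rs hunk: the new Avx512_BW_VL_ZMM capability is only set when CPUID leaf 7 (sub-leaf 0) EBX reports AVX512F, AVX512BW, and AVX512VL, when the synthesized "avoid ZMM" hint is clear, and (per the comment) only when `OPENSSL_cpuid_setup` hasn't already cleared the bits because XCR0 shows the OS doesn't enable AVX-512 state. Below is a minimal, self-contained sketch of that gate; the function name `wants_avx512_bw_vl_zmm` and its parameters are illustrative, not ring's API, and the bit positions are the ones cited from the Intel SDM in the patch.

// Sketch only: mirrors the gating logic of the intel.rs hunk above,
// assuming `ebx7` is CPUID.(EAX=7,ECX=0):EBX and `avoid_zmm` is the
// synthesized "this CPU downclocks when ZMM is used" hint.
fn check(word: u32, bit: u32) -> bool {
    word & (1 << bit) != 0
}

fn wants_avx512_bw_vl_zmm(ebx7: u32, avoid_zmm: bool) -> bool {
    let f = check(ebx7, 16); // AVX512F
    let bw = check(ebx7, 30); // AVX512BW
    let vl = check(ebx7, 31); // AVX512VL
    // Same shape as `if !avoid_zmm && f { if bw && vl { set(...) } }`.
    !avoid_zmm && f && bw && vl
}

fn main() {
    // A leaf-7 EBX value with AVX512F, AVX512BW, and AVX512VL all set.
    let ebx7 = (1u32 << 16) | (1 << 30) | (1 << 31);
    assert!(wants_avx512_bw_vl_zmm(ebx7, false));
    // Even with the bits present, a downclocking CPU keeps the 256-bit path.
    assert!(!wants_avx512_bw_vl_zmm(ebx7, true));
}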
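Reviewer note on the src/cpu.rs hunk: the added GetFeature<(A, B, C, D)> impl composes the existing two-feature impls so that DynKey::new() can demand all four features the AVX-512 path needs, (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul), with a single `cpu.get_feature()` call, falling back to the AVX2 arm when it returns None. The following standalone sketch illustrates that composition pattern under simplified assumptions: the `Values` bitset, the `feature!` macro, and the `Avx512BwVlZmm` name are stand-ins for ring's internals, not its real types.

// Sketch only: a simplified model of feature-tuple composition.
#[derive(Clone, Copy)]
struct Values(u32); // bitset of detected CPU features (hypothetical)

trait GetFeature<T> {
    fn get_feature(&self) -> Option<T>;
}

macro_rules! feature {
    ($name:ident, $bit:expr) => {
        #[derive(Clone, Copy, Debug)]
        struct $name;
        impl GetFeature<$name> for Values {
            fn get_feature(&self) -> Option<$name> {
                if self.0 & (1 << $bit) != 0 {
                    Some($name)
                } else {
                    None
                }
            }
        }
    };
}

feature!(Avx2, 0);
feature!(Avx512BwVlZmm, 1);
feature!(Bmi2, 2);
feature!(VAesClmul, 3);

// Two features at a time...
impl<A, B> GetFeature<(A, B)> for Values
where
    Values: GetFeature<A> + GetFeature<B>,
{
    fn get_feature(&self) -> Option<(A, B)> {
        match (self.get_feature(), self.get_feature()) {
            (Some(a), Some(b)) => Some((a, b)),
            _ => None,
        }
    }
}

// ...and four at a time, built from the two-tuple impl, mirroring the
// GetFeature<(A, B, C, D)> impl added in the cpu.rs hunk.
impl<A, B, C, D> GetFeature<(A, B, C, D)> for Values
where
    Values: GetFeature<(A, B)> + GetFeature<(C, D)>,
{
    fn get_feature(&self) -> Option<(A, B, C, D)> {
        match (self.get_feature(), self.get_feature()) {
            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
            _ => None,
        }
    }
}

fn main() {
    // All four features detected: the AVX-512 AES-GCM key can be selected.
    let cpu = Values(0b1111);
    let all: Option<(Avx2, Avx512BwVlZmm, Bmi2, VAesClmul)> = cpu.get_feature();
    assert!(all.is_some());

    // Any missing feature (here VAesClmul) makes the whole tuple None,
    // so key construction falls through to the next arm.
    let no_vaes = Values(0b0111);
    let all: Option<(Avx2, Avx512BwVlZmm, Bmi2, VAesClmul)> = no_vaes.get_feature();
    assert!(all.is_none());
}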