From 159aa0762cc24e395df69d93d5ce954994f4ba9a Mon Sep 17 00:00:00 2001
From: Brian Smith
Date: Sun, 16 Feb 2025 11:34:47 -0800
Subject: [PATCH] aes-gcm: Enable AVX-512 implementation.

---
 build.rs                             |  6 ++
 .../aes/asm/aes-gcm-avx10-x86_64.pl  | 83 +----------------
 src/aead/aes_gcm.rs                  | 23 +++++
 src/aead/aes_gcm/vaesclmulavx512.rs  | 93 +++++++++++++++++++
 src/aead/gcm.rs                      | 10 ++
 src/aead/gcm/vclmulavx2.rs           |  1 +
 src/aead/gcm/vclmulavx512.rs         | 49 ++++++++++
 src/cpu.rs                           | 14 +++
 src/cpu/intel.rs                     | 39 ++++++++
 9 files changed, 236 insertions(+), 82 deletions(-)
 create mode 100644 src/aead/aes_gcm/vaesclmulavx512.rs
 create mode 100644 src/aead/gcm/vclmulavx512.rs

diff --git a/build.rs b/build.rs
index 9843ad8aa5..2296c0c3e4 100644
--- a/build.rs
+++ b/build.rs
@@ -76,6 +76,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
     (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
+    (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
@@ -888,8 +889,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "LIMB_shr",
         "OPENSSL_cpuid_setup",
         "aes_gcm_dec_kernel",
+        "aes_gcm_dec_update_vaes_avx10_512",
         "aes_gcm_dec_update_vaes_avx2",
         "aes_gcm_enc_kernel",
+        "aes_gcm_enc_update_vaes_avx10_512",
         "aes_gcm_enc_update_vaes_avx2",
         "aes_hw_ctr32_encrypt_blocks",
         "aes_hw_set_encrypt_key",
@@ -949,12 +952,15 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "gcm_ghash_avx",
         "gcm_ghash_clmul",
         "gcm_ghash_neon",
+        "gcm_ghash_vpclmulqdq_avx10_512_1",
         "gcm_ghash_vpclmulqdq_avx2_1",
         "gcm_gmult_clmul",
         "gcm_gmult_neon",
+        "gcm_gmult_vpclmulqdq_avx2",
         "gcm_init_avx",
         "gcm_init_clmul",
         "gcm_init_neon",
+        "gcm_init_vpclmulqdq_avx10_512",
         "gcm_init_vpclmulqdq_avx2",
         "k25519Precomp",
         "limbs_mul_add_limb",
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
index 8a099d3c30..8b0ff89207 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -737,59 +737,6 @@ sub _ghash_update {
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
 
-    # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_blockbyblock$local_label_suffix
-
-    # AADLEN >= VL, so we'll operate on full vectors. Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
-
-    cmp             \$4*$VL-1, $AADLEN
-    jbe             .Laad_loop_1x$local_label_suffix
-
-    # AADLEN >= 4*VL. Load the higher key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
-    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
-    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
-
-    # Update GHASH with 4*VL bytes of AAD at a time.
-.Laad_loop_4x$local_label_suffix:
-    vmovdqu8        0*$VL($AAD), $GHASHDATA0
-    vmovdqu8        1*$VL($AAD), $GHASHDATA1
-    vmovdqu8        2*$VL($AAD), $GHASHDATA2
-    vmovdqu8        3*$VL($AAD), $GHASHDATA3
-    @{[ _ghash_4x ]}
-    sub             \$-4*$VL, $AAD  # shorter than 'add 4*VL' when VL=32
-    add             \$-4*$VL, $AADLEN
-    cmp             \$4*$VL-1, $AADLEN
-    ja              .Laad_loop_4x$local_label_suffix
-
-    # Update GHASH with VL bytes of AAD at a time.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_large_done$local_label_suffix
-.Laad_loop_1x$local_label_suffix:
-    vmovdqu8        ($AAD), $GHASHDATA0
-    vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                    $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$$VL, $AAD
-    sub             \$$VL, $AADLEN
-    cmp             \$$VL, $AADLEN
-    jae             .Laad_loop_1x$local_label_suffix
-
-.Laad_large_done$local_label_suffix:
-    # Issue the vzeroupper that is needed after using ymm or zmm registers.
-    # Do it here instead of at the end, to minimize overhead for small AADLEN.
-    vzeroupper
-
     # GHASH the remaining data 16 bytes at a time, using xmm registers only.
 .Laad_blockbyblock$local_label_suffix:
     test            $AADLEN, $AADLEN
@@ -801,9 +748,6 @@ sub _ghash_update {
     vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     @{[ _ghash_mul  $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
                     $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$16, $AAD
-    sub             \$16, $AADLEN
-    jnz             .Laad_loop_blockbyblock$local_label_suffix
 
 .Laad_done$local_label_suffix:
     # Store the updated GHASH accumulator back to memory.
@@ -1303,31 +1247,6 @@ sub _aes_gcm_update {
     return $code;
 }
 
-# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
-{
-    my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
-    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
-        map( "%xmm$_", ( 0 .. 6 ) );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6) ]}
-    .seh_endprologue
-
-    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
-    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
-    vmovdqu         $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
-    vmovdqu         .Lgfpoly(%rip), $GFPOLY
-    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
-
-    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
-___
-}
-$code .= _end_func;
-
 # Disabled until significant deployment of AVX10/256 is seen. The separate
 # *_vaes_avx2 implementation provides the only 256-bit support for now.
 #
@@ -1353,7 +1272,7 @@ sub _aes_gcm_update {
 $code .= _aes_gcm_init;
 $code .= _end_func;
 
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512_1", 1;
 $code .= _ghash_update;
 $code .= _end_func;
 
diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs
index d9e08a3116..a842f32341 100644
--- a/src/aead/aes_gcm.rs
+++ b/src/aead/aes_gcm.rs
@@ -36,6 +36,7 @@ use cpu::GetFeature as _;
 mod aarch64;
 mod aeshwclmulmovbe;
 mod vaesclmulavx2;
+mod vaesclmulavx512;
 
 #[derive(Clone)]
 pub(super) struct Key(DynKey);
@@ -51,6 +52,9 @@ impl Key {
 
 #[derive(Clone)]
 enum DynKey {
+    #[cfg(target_arch = "x86_64")]
+    VAesClMulAvx512(Combo),
+
     #[cfg(target_arch = "x86_64")]
     VAesClMulAvx2(Combo),
 
@@ -86,6 +90,9 @@ impl DynKey {
         let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?;
         let gcm_key_value = derive_gcm_key_value(&aes_key);
         let combo = if let Some(cpu) = cpu.get_feature() {
+            let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
+            Self::VAesClMulAvx512(Combo { aes_key, gcm_key })
+        } else if let Some(cpu) = cpu.get_feature() {
             let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
             Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
         } else if let Some(cpu) = cpu.get_feature() {
@@ -190,6 +197,11 @@ pub(super) fn seal(
             seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        DynKey::VAesClMulAvx512(c) => {
+            seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
        DynKey::VAesClMulAvx2(c) => seal_whole_partial(
             c,
@@ -317,6 +329,17 @@ pub(super) fn open(
             open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        DynKey::VAesClMulAvx512(c) => open_whole_partial(
+            c,
+            aad,
+            in_out_slice,
+            src,
+            ctr,
+            tag_iv,
+            vaesclmulavx512::open_whole,
+        ),
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => open_whole_partial(
             c,
diff --git a/src/aead/aes_gcm/vaesclmulavx512.rs b/src/aead/aes_gcm/vaesclmulavx512.rs
new file mode 100644
index 0000000000..4033223301
--- /dev/null
+++ b/src/aead/aes_gcm/vaesclmulavx512.rs
@@ -0,0 +1,93 @@
+// Copyright 2015-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+
+use super::{aes, gcm, Counter, BLOCK_LEN};
+use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut};
+use core::num::{NonZeroU32, NonZeroUsize};
+
+pub(super) fn seal_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context,
+    ctr: &mut Counter,
+    mut in_out: AsChunksMut,
+) {
+    prefixed_extern! {
+        fn aes_gcm_enc_update_vaes_avx10_512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::AES_KEY,
+            ivec: &Counter,
+            Htable: &gcm::HTable,
+            Xi: &mut gcm::Xi);
+    }
+
+    let in_out = in_out.as_flattened_mut();
+
+    // Precondition: Since we have a `gcm::Context` then the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    if let Some(len) = NonZeroUsize::new(in_out.len()) {
+        let aes_key = aes_key.inner_less_safe();
+        let (htable, xi) = auth.inner();
+        let input = in_out.as_ptr();
+        let output = in_out.as_mut_ptr();
+        unsafe { aes_gcm_enc_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi) };
+        let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+            unreachable!() // Due to previous checks.
+        });
+        ctr.increment_by_less_safe(blocks);
+    }
+}
+
+pub(super) fn open_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context,
+    in_out: Overlapping,
+    ctr: &mut Counter,
+) {
+    prefixed_extern! {
+        fn aes_gcm_dec_update_vaes_avx10_512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::AES_KEY,
+            ivec: &mut Counter,
+            Htable: &gcm::HTable,
+            Xi: &mut gcm::Xi);
+    }
+
+    // Precondition. TODO: Create an overlapping::AsChunks for this.
+    assert_eq!(in_out.len() % BLOCK_LEN, 0);
+    // Precondition: Since we have a `gcm::Context` then the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    in_out.with_input_output_len(|input, output, len| {
+        if let Some(len) = NonZeroUsize::new(len) {
+            let aes_key = aes_key.inner_less_safe();
+            let (htable, xi) = auth.inner();
+            unsafe {
+                aes_gcm_dec_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi)
+            };
+            let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+                unreachable!() // Due to previous checks.
+            });
+            ctr.increment_by_less_safe(blocks);
+        }
+    })
+}
diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs
index 443c19e16b..b95ff8c70e 100644
--- a/src/aead/gcm.rs
+++ b/src/aead/gcm.rs
@@ -39,6 +39,7 @@ pub(super) mod clmulavxmovbe;
 pub(super) mod fallback;
 pub(super) mod neon;
 pub(super) mod vclmulavx2;
+pub(super) mod vclmulavx512;
 
 pub(super) struct Context<'key, K> {
     Xi: Xi,
@@ -128,6 +129,15 @@ impl Context<'_, vclmulavx2::Key> {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
+impl Context<'_, vclmulavx512::Key> {
+    /// Access to `inner` for the integrated AES-GCM implementations only.
+    #[inline]
+    pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) {
+        (self.key.inner(), &mut self.Xi)
+    }
+}
+
 impl Context<'_, K> {
     #[inline(always)]
     pub fn update_blocks(&mut self, input: AsChunks) {
diff --git a/src/aead/gcm/vclmulavx2.rs b/src/aead/gcm/vclmulavx2.rs
index ebf4e76ad4..edb7aa9220 100644
--- a/src/aead/gcm/vclmulavx2.rs
+++ b/src/aead/gcm/vclmulavx2.rs
@@ -27,6 +27,7 @@ pub struct Key {
 }
 
 impl Key {
+    #[inline(never)]
     pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self {
         Self {
             h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) },
diff --git a/src/aead/gcm/vclmulavx512.rs b/src/aead/gcm/vclmulavx512.rs
new file mode 100644
index 0000000000..e0eb14907f
--- /dev/null
+++ b/src/aead/gcm/vclmulavx512.rs
@@ -0,0 +1,49 @@
+// Copyright 2018-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+
+use super::{ffi::KeyValue, HTable, UpdateBlock, Xi};
+use crate::{
+    aead::gcm::ffi::BLOCK_LEN,
+    cpu::intel::{Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul},
+    polyfill::slice::AsChunks,
+};
+
+#[derive(Clone)]
+pub struct Key {
+    h_table: HTable,
+}
+
+impl Key {
+    pub(in super::super) fn new(
+        value: KeyValue,
+        _cpu: (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul),
+    ) -> Self {
+        Self {
+            h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx10_512, value) },
+        }
+    }
+
+    pub(super) fn inner(&self) -> &HTable {
+        &self.h_table
+    }
+}
+
+impl UpdateBlock for Key {
+    fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
+        let input: AsChunks = (&a).into();
+        unsafe { ghash!(gcm_ghash_vpclmulqdq_avx10_512_1, xi, &self.h_table, input) }
+    }
+}
diff --git a/src/cpu.rs b/src/cpu.rs
index 293e1b5355..cc73e32fb6 100644
--- a/src/cpu.rs
+++ b/src/cpu.rs
@@ -113,6 +113,20 @@ where
     }
 }
 
+impl GetFeature<(A, B, C, D)> for features::Values
+where
+    features::Values: GetFeature<(A, B)>,
+    features::Values: GetFeature<(C, D)>,
+{
+    #[inline(always)]
+    fn get_feature(&self) -> Option<(A, B, C, D)> {
+        match (self.get_feature(), self.get_feature()) {
+            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
+            _ => None,
+        }
+    }
+}
+
 impl GetFeature for Features
 where
     features::Values: GetFeature,
diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs
index f45052fe7f..d479ecd365 100644
--- a/src/cpu/intel.rs
+++ b/src/cpu/intel.rs
@@ -139,6 +139,11 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 {
     #[cfg(target_arch = "x86_64")]
     let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]);
 
+    // `OPENSSL_cpuid_setup` synthesizes this bit when it detects an Intel
+    // CPU family that is known to downclock when ZMM registers are used.
+    #[cfg(target_arch = "x86_64")]
+    let avoid_zmm = check(cpuid[2], 14);
+
     let mut caps = 0;
 
     // AMD: "Collectively the SSE1, [...] are referred to as the legacy SSE
@@ -250,6 +255,35 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 {
         // calling into the C code.
         let flag = unsafe { &avx2_available };
         flag.store(1, core::sync::atomic::Ordering::Relaxed);
+
+        // AVX-512.
+        // Initial releases of macOS 12 had a serious bug w.r.t. AVX-512
+        // support; see https://go-review.googlesource.com/c/sys/+/620256.
+        // Given that, plus Apple's transition to ARM, AVX-512 isn't worth
+        // supporting for their targets.
+        #[cfg(not(target_vendor = "apple"))]
+        {
+            // Intel: "15.3 DETECTION OF 512-BIT INSTRUCTION GROUPS OF THE INTEL
+            // AVX-512 FAMILY".
+            // `OPENSSL_cpuid_setup` clears these bits when XCR0[7:5] isn't 0b111,
+            // i.e. when the OS doesn't support AVX-512 state.
+            let f = check(extended_features_ebx, 16);
+            let bw = check(extended_features_ebx, 30);
+
+            // Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS
+            // OPERATING AT 256 AND 128-BIT VECTOR LENGTHS"
+            let vl = check(extended_features_ebx, 31);
+
+            // Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS
+            // OPERATING AT 256 AND 128-BIT VECTOR LENGTHS."
+            if !avoid_zmm && f {
+                // Intel: "Table 15-2. Feature Flag Collection Required of
+                // 256/128 Bit Vector Lengths for Each Instruction Group."
+                if bw && vl {
+                    set(&mut caps, Shift::Avx512_BW_VL_ZMM)
+                }
+            }
+        }
     }
 
     // Intel: "12.13.4 Checking for Intel AES-NI Support"
@@ -340,6 +374,11 @@ impl_get_feature! {
     { ("x86", "x86_64") => Aes },
     { ("x86", "x86_64") => Avx },
     { ("x86_64") => Bmi1 },
+
+    // AVX512BW + AVX512VL + AND using ZMM registers isn't expected to cause
+    // downclocking.
+    { ("x86_64") => Avx512_BW_VL_ZMM },
+
    { ("x86_64") => Avx2 },
     { ("x86_64") => Bmi2 },
     { ("x86_64") => Adx },
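
Reviewer note on the src/cpu/intel.rs hunk: the new Avx512_BW_VL_ZMM capability is only set when CPUID leaf 7 (sub-leaf 0) EBX reports AVX512F, AVX512BW, and AVX512VL, when the synthesized "avoid ZMM" hint is clear, and (per the comment) only when `OPENSSL_cpuid_setup` hasn't already cleared the bits because XCR0 shows the OS doesn't enable AVX-512 state. Below is a minimal, self-contained sketch of that gate; the function name `wants_avx512_bw_vl_zmm` and its parameters are illustrative, not ring's API, and the bit positions are the ones cited from the Intel SDM in the patch.

// Sketch only: mirrors the gating logic of the intel.rs hunk above,
// assuming `ebx7` is CPUID.(EAX=7,ECX=0):EBX and `avoid_zmm` is the
// synthesized "this CPU downclocks when ZMM is used" hint.
fn check(word: u32, bit: u32) -> bool {
    word & (1 << bit) != 0
}

fn wants_avx512_bw_vl_zmm(ebx7: u32, avoid_zmm: bool) -> bool {
    let f = check(ebx7, 16); // AVX512F
    let bw = check(ebx7, 30); // AVX512BW
    let vl = check(ebx7, 31); // AVX512VL
    // Same shape as `if !avoid_zmm && f { if bw && vl { set(...) } }`.
    !avoid_zmm && f && bw && vl
}

fn main() {
    // A leaf-7 EBX value with AVX512F, AVX512BW, and AVX512VL all set.
    let ebx7 = (1u32 << 16) | (1 << 30) | (1 << 31);
    assert!(wants_avx512_bw_vl_zmm(ebx7, false));
    // Even with the bits present, a downclocking CPU keeps the 256-bit path.
    assert!(!wants_avx512_bw_vl_zmm(ebx7, true));
}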
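Reviewer note on the src/cpu.rs hunk: the added GetFeature<(A, B, C, D)> impl composes the existing two-feature impls so that DynKey::new() can demand all four features the AVX-512 path needs, (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul), with a single `cpu.get_feature()` call, falling back to the AVX2 arm when it returns None. The following standalone sketch illustrates that composition pattern under simplified assumptions: the `Values` bitset, the `feature!` macro, and the `Avx512BwVlZmm` name are stand-ins for ring's internals, not its real types.

// Sketch only: a simplified model of feature-tuple composition.
#[derive(Clone, Copy)]
struct Values(u32); // bitset of detected CPU features (hypothetical)

trait GetFeature<T> {
    fn get_feature(&self) -> Option<T>;
}

macro_rules! feature {
    ($name:ident, $bit:expr) => {
        #[derive(Clone, Copy, Debug)]
        struct $name;
        impl GetFeature<$name> for Values {
            fn get_feature(&self) -> Option<$name> {
                if self.0 & (1 << $bit) != 0 {
                    Some($name)
                } else {
                    None
                }
            }
        }
    };
}

feature!(Avx2, 0);
feature!(Avx512BwVlZmm, 1);
feature!(Bmi2, 2);
feature!(VAesClmul, 3);

// Two features at a time...
impl<A, B> GetFeature<(A, B)> for Values
where
    Values: GetFeature<A> + GetFeature<B>,
{
    fn get_feature(&self) -> Option<(A, B)> {
        match (self.get_feature(), self.get_feature()) {
            (Some(a), Some(b)) => Some((a, b)),
            _ => None,
        }
    }
}

// ...and four at a time, built from the two-tuple impl, mirroring the
// GetFeature<(A, B, C, D)> impl added in the cpu.rs hunk.
impl<A, B, C, D> GetFeature<(A, B, C, D)> for Values
where
    Values: GetFeature<(A, B)> + GetFeature<(C, D)>,
{
    fn get_feature(&self) -> Option<(A, B, C, D)> {
        match (self.get_feature(), self.get_feature()) {
            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
            _ => None,
        }
    }
}

fn main() {
    // All four features detected: the AVX-512 AES-GCM key can be selected.
    let cpu = Values(0b1111);
    let all: Option<(Avx2, Avx512BwVlZmm, Bmi2, VAesClmul)> = cpu.get_feature();
    assert!(all.is_some());

    // Any missing feature (here VAesClmul) makes the whole tuple None,
    // so key construction falls through to the next arm.
    let no_vaes = Values(0b0111);
    let all: Option<(Avx2, Avx512BwVlZmm, Bmi2, VAesClmul)> = no_vaes.get_feature();
    assert!(all.is_none());
}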