aes-gcm: Enable AVX-512 implementation.
briansmith committed Mar 8, 2025
1 parent ea2e22e commit 159aa07
Showing 9 changed files with 236 additions and 82 deletions.
6 changes: 6 additions & 0 deletions build.rs
@@ -76,6 +76,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[

(&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
(&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
@@ -888,8 +889,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"LIMB_shr",
"OPENSSL_cpuid_setup",
"aes_gcm_dec_kernel",
"aes_gcm_dec_update_vaes_avx10_512",
"aes_gcm_dec_update_vaes_avx2",
"aes_gcm_enc_kernel",
"aes_gcm_enc_update_vaes_avx10_512",
"aes_gcm_enc_update_vaes_avx2",
"aes_hw_ctr32_encrypt_blocks",
"aes_hw_set_encrypt_key",
@@ -949,12 +952,15 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"gcm_ghash_avx",
"gcm_ghash_clmul",
"gcm_ghash_neon",
"gcm_ghash_vpclmulqdq_avx10_512_1",
"gcm_ghash_vpclmulqdq_avx2_1",
"gcm_gmult_clmul",
"gcm_gmult_neon",
"gcm_gmult_vpclmulqdq_avx2",
"gcm_init_avx",
"gcm_init_clmul",
"gcm_init_neon",
"gcm_init_vpclmulqdq_avx10_512",
"gcm_init_vpclmulqdq_avx2",
"k25519Precomp",
"limbs_mul_add_limb",
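The new `*_avx10_512` entries must appear in this list so the symbol-prefixing pass renames them along with every other unmangled assembly entry point, letting two versions of the crate coexist in one binary. A minimal sketch of the idea, not ring's actual build code, with a hypothetical prefix:

```rust
// A minimal sketch (not ring's build logic) of applying a symbol prefix.
fn prefix_symbol(prefix: &str, symbol: &str) -> String {
    format!("{prefix}{symbol}")
}

fn main() {
    // Hypothetical prefix; the real one is derived from the crate version.
    let prefix = "ring_core_x_y_z_";
    for sym in [
        "aes_gcm_enc_update_vaes_avx10_512",
        "aes_gcm_dec_update_vaes_avx10_512",
        "gcm_ghash_vpclmulqdq_avx10_512_1",
        "gcm_init_vpclmulqdq_avx10_512",
    ] {
        println!("{}", prefix_symbol(prefix, sym));
    }
}
```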
83 changes: 1 addition & 82 deletions crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -737,59 +737,6 @@ sub _ghash_update {
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
# Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
cmp \$$VL, $AADLEN
jb .Laad_blockbyblock$local_label_suffix

# AADLEN >= VL, so we'll operate on full vectors. Broadcast bswap_mask and
# gfpoly to all 128-bit lanes.
vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY

# Load the lowest set of key powers.
vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1

cmp \$4*$VL-1, $AADLEN
jbe .Laad_loop_1x$local_label_suffix

# AADLEN >= 4*VL. Load the higher key powers.
vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2

# Update GHASH with 4*VL bytes of AAD at a time.
.Laad_loop_4x$local_label_suffix:
vmovdqu8 0*$VL($AAD), $GHASHDATA0
vmovdqu8 1*$VL($AAD), $GHASHDATA1
vmovdqu8 2*$VL($AAD), $GHASHDATA2
vmovdqu8 3*$VL($AAD), $GHASHDATA3
@{[ _ghash_4x ]}
sub \$-4*$VL, $AAD # shorter than 'add 4*VL' when VL=32
add \$-4*$VL, $AADLEN
cmp \$4*$VL-1, $AADLEN
ja .Laad_loop_4x$local_label_suffix

# Update GHASH with VL bytes of AAD at a time.
cmp \$$VL, $AADLEN
jb .Laad_large_done$local_label_suffix
.Laad_loop_1x$local_label_suffix:
vmovdqu8 ($AAD), $GHASHDATA0
vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
@{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
$GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
@{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
$GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
add \$$VL, $AAD
sub \$$VL, $AADLEN
cmp \$$VL, $AADLEN
jae .Laad_loop_1x$local_label_suffix

.Laad_large_done$local_label_suffix:
# Issue the vzeroupper that is needed after using ymm or zmm registers.
# Do it here instead of at the end, to minimize overhead for small AADLEN.
vzeroupper

# GHASH the remaining data 16 bytes at a time, using xmm registers only.
.Laad_blockbyblock$local_label_suffix:
test $AADLEN, $AADLEN
@@ -801,9 +748,6 @@ sub _ghash_update {
vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
@{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
$GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
add \$16, $AAD
sub \$16, $AADLEN
jnz .Laad_loop_blockbyblock$local_label_suffix
.Laad_done$local_label_suffix:
# Store the updated GHASH accumulator back to memory.
@@ -1303,31 +1247,6 @@ sub _aes_gcm_update {
return $code;
}

# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
{
my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
map( "%xmm$_", ( 0 .. 6 ) );

$code .= <<___;
@{[ _save_xmmregs (6) ]}
.seh_endprologue

vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
vmovdqu $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
vmovdqu .Lgfpoly(%rip), $GFPOLY
vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC

@{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}

vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR)
___
}
$code .= _end_func;

# Disabled until significant deployment of AVX10/256 is seen. The separate
# *_vaes_avx2 implementation provides the only 256-bit support for now.
#
@@ -1353,7 +1272,7 @@ sub _aes_gcm_update {
$code .= _aes_gcm_init;
$code .= _end_func;

$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512_1", 1;
$code .= _ghash_update;
$code .= _end_func;

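Nearly all of this file's diff is deletion: the wide AAD fast path, which hashed additional authenticated data in 4×VL strides and then single VL-byte vectors before falling back to 16-byte blocks, is removed, as is the standalone `gcm_gmult_vpclmulqdq_avx10` helper; the one addition renames the GHASH entry point with a `_1` suffix. A schematic Rust rendering of the deleted chunking order, purely illustrative, with VL = 64 bytes for the 512-bit configuration:

```rust
// Purely illustrative: the order in which the deleted loops consumed AAD.
const VL: usize = 64; // bytes per 512-bit (zmm) vector

fn ghash_aad_schedule(mut len: usize) -> (usize, usize, usize) {
    let four_x = len / (4 * VL); // .Laad_loop_4x: 4*VL-byte strides
    len %= 4 * VL;
    let one_x = len / VL; // .Laad_loop_1x: single VL-byte vectors
    len %= VL;
    (four_x, one_x, len / 16) // .Laad_loop_blockbyblock: 16-byte blocks
}

fn main() {
    // One 256-byte stride, one 64-byte vector, then two 16-byte blocks.
    assert_eq!(ghash_aad_schedule(256 + 64 + 32), (1, 1, 2));
}
```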
23 changes: 23 additions & 0 deletions src/aead/aes_gcm.rs
@@ -36,6 +36,7 @@ use cpu::GetFeature as _;
mod aarch64;
mod aeshwclmulmovbe;
mod vaesclmulavx2;
mod vaesclmulavx512;

#[derive(Clone)]
pub(super) struct Key(DynKey);
@@ -51,6 +52,9 @@ impl Key {

#[derive(Clone)]
enum DynKey {
#[cfg(target_arch = "x86_64")]
VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),

#[cfg(target_arch = "x86_64")]
VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),

@@ -86,6 +90,9 @@ impl DynKey {
let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?;
let gcm_key_value = derive_gcm_key_value(&aes_key);
let combo = if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
Self::VAesClMulAvx512(Combo { aes_key, gcm_key })
} else if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
} else if let Some(cpu) = cpu.get_feature() {
@@ -190,6 +197,11 @@ pub(super) fn seal(
seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx512(c) => {
seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => seal_whole_partial(
c,
@@ -317,6 +329,17 @@ pub(super) fn open(
open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx512(c) => open_whole_partial(
c,
aad,
in_out_slice,
src,
ctr,
tag_iv,
vaesclmulavx512::open_whole,
),

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => open_whole_partial(
c,
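The new `VAesClMulAvx512` arm sits ahead of the AVX2 one, so the widest implementation wins when the CPU supports both; each `cpu.get_feature()` call is type-inferred from the feature tuple the corresponding key constructor demands. A simplified, self-contained sketch of that dispatch pattern, with all names hypothetical:

```rust
// All names hypothetical; this only illustrates the dispatch shape.
trait GetFeature<T> {
    fn get_feature(&self) -> Option<T>;
}

struct Cpu {
    avx512: bool,
    avx2: bool,
}

struct Avx512Token; // stands in for (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul)
struct Avx2Token; // stands in for (Avx2, VAesClmul)

impl GetFeature<Avx512Token> for Cpu {
    fn get_feature(&self) -> Option<Avx512Token> {
        self.avx512.then_some(Avx512Token)
    }
}

impl GetFeature<Avx2Token> for Cpu {
    fn get_feature(&self) -> Option<Avx2Token> {
        self.avx2.then_some(Avx2Token)
    }
}

fn select(cpu: &Cpu) -> &'static str {
    // Widest first, narrowing on each fallthrough, as in the diff above.
    if GetFeature::<Avx512Token>::get_feature(cpu).is_some() {
        "VAesClMulAvx512"
    } else if GetFeature::<Avx2Token>::get_feature(cpu).is_some() {
        "VAesClMulAvx2"
    } else {
        "fallback"
    }
}

fn main() {
    assert_eq!(select(&Cpu { avx512: true, avx2: true }), "VAesClMulAvx512");
    assert_eq!(select(&Cpu { avx512: false, avx2: true }), "VAesClMulAvx2");
}
```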
93 changes: 93 additions & 0 deletions src/aead/aes_gcm/vaesclmulavx512.rs
@@ -0,0 +1,93 @@
// Copyright 2015-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]

use super::{aes, gcm, Counter, BLOCK_LEN};
use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut};
use core::num::{NonZeroU32, NonZeroUsize};

pub(super) fn seal_whole(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
ctr: &mut Counter,
mut in_out: AsChunksMut<u8, BLOCK_LEN>,
) {
prefixed_extern! {
fn aes_gcm_enc_update_vaes_avx10_512(
input: *const u8,
output: *mut u8,
len: c::NonZero_size_t, // TODO? zero OK?
key: &aes::AES_KEY,
ivec: &Counter,
Htable: &gcm::HTable,
Xi: &mut gcm::Xi);
}

let in_out = in_out.as_flattened_mut();

// Precondition: Since we have a `gcm::Context`, the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

if let Some(len) = NonZeroUsize::new(in_out.len()) {
let aes_key = aes_key.inner_less_safe();
let (htable, xi) = auth.inner();
let input = in_out.as_ptr();
let output = in_out.as_mut_ptr();
unsafe { aes_gcm_enc_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi) };
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
unreachable!() // Due to previous checks.
});
ctr.increment_by_less_safe(blocks);
}
}

pub(super) fn open_whole(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
in_out: Overlapping,
ctr: &mut Counter,
) {
prefixed_extern! {
fn aes_gcm_dec_update_vaes_avx10_512(
input: *const u8,
output: *mut u8,
len: c::NonZero_size_t, // TODO? zero OK?
key: &aes::AES_KEY,
ivec: &mut Counter,
Htable: &gcm::HTable,
Xi: &mut gcm::Xi);
}

// Precondition. TODO: Create an overlapping::AsChunks for this.
assert_eq!(in_out.len() % BLOCK_LEN, 0);
// Precondition: Since we have a `gcm::Context`, the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

in_out.with_input_output_len(|input, output, len| {
if let Some(len) = NonZeroUsize::new(len) {
let aes_key = aes_key.inner_less_safe();
let (htable, xi) = auth.inner();
unsafe {
aes_gcm_dec_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi)
};
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
unreachable!() // Due to previous checks.
});
ctr.increment_by_less_safe(blocks);
}
})
}
10 changes: 10 additions & 0 deletions src/aead/gcm.rs
@@ -39,6 +39,7 @@ pub(super) mod clmulavxmovbe;
pub(super) mod fallback;
pub(super) mod neon;
pub(super) mod vclmulavx2;
pub(super) mod vclmulavx512;

pub(super) struct Context<'key, K> {
Xi: Xi,
@@ -128,6 +129,15 @@ impl Context<'_, vclmulavx2::Key> {
}
}

#[cfg(target_arch = "x86_64")]
impl Context<'_, vclmulavx512::Key> {
/// Access to `inner` for the integrated AES-GCM implementations only.
#[inline]
pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) {
(self.key.inner(), &mut self.Xi)
}
}

impl<K: UpdateBlocks> Context<'_, K> {
#[inline(always)]
pub fn update_blocks(&mut self, input: AsChunks<u8, BLOCK_LEN>) {
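As with the `vclmulavx2` impl above it, `inner` hands the integrated assembly a split borrow: the key's powers table immutably and the GHASH accumulator mutably. A minimal sketch of why the accessor has this shape, with stand-in types:

```rust
// Stand-in types; only the borrow shape is the point.
struct HTable;
struct Xi(u128);

struct Context {
    xi: Xi,
    h_table: HTable,
}

impl Context {
    // The key's powers table is read-only; the accumulator is updated in
    // place. Splitting the borrows keeps the rest of the context private.
    fn inner(&mut self) -> (&HTable, &mut Xi) {
        (&self.h_table, &mut self.xi)
    }
}

fn main() {
    let mut ctx = Context { xi: Xi(0), h_table: HTable };
    let (_htable, xi) = ctx.inner();
    xi.0 ^= 1;
    assert_eq!(ctx.xi.0, 1);
}
```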
1 change: 1 addition & 0 deletions src/aead/gcm/vclmulavx2.rs
@@ -27,6 +27,7 @@ pub struct Key {
}

impl Key {
#[inline(never)]
pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self {
Self {
h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) },
49 changes: 49 additions & 0 deletions src/aead/gcm/vclmulavx512.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Copyright 2018-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]

use super::{ffi::KeyValue, HTable, UpdateBlock, Xi};
use crate::{
aead::gcm::ffi::BLOCK_LEN,
cpu::intel::{Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul},
polyfill::slice::AsChunks,
};

#[derive(Clone)]
pub struct Key {
h_table: HTable,
}

impl Key {
pub(in super::super) fn new(
value: KeyValue,
_cpu: (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul),
) -> Self {
Self {
h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx10_512, value) },
}
}

pub(super) fn inner(&self) -> &HTable {
&self.h_table
}
}

impl UpdateBlock for Key {
fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
let input: AsChunks<u8, BLOCK_LEN> = (&a).into();
unsafe { ghash!(gcm_ghash_vpclmulqdq_avx10_512_1, xi, &self.h_table, input) }
}
}
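Implementing `UpdateBlock` lets the generic GHASH code route a lone trailing block through the same key, via the `gcm_ghash_vpclmulqdq_avx10_512_1` entry point renamed above. A toy sketch of the trait's contract; the GF(2^128) multiply is replaced by XOR so the example runs standalone:

```rust
const BLOCK_LEN: usize = 16;

struct Xi([u8; BLOCK_LEN]);

trait UpdateBlock {
    fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]);
}

struct PlaceholderKey;

impl UpdateBlock for PlaceholderKey {
    fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
        // A real key multiplies (Xi ^ a) by H in GF(2^128); plain XOR is
        // used here only so the sketch runs standalone.
        for (x, b) in xi.0.iter_mut().zip(a) {
            *x ^= b;
        }
    }
}

fn main() {
    let key = PlaceholderKey;
    let mut xi = Xi([0u8; BLOCK_LEN]);
    key.update_block(&mut xi, [0xff; BLOCK_LEN]);
    assert_eq!(xi.0, [0xff; BLOCK_LEN]);
}
```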