
Implement SIMD for NEON

On my phone, this NEON code is 2.3x as fast as the non-NEON code for
BLAKE2b and 1.6x as fast for BLAKE2s.
Cesar Eduardo Barros · 9 years ago · commit fa3ae10063

Changed files:
  1. Cargo.toml (2 changed lines)
  2. src/simd/mod.rs (13 changed lines)
  3. src/simd/neon.rs (174 changed lines)

Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "blake2-rfc"
-version = "0.2.5"
+version = "0.2.6"
 authors = ["Cesar Eduardo Barros <cesarb@cesarb.eti.br>"]
 description = "A pure Rust implementation of BLAKE2 based on the draft RFC."
 repository = "https://github.com/cesarb/blake2-rfc"

src/simd/mod.rs

@@ -31,9 +31,18 @@ mod sse2;
#[cfg(all(feature = "simd", any(target_arch = "x86", target_arch = "x86_64")))]
pub use self::sse2::*;
#[cfg(all(feature = "simd", target_arch = "arm", target_endian = "little"))]
mod neon;
#[cfg(all(feature = "simd", target_arch = "arm", target_endian = "little"))]
pub use self::neon::*;
#[cfg(not(all(feature = "simd", any(target_arch = "x86",
target_arch = "x86_64"))))]
target_arch = "x86_64",
all(target_arch = "arm",
target_endian = "little")))))]
mod fallback;
#[cfg(not(all(feature = "simd", any(target_arch = "x86",
target_arch = "x86_64"))))]
target_arch = "x86_64",
all(target_arch = "arm",
target_endian = "little")))))]
pub use self::fallback::*;
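
For orientation, the point of the paired cfg gates and re-exports is that the rest of the crate never names a backend: it calls a single vec4_u32/vec4_u64 API and the attributes above pick neon.rs, sse2.rs, or fallback.rs at compile time. A minimal, hypothetical caller sketch (not part of this commit; the function name is made up, and only wrapping_add and rotate_right from the diff below are assumed):

    // Hypothetical caller elsewhere in the crate: the same source compiles
    // against whichever backend the cfg gates above select.
    use simd::vec4_u32;

    // Made-up helper for illustration; BLAKE2's real round function lives elsewhere.
    fn mix(a: vec4_u32, b: vec4_u32) -> vec4_u32 {
        a.wrapping_add(b).rotate_right(16)
    }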

src/simd/neon.rs

@@ -0,0 +1,174 @@
// Copyright (c) 2015 Cesar Eduardo Barros
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the
// Software without restriction, including without
// limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial portions
// of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
use std::ops::BitXor;

#[derive(Clone, Copy, Debug)]
#[repr(C)]
#[simd]
pub struct vec4_u32(u32, u32, u32, u32);

#[derive(Clone, Copy, Debug)]
#[repr(C)]
#[simd]
struct u64x2(u64, u64);

// A 4 x u64 vector is 256 bits, twice the width of a 128-bit NEON register,
// so it is modeled as a pair of two-lane halves.
#[derive(Clone, Copy, Debug)]
pub struct vec4_u64(u64x2, u64x2);

#[derive(Clone, Copy)] #[repr(C)] #[simd] struct i32x4(i32, i32, i32, i32);
#[derive(Clone, Copy)] #[repr(C)] #[simd] struct i64x2(i64, i64);

// Bindings to the LLVM intrinsics for the NEON VSHL (vector shift) instruction.
// The per-lane shift count is signed: a positive count shifts left, a negative
// count shifts right.
extern {
    #[link_name = "llvm.arm.neon.vshiftu.v4i32"]
    fn vshiftu_v4i32(a: vec4_u32, b: i32x4) -> vec4_u32;

    #[link_name = "llvm.arm.neon.vshiftu.v2i64"]
    fn vshiftu_v2i64(a: u64x2, b: i64x2) -> u64x2;
}
// Whole-vector shifts built on the VSHL intrinsics above; the right shifts
// simply pass a negated count.
#[inline(always)]
fn vshlq_n_u32(a: vec4_u32, n: u32) -> vec4_u32 {
    let n_ = n as i32;
    unsafe { vshiftu_v4i32(a, i32x4(n_, n_, n_, n_)) }
}

#[inline(always)]
fn vshrq_n_u32(a: vec4_u32, n: u32) -> vec4_u32 {
    let n_ = -(n as i32);
    unsafe { vshiftu_v4i32(a, i32x4(n_, n_, n_, n_)) }
}

#[inline(always)]
fn vshlq_n_u64(a: u64x2, n: u32) -> u64x2 {
    let n_ = n as i64;
    unsafe { vshiftu_v2i64(a, i64x2(n_, n_)) }
}

#[inline(always)]
fn vshrq_n_u64(a: u64x2, n: u32) -> u64x2 {
    let n_ = -(n as i64);
    unsafe { vshiftu_v2i64(a, i64x2(n_, n_)) }
}
impl vec4_u32 {
    #[inline(always)]
    pub fn new(a: u32, b: u32, c: u32, d: u32) -> Self {
        vec4_u32(a, b, c, d)
    }

    // This module is gated to little-endian targets, so the byte-order
    // conversions are no-ops.
    #[inline(always)] pub fn from_le(self) -> Self { self }
    #[inline(always)] pub fn to_le(self) -> Self { self }

    #[inline(always)]
    pub fn wrapping_add(self, rhs: Self) -> Self {
        self + rhs
    }

    // NEON has no vector rotate, so a rotation is assembled from two shifts;
    // the shifted-out halves occupy disjoint bits, so XOR combines them like OR.
    #[inline(always)]
    pub fn rotate_right(self, n: u32) -> Self {
        vshrq_n_u32(self, n) ^ vshlq_n_u32(self, 32 - n)
    }

    #[inline(always)]
    pub fn shuffle_left_1(self) -> Self {
        vec4_u32(self.1, self.2, self.3, self.0)
    }

    #[inline(always)]
    pub fn shuffle_left_2(self) -> Self {
        vec4_u32(self.2, self.3, self.0, self.1)
    }

    #[inline(always)]
    pub fn shuffle_left_3(self) -> Self {
        vec4_u32(self.3, self.0, self.1, self.2)
    }

    // Rotating the lanes right by k is the same as rotating them left by 4 - k.
    #[inline(always)]
    pub fn shuffle_right_1(self) -> Self { self.shuffle_left_3() }
    #[inline(always)]
    pub fn shuffle_right_2(self) -> Self { self.shuffle_left_2() }
    #[inline(always)]
    pub fn shuffle_right_3(self) -> Self { self.shuffle_left_1() }
}
// vec4_u64 is an ordinary struct (only its u64x2 halves are #[simd] types),
// so XOR has to be implemented by hand, half by half.
impl BitXor for vec4_u64 {
    type Output = Self;

    #[inline(always)]
    fn bitxor(self, rhs: Self) -> Self::Output {
        vec4_u64(self.0 ^ rhs.0,
                 self.1 ^ rhs.1)
    }
}

impl vec4_u64 {
    #[inline(always)]
    pub fn new(a: u64, b: u64, c: u64, d: u64) -> Self {
        vec4_u64(u64x2(a, b), u64x2(c, d))
    }

    #[inline(always)] pub fn from_le(self) -> Self { self }
    #[inline(always)] pub fn to_le(self) -> Self { self }

    #[inline(always)]
    pub fn wrapping_add(self, rhs: Self) -> Self {
        vec4_u64(self.0 + rhs.0, self.1 + rhs.1)
    }

    #[inline(always)]
    pub fn rotate_right(self, n: u32) -> Self {
        vec4_u64(vshrq_n_u64(self.0, n) ^ vshlq_n_u64(self.0, 64 - n),
                 vshrq_n_u64(self.1, n) ^ vshlq_n_u64(self.1, 64 - n))
    }

    // Lane rotations have to move elements across the two u64x2 halves.
    #[inline(always)]
    pub fn shuffle_left_1(self) -> Self {
        vec4_u64(u64x2((self.0).1, (self.1).0), u64x2((self.1).1, (self.0).0))
    }

    #[inline(always)]
    pub fn shuffle_left_2(self) -> Self {
        vec4_u64(self.1, self.0)
    }

    #[inline(always)]
    pub fn shuffle_left_3(self) -> Self {
        vec4_u64(u64x2((self.1).1, (self.0).0), u64x2((self.0).1, (self.1).0))
    }

    #[inline(always)]
    pub fn shuffle_right_1(self) -> Self { self.shuffle_left_3() }
    #[inline(always)]
    pub fn shuffle_right_2(self) -> Self { self.shuffle_left_2() }
    #[inline(always)]
    pub fn shuffle_right_3(self) -> Self { self.shuffle_left_1() }
}
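
A note on the rotate_right implementations above: since NEON has no vector rotate, each 32-bit or 64-bit lane is rotated as (x >> n) ^ (x << (w - n)), and because the two shifted halves occupy disjoint bit positions, XOR acts like OR here. A small standalone scalar sketch checking that identity (illustration only, not part of the commit):

    // Scalar check of the shift/XOR rotation identity used per lane above.
    fn rotr32(x: u32, n: u32) -> u32 {
        // n must stay strictly between 0 and 32 so neither shift overflows;
        // the rotation counts BLAKE2s feeds in (16, 12, 8, 7) satisfy this.
        debug_assert!(n > 0 && n < 32);
        (x >> n) ^ (x << (32 - n))
    }

    fn main() {
        let x = 0x0123_4567u32;
        for n in 1..32 {
            assert_eq!(rotr32(x, n), x.rotate_right(n));
        }
        println!("shift/XOR rotation matches rotate_right for n in 1..32");
    }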