Make shuffles for rotates optional and document SIMD features

9 years ago · c59b93819c
3 changed files with 33 additions and 9 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "blake2-rfc"
-version = "0.2.7"
+version = "0.2.8"
 authors = ["Cesar Eduardo Barros <cesarb@cesarb.eti.br>"]
 description = "A pure Rust implementation of BLAKE2 based on the draft RFC."
 repository = "https://github.com/cesarb/blake2-rfc"
@ -11,7 +11,8 @@ license = "MIT"
 [features]
 bench = []
 simd = ["simdty"]
-simd_asm = ["simd"]
+simd_opt = ["simd"]
+simd_asm = ["simd_opt"]

 [dependencies]
 constant_time_eq = "0.1.0"
--- a/README.md
+++ b/README.md
@ -51,3 +51,26 @@ in tree hashing mode. You are responsible for creating a valid parameter
 block, for hashing the padded key block if using keyed hashing, and for
 calling the correct finalization function. The parameter block is not
 validated by these functions.
+
+## SIMD optimization
+
+This crate has experimental support for explicit SIMD optimizations. This
+support requires nightly Rust because it relies on unstable features.
+
+The following Cargo features enable the explicit SIMD optimizations:
+
+* `simd` enables the explicit use of SIMD vectors instead of a plain
+  struct
+* `simd_opt` additionally enables the use of SIMD shuffles to implement
+  some of the rotates
+* `simd_asm` additionally enables the use of inline asm to implement
+  some of the SIMD shuffles
+
+While one might expect each of these to be faster than the previous
+one, and all of them to be faster than not enabling explicit SIMD
+vectors, that's not always the case. The result can vary depending on the
+target architecture and compiler options. If you need the extra speed from
+these optimizations, benchmark each one (the `bench` feature enables
+`cargo bench` in this crate, so you can use, for instance, `cargo bench
+--features="bench simd_asm"`). They are currently tuned for SSE2
+(x86 and x86-64) and NEON (ARM).
--- a/src/simd.rs
+++ b/src/simd.rs
@ -24,7 +24,7 @@
 // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 // DEALINGS IN THE SOFTWARE.

-#[cfg(feature = "simd")]
+#[cfg(feature = "simd_opt")]
 use std::mem::transmute;

 #[cfg(feature = "simd")]
@ -158,7 +158,7 @@ macro_rules! impl_vector_common {
    }
 }

-#[cfg(feature = "simd")]
+#[cfg(feature = "simd_opt")]
 #[cfg(any(target_arch = "arm", target_arch = "aarch64",
          target_arch = "x86", target_arch = "x86_64"))]
 #[inline(always)]
@ -176,7 +176,7 @@ fn u32x4_rotate_right_16(vec: u32x4) -> u32x4 {
 impl Vector for u32x4 {
    impl_vector_common!(u32x4, u32, 32);

-    #[cfg(feature = "simd")]
+    #[cfg(feature = "simd_opt")]
    #[cfg(any(target_arch = "arm", target_arch = "aarch64",
              target_arch = "x86", target_arch = "x86_64"))]
    #[inline(always)]
@ -189,7 +189,7 @@ impl Vector for u32x4 {
    }
 }

-#[cfg(feature = "simd")]
+#[cfg(feature = "simd_opt")]
 #[cfg(any(target_arch = "arm", target_arch = "aarch64",
          target_arch = "x86", target_arch = "x86_64"))]
 #[inline(always)]
@ -204,7 +204,7 @@ fn u64x4_rotate_right_32(vec: u64x4) -> u64x4 {
    }
 }

-#[cfg(feature = "simd")]
+#[cfg(feature = "simd_opt")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[inline(always)]
 fn u64x4_rotate_right_16(vec: u64x4) -> u64x4 {
@ -247,7 +247,7 @@ fn u64x4_rotate_right_u8(vec: u64x4, n: u8) -> u64x4 {
 impl Vector for u64x4 {
    impl_vector_common!(u64x4, u64, 64);

-    #[cfg(feature = "simd")]
+    #[cfg(feature = "simd_opt")]
    #[cfg(any(all(target_arch = "arm", not(feature = "simd_asm")),
              target_arch = "aarch64"))]
    #[inline(always)]
@ -272,7 +272,7 @@ impl Vector for u64x4 {
        }
    }

-    #[cfg(feature = "simd")]
+    #[cfg(feature = "simd_opt")]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline(always)]
    fn rotate_right(self, n: u32) -> Self