// Copyright 2019 TiKV Project Authors. Licensed under MIT or Apache-2.0.

//! PCLMULQDQ-based CRC-64-ECMA computer.
//!
//! The implementation is based on Intel's "Fast CRC Computation for Generic
//! Polynomials Using PCLMULQDQ Instruction" [white paper].
//!
//! [white paper]: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
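//!
//! Callers are expected to go through [`get_update`], which returns the SIMD
//! implementation only when the CPU reports the required features and falls
//! back to the table-driven implementation otherwise. A minimal sketch of the
//! intended call pattern (the zero starting state is illustrative only):
//!
//! ```ignore
//! let update = get_update();
//! let mut state = 0u64;
//! state = update(state, b"some bytes");
//! state = update(state, b"more bytes");
//! ```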

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use super::table;

pub fn get_update() -> super::UpdateFn {
    if is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128
        && is_x86_feature_detected!("sse2") // (all other _mm_*)
        && is_x86_feature_detected!("sse4.1")
    // _mm_extract_epi64
    {
        update
    } else {
        table::update
    }
}

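/// Splits `bytes` into an unaligned prefix, a run of 128-byte (`[__m128i; 8]`)
/// aligned blocks processed with PCLMULQDQ folding, and an unaligned suffix;
/// the prefix and suffix are handled by the table-driven fallback.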
fn update(mut state: u64, bytes: &[u8]) -> u64 {
    let (left, middle, right) = unsafe { bytes.align_to::<[__m128i; 8]>() };
    if let Some((first, rest)) = middle.split_first() {
        state = table::update(state, left);
        state = unsafe { update_simd(state, first, rest) };
        table::update(state, right)
    } else {
        table::update(state, bytes)
    }
}

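/// One folding step from the white paper: carry-less multiplies the low half
/// of `x` by the high half of `coeff` and the high half of `x` by the low
/// half of `coeff`, then XORs both products into `y`.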
#[target_feature(enable = "pclmulqdq", enable = "sse2")]
unsafe fn fold(coeff: __m128i, x: __m128i, y: __m128i) -> __m128i {
    let h = _mm_clmulepi64_si128(x, coeff, 0x10);
    let l = _mm_clmulepi64_si128(x, coeff, 0x01);
    _mm_xor_si128(_mm_xor_si128(h, l), y)
}

#[target_feature(enable = "sse2")]
unsafe fn build_const(high: u64, low: u64) -> __m128i {
    _mm_set_epi64x(high as i64, low as i64)
}

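/// Folds whole 128-byte blocks with PCLMULQDQ, then reduces the remaining
/// 128 bytes down to 16 bytes with the distance-specific constants below and
/// to the final 64-bit state with a Barrett reduction.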
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn update_simd(state: u64, first: &[__m128i; 8], rest: &[[__m128i; 8]]) -> u64 {
    let state = build_const(0, state);

    // load the initial 128 bytes of data
    let mut x0 = _mm_load_si128(first.as_ptr());
    let mut x1 = _mm_load_si128(first.as_ptr().add(1));
    let mut x2 = _mm_load_si128(first.as_ptr().add(2));
    let mut x3 = _mm_load_si128(first.as_ptr().add(3));
    let mut x4 = _mm_load_si128(first.as_ptr().add(4));
    let mut x5 = _mm_load_si128(first.as_ptr().add(5));
    let mut x6 = _mm_load_si128(first.as_ptr().add(6));
    let mut x7 = _mm_load_si128(first.as_ptr().add(7));

    // xor the initial CRC value
    x0 = _mm_xor_si128(x0, state);

    // all K_nnn constants are computed by bit_reverse(x^nnn mod POLY).
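    // (here POLY means the CRC-64/XZ generator polynomial, i.e. x^64 plus the
    // bits of 0x42f0_e1eb_a9ea_3693, not the bit-reversed `POLY` constant used
    // for the Barrett reduction at the end of this function.)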
    const K_1023: u64 = 0xd7d8_6b2a_f73d_e740;
    const K_1087: u64 = 0x8757_d71d_4fcc_1000;
    let coeff_128 = build_const(K_1087, K_1023);

    // perform 128-byte folding.
    for chunk in rest {
        x0 = fold(coeff_128, x0, _mm_load_si128(chunk.as_ptr()));
        x1 = fold(coeff_128, x1, _mm_load_si128(chunk.as_ptr().add(1)));
        x2 = fold(coeff_128, x2, _mm_load_si128(chunk.as_ptr().add(2)));
        x3 = fold(coeff_128, x3, _mm_load_si128(chunk.as_ptr().add(3)));
        x4 = fold(coeff_128, x4, _mm_load_si128(chunk.as_ptr().add(4)));
        x5 = fold(coeff_128, x5, _mm_load_si128(chunk.as_ptr().add(5)));
        x6 = fold(coeff_128, x6, _mm_load_si128(chunk.as_ptr().add(6)));
        x7 = fold(coeff_128, x7, _mm_load_si128(chunk.as_ptr().add(7)));
    }

    // fold by distance of 112 bytes
    const K_895: u64 = 0x9478_74de_5950_52cb;
    const K_959: u64 = 0x9e73_5cb5_9b47_24da;
    x7 = fold(build_const(K_959, K_895), x0, x7);

    // fold by distance of 96 bytes
    const K_767: u64 = 0xe4ce_2cd5_5fea_0037;
    const K_831: u64 = 0x2fe3_fd29_20ce_82ec;
    x7 = fold(build_const(K_831, K_767), x1, x7);

    // fold by distance of 80 bytes
    const K_639: u64 = 0x0e31_d519_421a_63a5;
    const K_703: u64 = 0x2e30_2032_12ca_c325;
    x7 = fold(build_const(K_703, K_639), x2, x7);

    // fold by distance of 64 bytes
    const K_511: u64 = 0x081f_6054_a784_2df4;
    const K_575: u64 = 0x6ae3_efbb_9dd4_41f3;
    x7 = fold(build_const(K_575, K_511), x3, x7);

    // fold by distance of 48 bytes
    const K_383: u64 = 0x69a3_5d91_c373_0254;
    const K_447: u64 = 0xb5ea_1af9_c013_aca4;
    x7 = fold(build_const(K_447, K_383), x4, x7);

    // fold by distance of 32 bytes
    const K_255: u64 = 0x3be6_53a3_0fe1_af51;
    const K_319: u64 = 0x6009_5b00_8a9e_fa44;
    x7 = fold(build_const(K_319, K_255), x5, x7);

    // fold by distance of 16 bytes
    const K_127: u64 = 0xdabe_95af_c787_5f40; // same as table::TABLE_7[1]
    const K_191: u64 = 0xe05d_d497_ca39_3ae4; // same as table::TABLE_15[1]
    x7 = fold(build_const(K_191, K_127), x6, x7);

    // finally fold 16 bytes into 8 bytes.
    let r = _mm_clmulepi64_si128(x7, build_const(0, K_127), 0x00);
    let r = _mm_xor_si128(r, _mm_srli_si128(x7, 8));

    // Barrett reduction.
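    // (MU is derived from the quotient x^128 / P(x) and POLY from P(x) itself,
    // both in bit-reversed form, as in the white paper; the final remainder
    // lands in the upper 64-bit lane, hence `_mm_extract_epi64(res, 1)` below.)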
    const MU: u64 = 0x9c3e_466c_1729_63d5;
    const POLY: u64 = 0x92d8_af2b_af0e_1e85;
    let polymu = build_const(POLY, MU);
    let t1 = _mm_clmulepi64_si128(r, polymu, 0x00);
    let t2 = _mm_clmulepi64_si128(t1, polymu, 0x10);
    let res = _mm_xor_si128(_mm_xor_si128(t2, _mm_slli_si128(t1, 8)), r);

    _mm_extract_epi64(res, 1) as u64
}
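
// A consistency check added as a sketch: it assumes `table::update` stays
// accessible exactly as it is used above and relies only on the fact that the
// SIMD path (when selected by `get_update`) must agree with the table-driven
// fallback for every starting state, length and alignment. The test name,
// sample data and starting state are illustrative choices, not part of the
// original module.
#[cfg(test)]
mod tests {
    use super::{get_update, table};

    #[test]
    fn simd_update_matches_table_update() {
        // Deterministic input, long enough that `align_to::<[__m128i; 8]>()`
        // yields several 128-byte middle chunks at every tested alignment.
        let bytes: Vec<u8> = (0u32..2048)
            .map(|i| (i.wrapping_mul(31).wrapping_add(7) % 251) as u8)
            .collect();
        let update = get_update();
        let state = 0x1234_5678_9abc_def0;
        for offset in 0..16 {
            for &len in &[0, 1, 127, 128, 129, 1024, 2000] {
                let slice = &bytes[offset..offset + len];
                assert_eq!(update(state, slice), table::update(state, slice));
            }
        }
    }
}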