1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
use fallback;

// We only use AVX when we can detect at runtime whether it's available, which
// requires std.
#[cfg(feature = "use_std")]
mod avx;
mod sse2;

// This macro employs a gcc-like "ifunc" trick whereby, upon first calling
// `memchr` (for example), CPU feature detection will be performed at runtime
// to determine the best implementation to use. After CPU feature detection
// is done, we replace `memchr`'s function pointer with the selection. Upon
// subsequent invocations, the CPU-specific routine is invoked directly, which
// skips the CPU feature detection and subsequent branch that's required.
//
// While this typically doesn't matter for rare occurrences or when used on
// larger haystacks, `memchr` can be called in tight loops where the overhead
// of this branch can actually add up *and is measurable*. This trick was
// necessary to bring this implementation up to glibc's speeds for the 'tiny'
// benchmarks, for example.
//
// At some point, I expect the Rust ecosystem will get a nice macro for doing
// exactly this, at which point, we can replace our hand-jammed version of it.
//
// N.B. The ifunc strategy does prevent function inlining of course, but on
// modern CPUs, you'll probably end up with the AVX2 implementation, which
// probably can't be inlined anyway---unless you've compiled your entire
// program with AVX2 enabled. However, even then, the various memchr
// implementations aren't exactly small, so inlining might not help anyway!
//
// Runtime-dispatching variant of `ifunc!`, compiled when the `use_std`
// feature is enabled. It expands to a block expression that calls through a
// cached function pointer and evaluates to that call's `Option<usize>`.
//
// Arguments:
//   $fnty     - type the cached raw pointer is transmuted to before calling.
//   $name     - routine name looked up in `avx`, `sse2` and `fallback`.
//   $haystack - identifier of the `&[u8]` passed as the last argument.
//   $needle   - one or more identifiers of the `u8` values passed first.
//
// NOTE(review): nothing here checks that `$fnty` matches the real signature
// of `avx::$name`, `sse2::$name` and `fallback::$name`; every call site must
// get that right by hand.
#[cfg(feature = "use_std")]
macro_rules! ifunc {
    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
        use std::mem;
        use std::sync::atomic::{AtomicPtr, Ordering};

        // Type-erased function pointer, so it can live in an `AtomicPtr`.
        type FnRaw = *mut ();

        // Cached entry point. It starts out as `detect`, so the very first
        // call goes through feature detection.
        static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);

        // Picks an implementation, caches it in `FN`, then completes the
        // current call with the routine it just picked.
        fn detect($($needle: u8),+, haystack: &[u8]) -> Option<usize> {
            // `cfg!` yields a compile-time boolean, so a build without the
            // corresponding cfg flag never takes that arm. The AVX routine is
            // additionally gated on the runtime `avx2` check.
            let fun =
                if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") {
                    avx::$name as FnRaw
                } else if cfg!(memchr_runtime_sse2) {
                    sse2::$name as FnRaw
                } else {
                    fallback::$name as FnRaw
                };
            // Relaxed is enough: `FN` only ever holds `detect` or one of the
            // three candidates above, so whichever value a racing thread
            // observes is callable. At worst, detection runs more than once.
            FN.store(fun as FnRaw, Ordering::Relaxed);
            // SAFETY (assumes the call-site contract; TODO confirm): `fun` was
            // created from a `$name` routine just above, and `$fnty` is
            // expected to be that routine's function pointer type. The AVX
            // routine is only chosen after the `avx2` runtime check passed.
            unsafe {
                mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, haystack)
            }
        }

        // SAFETY (assumes the call-site contract; TODO confirm): `FN` holds
        // either `detect` or a routine stored by `detect`, and `$fnty` is
        // expected to describe the signature shared by all of them.
        unsafe {
            let fun = FN.load(Ordering::Relaxed);
            mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, $haystack)
        }
    }}
}

// When std isn't available to provide runtime CPU feature detection, or if
// runtime CPU feature detection has been explicitly disabled, then just call
// our optimized SSE2 routine directly. SSE2 is available on all x86_64 targets,
// so no CPU feature detection is necessary.
// Static-dispatch variant of `ifunc!`, compiled when the `use_std` feature is
// disabled. It takes the same arguments as the runtime-dispatching variant so
// call sites do not change, but `$fnty` is unused here: the chosen routine is
// called directly instead of through a cached function pointer.
#[cfg(not(feature = "use_std"))]
macro_rules! ifunc {
    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
        // `cfg!` evaluates to a boolean literal at compile time, so the
        // choice between the two routines is fixed per build.
        if !cfg!(memchr_runtime_sse2) {
            fallback::$name($($needle),+, $haystack)
        } else {
            // SAFETY (TODO confirm against `sse2`): this arm is taken only
            // when `memchr_runtime_sse2` is set, which is presumably done only
            // for targets where SSE2 is guaranteed to be present.
            unsafe { sse2::$name($($needle),+, $haystack) }
        }
    }}
}

/// Searches `haystack` for the byte `n1` by forwarding to whichever `memchr`
/// routine `ifunc!` selects.
///
/// The exact match semantics (presumably the index of the first matching
/// byte, or `None` if there is none) are defined by the selected routine,
/// which lives outside this function.
#[inline(always)]
pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // Signature shared by every candidate routine `ifunc!` may dispatch to.
    type Search = fn(u8, &[u8]) -> Option<usize>;
    ifunc!(Search, memchr, haystack, n1)
}

/// Searches `haystack` for either of the bytes `n1` or `n2` by forwarding to
/// whichever `memchr2` routine `ifunc!` selects.
///
/// The exact match semantics (presumably the index of the first byte equal to
/// either needle, or `None` if there is none) are defined by the selected
/// routine, which lives outside this function.
#[inline(always)]
pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    // Signature shared by every candidate routine `ifunc!` may dispatch to.
    type Search2 = fn(u8, u8, &[u8]) -> Option<usize>;
    ifunc!(Search2, memchr2, haystack, n1, n2)
}

/// Searches `haystack` for any of the bytes `n1`, `n2` or `n3` by forwarding
/// to whichever `memchr3` routine `ifunc!` selects.
///
/// The exact match semantics (presumably the index of the first byte equal to
/// any needle, or `None` if there is none) are defined by the selected
/// routine, which lives outside this function.
#[inline(always)]
pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    // Signature shared by every candidate routine `ifunc!` may dispatch to.
    type Search3 = fn(u8, u8, u8, &[u8]) -> Option<usize>;
    ifunc!(Search3, memchr3, haystack, n1, n2, n3)
}

/// Searches `haystack` for the byte `n1` by forwarding to whichever `memrchr`
/// routine `ifunc!` selects.
///
/// The exact match semantics (presumably a reverse search yielding the index
/// of the last matching byte, or `None` if there is none) are defined by the
/// selected routine, which lives outside this function.
#[inline(always)]
pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // Signature shared by every candidate routine `ifunc!` may dispatch to.
    type ReverseSearch = fn(u8, &[u8]) -> Option<usize>;
    ifunc!(ReverseSearch, memrchr, haystack, n1)
}

/// Searches `haystack` for either of the bytes `n1` or `n2` by forwarding to
/// whichever `memrchr2` routine `ifunc!` selects.
///
/// The exact match semantics (presumably a reverse search yielding the index
/// of the last byte equal to either needle, or `None` if there is none) are
/// defined by the selected routine, which lives outside this function.
#[inline(always)]
pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    // Signature shared by every candidate routine `ifunc!` may dispatch to.
    type ReverseSearch2 = fn(u8, u8, &[u8]) -> Option<usize>;
    ifunc!(ReverseSearch2, memrchr2, haystack, n1, n2)
}

/// Searches `haystack` for any of the bytes `n1`, `n2` or `n3` by forwarding
/// to whichever `memrchr3` routine `ifunc!` selects.
///
/// The exact match semantics (presumably a reverse search yielding the index
/// of the last byte equal to any needle, or `None` if there is none) are
/// defined by the selected routine, which lives outside this function.
#[inline(always)]
pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    // Signature shared by every candidate routine `ifunc!` may dispatch to.
    type ReverseSearch3 = fn(u8, u8, u8, &[u8]) -> Option<usize>;
    ifunc!(ReverseSearch3, memrchr3, haystack, n1, n2, n3)
}