1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0.

mod charset;
pub mod collator;

use std::cmp::Ordering;
use std::hash::{Hash, Hasher};
use std::marker::PhantomData;
use std::ops::Deref;

use codec::prelude::*;
use num::Unsigned;

use crate::codec::Result;

/// Expands to a `match_template::match_template!` invocation in which the
/// placeholder `$t` is bound to the list of collations handled by this module
/// (`Binary => CollatorBinary`, `Utf8Mb4Bin => CollatorUtf8Mb4Bin`, ...), followed
/// by the caller-supplied tokens `$($tail)*`.
///
/// The expansion glob-imports `$crate::codec::collation::collator::*` so that the
/// `Collator*` type names used in the list resolve at the call site; the import is
/// marked `#[allow(unused_imports)]` because a given expansion may not use it.
///
/// NOTE(review): how `$t` is substituted inside `$($tail)*` is defined by the
/// `match_template` crate, not by this macro; consult that crate for the accepted
/// syntax of the tail.
#[macro_export]
macro_rules! match_template_collator {
     ($t:tt, $($tail:tt)*) => {{
         #[allow(unused_imports)]
         use $crate::codec::collation::collator::*;

         match_template::match_template! {
             $t = [
                Binary => CollatorBinary,
                Utf8Mb4Bin => CollatorUtf8Mb4Bin,
                Utf8Mb4BinNoPadding => CollatorUtf8Mb4BinNoPadding,
                Utf8Mb4GeneralCi => CollatorUtf8Mb4GeneralCi,
                Utf8Mb4UnicodeCi => CollatorUtf8Mb4UnicodeCi,
                Latin1Bin => CollatorLatin1Bin,
            ],
            $($tail)*
         }
     }}
}

/// A character set: the byte-level counterpart of a [`Collator`].
///
/// All functions are associated functions (no `self`), so a charset is used purely
/// as a type-level tag.
pub trait Charset {
    /// The character type produced by [`Charset::decode_one`]. It must be `Copy`
    /// and convertible into a `u32`.
    type Char: Copy + Into<u32>;

    /// Checks `bstr` against this charset.
    ///
    /// NOTE(review): presumably returns `Err` when `bstr` is not well-formed in
    /// this charset; the exact contract lives in the implementors, confirm there.
    fn validate(bstr: &[u8]) -> Result<()>;

    /// Attempts to decode one character from `data`.
    ///
    /// NOTE(review): the `usize` of the returned pair is presumably the number of
    /// bytes the character occupies, and `None` presumably means that nothing
    /// could be decoded; confirm against the implementors.
    fn decode_one(data: &[u8]) -> Option<(Self::Char, usize)>;
}

/// A collation: the rules used to compare, hash and build sort keys for byte
/// strings of a particular [`Charset`].
///
/// Every item is an associated function or constant, so implementors are used as
/// type-level tags only.
pub trait Collator: 'static + Send + Sync + std::fmt::Debug {
    /// The character set whose bytes this collation operates on.
    type Charset: Charset;
    /// The unsigned integer type used for per-character weights.
    type Weight: Unsigned;

    /// Whether this collation is case insensitive.
    const IS_CASE_INSENSITIVE: bool;

    /// Gives the weight of one character. Characters of equal weight are treated
    /// as the same character by this collation.
    /// See more on <http://www.unicode.org/reports/tr10/#Weight_Level_Defn>.
    fn char_weight(char: <Self::Charset as Charset>::Char) -> Self::Weight;

    /// Emits the SortKey of `bstr` into `writer`.
    fn write_sort_key<W: BufferWriter>(writer: &mut W, bstr: &[u8]) -> Result<usize>;

    /// Builds the SortKey of `bstr` in a freshly allocated byte vector.
    ///
    /// This is a convenience wrapper around [`Collator::write_sort_key`]; any error
    /// it reports is returned unchanged.
    fn sort_key(bstr: &[u8]) -> Result<Vec<u8>> {
        let mut key: Vec<u8> = Vec::new();
        // The count returned by `write_sort_key` is not needed: the vector already
        // knows its own length.
        Self::write_sort_key(&mut key, bstr).map(|_written| key)
    }

    /// Orders `a` against `b` according to their SortKeys.
    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering>;

    /// Feeds `bstr` into `state` according to its SortKey, without materializing
    /// the key.
    ///
    /// WARN: `sort_hash(str) != hash(sort_key(str))`.
    fn sort_hash<H: Hasher>(state: &mut H, bstr: &[u8]) -> Result<()>;
}

/// A byte container `T` tagged with the collator `C` that should be used to
/// compare and hash it.
///
/// `C` is carried only at the type level through `PhantomData`, and the struct is
/// `#[repr(transparent)]`, so it is laid out like its single non-zero-sized field
/// `inner`. The by-reference constructors in this file rely on that layout to
/// reinterpret a `&T` as a `&SortKey<T, C>` without copying.
#[derive(Debug)]
#[repr(transparent)]
pub struct SortKey<T, C: Collator>
where
    T: AsRef<[u8]>,
{
    // The wrapped bytes (anything that can be viewed as `&[u8]`).
    inner: T,
    // Zero-sized marker tying the key to its collator type.
    _phantom: PhantomData<C>,
}

impl<T, C: Collator> SortKey<T, C>
where
    T: AsRef<[u8]>,
{
    /// Wraps `inner` after checking its bytes with `C::Charset::validate`.
    ///
    /// # Errors
    ///
    /// Returns the error reported by `C::Charset::validate`; `inner` is dropped in
    /// that case.
    #[inline]
    pub fn new(inner: T) -> Result<Self> {
        C::Charset::validate(inner.as_ref())?;
        Ok(Self::new_unchecked(inner))
    }

    /// Creates a SortKey from unchecked bytes.
    ///
    /// # Panics
    ///
    /// This function itself never panics, but the `Ord`, `Hash` and `PartialEq`
    /// implementations unwrap the collator's result and therefore assume that the
    /// bytes are valid for the collator. Violating that assumption makes those
    /// implementations panic when the collator reports an error.
    #[inline]
    pub fn new_unchecked(inner: T) -> Self {
        Self {
            inner,
            _phantom: PhantomData,
        }
    }

    /// Validates the bytes behind `inner` and reinterprets the reference as a
    /// reference to a SortKey, without copying.
    ///
    /// # Errors
    ///
    /// Returns the error reported by `C::Charset::validate`.
    #[inline]
    pub fn new_ref(inner: &T) -> Result<&Self> {
        C::Charset::validate(inner.as_ref())?;
        // SAFETY: `SortKey<T, C>` is `#[repr(transparent)]` over its only
        // non-zero-sized field `inner: T`, so `T` and `Self` share size, alignment
        // and layout. The resulting reference borrows from `inner` and is tied to
        // its lifetime by the signature.
        Ok(unsafe { &*(inner as *const T as *const Self) })
    }

    /// Validates the bytes of a borrowed optional value (when present) and
    /// reinterprets the reference as a reference to an optional SortKey.
    ///
    /// `None` is passed through without any validation.
    ///
    /// # Errors
    ///
    /// Returns the error reported by `C::Charset::validate`.
    #[inline]
    pub fn map_option(inner: &Option<T>) -> Result<&Option<Self>> {
        if let Some(bytes) = inner {
            C::Charset::validate(bytes.as_ref())?;
        }
        // SAFETY: `Self` is `#[repr(transparent)]` over `T`, so the two `Option`s
        // wrap identically laid-out payloads.
        // NOTE(review): this relies on `Option<T>` and `Option<SortKey<T, C>>`
        // having the same layout, which holds in practice for a transparent
        // wrapper but is not spelled out as a guarantee for arbitrary `T`.
        Ok(unsafe { &*(inner as *const Option<T> as *const Option<Self>) })
    }

    /// Owned counterpart of [`SortKey::map_option`]: wraps the value of a `Some`
    /// via [`SortKey::new`] and passes `None` through.
    ///
    /// The bytes are validated exactly once, by `new`.
    ///
    /// # Errors
    ///
    /// Returns the error reported by `C::Charset::validate`.
    #[inline]
    pub fn map_option_owned(inner: Option<T>) -> Result<Option<Self>> {
        match inner {
            Some(inner) => Self::new(inner).map(Some),
            None => Ok(None),
        }
    }

    /// Consumes the key and returns the wrapped value.
    #[inline]
    pub fn into_inner(self) -> T {
        self.inner
    }
}

impl<T, C: Collator> Hash for SortKey<T, C>
where
    T: AsRef<[u8]>,
{
    /// Feeds the key into `state` through `C::sort_hash`, so that the hash follows
    /// the collation rather than the raw bytes.
    ///
    /// # Panics
    ///
    /// Panics if `C::sort_hash` returns an error.
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        let bytes: &[u8] = self.inner.as_ref();
        C::sort_hash(state, bytes).unwrap()
    }
}

impl<T, C: Collator> PartialEq for SortKey<T, C>
where
    T: AsRef<[u8]>,
{
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        C::sort_compare(&self.inner.as_ref(), &other.inner.as_ref()).unwrap()
            == std::cmp::Ordering::Equal
    }
}

// Marker impl declaring the `PartialEq` above to be a full equivalence relation;
// `Ord` requires it as a supertrait.
// NOTE(review): this is only as sound as `C::sort_compare`, which is assumed to be
// reflexive, symmetric and transitive for valid input.
impl<T, C: Collator> Eq for SortKey<T, C> where T: AsRef<[u8]> {}

impl<T, C: Collator> PartialOrd for SortKey<T, C>
where
    T: AsRef<[u8]>,
{
    /// Orders two keys with `C::sort_compare`.
    ///
    /// Unlike `Ord::cmp`, a comparison error does not panic here: it is discarded
    /// and reported as `None`, so the comparison operators evaluate to `false`.
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let lhs: &[u8] = self.inner.as_ref();
        let rhs: &[u8] = other.inner.as_ref();
        match C::sort_compare(lhs, rhs) {
            Ok(ordering) => Some(ordering),
            Err(_) => None,
        }
    }
}

impl<T, C: Collator> Ord for SortKey<T, C>
where
    T: AsRef<[u8]>,
{
    /// Total order of two keys as decided by `C::sort_compare`.
    ///
    /// # Panics
    ///
    /// Panics if `C::sort_compare` returns an error.
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs: &[u8] = self.inner.as_ref();
        let rhs: &[u8] = other.inner.as_ref();
        C::sort_compare(lhs, rhs).unwrap()
    }
}

impl<T, C: Collator> Clone for SortKey<T, C>
where
    T: AsRef<[u8]> + Clone,
{
    /// Clones the wrapped value. Implemented by hand so that only `T`, and not
    /// the marker type `C`, has to be `Clone`.
    #[inline]
    fn clone(&self) -> Self {
        let Self { inner, .. } = self;
        Self {
            inner: T::clone(inner),
            _phantom: PhantomData,
        }
    }
}

impl<T, C: Collator> Deref for SortKey<T, C>
where
    T: AsRef<[u8]>,
{
    type Target = T;

    /// Gives read-only access to the wrapped value, so a key can be used
    /// wherever a `&T` is expected.
    #[inline]
    fn deref(&self) -> &Self::Target {
        let Self { inner, .. } = self;
        inner
    }
}