1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
//! Types supporting the UTF-8 parser

/// Action to take when receiving a byte
///
/// The `SetByteN` variants identify a byte by its position counted from the end of the encoded
/// sequence (`1` is the final byte). The `Top` variants and `SetByte4` are produced for the
/// leading byte of a two, three and four byte sequence respectively.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; sequence is invalid
    InvalidSequence = 0,
    /// Received valid 7-bit ASCII byte which can be directly emitted.
    EmitByte = 1,
    /// Set the bottom (final) continuation byte; this completes a multi-byte sequence.
    SetByte1 = 2,
    /// Set the 2nd-from-last byte when it is a continuation byte
    SetByte2 = 3,
    /// Set the 2nd-from-last byte when it is the leading byte of a two byte sequence
    SetByte2Top = 4,
    /// Set the 3rd-from-last byte when it is a continuation byte
    SetByte3 = 5,
    /// Set the 3rd-from-last byte when it is the leading byte of a three byte sequence
    SetByte3Top = 6,
    /// Set the top (leading) byte of a four byte sequence.
    SetByte4 = 7,
}

/// States the parser can be in.
///
/// There is a state for each initial input of the 3 and 4 byte sequences since
/// the following bytes are subject to different conditions than a tail byte.
// The variant names embed the leading byte in lowercase hex (`_e0`, `_f4`, ...), which is why the
// camel-case lint is silenced for this enum.
#[allow(non_camel_case_types)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum State {
    /// Ground state; expect anything
    Ground = 0,
    /// 3 tail bytes remain; each must be in `0x80..=0xbf`
    Tail3 = 1,
    /// 2 tail bytes remain; each must be in `0x80..=0xbf`
    Tail2 = 2,
    /// 1 tail byte remains; it must be in `0x80..=0xbf`
    Tail1 = 3,
    /// UTF8-3 starting with E0; the next byte must be in `0xa0..=0xbf`
    U3_2_e0 = 4,
    /// UTF8-3 starting with ED; the next byte must be in `0x80..=0x9f`
    U3_2_ed = 5,
    /// UTF8-4 starting with F0; the next byte must be in `0x90..=0xbf`
    Utf8_4_3_f0 = 6,
    /// UTF8-4 starting with F4; the next byte must be in `0x80..=0x8f`
    Utf8_4_3_f4 = 7,
}

impl Default for State {
    /// A fresh parser starts in the ground state.
    fn default() -> Self {
        State::Ground
    }
}

impl State {
    /// Advance the parser state.
    ///
    /// This takes the current state and input byte into consideration, to determine the next state
    /// and any action that should be taken.
    #[inline]
    pub fn advance(&self, byte: u8) -> (State, Action) {
        match self {
            State::Ground => match byte {
                0x00..=0x7f => (State::Ground, Action::EmitByte),
                0xc2..=0xdf => (State::Tail1, Action::SetByte2Top),
                0xe0 => (State::U3_2_e0, Action::SetByte3Top),
                0xe1..=0xec => (State::Tail2, Action::SetByte3Top),
                0xed => (State::U3_2_ed, Action::SetByte3Top),
                0xee..=0xef => (State::Tail2, Action::SetByte3Top),
                0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
                0xf1..=0xf3 => (State::Tail3, Action::SetByte4),
                0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::U3_2_e0 => match byte {
                0xa0..=0xbf => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::U3_2_ed => match byte {
                0x80..=0x9f => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Utf8_4_3_f0 => match byte {
                0x90..=0xbf => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Utf8_4_3_f4 => match byte {
                0x80..=0x8f => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail3 => match byte {
                0x80..=0xbf => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail2 => match byte {
                0x80..=0xbf => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail1 => match byte {
                0x80..=0xbf => (State::Ground, Action::SetByte1),
                _ => (State::Ground, Action::InvalidSequence),
            },
        }
    }
}