Skip to content

Faster and more accurate parsing #2

@tisonkun

Description

@tisonkun

So far, the parse_size function is derived from the parse-size crate.

bsize/bsize/src/parse.rs

Lines 68 to 198 in ab464e6

/// Parses a human-readable size such as `"42"`, `"1.5 KiB"` or `"10_000 MB"` into a number
/// of bytes.
///
/// The input is processed from the back towards the front:
///
/// 1. Leading and trailing ASCII spaces are removed.
/// 2. One optional trailing `b`/`B` is removed.
/// 3. An optional unit prefix is decoded. With a trailing `i`/`I` the prefix is binary
///    (`Ki` = 2^10 … `Ei` = 2^60) and one of `k m g t p e` (any case) is mandatory; without
///    it the prefix is decimal (`k` = 10^3 … `e` = 10^18) and optional.
/// 4. Spaces between the number and the unit are removed.
/// 5. The remainder must be decimal digits with at most one `.` that follows at least one
///    digit; `_` is accepted anywhere and ignored.
///
/// The numeric part is kept as `mantissa * 10^exponent`. Digits that no longer fit into the
/// `u64` mantissa are dropped; the first dropped digit decides whether the mantissa is
/// rounded up. The final value is rounded half-up to the nearest integer.
///
/// # Errors
///
/// * `ParseError::Empty` - no digit was found.
/// * `ParseError::Malformed` - unexpected byte, or an `i` suffix without a valid prefix.
/// * `ParseError::Overflow` - the resulting byte count does not fit into a `u64`.
fn parse_size(mut src: &[u8]) -> Result<u64, ParseError> {
    /// Progress of the numeric-part scanner.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum ParseState {
        /// No digit seen yet.
        Empty,
        /// Reading integer digits that still fit into the mantissa.
        Integer,
        /// Integer digits no longer fit; each further digit only bumps the exponent.
        IntegerOverflow,
        /// Reading fraction digits that still fit into the mantissa.
        Fraction,
        /// Fraction digits no longer fit; further digits are insignificant and ignored.
        FractionOverflow,
    }

    /// Returns `mantissa * 10 + digit`, or `None` if that does not fit into a `u64`.
    fn append_digit(mantissa: u64, digit_char: u8) -> Option<u64> {
        mantissa
            .checked_mul(10)
            .and_then(|v| v.checked_add(u64::from(digit_char - b'0')))
    }

    // trim starting and trailing spaces
    while let [b' ', rest @ ..] = src {
        src = rest;
    }
    while let [init @ .., b' '] = src {
        src = init;
    }

    // trim trailing 'b' or 'B'
    if let [init @ .., b'b' | b'B'] = src {
        src = init;
    }

    let mut multiply = 1u64;
    if let [init @ .., b'i' | b'I'] = src {
        // Binary unit: the `i` must be preceded by a prefix letter; a bare [iI][bB] is a
        // malformed suffix.
        let [init @ .., prefix] = init else {
            return Err(ParseError::Malformed);
        };
        multiply = match prefix {
            b'k' | b'K' => 1 << 10,
            b'm' | b'M' => 1 << 20,
            b'g' | b'G' => 1 << 30,
            b't' | b'T' => 1 << 40,
            b'p' | b'P' => 1 << 50,
            b'e' | b'E' => 1 << 60,
            _ => return Err(ParseError::Malformed),
        };
        src = init;
    } else if let [init @ .., prefix] = src {
        // Decimal unit: optional, so an unknown last byte is left for the number scanner.
        let decimal: Option<u64> = match prefix {
            b'k' | b'K' => Some(1_000),
            b'm' | b'M' => Some(1_000_000),
            b'g' | b'G' => Some(1_000_000_000),
            b't' | b'T' => Some(1_000_000_000_000),
            b'p' | b'P' => Some(1_000_000_000_000_000),
            b'e' | b'E' => Some(1_000_000_000_000_000_000),
            _ => None,
        };
        if let Some(factor) = decimal {
            multiply = factor;
            src = init;
        }
    }

    // trim spaces between numeric part and unit part
    while let [init @ .., b' '] = src {
        src = init;
    }

    let mut mantissa = 0u64;
    let mut exponent = 0i32;
    // Set when the first dropped digit is >= 5. The increment is applied later in `u128`
    // because `mantissa` may already be `u64::MAX` at that point.
    let mut round_up = false;
    let mut state = ParseState::Empty;
    for &byte in src {
        match (state, byte) {
            (ParseState::Empty | ParseState::Integer, b'0'..=b'9') => {
                if let Some(next) = append_digit(mantissa, byte) {
                    mantissa = next;
                    state = ParseState::Integer;
                } else {
                    round_up = byte >= b'5';
                    state = ParseState::IntegerOverflow;
                    exponent += 1;
                }
            }
            (ParseState::IntegerOverflow, b'0'..=b'9') => exponent += 1,
            (ParseState::Fraction, b'0'..=b'9') => {
                if let Some(next) = append_digit(mantissa, byte) {
                    mantissa = next;
                    exponent -= 1;
                } else {
                    round_up = byte >= b'5';
                    state = ParseState::FractionOverflow;
                }
            }
            // Fraction digits beyond the mantissa's precision do not change the result.
            (ParseState::FractionOverflow, b'0'..=b'9') => {}
            (_, b'_') => {}
            (ParseState::Integer, b'.') => state = ParseState::Fraction,
            (ParseState::IntegerOverflow, b'.') => state = ParseState::FractionOverflow,
            _ => return Err(ParseError::Malformed),
        }
    }
    if matches!(state, ParseState::Empty) {
        return Err(ParseError::Empty);
    }

    // At most 2^64, so the products below cannot overflow a `u128`.
    let mantissa = u128::from(mantissa) + u128::from(round_up);
    let abs_exponent = exponent.unsigned_abs();
    if exponent >= 0 {
        let power = 10_u64
            .checked_pow(abs_exponent)
            .ok_or(ParseError::Overflow)?;
        let scale = multiply.checked_mul(power).ok_or(ParseError::Overflow)?;
        // mantissa <= 2^64 and scale < 2^64, hence the product is below 2^128.
        u64::try_from(mantissa * u128::from(scale)).map_err(|_| ParseError::Overflow)
    } else if exponent >= -38 {
        let power = 10_u128.pow(abs_exponent);
        // Adding power / 2 before dividing rounds half-up.
        let result = (mantissa * u128::from(multiply) + power / 2) / power;
        u64::try_from(result).map_err(|_| ParseError::Overflow)
    } else {
        // (2^128) * 1e-39 < 1, always, and thus saturate to 0.
        Ok(0)
    }
}

It works well for now, but there is always room to improve its performance and accuracy.

I am filing this tracking issue to collect work on that topic.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions