Skip to content
4 changes: 4 additions & 0 deletions src/patch/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ pub(crate) enum ParsePatchErrorKind {

/// Orphaned hunk header found after trailing content.
OrphanedHunkHeader,

/// Filename contains invalid UTF-8 when parsing as text.
InvalidUtf8Path,
}

impl fmt::Display for ParsePatchErrorKind {
Expand All @@ -136,6 +139,7 @@ impl fmt::Display for ParsePatchErrorKind {
Self::UnexpectedHunkLine => "unexpected line in hunk body",
Self::MissingNewline => "missing newline",
Self::OrphanedHunkHeader => "orphaned hunk header after trailing content",
Self::InvalidUtf8Path => "filename is not valid UTF-8",
};
write!(f, "{msg}")
}
Expand Down
130 changes: 76 additions & 54 deletions src/patch/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,43 @@ use std::borrow::Cow;

type Result<T, E = ParsePatchError> = std::result::Result<T, E>;

/// Options that control parsing behavior.
///
/// Defaults match the [`parse`]/[`parse_bytes`] behavior.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct ParseOpts {
    /// When `true`, lines before the first `---`/`+++`/`@@` are skipped.
    skip_preamble: bool,
    /// When `true`, an `@@ ` hunk header found after the parsed hunks
    /// is reported as an error.
    reject_orphaned_hunks: bool,
}

impl Default for ParseOpts {
    /// Preamble skipping enabled, orphaned hunk headers tolerated.
    fn default() -> Self {
        Self {
            skip_preamble: true,
            reject_orphaned_hunks: false,
        }
    }
}

impl ParseOpts {
    /// Don't skip preamble lines before `---`/`+++`/`@@`.
    ///
    /// Useful when the caller has already positioned the input
    /// at the start of the patch content.
    #[allow(dead_code)] // will be used by patch_set parser
    #[must_use]
    pub(crate) fn no_skip_preamble(self) -> Self {
        Self {
            skip_preamble: false,
            ..self
        }
    }

    /// Reject orphaned `@@ ` hunk headers after parsed hunks,
    /// matching `git apply` behavior.
    #[must_use]
    pub(crate) fn reject_orphaned_hunks(self) -> Self {
        Self {
            reject_orphaned_hunks: true,
            ..self
        }
    }
}

struct Parser<'a, T: Text + ?Sized> {
lines: std::iter::Peekable<LineIter<'a, T>>,
offset: usize,
Expand Down Expand Up @@ -53,78 +90,66 @@ impl<'a, T: Text + ?Sized> Parser<'a, T> {
}
}

// TODO: make a better API for lib consumers
//
// Too many different variants of `parse*` functions here.
// And that also propagates to `Patch::from_{str,bytes}{,_strict}`.

/// Parses a single patch from UTF-8 text using the default [`ParseOpts`].
///
/// The consumed-byte count reported by [`parse_one`] is discarded; only
/// the parse result is returned.
pub fn parse(input: &str) -> Result<Patch<'_, str>> {
    // `parse_one` takes the options explicitly; the older one-argument
    // call that used to precede this line was a stale leftover.
    let (result, _consumed) = parse_one(input, ParseOpts::default());
    result
}

/// Parses a single patch from UTF-8 text, additionally rejecting
/// orphaned hunk headers (see [`ParseOpts::reject_orphaned_hunks`]).
pub fn parse_strict(input: &str) -> Result<Patch<'_, str>> {
    let strict = ParseOpts::default().reject_orphaned_hunks();
    // Only the parse outcome matters here; the consumed length is dropped.
    parse_one(input, strict).0
}

/// Parses a single patch from raw bytes using the default [`ParseOpts`].
pub fn parse_bytes(input: &[u8]) -> Result<Patch<'_, [u8]>> {
    let defaults = ParseOpts::default();
    // Only the parse outcome matters here; the consumed length is dropped.
    parse_one(input, defaults).0
}

/// Parses a single patch from raw bytes, additionally rejecting
/// orphaned hunk headers (see [`ParseOpts::reject_orphaned_hunks`]).
pub fn parse_bytes_strict(input: &[u8]) -> Result<Patch<'_, [u8]>> {
    let strict = ParseOpts::default().reject_orphaned_hunks();
    // Only the parse outcome matters here; the consumed length is dropped.
    parse_one(input, strict).0
}

/// Parses one patch from input.
///
/// Always returns consumed bytes alongside the result
/// so callers can advance past the parsed or partially parsed content.
pub(crate) fn parse_one(input: &str) -> (Result<Patch<'_, str>>, usize) {
pub(crate) fn parse_one<T: Text + ?Sized>(
input: &T,
opts: ParseOpts,
) -> (Result<Patch<'_, T>>, usize) {
let mut parser = Parser::new(input);

let header = match patch_header(&mut parser) {
let header = match patch_header(&mut parser, &opts) {
Ok(h) => h,
Err(e) => return (Err(e), parser.offset()),
};
let hunks = match hunks(&mut parser) {
Ok(h) => h,
Err(e) => return (Err(e), parser.offset()),
};

let patch = Patch::new(
header.0.map(convert_cow_to_str),
header.1.map(convert_cow_to_str),
hunks,
);
(Ok(patch), parser.offset())
}

pub fn parse_strict(input: &str) -> Result<Patch<'_, str>> {
let mut parser = Parser::new(input);
let header = patch_header(&mut parser)?;
let hunks = hunks(&mut parser)?;
reject_orphaned_hunk_headers(&mut parser)?;

Ok(Patch::new(
header.0.map(convert_cow_to_str),
header.1.map(convert_cow_to_str),
hunks,
))
}

pub fn parse_bytes(input: &[u8]) -> Result<Patch<'_, [u8]>> {
let mut parser = Parser::new(input);
let header = patch_header(&mut parser)?;
let hunks = hunks(&mut parser)?;

Ok(Patch::new(header.0, header.1, hunks))
}

pub fn parse_bytes_strict(input: &[u8]) -> Result<Patch<'_, [u8]>> {
let mut parser = Parser::new(input);
let header = patch_header(&mut parser)?;
let hunks = hunks(&mut parser)?;
reject_orphaned_hunk_headers(&mut parser)?;

Ok(Patch::new(header.0, header.1, hunks))
}

// This is only used when the type originated as a utf8 string
fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> {
match cow {
Cow::Borrowed(b) => std::str::from_utf8(b).unwrap().into(),
Cow::Owned(o) => String::from_utf8(o).unwrap().into(),
if opts.reject_orphaned_hunks {
if let Err(e) = reject_orphaned_hunk_headers(&mut parser) {
return (Err(e), parser.offset());
}
}

(Ok(Patch::new(header.0, header.1, hunks)), parser.offset())
}

#[allow(clippy::type_complexity)]
fn patch_header<'a, T: Text + ToOwned + ?Sized>(
fn patch_header<'a, T: Text + ?Sized>(
parser: &mut Parser<'a, T>,
) -> Result<(Option<Cow<'a, [u8]>>, Option<Cow<'a, [u8]>>)> {
skip_header_preamble(parser)?;
opts: &ParseOpts,
) -> Result<(Option<Cow<'a, T>>, Option<Cow<'a, T>>)> {
if opts.skip_preamble {
skip_header_preamble(parser)?;
}

let mut filename1 = None;
let mut filename2 = None;
Expand Down Expand Up @@ -161,10 +186,7 @@ fn skip_header_preamble<T: Text + ?Sized>(parser: &mut Parser<'_, T>) -> Result<
Ok(())
}

fn parse_filename<'a, T: Text + ToOwned + ?Sized>(
prefix: &str,
line: &'a T,
) -> Result<Cow<'a, [u8]>> {
fn parse_filename<'a, T: Text + ?Sized>(prefix: &str, line: &'a T) -> Result<Cow<'a, T>> {
let line = line
.strip_prefix(prefix)
.ok_or(ParsePatchErrorKind::InvalidFilename)?;
Expand Down
18 changes: 18 additions & 0 deletions src/patch/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,24 @@ fn plain_filename_roundtrip() {
assert_eq!(p.modified(), p2.modified());
}

// Octal escape \377 decodes to 0xFF, which is not valid UTF-8.
// When parsing into `Patch<'_, str>`, this returns a parse error
// instead of panicking.
#[test]
fn non_utf8_escaped_filename_returns_error_on_str_parse() {
    // Raw strings do not treat `\` + newline as a line continuation, so
    // the literal starts directly at the `---` line instead of carrying a
    // stray `\` line in front of the header.
    let s = r#"--- "a/foo\377"
+++ "b/foo\377"
@@ -1 +1 @@
-x
+y
"#;
    assert_eq!(
        parse(s).unwrap_err().kind,
        ParsePatchErrorKind::InvalidUtf8Path,
    );
}

mod error_display {
use crate::patch::error::ParsePatchErrorKind;
use crate::Patch;
Expand Down
3 changes: 2 additions & 1 deletion src/patch_set/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ impl<'a> PatchSet<'a> {

let patch_input = &remaining[patch_start..];

let (result, consumed) = parse_one(patch_input);
let opts = crate::patch::parse::ParseOpts::default();
let (result, consumed) = parse_one(patch_input, opts);
// Always advance so the iterator makes progress even on error.
let abs_patch_start = self.offset + patch_start;
self.offset += patch_start + consumed;
Expand Down
38 changes: 27 additions & 11 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ impl<'a, T: Text + ?Sized> Iterator for LineIter<'a, T> {

/// A helper trait for processing text like `str` and `[u8]`
/// Useful for abstracting over those types for parsing as well as breaking input into lines
pub trait Text: Eq + Hash {
pub trait Text: Eq + Hash + ToOwned {
fn is_empty(&self) -> bool;
fn len(&self) -> usize;
fn starts_with(&self, prefix: &str) -> bool;
Expand All @@ -148,6 +148,12 @@ pub trait Text: Eq + Hash {
#[allow(unused)]
fn lines(&self) -> LineIter<'_, Self>;

/// Converts raw bytes into `Self::Owned`.
///
/// Returns `None` if the bytes are not valid for this type
/// (e.g. non-UTF-8 bytes for `str`).
fn owned_from_bytes(bytes: Vec<u8>) -> Option<Self::Owned>;

fn parse<T: std::str::FromStr>(&self) -> Option<T> {
self.as_str().and_then(|s| s.parse().ok())
}
Expand All @@ -158,6 +164,10 @@ impl Text for str {
self.is_empty()
}

/// Builds an owned `String` from raw bytes, yielding `None` when the
/// bytes are not valid UTF-8.
fn owned_from_bytes(bytes: Vec<u8>) -> Option<String> {
    match String::from_utf8(bytes) {
        Ok(text) => Some(text),
        Err(_) => None,
    }
}

// Delegates to the inherent `str::len`; naming it explicitly makes it
// obvious that this is not a recursive call into the trait method.
fn len(&self) -> usize {
    str::len(self)
}
Expand Down Expand Up @@ -209,6 +219,10 @@ impl Text for [u8] {
self.is_empty()
}

/// Wraps the bytes unchanged: this implementation performs no
/// validation, so it always returns `Some`.
fn owned_from_bytes(bytes: Vec<u8>) -> Option<Vec<u8>> {
Some(bytes)
}

// Delegates to the inherent slice `len`; naming it explicitly makes it
// obvious that this is not a recursive call into the trait method.
fn len(&self) -> usize {
    <[u8]>::len(self)
}
Expand Down Expand Up @@ -292,27 +306,29 @@ fn find_byte(haystack: &[u8], byte: u8) -> Option<usize> {
///
/// See [`byte_needs_quoting`] for the set of characters that
/// require quoting.
pub(crate) fn escaped_filename<T: Text + ToOwned + ?Sized>(
pub(crate) fn escaped_filename<T: Text + ?Sized>(
filename: &T,
) -> Result<Cow<'_, [u8]>, ParsePatchError> {
) -> Result<Cow<'_, T>, ParsePatchError> {
if let Some(inner) = filename
.strip_prefix("\"")
.and_then(|s| s.strip_suffix("\""))
{
decode_escaped(inner)
match decode_escaped(inner.as_bytes())? {
None => Ok(Cow::Borrowed(inner)),
Some(bytes) => T::owned_from_bytes(bytes)
.map(Cow::Owned)
.ok_or_else(|| ParsePatchErrorKind::InvalidUtf8Path.into()),
}
} else {
let bytes = filename.as_bytes();
if bytes.iter().any(|b| byte_needs_quoting(*b)) {
return Err(ParsePatchErrorKind::InvalidCharInUnquotedFilename.into());
}
Ok(bytes.into())
Ok(Cow::Borrowed(filename))
}
}

fn decode_escaped<T: Text + ToOwned + ?Sized>(
escaped: &T,
) -> Result<Cow<'_, [u8]>, ParsePatchError> {
let bytes = escaped.as_bytes();
fn decode_escaped(bytes: &[u8]) -> Result<Option<Vec<u8>>, ParsePatchError> {
let mut result = Vec::new();
let mut i = 0;
let mut last_copy = 0;
Expand Down Expand Up @@ -365,8 +381,8 @@ fn decode_escaped<T: Text + ToOwned + ?Sized>(

if needs_allocation {
result.extend_from_slice(&bytes[last_copy..]);
Ok(Cow::Owned(result))
Ok(Some(result))
} else {
Ok(Cow::Borrowed(bytes))
Ok(None)
}
}