From 9bbb873d342ad9dd2e3e673c9382a72cee55c5fd Mon Sep 17 00:00:00 2001 From: Matthew Gregan Date: Thu, 23 Apr 2026 16:46:25 +1200 Subject: [PATCH] Fix esds DSI parsing for truncated and duplicate DecSpecificInfo. Record AOT, sample rate and channel count before the GASpecificConfig bit reads that can hit EOF on a truncated DSI. find_descriptor swallows BitReaderError in non-strict mode, so storing the fields afterwards left a playable-looking AAC track with no usable config and silent playback for HE-AAC streams with explicit SBR signalling. Also keep the first DecSpecificInfo and ignore subsequent ones in non-strict mode, matching Chromium and ffmpeg. The previous code concatenated DSI bytes and overwrote the parsed audio fields from the second DSI, leaving the ES_Descriptor inconsistent when the two disagreed. Strict mode still rejects duplicates with EsdsDecSpecificInfoTagQuantity. --- mp4parse/src/lib.rs | 126 +++++++++++++++----------- mp4parse/src/tests.rs | 199 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 273 insertions(+), 52 deletions(-) diff --git a/mp4parse/src/lib.rs b/mp4parse/src/lib.rs index 955e902f..0f92a266 100644 --- a/mp4parse/src/lib.rs +++ b/mp4parse/src/lib.rs @@ -5151,6 +5151,20 @@ fn read_ds_descriptor( esds: &mut ES_Descriptor, strictness: ParseStrictness, ) -> Result<()> { + // Keep the first DecSpecificInfo and ignore subsequent entries in + // non-strict mode. Concatenating raw bytes and/or overwriting the parsed + // fields from a second DSI leaves the ES_Descriptor inconsistent (e.g. + // sample-rate/channel-count mismatched with the stored DSI bytes) and + // produces silent-audio playback. A single well-formed DSI is always + // sufficient. + if !esds.decoder_specific_data.is_empty() { + fail_with_status_if( + strictness == ParseStrictness::Strict, + Status::EsdsDecSpecificInfoTagQuantity, + )?; + return Ok(()); + } + #[cfg(feature = "mp4v")] // Check if we are in a Visual esda Box. if esds.video_codec != CodecType::Unknown { @@ -5253,6 +5267,32 @@ fn read_ds_descriptor( _ => 96000, }; + // Map channel_configuration to channel count. Zero indicates PCE + // (program_config_element) signalling, which requires parsing the + // trailing GASpecificConfig fields below and so can't be resolved + // yet. + let channel_count_from_config = match channel_configuration { + 0 => None, + 1..=7 => Some(channel_configuration), + 11 => Some(7), // 6.1, AAC Amendment 4 (2013) + 12 | 14 => Some(8), // 7.1 (a/d), ITU BS.2159 + _ => return Err(Error::Unsupported("invalid channel configuration")), + }; + + // Record the essential audio fields now, before any further bit + // reads that may hit EOF on a truncated DSI. find_descriptor + // swallows BitReaderError in non-strict mode; without recording + // here we'd return a playable-looking track with no usable config + // (see band-orion.de, where HE-AAC with explicit SBR signalling + // omits the GASpecificConfig tail). + esds.audio_object_type = Some(audio_object_type); + esds.extended_audio_object_type = extended_audio_object_type; + esds.audio_sample_rate = Some(sample_frequency_value); + if let Some(cc) = channel_count_from_config { + esds.audio_channel_count = Some(cc); + esds.decoder_specific_data.extend_from_slice(data)?; + } + bit_reader.skip(1)?; // frameLengthFlag let depend_on_core_order: u8 = ReadInto::read(bit_reader, 1)?; if depend_on_core_order > 0 { @@ -5260,63 +5300,45 @@ fn read_ds_descriptor( } bit_reader.skip(1)?; // extensionFlag - let channel_counts = match channel_configuration { - 0 => { - debug!("Parsing program_config_element for channel counts"); - - bit_reader.skip(4)?; // element_instance_tag - bit_reader.skip(2)?; // object_type - bit_reader.skip(4)?; // sampling_frequency_index - let num_front_channel: u8 = ReadInto::read(bit_reader, 4)?; - let num_side_channel: u8 = ReadInto::read(bit_reader, 4)?; - let num_back_channel: u8 = ReadInto::read(bit_reader, 4)?; - let num_lfe_channel: u8 = ReadInto::read(bit_reader, 2)?; - bit_reader.skip(3)?; // num_assoc_data - bit_reader.skip(4)?; // num_valid_cc - - let mono_mixdown_present: bool = ReadInto::read(bit_reader, 1)?; - if mono_mixdown_present { - bit_reader.skip(4)?; // mono_mixdown_element_number - } - - let stereo_mixdown_present: bool = ReadInto::read(bit_reader, 1)?; - if stereo_mixdown_present { - bit_reader.skip(4)?; // stereo_mixdown_element_number - } - - let matrix_mixdown_idx_present: bool = ReadInto::read(bit_reader, 1)?; - if matrix_mixdown_idx_present { - bit_reader.skip(2)?; // matrix_mixdown_idx - bit_reader.skip(1)?; // pseudo_surround_enable - } - let mut _channel_counts = 0; - _channel_counts += read_surround_channel_count(bit_reader, num_front_channel)?; - _channel_counts += read_surround_channel_count(bit_reader, num_side_channel)?; - _channel_counts += read_surround_channel_count(bit_reader, num_back_channel)?; - _channel_counts += read_surround_channel_count(bit_reader, num_lfe_channel)?; - _channel_counts + if channel_count_from_config.is_none() { + // channel_configuration == 0: derive channel count from the + // program_config_element. + debug!("Parsing program_config_element for channel counts"); + + bit_reader.skip(4)?; // element_instance_tag + bit_reader.skip(2)?; // object_type + bit_reader.skip(4)?; // sampling_frequency_index + let num_front_channel: u8 = ReadInto::read(bit_reader, 4)?; + let num_side_channel: u8 = ReadInto::read(bit_reader, 4)?; + let num_back_channel: u8 = ReadInto::read(bit_reader, 4)?; + let num_lfe_channel: u8 = ReadInto::read(bit_reader, 2)?; + bit_reader.skip(3)?; // num_assoc_data + bit_reader.skip(4)?; // num_valid_cc + + let mono_mixdown_present: bool = ReadInto::read(bit_reader, 1)?; + if mono_mixdown_present { + bit_reader.skip(4)?; // mono_mixdown_element_number } - 1..=7 => channel_configuration, - // Amendment 4 of the AAC standard in 2013 below - 11 => 7, // 6.1 Amendment 4 of the AAC standard in 2013 - 12 | 14 => 8, // 7.1 (a/d) of ITU BS.2159 - _ => { - return Err(Error::Unsupported("invalid channel configuration")); + + let stereo_mixdown_present: bool = ReadInto::read(bit_reader, 1)?; + if stereo_mixdown_present { + bit_reader.skip(4)?; // stereo_mixdown_element_number } - }; - esds.audio_object_type = Some(audio_object_type); - esds.extended_audio_object_type = extended_audio_object_type; - esds.audio_sample_rate = Some(sample_frequency_value); - esds.audio_channel_count = Some(channel_counts); + let matrix_mixdown_idx_present: bool = ReadInto::read(bit_reader, 1)?; + if matrix_mixdown_idx_present { + bit_reader.skip(2)?; // matrix_mixdown_idx + bit_reader.skip(1)?; // pseudo_surround_enable + } + let mut channel_counts = 0; + channel_counts += read_surround_channel_count(bit_reader, num_front_channel)?; + channel_counts += read_surround_channel_count(bit_reader, num_side_channel)?; + channel_counts += read_surround_channel_count(bit_reader, num_back_channel)?; + channel_counts += read_surround_channel_count(bit_reader, num_lfe_channel)?; - if !esds.decoder_specific_data.is_empty() { - fail_with_status_if( - strictness == ParseStrictness::Strict, - Status::EsdsDecSpecificInfoTagQuantity, - )?; + esds.audio_channel_count = Some(channel_counts); + esds.decoder_specific_data.extend_from_slice(data)?; } - esds.decoder_specific_data.extend_from_slice(data)?; Ok(()) } diff --git a/mp4parse/src/tests.rs b/mp4parse/src/tests.rs index 087ba956..a8b25bed 100644 --- a/mp4parse/src/tests.rs +++ b/mp4parse/src/tests.rs @@ -1120,6 +1120,205 @@ fn read_esds_mpeg2_aac_lc() { assert_eq!(es.decoder_specific_data, aac_dc_descriptor); } +#[test] +fn read_esds_truncated_dsi() { + // HE-AAC DSI truncated so the GASpecificConfig tail (frameLengthFlag / + // dependsOnCoreCoder / extensionFlag) runs off the end of the 24-bit + // payload, causing the bit reader to hit EOF. Reduced from band-orion.de + // (Firefox bug 2011644). + // + // DSI `2b 92 08` decodes as: + // AOT=5 (SBR, explicit signalling) + // samplingFrequencyIndex=7 (22050 Hz base) + // channelConfiguration=2 (stereo) + // extensionSamplingFrequencyIndex=4 (44100 Hz extended) + // inner AOT=2 (AAC-LC) + // which leaves 2 bits remaining, one short of the extensionFlag. + // + // Before our fix read_ds_descriptor stored the parsed fields only after + // the failing reads, so find_descriptor would swallow the BitReaderError + // in non-strict mode and leave the ES_Descriptor with audio_sample_rate, + // audio_channel_count and decoder_specific_data all unset — a "playable" + // AAC track with no config, producing silent audio. + let aac_esds = vec![ + 0x03, 0x1a, // ES_Descriptor tag, len=26 + 0x00, 0x00, 0x00, // ES_ID + flags + 0x04, 0x12, // DC_Descriptor tag, len=18 + 0x40, 0x15, 0x00, 0x00, 0x00, // objType=AAC, streamType, bufferSize + 0x00, 0x00, 0x00, 0x00, // maxBitrate + 0x00, 0x00, 0x00, 0x00, // avgBitrate + 0x05, 0x03, 0x2b, 0x92, 0x08, // DSI: truncated HE-AAC + 0x06, 0x01, 0x02, // SL_Descriptor + ]; + let aac_dc_descriptor = &aac_esds[22..25]; + + let mut stream = make_box(BoxSize::Auto, b"esds", |s| { + s.B32(0) // reserved + .append_bytes(aac_esds.as_slice()) + }); + let mut iter = super::BoxIter::new(&mut stream); + let mut stream = iter.next_box().unwrap().unwrap(); + + let es = super::read_esds(&mut stream, ParseStrictness::Normal).unwrap(); + + assert_eq!(es.audio_codec, super::CodecType::AAC); + assert_eq!(es.audio_object_type, Some(2)); + assert_eq!(es.extended_audio_object_type, Some(5)); + assert_eq!(es.audio_sample_rate, Some(22050)); + assert_eq!(es.audio_channel_count, Some(2)); + assert_eq!(es.codec_esds, aac_esds); + assert_eq!(es.decoder_specific_data, aac_dc_descriptor); +} + +#[test] +fn read_esds_duplicate_dsi() { + // Two DecSpecificInfo descriptors with identical AAC-LC 48 kHz stereo + // payload (`11 90`). Reduced from Firefox bug 1906838. + // + // In non-strict mode the parser must keep the first DSI and ignore the + // second — matching what Chromium (media/formats/mp4/es_descriptor.cc) + // and ffmpeg (libavformat/isom.c ff_mp4_read_dec_config_descr) do. + // Before our fix the parser concatenated both DSI payloads and + // overwrote audio_object_type / audio_sample_rate / audio_channel_count + // with the second DSI's parsed values, which left the ES_Descriptor in + // an inconsistent state when the two DSIs disagreed. + let aac_esds = vec![ + 0x03, 0x1d, // ES_Descriptor tag, len=29 + 0x00, 0x00, 0x00, // ES_ID + flags + 0x04, 0x15, // DC_Descriptor tag, len=21 + 0x40, 0x15, 0x00, 0x00, 0x00, // objType=AAC, streamType, bufferSize + 0x00, 0x00, 0x01, 0xf4, // maxBitrate + 0x00, 0x00, 0x01, 0xf4, // avgBitrate + 0x05, 0x02, 0x11, 0x90, // DSI #1 (kept) + 0x05, 0x02, 0x11, 0x90, // DSI #2 (ignored in non-strict) + 0x06, 0x01, 0x02, // SL_Descriptor + ]; + let aac_dc_descriptor = &aac_esds[22..24]; + + let mut stream = make_box(BoxSize::Auto, b"esds", |s| { + s.B32(0) // reserved + .append_bytes(aac_esds.as_slice()) + }); + let mut iter = super::BoxIter::new(&mut stream); + let mut stream = iter.next_box().unwrap().unwrap(); + + let es = super::read_esds(&mut stream, ParseStrictness::Normal).unwrap(); + + assert_eq!(es.audio_codec, super::CodecType::AAC); + assert_eq!(es.audio_object_type, Some(2)); + assert_eq!(es.extended_audio_object_type, None); + assert_eq!(es.audio_sample_rate, Some(48000)); + assert_eq!(es.audio_channel_count, Some(2)); + assert_eq!(es.codec_esds, aac_esds); + // First DSI's bytes only — not the concatenation of both. + assert_eq!(es.decoder_specific_data, aac_dc_descriptor); +} + +#[test] +fn read_esds_duplicate_dsi_strict() { + // In strict mode, the duplicate-DSI case is rejected with the + // corresponding status rather than silently kept as "first wins". + let aac_esds = vec![ + 0x03, 0x1d, 0x00, 0x00, 0x00, 0x04, 0x15, 0x40, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0xf4, 0x00, 0x00, 0x01, 0xf4, 0x05, 0x02, 0x11, 0x90, 0x05, 0x02, 0x11, 0x90, 0x06, 0x01, + 0x02, + ]; + + let mut stream = make_box(BoxSize::Auto, b"esds", |s| { + s.B32(0) // reserved + .append_bytes(aac_esds.as_slice()) + }); + let mut iter = super::BoxIter::new(&mut stream); + let mut stream = iter.next_box().unwrap().unwrap(); + + match super::read_esds(&mut stream, ParseStrictness::Strict) { + Err(Error::InvalidData(Status::EsdsDecSpecificInfoTagQuantity)) => {} + other => panic!( + "expected EsdsDecSpecificInfoTagQuantity in strict mode, got {:?}", + other + ), + } +} + +#[test] +fn read_esds_duplicate_dsi_disagreeing() { + // Two DecSpecificInfo descriptors whose payloads disagree on sample rate + // and channel count. Locks in "first-wins" rather than "second-wins" or + // "concatenate": + // DSI #1 `11 90` → AOT=2 (AAC-LC), SFI=3 (48 kHz), chCfg=2 (stereo) + // DSI #2 `12 88` → AOT=2 (AAC-LC), SFI=5 (32 kHz), chCfg=1 (mono) + let aac_esds = vec![ + 0x03, 0x1d, // ES_Descriptor tag, len=29 + 0x00, 0x00, 0x00, // ES_ID + flags + 0x04, 0x15, // DC_Descriptor tag, len=21 + 0x40, 0x15, 0x00, 0x00, 0x00, // objType=AAC, streamType, bufferSize + 0x00, 0x00, 0x01, 0xf4, // maxBitrate + 0x00, 0x00, 0x01, 0xf4, // avgBitrate + 0x05, 0x02, 0x11, 0x90, // DSI #1: 48 kHz stereo (kept) + 0x05, 0x02, 0x12, 0x88, // DSI #2: 32 kHz mono (ignored in non-strict) + 0x06, 0x01, 0x02, // SL_Descriptor + ]; + let dsi1 = &aac_esds[22..24]; + + let mut stream = make_box(BoxSize::Auto, b"esds", |s| { + s.B32(0) // reserved + .append_bytes(aac_esds.as_slice()) + }); + let mut iter = super::BoxIter::new(&mut stream); + let mut stream = iter.next_box().unwrap().unwrap(); + + let es = super::read_esds(&mut stream, ParseStrictness::Normal).unwrap(); + + assert_eq!(es.audio_codec, super::CodecType::AAC); + assert_eq!(es.audio_object_type, Some(2)); + assert_eq!(es.extended_audio_object_type, None); + // First DSI wins — not 32000 / 1 from DSI #2. + assert_eq!(es.audio_sample_rate, Some(48000)); + assert_eq!(es.audio_channel_count, Some(2)); + assert_eq!(es.decoder_specific_data, dsi1); +} + +#[test] +fn read_esds_truncated_dsi_pce() { + // Truncated DSI with channel_configuration == 0 (program_config_element + // signalling). The PCE tail is needed to derive the channel count, so + // the fix's early-store cannot populate audio_channel_count or the DSI + // bytes — only AOT / extended_AOT / sample_rate. + // + // DSI `11 80` decodes as: + // AOT=2 (AAC-LC), SFI=3 (48 kHz), chCfg=0 (PCE) + // frameLengthFlag=1, dependsOnCoreCoder=0, extensionFlag=0 + // after which the 4-bit element_instance_tag read runs off the end. + let aac_esds = vec![ + 0x03, 0x19, // ES_Descriptor tag, len=25 + 0x00, 0x00, 0x00, // ES_ID + flags + 0x04, 0x11, // DC_Descriptor tag, len=17 + 0x40, 0x15, 0x00, 0x00, 0x00, // objType=AAC, streamType, bufferSize + 0x00, 0x00, 0x01, 0xf4, // maxBitrate + 0x00, 0x00, 0x01, 0xf4, // avgBitrate + 0x05, 0x02, 0x11, 0x80, // DSI: AAC-LC 48 kHz PCE, truncated mid-PCE + 0x06, 0x01, 0x02, // SL_Descriptor + ]; + + let mut stream = make_box(BoxSize::Auto, b"esds", |s| { + s.B32(0) // reserved + .append_bytes(aac_esds.as_slice()) + }); + let mut iter = super::BoxIter::new(&mut stream); + let mut stream = iter.next_box().unwrap().unwrap(); + + let es = super::read_esds(&mut stream, ParseStrictness::Normal).unwrap(); + + assert_eq!(es.audio_codec, super::CodecType::AAC); + assert_eq!(es.audio_object_type, Some(2)); + assert_eq!(es.extended_audio_object_type, None); + assert_eq!(es.audio_sample_rate, Some(48000)); + // PCE parsing hits EOF before deriving the channel count, so these stay + // unset rather than guessing. + assert_eq!(es.audio_channel_count, None); + assert!(es.decoder_specific_data.is_empty()); +} + #[test] fn read_stsd_mp4v() { let mp4v = vec![