Skip to content

Commit 08c2fb1

Browse files
radustoenescuaditanaseRadu Stoenescu
authored
perf: skip stats (backported from delta-kernel 0.20) (#16)
* [HSTACK] - add skip_stats (backported from delta-kernel 0.20) Signed-off-by: Adrian Tanase <atanase@adobe.com> * [HSTACK] - feat: enable/disable skip stats through table config --------- Signed-off-by: Adrian Tanase <atanase@adobe.com> Co-authored-by: Adrian Tanase <atanase@adobe.com> Co-authored-by: Radu Stoenescu <stoenesc@adobe.com>
1 parent 3fa7f35 commit 08c2fb1

4 files changed

Lines changed: 21 additions & 2 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ documentation = "https://docs.rs/deltalake"
1616
repository = "https://github.com/delta-io/delta.rs"
1717

1818
[workspace.dependencies]
19-
delta_kernel = { version = "0.19.0", features = [
19+
delta_kernel = { version = "0.19.2", features = [
2020
"arrow-57",
2121
"default-engine-rustls",
2222
"internal-api",

crates/core/src/kernel/snapshot/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,8 @@ impl Snapshot {
270270
log_store: &dyn LogStore,
271271
predicate: Option<PredicateRef>,
272272
) -> SendableRBStream {
273-
let scan = match self.scan_builder().with_predicate(predicate).build() {
273+
let skip_stats = self.config.skip_stats_in_file_listing;
274+
let scan = match self.scan_builder().with_predicate(predicate).with_skip_stats(skip_stats).build() {
274275
Ok(scan) => scan,
275276
Err(err) => return Box::pin(once(ready(Err(err)))),
276277
};

crates/core/src/kernel/snapshot/scan.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@ impl ScanBuilder {
6363
self
6464
}
6565

66+
/// Skip reading file statistics from checkpoint parquet files.
67+
///
68+
/// When enabled, the stats column is not read from checkpoint files and data skipping
69+
/// is disabled. This is useful when the caller handles data skipping externally or
70+
/// doesn't need file statistics.
71+
pub fn with_skip_stats(mut self, skip_stats: bool) -> Self {
72+
self.inner = self.inner.with_skip_stats(skip_stats);
73+
self
74+
}
75+
6676
pub fn build(self) -> DeltaResult<Scan> {
6777
Ok(Scan::from(self.inner.build()?))
6878
}

crates/core/src/table/builder.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,12 @@ pub struct DeltaTableConfig {
6060

6161
#[delta(skip)]
6262
pub log_size_limiter: Option<LogSizeLimiter>,
63+
64+
/// HSTACK: skip stats parsing during file listing. Runtime-only (not persisted).
65+
/// Defaults to `true` for performance, which also disables stats-based data skipping; set to `false` when stats-based pruning helps the query.
66+
#[serde(skip_serializing, skip_deserializing)]
67+
#[delta(skip)]
68+
pub skip_stats_in_file_listing: bool,
6369
}
6470

6571
impl Default for DeltaTableConfig {
@@ -70,6 +76,7 @@ impl Default for DeltaTableConfig {
7076
log_batch_size: 1024,
7177
io_runtime: None,
7278
log_size_limiter: None,
79+
skip_stats_in_file_listing: true,
7380
}
7481
}
7582
}
@@ -80,6 +87,7 @@ impl PartialEq for DeltaTableConfig {
8087
&& self.log_buffer_size == other.log_buffer_size
8188
&& self.log_batch_size == other.log_batch_size
8289
&& self.log_size_limiter == other.log_size_limiter
90+
&& self.skip_stats_in_file_listing == other.skip_stats_in_file_listing
8391
}
8492
}
8593

0 commit comments

Comments
 (0)