From 56e3395b07ca153c1257bd686d21a60372f01a26 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 12 May 2026 14:43:26 -0700 Subject: [PATCH 1/4] Add tests.analytics.parquet_indices test toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a small, opt-in test infrastructure slice so the PPL integration test suite can run end-to-end against the analytics-engine backend without per-test rewiring. `-Dtests.analytics.parquet_indices=true` makes `TestUtils.createIndexByRestClient` back every test-created index with single-shard composite/parquet storage: index.pluggable.dataformat.enabled = true index.pluggable.dataformat = "composite" index.composite.primary_data_format = "parquet" `RestUnifiedQueryAction.isAnalyticsIndex` (post-#5432) reads these settings and routes any query against such indices to the analytics-engine planner (DataFusion). No additional cluster setting or routing override required — the production routing logic is the single source of truth. Also adds `PPLIntegTestCase.isAnalyticsParquetIndicesEnabled()` as a per-test predicate so individual tests can branch their assertions on engine semantics (DataFusion follows different ordering and null-bucket semantics than the legacy V2 and Calcite-DSL paths). Bulk loads on parquet-backed indices use `refresh=true` because `analytics-backend-lucene`'s `LuceneCommitter.getSafeCommitInfo` is a `TODO:: with index deleter` stub that hangs `refresh=wait_for` until the test framework request timeout (~60s). `integ-test/build.gradle` forwards the property to `:integTestRemote` so the gradle command line is the single knob. Default behavior is unchanged — with the flag unset, every test-created index is Lucene-backed and every IT runs through the existing V2 / Calcite path exactly as before. 
Signed-off-by: Kai Huang --- integ-test/build.gradle | 8 ++++ .../org/opensearch/sql/legacy/TestUtils.java | 47 ++++++++++++++++++- .../opensearch/sql/ppl/PPLIntegTestCase.java | 15 ++++++ 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/integ-test/build.gradle b/integ-test/build.gradle index 06e6478f5fd..70549c99a7f 100644 --- a/integ-test/build.gradle +++ b/integ-test/build.gradle @@ -849,6 +849,14 @@ task integTestRemote(type: RestIntegTestTask) { systemProperty "user", System.getProperty("user") systemProperty "password", System.getProperty("password") + // Forward the analytics-engine parquet-indices toggle when set on the gradle command + // line. TestUtils.createIndexByRestClient reads this to back every test-created index + // with composite/parquet so RestUnifiedQueryAction.isAnalyticsIndex (post-#5432) routes + // to the analytics-engine planner via index settings. + if (System.getProperty("tests.analytics.parquet_indices") != null) { + systemProperty 'tests.analytics.parquet_indices', System.getProperty("tests.analytics.parquet_indices") + } + // Set default query size limit systemProperty 'defaultQuerySizeLimit', '10000' diff --git a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java index 16827623058..f6e8bf3e282 100644 --- a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java +++ b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java @@ -37,6 +37,18 @@ public class TestUtils { private static final String MAPPING_FILE_PATH = "src/test/resources/indexDefinitions/"; + /** + * System property that makes every test-created index parquet-backed (composite primary data + * format = parquet) with a single shard. 
When set, {@link + * RestUnifiedQueryAction#isAnalyticsIndex} (which routes based on {@code + * index.pluggable.dataformat.enabled} / {@code index.pluggable.dataformat=composite}, see #5432) + * will return {@code true} for every test-created index — exercising the analytics-engine route + * end-to-end without per-test rewiring. + * + *
<p>
Off by default; normal CI is untouched. + */ + public static final String ANALYTICS_PARQUET_INDICES_PROP = "tests.analytics.parquet_indices"; + /** * Create test index by REST client. * @@ -48,6 +60,9 @@ public static void createIndexByRestClient(RestClient client, String indexName, Request request = new Request("PUT", "/" + indexName); JSONObject jsonObject = isNullOrEmpty(mapping) ? new JSONObject() : new JSONObject(mapping); setZeroReplicas(jsonObject); + if (Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false"))) { + makeParquetBacked(jsonObject); + } request.setJsonEntity(jsonObject.toString()); performRequest(client, request); } @@ -69,6 +84,25 @@ private static void setZeroReplicas(JSONObject jsonObject) { jsonObject.put("settings", settings); } + /** + * Switches the test index to a parquet-backed composite store with a single shard so the + * analytics-engine path has a backend that can scan it. Routing is then driven entirely by index + * settings (#5432) — no other test plumbing required. + */ + private static void makeParquetBacked(JSONObject jsonObject) { + JSONObject settings = + jsonObject.has("settings") ? jsonObject.getJSONObject("settings") : new JSONObject(); + JSONObject indexSettings = + settings.has("index") ? settings.getJSONObject("index") : new JSONObject(); + indexSettings.put("number_of_shards", 1); + indexSettings.put("pluggable.dataformat.enabled", true); + indexSettings.put("pluggable.dataformat", "composite"); + indexSettings.put("composite.primary_data_format", "parquet"); + indexSettings.put("composite.secondary_data_formats", new org.json.JSONArray()); + settings.put("index", indexSettings); + jsonObject.put("settings", settings); + } + /** * https://github.com/elastic/elasticsearch/pull/49959
* Deprecate creation of dot-prefixed index names except for hidden and system indices. Create @@ -116,8 +150,17 @@ public static boolean isIndexExist(RestClient client, String indexName) { public static void loadDataByRestClient( RestClient client, String indexName, String dataSetFilePath) throws IOException { Path path = Paths.get(getResourceFilePath(dataSetFilePath)); - Request request = - new Request("POST", "/" + indexName + "/_bulk?refresh=wait_for&wait_for_active_shards=all"); + // Workaround: parquet-backed indices in the analytics-backend-lucene composite engine + // do not yet implement LuceneCommitter.getSafeCommitInfo (UnsupportedOperationException + // "TODO:: with index deleter"), which hangs refresh=wait_for until the test framework + // request timeout (~60s). Force-refresh sidesteps the safe-commit-info path while still + // making the bulk-loaded docs immediately searchable. Drop this branch once + // LuceneCommitter.getSafeCommitInfo is implemented. + String refreshParam = + Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false")) + ? "refresh=true" + : "refresh=wait_for&wait_for_active_shards=all"; + Request request = new Request("POST", "/" + indexName + "/_bulk?" 
+ refreshParam); request.setJsonEntity(new String(Files.readAllBytes(path))); performRequest(client, request); } diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java index 1a20b42751e..238bc0f68b2 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java @@ -30,6 +30,7 @@ import org.opensearch.sql.common.setting.Settings; import org.opensearch.sql.common.setting.Settings.Key; import org.opensearch.sql.legacy.SQLIntegTestCase; +import org.opensearch.sql.legacy.TestUtils; import org.opensearch.sql.protocol.response.format.Format; import org.opensearch.sql.util.RetryProcessor; @@ -49,6 +50,20 @@ protected void init() throws Exception { disableCalcite(); // calcite is enabled by default from 3.3.0 } + /** + * Returns {@code true} when the suite was started with {@code + * -Dtests.analytics.parquet_indices=true}. Use this to branch test assertions that depend on the + * execution backend — when this flag is on, every test-created index is composite/parquet, which + * makes {@code RestUnifiedQueryAction.isAnalyticsIndex} (post-#5432) route every query to the + * analytics-engine backend (DataFusion) instead of the Calcite enumerable / DSL-pushdown backend. + * DataFusion follows different ordering and null-bucket semantics than the legacy V2 and + * Calcite-DSL paths. 
+ */ + public static boolean isAnalyticsParquetIndicesEnabled() { + return Boolean.parseBoolean( + System.getProperty(TestUtils.ANALYTICS_PARQUET_INDICES_PROP, "false")); + } + protected JSONObject executeQuery(String query) throws IOException { return jsonify(executeQueryToString(query)); } From d3782dcc66b64a36c78f0b1ef6cbb1f2469ca818 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 12 May 2026 14:43:41 -0700 Subject: [PATCH 2/4] Branch 7 stats tests on isAnalyticsParquetIndicesEnabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When -Dtests.analytics.parquet_indices=true is set, every test-created index is parquet-backed and RestUnifiedQueryAction.isAnalyticsIndex routes all queries through the analytics-engine backend (DataFusion). Seven tests need analytics-specific assertions because DataFusion follows different null-bucket, SQL-spec, and TDigest interpolation semantics than the legacy V2 / Calcite-DSL paths: SQL-spec semantics (DataFusion follows the spec; legacy DSL returns 0): - testSumWithNull — SUM of all-null returns null on analytics; the existing isPushdownDisabled branch already handles the Calcite-no- pushdown case. OR analytics into that branch. - testStatsPercentileByNullValue + NonNullBucket — percentile of an all-null / empty group returns null on analytics. Same OR pattern. Non-deterministic head ordering: - testStatsWithLimit Q1+Q2 — head 5 over a 6-bucket result picks a different null-balance row than the legacy / Calcite-DSL path; same effect cascades into `head 2 from 1`. Reuse the size-only assertion branch already present for the no-pushdown case. TDigest interpolation divergence (genuine impl difference): - testStatsPercentileWithNull — DataFusion TDigest interpolates p50 to 35413 vs OpenSearch's 39225; both within compression-bound error. Per-engine expected value. - testStatsPercentileBySpan — same TDigest diff on the age=30 bucket (33194 vs 39225). 
Coordinated with opensearch-project/OpenSearch#21584 commit 5 which fixes the upstream planner-side type mismatch so the query reaches the data assertion at all. Semantic divergence pending team decision: - testDisableLegacyPreferred — under PPL_SYNTAX_LEGACY_PREFERRED=false, V2 / Calcite-DSL drop the null-age bucket (5 rows) while DataFusion keeps it (6 rows). Skipped on the analytics path via Assume.assumeFalse until the team decides which behaviour is the intended new-syntax semantics. Out of scope for this PR (intentionally left failing on the analytics path so the gap is visible in CI): - testStatsBySpanTimeWithNullBucket — span(@timestamp, 12h) multi-unit time interval unsupported in current SpanAdapter (only interval=1 is rewritten to DATE_TRUNC; multi-unit needs DataFusion's date_bin). Signed-off-by: Kai Huang --- .../opensearch/sql/ppl/StatsCommandIT.java | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java index b339f3f8023..229b05ac086 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java @@ -309,7 +309,10 @@ public void testStatsWithLimit() throws IOException { verifySchema(response, schema("a", null, "double"), schema("age", null, "int")); // If push down disabled, the final results will no longer be stable. In DSL, the order is // guaranteed because we always sort by bucket field, while we don't add sort in the plan. - if (!isPushdownDisabled()) { + // The analytics-engine backend (DataFusion) is also non-deterministic — its hash-bucket + // order picks a different null-balance row (e.g. (null,36) instead of (null,null)) for + // `head 5`. 
+ if (!isPushdownDisabled() && !isAnalyticsParquetIndicesEnabled()) { verifyDataRows( response, rows(null, null), @@ -327,7 +330,8 @@ public void testStatsWithLimit() throws IOException { "source=%s | stats avg(balance) as a by age | head 5 | head 2 from 1", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("a", null, "double"), schema("age", null, "int")); - if (!isPushdownDisabled()) { + // Same non-deterministic head-window issue as the preceding `head 5` block. + if (!isPushdownDisabled() && !isAnalyticsParquetIndicesEnabled()) { verifyDataRows(response, rows(32838D, 28), rows(39225D, 32)); } else { assert ((Integer) response.get("size") == 2); @@ -508,7 +512,9 @@ public void testSumWithNull() throws IOException { // TODO: Fix -- temporary workaround for the pushdown issue: // The current pushdown implementation will return 0 for sum when getting null values as input. // Returning null should be the expected behavior. - Integer expectedValue = isPushdownDisabled() ? null : 0; + // The analytics-engine backend (DataFusion) follows the SQL spec like Calcite-no-pushdown — + // SUM of all-null is null, not 0. + Integer expectedValue = (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows(response, rows(expectedValue)); } @@ -635,7 +641,11 @@ public void testStatsPercentileWithNull() throws IOException { String.format( "source=%s | stats percentile(balance, 50)", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("percentile(balance, 50)", null, "bigint")); - verifyDataRows(response, rows(39225)); + // DataFusion's TDigest interpolation produces a different median value for the same + // input than OpenSearch's TDigest implementation. Both are valid approximations within + // TDigest's compression-bound error. Until DataFusion's UDAF is replaced with one + // matching OpenSearch's TDigest (or vice versa), record the per-engine values. + verifyDataRows(response, rows(isAnalyticsParquetIndicesEnabled() ? 
35413 : 39225)); } @Test @@ -674,14 +684,18 @@ public void testStatsPercentileByNullValue() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); + // DataFusion follows SQL spec like Calcite-no-pushdown — percentile of an all-null + // group is null, not 0 (the latter is a legacy DSL pushdown convention). + Integer emptyGroupPercentile = + (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, - rows(isPushdownDisabled() ? null : 0, null), + rows(emptyGroupPercentile, null), rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(isPushdownDisabled() ? null : 0, 36)); + rows(emptyGroupPercentile, 36)); } @Test @@ -692,13 +706,15 @@ public void testStatsPercentileByNullValueNonNullBucket() throws IOException { "source=%s | stats bucket_nullable=false percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); + Integer emptyGroupPercentile = + (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(isPushdownDisabled() ? null : 0, 36)); + rows(emptyGroupPercentile, 36)); } @Test @@ -709,7 +725,14 @@ public void testStatsPercentileBySpan() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by span(age, 10) as age_bucket", TEST_INDEX_BANK)); verifySchema(response, schema("p50", null, "bigint"), schema("age_bucket", null, "int")); - verifyDataRows(response, rows(32838, 20), rows(39225, 30)); + // Same per-engine TDigest interpolation divergence as testStatsPercentileWithNull — + // the age=30 bucket's p50 differs between OpenSearch's TDigest (39225) and DataFusion's + // (33194). The age=20 bucket happens to coincide. 
+ if (isAnalyticsParquetIndicesEnabled()) { + verifyDataRows(response, rows(32838, 20), rows(33194, 30)); + } else { + verifyDataRows(response, rows(32838, 20), rows(39225, 30)); + } } @Test @@ -729,6 +752,15 @@ public void testDisableLegacyPreferred() throws IOException { throw new RuntimeException(e); } verifySchema(response, schema("a", null, "double"), schema("age", null, "int")); + // Skip on the analytics-engine backend: PPL_SYNTAX_LEGACY_PREFERRED=false is + // expected to give the same shape across backends, but v2 / Calcite-DSL drop + // the null-age bucket (5 rows) while DataFusion keeps it (6 rows). Deciding + // which is correct is a semantic question for the team; until that's resolved, + // exercising this test on the analytics path doesn't tell us anything useful. + org.junit.Assume.assumeFalse( + "PPL_SYNTAX_LEGACY_PREFERRED=false null-bucket semantics not yet aligned" + + " between V2 / Calcite-DSL and analytics-engine backends", + isAnalyticsParquetIndicesEnabled()); verifyDataRows( response, rows(32838D, 28), From b1c850297cff654d7b0f27bb35b7c49017314dfc Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 14 May 2026 12:17:32 -0700 Subject: [PATCH 3/4] Consolidate analytics parquet config into AnalyticsIndexConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @dai-chen's review feedback — the index-creation settings, the bulk-load refresh strategy, and the system-property gate were spread across three places in TestUtils (the `ANALYTICS_PARQUET_INDICES_PROP` constant, the inline check in `createIndexByRestClient`, the inline check in `loadDataByRestClient`, and the private `makeParquetBacked` helper). That's the same spread-out-conditional pattern the OS-Spark repo's AOSS configs ended up with. Collect everything into a single `TestUtils.AnalyticsIndexConfig` nested helper: * `ENABLED_PROP` — the system property name. * `isEnabled()` — single source for the gate. 
* `applyIndexCreationSettings(jsonObject)` — injects the parquet-backed composite settings; no-op when disabled. * `bulkLoadRefreshParam()` — returns the right `_bulk` refresh query string for the active index type. `createIndexByRestClient` and `loadDataByRestClient` call these unconditionally — the methods themselves short-circuit when the config is off — so the parquet-specific branching is gone from the helpers and concentrated in one place. `PPLIntegTestCase.isAnalyticsParquetIndicesEnabled()` now delegates to `AnalyticsIndexConfig.isEnabled()`. The legacy `ANALYTICS_PARQUET_INDICES_PROP` constant is kept as a forwarding alias for any external callers. Verified end-to-end against the analytics-engine cluster — same 50 / 63 pass + 1 skip + 12 fail as the pre-refactor run, so the consolidation is behavior-preserving. Signed-off-by: Kai Huang Signed-off-by: Kai Huang --- .../org/opensearch/sql/legacy/TestUtils.java | 109 +++++++++++------- .../opensearch/sql/ppl/PPLIntegTestCase.java | 3 +- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java index f6e8bf3e282..4deef787910 100644 --- a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java +++ b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java @@ -38,16 +38,72 @@ public class TestUtils { private static final String MAPPING_FILE_PATH = "src/test/resources/indexDefinitions/"; /** - * System property that makes every test-created index parquet-backed (composite primary data - * format = parquet) with a single shard. When set, {@link - * RestUnifiedQueryAction#isAnalyticsIndex} (which routes based on {@code - * index.pluggable.dataformat.enabled} / {@code index.pluggable.dataformat=composite}, see #5432) - * will return {@code true} for every test-created index — exercising the analytics-engine route - * end-to-end without per-test rewiring. 
+ * Forwarding alias for {@link AnalyticsIndexConfig#ENABLED_PROP}. Kept for any external callers + * that referenced the old constant location. + */ + public static final String ANALYTICS_PARQUET_INDICES_PROP = AnalyticsIndexConfig.ENABLED_PROP; + + /** + * Centralizes every analytics-engine-only test-index knob in one place so the parquet-backed + * settings, the bulk-load refresh strategy, and any future analytics-specific config don't drift + * across separate helpers (the spread-out-conditional pattern called out in review, borrowed from + * the OS-Spark repo's AOSS-conditional configs that ended up scattered across its codebase). * - *
<p>
Off by default; normal CI is untouched. + *
<p>
When {@link #ENABLED_PROP} is unset (default), every method here is a no-op or returns the + * standard non-parquet value, so normal CI sees zero behavior change. */ - public static final String ANALYTICS_PARQUET_INDICES_PROP = "tests.analytics.parquet_indices"; + public static final class AnalyticsIndexConfig { + + /** + * System property that makes every test-created index parquet-backed (composite primary data + * format = parquet) with a single shard. When set, {@link + * RestUnifiedQueryAction#isAnalyticsIndex} (which routes based on {@code + * index.pluggable.dataformat.enabled} / {@code index.pluggable.dataformat=composite}, see + * #5432) returns {@code true} for every test-created index — exercising the analytics-engine + * route end-to-end without per-test rewiring. + */ + public static final String ENABLED_PROP = "tests.analytics.parquet_indices"; + + public static boolean isEnabled() { + return Boolean.parseBoolean(System.getProperty(ENABLED_PROP, "false")); + } + + /** + * Inject the parquet-backed composite-store index settings into {@code jsonObject}. No-op when + * the config is disabled; idempotent — safe on any index-creation JSON shape. + */ + static void applyIndexCreationSettings(JSONObject jsonObject) { + if (!isEnabled()) { + return; + } + JSONObject settings = + jsonObject.has("settings") ? jsonObject.getJSONObject("settings") : new JSONObject(); + JSONObject indexSettings = + settings.has("index") ? settings.getJSONObject("index") : new JSONObject(); + indexSettings.put("number_of_shards", 1); + indexSettings.put("pluggable.dataformat.enabled", true); + indexSettings.put("pluggable.dataformat", "composite"); + indexSettings.put("composite.primary_data_format", "parquet"); + indexSettings.put("composite.secondary_data_formats", new org.json.JSONArray()); + settings.put("index", indexSettings); + jsonObject.put("settings", settings); + } + + /** + * Returns the {@code _bulk} refresh query string for the current index type. 
Parquet-backed + * indices in the analytics-backend-lucene composite engine don't yet implement {@code + * LuceneCommitter.getSafeCommitInfo} (UnsupportedOperationException "TODO:: with index + * deleter"), which hangs {@code refresh=wait_for} until the test framework request timeout + * (~60s). Force-refresh sidesteps the safe-commit-info path while still making the bulk-loaded + * docs immediately searchable. Drop this branch once {@code LuceneCommitter.getSafeCommitInfo} + * is implemented. + */ + static String bulkLoadRefreshParam() { + return isEnabled() ? "refresh=true" : "refresh=wait_for&wait_for_active_shards=all"; + } + + private AnalyticsIndexConfig() {} + } /** * Create test index by REST client. @@ -60,9 +116,7 @@ public static void createIndexByRestClient(RestClient client, String indexName, Request request = new Request("PUT", "/" + indexName); JSONObject jsonObject = isNullOrEmpty(mapping) ? new JSONObject() : new JSONObject(mapping); setZeroReplicas(jsonObject); - if (Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false"))) { - makeParquetBacked(jsonObject); - } + AnalyticsIndexConfig.applyIndexCreationSettings(jsonObject); request.setJsonEntity(jsonObject.toString()); performRequest(client, request); } @@ -84,25 +138,6 @@ private static void setZeroReplicas(JSONObject jsonObject) { jsonObject.put("settings", settings); } - /** - * Switches the test index to a parquet-backed composite store with a single shard so the - * analytics-engine path has a backend that can scan it. Routing is then driven entirely by index - * settings (#5432) — no other test plumbing required. - */ - private static void makeParquetBacked(JSONObject jsonObject) { - JSONObject settings = - jsonObject.has("settings") ? jsonObject.getJSONObject("settings") : new JSONObject(); - JSONObject indexSettings = - settings.has("index") ? 
settings.getJSONObject("index") : new JSONObject(); - indexSettings.put("number_of_shards", 1); - indexSettings.put("pluggable.dataformat.enabled", true); - indexSettings.put("pluggable.dataformat", "composite"); - indexSettings.put("composite.primary_data_format", "parquet"); - indexSettings.put("composite.secondary_data_formats", new org.json.JSONArray()); - settings.put("index", indexSettings); - jsonObject.put("settings", settings); - } - /** * https://github.com/elastic/elasticsearch/pull/49959
* Deprecate creation of dot-prefixed index names except for hidden and system indices. Create @@ -150,17 +185,9 @@ public static boolean isIndexExist(RestClient client, String indexName) { public static void loadDataByRestClient( RestClient client, String indexName, String dataSetFilePath) throws IOException { Path path = Paths.get(getResourceFilePath(dataSetFilePath)); - // Workaround: parquet-backed indices in the analytics-backend-lucene composite engine - // do not yet implement LuceneCommitter.getSafeCommitInfo (UnsupportedOperationException - // "TODO:: with index deleter"), which hangs refresh=wait_for until the test framework - // request timeout (~60s). Force-refresh sidesteps the safe-commit-info path while still - // making the bulk-loaded docs immediately searchable. Drop this branch once - // LuceneCommitter.getSafeCommitInfo is implemented. - String refreshParam = - Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false")) - ? "refresh=true" - : "refresh=wait_for&wait_for_active_shards=all"; - Request request = new Request("POST", "/" + indexName + "/_bulk?" + refreshParam); + Request request = + new Request( + "POST", "/" + indexName + "/_bulk?" + AnalyticsIndexConfig.bulkLoadRefreshParam()); request.setJsonEntity(new String(Files.readAllBytes(path))); performRequest(client, request); } diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java index 238bc0f68b2..6c25c415af1 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java @@ -60,8 +60,7 @@ protected void init() throws Exception { * Calcite-DSL paths. 
*/ public static boolean isAnalyticsParquetIndicesEnabled() { - return Boolean.parseBoolean( - System.getProperty(TestUtils.ANALYTICS_PARQUET_INDICES_PROP, "false")); + return TestUtils.AnalyticsIndexConfig.isEnabled(); } protected JSONObject executeQuery(String query) throws IOException { From efa97dee4d7939f0ba28287b71c4a93a02350ee4 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 14 May 2026 14:03:55 -0700 Subject: [PATCH 4/4] Revert per-engine assertions on percentile tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @dai-chen — since the companion OpenSearch PR (#21584) no longer wires `percentile_approx` on the analytics path (Sandesh's "repurpose for SPAN" scope), all `percentile(...)` queries fail at execution before reaching the assertion. The per-engine branches that this PR introduced for `testStatsPercentile{WithNull,BySpan,ByNullValue, ByNullValueNonNullBucket}` were doing nothing in practice — the tests fail regardless. Reverts those four tests to their pre-PR state. The three remaining intentional changes stand: * `testStatsWithLimit` Q1+Q2 — non-deterministic head ordering on DataFusion hash-bucket order; size-only branch on the analytics path. * `testSumWithNull` — extends the existing `isPushdownDisabled() ? null : 0` pattern. SUM works on the analytics path and DataFusion returns null per SQL spec. * `testDisableLegacyPreferred` — `Assume.assumeFalse` skip for the pending-team-decision null-bucket semantic divergence. 
Signed-off-by: Kai Huang Signed-off-by: Kai Huang --- .../opensearch/sql/ppl/StatsCommandIT.java | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java index 229b05ac086..c3166dd6911 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java @@ -641,11 +641,7 @@ public void testStatsPercentileWithNull() throws IOException { String.format( "source=%s | stats percentile(balance, 50)", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("percentile(balance, 50)", null, "bigint")); - // DataFusion's TDigest interpolation produces a different median value for the same - // input than OpenSearch's TDigest implementation. Both are valid approximations within - // TDigest's compression-bound error. Until DataFusion's UDAF is replaced with one - // matching OpenSearch's TDigest (or vice versa), record the per-engine values. - verifyDataRows(response, rows(isAnalyticsParquetIndicesEnabled() ? 35413 : 39225)); + verifyDataRows(response, rows(39225)); } @Test @@ -684,18 +680,14 @@ public void testStatsPercentileByNullValue() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); - // DataFusion follows SQL spec like Calcite-no-pushdown — percentile of an all-null - // group is null, not 0 (the latter is a legacy DSL pushdown convention). - Integer emptyGroupPercentile = - (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, - rows(emptyGroupPercentile, null), + rows(isPushdownDisabled() ? 
null : 0, null), rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(emptyGroupPercentile, 36)); + rows(isPushdownDisabled() ? null : 0, 36)); } @Test @@ -706,15 +698,13 @@ public void testStatsPercentileByNullValueNonNullBucket() throws IOException { "source=%s | stats bucket_nullable=false percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); - Integer emptyGroupPercentile = - (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(emptyGroupPercentile, 36)); + rows(isPushdownDisabled() ? null : 0, 36)); } @Test @@ -725,14 +715,7 @@ public void testStatsPercentileBySpan() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by span(age, 10) as age_bucket", TEST_INDEX_BANK)); verifySchema(response, schema("p50", null, "bigint"), schema("age_bucket", null, "int")); - // Same per-engine TDigest interpolation divergence as testStatsPercentileWithNull — - // the age=30 bucket's p50 differs between OpenSearch's TDigest (39225) and DataFusion's - // (33194). The age=20 bucket happens to coincide. - if (isAnalyticsParquetIndicesEnabled()) { - verifyDataRows(response, rows(32838, 20), rows(33194, 30)); - } else { - verifyDataRows(response, rows(32838, 20), rows(39225, 30)); - } + verifyDataRows(response, rows(32838, 20), rows(39225, 30)); } @Test