From 56e3395b07ca153c1257bd686d21a60372f01a26 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 12 May 2026 14:43:26 -0700 Subject: [PATCH 1/4] Add tests.analytics.parquet_indices test toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a small, opt-in test infrastructure slice so the PPL integration test suite can run end-to-end against the analytics-engine backend without per-test rewiring. `-Dtests.analytics.parquet_indices=true` makes `TestUtils.createIndexByRestClient` back every test-created index with single-shard composite/parquet storage: index.pluggable.dataformat.enabled = true index.pluggable.dataformat = "composite" index.composite.primary_data_format = "parquet" `RestUnifiedQueryAction.isAnalyticsIndex` (post-#5432) reads these settings and routes any query against such indices to the analytics-engine planner (DataFusion). No additional cluster setting or routing override required — the production routing logic is the single source of truth. Also adds `PPLIntegTestCase.isAnalyticsParquetIndicesEnabled()` as a per-test predicate so individual tests can branch their assertions on engine semantics (DataFusion follows different ordering and null-bucket semantics than the legacy V2 and Calcite-DSL paths). Bulk loads on parquet-backed indices use `refresh=true` because `analytics-backend-lucene`'s `LuceneCommitter.getSafeCommitInfo` is a `TODO:: with index deleter` stub that hangs `refresh=wait_for` until the test framework request timeout (~60s). `integ-test/build.gradle` forwards the property to `:integTestRemote` so the gradle command line is the single knob. Default behavior is unchanged — with the flag unset, every test-created index is Lucene-backed and every IT runs through the existing V2 / Calcite path exactly as before. 
Signed-off-by: Kai Huang --- integ-test/build.gradle | 8 ++++ .../org/opensearch/sql/legacy/TestUtils.java | 47 ++++++++++++++++++- .../opensearch/sql/ppl/PPLIntegTestCase.java | 15 ++++++ 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/integ-test/build.gradle b/integ-test/build.gradle index 06e6478f5fd..70549c99a7f 100644 --- a/integ-test/build.gradle +++ b/integ-test/build.gradle @@ -849,6 +849,14 @@ task integTestRemote(type: RestIntegTestTask) { systemProperty "user", System.getProperty("user") systemProperty "password", System.getProperty("password") + // Forward the analytics-engine parquet-indices toggle when set on the gradle command + // line. TestUtils.createIndexByRestClient reads this to back every test-created index + // with composite/parquet so RestUnifiedQueryAction.isAnalyticsIndex (post-#5432) routes + // to the analytics-engine planner via index settings. + if (System.getProperty("tests.analytics.parquet_indices") != null) { + systemProperty 'tests.analytics.parquet_indices', System.getProperty("tests.analytics.parquet_indices") + } + // Set default query size limit systemProperty 'defaultQuerySizeLimit', '10000' diff --git a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java index 16827623058..f6e8bf3e282 100644 --- a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java +++ b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java @@ -37,6 +37,18 @@ public class TestUtils { private static final String MAPPING_FILE_PATH = "src/test/resources/indexDefinitions/"; + /** + * System property that makes every test-created index parquet-backed (composite primary data + * format = parquet) with a single shard. 
When set, {@link + * RestUnifiedQueryAction#isAnalyticsIndex} (which routes based on {@code + * index.pluggable.dataformat.enabled} / {@code index.pluggable.dataformat=composite}, see #5432) + * will return {@code true} for every test-created index — exercising the analytics-engine route + * end-to-end without per-test rewiring. + * + *
<p>
Off by default; normal CI is untouched. + */ + public static final String ANALYTICS_PARQUET_INDICES_PROP = "tests.analytics.parquet_indices"; + /** * Create test index by REST client. * @@ -48,6 +60,9 @@ public static void createIndexByRestClient(RestClient client, String indexName, Request request = new Request("PUT", "/" + indexName); JSONObject jsonObject = isNullOrEmpty(mapping) ? new JSONObject() : new JSONObject(mapping); setZeroReplicas(jsonObject); + if (Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false"))) { + makeParquetBacked(jsonObject); + } request.setJsonEntity(jsonObject.toString()); performRequest(client, request); } @@ -69,6 +84,25 @@ private static void setZeroReplicas(JSONObject jsonObject) { jsonObject.put("settings", settings); } + /** + * Switches the test index to a parquet-backed composite store with a single shard so the + * analytics-engine path has a backend that can scan it. Routing is then driven entirely by index + * settings (#5432) — no other test plumbing required. + */ + private static void makeParquetBacked(JSONObject jsonObject) { + JSONObject settings = + jsonObject.has("settings") ? jsonObject.getJSONObject("settings") : new JSONObject(); + JSONObject indexSettings = + settings.has("index") ? settings.getJSONObject("index") : new JSONObject(); + indexSettings.put("number_of_shards", 1); + indexSettings.put("pluggable.dataformat.enabled", true); + indexSettings.put("pluggable.dataformat", "composite"); + indexSettings.put("composite.primary_data_format", "parquet"); + indexSettings.put("composite.secondary_data_formats", new org.json.JSONArray()); + settings.put("index", indexSettings); + jsonObject.put("settings", settings); + } + /** * https://github.com/elastic/elasticsearch/pull/49959
* Deprecate creation of dot-prefixed index names except for hidden and system indices. Create @@ -116,8 +150,17 @@ public static boolean isIndexExist(RestClient client, String indexName) { public static void loadDataByRestClient( RestClient client, String indexName, String dataSetFilePath) throws IOException { Path path = Paths.get(getResourceFilePath(dataSetFilePath)); - Request request = - new Request("POST", "/" + indexName + "/_bulk?refresh=wait_for&wait_for_active_shards=all"); + // Workaround: parquet-backed indices in the analytics-backend-lucene composite engine + // do not yet implement LuceneCommitter.getSafeCommitInfo (UnsupportedOperationException + // "TODO:: with index deleter"), which hangs refresh=wait_for until the test framework + // request timeout (~60s). Force-refresh sidesteps the safe-commit-info path while still + // making the bulk-loaded docs immediately searchable. Drop this branch once + // LuceneCommitter.getSafeCommitInfo is implemented. + String refreshParam = + Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false")) + ? "refresh=true" + : "refresh=wait_for&wait_for_active_shards=all"; + Request request = new Request("POST", "/" + indexName + "/_bulk?" 
+ refreshParam); request.setJsonEntity(new String(Files.readAllBytes(path))); performRequest(client, request); } diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java index 1a20b42751e..238bc0f68b2 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java @@ -30,6 +30,7 @@ import org.opensearch.sql.common.setting.Settings; import org.opensearch.sql.common.setting.Settings.Key; import org.opensearch.sql.legacy.SQLIntegTestCase; +import org.opensearch.sql.legacy.TestUtils; import org.opensearch.sql.protocol.response.format.Format; import org.opensearch.sql.util.RetryProcessor; @@ -49,6 +50,20 @@ protected void init() throws Exception { disableCalcite(); // calcite is enabled by default from 3.3.0 } + /** + * Returns {@code true} when the suite was started with {@code + * -Dtests.analytics.parquet_indices=true}. Use this to branch test assertions that depend on the + * execution backend — when this flag is on, every test-created index is composite/parquet, which + * makes {@code RestUnifiedQueryAction.isAnalyticsIndex} (post-#5432) route every query to the + * analytics-engine backend (DataFusion) instead of the Calcite enumerable / DSL-pushdown backend. + * DataFusion follows different ordering and null-bucket semantics than the legacy V2 and + * Calcite-DSL paths. 
+ */ + public static boolean isAnalyticsParquetIndicesEnabled() { + return Boolean.parseBoolean( + System.getProperty(TestUtils.ANALYTICS_PARQUET_INDICES_PROP, "false")); + } + protected JSONObject executeQuery(String query) throws IOException { return jsonify(executeQueryToString(query)); } From d3782dcc66b64a36c78f0b1ef6cbb1f2469ca818 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 12 May 2026 14:43:41 -0700 Subject: [PATCH 2/4] Branch 7 stats tests on isAnalyticsParquetIndicesEnabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When -Dtests.analytics.parquet_indices=true is set, every test-created index is parquet-backed and RestUnifiedQueryAction.isAnalyticsIndex routes all queries through the analytics-engine backend (DataFusion). Seven tests need analytics-specific assertions because DataFusion follows different null-bucket, SQL-spec, and TDigest interpolation semantics than the legacy V2 / Calcite-DSL paths: SQL-spec semantics (DataFusion follows the spec; legacy DSL returns 0): - testSumWithNull — SUM of all-null returns null on analytics; the existing isPushdownDisabled branch already handles the Calcite-no- pushdown case. OR analytics into that branch. - testStatsPercentileByNullValue + NonNullBucket — percentile of an all-null / empty group returns null on analytics. Same OR pattern. Non-deterministic head ordering: - testStatsWithLimit Q1+Q2 — head 5 over a 6-bucket result picks a different null-balance row than the legacy / Calcite-DSL path; same effect cascades into `head 2 from 1`. Reuse the size-only assertion branch already present for the no-pushdown case. TDigest interpolation divergence (genuine impl difference): - testStatsPercentileWithNull — DataFusion TDigest interpolates p50 to 35413 vs OpenSearch's 39225; both within compression-bound error. Per-engine expected value. - testStatsPercentileBySpan — same TDigest diff on the age=30 bucket (33194 vs 39225). 
Coordinated with opensearch-project/OpenSearch#21584 commit 5 which fixes the upstream planner-side type mismatch so the query reaches the data assertion at all. Semantic divergence pending team decision: - testDisableLegacyPreferred — under PPL_SYNTAX_LEGACY_PREFERRED=false, V2 / Calcite-DSL drop the null-age bucket (5 rows) while DataFusion keeps it (6 rows). Skipped on the analytics path via Assume.assumeFalse until the team decides which behaviour is the intended new-syntax semantics. Out of scope for this PR (intentionally left failing on the analytics path so the gap is visible in CI): - testStatsBySpanTimeWithNullBucket — span(@timestamp, 12h) multi-unit time interval unsupported in current SpanAdapter (only interval=1 is rewritten to DATE_TRUNC; multi-unit needs DataFusion's date_bin). Signed-off-by: Kai Huang --- .../opensearch/sql/ppl/StatsCommandIT.java | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java index b339f3f8023..229b05ac086 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java @@ -309,7 +309,10 @@ public void testStatsWithLimit() throws IOException { verifySchema(response, schema("a", null, "double"), schema("age", null, "int")); // If push down disabled, the final results will no longer be stable. In DSL, the order is // guaranteed because we always sort by bucket field, while we don't add sort in the plan. - if (!isPushdownDisabled()) { + // The analytics-engine backend (DataFusion) is also non-deterministic — its hash-bucket + // order picks a different null-balance row (e.g. (null,36) instead of (null,null)) for + // `head 5`. 
+ if (!isPushdownDisabled() && !isAnalyticsParquetIndicesEnabled()) { verifyDataRows( response, rows(null, null), @@ -327,7 +330,8 @@ public void testStatsWithLimit() throws IOException { "source=%s | stats avg(balance) as a by age | head 5 | head 2 from 1", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("a", null, "double"), schema("age", null, "int")); - if (!isPushdownDisabled()) { + // Same non-deterministic head-window issue as the preceding `head 5` block. + if (!isPushdownDisabled() && !isAnalyticsParquetIndicesEnabled()) { verifyDataRows(response, rows(32838D, 28), rows(39225D, 32)); } else { assert ((Integer) response.get("size") == 2); @@ -508,7 +512,9 @@ public void testSumWithNull() throws IOException { // TODO: Fix -- temporary workaround for the pushdown issue: // The current pushdown implementation will return 0 for sum when getting null values as input. // Returning null should be the expected behavior. - Integer expectedValue = isPushdownDisabled() ? null : 0; + // The analytics-engine backend (DataFusion) follows the SQL spec like Calcite-no-pushdown — + // SUM of all-null is null, not 0. + Integer expectedValue = (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows(response, rows(expectedValue)); } @@ -635,7 +641,11 @@ public void testStatsPercentileWithNull() throws IOException { String.format( "source=%s | stats percentile(balance, 50)", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("percentile(balance, 50)", null, "bigint")); - verifyDataRows(response, rows(39225)); + // DataFusion's TDigest interpolation produces a different median value for the same + // input than OpenSearch's TDigest implementation. Both are valid approximations within + // TDigest's compression-bound error. Until DataFusion's UDAF is replaced with one + // matching OpenSearch's TDigest (or vice versa), record the per-engine values. + verifyDataRows(response, rows(isAnalyticsParquetIndicesEnabled() ? 
35413 : 39225)); } @Test @@ -674,14 +684,18 @@ public void testStatsPercentileByNullValue() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); + // DataFusion follows SQL spec like Calcite-no-pushdown — percentile of an all-null + // group is null, not 0 (the latter is a legacy DSL pushdown convention). + Integer emptyGroupPercentile = + (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, - rows(isPushdownDisabled() ? null : 0, null), + rows(emptyGroupPercentile, null), rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(isPushdownDisabled() ? null : 0, 36)); + rows(emptyGroupPercentile, 36)); } @Test @@ -692,13 +706,15 @@ public void testStatsPercentileByNullValueNonNullBucket() throws IOException { "source=%s | stats bucket_nullable=false percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); + Integer emptyGroupPercentile = + (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(isPushdownDisabled() ? null : 0, 36)); + rows(emptyGroupPercentile, 36)); } @Test @@ -709,7 +725,14 @@ public void testStatsPercentileBySpan() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by span(age, 10) as age_bucket", TEST_INDEX_BANK)); verifySchema(response, schema("p50", null, "bigint"), schema("age_bucket", null, "int")); - verifyDataRows(response, rows(32838, 20), rows(39225, 30)); + // Same per-engine TDigest interpolation divergence as testStatsPercentileWithNull — + // the age=30 bucket's p50 differs between OpenSearch's TDigest (39225) and DataFusion's + // (33194). The age=20 bucket happens to coincide. 
+ if (isAnalyticsParquetIndicesEnabled()) { + verifyDataRows(response, rows(32838, 20), rows(33194, 30)); + } else { + verifyDataRows(response, rows(32838, 20), rows(39225, 30)); + } } @Test @@ -729,6 +752,15 @@ public void testDisableLegacyPreferred() throws IOException { throw new RuntimeException(e); } verifySchema(response, schema("a", null, "double"), schema("age", null, "int")); + // Skip on the analytics-engine backend: PPL_SYNTAX_LEGACY_PREFERRED=false is + // expected to give the same shape across backends, but v2 / Calcite-DSL drop + // the null-age bucket (5 rows) while DataFusion keeps it (6 rows). Deciding + // which is correct is a semantic question for the team; until that's resolved, + // exercising this test on the analytics path doesn't tell us anything useful. + org.junit.Assume.assumeFalse( + "PPL_SYNTAX_LEGACY_PREFERRED=false null-bucket semantics not yet aligned" + + " between V2 / Calcite-DSL and analytics-engine backends", + isAnalyticsParquetIndicesEnabled()); verifyDataRows( response, rows(32838D, 28), From b1c850297cff654d7b0f27bb35b7c49017314dfc Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 14 May 2026 12:17:32 -0700 Subject: [PATCH 3/4] Consolidate analytics parquet config into AnalyticsIndexConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @dai-chen's review feedback — the index-creation settings, the bulk-load refresh strategy, and the system-property gate were spread across three places in TestUtils (the `ANALYTICS_PARQUET_INDICES_PROP` constant, the inline check in `createIndexByRestClient`, the inline check in `loadDataByRestClient`, and the private `makeParquetBacked` helper). That's the same spread-out-conditional pattern the OS-Spark repo's AOSS configs ended up with. Collect everything into a single `TestUtils.AnalyticsIndexConfig` nested helper: * `ENABLED_PROP` — the system property name. * `isEnabled()` — single source for the gate. 
* `applyIndexCreationSettings(jsonObject)` — injects the parquet-backed composite settings; no-op when disabled. * `bulkLoadRefreshParam()` — returns the right `_bulk` refresh query string for the active index type. `createIndexByRestClient` and `loadDataByRestClient` call these unconditionally — the methods themselves short-circuit when the config is off — so the parquet-specific branching is gone from the helpers and concentrated in one place. `PPLIntegTestCase.isAnalyticsParquetIndicesEnabled()` now delegates to `AnalyticsIndexConfig.isEnabled()`. The legacy `ANALYTICS_PARQUET_INDICES_PROP` constant is kept as a forwarding alias for any external callers. Verified end-to-end against the analytics-engine cluster — same 50 / 63 pass + 1 skip + 12 fail as the pre-refactor run, so the consolidation is behavior-preserving. Signed-off-by: Kai Huang Signed-off-by: Kai Huang --- .../org/opensearch/sql/legacy/TestUtils.java | 109 +++++++++++------- .../opensearch/sql/ppl/PPLIntegTestCase.java | 3 +- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java index f6e8bf3e282..4deef787910 100644 --- a/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java +++ b/integ-test/src/test/java/org/opensearch/sql/legacy/TestUtils.java @@ -38,16 +38,72 @@ public class TestUtils { private static final String MAPPING_FILE_PATH = "src/test/resources/indexDefinitions/"; /** - * System property that makes every test-created index parquet-backed (composite primary data - * format = parquet) with a single shard. When set, {@link - * RestUnifiedQueryAction#isAnalyticsIndex} (which routes based on {@code - * index.pluggable.dataformat.enabled} / {@code index.pluggable.dataformat=composite}, see #5432) - * will return {@code true} for every test-created index — exercising the analytics-engine route - * end-to-end without per-test rewiring. 
+ * Forwarding alias for {@link AnalyticsIndexConfig#ENABLED_PROP}. Kept for any external callers + * that referenced the old constant location. + */ + public static final String ANALYTICS_PARQUET_INDICES_PROP = AnalyticsIndexConfig.ENABLED_PROP; + + /** + * Centralizes every analytics-engine-only test-index knob in one place so the parquet-backed + * settings, the bulk-load refresh strategy, and any future analytics-specific config don't drift + * across separate helpers (the spread-out-conditional pattern called out in review, borrowed from + * the OS-Spark repo's AOSS-conditional configs that ended up scattered across its codebase). * - *
<p>
Off by default; normal CI is untouched. + *
<p>
When {@link #ENABLED_PROP} is unset (default), every method here is a no-op or returns the + * standard non-parquet value, so normal CI sees zero behavior change. */ - public static final String ANALYTICS_PARQUET_INDICES_PROP = "tests.analytics.parquet_indices"; + public static final class AnalyticsIndexConfig { + + /** + * System property that makes every test-created index parquet-backed (composite primary data + * format = parquet) with a single shard. When set, {@link + * RestUnifiedQueryAction#isAnalyticsIndex} (which routes based on {@code + * index.pluggable.dataformat.enabled} / {@code index.pluggable.dataformat=composite}, see + * #5432) returns {@code true} for every test-created index — exercising the analytics-engine + * route end-to-end without per-test rewiring. + */ + public static final String ENABLED_PROP = "tests.analytics.parquet_indices"; + + public static boolean isEnabled() { + return Boolean.parseBoolean(System.getProperty(ENABLED_PROP, "false")); + } + + /** + * Inject the parquet-backed composite-store index settings into {@code jsonObject}. No-op when + * the config is disabled; idempotent — safe on any index-creation JSON shape. + */ + static void applyIndexCreationSettings(JSONObject jsonObject) { + if (!isEnabled()) { + return; + } + JSONObject settings = + jsonObject.has("settings") ? jsonObject.getJSONObject("settings") : new JSONObject(); + JSONObject indexSettings = + settings.has("index") ? settings.getJSONObject("index") : new JSONObject(); + indexSettings.put("number_of_shards", 1); + indexSettings.put("pluggable.dataformat.enabled", true); + indexSettings.put("pluggable.dataformat", "composite"); + indexSettings.put("composite.primary_data_format", "parquet"); + indexSettings.put("composite.secondary_data_formats", new org.json.JSONArray()); + settings.put("index", indexSettings); + jsonObject.put("settings", settings); + } + + /** + * Returns the {@code _bulk} refresh query string for the current index type. 
Parquet-backed + * indices in the analytics-backend-lucene composite engine don't yet implement {@code + * LuceneCommitter.getSafeCommitInfo} (UnsupportedOperationException "TODO:: with index + * deleter"), which hangs {@code refresh=wait_for} until the test framework request timeout + * (~60s). Force-refresh sidesteps the safe-commit-info path while still making the bulk-loaded + * docs immediately searchable. Drop this branch once {@code LuceneCommitter.getSafeCommitInfo} + * is implemented. + */ + static String bulkLoadRefreshParam() { + return isEnabled() ? "refresh=true" : "refresh=wait_for&wait_for_active_shards=all"; + } + + private AnalyticsIndexConfig() {} + } /** * Create test index by REST client. @@ -60,9 +116,7 @@ public static void createIndexByRestClient(RestClient client, String indexName, Request request = new Request("PUT", "/" + indexName); JSONObject jsonObject = isNullOrEmpty(mapping) ? new JSONObject() : new JSONObject(mapping); setZeroReplicas(jsonObject); - if (Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false"))) { - makeParquetBacked(jsonObject); - } + AnalyticsIndexConfig.applyIndexCreationSettings(jsonObject); request.setJsonEntity(jsonObject.toString()); performRequest(client, request); } @@ -84,25 +138,6 @@ private static void setZeroReplicas(JSONObject jsonObject) { jsonObject.put("settings", settings); } - /** - * Switches the test index to a parquet-backed composite store with a single shard so the - * analytics-engine path has a backend that can scan it. Routing is then driven entirely by index - * settings (#5432) — no other test plumbing required. - */ - private static void makeParquetBacked(JSONObject jsonObject) { - JSONObject settings = - jsonObject.has("settings") ? jsonObject.getJSONObject("settings") : new JSONObject(); - JSONObject indexSettings = - settings.has("index") ? 
settings.getJSONObject("index") : new JSONObject(); - indexSettings.put("number_of_shards", 1); - indexSettings.put("pluggable.dataformat.enabled", true); - indexSettings.put("pluggable.dataformat", "composite"); - indexSettings.put("composite.primary_data_format", "parquet"); - indexSettings.put("composite.secondary_data_formats", new org.json.JSONArray()); - settings.put("index", indexSettings); - jsonObject.put("settings", settings); - } - /** * https://github.com/elastic/elasticsearch/pull/49959
* Deprecate creation of dot-prefixed index names except for hidden and system indices. Create @@ -150,17 +185,9 @@ public static boolean isIndexExist(RestClient client, String indexName) { public static void loadDataByRestClient( RestClient client, String indexName, String dataSetFilePath) throws IOException { Path path = Paths.get(getResourceFilePath(dataSetFilePath)); - // Workaround: parquet-backed indices in the analytics-backend-lucene composite engine - // do not yet implement LuceneCommitter.getSafeCommitInfo (UnsupportedOperationException - // "TODO:: with index deleter"), which hangs refresh=wait_for until the test framework - // request timeout (~60s). Force-refresh sidesteps the safe-commit-info path while still - // making the bulk-loaded docs immediately searchable. Drop this branch once - // LuceneCommitter.getSafeCommitInfo is implemented. - String refreshParam = - Boolean.parseBoolean(System.getProperty(ANALYTICS_PARQUET_INDICES_PROP, "false")) - ? "refresh=true" - : "refresh=wait_for&wait_for_active_shards=all"; - Request request = new Request("POST", "/" + indexName + "/_bulk?" + refreshParam); + Request request = + new Request( + "POST", "/" + indexName + "/_bulk?" + AnalyticsIndexConfig.bulkLoadRefreshParam()); request.setJsonEntity(new String(Files.readAllBytes(path))); performRequest(client, request); } diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java index 238bc0f68b2..6c25c415af1 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/PPLIntegTestCase.java @@ -60,8 +60,7 @@ protected void init() throws Exception { * Calcite-DSL paths. 
*/ public static boolean isAnalyticsParquetIndicesEnabled() { - return Boolean.parseBoolean( - System.getProperty(TestUtils.ANALYTICS_PARQUET_INDICES_PROP, "false")); + return TestUtils.AnalyticsIndexConfig.isEnabled(); } protected JSONObject executeQuery(String query) throws IOException { From efa97dee4d7939f0ba28287b71c4a93a02350ee4 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 14 May 2026 14:03:55 -0700 Subject: [PATCH 4/4] Revert per-engine assertions on percentile tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @dai-chen — since the companion OpenSearch PR (#21584) no longer wires `percentile_approx` on the analytics path (Sandesh's "repurpose for SPAN" scope), all `percentile(...)` queries fail at execution before reaching the assertion. The per-engine branches that this PR introduced for `testStatsPercentile{WithNull,BySpan,ByNullValue, ByNullValueNonNullBucket}` were doing nothing in practice — the tests fail regardless. Reverts those four tests to their pre-PR state. The three remaining intentional changes stand: * `testStatsWithLimit` Q1+Q2 — non-deterministic head ordering on DataFusion hash-bucket order; size-only branch on the analytics path. * `testSumWithNull` — extends the existing `isPushdownDisabled() ? null : 0` pattern. SUM works on the analytics path and DataFusion returns null per SQL spec. * `testDisableLegacyPreferred` — `Assume.assumeFalse` skip for the pending-team-decision null-bucket semantic divergence. 
Signed-off-by: Kai Huang Signed-off-by: Kai Huang --- .../opensearch/sql/ppl/StatsCommandIT.java | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java index 229b05ac086..c3166dd6911 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java @@ -641,11 +641,7 @@ public void testStatsPercentileWithNull() throws IOException { String.format( "source=%s | stats percentile(balance, 50)", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("percentile(balance, 50)", null, "bigint")); - // DataFusion's TDigest interpolation produces a different median value for the same - // input than OpenSearch's TDigest implementation. Both are valid approximations within - // TDigest's compression-bound error. Until DataFusion's UDAF is replaced with one - // matching OpenSearch's TDigest (or vice versa), record the per-engine values. - verifyDataRows(response, rows(isAnalyticsParquetIndicesEnabled() ? 35413 : 39225)); + verifyDataRows(response, rows(39225)); } @Test @@ -684,18 +680,14 @@ public void testStatsPercentileByNullValue() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); - // DataFusion follows SQL spec like Calcite-no-pushdown — percentile of an all-null - // group is null, not 0 (the latter is a legacy DSL pushdown convention). - Integer emptyGroupPercentile = - (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, - rows(emptyGroupPercentile, null), + rows(isPushdownDisabled() ? 
null : 0, null), rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(emptyGroupPercentile, 36)); + rows(isPushdownDisabled() ? null : 0, 36)); } @Test @@ -706,15 +698,13 @@ public void testStatsPercentileByNullValueNonNullBucket() throws IOException { "source=%s | stats bucket_nullable=false percentile(balance, 50) as p50 by age", TEST_INDEX_BANK_WITH_NULL_VALUES)); verifySchema(response, schema("p50", null, "bigint"), schema("age", null, "int")); - Integer emptyGroupPercentile = - (isPushdownDisabled() || isAnalyticsParquetIndicesEnabled()) ? null : 0; verifyDataRows( response, rows(32838, 28), rows(39225, 32), rows(4180, 33), rows(48086, 34), - rows(emptyGroupPercentile, 36)); + rows(isPushdownDisabled() ? null : 0, 36)); } @Test @@ -725,14 +715,7 @@ public void testStatsPercentileBySpan() throws IOException { "source=%s | stats percentile(balance, 50) as p50 by span(age, 10) as age_bucket", TEST_INDEX_BANK)); verifySchema(response, schema("p50", null, "bigint"), schema("age_bucket", null, "int")); - // Same per-engine TDigest interpolation divergence as testStatsPercentileWithNull — - // the age=30 bucket's p50 differs between OpenSearch's TDigest (39225) and DataFusion's - // (33194). The age=20 bucket happens to coincide. - if (isAnalyticsParquetIndicesEnabled()) { - verifyDataRows(response, rows(32838, 20), rows(33194, 30)); - } else { - verifyDataRows(response, rows(32838, 20), rows(39225, 30)); - } + verifyDataRows(response, rows(32838, 20), rows(39225, 30)); } @Test