From f1ef6d0f9412c45147205150a4e952e320794235 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Wed, 13 May 2026 10:55:25 +0000 Subject: [PATCH 1/3] [PPL] Lower isempty/isblank to CHAR_LENGTH = 0 instead of multiset IS_EMPTY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PPLFuncImpTable was emitting SqlStdOperatorTable.IS_EMPTY against string operands as part of its isempty(x) / isblank(x) lowering. That operator is the SQL:2003 multiset emptiness predicate — its OperandTypeChecker is COLLECTION_OR_MAP and its enumerable-runtime binding calls java.util.Collection.isEmpty() reflectively. Passing a string slipped through only because RexBuilder.makeCall bypasses the operand checker and Calcite's Linq4j codegen emits a bare `target.isEmpty()` that happens to bind to String.isEmpty() at Janino compile time. Outside of the script-pushdown path, the abuse breaks: any backend that does not run RexNodes through Calcite's enumerable codegen — most notably Substrait emitters such as the analytics-engine route — has no isthmus mapping for SqlStdOperatorTable.IS_EMPTY and rejects the plan with "unresolved function reference IS EMPTY". Replace the lowering with the predicate that actually matches the doc: isempty(x) -> OR(IS_NULL(x), CHAR_LENGTH(x) = 0) isblank(x) -> OR(IS_NULL(x), CHAR_LENGTH(TRIM(BOTH ' ' FROM x)) = 0) CHAR_LENGTH is a standard string operator with explicit type semantics, backed by SqlFunctions.charLength(String) in Calcite's enumerable runtime, and it round-trips through Substrait's default extension catalog. Same observable behavior on existing PPL paths; correct behavior on Substrait-based paths. Also remove PredicateAnalyzer.containIsEmptyFunction. It existed solely to abort DSL pushdown when it saw the OR(IS_NULL, IS_EMPTY) shape because the SHOULD branches would NPE on null operands evaluating .isEmpty() through the script. The new shape produces OR(IS_NULL, CHAR_LENGTH = 0); the IS_NULL arm pushes down to must_not.exists, the CHAR_LENGTH arm to a script that returns null (not NPE) for null fields, and the SHOULD union answers correctly. Test plan: - CalcitePPLConditionBuiltinFunctionIT.testIsEmpty / testIsBlank pass. - CalciteExplainIT.testExplainIsEmpty / testExplainIsBlank / testExplainIsEmptyOrOthers pass with regenerated golden plans for both pushdown-enabled and no-pushdown variants. - PredicateAnalyzerTest passes, including the unrelated equals_scriptPushDown_Struct test that still uses SqlStdOperatorTable.IS_EMPTY for its legitimate map-emptiness semantics. Signed-off-by: Songkan Tang --- .../expression/function/PPLFuncImpTable.java | 31 +++++++++++++++---- .../calcite/explain_isblank.yaml | 5 +-- .../calcite/explain_isempty.yaml | 5 +-- .../calcite/explain_isempty_or_others.yaml | 5 +-- .../calcite_no_pushdown/explain_isblank.yaml | 4 +-- .../calcite_no_pushdown/explain_isempty.yaml | 4 +-- .../explain_isempty_or_others.yaml | 4 +-- .../opensearch/request/PredicateAnalyzer.java | 17 ---------- .../request/PredicateAnalyzerTest.java | 4 ++- 9 files changed, 43 insertions(+), 36 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index 849c60fe4eb..54371d84b79 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -1260,6 +1260,19 @@ void populate() { builder.makeNullLiteral(arg1.getType()), arg1), PPLTypeChecker.wrapComparable((SameOperandTypeChecker) OperandTypes.SAME_SAME)); + // PPL isempty(x) — TRUE iff x is NULL or an empty string. We express this as + // OR(IS_NULL(x), CHAR_LENGTH(x) = 0) + // rather than reusing SqlStdOperatorTable.IS_EMPTY: the latter is the SQL:2003 + // multiset/collection IS EMPTY predicate (its OperandTypeChecker is + // OperandTypes.COLLECTION_OR_MAP and its enumerable runtime calls + // java.util.Collection.isEmpty() reflectively). Passing a string operand only + // worked by coincidence — RexBuilder.makeCall bypasses the operand checker, and + // Calcite's enumerable codegen emits a bare `target.isEmpty()` call that happens + // to bind to String.isEmpty() at Janino compile time. The CHAR_LENGTH form makes + // the string semantics explicit, lets every backend translate the predicate + // through their normal length / equality bindings, and works on any code path + // that doesn't go through Calcite's enumerable runtime (e.g. Substrait emission + // for analytics-engine, which has no IS EMPTY mapping). register( IS_EMPTY, (FunctionImp1) @@ -1267,7 +1280,10 @@ void populate() { builder.makeCall( SqlStdOperatorTable.OR, builder.makeCall(SqlStdOperatorTable.IS_NULL, arg), - builder.makeCall(SqlStdOperatorTable.IS_EMPTY, arg)), + builder.makeCall( + SqlStdOperatorTable.EQUALS, + builder.makeCall(SqlStdOperatorTable.CHAR_LENGTH, arg), + builder.makeExactLiteral(BigDecimal.ZERO))), PPLTypeChecker.family(SqlTypeFamily.ANY)); register( IS_BLANK, @@ -1277,12 +1293,15 @@ void populate() { SqlStdOperatorTable.OR, builder.makeCall(SqlStdOperatorTable.IS_NULL, arg), builder.makeCall( - SqlStdOperatorTable.IS_EMPTY, + SqlStdOperatorTable.EQUALS, builder.makeCall( - SqlStdOperatorTable.TRIM, - builder.makeFlag(Flag.BOTH), - builder.makeLiteral(" "), - arg))), + SqlStdOperatorTable.CHAR_LENGTH, + builder.makeCall( + SqlStdOperatorTable.TRIM, + builder.makeFlag(Flag.BOTH), + builder.makeLiteral(" "), + arg)), + builder.makeExactLiteral(BigDecimal.ZERO))), PPLTypeChecker.family(SqlTypeFamily.ANY)); register( ILIKE, diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_isblank.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_isblank.yaml index 642d64b4236..4f2c524ba07 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_isblank.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_isblank.yaml @@ -2,7 +2,8 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10]) - LogicalFilter(condition=[OR(IS NULL($1), IS EMPTY(TRIM(FLAG(BOTH), ' ', $1)))]) + LogicalFilter(condition=[OR(IS NULL($1), =(CHAR_LENGTH(TRIM(FLAG(BOTH), ' ', $1)), 0))]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | - CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], SCRIPT->OR(IS NULL($1), IS EMPTY(TRIM(FLAG(BOTH), ' ', $1))), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"script":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXQFHHsKICAib3AiOiB7CiAgICAibmFtZSI6ICJPUiIsCiAgICAia2luZCI6ICJPUiIsCiAgICAic3ludGF4IjogIkJJTkFSWSIKICB9LAogICJvcGVyYW5kcyI6IFsKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIklTIE5VTEwiLAogICAgICAgICJraW5kIjogIklTX05VTEwiLAogICAgICAgICJzeW50YXgiOiAiUE9TVEZJWCIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJkeW5hbWljUGFyYW0iOiAwLAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0sCiAgICB7CiAgICAgICJvcCI6IHsKICAgICAgICAibmFtZSI6ICJJUyBFTVBUWSIsCiAgICAgICAgImtpbmQiOiAiT1RIRVIiLAogICAgICAgICJzeW50YXgiOiAiUE9TVEZJWCIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJvcCI6IHsKICAgICAgICAgICAgIm5hbWUiOiAiVFJJTSIsCiAgICAgICAgICAgICJraW5kIjogIlRSSU0iLAogICAgICAgICAgICAic3ludGF4IjogIkZVTkNUSU9OIgogICAgICAgICAgfSwKICAgICAgICAgICJvcGVyYW5kcyI6IFsKICAgICAgICAgICAgewogICAgICAgICAgICAgICJsaXRlcmFsIjogIkJPVEgiLAogICAgICAgICAgICAgICJ0eXBlIjogewogICAgICAgICAgICAgICAgInR5cGUiOiAiU1lNQk9MIiwKICAgICAgICAgICAgICAgICJudWxsYWJsZSI6IGZhbHNlCiAgICAgICAgICAgICAgfQogICAgICAgICAgICB9LAogICAgICAgICAgICB7CiAgICAgICAgICAgICAgImR5bmFtaWNQYXJhbSI6IDEsCiAgICAgICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgICAgICAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgICAgICB9CiAgICAgICAgICAgIH0sCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAiZHluYW1pY1BhcmFtIjogMiwKICAgICAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgICAgICAgICAgICJwcmVjaXNpb24iOiAtMQogICAgICAgICAgICAgIH0KICAgICAgICAgICAgfQogICAgICAgICAgXQogICAgICAgIH0KICAgICAgXQogICAgfQogIF0KfQ==\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp": 0,"SOURCES":[0,2,0],"DIGESTS":["firstname.keyword"," ","firstname.keyword"]}},"boost":1.0}},"_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], SCRIPT->OR(IS NULL($1), =(CHAR_LENGTH(TRIM(FLAG(BOTH), ' ', $1)), 0)), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"bool":{"should":[{"bool":{"must_not":[{"exists":{"field":"firstname","boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},{"script":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXQEXnsKICAib3AiOiB7CiAgICAibmFtZSI6ICI9IiwKICAgICJraW5kIjogIkVRVUFMUyIsCiAgICAic3ludGF4IjogIkJJTkFSWSIKICB9LAogICJvcGVyYW5kcyI6IFsKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIkNIQVJfTEVOR1RIIiwKICAgICAgICAia2luZCI6ICJDSEFSX0xFTkdUSCIsCiAgICAgICAgInN5bnRheCI6ICJGVU5DVElPTiIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJvcCI6IHsKICAgICAgICAgICAgIm5hbWUiOiAiVFJJTSIsCiAgICAgICAgICAgICJraW5kIjogIlRSSU0iLAogICAgICAgICAgICAic3ludGF4IjogIkZVTkNUSU9OIgogICAgICAgICAgfSwKICAgICAgICAgICJvcGVyYW5kcyI6IFsKICAgICAgICAgICAgewogICAgICAgICAgICAgICJsaXRlcmFsIjogIkJPVEgiLAogICAgICAgICAgICAgICJ0eXBlIjogewogICAgICAgICAgICAgICAgInR5cGUiOiAiU1lNQk9MIiwKICAgICAgICAgICAgICAgICJudWxsYWJsZSI6IGZhbHNlCiAgICAgICAgICAgICAgfQogICAgICAgICAgICB9LAogICAgICAgICAgICB7CiAgICAgICAgICAgICAgImR5bmFtaWNQYXJhbSI6IDAsCiAgICAgICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgICAgICAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgICAgICB9CiAgICAgICAgICAgIH0sCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAiZHluYW1pY1BhcmFtIjogMSwKICAgICAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgICAgICAgICAgICJwcmVjaXNpb24iOiAtMQogICAgICAgICAgICAgIH0KICAgICAgICAgICAgfQogICAgICAgICAgXQogICAgICAgIH0KICAgICAgXQogICAgfSwKICAgIHsKICAgICAgImR5bmFtaWNQYXJhbSI6IDIsCiAgICAgICJ0eXBlIjogewogICAgICAgICJ0eXBlIjogIkJJR0lOVCIsCiAgICAgICAgIm51bGxhYmxlIjogdHJ1ZQogICAgICB9CiAgICB9CiAgXQp9\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp":1778662429619956162,"SOURCES":[2,0,2],"DIGESTS":[" ","firstname.keyword",0]}},"boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},"_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) + diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty.yaml index c4f104e6462..f997ead5604 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty.yaml @@ -2,7 +2,8 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10]) - LogicalFilter(condition=[OR(IS NULL($1), IS EMPTY($1))]) + LogicalFilter(condition=[OR(IS NULL($1), =(CHAR_LENGTH($1), 0))]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | - CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], SCRIPT->OR(IS NULL($1), IS EMPTY($1)), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"script":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXQC13sKICAib3AiOiB7CiAgICAibmFtZSI6ICJPUiIsCiAgICAia2luZCI6ICJPUiIsCiAgICAic3ludGF4IjogIkJJTkFSWSIKICB9LAogICJvcGVyYW5kcyI6IFsKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIklTIE5VTEwiLAogICAgICAgICJraW5kIjogIklTX05VTEwiLAogICAgICAgICJzeW50YXgiOiAiUE9TVEZJWCIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJkeW5hbWljUGFyYW0iOiAwLAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0sCiAgICB7CiAgICAgICJvcCI6IHsKICAgICAgICAibmFtZSI6ICJJUyBFTVBUWSIsCiAgICAgICAgImtpbmQiOiAiT1RIRVIiLAogICAgICAgICJzeW50YXgiOiAiUE9TVEZJWCIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJkeW5hbWljUGFyYW0iOiAxLAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0KICBdCn0=\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp": 0,"SOURCES":[0,0],"DIGESTS":["firstname.keyword","firstname.keyword"]}},"boost":1.0}},"_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], SCRIPT->OR(IS NULL($1), =(CHAR_LENGTH($1), 0)), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"bool":{"should":[{"bool":{"must_not":[{"exists":{"field":"firstname","boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},{"script":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXQCGXsKICAib3AiOiB7CiAgICAibmFtZSI6ICI9IiwKICAgICJraW5kIjogIkVRVUFMUyIsCiAgICAic3ludGF4IjogIkJJTkFSWSIKICB9LAogICJvcGVyYW5kcyI6IFsKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIkNIQVJfTEVOR1RIIiwKICAgICAgICAia2luZCI6ICJDSEFSX0xFTkdUSCIsCiAgICAgICAgInN5bnRheCI6ICJGVU5DVElPTiIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJkeW5hbWljUGFyYW0iOiAwLAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0sCiAgICB7CiAgICAgICJkeW5hbWljUGFyYW0iOiAxLAogICAgICAidHlwZSI6IHsKICAgICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAgICJudWxsYWJsZSI6IHRydWUKICAgICAgfQogICAgfQogIF0KfQ==\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp":1778662435626872557,"SOURCES":[0,2],"DIGESTS":["firstname.keyword",0]}},"boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},"_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) + diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty_or_others.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty_or_others.yaml index e7603f5643d..30c9c83813d 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty_or_others.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_isempty_or_others.yaml @@ -2,7 +2,8 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10]) - LogicalFilter(condition=[OR(=($4, 'M'), IS NULL($1), IS EMPTY($1))]) + LogicalFilter(condition=[OR(=($4, 'M'), IS NULL($1), =(CHAR_LENGTH($1), 0))]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | - CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], SCRIPT->OR(IS NULL($1), =($4, 'M'), IS EMPTY($1)), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"script":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXQEtnsKICAib3AiOiB7CiAgICAibmFtZSI6ICJPUiIsCiAgICAia2luZCI6ICJPUiIsCiAgICAic3ludGF4IjogIkJJTkFSWSIKICB9LAogICJvcGVyYW5kcyI6IFsKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIklTIE5VTEwiLAogICAgICAgICJraW5kIjogIklTX05VTEwiLAogICAgICAgICJzeW50YXgiOiAiUE9TVEZJWCIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJkeW5hbWljUGFyYW0iOiAwLAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0sCiAgICB7CiAgICAgICJvcCI6IHsKICAgICAgICAibmFtZSI6ICI9IiwKICAgICAgICAia2luZCI6ICJFUVVBTFMiLAogICAgICAgICJzeW50YXgiOiAiQklOQVJZIgogICAgICB9LAogICAgICAib3BlcmFuZHMiOiBbCiAgICAgICAgewogICAgICAgICAgImR5bmFtaWNQYXJhbSI6IDEsCiAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICAgICAgICJwcmVjaXNpb24iOiAtMQogICAgICAgICAgfQogICAgICAgIH0sCiAgICAgICAgewogICAgICAgICAgImR5bmFtaWNQYXJhbSI6IDIsCiAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICAgICAgICJwcmVjaXNpb24iOiAtMQogICAgICAgICAgfQogICAgICAgIH0KICAgICAgXQogICAgfSwKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIklTIEVNUFRZIiwKICAgICAgICAia2luZCI6ICJPVEhFUiIsCiAgICAgICAgInN5bnRheCI6ICJQT1NURklYIgogICAgICB9LAogICAgICAib3BlcmFuZHMiOiBbCiAgICAgICAgewogICAgICAgICAgImR5bmFtaWNQYXJhbSI6IDMsCiAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICAgICAgICJwcmVjaXNpb24iOiAtMQogICAgICAgICAgfQogICAgICAgIH0KICAgICAgXQogICAgfQogIF0KfQ==\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp": 0,"SOURCES":[0,0,2,0],"DIGESTS":["firstname.keyword","gender.keyword","M","firstname.keyword"]}},"boost":1.0}},"_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], SCRIPT->OR(IS NULL($1), =($4, 'M'), =(CHAR_LENGTH($1), 0)), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"bool":{"should":[{"bool":{"must_not":[{"exists":{"field":"firstname","boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},{"term":{"gender.keyword":{"value":"M","boost":1.0}}},{"script":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXQCGXsKICAib3AiOiB7CiAgICAibmFtZSI6ICI9IiwKICAgICJraW5kIjogIkVRVUFMUyIsCiAgICAic3ludGF4IjogIkJJTkFSWSIKICB9LAogICJvcGVyYW5kcyI6IFsKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIkNIQVJfTEVOR1RIIiwKICAgICAgICAia2luZCI6ICJDSEFSX0xFTkdUSCIsCiAgICAgICAgInN5bnRheCI6ICJGVU5DVElPTiIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJkeW5hbWljUGFyYW0iOiAwLAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0sCiAgICB7CiAgICAgICJkeW5hbWljUGFyYW0iOiAxLAogICAgICAidHlwZSI6IHsKICAgICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAgICJudWxsYWJsZSI6IHRydWUKICAgICAgfQogICAgfQogIF0KfQ==\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp":1778662435212192100,"SOURCES":[0,2],"DIGESTS":["firstname.keyword",0]}},"boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},"_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) + diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isblank.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isblank.yaml index 887fd96408b..6cc557f2a45 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isblank.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isblank.yaml @@ -2,9 +2,9 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10]) - LogicalFilter(condition=[OR(IS NULL($1), IS EMPTY(TRIM(FLAG(BOTH), ' ', $1)))]) + LogicalFilter(condition=[OR(IS NULL($1), =(CHAR_LENGTH(TRIM(FLAG(BOTH), ' ', $1)), 0))]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | EnumerableLimit(fetch=[10000]) - EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NULL($t1)], expr#18=[FLAG(BOTH)], expr#19=[' '], expr#20=[TRIM($t18, $t19, $t1)], expr#21=[IS EMPTY($t20)], expr#22=[OR($t17, $t21)], proj#0..10=[{exprs}], $condition=[$t22]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NULL($t1)], expr#18=[FLAG(BOTH)], expr#19=[' '], expr#20=[TRIM($t18, $t19, $t1)], expr#21=[CHAR_LENGTH($t20)], expr#22=[0], expr#23=[=($t21, $t22)], expr#24=[OR($t17, $t23)], proj#0..10=[{exprs}], $condition=[$t24]) CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty.yaml index 6115f98e23f..5d9c34e366b 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty.yaml @@ -2,9 +2,9 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10]) - LogicalFilter(condition=[OR(IS NULL($1), IS EMPTY($1))]) + LogicalFilter(condition=[OR(IS NULL($1), =(CHAR_LENGTH($1), 0))]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | EnumerableLimit(fetch=[10000]) - EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NULL($t1)], expr#18=[IS EMPTY($t1)], expr#19=[OR($t17, $t18)], proj#0..10=[{exprs}], $condition=[$t19]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NULL($t1)], expr#18=[CHAR_LENGTH($t1)], expr#19=[0], expr#20=[=($t18, $t19)], expr#21=[OR($t17, $t20)], proj#0..10=[{exprs}], $condition=[$t21]) CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty_or_others.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty_or_others.yaml index 7f43f48dc57..b67f50610d9 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty_or_others.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_isempty_or_others.yaml @@ -2,9 +2,9 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10]) - LogicalFilter(condition=[OR(=($4, 'M'), IS NULL($1), IS EMPTY($1))]) + LogicalFilter(condition=[OR(=($4, 'M'), IS NULL($1), =(CHAR_LENGTH($1), 0))]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | EnumerableLimit(fetch=[10000]) - EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NULL($t1)], expr#18=['M'], expr#19=[=($t4, $t18)], expr#20=[IS EMPTY($t1)], expr#21=[OR($t17, $t19, $t20)], proj#0..10=[{exprs}], $condition=[$t21]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NULL($t1)], expr#18=['M'], expr#19=[=($t4, $t18)], expr#20=[CHAR_LENGTH($t1)], expr#21=[0], expr#22=[=($t20, $t21)], expr#23=[OR($t17, $t19, $t22)], proj#0..10=[{exprs}], $condition=[$t23]) CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java index cc13c3a4a7a..6389fb28395 100644 --- a/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java @@ -811,24 +811,7 @@ private static QueryExpression constructQueryExpressionForSearch( } } - private boolean containIsEmptyFunction(RexCall call) { - return call.getKind() == SqlKind.OR - && call.getOperands().stream().anyMatch(o -> o.getKind() == SqlKind.IS_NULL) - && call.getOperands().stream() - .anyMatch( - o -> - o.getKind() == SqlKind.OTHER - && ((RexCall) o).getOperator().equals(SqlStdOperatorTable.IS_EMPTY)); - } - private QueryExpression andOr(RexCall call) { - // For function isEmpty and isBlank, we implement them via expression `isNull or {@function}`, - // Unlike `OR` in Java, `SHOULD` in DSL will evaluate both branches and lead to NPE. - if (containIsEmptyFunction(call)) { - throw new PredicateAnalyzerException( - "DSL will evaluate both branches of OR with isNUll, prevent push-down to avoid NPE"); - } - QueryExpression[] expressions = new QueryExpression[call.getOperands().size()]; PredicateAnalyzerException firstError = null; boolean partial = false; diff --git a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java index 733c2de5213..432823276e0 100644 --- a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java +++ b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java @@ -838,7 +838,9 @@ void isNullOr_ScriptPushDown() throws ExpressionNotAnalyzableException { .add("a", builder.getTypeFactory().createSqlType(SqlTypeName.BIGINT)) .add("b", builder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR)) .build(); - // PPL IS_EMPTY is translated to OR(IS_NULL(arg), IS_EMPTY(arg)) + // PPL isempty(x) is translated to OR(IS_NULL(x), CHAR_LENGTH(x) = 0). Push-down + // falls back to script_query because CHAR_LENGTH has no DSL bool-query equivalent, + // so the OR is unanalyzable as a whole and lands in compounded_script. RexNode call = PPLFuncImpTable.INSTANCE.resolve(builder, BuiltinFunctionName.IS_EMPTY, field2); Hook.CURRENT_TIME.addThread((Consumer>) h -> h.set(0L)); QueryExpression expression = From 7177ad63620aff66db44aaf617c44de06018eb82 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Thu, 14 May 2026 02:36:17 +0000 Subject: [PATCH 2/3] [PPL] Strengthen isempty pushdown test to assert mixed-shape bool.should MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous test (isNullOr_ScriptPushDown) only asserted that the builder string contained the opensearch_compounded_script lang marker. With the isempty -> OR(IS_NULL, CHAR_LENGTH=0) lowering, the IS_NULL arm now pushes down as a native bool.must_not.exists clause and only the CHAR_LENGTH=0 arm remains a script — so the substring assertion still passed without verifying the actual structure. Renamed and rewritten to assert: - Top-level builder is a BoolQueryBuilder with exactly two should arms and no must / top-level must_not clauses. - Arm 0 is a BoolQueryBuilder whose mustNot wraps an ExistsQueryBuilder (the IS_NULL lowering). - Arm 1 is a ScriptQueryBuilder using the opensearch_compounded_script lang (the CHAR_LENGTH=0 lowering). Also documents inline why the prior containIsEmptyFunction NPE-prevention detector is no longer needed: CHAR_LENGTH(null) follows Calcite's STRICT null policy and returns null instead of NPE, so DSL's non-short-circuiting should evaluation is safe. Signed-off-by: Songkan Tang --- .../request/PredicateAnalyzerTest.java | 56 ++++++++++++++++--- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java index 432823276e0..b47b0acf927 100644 --- a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java +++ b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java @@ -7,6 +7,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.spy; import com.google.common.collect.ImmutableList; @@ -829,7 +830,8 @@ void isTrue_predicate() throws ExpressionNotAnalyzableException { } @Test - void isNullOr_ScriptPushDown() throws ExpressionNotAnalyzableException { + void isEmpty_pushesIsNullArmAsExistsAndCharLengthArmAsScript() + throws ExpressionNotAnalyzableException { final RelDataType rowType = builder .getTypeFactory() @@ -838,17 +840,55 @@ void isNullOr_ScriptPushDown() throws ExpressionNotAnalyzableException { .add("a", builder.getTypeFactory().createSqlType(SqlTypeName.BIGINT)) .add("b", builder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR)) .build(); - // PPL isempty(x) is translated to OR(IS_NULL(x), CHAR_LENGTH(x) = 0). Push-down - // falls back to script_query because CHAR_LENGTH has no DSL bool-query equivalent, - // so the OR is unanalyzable as a whole and lands in compounded_script. + // PPL isempty(x) lowers to OR(IS_NULL(x), CHAR_LENGTH(x) = 0) (see PPLFuncImpTable). + // The IS_NULL arm has a native DSL form (bool.must_not.exists); the CHAR_LENGTH arm + // has no DSL equivalent and falls back to opensearch_compounded_script. The analyzer + // emits a bool.should that mixes the two — not a single fully-script_query, which is + // strictly better for matching null docs since the IS_NULL arm avoids the script + // engine entirely. + // + // This shape is also why no special-case detector is needed in PredicateAnalyzer.andOr: + // CHAR_LENGTH(null) returns null (Calcite NullPolicy.STRICT) rather than NPE, so DSL's + // non-short-circuiting `should` evaluation is safe even when the field is null. Prior + // to the OR(IS_NULL, CHAR_LENGTH=0) lowering the right arm was IS_EMPTY which compiled + // to `name.isEmpty()` and would NPE on null operands — that is what containIsEmptyFunction + // used to guard against, and is no longer needed. RexNode call = PPLFuncImpTable.INSTANCE.resolve(builder, BuiltinFunctionName.IS_EMPTY, field2); Hook.CURRENT_TIME.addThread((Consumer>) h -> h.set(0L)); QueryExpression expression = PredicateAnalyzer.analyzeExpression(call, schema, fieldTypes, rowType, cluster); - assert (expression - .builder() - .toString() - .contains("\"lang\" : \"opensearch_compounded_script\"")); + + QueryBuilder builderResult = expression.builder(); + assertInstanceOf(BoolQueryBuilder.class, builderResult); + BoolQueryBuilder bool = (BoolQueryBuilder) builderResult; + assertEquals( + 2, + bool.should().size(), + "isempty pushes down as a bool.should mixing native IS_NULL and a script for CHAR_LENGTH=0"); + assertTrue(bool.must().isEmpty(), "must clauses are not used by isempty pushdown"); + assertTrue( + bool.mustNot().isEmpty(), + "must_not clauses at the top level are not used by isempty pushdown"); + + // Arm 1: IS_NULL($field) → bool.must_not.exists + QueryBuilder isNullArm = bool.should().get(0); + assertInstanceOf(BoolQueryBuilder.class, isNullArm); + BoolQueryBuilder isNullBool = (BoolQueryBuilder) isNullArm; + assertEquals(1, isNullBool.mustNot().size()); + assertInstanceOf( + ExistsQueryBuilder.class, + isNullBool.mustNot().get(0), + "IS_NULL arm must lower to bool.must_not.exists, not to a script"); + + // Arm 2: CHAR_LENGTH($field) = 0 → script (CHAR_LENGTH has no native DSL form) + QueryBuilder charLengthArm = bool.should().get(1); + assertInstanceOf( + ScriptQueryBuilder.class, + charLengthArm, + "CHAR_LENGTH=0 arm must lower to a script_query (no native DSL equivalent)"); + assertTrue( + charLengthArm.toString().contains("\"lang\" : \"opensearch_compounded_script\""), + "script arm uses the Calcite-RexNode-based opensearch_compounded_script lang"); } @Test From 217d4777266b04f8825770c3ec1bf3bf50ca1ca5 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Thu, 14 May 2026 02:52:19 +0000 Subject: [PATCH 3/3] Apply spotless line-wrap fix Signed-off-by: Songkan Tang --- .../sql/opensearch/request/PredicateAnalyzerTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java index b47b0acf927..3deb39d58c0 100644 --- a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java +++ b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java @@ -864,7 +864,8 @@ void isEmpty_pushesIsNullArmAsExistsAndCharLengthArmAsScript() assertEquals( 2, bool.should().size(), - "isempty pushes down as a bool.should mixing native IS_NULL and a script for CHAR_LENGTH=0"); + "isempty pushes down as a bool.should mixing native IS_NULL and a script for" + + " CHAR_LENGTH=0"); assertTrue(bool.must().isEmpty(), "must clauses are not used by isempty pushdown"); assertTrue( bool.mustNot().isEmpty(),