From a4f1f248fdaa5e0590c70f3027314458e59ce2ac Mon Sep 17 00:00:00 2001
From: Gaurav Jayswal
Date: Mon, 20 Apr 2026 15:01:19 +0530
Subject: [PATCH] =?UTF-8?q?Add=20matchset=5Fscale=20function=20=E2=80=94?=
 =?UTF-8?q?=20matching-set-scoped=20scale=20for=20filtered=20queries?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces matchset_scale(source, min, max) as a new function query parser.
Semantically similar to Lucene's scale(...), but computes the observed
min/max over the current request's matching DocSet (intersection of q and
all fqs) rather than every doc in every segment. For narrowly filtered
queries this reduces the bounds computation from O(N) to O(M) where M is
the matching set size.

- Adds MatchSetScaleFloatFunction in solr-core
- Registers matchset_scale in ValueSourceParser
- Adds unit tests covering matching-set scope, divide-by-zero guard,
  global bounds, custom target range, missing values, and inverted
  target range
- Adds ref guide documentation
- Adds changelog fragment
---
 .../unreleased/matchset_scale-function.yml    |   5 +
 .../apache/solr/search/ValueSourceParser.java |  12 +
 .../function/MatchSetScaleFloatFunction.java  | 269 ++++++++++++++++++
 .../TestMatchSetScaleFloatFunction.java       | 163 +++++++++++
 .../query-guide/pages/function-queries.adoc   |  15 +
 5 files changed, 464 insertions(+)
 create mode 100644 changelog/unreleased/matchset_scale-function.yml
 create mode 100644 solr/core/src/java/org/apache/solr/search/function/MatchSetScaleFloatFunction.java
 create mode 100644 solr/core/src/test/org/apache/solr/search/function/TestMatchSetScaleFloatFunction.java

diff --git a/changelog/unreleased/matchset_scale-function.yml b/changelog/unreleased/matchset_scale-function.yml
new file mode 100644
index 000000000000..1e1674daae22
--- /dev/null
+++ b/changelog/unreleased/matchset_scale-function.yml
@@ -0,0 +1,5 @@
+title: Add matchset_scale function — a matching-set-scoped variant of scale that avoids the full-index traversal for narrowly filtered queries
+type: added
+authors:
+  - name: Gaurav Jayswal
+    nick: gauravjayswal
diff --git a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
index 04e17b78d8e5..67aaae4632c9 100644
--- a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
+++ b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
@@ -105,6 +105,7 @@
 import org.apache.solr.search.function.ConcatStringFunction;
 import org.apache.solr.search.function.DualDoubleFunction;
 import org.apache.solr.search.function.EqualFunction;
+import org.apache.solr.search.function.MatchSetScaleFloatFunction;
 import org.apache.solr.search.function.OrdFieldSource;
 import org.apache.solr.search.function.ReverseOrdFieldSource;
 import org.apache.solr.search.function.SolrComparisonBoolFunction;
@@ -262,6 +263,17 @@ public ValueSource parse(FunctionQParser fp) throws SyntaxError {
             return new ScaleFloatFunction(source, min, max);
           }
         });
+    addParser(
+        "matchset_scale",
+        new ValueSourceParser() {
+          @Override
+          public ValueSource parse(FunctionQParser fp) throws SyntaxError {
+            ValueSource source = fp.parseValueSource();
+            float min = fp.parseFloat();
+            float max = fp.parseFloat();
+            return new MatchSetScaleFloatFunction(source, min, max);
+          }
+        });
     addParser(
         "div",
         new ValueSourceParser() {
diff --git a/solr/core/src/java/org/apache/solr/search/function/MatchSetScaleFloatFunction.java b/solr/core/src/java/org/apache/solr/search/function/MatchSetScaleFloatFunction.java
new file mode 100644
index 000000000000..7b3fc8fed983
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/function/MatchSetScaleFloatFunction.java
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.function;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.queries.function.FunctionValues;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.queries.function.docvalues.FloatDocValues;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.solr.handler.component.ResponseBuilder;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.request.SolrRequestInfo;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.SolrIndexSearcher;
+
+/**
+ * Linearly scales {@code source} into {@code [targetMin, targetMax]} using the observed min/max of
+ * {@code source} over the current request's matching DocSet.
+ *
+ * <p>Differs from Lucene's {@code ScaleFloatFunction} in two ways:
+ *
+ * <ul>
+ *   <li>Scope of bounds: the observed min/max are computed over the documents matching the
+ *       current request (the intersection of {@code q} and all {@code fq}s) rather than over
+ *       every document in every segment.
+ *   <li>Clamping: output values are clamped to the target range, and when every matching
+ *       document shares the same value the result is {@code targetMin} rather than {@code NaN}
+ *       or {@code Infinity}.
+ * </ul>
+ *
+ * <p>Falls back to a full index scan when a Solr request context is not available — e.g. when
+ * invoked from Lucene-level tests or embedded tool usage.
+ */
+public class MatchSetScaleFloatFunction extends ValueSource {
+  protected final ValueSource source;
+  protected final float targetMin;
+  protected final float targetMax;
+
+  /**
+   * Creates a scaler for {@code source}.
+   *
+   * @param source function whose values are scaled
+   * @param targetMin output assigned to the smallest observed value of {@code source}
+   * @param targetMax output assigned to the largest observed value of {@code source}
+   */
+  public MatchSetScaleFloatFunction(ValueSource source, float targetMin, float targetMax) {
+    this.source = source;
+    this.targetMin = targetMin;
+    this.targetMax = targetMax;
+  }
+
+  @Override
+  public String description() {
+    return "matchset_scale(" + source.description() + "," + targetMin + "," + targetMax + ")";
+  }
+
+  /** Immutable observed value range of {@code source}, cached in the ValueSource context. */
+  private static final class Bounds {
+    final float min;
+    final float max;
+
+    Bounds(float min, float max) {
+      this.min = min;
+      this.max = max;
+    }
+  }
+
+  @Override
+  public void createWeight(Map<Object, Object> context, IndexSearcher searcher) throws IOException {
+    source.createWeight(context, searcher);
+  }
+
+  /**
+   * Computes the finite min/max of {@code source} over the request's matching DocSet, or over
+   * every document of the top-level reader when {@link #findMatchSet()} yields none, and caches
+   * the result in {@code context}.
+   *
+   * <p>Documents without a value and non-finite values (NaN, +/-Infinity) are skipped. When
+   * nothing contributes, both bounds are 0.
+   */
+  private Bounds computeBounds(Map<Object, Object> context, LeafReaderContext readerContext)
+      throws IOException {
+    List<LeafReaderContext> leaves = ReaderUtil.getTopLevelContext(readerContext).leaves();
+    DocSet matchSet = findMatchSet();
+
+    float minVal = Float.POSITIVE_INFINITY;
+    float maxVal = Float.NEGATIVE_INFINITY;
+
+    for (LeafReaderContext leaf : leaves) {
+      DocIdSetIterator it;
+      if (matchSet != null) {
+        it = matchSet.iterator(leaf);
+        if (it == null) {
+          continue; // the iterator is null for this segment: nothing to visit
+        }
+      } else {
+        // No matching set available: visit every doc id of the segment.
+        it = DocIdSetIterator.all(leaf.reader().maxDoc());
+      }
+      FunctionValues vals = source.getValues(context, leaf);
+      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+        // Checked on both paths so that a document without a value cannot contribute whatever
+        // floatVal() reports for it to the bounds.
+        if (!vals.exists(doc)) {
+          continue;
+        }
+        float v = vals.floatVal(doc);
+        if (!Float.isFinite(v)) {
+          continue;
+        }
+        minVal = Math.min(minVal, v);
+        maxVal = Math.max(maxVal, v);
+      }
+    }
+
+    if (minVal == Float.POSITIVE_INFINITY) {
+      minVal = maxVal = 0f;
+    }
+
+    Bounds b = new Bounds(minVal, maxVal);
+    context.put(this, b);
+    return b;
+  }
+
+  // Guards against reentrant DocSet materialization when matchset_scale appears (directly or
+  // nested) inside the main query being materialized. Each recursive FunctionQuery creates a
+  // fresh ValueSource context, so the per-context Bounds cache cannot prevent recursion — the
+  // guard lives in the per-request context instead.
+  private static final String COMPUTE_GUARD_KEY = "matchset_scale.computing";
+
+  /**
+   * Resolves the DocSet matched by the current request: the main query intersected with all
+   * filters.
+   *
+   * @return the matching DocSet, or {@code null} when no usable request context exists or when
+   *     called reentrantly while that DocSet is itself being materialized
+   */
+  private DocSet findMatchSet() throws IOException {
+    SolrRequestInfo reqInfo = SolrRequestInfo.getRequestInfo();
+    if (reqInfo == null) return null;
+    ResponseBuilder rb = reqInfo.getResponseBuilder();
+    if (rb == null) return null;
+
+    // Prefer a DocSet that is already attached to the response builder's results.
+    if (rb.getResults() != null && rb.getResults().docSet != null) {
+      return rb.getResults().docSet;
+    }
+
+    SolrQueryRequest req = reqInfo.getReq();
+    if (req == null) return null;
+    Map<Object, Object> reqCtx = req.getContext();
+    if (reqCtx != null && reqCtx.containsKey(COMPUTE_GUARD_KEY)) {
+      // Reentrant call from inside our own DocSet materialization — fall back to a full scan.
+      return null;
+    }
+
+    SolrIndexSearcher sis = req.getSearcher();
+    if (sis == null) return null;
+    Query q = rb.getQuery();
+    if (q == null) return null;
+    List<Query> filters = rb.getFilters();
+
+    if (reqCtx != null) reqCtx.put(COMPUTE_GUARD_KEY, Boolean.TRUE);
+    try {
+      if (filters == null || filters.isEmpty()) {
+        return sis.getDocSet(q);
+      }
+      return sis.getDocSet(q, sis.getDocSet(filters));
+    } finally {
+      if (reqCtx != null) reqCtx.remove(COMPUTE_GUARD_KEY);
+    }
+  }
+
+  @Override
+  public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext)
+      throws IOException {
+    Bounds b = (Bounds) context.get(this);
+    if (b == null) {
+      b = computeBounds(context, readerContext);
+    }
+
+    final float minObs = b.min;
+    final float maxObs = b.max;
+    final float outMin = targetMin;
+    final float outMax = targetMax;
+    final float obsRange = maxObs - minObs;
+    final float scale = (obsRange == 0f) ? 0f : (outMax - outMin) / obsRange;
+    // Clamp against the ordered target range so that an inverted range (targetMin > targetMax)
+    // still scales instead of collapsing every value to targetMin.
+    final float clampLo = Math.min(outMin, outMax);
+    final float clampHi = Math.max(outMin, outMax);
+
+    final FunctionValues vals = source.getValues(context, readerContext);
+
+    return new FloatDocValues(this) {
+      @Override
+      public boolean exists(int doc) throws IOException {
+        return vals.exists(doc);
+      }
+
+      @Override
+      public float floatVal(int doc) throws IOException {
+        if (obsRange == 0f) {
+          return outMin;
+        }
+        float v = (vals.floatVal(doc) - minObs) * scale + outMin;
+        if (v < clampLo) return clampLo;
+        if (v > clampHi) return clampHi;
+        return v;
+      }
+
+      @Override
+      public String toString(int doc) throws IOException {
+        return "matchset_scale("
+            + vals.toString(doc)
+            + ",toMin="
+            + outMin
+            + ",toMax="
+            + outMax
+            + ",fromMin="
+            + minObs
+            + ",fromMax="
+            + maxObs
+            + ")";
+      }
+    };
+  }
+
+  @Override
+  public int hashCode() {
+    int h = Float.floatToIntBits(targetMin);
+    h = h * 29;
+    h += Float.floatToIntBits(targetMax);
+    h = h * 29;
+    h += source.hashCode();
+    return h ^ MatchSetScaleFloatFunction.class.hashCode();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (o == null || MatchSetScaleFloatFunction.class != o.getClass()) return false;
+    MatchSetScaleFloatFunction other = (MatchSetScaleFloatFunction) o;
+    // Compare the same bit patterns hashCode() hashes, so equal objects always share a hash
+    // (0.0f vs -0.0f) and a NaN target still equals itself.
+    return Float.floatToIntBits(this.targetMin) == Float.floatToIntBits(other.targetMin)
+        && Float.floatToIntBits(this.targetMax) == Float.floatToIntBits(other.targetMax)
+        && this.source.equals(other.source);
+  }
+}
diff --git a/solr/core/src/test/org/apache/solr/search/function/TestMatchSetScaleFloatFunction.java b/solr/core/src/test/org/apache/solr/search/function/TestMatchSetScaleFloatFunction.java
new file mode 100644
index 000000000000..0aac9ba8be02
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/search/function/TestMatchSetScaleFloatFunction.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.function;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestMatchSetScaleFloatFunction extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-functionquery.xml", "schema11.xml");
+  }
+
+  @Test
+  public void testLinearTransform_globalBounds() throws Exception {
+    clearIndex();
+    assertU(adoc("id", "1", "foo_f", "0"));
+    assertU(adoc("id", "2", "foo_f", "10"));
+    assertU(adoc("id", "3", "foo_f", "20"));
+    assertU(adoc("id", "4", "foo_f", "30"));
+    assertU(adoc("id", "5", "foo_f", "40"));
+    assertU(commit());
+
+    // No filter — bounds are min=0, max=40 across matching set (= all docs).
+    // matchset_scale(v, 0, 1) = (v - 0) / 40
+    assertQ(
+        req("q", "{!func}matchset_scale(foo_f,0,1)", "fl", "id,score", "rows", "10"),
+        "//doc[./str[@name='id']='1']/float[@name='score'][.='0.0']",
+        "//doc[./str[@name='id']='3']/float[@name='score'][.='0.5']",
+        "//doc[./str[@name='id']='5']/float[@name='score'][.='1.0']");
+  }
+
+  @Test
+  public void testBoundsScopedToMatchingSet() throws Exception {
+    clearIndex();
+    // Broad value range across index, but filter will restrict.
+    assertU(adoc("id", "1", "foo_f", "0", "cat_s", "A"));
+    assertU(adoc("id", "2", "foo_f", "100", "cat_s", "A"));
+    assertU(adoc("id", "3", "foo_f", "200", "cat_s", "A"));
+    assertU(adoc("id", "4", "foo_f", "1000", "cat_s", "B"));
+    assertU(adoc("id", "5", "foo_f", "2000", "cat_s", "B"));
+    assertU(commit());
+
+    // Scoped to cat_s:A → matching set values {0, 100, 200}.
+    // matchset_scale(v, 0, 10) = (v - 0) * 10 / 200.
+    // Critical: if bounds were global (0..2000), score for id=3 would be 1.0, not 10.0.
+    assertQ(
+        req(
+            "q",
+            "{!func}matchset_scale(foo_f,0,10)",
+            "fq",
+            "cat_s:A",
+            "fl",
+            "id,score",
+            "rows",
+            "10"),
+        "//doc[./str[@name='id']='1']/float[@name='score'][.='0.0']",
+        "//doc[./str[@name='id']='2']/float[@name='score'][.='5.0']",
+        "//doc[./str[@name='id']='3']/float[@name='score'][.='10.0']");
+
+    // And scoped to cat_s:B → matching set {1000, 2000}.
+    // matchset_scale(v, 0, 10) = (v - 1000) * 10 / 1000.
+    assertQ(
+        req(
+            "q",
+            "{!func}matchset_scale(foo_f,0,10)",
+            "fq",
+            "cat_s:B",
+            "fl",
+            "id,score",
+            "rows",
+            "10"),
+        "//doc[./str[@name='id']='4']/float[@name='score'][.='0.0']",
+        "//doc[./str[@name='id']='5']/float[@name='score'][.='10.0']");
+  }
+
+  @Test
+  public void testDivideByZeroGuard_allEqualValues() throws Exception {
+    clearIndex();
+    assertU(adoc("id", "1", "foo_f", "42"));
+    assertU(adoc("id", "2", "foo_f", "42"));
+    assertU(adoc("id", "3", "foo_f", "42"));
+    assertU(commit());
+
+    // min == max → avoid NaN/Inf; every matching doc gets targetMin.
+    assertQ(
+        req("q", "{!func}matchset_scale(foo_f,7,99)", "fl", "id,score", "rows", "10"),
+        "//doc[./str[@name='id']='1']/float[@name='score'][.='7.0']",
+        "//doc[./str[@name='id']='2']/float[@name='score'][.='7.0']",
+        "//doc[./str[@name='id']='3']/float[@name='score'][.='7.0']");
+  }
+
+  @Test
+  public void testCustomTargetRange() throws Exception {
+    clearIndex();
+    assertU(adoc("id", "1", "foo_f", "10"));
+    assertU(adoc("id", "2", "foo_f", "20"));
+    assertU(adoc("id", "3", "foo_f", "30"));
+    assertU(commit());
+
+    // Bounds: min=10, max=30. Target: [2, 8].
+    // v=10 → 2, v=20 → 5, v=30 → 8.
+    // Note: targetMin must be >= 0 when using matchset_scale as a top-level q, because
+    // Lucene clamps negative query scores to 0. This is a Lucene constraint, not a
+    // matchset_scale constraint — in a nested expression (fl / boost), negative outputs
+    // pass through fine.
+    assertQ(
+        req("q", "{!func}matchset_scale(foo_f,2,8)", "fl", "id,score", "rows", "10"),
+        "//doc[./str[@name='id']='1']/float[@name='score'][.='2.0']",
+        "//doc[./str[@name='id']='2']/float[@name='score'][.='5.0']",
+        "//doc[./str[@name='id']='3']/float[@name='score'][.='8.0']");
+  }
+
+  @Test
+  public void testMissingValuesIgnoredInBounds() throws Exception {
+    clearIndex();
+    assertU(adoc("id", "1", "foo_f", "10"));
+    assertU(adoc("id", "2", "foo_f", "20"));
+    assertU(adoc("id", "3", "foo_f", "30"));
+    assertU(adoc("id", "4")); // matches the query but has no foo_f value
+    assertU(commit());
+
+    // Bounds come from docs that have a value: min=10, max=30 — not min=0 from the doc without
+    // one. matchset_scale(v, 0, 20) = (v - 10) * 20 / 20.
+    assertQ(
+        req("q", "{!func}matchset_scale(foo_f,0,20)", "fl", "id,score", "rows", "10"),
+        "//doc[./str[@name='id']='1']/float[@name='score'][.='0.0']",
+        "//doc[./str[@name='id']='2']/float[@name='score'][.='10.0']",
+        "//doc[./str[@name='id']='3']/float[@name='score'][.='20.0']");
+  }
+
+  @Test
+  public void testInvertedTargetRange() throws Exception {
+    clearIndex();
+    assertU(adoc("id", "1", "foo_f", "10"));
+    assertU(adoc("id", "2", "foo_f", "20"));
+    assertU(adoc("id", "3", "foo_f", "30"));
+    assertU(commit());
+
+    // targetMin > targetMax reverses the mapping: v=10 → 10, v=20 → 5, v=30 → 0.
+    assertQ(
+        req("q", "{!func}matchset_scale(foo_f,10,0)", "fl", "id,score", "rows", "10"),
+        "//doc[./str[@name='id']='1']/float[@name='score'][.='10.0']",
+        "//doc[./str[@name='id']='2']/float[@name='score'][.='5.0']",
+        "//doc[./str[@name='id']='3']/float[@name='score'][.='0.0']");
+  }
+}
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc b/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc
index 7244db1428ff..784e8add7ba1 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc
@@ -454,6 +454,21 @@ In these cases, an appropriate `map()` function could be used as a workaround to
 * `scale(x, minTarget, maxTarget)`
 * `scale(x,1,2)`: scales the values of x such that all values will be between 1 and 2 inclusive.
 
+=== matchset_scale Function
+Linearly scales values of the function `x` into `[minTarget, maxTarget]`, using the observed min and max of `x` computed over the *current request's matching DocSet* (the intersection of `q` and all `fq`s), rather than over every document in every segment.
+
+This is semantically similar to `scale`, but differs in two ways:
+
+* *Scope of bounds*: `scale` traverses every document in every segment; `matchset_scale` traverses only documents matching the current request. For narrowly filtered queries (typical for end-user searches) this can be orders of magnitude faster, since the matching set is often a small fraction of the full index.
+* *Clamping*: `matchset_scale` clamps output values to `[minTarget, maxTarget]`. If the denominator is zero (i.e. all matching documents share the same value), the function returns `minTarget` rather than producing `NaN` or `Infinity`.
+
+When no Solr request context is available (for example, during Lucene-level tests), `matchset_scale` falls back to a full-index scan, so its bounds are computed over the same documents as `scale`; the clamping described above still applies.
+
+*Syntax Examples*
+
+* `matchset_scale(x, minTarget, maxTarget)`
+* `matchset_scale(popularity, 0, 1)`: scales `popularity` into `[0, 1]` using the min and max observed among matching documents.
+
 === sqedist Function
 The Square Euclidean distance calculates the 2-norm (Euclidean distance) but does not take the square root, thus saving a fairly expensive operation.
 It is often the case that applications that care about Euclidean distance do not need the actual distance, but instead can use the square of the distance.