Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changelog/unreleased/matchset_scale-function.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
title: Add matchset_scale function — a matching-set-scoped variant of scale that avoids the full-index traversal for narrowly filtered queries
type: added
authors:
- name: Gaurav Jayswal
nick: gauravjayswal
12 changes: 12 additions & 0 deletions solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
import org.apache.solr.search.function.ConcatStringFunction;
import org.apache.solr.search.function.DualDoubleFunction;
import org.apache.solr.search.function.EqualFunction;
import org.apache.solr.search.function.MatchSetScaleFloatFunction;
import org.apache.solr.search.function.OrdFieldSource;
import org.apache.solr.search.function.ReverseOrdFieldSource;
import org.apache.solr.search.function.SolrComparisonBoolFunction;
Expand Down Expand Up @@ -262,6 +263,17 @@ public ValueSource parse(FunctionQParser fp) throws SyntaxError {
return new ScaleFloatFunction(source, min, max);
}
});
addParser(
"matchset_scale",
new ValueSourceParser() {
@Override
public ValueSource parse(FunctionQParser fp) throws SyntaxError {
ValueSource source = fp.parseValueSource();
float min = fp.parseFloat();
float max = fp.parseFloat();
return new MatchSetScaleFloatFunction(source, min, max);
}
});
addParser(
"div",
new ValueSourceParser() {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.function;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.docvalues.FloatDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;

/**
* Linearly scales {@code source} into {@code [targetMin, targetMax]} using the observed min/max of
* {@code source} over the <b>current request's matching DocSet</b>.
*
* <p>Differs from Lucene's {@code ScaleFloatFunction} in two ways:
*
* <ul>
* <li>Bounds are computed over only the request's matching set (intersection of {@code q} and all
* {@code fq}s), not every doc in every segment. For narrowly filtered queries this can be
* orders of magnitude faster.
* <li>Output is clamped to {@code [targetMin, targetMax]}.
* </ul>
*
* <p>Falls back to a full index scan when a Solr request context is not available — e.g. when
* invoked from Lucene-level tests or embedded tool usage.
*/
public class MatchSetScaleFloatFunction extends ValueSource {
protected final ValueSource source;
protected final float targetMin;
protected final float targetMax;

public MatchSetScaleFloatFunction(ValueSource source, float targetMin, float targetMax) {
this.source = source;
this.targetMin = targetMin;
this.targetMax = targetMax;
}

@Override
public String description() {
return "matchset_scale(" + source.description() + "," + targetMin + "," + targetMax + ")";
}

private static final class Bounds {
float min;
float max;
}

@Override
public void createWeight(Map<Object, Object> context, IndexSearcher searcher) throws IOException {
source.createWeight(context, searcher);
}

private Bounds computeBounds(Map<Object, Object> context, LeafReaderContext readerContext)
throws IOException {
float minVal = Float.POSITIVE_INFINITY;
float maxVal = Float.NEGATIVE_INFINITY;

List<LeafReaderContext> leaves = ReaderUtil.getTopLevelContext(readerContext).leaves();
DocSet matchSet = findMatchSet();

if (matchSet != null) {
for (LeafReaderContext leaf : leaves) {
DocIdSetIterator it = matchSet.iterator(leaf);
if (it == null) continue;
FunctionValues vals = source.getValues(context, leaf);
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
float v = vals.floatVal(doc);
if ((Float.floatToRawIntBits(v) & (0xff << 23)) == 0xff << 23) {
continue;
}
if (v < minVal) minVal = v;
if (v > maxVal) maxVal = v;
}
}
} else {
for (LeafReaderContext leaf : leaves) {
int maxDoc = leaf.reader().maxDoc();
FunctionValues vals = source.getValues(context, leaf);
for (int i = 0; i < maxDoc; i++) {
if (!vals.exists(i)) continue;
float v = vals.floatVal(i);
if ((Float.floatToRawIntBits(v) & (0xff << 23)) == 0xff << 23) {
continue;
}
if (v < minVal) minVal = v;
if (v > maxVal) maxVal = v;
}
}
}

if (minVal == Float.POSITIVE_INFINITY) {
minVal = maxVal = 0f;
}

Bounds b = new Bounds();
b.min = minVal;
b.max = maxVal;
context.put(MatchSetScaleFloatFunction.this, b);
return b;
}

// Guards against reentrant DocSet materialization when matchset_scale appears (directly or
// nested) inside the main query being materialized. Each recursive FunctionQuery creates a
// fresh ValueSource context, so the per-context Bounds cache cannot prevent recursion — the
// guard lives in the per-request context instead.
private static final String COMPUTE_GUARD_KEY = "matchset_scale.computing";

private DocSet findMatchSet() throws IOException {
SolrRequestInfo reqInfo = SolrRequestInfo.getRequestInfo();
if (reqInfo == null) return null;
ResponseBuilder rb = reqInfo.getResponseBuilder();
if (rb == null) return null;

if (rb.getResults() != null && rb.getResults().docSet != null) {
return rb.getResults().docSet;
}

SolrQueryRequest req = reqInfo.getReq();
if (req == null) return null;
Map<Object, Object> reqCtx = req.getContext();
if (reqCtx != null && reqCtx.containsKey(COMPUTE_GUARD_KEY)) {
// Reentrant call from inside our own DocSet materialization — fall back to a full scan.
return null;
}

SolrIndexSearcher sis = req.getSearcher();
if (sis == null) return null;
Query q = rb.getQuery();
if (q == null) return null;
List<Query> filters = rb.getFilters();

if (reqCtx != null) reqCtx.put(COMPUTE_GUARD_KEY, Boolean.TRUE);
try {
if (filters == null || filters.isEmpty()) {
return sis.getDocSet(q);
}
return sis.getDocSet(q, sis.getDocSet(filters));
} finally {
if (reqCtx != null) reqCtx.remove(COMPUTE_GUARD_KEY);
}
}

@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext)
throws IOException {
Bounds b = (Bounds) context.get(MatchSetScaleFloatFunction.this);
if (b == null) {
b = computeBounds(context, readerContext);
}

final float minObs = b.min;
final float maxObs = b.max;
final float outMin = targetMin;
final float outMax = targetMax;
final float obsRange = maxObs - minObs;
final float scale = (obsRange == 0f) ? 0f : (outMax - outMin) / obsRange;

final FunctionValues vals = source.getValues(context, readerContext);

return new FloatDocValues(this) {
@Override
public boolean exists(int doc) throws IOException {
return vals.exists(doc);
}

@Override
public float floatVal(int doc) throws IOException {
if (obsRange == 0f) {
return outMin;
}
float v = (vals.floatVal(doc) - minObs) * scale + outMin;
if (v < outMin) return outMin;
if (v > outMax) return outMax;
return v;
}

@Override
public String toString(int doc) throws IOException {
return "matchset_scale("
+ vals.toString(doc)
+ ",toMin="
+ outMin
+ ",toMax="
+ outMax
+ ",fromMin="
+ minObs
+ ",fromMax="
+ maxObs
+ ")";
}
};
}

@Override
public int hashCode() {
int h = Float.floatToIntBits(targetMin);
h = h * 29;
h += Float.floatToIntBits(targetMax);
h = h * 29;
h += source.hashCode();
return h ^ MatchSetScaleFloatFunction.class.hashCode();
}

@Override
public boolean equals(Object o) {
if (o == null || MatchSetScaleFloatFunction.class != o.getClass()) return false;
MatchSetScaleFloatFunction other = (MatchSetScaleFloatFunction) o;
return this.targetMin == other.targetMin
&& this.targetMax == other.targetMax
&& this.source.equals(other.source);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.function;

import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;

public class TestMatchSetScaleFloatFunction extends SolrTestCaseJ4 {

@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-functionquery.xml", "schema11.xml");
}

@Test
public void testLinearTransform_globalBounds() throws Exception {
clearIndex();
assertU(adoc("id", "1", "foo_f", "0"));
assertU(adoc("id", "2", "foo_f", "10"));
assertU(adoc("id", "3", "foo_f", "20"));
assertU(adoc("id", "4", "foo_f", "30"));
assertU(adoc("id", "5", "foo_f", "40"));
assertU(commit());

// No filter — bounds are min=0, max=40 across matching set (= all docs).
// matchset_scale(v, 0, 1) = (v - 0) / 40
assertQ(
req("q", "{!func}matchset_scale(foo_f,0,1)", "fl", "id,score", "rows", "10"),
"//doc[./str[@name='id']='1']/float[@name='score'][.='0.0']",
"//doc[./str[@name='id']='3']/float[@name='score'][.='0.5']",
"//doc[./str[@name='id']='5']/float[@name='score'][.='1.0']");
}

@Test
public void testBoundsScopedToMatchingSet() throws Exception {
clearIndex();
// Broad value range across index, but filter will restrict.
assertU(adoc("id", "1", "foo_f", "0", "cat_s", "A"));
assertU(adoc("id", "2", "foo_f", "100", "cat_s", "A"));
assertU(adoc("id", "3", "foo_f", "200", "cat_s", "A"));
assertU(adoc("id", "4", "foo_f", "1000", "cat_s", "B"));
assertU(adoc("id", "5", "foo_f", "2000", "cat_s", "B"));
assertU(commit());

// Scoped to cat_s:A → matching set values {0, 100, 200}.
// matchset_scale(v, 0, 10) = (v - 0) * 10 / 200.
// Critical: if bounds were global (0..2000), score for id=3 would be 1.0, not 10.0.
assertQ(
req(
"q",
"{!func}matchset_scale(foo_f,0,10)",
"fq",
"cat_s:A",
"fl",
"id,score",
"rows",
"10"),
"//doc[./str[@name='id']='1']/float[@name='score'][.='0.0']",
"//doc[./str[@name='id']='2']/float[@name='score'][.='5.0']",
"//doc[./str[@name='id']='3']/float[@name='score'][.='10.0']");

// And scoped to cat_s:B → matching set {1000, 2000}.
// matchset_scale(v, 0, 10) = (v - 1000) * 10 / 1000.
assertQ(
req(
"q",
"{!func}matchset_scale(foo_f,0,10)",
"fq",
"cat_s:B",
"fl",
"id,score",
"rows",
"10"),
"//doc[./str[@name='id']='4']/float[@name='score'][.='0.0']",
"//doc[./str[@name='id']='5']/float[@name='score'][.='10.0']");
}

@Test
public void testDivideByZeroGuard_allEqualValues() throws Exception {
clearIndex();
assertU(adoc("id", "1", "foo_f", "42"));
assertU(adoc("id", "2", "foo_f", "42"));
assertU(adoc("id", "3", "foo_f", "42"));
assertU(commit());

// min == max → avoid NaN/Inf; every matching doc gets targetMin.
assertQ(
req("q", "{!func}matchset_scale(foo_f,7,99)", "fl", "id,score", "rows", "10"),
"//doc[./str[@name='id']='1']/float[@name='score'][.='7.0']",
"//doc[./str[@name='id']='2']/float[@name='score'][.='7.0']",
"//doc[./str[@name='id']='3']/float[@name='score'][.='7.0']");
}

@Test
public void testCustomTargetRange() throws Exception {
clearIndex();
assertU(adoc("id", "1", "foo_f", "10"));
assertU(adoc("id", "2", "foo_f", "20"));
assertU(adoc("id", "3", "foo_f", "30"));
assertU(commit());

// Bounds: min=10, max=30. Target: [2, 8].
// v=10 → 2, v=20 → 5, v=30 → 8.
// Note: targetMin must be >= 0 when using matchset_scale as a top-level q, because
// Lucene clamps negative query scores to 0. This is a Lucene constraint, not a
// matchset_scale constraint — in a nested expression (fl / boost), negative outputs
// pass through fine.
assertQ(
req("q", "{!func}matchset_scale(foo_f,2,8)", "fl", "id,score", "rows", "10"),
"//doc[./str[@name='id']='1']/float[@name='score'][.='2.0']",
"//doc[./str[@name='id']='2']/float[@name='score'][.='5.0']",
"//doc[./str[@name='id']='3']/float[@name='score'][.='8.0']");
}
}
Loading