From 9a1702a8f5b5496893a99c4e1f39cd58520786ae Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Wed, 15 Feb 2017 07:13:28 +0530 Subject: [PATCH] SOLR-8029: Reverting the previous commit and the merge --- lucene/CHANGES.txt | 10 + .../pattern/SimplePatternSplitTokenizer.java | 258 +++++++ .../SimplePatternSplitTokenizerFactory.java | 76 +++ .../pattern/SimplePatternTokenizer.java | 242 +++++++ .../SimplePatternTokenizerFactory.java | 76 +++ ...ache.lucene.analysis.util.TokenizerFactory | 2 + .../analysis/core/TestRandomChains.java | 10 +- .../TestSimplePatternSplitTokenizer.java | 273 ++++++++ .../pattern/TestSimplePatternTokenizer.java | 218 ++++++ lucene/common-build.xml | 2 +- .../apache/lucene/analysis/package-info.java | 12 +- .../util/automaton/ByteRunAutomaton.java | 4 +- .../util/automaton/CharacterRunAutomaton.java | 2 +- .../lucene/util/automaton/Operations.java | 44 +- .../lucene/util/automaton/RunAutomaton.java | 126 ++-- lucene/demo/ivy.xml | 2 +- .../lucene/search/TermAutomatonScorer.java | 2 +- lucene/test-framework/ivy.xml | 2 +- .../index/BasePointsFormatTestCase.java | 41 +- lucene/tools/ivy.xml | 2 +- .../GetMavenDependenciesTask.java | 58 +- solr/CHANGES.txt | 21 +- solr/contrib/extraction/ivy.xml | 2 +- .../java/org/apache/solr/api/V2HttpCall.java | 2 +- .../java/org/apache/solr/cloud/Overseer.java | 2 +- .../component/RealTimeGetComponent.java | 7 +- .../component/SortedNumericStatsValues.java | 106 +++ .../solr/handler/component/StatsField.java | 2 +- .../handler/component/StatsValuesFactory.java | 7 +- .../apache/solr/request/IntervalFacets.java | 77 ++- .../apache/solr/request/NumericFacets.java | 173 ++++- .../org/apache/solr/request/SimpleFacets.java | 8 +- .../apache/solr/schema/DoublePointField.java | 13 +- .../org/apache/solr/schema/FieldType.java | 2 +- .../apache/solr/schema/FloatPointField.java | 16 +- .../org/apache/solr/schema/IntPointField.java | 10 +- .../apache/solr/schema/LongPointField.java | 10 +- .../apache/solr/schema/NumericFieldType.java | 48 +- .../org/apache/solr/schema/PointField.java | 31 +- .../apache/solr/search/SolrIndexSearcher.java | 214 +++--- .../solr/store/blockcache/BlockCache.java | 10 +- .../solr/uninverting/UninvertingReader.java | 37 +- .../conf/schema-distrib-interval-faceting.xml | 8 +- .../conf/schema-docValuesFaceting.xml | 11 +- .../solr/collection1/conf/schema-point.xml | 6 + .../solr/collection1/conf/schema.xml | 55 +- .../solr/collection1/conf/schema11.xml | 28 +- .../solr/collection1/conf/schema12.xml | 14 +- .../solr/collection1/conf/schema_latest.xml | 50 +- .../solrconfig-update-processor-chains.xml | 2 + .../apache/solr/TestDistributedSearch.java | 8 +- .../org/apache/solr/TestGroupingSearch.java | 1 + .../org/apache/solr/TestRandomDVFaceting.java | 26 +- .../cloud/SegmentTerminateEarlyTestState.java | 4 +- .../apache/solr/cloud/TestSegmentSorting.java | 2 + .../handler/component/StatsComponentTest.java | 7 +- .../handler/component/TermsComponentTest.java | 2 + .../apache/solr/request/TestFacetMethods.java | 11 +- .../solr/request/TestIntervalFaceting.java | 12 +- .../apache/solr/schema/TestPointFields.java | 631 +++++++++++++----- .../solr/search/TestSolrQueryParser.java | 2 +- .../update/processor/AtomicUpdatesTest.java | 64 +- solr/test-framework/ivy.xml | 2 +- .../java/org/apache/solr/SolrTestCaseJ4.java | 4 + .../solr/cloud/MiniSolrCloudCluster.java | 2 +- 65 files changed, 2628 insertions(+), 584 deletions(-) create mode 100644 
lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java create mode 100644 solr/core/src/java/org/apache/solr/handler/component/SortedNumericStatsValues.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f9c464b4388..e4042dad5e1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -107,6 +107,11 @@ New Features SortedNumericSelector.Type can give a ValueSource view of a SortedNumericDocValues field. (Tomás Fernández Löbbe) +* LUCENE-7465: Add SimplePatternTokenizer and + SimplePatternSplitTokenizer, using Lucene's regexp/automaton + implementation for analysis/tokenization (Clinton Gormley, Mike + McCandless) + Bug Fixes * LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads @@ -171,6 +176,11 @@ Build Jars are not downloaded; compilation is not updated; and Clover is not enabled. (Steve Rowe) +* LUCENE-7694: Update forbiddenapis to version 2.3. (Uwe Schindler) + +* LUCENE-7693: Replace "org.apache." logic in GetMavenDependenciesTask. + (Daniel Collins, Christine Poerschke) + Other * LUCENE-7666: Fix typos in lucene-join package info javadoc. diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java new file mode 100644 index 00000000000..d2b10c1a0d2 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.pattern; + +import java.io.IOException; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; + +/** + * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens. + * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. This is just + * like {@link SimplePatternTokenizer} except that the pattern shold make valid token separator characters, like + * {@code String.split}. Empty string tokens are never produced. + * + * @lucene.experimental + */ + +public final class SimplePatternSplitTokenizer extends Tokenizer { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + private final CharacterRunAutomaton runDFA; + + // TODO: this is copied from SimplePatternTokenizer, but there are subtle differences e.g. we track sepUpto an tokenUpto; + // find a clean way to share it: + + // TODO: we could likely use a single rolling buffer instead of two separate char buffers here. We could also use PushBackReader but I + // suspect it's slowish: + + private char[] pendingChars = new char[8]; + private int tokenUpto; + private int pendingLimit; + private int pendingUpto; + private int offset; + private int sepUpto; + private final char[] buffer = new char[1024]; + private int bufferLimit; + private int bufferNextRead; + + /** See {@link RegExp} for the accepted syntax. */ + public SimplePatternSplitTokenizer(String regexp) { + this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + + /** Runs a pre-built automaton. */ + public SimplePatternSplitTokenizer(Automaton dfa) { + this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa); + } + + /** See {@link RegExp} for the accepted syntax. */ + public SimplePatternSplitTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) { + this(factory, new RegExp(regexp).toAutomaton()); + } + + /** Runs a pre-built automaton. 
*/ + public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) { + super(factory); + + // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not + // realizing this ctor is otherwise trappy + if (dfa.isDeterministic() == false) { + throw new IllegalArgumentException("please determinize the incoming automaton first"); + } + + runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + + private void fillToken(int offsetStart) { + termAtt.setLength(tokenUpto); + offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+tokenUpto)); + } + + @Override + public boolean incrementToken() throws IOException { + + int offsetStart = offset; + + clearAttributes(); + + tokenUpto = 0; + + while (true) { + sepUpto = 0; + + // The runDFA operates in Unicode space, not UTF16 (java's char): + int ch = nextCodePoint(); + if (ch == -1) { + if (tokenUpto > 0) { + fillToken(offsetStart); + return true; + } else { + return false; + } + } + int state = runDFA.step(0, ch); + + if (state != -1) { + // a token separator just possibly started; keep scanning to see if the token is accepted: + int lastAcceptLength = -1; + do { + + if (runDFA.isAccept(state)) { + // record that the token separator matches here, but keep scanning in case a longer match also works (greedy): + lastAcceptLength = sepUpto; + } + + ch = nextCodePoint(); + if (ch == -1) { + break; + } + state = runDFA.step(state, ch); + } while (state != -1); + + if (lastAcceptLength != -1) { + // strip the trailing separater we just matched from the token: + tokenUpto -= lastAcceptLength; + // we found a token separator + int extra = sepUpto - lastAcceptLength; + if (extra != 0) { + pushBack(extra); + } + if (tokenUpto > 0) { + fillToken(offsetStart); + return true; + } else { + // we matched one token separator immediately after another + offsetStart = offset; + } + } else if (ch == -1) { + if (tokenUpto > 0) { + fillToken(offsetStart); + return true; + } else { + return false; + } + } else { + // false alarm: there was no token separator here; push back all but the first character we scanned + pushBack(sepUpto-1); + } + } + } + } + + @Override + public void end() throws IOException { + super.end(); + final int ofs = correctOffset(offset + pendingLimit - pendingUpto); + offsetAtt.setOffset(ofs, ofs); + } + + @Override + public void reset() throws IOException { + super.reset(); + offset = 0; + pendingUpto = 0; + pendingLimit = 0; + sepUpto = 0; + bufferNextRead = 0; + bufferLimit = 0; + } + + /** Pushes back the last {@code count} characters in current token's buffer. 
*/ + private void pushBack(int count) { + tokenUpto -= count; + assert tokenUpto >= 0; + if (pendingLimit == 0) { + if (bufferNextRead >= count) { + // optimize common case when the chars we are pushing back are still in the buffer + bufferNextRead -= count; + } else { + if (count > pendingChars.length) { + pendingChars = ArrayUtil.grow(pendingChars, count); + } + System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count); + pendingLimit = count; + } + } else { + // we are pushing back what is already in our pending buffer + pendingUpto -= count; + assert pendingUpto >= 0; + } + offset -= count; + } + + private void appendToToken(char ch) { + char[] buffer = termAtt.buffer(); + if (tokenUpto == buffer.length) { + buffer = termAtt.resizeBuffer(tokenUpto + 1); + } + buffer[tokenUpto++] = ch; + sepUpto++; + } + + private int nextCodeUnit() throws IOException { + int result; + if (pendingUpto < pendingLimit) { + result = pendingChars[pendingUpto++]; + if (pendingUpto == pendingLimit) { + // We used up the pending buffer + pendingUpto = 0; + pendingLimit = 0; + } + appendToToken((char) result); + offset++; + } else if (bufferLimit == -1) { + return -1; + } else { + assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit; + if (bufferNextRead == bufferLimit) { + bufferLimit = input.read(buffer, 0, buffer.length); + if (bufferLimit == -1) { + return -1; + } + bufferNextRead = 0; + } + result = buffer[bufferNextRead++]; + offset++; + appendToToken((char) result); + } + return result; + } + + private int nextCodePoint() throws IOException { + + int ch = nextCodeUnit(); + if (ch == -1) { + return ch; + } + if (Character.isHighSurrogate((char) ch)) { + return Character.toCodePoint((char) ch, (char) nextCodeUnit()); + } else { + return ch; + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java new file mode 100644 index 00000000000..4af6286c901 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.pattern; + +import java.util.Map; + +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Factory for {@link SimplePatternSplitTokenizer}, for producing tokens by splitting according to the provided regexp. + * + *

+ * This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
+ * for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the
+ * tokenization is quite a bit faster. It takes two arguments:
+ *
+ *   • "pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}
+ *   • "maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determined automaton computed from the regexp
+ *
+ * The pattern matches the characters that should split tokens, like {@code String.split}, and the
+ * matching is greedy such that the longest token separator matching at a given point is matched. Empty
+ * tokens are never created.
+ *
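+ * Because separator matching is greedy, a whole run that the pattern can match is consumed as one
+ * separator. A minimal illustration, mirroring {@code TestSimplePatternSplitTokenizer#testGreedy}
+ * added in this patch:
+ *
+ * Tokenizer t = new SimplePatternSplitTokenizer("(foo)+");
+ * t.setReader(new StringReader("bar foofoo baz"));
+ * // "foofoo" is matched as a single separator, so the tokens are "bar " and " baz"
+ *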

For example, to match tokens delimited by simple whitespace characters: + * + *

+ * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.SimplePatternSplitTokenizerFactory" pattern="[ \t\r\n]+"/>
+ *   </analyzer>
+ * </fieldType>
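+ *
+ * The same splitting can be exercised programmatically. A self-contained sketch based on the tests
+ * added in this patch (the class name SplitExample is illustrative only):
+ *
+ * import java.io.StringReader;
+ * import org.apache.lucene.analysis.Tokenizer;
+ * import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
+ * import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ *
+ * public class SplitExample {                      // illustrative example class, not part of this patch
+ *   public static void main(String[] args) throws Exception {
+ *     Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]+");
+ *     CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
+ *     t.setReader(new StringReader("a \tb c"));
+ *     t.reset();
+ *     while (t.incrementToken()) {
+ *       System.out.println(termAtt);               // prints "a", then "b", then "c"; the separators are discarded
+ *     }
+ *     t.end();
+ *     t.close();
+ *   }
+ * }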
+ * + * @lucene.experimental + * + * @see SimplePatternSplitTokenizer + */ +public class SimplePatternSplitTokenizerFactory extends TokenizerFactory { + public static final String PATTERN = "pattern"; + private final Automaton dfa; + private final int maxDeterminizedStates; + + /** Creates a new SimpleSplitPatternTokenizerFactory */ + public SimplePatternSplitTokenizerFactory(Map args) { + super(args); + maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES); + dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates); + if (args.isEmpty() == false) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public SimplePatternSplitTokenizer create(final AttributeFactory factory) { + return new SimplePatternSplitTokenizer(factory, dfa); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java new file mode 100644 index 00000000000..867b10a9d23 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.pattern; + +import java.io.IOException; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; + +/** + * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens. + * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. The provided + * regex should match valid token characters (not token separator characters, like {@code String.split}). The matching is greedy: + * the longest match at a given start point will be the next token. Empty string tokens are never produced. + * + * @lucene.experimental + */ + +// TODO: the matcher here is naive and does have N^2 adversarial cases that are unlikely to arise in practice, e.g. if the pattern is +// aaaaaaaaaab and the input is aaaaaaaaaaa, the work we do here is N^2 where N is the number of a's. This is because on failing to match +// a token, we skip one character forward and try again. 
A better approach would be to compile something like this regexp +// instead: .* | , because that automaton would not "forget" all the as it had already seen, and would be a single pass +// through the input. I think this is the same thing as Aho/Corasick's algorithm (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm). +// But we cannot implement this (I think?) until/unless Lucene regexps support sub-group capture, so we could know +// which specific characters the pattern matched. SynonymFilter has this same limitation. + +public final class SimplePatternTokenizer extends Tokenizer { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + private final CharacterRunAutomaton runDFA; + + // TODO: we could likely use a single rolling buffer instead of two separate char buffers here. We could also use PushBackReader but I + // suspect it's slowish: + + private char[] pendingChars = new char[8]; + private int pendingLimit; + private int pendingUpto; + private int offset; + private int tokenUpto; + private final char[] buffer = new char[1024]; + private int bufferLimit; + private int bufferNextRead; + + /** See {@link RegExp} for the accepted syntax. */ + public SimplePatternTokenizer(String regexp) { + this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + + /** Runs a pre-built automaton. */ + public SimplePatternTokenizer(Automaton dfa) { + this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa); + } + + /** See {@link RegExp} for the accepted syntax. */ + public SimplePatternTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) { + this(factory, new RegExp(regexp).toAutomaton()); + } + + /** Runs a pre-built automaton. 
*/ + public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) { + super(factory); + + // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not + // realizing this ctor is otherwise trappy + if (dfa.isDeterministic() == false) { + throw new IllegalArgumentException("please determinize the incoming automaton first"); + } + + runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + + @Override + public boolean incrementToken() throws IOException { + + clearAttributes(); + tokenUpto = 0; + + while (true) { + + int offsetStart = offset; + + // The runDFA operates in Unicode space, not UTF16 (java's char): + + int ch = nextCodePoint(); + if (ch == -1) { + return false; + } + + int state = runDFA.step(0, ch); + + if (state != -1) { + // a token just possibly started; keep scanning to see if the token is accepted: + int lastAcceptLength = -1; + do { + + if (runDFA.isAccept(state)) { + // record that the token matches here, but keep scanning in case a longer match also works (greedy): + lastAcceptLength = tokenUpto; + } + + ch = nextCodePoint(); + if (ch == -1) { + break; + } + state = runDFA.step(state, ch); + } while (state != -1); + + if (lastAcceptLength != -1) { + // we found a token + int extra = tokenUpto - lastAcceptLength; + if (extra != 0) { + pushBack(extra); + } + termAtt.setLength(lastAcceptLength); + offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+lastAcceptLength)); + return true; + } else if (ch == -1) { + return false; + } else { + // false alarm: there was no token here; push back all but the first character we scanned + pushBack(tokenUpto-1); + tokenUpto = 0; + } + } else { + tokenUpto = 0; + } + } + } + + @Override + public void end() throws IOException { + super.end(); + final int ofs = correctOffset(offset + pendingLimit - pendingUpto); + offsetAtt.setOffset(ofs, ofs); + } + + @Override + public void reset() throws IOException { + super.reset(); + offset = 0; + pendingUpto = 0; + pendingLimit = 0; + tokenUpto = 0; + bufferNextRead = 0; + bufferLimit = 0; + } + + /** Pushes back the last {@code count} characters in current token's buffer. 
*/ + private void pushBack(int count) { + + if (pendingLimit == 0) { + if (bufferNextRead >= count) { + // optimize common case when the chars we are pushing back are still in the buffer + bufferNextRead -= count; + } else { + if (count > pendingChars.length) { + pendingChars = ArrayUtil.grow(pendingChars, count); + } + System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count); + pendingLimit = count; + } + } else { + // we are pushing back what is already in our pending buffer + pendingUpto -= count; + assert pendingUpto >= 0; + } + offset -= count; + } + + private void appendToToken(char ch) { + char[] buffer = termAtt.buffer(); + if (tokenUpto == buffer.length) { + buffer = termAtt.resizeBuffer(tokenUpto + 1); + } + buffer[tokenUpto++] = ch; + } + + private int nextCodeUnit() throws IOException { + int result; + if (pendingUpto < pendingLimit) { + result = pendingChars[pendingUpto++]; + if (pendingUpto == pendingLimit) { + // We used up the pending buffer + pendingUpto = 0; + pendingLimit = 0; + } + appendToToken((char) result); + offset++; + } else if (bufferLimit == -1) { + return -1; + } else { + assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit; + if (bufferNextRead == bufferLimit) { + bufferLimit = input.read(buffer, 0, buffer.length); + if (bufferLimit == -1) { + return -1; + } + bufferNextRead = 0; + } + result = buffer[bufferNextRead++]; + offset++; + appendToToken((char) result); + } + return result; + } + + private int nextCodePoint() throws IOException { + + int ch = nextCodeUnit(); + if (ch == -1) { + return ch; + } + if (Character.isHighSurrogate((char) ch)) { + return Character.toCodePoint((char) ch, (char) nextCodeUnit()); + } else { + return ch; + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java new file mode 100644 index 00000000000..3e74d023b3a --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.pattern; + +import java.util.Map; + +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Factory for {@link SimplePatternTokenizer}, for matching tokens based on the provided regexp. + * + *

This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens + * for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the + * tokenization is quite a bit faster. It takes two arguments: + *
+ *

    + *
+ *   • "pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}
+ *   • "maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determined automaton computed from the regexp
+ *

+ * The pattern matches the characters to include in a token (not the split characters), and the + * matching is greedy such that the longest token matching at a given point is created. Empty + * tokens are never created. + * + *
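+ * Because matching is greedy, the longest token that matches at a given start point wins. A minimal
+ * illustration, mirroring {@code TestSimplePatternTokenizer#testGreedy} added in this patch:
+ *
+ * Tokenizer t = new SimplePatternTokenizer("(foo)+");
+ * t.setReader(new StringReader("bar foofoo baz"));
+ * // yields the single token "foofoo" (offsets 4-10), not two separate "foo" tokens
+ *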

For example, to match tokens delimited by simple whitespace characters: + * + *

+ * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.SimplePatternTokenizerFactory" pattern="[^ \t\r\n]+"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @lucene.experimental + * + * @see SimplePatternTokenizer + */ +public class SimplePatternTokenizerFactory extends TokenizerFactory { + public static final String PATTERN = "pattern"; + private final Automaton dfa; + private final int maxDeterminizedStates; + + /** Creates a new SimplePatternTokenizerFactory */ + public SimplePatternTokenizerFactory(Map args) { + super(args); + maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES); + dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates); + if (args.isEmpty() == false) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public SimplePatternTokenizer create(final AttributeFactory factory) { + return new SimplePatternTokenizer(factory, dfa); + } +} diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory index be0b7d4082a..4b37eb868ea 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory @@ -21,6 +21,8 @@ org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory org.apache.lucene.analysis.ngram.NGramTokenizerFactory org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory org.apache.lucene.analysis.pattern.PatternTokenizerFactory +org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizerFactory +org.apache.lucene.analysis.pattern.SimplePatternTokenizerFactory org.apache.lucene.analysis.standard.ClassicTokenizerFactory org.apache.lucene.analysis.standard.StandardTokenizerFactory org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 8953f9f2192..3a58bdd4991 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -96,7 +96,11 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.Version; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; import org.junit.AfterClass; import org.junit.BeforeClass; import org.tartarus.snowball.SnowballProgram; @@ -494,6 +498,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase { if (random.nextBoolean()) return null; return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random)); }); + put(Automaton.class, random -> { + return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES); + }); }}; static final Set> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs; @@ -503,7 +510,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { allowedTokenizerArgs.add(Reader.class); allowedTokenizerArgs.add(AttributeFactory.class); 
allowedTokenizerArgs.add(AttributeSource.class); - + allowedTokenizerArgs.add(Automaton.class); + allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); allowedTokenFilterArgs.addAll(argProducers.keySet()); allowedTokenFilterArgs.add(TokenStream.class); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java new file mode 100644 index 00000000000..5642c2b68e4 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.pattern; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.charfilter.MappingCharFilter; +import org.apache.lucene.analysis.charfilter.NormalizeCharMap; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.automaton.Automaton; + +public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase { + + public void testGreedy() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("(foo)+"); + t.setReader(new StringReader("bar foofoo baz")); + assertTokenStreamContents(t, + new String[] {"bar ", " baz"}, + new int[] {0, 10}, + new int[] {4, 14}); + } + + public void testBackToBack() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("foo"); + t.setReader(new StringReader("bar foofoo baz")); + assertTokenStreamContents(t, + new String[] {"bar ", " baz"}, + new int[] {0, 10}, + new int[] {4, 14}); + } + + public void testBigLookahead() throws Exception { + StringBuilder b = new StringBuilder(); + for(int i=0;i<100;i++) { + b.append('a'); + } + b.append('b'); + Tokenizer t = new SimplePatternSplitTokenizer(b.toString()); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + + b = new StringBuilder(); + for(int i=0;i<200;i++) { + b.append('a'); + } + t.setReader(new StringReader(b.toString())); + t.reset(); + assertTrue(t.incrementToken()); + assertEquals(b.toString(), termAtt.toString()); + assertFalse(t.incrementToken()); + } + + public void testNoTokens() throws Exception { + Tokenizer t = new 
SimplePatternSplitTokenizer(".*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + String s; + while (true) { + s = TestUtil.randomUnicodeString(random()); + if (s.length() > 0) { + break; + } + } + t.setReader(new StringReader(s)); + t.reset(); + assertFalse(t.incrementToken()); + } + + public void testEmptyStringPatternNoMatch() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("a*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + t.setReader(new StringReader("bbb")); + t.reset(); + assertTrue(t.incrementToken()); + assertEquals("bbb", termAtt.toString()); + assertFalse(t.incrementToken()); + } + + public void testSplitSingleCharWhitespace() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + t.setReader(new StringReader("a \tb c")); + assertTokenStreamContents(t, + new String[] {"a", "b", "c"}, + new int[] {0, 3, 7}, + new int[] {1, 4, 8}); + } + + public void testSplitMultiCharWhitespace() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + t.setReader(new StringReader("a \tb c")); + assertTokenStreamContents(t, + new String[] {"a", "b", "c"}, + new int[] {0, 3, 7}, + new int[] {1, 4, 8}); + } + + public void testLeadingNonToken() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + t.setReader(new StringReader(" a c")); + assertTokenStreamContents(t, + new String[] {"a", "c"}, + new int[] {4, 6}, + new int[] {5, 7}); + } + + public void testTrailingNonToken() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + t.setReader(new StringReader("a c ")); + assertTokenStreamContents(t, + new String[] {"a", "c"}, + new int[] {0, 2}, + new int[] {1, 3}); + } + + public void testEmptyStringPatternOneMatch() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("a*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + t.setReader(new StringReader("bbab")); + assertTokenStreamContents(t, + new String[] {"bb", "b"}, + new int[] {0, 3}, + new int[] {2, 4}); + } + + public void testEndOffset() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("a+"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class); + t.setReader(new StringReader("aaabbb")); + t.reset(); + assertTrue(t.incrementToken()); + assertEquals("bbb", termAtt.toString()); + assertFalse(t.incrementToken()); + t.end(); + assertEquals(6, offsetAtt.endOffset()); + } + + public void testFixedToken() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("aaaa"); + + t.setReader(new StringReader("aaaaaaaaaaaaaaa")); + assertTokenStreamContents(t, + new String[] {"aaa"}, + new int[] {12}, + new int[] {15}); + } + + public void testBasic() throws Exception + { + String[][] tests = { + // pattern input output + { "--", "aaa--bbb--ccc", "aaa bbb ccc" }, + { ":", "aaa:bbb:ccc", "aaa bbb ccc" }, + { ":", "boo:and:foo", "boo and foo" }, + { "o", "boo:and:foo", "b :and:f" }, + }; + + for(String[] test : tests) { + TokenStream stream = new SimplePatternSplitTokenizer(test[0]); + ((Tokenizer)stream).setReader(new StringReader(test[1])); + 
String out = tsToString(stream); + assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out); + } + } + + public void testNotDeterminized() throws Exception { + Automaton a = new Automaton(); + int start = a.createState(); + int mid1 = a.createState(); + int mid2 = a.createState(); + int end = a.createState(); + a.setAccept(end, true); + a.addTransition(start, mid1, 'a', 'z'); + a.addTransition(start, mid2, 'a', 'z'); + a.addTransition(mid1, end, 'b'); + a.addTransition(mid2, end, 'b'); + expectThrows(IllegalArgumentException.class, () -> {new SimplePatternSplitTokenizer(a);}); + } + + public void testOffsetCorrection() throws Exception { + final String INPUT = "Günther Günther is here"; + + // create MappingCharFilter + List mappingRules = new ArrayList<>(); + mappingRules.add( "\"ü\" => \"ü\"" ); + NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); + builder.add("ü", "ü"); + NormalizeCharMap normMap = builder.build(); + CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT)); + + // create SimplePatternSplitTokenizer + Tokenizer stream = new SimplePatternSplitTokenizer("Günther"); + stream.setReader(charStream); + assertTokenStreamContents(stream, + new String[] { " ", " is here" }, + new int[] { 12, 25 }, + new int[] { 13, 33 }, + INPUT.length()); + } + + /** + * TODO: rewrite tests not to use string comparison. + */ + private static String tsToString(TokenStream in) throws IOException { + StringBuilder out = new StringBuilder(); + CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); + // extra safety to enforce, that the state is not preserved and also + // assign bogus values + in.clearAttributes(); + termAtt.setEmpty().append("bogusTerm"); + in.reset(); + while (in.incrementToken()) { + if (out.length() > 0) { + out.append(' '); + } + out.append(termAtt.toString()); + in.clearAttributes(); + termAtt.setEmpty().append("bogusTerm"); + } + + in.close(); + return out.toString(); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new SimplePatternSplitTokenizer("a"); + return new TokenStreamComponents(tokenizer); + } + }; + checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); + a.close(); + + Analyzer b = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new SimplePatternSplitTokenizer("a"); + return new TokenStreamComponents(tokenizer); + } + }; + checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER); + b.close(); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java new file mode 100644 index 00000000000..b566713312b --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.pattern; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.charfilter.MappingCharFilter; +import org.apache.lucene.analysis.charfilter.NormalizeCharMap; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.automaton.Automaton; + +public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase { + + public void testGreedy() throws Exception { + Tokenizer t = new SimplePatternTokenizer("(foo)+"); + t.setReader(new StringReader("bar foofoo baz")); + assertTokenStreamContents(t, + new String[] {"foofoo"}, + new int[] {4}, + new int[] {10}); + } + + public void testBigLookahead() throws Exception { + StringBuilder b = new StringBuilder(); + for(int i=0;i<100;i++) { + b.append('a'); + } + b.append('b'); + Tokenizer t = new SimplePatternTokenizer(b.toString()); + + b = new StringBuilder(); + for(int i=0;i<200;i++) { + b.append('a'); + } + t.setReader(new StringReader(b.toString())); + t.reset(); + assertFalse(t.incrementToken()); + } + + public void testOneToken() throws Exception { + Tokenizer t = new SimplePatternTokenizer(".*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + String s; + while (true) { + s = TestUtil.randomUnicodeString(random()); + if (s.length() > 0) { + break; + } + } + t.setReader(new StringReader(s)); + t.reset(); + assertTrue(t.incrementToken()); + assertEquals(s, termAtt.toString()); + } + + public void testEmptyStringPatternNoMatch() throws Exception { + Tokenizer t = new SimplePatternTokenizer("a*"); + t.setReader(new StringReader("bbb")); + t.reset(); + assertFalse(t.incrementToken()); + } + + public void testEmptyStringPatternOneMatch() throws Exception { + Tokenizer t = new SimplePatternTokenizer("a*"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + t.setReader(new StringReader("bbab")); + t.reset(); + assertTrue(t.incrementToken()); + assertEquals("a", termAtt.toString()); + assertFalse(t.incrementToken()); + } + + public void testEndOffset() throws Exception { + Tokenizer t = new SimplePatternTokenizer("a+"); + CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); + OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class); + t.setReader(new StringReader("aaabbb")); + t.reset(); + assertTrue(t.incrementToken()); + assertEquals("aaa", termAtt.toString()); + assertFalse(t.incrementToken()); + t.end(); + assertEquals(6, offsetAtt.endOffset()); + } + + public void testFixedToken() throws Exception { + Tokenizer t = new SimplePatternTokenizer("aaaa"); + + t.setReader(new StringReader("aaaaaaaaaaaaaaa")); + assertTokenStreamContents(t, + new 
String[] {"aaaa", "aaaa", "aaaa"}, + new int[] {0, 4, 8}, + new int[] {4, 8, 12}); + } + + public void testBasic() throws Exception { + String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'" + String[][] tests = { + // pattern input output + { ":", "boo:and:foo", ": :" }, + { qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" }, + }; + + for(String[] test : tests) { + TokenStream stream = new SimplePatternTokenizer(test[0]); + ((Tokenizer)stream).setReader(new StringReader(test[1])); + String out = tsToString(stream); + + assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out); + } + } + + public void testNotDeterminized() throws Exception { + Automaton a = new Automaton(); + int start = a.createState(); + int mid1 = a.createState(); + int mid2 = a.createState(); + int end = a.createState(); + a.setAccept(end, true); + a.addTransition(start, mid1, 'a', 'z'); + a.addTransition(start, mid2, 'a', 'z'); + a.addTransition(mid1, end, 'b'); + a.addTransition(mid2, end, 'b'); + expectThrows(IllegalArgumentException.class, () -> {new SimplePatternTokenizer(a);}); + } + + public void testOffsetCorrection() throws Exception { + final String INPUT = "Günther Günther is here"; + + // create MappingCharFilter + List mappingRules = new ArrayList<>(); + mappingRules.add( "\"ü\" => \"ü\"" ); + NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); + builder.add("ü", "ü"); + NormalizeCharMap normMap = builder.build(); + CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT)); + + // create SimplePatternTokenizer + Tokenizer stream = new SimplePatternTokenizer("Günther"); + stream.setReader(charStream); + assertTokenStreamContents(stream, + new String[] { "Günther", "Günther" }, + new int[] { 0, 13 }, + new int[] { 12, 25 }, + INPUT.length()); + } + + /** + * TODO: rewrite tests not to use string comparison. 
+ */ + private static String tsToString(TokenStream in) throws IOException { + StringBuilder out = new StringBuilder(); + CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); + // extra safety to enforce, that the state is not preserved and also + // assign bogus values + in.clearAttributes(); + termAtt.setEmpty().append("bogusTerm"); + in.reset(); + while (in.incrementToken()) { + if (out.length() > 0) { + out.append(' '); + } + out.append(termAtt.toString()); + in.clearAttributes(); + termAtt.setEmpty().append("bogusTerm"); + } + + in.close(); + return out.toString(); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new SimplePatternTokenizer("a"); + return new TokenStreamComponents(tokenizer); + } + }; + checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); + a.close(); + + Analyzer b = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new SimplePatternTokenizer("a"); + return new TokenStreamComponents(tokenizer); + } + }; + checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER); + b.close(); + } +} diff --git a/lucene/common-build.xml b/lucene/common-build.xml index 2a988eb9aaa..7d64bc237b6 100644 --- a/lucene/common-build.xml +++ b/lucene/common-build.xml @@ -2348,7 +2348,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list} - diff --git a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java index 81858df198d..a536f73fc16 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java @@ -362,11 +362,13 @@ *

*
    *
 *   1. Inhibiting phrase and proximity matches in sentence boundaries – for this, a tokenizer that
- *      identifies a new sentence can add 1 to the position increment of the first token of the new sentence.
- *   2. Injecting synonyms – here, synonyms of a token should be added after that token,
- *      and their position increment should be set to 0.
- *      As result, all synonyms of a token would be considered to appear in exactly the
- *      same position as that token, and so would they be seen by phrase and proximity searches.
+ *      identifies a new sentence can add 1 to the position increment of the first token of the new sentence.
+ *   2. Injecting synonyms – synonyms of a token should be created at the same position as the
+ *      original token, and the output order of the original token and the injected synonym is undefined
+ *      as long as they both leave from the same position. As a result, all synonyms of a token would be
+ *      considered to appear in exactly the same position as that token, and so would they be seen by
+ *      phrase and proximity searches. For multi-token synonyms to work correctly, you should use
+ *      {@code SynonymGraphFilter} at search time only.
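 * For instance, a minimal sketch of such an injecting filter (the class name SingleSynonymFilter is
 * hypothetical; real synonym handling should use the synonym filters shipped with Lucene):
 *
 * import java.io.IOException;
 * import org.apache.lucene.analysis.TokenFilter;
 * import org.apache.lucene.analysis.TokenStream;
 * import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 * import org.apache.lucene.util.AttributeSource;
 *
 * // Hypothetical filter: inject a single fixed synonym at the same position as the matching token.
 * public final class SingleSynonymFilter extends TokenFilter {
 *   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 *   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
 *   private final String match, synonym;
 *   private AttributeSource.State pending;
 *
 *   public SingleSynonymFilter(TokenStream in, String match, String synonym) {
 *     super(in);
 *     this.match = match;
 *     this.synonym = synonym;
 *   }
 *
 *   @Override
 *   public boolean incrementToken() throws IOException {
 *     if (pending != null) {
 *       restoreState(pending);                 // reuse the original token's offsets and type
 *       pending = null;
 *       termAtt.setEmpty().append(synonym);    // swap in the synonym text
 *       posIncAtt.setPositionIncrement(0);     // leave from the same position as the original token
 *       return true;
 *     }
 *     if (input.incrementToken() == false) {
 *       return false;
 *     }
 *     if (termAtt.toString().equals(match)) {
 *       pending = captureState();              // emit the synonym on the next call
 *     }
 *     return true;
 *   }
 *
 *   @Override
 *   public void reset() throws IOException {
 *     super.reset();
 *     pending = null;
 *   }
 * }
 *
 * For multi-token synonyms this single-token sketch is not sufficient; as noted above, use
 * {@code SynonymGraphFilter} at search time for that case.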
* *

Token Position Length

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java index ca14bc6dd3a..abd5109e655 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java @@ -27,9 +27,9 @@ public class ByteRunAutomaton extends RunAutomaton { this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES); } - /** expert: if utf8 is true, the input is already byte-based */ + /** expert: if isBinary is true, the input is already byte-based */ public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) { - super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates); + super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, maxDeterminizedStates); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java index 70ff9aa21f6..1a9c1c92680 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java @@ -36,7 +36,7 @@ public class CharacterRunAutomaton extends RunAutomaton { * it then a TooComplexToDeterminizeException is thrown. */ public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) { - super(a, Character.MAX_CODE_POINT, false, maxDeterminizedStates); + super(a, Character.MAX_CODE_POINT+1, maxDeterminizedStates); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 718a9089ce2..b673a82e974 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -29,24 +29,24 @@ package org.apache.lucene.util.automaton; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.RamUsageEstimator; - +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.RamUsageEstimator; + /** * Automata operations. 
* @@ -335,7 +335,7 @@ final public class Operations { Transition[][] transitions2 = a2.getSortedTransitions(); Automaton c = new Automaton(); c.createState(); - LinkedList worklist = new LinkedList<>(); + ArrayDeque worklist = new ArrayDeque<>(); HashMap newstates = new HashMap<>(); StatePair p = new StatePair(0, 0, 0); worklist.add(p); @@ -435,7 +435,7 @@ final public class Operations { // TODO: cutover to iterators instead Transition[][] transitions1 = a1.getSortedTransitions(); Transition[][] transitions2 = a2.getSortedTransitions(); - LinkedList worklist = new LinkedList<>(); + ArrayDeque worklist = new ArrayDeque<>(); HashSet visited = new HashSet<>(); StatePair p = new StatePair(0, 0); worklist.add(p); @@ -682,7 +682,7 @@ final public class Operations { // Create state 0: b.createState(); - LinkedList worklist = new LinkedList<>(); + ArrayDeque worklist = new ArrayDeque<>(); Map newstate = new HashMap<>(); worklist.add(initialset); @@ -804,7 +804,7 @@ final public class Operations { return false; } - LinkedList workList = new LinkedList<>(); + ArrayDeque workList = new ArrayDeque<>(); BitSet seen = new BitSet(a.getNumStates()); workList.add(0); seen.set(0); @@ -907,7 +907,7 @@ final public class Operations { if (numStates == 0) { return live; } - LinkedList workList = new LinkedList<>(); + ArrayDeque workList = new ArrayDeque<>(); live.set(0); workList.add(0); @@ -946,7 +946,7 @@ final public class Operations { } Automaton a2 = builder.finish(); - LinkedList workList = new LinkedList<>(); + ArrayDeque workList = new ArrayDeque<>(); BitSet live = new BitSet(numStates); BitSet acceptBits = a.getAcceptStates(); int s = 0; @@ -1010,22 +1010,6 @@ final public class Operations { return result; } - /** - * Finds the largest entry whose value is less than or equal to c, or 0 if - * there is no such entry. - */ - static int findIndex(int c, int[] points) { - int a = 0; - int b = points.length; - while (b - a > 1) { - int d = (a + b) >>> 1; - if (points[d] > c) b = d; - else if (points[d] < c) a = d; - else return d; - } - return a; - } - /** * Returns true if the language of this automaton is finite. The * automaton must not have any dead states. diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java index 1d640954e13..4f539260450 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java @@ -38,13 +38,62 @@ import java.util.Arrays; */ public abstract class RunAutomaton { final Automaton automaton; - final int maxInterval; + final int alphabetSize; final int size; final boolean[] accept; final int[] transitions; // delta(state,c) = transitions[state*points.length + // getCharClass(c)] final int[] points; // char interval start points - final int[] classmap; // map from char number to class class + final int[] classmap; // map from char number to class + + /** + * Constructs a new RunAutomaton from a deterministic + * Automaton. + * + * @param a an automaton + */ + protected RunAutomaton(Automaton a, int alphabetSize) { + this(a, alphabetSize, Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + + /** + * Constructs a new RunAutomaton from a deterministic + * Automaton. 
+ * + * @param a an automaton + * @param maxDeterminizedStates maximum number of states that can be created + * while determinizing a + */ + protected RunAutomaton(Automaton a, int alphabetSize, int maxDeterminizedStates) { + this.alphabetSize = alphabetSize; + a = Operations.determinize(a, maxDeterminizedStates); + this.automaton = a; + points = a.getStartPoints(); + size = Math.max(1,a.getNumStates()); + accept = new boolean[size]; + transitions = new int[size * points.length]; + Arrays.fill(transitions, -1); + for (int n=0;nRunAutomaton from a deterministic - * Automaton. - * - * @param a an automaton - */ - public RunAutomaton(Automaton a, int maxInterval, boolean tableize) { - this(a, maxInterval, tableize, Operations.DEFAULT_MAX_DETERMINIZED_STATES); - } - - /** - * Constructs a new RunAutomaton from a deterministic - * Automaton. - * - * @param a an automaton - * @param maxDeterminizedStates maximum number of states that can be created - * while determinizing a - */ - public RunAutomaton(Automaton a, int maxInterval, boolean tableize, - int maxDeterminizedStates) { - this.maxInterval = maxInterval; - a = Operations.determinize(a, maxDeterminizedStates); - this.automaton = a; - points = a.getStartPoints(); - size = Math.max(1,a.getNumStates()); - accept = new boolean[size]; - transitions = new int[size * points.length]; - Arrays.fill(transitions, -1); - for (int n=0;n 1) { + int d = (a + b) >>> 1; + if (points[d] > c) b = d; + else if (points[d] < c) a = d; + else return d; } + return a; } - + /** * Returns the state obtained by reading the given char from the given state. * Returns -1 if not obtaining any such state. (If the original @@ -168,7 +173,8 @@ public abstract class RunAutomaton { * transition function.) */ public final int step(int state, int c) { - if (classmap == null) { + assert c < alphabetSize; + if (c >= classmap.length) { return transitions[state * points.length + getCharClass(c)]; } else { return transitions[state * points.length + classmap[c]]; @@ -179,7 +185,7 @@ public abstract class RunAutomaton { public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + maxInterval; + result = prime * result + alphabetSize; result = prime * result + points.length; result = prime * result + size; return result; @@ -191,7 +197,7 @@ public abstract class RunAutomaton { if (obj == null) return false; if (getClass() != obj.getClass()) return false; RunAutomaton other = (RunAutomaton) obj; - if (maxInterval != other.maxInterval) return false; + if (alphabetSize != other.alphabetSize) return false; if (size != other.size) return false; if (!Arrays.equals(points, other.points)) return false; if (!Arrays.equals(accept, other.accept)) return false; diff --git a/lucene/demo/ivy.xml b/lucene/demo/ivy.xml index 050f0a58883..5dd7e74fba5 100644 --- a/lucene/demo/ivy.xml +++ b/lucene/demo/ivy.xml @@ -17,7 +17,7 @@ under the License. 
--> - + diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java index 0a1755c1675..776971286e2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java @@ -367,7 +367,7 @@ class TermAutomatonScorer extends Scorer { static class TermRunAutomaton extends RunAutomaton { public TermRunAutomaton(Automaton a, int termCount) { - super(a, termCount, true); + super(a, termCount); } } diff --git a/lucene/test-framework/ivy.xml b/lucene/test-framework/ivy.xml index a71c25a51ec..a51716c8698 100644 --- a/lucene/test-framework/ivy.xml +++ b/lucene/test-framework/ivy.xml @@ -17,7 +17,7 @@ under the License. --> - + diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java index 4cd6534d64e..ca68d2e3610 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java @@ -40,6 +40,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.TestUtil; @@ -232,16 +233,7 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa dir.setRandomIOExceptionRateOnOpen(0.05); verify(dir, docValues, null, numDims, numBytesPerDim, true); } catch (IllegalStateException ise) { - if (ise.getMessage().contains("this writer hit an unrecoverable error")) { - Throwable cause = ise.getCause(); - if (cause != null && cause.getMessage().contains("a random IOException")) { - done = true; - } else { - throw ise; - } - } else { - throw ise; - } + done = handlePossiblyFakeException(ise); } catch (AssertionError ae) { if (ae.getMessage() != null && ae.getMessage().contains("does not exist; files=")) { // OK: likely we threw the random IOExc when IW was asserting the commit files exist @@ -253,23 +245,28 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa // This just means we got a too-small maxMB for the maxPointsInLeafNode; just retry w/ more heap assertTrue(iae.getMessage().contains("either increase maxMBSortInHeap or decrease maxPointsInLeafNode")); } catch (IOException ioe) { - Throwable ex = ioe; - while (ex != null) { - String message = ex.getMessage(); - if (message != null && (message.contains("a random IOException") || message.contains("background merge hit exception"))) { - done = true; - break; - } - ex = ex.getCause(); - } - if (done == false) { - throw ioe; - } + done = handlePossiblyFakeException(ioe); } } } } + // TODO: merge w/ BaseIndexFileFormatTestCase.handleFakeIOException + private boolean handlePossiblyFakeException(Exception e) { + Throwable ex = e; + while (ex != null) { + String message = ex.getMessage(); + if (message != null && (message.contains("a random IOException") || message.contains("background merge hit exception"))) { + return true; + } + ex = ex.getCause(); + } + Rethrow.rethrow(e); + + // dead code yet javac disagrees: + return false; + } + public void testMultiValued() throws Exception { int numBytesPerDim = TestUtil.nextInt(random(), 2, PointValues.MAX_NUM_BYTES); 
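The handlePossiblyFakeException method above folds the two duplicated catch blocks into a single scan of the exception's cause chain: if any cause carries the injected-failure marker, the test treats the exception as expected; otherwise it is rethrown. A self-contained sketch of that pattern (the names and the RuntimeException wrapping are this sketch's assumptions; the actual test rethrows via the test-framework's Rethrow helper):

    // Sketch of the cause-chain scan used by handlePossiblyFakeException.
    public final class FakeExceptionSketch {

      /** Returns true if any cause in the chain carries the given marker message. */
      static boolean causeChainContains(Throwable t, String marker) {
        for (Throwable cur = t; cur != null; cur = cur.getCause()) {
          String message = cur.getMessage();
          if (message != null && message.contains(marker)) {
            return true;
          }
        }
        return false;
      }

      static boolean handlePossiblyFakeException(Exception e) {
        if (causeChainContains(e, "a random IOException")) {
          return true;  // the deliberately injected failure: the caller simply stops retrying
        }
        throw new RuntimeException(e);  // anything else is a real failure and must propagate
      }

      public static void main(String[] args) {
        Exception fake = new java.io.IOException("wrapper",
            new java.io.IOException("a random IOException"));
        System.out.println(handlePossiblyFakeException(fake));  // true
      }
    }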
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS); diff --git a/lucene/tools/ivy.xml b/lucene/tools/ivy.xml index 614aa8eb45b..1fa2974f85b 100644 --- a/lucene/tools/ivy.xml +++ b/lucene/tools/ivy.xml @@ -17,7 +17,7 @@ under the License. --> - + diff --git a/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java b/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java index 45a9d1126ab..5b2f0b80b00 100644 --- a/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java +++ b/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java @@ -54,6 +54,7 @@ import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; +import java.util.function.Consumer; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -118,6 +119,7 @@ public class GetMavenDependenciesTask extends Task { private final DocumentBuilder documentBuilder; private File ivyCacheDir; private Pattern internalJarPattern; + private Map ivyModuleInfo; /** @@ -189,6 +191,8 @@ public class GetMavenDependenciesTask extends Task { internalJarPattern = Pattern.compile(".*(lucene|solr)([^/]*?)-" + Pattern.quote(getProject().getProperty("version")) + "\\.jar"); + ivyModuleInfo = getIvyModuleInfo(ivyXmlResources, documentBuilder, xpath); + setInternalDependencyProperties(); // side-effect: all modules' internal deps are recorded setExternalDependencyProperties(); // side-effect: all modules' external deps are recorded setGrandparentDependencyManagementProperty(); // uses deps recorded in above two methods @@ -219,11 +223,57 @@ public class GetMavenDependenciesTask extends Task { } } + /** + * Visits all ivy.xml files and collects module and organisation attributes into a map. + */ + private static Map getIvyModuleInfo(Resources ivyXmlResources, + DocumentBuilder documentBuilder, XPath xpath) { + Map ivyInfoModuleToOrganisation = new HashMap(); + traverseIvyXmlResources(ivyXmlResources, new Consumer() { + @Override + public void accept(File f) { + try { + Document document = documentBuilder.parse(f); + { + String infoPath = "/ivy-module/info"; + NodeList infos = (NodeList)xpath.evaluate(infoPath, document, XPathConstants.NODESET); + for (int infoNum = 0 ; infoNum < infos.getLength() ; ++infoNum) { + Element infoElement = (Element)infos.item(infoNum); + String infoOrg = infoElement.getAttribute("organisation"); + String infoOrgSuffix = infoOrg.substring(infoOrg.lastIndexOf('.')+1); + String infoModule = infoElement.getAttribute("module"); + String module = infoOrgSuffix+"-"+infoModule; + ivyInfoModuleToOrganisation.put(module, infoOrg); + } + } + } catch (XPathExpressionException | IOException | SAXException e) { + throw new RuntimeException(e); + } + } + }); + return ivyInfoModuleToOrganisation; + } + /** * Collects external dependencies from each ivy.xml file and sets * external dependency properties to be inserted into modules' POMs. 
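The getIvyModuleInfo method above derives each internal artifact's groupId from the ivy.xml info element's organisation/module attributes instead of the old "org.apache." + prefix guess. A trimmed-down sketch of that mapping, with the XPath and Ant resource plumbing omitted and all names and the inline XML purely illustrative:

    // Sketch: read organisation/module from an ivy.xml <info> element and key the map
    // by "<orgSuffix>-<module>", as getIvyModuleInfo does per ivy.xml file.
    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;
    import javax.xml.parsers.DocumentBuilderFactory;
    import org.w3c.dom.Element;
    import org.xml.sax.InputSource;

    public class IvyModuleInfoSketch {
      public static void main(String[] args) throws Exception {
        String ivyXml = "<ivy-module version='2.0'>"
            + "<info organisation='org.apache.solr' module='core'/></ivy-module>";
        Element info = (Element) DocumentBuilderFactory.newInstance().newDocumentBuilder()
            .parse(new InputSource(new StringReader(ivyXml)))
            .getElementsByTagName("info").item(0);

        String organisation = info.getAttribute("organisation");                       // org.apache.solr
        String orgSuffix = organisation.substring(organisation.lastIndexOf('.') + 1);  // solr
        String module = orgSuffix + "-" + info.getAttribute("module");                 // solr-core

        Map<String, String> moduleToOrganisation = new HashMap<>();
        moduleToOrganisation.put(module, organisation);
        // POM generation can then look up the real groupId per internal artifactId.
        System.out.println(module + " -> " + moduleToOrganisation.get(module));
      }
    }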
*/ private void setExternalDependencyProperties() { + traverseIvyXmlResources(ivyXmlResources, new Consumer() { + @Override + public void accept(File f) { + try { + collectExternalDependenciesFromIvyXmlFile(f); + } catch (XPathExpressionException | IOException | SAXException e) { + throw new RuntimeException(e); + } + } + }); + addSharedExternalDependencies(); + setExternalDependencyXmlProperties(); + } + + private static void traverseIvyXmlResources(Resources ivyXmlResources, Consumer ivyXmlFileConsumer) { @SuppressWarnings("unchecked") Iterator iter = (Iterator)ivyXmlResources.iterator(); while (iter.hasNext()) { @@ -238,15 +288,13 @@ public class GetMavenDependenciesTask extends Task { File ivyXmlFile = ((FileResource)resource).getFile(); try { - collectExternalDependenciesFromIvyXmlFile(ivyXmlFile); + ivyXmlFileConsumer.accept(ivyXmlFile); } catch (BuildException e) { throw e; } catch (Exception e) { throw new BuildException("Exception reading file " + ivyXmlFile.getPath() + ": " + e, e); } } - addSharedExternalDependencies(); - setExternalDependencyXmlProperties(); } /** @@ -396,7 +444,7 @@ public class GetMavenDependenciesTask extends Task { } } } - String groupId = "org.apache." + artifactId.substring(0, artifactId.indexOf('-')); + String groupId = ivyModuleInfo.get(artifactId); appendDependencyXml(builder, groupId, artifactId, " ", "${project.version}", false, false, null, exclusions); } } @@ -581,7 +629,7 @@ public class GetMavenDependenciesTask extends Task { continue; // skip external (/(test-)lib/), and non-jar and unwanted (self) internal deps } String artifactId = dependencyToArtifactId(newPropertyKey, dependency); - String groupId = "org.apache." + artifactId.substring(0, artifactId.indexOf('-')); + String groupId = ivyModuleInfo.get(artifactId); String coordinate = groupId + ':' + artifactId; sortedDeps.add(coordinate); } diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4a8766edc27..6cd52911487 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -74,20 +74,12 @@ Optimizations * SOLR-9584: Support Solr being proxied with another endpoint than default /solr, by using relative links in AdminUI javascripts (Yun Jie Zhou via janhoy) -* SOLR-9996: Unstored IntPointField returns Long type (Ishan Chattopadhyaya) - * SOLR-5944: In-place updates of Numeric DocValues. To leverage this, the _version_ field and the updated field must both be stored=false, indexed=false, docValues=true. (Ishan Chattopadhyaya, hossman, noble, shalin, yonik) Other Changes ---------------------- -* SOLR-8396: Add support for PointFields in Solr (Ishan Chattopadhyaya, Tomás Fernández Löbbe) - -* SOLR-10011: Refactor PointField & TrieField to now have a common base class, NumericFieldType. The - TrieField.TrieTypes and PointField.PointTypes are now consolidated to NumericFieldType.NumberType. This - refactoring also fixes a bug whereby PointFields were not using DocValues for range queries for - indexed=false, docValues=true fields. (Ishan Chattopadhyaya, Tomás Fernández Löbbe) ================== 6.5.0 ================== @@ -139,6 +131,10 @@ New Features * SOLR-9903: Stop interrupting the update executor on shutdown, it can cause graceful shutdowns to put replicas into Leader Initiated Recovery among other undesirable things. 
(Mark Miller) +* SOLR-8396: Add support for PointFields in Solr (Ishan Chattopadhyaya, Tomás Fernández Löbbe) + +* SOLR-9987: Add support for MultiValued DocValues in PointFields using SortedNumericDocValues (Tomás Fernández Löbbe) + Bug Fixes ---------------------- @@ -161,6 +157,8 @@ Bug Fixes * SOLR-10063: CoreContainer shutdown has race condition that can cause a hang on shutdown. (Mark Miller) +* SOLR-10104: BlockDirectoryCache release hooks do not work with multiple directories. (Mike Drob, Mark Miller) + Optimizations ---------------------- @@ -197,6 +195,13 @@ Other Changes * SOLR-10072: The test TestSelectiveWeightCreation appears to be unreliable. (Michael Nilsson via Mark Miller) +* SOLR-9996: Unstored IntPointField returns Long type (Ishan Chattopadhyaya) + +* SOLR-10011: Refactor PointField & TrieField to now have a common base class, NumericFieldType. The + TrieField.TrieTypes and PointField.PointTypes are now consolidated to NumericFieldType.NumberType. This + refactoring also fixes a bug whereby PointFields were not using DocValues for range queries for + indexed=false, docValues=true fields. (Ishan Chattopadhyaya, Tomás Fernández Löbbe) + ================== 6.4.1 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/contrib/extraction/ivy.xml b/solr/contrib/extraction/ivy.xml index 5cf19a17f36..42cee8af805 100644 --- a/solr/contrib/extraction/ivy.xml +++ b/solr/contrib/extraction/ivy.xml @@ -17,7 +17,7 @@ under the License. --> - + diff --git a/solr/core/src/java/org/apache/solr/api/V2HttpCall.java b/solr/core/src/java/org/apache/solr/api/V2HttpCall.java index c996b252a2a..4a053dc36e2 100644 --- a/solr/core/src/java/org/apache/solr/api/V2HttpCall.java +++ b/solr/core/src/java/org/apache/solr/api/V2HttpCall.java @@ -136,7 +136,7 @@ public class V2HttpCall extends HttpSolrCall { if (isCompositeApi && apiInfo instanceof CompositeApi) { ((CompositeApi) this.api).add(apiInfo); } else { - api = apiInfo == null ? 
api : apiInfo; + api = apiInfo; } MDCLoggingContext.setCore(core); parseRequest(); diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java index 0b74ccba180..3a8aa3edfca 100644 --- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java +++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java @@ -490,7 +490,6 @@ public class Overseer implements Closeable { this.zkController = zkController; this.stats = new Stats(); this.config = config; - assert ObjectReleaseTracker.track(this); } public synchronized void start(String id) { @@ -521,6 +520,7 @@ public class Overseer implements Closeable { updaterThread.start(); ccThread.start(); arfoThread.start(); + assert ObjectReleaseTracker.track(this); } public Stats getStats() { diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java index 4be643e3bfe..123abeacc33 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java @@ -688,7 +688,12 @@ public class RealTimeGetComponent extends SearchComponent if (sf != null && sf.multiValued()) { List vals = new ArrayList<>(); - vals.add( f ); + if (f.fieldType().docValuesType() == DocValuesType.SORTED_NUMERIC) { + // SORTED_NUMERICS store sortable bits version of the value, need to retrieve the original + vals.add(sf.getType().toObject(f)); + } else { + vals.add( f ); + } out.setField( f.name(), vals ); } else{ diff --git a/solr/core/src/java/org/apache/solr/handler/component/SortedNumericStatsValues.java b/solr/core/src/java/org/apache/solr/handler/component/SortedNumericStatsValues.java new file mode 100644 index 00000000000..007fb28111d --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/component/SortedNumericStatsValues.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
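The RealTimeGetComponent hunk above (and SortedNumericStatsValues below) decode SORTED_NUMERIC doc values with NumericUtils because multi-valued point fields store the sortable-bits form of float/double values rather than the raw IEEE bits. A small sketch of that encoding, assuming only lucene-core on the classpath:

    // Sketch of the sortable-bits encoding used by SORTED_NUMERIC doc values for
    // float/double point fields; this is why readers decode with NumericUtils instead of
    // returning the raw long.
    import org.apache.lucene.util.NumericUtils;

    public class SortableBitsSketch {
      public static void main(String[] args) {
        double value = -2.5;
        long stored = NumericUtils.doubleToSortableLong(value);       // written to doc values
        double readBack = NumericUtils.sortableLongToDouble(stored);  // decoded on the read side
        System.out.println(value == readBack);                        // true: lossless round trip

        // The encoding keeps numeric order on the long, which raw IEEE bits do not for negatives:
        System.out.println(NumericUtils.doubleToSortableLong(-2.0)
            < NumericUtils.doubleToSortableLong(-1.0));               // true
        System.out.println(Double.doubleToLongBits(-2.0)
            < Double.doubleToLongBits(-1.0));                         // false

        float f = 3.25f;
        System.out.println(f == NumericUtils.sortableIntToFloat(NumericUtils.floatToSortableInt(f))); // true
      }
    }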
+ */ +package org.apache.solr.handler.component; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.schema.NumberType; + +public class SortedNumericStatsValues implements StatsValues { + + private final NumericStatsValues nsv; + private final String fieldName; + private final NumberType numberType; + private SortedNumericDocValues sndv; + + + public SortedNumericStatsValues(NumericStatsValues nsv, StatsField field) { + this.nsv = nsv; + this.fieldName = field.getSchemaField().getName(); + this.numberType = field.getSchemaField().getType().getNumberType(); + } + + @Override + public void accumulate(NamedList stv) { + nsv.accumulate(stv); + } + + @Override + public void accumulate(int docId) throws IOException { + if (!sndv.advanceExact(docId)) { + missing(); + } else { + for (int i = 0 ; i < sndv.docValueCount(); i++) { + nsv.accumulate(toCorrectType(sndv.nextValue()), 1); + } + } + + } + + private Number toCorrectType(long value) { + switch (numberType) { + case INTEGER: + case LONG: + return value; + case FLOAT: + return NumericUtils.sortableIntToFloat((int)value); + case DOUBLE: + return NumericUtils.sortableLongToDouble(value); + default: + throw new AssertionError("Unsupported number type"); + } + } + + @Override + public void accumulate(BytesRef value, int count) { + nsv.accumulate(value, count); + } + + @Override + public void missing() { + nsv.missing(); + } + + @Override + public void addMissing(int count) { + nsv.addMissing(count); + } + + @Override + public void addFacet(String facetName, Map facetValues) { + nsv.addFacet(facetName, facetValues); + } + + @Override + public NamedList getStatsValues() { + return nsv.getStatsValues(); + } + + @Override + public void setNextReader(LeafReaderContext ctx) throws IOException { + sndv = DocValues.getSortedNumeric(ctx.reader(), fieldName); + assert sndv != null; + } + +} diff --git a/solr/core/src/java/org/apache/solr/handler/component/StatsField.java b/solr/core/src/java/org/apache/solr/handler/component/StatsField.java index 03bf814bb31..1d0ba69949a 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/StatsField.java +++ b/solr/core/src/java/org/apache/solr/handler/component/StatsField.java @@ -416,7 +416,7 @@ public class StatsField { return StatsValuesFactory.createStatsValues(this); } - if (null != schemaField + if (null != schemaField && !schemaField.getType().isPointField() && (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache())) { // TODO: should this also be used for single-valued string fields? 
(should work fine) diff --git a/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java b/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java index 7605f73902a..2a6e795a6cb 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java +++ b/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java @@ -66,7 +66,12 @@ public class StatsValuesFactory { if (TrieDateField.class.isInstance(fieldType)) { return new DateStatsValues(statsField); } else if (TrieField.class.isInstance(fieldType) || PointField.class.isInstance(fieldType)) { - return new NumericStatsValues(statsField); + + NumericStatsValues statsValue = new NumericStatsValues(statsField); + if (sf.multiValued()) { + return new SortedNumericStatsValues(statsValue, statsField); + } + return statsValue; } else if (StrField.class.isInstance(fieldType)) { return new StringStatsValues(statsField); } else if (sf.getType().getClass().equals(EnumField.class)) { diff --git a/solr/core/src/java/org/apache/solr/request/IntervalFacets.java b/solr/core/src/java/org/apache/solr/request/IntervalFacets.java index 8b7cd3ccc3c..9a23a839cd9 100644 --- a/solr/core/src/java/org/apache/solr/request/IntervalFacets.java +++ b/solr/core/src/java/org/apache/solr/request/IntervalFacets.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.FilterNumericDocValues; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; @@ -174,8 +175,12 @@ public class IntervalFacets implements Iterable { } private void doCount() throws IOException { - if (schemaField.getType().getNumberType() != null && !schemaField.multiValued()) { - getCountNumeric(); + if (schemaField.getType().getNumberType() != null && (!schemaField.multiValued() || schemaField.getType().isPointField())) { + if (schemaField.multiValued()) { + getCountMultiValuedNumeric(); + } else { + getCountNumeric(); + } } else { getCountString(); } @@ -241,6 +246,36 @@ public class IntervalFacets implements Iterable { } } } + + private void getCountMultiValuedNumeric() throws IOException { + final FieldType ft = schemaField.getType(); + final String fieldName = schemaField.getName(); + if (ft.getNumberType() == null) { + throw new IllegalStateException(); + } + final List leaves = searcher.getIndexReader().leaves(); + + final Iterator ctxIt = leaves.iterator(); + LeafReaderContext ctx = null; + SortedNumericDocValues longs = null; + for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) { + final int doc = docsIt.nextDoc(); + if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) { + do { + ctx = ctxIt.next(); + } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()); + assert doc >= ctx.docBase; + longs = DocValues.getSortedNumeric(ctx.reader(), fieldName); + } + int valuesDocID = longs.docID(); + if (valuesDocID < doc - ctx.docBase) { + valuesDocID = longs.advance(doc - ctx.docBase); + } + if (valuesDocID == doc - ctx.docBase) { + accumIntervalWithMultipleValues(longs); + } + } + } private void getCountString() throws IOException { Filter filter = docs.getTopFilter(); @@ -276,6 +311,44 @@ public class IntervalFacets implements Iterable { } } + private void accumIntervalWithMultipleValues(SortedNumericDocValues 
longs) throws IOException { + // longs should be already positioned to the correct doc + assert longs.docID() != -1; + assert longs.docValueCount() > 0: "Should have at least one value for this document"; + int currentInterval = 0; + for (int i = 0; i < longs.docValueCount(); i++) { + boolean evaluateNextInterval = true; + long value = longs.nextValue(); + while (evaluateNextInterval && currentInterval < intervals.length) { + IntervalCompareResult result = intervals[currentInterval].includes(value); + switch (result) { + case INCLUDED: + /* + * Increment the current interval and move to the next one using + * the same value + */ + intervals[currentInterval].incCount(); + currentInterval++; + break; + case LOWER_THAN_START: + /* + * None of the next intervals will match this value (all of them have + * higher start value). Move to the next value for this document. + */ + evaluateNextInterval = false; + break; + case GREATER_THAN_END: + /* + * Next interval may match this value + */ + currentInterval++; + break; + } + //Maybe return if currentInterval == intervals.length? + } + } + } + private void accumIntervalsMulti(SortedSetDocValues ssdv, DocIdSetIterator disi, Bits bits) throws IOException { // First update the ordinals in the intervals for this segment diff --git a/solr/core/src/java/org/apache/solr/request/NumericFacets.java b/solr/core/src/java/org/apache/solr/request/NumericFacets.java index 9452c530caa..a72eeeede99 100644 --- a/solr/core/src/java/org/apache/solr/request/NumericFacets.java +++ b/solr/core/src/java/org/apache/solr/request/NumericFacets.java @@ -32,12 +32,14 @@ import org.apache.lucene.index.FilterNumericDocValues; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.StringHelper; import org.apache.solr.common.params.FacetParams; @@ -61,16 +63,18 @@ final class NumericFacets { long[] bits; // bits identifying a value int[] counts; - int[] docIDs; + int[] docIDs; //Will be null if HashTable is created with needsDocId=false int mask; int size; int threshold; - HashTable() { + HashTable(boolean needsDocId) { final int capacity = 64; // must be a power of 2 bits = new long[capacity]; counts = new int[capacity]; - docIDs = new int[capacity]; + if (needsDocId) { + docIDs = new int[capacity]; + } mask = capacity - 1; size = 0; threshold = (int) (capacity * LOAD_FACTOR); @@ -99,6 +103,23 @@ final class NumericFacets { break; } } + + void add(long value, int count) { + if (size >= threshold) { + rehash(); + } + final int h = hash(value); + for (int slot = h; ; slot = (slot + 1) & mask) { + if (counts[slot] == 0) { + bits[slot] = value; + ++size; + } else if (bits[slot] != value) { + continue; + } + counts[slot] += count; + break; + } + } private void rehash() { final long[] oldBits = bits; @@ -108,14 +129,24 @@ final class NumericFacets { final int newCapacity = bits.length * 2; bits = new long[newCapacity]; counts = new int[newCapacity]; - docIDs = new int[newCapacity]; + if (oldDocIDs!= null) { + docIDs = new int[newCapacity]; + } mask = 
newCapacity - 1; threshold = (int) (LOAD_FACTOR * newCapacity); size = 0; - for (int i = 0; i < oldBits.length; ++i) { - if (oldCounts[i] > 0) { - add(oldDocIDs[i], oldBits[i], oldCounts[i]); + if (oldDocIDs!= null) { + for (int i = 0; i < oldBits.length; ++i) { + if (oldCounts[i] > 0) { + add(oldDocIDs[i], oldBits[i], oldCounts[i]); + } + } + } else { + for (int i = 0; i < oldBits.length; ++i) { + if (oldCounts[i] > 0) { + add(oldBits[i], oldCounts[i]); + } } } } @@ -129,7 +160,16 @@ final class NumericFacets { } public static NamedList getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort) throws IOException { - final boolean zeros = mincount <= 0; + final SchemaField sf = searcher.getSchema().getField(fieldName); + if (sf.multiValued()) { + // TODO: evaluate using getCountsMultiValued for singleValued numerics with SingletonSortedNumericDocValues + return getCountsMultiValued(searcher, docs, fieldName, offset, limit, mincount, missing, sort); + } + return getCountsSingleValue(searcher, docs, fieldName, offset, limit, mincount, missing, sort); + } + + private static NamedList getCountsSingleValue(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort) throws IOException { + boolean zeros = mincount <= 0; mincount = Math.max(mincount, 1); final SchemaField sf = searcher.getSchema().getField(fieldName); final FieldType ft = sf.getType(); @@ -137,10 +177,11 @@ final class NumericFacets { if (numericType == null) { throw new IllegalStateException(); } + zeros = zeros && !ft.isPointField() && sf.indexed(); // We don't return zeros when using PointFields or when index=false final List leaves = searcher.getIndexReader().leaves(); // 1. accumulate - final HashTable hashTable = new HashTable(); + final HashTable hashTable = new HashTable(true); final Iterator ctxIt = leaves.iterator(); LeafReaderContext ctx = null; NumericDocValues longs = null; @@ -363,4 +404,118 @@ final class NumericFacets { return result; } + private static NamedList getCountsMultiValued(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort) throws IOException { + // If facet.mincount=0 with PointFields the only option is to get the values from DocValues + // not currently supported. See SOLR-10033 + mincount = Math.max(mincount, 1); + final SchemaField sf = searcher.getSchema().getField(fieldName); + final FieldType ft = sf.getType(); + assert sf.multiValued(); + final List leaves = searcher.getIndexReader().leaves(); + + // 1. 
accumulate + final HashTable hashTable = new HashTable(false); + final Iterator ctxIt = leaves.iterator(); + LeafReaderContext ctx = null; + SortedNumericDocValues longs = null; + int missingCount = 0; + for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) { + final int doc = docsIt.nextDoc(); + if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) { + do { + ctx = ctxIt.next(); + } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()); + assert doc >= ctx.docBase; + longs = DocValues.getSortedNumeric(ctx.reader(), fieldName); + } + int valuesDocID = longs.docID(); + if (valuesDocID < doc - ctx.docBase) { + valuesDocID = longs.advance(doc - ctx.docBase); + } + if (valuesDocID == doc - ctx.docBase) { + long l = longs.nextValue(); // This document must have at least one value + hashTable.add(l, 1); + for (int i = 1; i < longs.docValueCount(); i++) { + long lnew = longs.nextValue(); + if (lnew > l) { // Skip the value if it's equal to the last one, we don't want to double-count it + hashTable.add(lnew, 1); + } + l = lnew; + } + + } else { + ++missingCount; + } + } + + // 2. select top-k facet values + final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size); + final PriorityQueue pq; + if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) { + pq = new PriorityQueue(pqSize) { + @Override + protected boolean lessThan(Entry a, Entry b) { + if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) { + return true; + } else { + return false; + } + } + }; + } else { + // sort=index + pq = new PriorityQueue(pqSize) { + @Override + protected boolean lessThan(Entry a, Entry b) { + return a.bits > b.bits; + } + }; + } + Entry e = null; + for (int i = 0; i < hashTable.bits.length; ++i) { + if (hashTable.counts[i] >= mincount) { + if (e == null) { + e = new Entry(); + } + e.bits = hashTable.bits[i]; + e.count = hashTable.counts[i]; + e = pq.insertWithOverflow(e); + } + } + + // 4. 
build the NamedList + final NamedList result = new NamedList<>(Math.max(pq.size() - offset + 1, 1)); + final Deque counts = new ArrayDeque<>(pq.size() - offset); + while (pq.size() > offset) { + counts.addFirst(pq.pop()); + } + + for (Entry entry : counts) { + result.add(bitsToStringValue(ft, entry.bits), entry.count); // TODO: convert to correct value + } + + // Once facet.mincount=0 is supported we'll need to add logic similar to the SingleValue case, but obtaining values + // with count 0 from DocValues + + if (missing) { + result.add(null, missingCount); + } + return result; + } + + private static String bitsToStringValue(FieldType fieldType, long bits) { + switch (fieldType.getNumberType()) { + case LONG: + case INTEGER: + return String.valueOf(bits); + case FLOAT: + return String.valueOf(NumericUtils.sortableIntToFloat((int)bits)); + case DOUBLE: + return String.valueOf(NumericUtils.sortableLongToDouble(bits)); + //TODO: DATE + default: + throw new AssertionError("Unsupported NumberType: " + fieldType.getNumberType()); + } + } + } diff --git a/solr/core/src/java/org/apache/solr/request/SimpleFacets.java b/solr/core/src/java/org/apache/solr/request/SimpleFacets.java index 22a837a2891..41909580746 100644 --- a/solr/core/src/java/org/apache/solr/request/SimpleFacets.java +++ b/solr/core/src/java/org/apache/solr/request/SimpleFacets.java @@ -482,8 +482,8 @@ public class SimpleFacets { counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, termFilter, exists); break; case FCS: - assert !multiToken; - if (ft.getNumberType() != null && !sf.multiValued()) { + assert ft.isPointField() || !multiToken; + if (ft.isPointField() || (ft.getNumberType() != null && !sf.multiValued())) { // force numeric faceting if (prefix != null && !prefix.isEmpty()) { throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types"); @@ -494,6 +494,10 @@ public class SimpleFacets { throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types"); } } +// We should do this, but mincount=0 is currently the default +// if (ft.isPointField() && mincount <= 0) { +// throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_MINCOUNT + " <= 0 is not supported on point types"); +// } counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort); } else { PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter); diff --git a/solr/core/src/java/org/apache/solr/schema/DoublePointField.java b/solr/core/src/java/org/apache/solr/schema/DoublePointField.java index b9a7311f5e2..3a90eecb939 100644 --- a/solr/core/src/java/org/apache/solr/schema/DoublePointField.java +++ b/solr/core/src/java/org/apache/solr/schema/DoublePointField.java @@ -27,10 +27,13 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.legacy.LegacyNumericType; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.valuesource.DoubleFieldSource; +import org.apache.lucene.queries.function.valuesource.MultiValuedDoubleFieldSource; import org.apache.lucene.search.Query; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSelector; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.NumericUtils; import 
org.apache.solr.search.QParser; import org.apache.solr.uninverting.UninvertingReader.Type; import org.slf4j.Logger; @@ -91,6 +94,8 @@ public class DoublePointField extends PointField implements DoubleValueFieldType if (val != null) { if (f.fieldType().stored() == false && f.fieldType().docValuesType() == DocValuesType.NUMERIC) { return Double.longBitsToDouble(val.longValue()); + } else if (f.fieldType().stored() == false && f.fieldType().docValuesType() == DocValuesType.SORTED_NUMERIC) { + return NumericUtils.sortableLongToDouble(val.longValue()); } else { return val; } @@ -149,8 +154,7 @@ public class DoublePointField extends PointField implements DoubleValueFieldType @Override public Type getUninversionType(SchemaField sf) { if (sf.multiValued()) { - throw new UnsupportedOperationException("MultiValued Point fields with DocValues is not currently supported"); -// return Type.SORTED_DOUBLE; + return Type.SORTED_DOUBLE; } else { return Type.DOUBLE_POINT; } @@ -161,6 +165,11 @@ public class DoublePointField extends PointField implements DoubleValueFieldType field.checkFieldCacheSource(); return new DoubleFieldSource(field.getName()); } + + @Override + protected ValueSource getSingleValueSource(SortedNumericSelector.Type choice, SchemaField f) { + return new MultiValuedDoubleFieldSource(f.getName(), choice); + } @Override public LegacyNumericType getNumericType() { diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index 7f4400021a3..c21b23fb565 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -626,7 +626,7 @@ public abstract class FieldType extends FieldProperties { /** * Return the numeric type of this field, or null if this field is not a - * numeric field. + * numeric field. 
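For a multiValued double point field, the PointField.createFields change later in this patch indexes one SortedNumericDocValuesField per value alongside the point, which is what DoublePointField above then decodes via sortableLongToDouble. A rough sketch of the resulting per-document fields (the field name and values are made up, and the stored copy only applies when stored=true):

    // Sketch of what a multiValued DoublePointField contributes to one Lucene document:
    // an indexed point, sortable-bits doc values, and optionally a stored copy per value.
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.DoublePoint;
    import org.apache.lucene.document.SortedNumericDocValuesField;
    import org.apache.lucene.document.StoredField;
    import org.apache.lucene.util.NumericUtils;

    public class MultiValuedPointFieldSketch {
      public static void main(String[] args) {
        Document doc = new Document();
        for (double value : new double[] {1.5, -3.0, 42.0}) {
          doc.add(new DoublePoint("price_pd", value));                  // indexed point
          doc.add(new SortedNumericDocValuesField("price_pd",
              NumericUtils.doubleToSortableLong(value)));               // multi-valued doc values
          doc.add(new StoredField("price_pd", value));                  // stored copy (if stored=true)
        }
        System.out.println(doc.getFields().size());                     // 9: three fields per value
      }
    }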
*/ public NumberType getNumberType() { return null; diff --git a/solr/core/src/java/org/apache/solr/schema/FloatPointField.java b/solr/core/src/java/org/apache/solr/schema/FloatPointField.java index 7b866fce9a6..66472868cd9 100644 --- a/solr/core/src/java/org/apache/solr/schema/FloatPointField.java +++ b/solr/core/src/java/org/apache/solr/schema/FloatPointField.java @@ -27,10 +27,13 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.legacy.LegacyNumericType; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.valuesource.FloatFieldSource; +import org.apache.lucene.queries.function.valuesource.MultiValuedFloatFieldSource; import org.apache.lucene.search.Query; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSelector; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.NumericUtils; import org.apache.solr.search.QParser; import org.apache.solr.uninverting.UninvertingReader.Type; import org.slf4j.Logger; @@ -91,7 +94,9 @@ public class FloatPointField extends PointField implements FloatValueFieldType { if (val != null) { if (f.fieldType().stored() == false && f.fieldType().docValuesType() == DocValuesType.NUMERIC) { return Float.intBitsToFloat(val.intValue()); - } else { + } else if (f.fieldType().stored() == false && f.fieldType().docValuesType() == DocValuesType.SORTED_NUMERIC) { + return NumericUtils.sortableIntToFloat(val.intValue()); + } else { return val; } } else { @@ -149,8 +154,7 @@ public class FloatPointField extends PointField implements FloatValueFieldType { @Override public Type getUninversionType(SchemaField sf) { if (sf.multiValued()) { - throw new UnsupportedOperationException("MultiValued Point fields with DocValues is not currently supported"); -// return Type.SORTED_FLOAT; + return Type.SORTED_FLOAT; } else { return Type.FLOAT_POINT; } @@ -161,6 +165,12 @@ public class FloatPointField extends PointField implements FloatValueFieldType { field.checkFieldCacheSource(); return new FloatFieldSource(field.getName()); } + + @Override + protected ValueSource getSingleValueSource(SortedNumericSelector.Type choice, SchemaField f) { + return new MultiValuedFloatFieldSource(f.getName(), choice); + } + @Override public LegacyNumericType getNumericType() { diff --git a/solr/core/src/java/org/apache/solr/schema/IntPointField.java b/solr/core/src/java/org/apache/solr/schema/IntPointField.java index 3e74241f07a..b25bc9f5faa 100644 --- a/solr/core/src/java/org/apache/solr/schema/IntPointField.java +++ b/solr/core/src/java/org/apache/solr/schema/IntPointField.java @@ -26,8 +26,10 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.legacy.LegacyNumericType; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.valuesource.IntFieldSource; +import org.apache.lucene.queries.function.valuesource.MultiValuedIntFieldSource; import org.apache.lucene.search.Query; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSelector; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.solr.search.QParser; @@ -149,8 +151,7 @@ public class IntPointField extends PointField implements IntValueFieldType { @Override public Type getUninversionType(SchemaField sf) { if (sf.multiValued()) { - throw new UnsupportedOperationException("MultiValued Point fields with DocValues is not currently 
supported"); -// return Type.SORTED_INTEGER; + return Type.SORTED_INTEGER; } else { return Type.INTEGER_POINT; } @@ -182,5 +183,10 @@ public class IntPointField extends PointField implements IntValueFieldType { protected StoredField getStoredField(SchemaField sf, Object value) { return new StoredField(sf.getName(), (Integer) this.toNativeType(value)); } + + @Override + protected ValueSource getSingleValueSource(SortedNumericSelector.Type choice, SchemaField f) { + return new MultiValuedIntFieldSource(f.getName(), choice); + } } diff --git a/solr/core/src/java/org/apache/solr/schema/LongPointField.java b/solr/core/src/java/org/apache/solr/schema/LongPointField.java index 80f3cf7765e..56319383f69 100644 --- a/solr/core/src/java/org/apache/solr/schema/LongPointField.java +++ b/solr/core/src/java/org/apache/solr/schema/LongPointField.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.legacy.LegacyNumericType; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.valuesource.LongFieldSource; +import org.apache.lucene.queries.function.valuesource.MultiValuedLongFieldSource; import org.apache.lucene.search.Query; import org.apache.lucene.search.SortField; import org.apache.lucene.util.BytesRef; @@ -149,8 +150,7 @@ public class LongPointField extends PointField implements LongValueFieldType { @Override public Type getUninversionType(SchemaField sf) { if (sf.multiValued()) { - throw new UnsupportedOperationException("MultiValued Point fields with DocValues is not currently supported"); -// return Type.SORTED_LONG; + return Type.SORTED_LONG; } else { return Type.LONG_POINT; } @@ -161,6 +161,12 @@ public class LongPointField extends PointField implements LongValueFieldType { field.checkFieldCacheSource(); return new LongFieldSource(field.getName()); } + + @Override + protected ValueSource getSingleValueSource(org.apache.lucene.search.SortedNumericSelector.Type choice, + SchemaField field) { + return new MultiValuedLongFieldSource(field.getName(), choice); + } @Override public LegacyNumericType getNumericType() { diff --git a/solr/core/src/java/org/apache/solr/schema/NumericFieldType.java b/solr/core/src/java/org/apache/solr/schema/NumericFieldType.java index 44066a2c7a7..5801766f7a9 100644 --- a/solr/core/src/java/org/apache/solr/schema/NumericFieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/NumericFieldType.java @@ -17,9 +17,11 @@ package org.apache.solr.schema; import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.util.NumericUtils; import org.apache.solr.common.SolrException; import org.apache.solr.search.FunctionRangeQuery; import org.apache.solr.search.QParser; @@ -49,28 +51,36 @@ public abstract class NumericFieldType extends PrimitiveFieldType { protected Query getDocValuesRangeQuery(QParser parser, SchemaField field, String min, String max, boolean minInclusive, boolean maxInclusive) { - assert field.hasDocValues() && !field.multiValued(); + assert field.hasDocValues() && (field.getType().isPointField() || !field.multiValued()); switch (getNumberType()) { case INTEGER: return numericDocValuesRangeQuery(field.getName(), min == null ? null : (long) Integer.parseInt(min), max == null ? 
null : (long) Integer.parseInt(max), - minInclusive, maxInclusive); + minInclusive, maxInclusive, field.multiValued()); case FLOAT: - return getRangeQueryForFloatDoubleDocValues(field, min, max, minInclusive, maxInclusive); + if (field.multiValued()) { + return getRangeQueryForMultiValuedFloatDocValues(field, min, max, minInclusive, maxInclusive); + } else { + return getRangeQueryForFloatDoubleDocValues(field, min, max, minInclusive, maxInclusive); + } case LONG: return numericDocValuesRangeQuery(field.getName(), min == null ? null : Long.parseLong(min), max == null ? null : Long.parseLong(max), - minInclusive, maxInclusive); + minInclusive, maxInclusive, field.multiValued()); case DOUBLE: - return getRangeQueryForFloatDoubleDocValues(field, min, max, minInclusive, maxInclusive); + if (field.multiValued()) { + return getRangeQueryForMultiValuedDoubleDocValues(field, min, max, minInclusive, maxInclusive); + } else { + return getRangeQueryForFloatDoubleDocValues(field, min, max, minInclusive, maxInclusive); + } case DATE: return numericDocValuesRangeQuery(field.getName(), min == null ? null : DateMathParser.parseMath(null, min).getTime(), max == null ? null : DateMathParser.parseMath(null, max).getTime(), - minInclusive, maxInclusive); + minInclusive, maxInclusive, field.multiValued()); default: throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for numeric field"); } @@ -104,19 +114,32 @@ public abstract class NumericFieldType extends PrimitiveFieldType { if ((minVal == null || minVal.doubleValue() < 0d || minBits == minusZeroBits) && (maxVal != null && (maxVal.doubleValue() < 0d || maxBits == minusZeroBits))) { query = numericDocValuesRangeQuery - (fieldName, maxBits, (min == null ? negativeInfinityBits : minBits), maxInclusive, minInclusive); + (fieldName, maxBits, (min == null ? negativeInfinityBits : minBits), maxInclusive, minInclusive, false); } else { // If both max and min are positive, then issue range query query = numericDocValuesRangeQuery - (fieldName, minBits, (max == null ? positiveInfinityBits : maxBits), minInclusive, maxInclusive); + (fieldName, minBits, (max == null ? positiveInfinityBits : maxBits), minInclusive, maxInclusive, false); } } return query; } + protected Query getRangeQueryForMultiValuedDoubleDocValues(SchemaField sf, String min, String max, boolean minInclusive, boolean maxInclusive) { + Long minBits = min == null ? NumericUtils.doubleToSortableLong(Double.NEGATIVE_INFINITY): NumericUtils.doubleToSortableLong(Double.parseDouble(min)); + Long maxBits = max == null ? NumericUtils.doubleToSortableLong(Double.POSITIVE_INFINITY): NumericUtils.doubleToSortableLong(Double.parseDouble(max)); + return numericDocValuesRangeQuery(sf.getName(), minBits, maxBits, minInclusive, maxInclusive, true); + } + + protected Query getRangeQueryForMultiValuedFloatDocValues(SchemaField sf, String min, String max, boolean minInclusive, boolean maxInclusive) { + Long minBits = (long)(min == null ? NumericUtils.floatToSortableInt(Float.NEGATIVE_INFINITY): NumericUtils.floatToSortableInt(Float.parseFloat(min))); + Long maxBits = (long)(max == null ? 
NumericUtils.floatToSortableInt(Float.POSITIVE_INFINITY): NumericUtils.floatToSortableInt(Float.parseFloat(max))); + return numericDocValuesRangeQuery(sf.getName(), minBits, maxBits, minInclusive, maxInclusive, true); + } + public static Query numericDocValuesRangeQuery( String field, Number lowerValue, Number upperValue, - boolean lowerInclusive, boolean upperInclusive) { + boolean lowerInclusive, boolean upperInclusive, + boolean multiValued) { long actualLowerValue = Long.MIN_VALUE; if (lowerValue != null) { @@ -139,6 +162,11 @@ public abstract class NumericFieldType extends PrimitiveFieldType { --actualUpperValue; } } - return NumericDocValuesField.newRangeQuery(field, actualLowerValue, actualUpperValue); + if (multiValued) { + // In multiValued case use SortedNumericDocValuesField, this won't work for Trie*Fields wince they use BinaryDV in the multiValue case + return SortedNumericDocValuesField.newRangeQuery(field, actualLowerValue, actualUpperValue); + } else { + return NumericDocValuesField.newRangeQuery(field, actualLowerValue, actualUpperValue); + } } } diff --git a/solr/core/src/java/org/apache/solr/schema/PointField.java b/solr/core/src/java/org/apache/solr/schema/PointField.java index 9b1ed380a38..1168386dca5 100644 --- a/solr/core/src/java/org/apache/solr/schema/PointField.java +++ b/solr/core/src/java/org/apache/solr/schema/PointField.java @@ -24,15 +24,17 @@ import java.util.Collections; import java.util.List; import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.IndexableField; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.search.Query; -import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.search.SortedNumericSelector; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.NumericUtils; import org.apache.solr.common.SolrException; import org.apache.solr.response.TextResponseWriter; import org.apache.solr.search.QParser; @@ -75,7 +77,7 @@ public abstract class PointField extends NumericFieldType { // multivalued Point fields all use SortedSetDocValues, so we give a clean error if that's // not supported by the specified choice, else we delegate to a helper - SortedSetSelector.Type selectorType = choice.getSortedSetSelectorType(); + SortedNumericSelector.Type selectorType = choice.getSortedNumericSelectorType(); if (null == selectorType) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, choice.toString() + " is not a supported option for picking a single value" @@ -95,9 +97,7 @@ public abstract class PointField extends NumericFieldType { * @param field the field to use, guaranteed to be multivalued. 
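The numericDocValuesRangeQuery change above routes multiValued fields to SortedNumericDocValuesField.newRangeQuery, with the bounds encoded the same way the values were written. A minimal sketch of that multi-valued path (the field name is made up, and the exclusive-bound handling, which the patch does by nudging the encoded long by one, is left out):

    // Sketch of the doc-values-only range query path for a multiValued double point field.
    import org.apache.lucene.document.SortedNumericDocValuesField;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.NumericUtils;

    public class MultiValuedDocValuesRangeSketch {
      static Query doubleRange(String field, double min, double max) {
        return SortedNumericDocValuesField.newRangeQuery(field,
            NumericUtils.doubleToSortableLong(min),
            NumericUtils.doubleToSortableLong(max));
      }

      public static void main(String[] args) {
        System.out.println(doubleRange("price_pd", 0.0, 10.0));
      }
    }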
* @see #getSingleValueSource(MultiValueSelector,SchemaField,QParser) */ - protected ValueSource getSingleValueSource(SortedSetSelector.Type choice, SchemaField field) { - throw new UnsupportedOperationException("MultiValued Point fields with DocValues is not currently supported"); - } + protected abstract ValueSource getSingleValueSource(SortedNumericSelector.Type choice, SchemaField field); @Override public boolean isTokenized() { @@ -130,7 +130,7 @@ public abstract class PointField extends NumericFieldType { @Override public Query getRangeQuery(QParser parser, SchemaField field, String min, String max, boolean minInclusive, boolean maxInclusive) { - if (!field.indexed() && field.hasDocValues() && !field.multiValued()) { + if (!field.indexed() && field.hasDocValues()) { return getDocValuesRangeQuery(parser, field, min, max, minInclusive, maxInclusive); } else { return getPointRangeQuery(parser, field, min, max, minInclusive, maxInclusive); @@ -203,10 +203,8 @@ public abstract class PointField extends NumericFieldType { fields.add(field); if (sf.hasDocValues()) { - if (sf.multiValued()) { - throw new UnsupportedOperationException("MultiValued Point fields with DocValues is not currently supported. Field: '" + sf.getName() + "'"); - } else { - final long bits; + final long bits; + if (!sf.multiValued()) { if (field.numericValue() instanceof Integer || field.numericValue() instanceof Long) { bits = field.numericValue().longValue(); } else if (field.numericValue() instanceof Float) { @@ -216,8 +214,19 @@ public abstract class PointField extends NumericFieldType { bits = Double.doubleToLongBits(field.numericValue().doubleValue()); } fields.add(new NumericDocValuesField(sf.getName(), bits)); + } else { + // MultiValued + if (field.numericValue() instanceof Integer || field.numericValue() instanceof Long) { + bits = field.numericValue().longValue(); + } else if (field.numericValue() instanceof Float) { + bits = NumericUtils.floatToSortableInt(field.numericValue().floatValue()); + } else { + assert field.numericValue() instanceof Double; + bits = NumericUtils.doubleToSortableLong(field.numericValue().doubleValue()); + } + fields.add(new SortedNumericDocValuesField(sf.getName(), bits)); } - } + } if (sf.stored()) { fields.add(getStoredField(sf, value)); } diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java index 820e1bab932..c6508453637 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -56,6 +56,7 @@ import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.StoredFieldVisitor.Status; @@ -98,6 +99,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.NumericUtils; import org.apache.solr.common.SolrDocumentBase; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; @@ -816,110 +818,136 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable, SolrI log.warn("Couldn't decorate docValues for 
field: [{}], schemaField: [{}]", fieldName, schemaField); continue; } - - if (schemaField.multiValued()) { - final SortedSetDocValues values = leafReader.getSortedSetDocValues(fieldName); - if (values != null && values.getValueCount() > 0) { - if (values.advance(localId) == localId) { - final List outValues = new LinkedList(); - for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) { - final BytesRef value = values.lookupOrd(ord); - outValues.add(schemaField.getType().toObject(schemaField, value)); - } - assert outValues.size() > 0; - doc.addField(fieldName, outValues); + FieldInfo fi = fieldInfos.fieldInfo(fieldName); + if (fi == null) { + continue; // Searcher doesn't have info about this field, hence ignore it. + } + final DocValuesType dvType = fi.getDocValuesType(); + switch (dvType) { + case NUMERIC: + final NumericDocValues ndv = leafReader.getNumericDocValues(fieldName); + if (ndv == null) { + continue; } - } - } else { - FieldInfo fi = fieldInfos.fieldInfo(fieldName); - if (fi == null) { - continue; // Searcher doesn't have info about this field, hence ignore it. - } - final DocValuesType dvType = fi.getDocValuesType(); - switch (dvType) { - case NUMERIC: - final NumericDocValues ndv = leafReader.getNumericDocValues(fieldName); - if (ndv == null) { - continue; - } - Long val; - if (ndv.advanceExact(localId)) { - val = ndv.longValue(); - } else { - continue; - } - Object newVal = val; - if (schemaField.getType().isPointField()) { - NumberType type = schemaField.getType().getNumberType(); - switch (type) { - case INTEGER: - newVal = val.intValue(); - break; - case LONG: - newVal = val.longValue(); - break; - case FLOAT: - newVal = Float.intBitsToFloat(val.intValue()); - break; - case DOUBLE: - newVal = Double.longBitsToDouble(val); - break; - case DATE: - newVal = new Date(val); - break; - default: - throw new AssertionError("Unexpected PointType: " + type); - } - } else { - if (schemaField.getType() instanceof TrieIntField) { + Long val; + if (ndv.advanceExact(localId)) { + val = ndv.longValue(); + } else { + continue; + } + Object newVal = val; + if (schemaField.getType().isPointField()) { + // TODO: Maybe merge PointField with TrieFields here + NumberType type = schemaField.getType().getNumberType(); + switch (type) { + case INTEGER: newVal = val.intValue(); - } else if (schemaField.getType() instanceof TrieFloatField) { + break; + case LONG: + newVal = val.longValue(); + break; + case FLOAT: newVal = Float.intBitsToFloat(val.intValue()); - } else if (schemaField.getType() instanceof TrieDoubleField) { + break; + case DOUBLE: newVal = Double.longBitsToDouble(val); - } else if (schemaField.getType() instanceof TrieDateField) { + break; + case DATE: newVal = new Date(val); - } else if (schemaField.getType() instanceof EnumField) { - newVal = ((EnumField) schemaField.getType()).intValueToStringValue(val.intValue()); - } + break; + default: + throw new AssertionError("Unexpected PointType: " + type); } - doc.addField(fieldName, newVal); - break; - case BINARY: - BinaryDocValues bdv = leafReader.getBinaryDocValues(fieldName); - if (bdv == null) { - continue; + } else { + if (schemaField.getType() instanceof TrieIntField) { + newVal = val.intValue(); + } else if (schemaField.getType() instanceof TrieFloatField) { + newVal = Float.intBitsToFloat(val.intValue()); + } else if (schemaField.getType() instanceof TrieDoubleField) { + newVal = Double.longBitsToDouble(val); + } else if (schemaField.getType() instanceof TrieDateField) { + newVal = new 
Date(val); + } else if (schemaField.getType() instanceof EnumField) { + newVal = ((EnumField) schemaField.getType()).intValueToStringValue(val.intValue()); } - BytesRef value; - if (bdv.advanceExact(localId)) { - value = BytesRef.deepCopyOf(bdv.binaryValue()); + } + doc.addField(fieldName, newVal); + break; + case BINARY: + BinaryDocValues bdv = leafReader.getBinaryDocValues(fieldName); + if (bdv == null) { + continue; + } + BytesRef value; + if (bdv.advanceExact(localId)) { + value = BytesRef.deepCopyOf(bdv.binaryValue()); + } else { + continue; + } + doc.addField(fieldName, value); + break; + case SORTED: + SortedDocValues sdv = leafReader.getSortedDocValues(fieldName); + if (sdv == null) { + continue; + } + if (sdv.advanceExact(localId)) { + final BytesRef bRef = sdv.binaryValue(); + // Special handling for Boolean fields since they're stored as 'T' and 'F'. + if (schemaField.getType() instanceof BoolField) { + doc.addField(fieldName, schemaField.getType().toObject(schemaField, bRef)); } else { - continue; + doc.addField(fieldName, bRef.utf8ToString()); } - doc.addField(fieldName, value); - break; - case SORTED: - SortedDocValues sdv = leafReader.getSortedDocValues(fieldName); - if (sdv == null) { - continue; - } - if (sdv.advanceExact(localId)) { - final BytesRef bRef = sdv.binaryValue(); - // Special handling for Boolean fields since they're stored as 'T' and 'F'. - if (schemaField.getType() instanceof BoolField) { - doc.addField(fieldName, schemaField.getType().toObject(schemaField, bRef)); - } else { - doc.addField(fieldName, bRef.utf8ToString()); + } + break; + case SORTED_NUMERIC: + final SortedNumericDocValues numericDv = leafReader.getSortedNumericDocValues(fieldName); + NumberType type = schemaField.getType().getNumberType(); + if (numericDv != null) { + if (numericDv.advance(localId) == localId) { + final List outValues = new ArrayList(numericDv.docValueCount()); + for (int i = 0; i < numericDv.docValueCount(); i++) { + long number = numericDv.nextValue(); + switch (type) { + case INTEGER: + outValues.add((int)number); + break; + case LONG: + outValues.add(number); + break; + case FLOAT: + outValues.add(NumericUtils.sortableIntToFloat((int)number)); + break; + case DOUBLE: + outValues.add(NumericUtils.sortableLongToDouble(number)); + break; + case DATE: + newVal = new Date(number); + break; + default: + throw new AssertionError("Unexpected PointType: " + type); + } } + assert outValues.size() > 0; + doc.addField(fieldName, outValues); } - break; - case SORTED_NUMERIC: - throw new AssertionError("SORTED_NUMERIC not supported yet!"); - case SORTED_SET: - throw new AssertionError("SORTED_SET fields should be multi-valued!"); - case NONE: - break; - } + } + case SORTED_SET: + final SortedSetDocValues values = leafReader.getSortedSetDocValues(fieldName); + if (values != null && values.getValueCount() > 0) { + if (values.advance(localId) == localId) { + final List outValues = new LinkedList(); + for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) { + value = values.lookupOrd(ord); + outValues.add(schemaField.getType().toObject(schemaField, value)); + } + assert outValues.size() > 0; + doc.addField(fieldName, outValues); + } + } + case NONE: + break; } } } diff --git a/solr/core/src/java/org/apache/solr/store/blockcache/BlockCache.java b/solr/core/src/java/org/apache/solr/store/blockcache/BlockCache.java index 7a5c67c78a2..f00ca1ddaf4 100644 --- a/solr/core/src/java/org/apache/solr/store/blockcache/BlockCache.java +++ 
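Illustrative sketch, not part of the patch: the decorateDocValueFields changes above have to turn the raw long that Lucene returns for each numeric doc value back into a typed Java object. Single-valued NUMERIC doc values hold the IEEE bit pattern directly (Float.intBitsToFloat / Double.longBitsToDouble), while multi-valued SORTED_NUMERIC doc values hold Lucene's sortable encodings and go through NumericUtils. The standalone class below mirrors those two decoding paths; the class and enum names are invented, and only lucene-core is assumed on the classpath.

import java.util.Date;

import org.apache.lucene.util.NumericUtils;

public class DocValuesDecodeSketch {

  /** Stand-in for Solr's NumberType; only the cases used in the patch. */
  enum Num { INTEGER, LONG, FLOAT, DOUBLE, DATE }

  /** Decoding used for single-valued NUMERIC doc values (raw bit patterns). */
  static Object decodeNumeric(long val, Num type) {
    switch (type) {
      case INTEGER: return (int) val;
      case LONG:    return val;
      case FLOAT:   return Float.intBitsToFloat((int) val);
      case DOUBLE:  return Double.longBitsToDouble(val);
      case DATE:    return new Date(val);
      default:      throw new AssertionError("Unexpected type: " + type);
    }
  }

  /** Decoding used for multi-valued SORTED_NUMERIC doc values (sortable encodings). */
  static Object decodeSortedNumeric(long val, Num type) {
    switch (type) {
      case INTEGER: return (int) val;
      case LONG:    return val;
      case FLOAT:   return NumericUtils.sortableIntToFloat((int) val);
      case DOUBLE:  return NumericUtils.sortableLongToDouble(val);
      case DATE:    return new Date(val);
      default:      throw new AssertionError("Unexpected type: " + type);
    }
  }

  public static void main(String[] args) {
    float f = 3.14f;
    long rawBits  = Float.floatToIntBits(f);            // what a NUMERIC value holds
    long sortable = NumericUtils.floatToSortableInt(f); // what a SORTED_NUMERIC value holds
    System.out.println(decodeNumeric(rawBits, Num.FLOAT));        // 3.14
    System.out.println(decodeSortedNumeric(sortable, Num.FLOAT)); // 3.14
  }
}

Keeping the two encodings straight matters: feeding a sortable-encoded value through intBitsToFloat (or vice versa) silently produces the wrong number.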
b/solr/core/src/java/org/apache/solr/store/blockcache/BlockCache.java
@@ -17,6 +17,8 @@ package org.apache.solr.store.blockcache;
 import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.atomic.AtomicInteger;
 import com.github.benmanes.caffeine.cache.Cache;
@@ -38,8 +40,8 @@ public class BlockCache {
   private final int numberOfBlocksPerBank;
   private final int maxEntries;
   private final Metrics metrics;
-  private volatile OnRelease onRelease;
-
+  private final List<OnRelease> onReleases = new CopyOnWriteArrayList<>();
+
   public static interface OnRelease {
     public void release(BlockCacheKey blockCacheKey);
   }
@@ -95,7 +97,7 @@ public class BlockCache {
     location.setRemoved(true);
     locks[bankId].clear(block);
     lockCounters[bankId].decrementAndGet();
-    if (onRelease != null) {
+    for (OnRelease onRelease : onReleases) {
       onRelease.release(blockCacheKey);
     }
     metrics.blockCacheEviction.incrementAndGet();
@@ -239,6 +241,6 @@ public class BlockCache {
   }
   void setOnRelease(OnRelease onRelease) {
-    this.onRelease = onRelease;
+    this.onReleases.add(onRelease);
   }
 }
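The BlockCache change above replaces the single volatile OnRelease hook with a CopyOnWriteArrayList, so every registered listener (not just the last caller of setOnRelease) is notified when a block is evicted. The following self-contained sketch shows the same listener pattern in isolation; it does not construct a real BlockCache, and the class name and key type are simplified stand-ins.

import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;

public class ReleaseListenerSketch {

  /** Simplified stand-in for BlockCache.OnRelease (the real callback receives a BlockCacheKey). */
  interface OnRelease {
    void release(long key);
  }

  // CopyOnWriteArrayList: listeners are added rarely, but iterated on every eviction.
  private final List<OnRelease> onReleases = new CopyOnWriteArrayList<>();

  void setOnRelease(OnRelease onRelease) {
    onReleases.add(onRelease); // keep every hook instead of overwriting the previous one
  }

  void evictBlock(long key) {
    for (OnRelease onRelease : onReleases) {
      onRelease.release(key);
    }
  }

  public static void main(String[] args) {
    ReleaseListenerSketch cache = new ReleaseListenerSketch();
    cache.setOnRelease(k -> System.out.println("metrics listener saw eviction of block " + k));
    cache.setOnRelease(k -> System.out.println("directory listener saw eviction of block " + k));
    cache.evictBlock(42L); // both listeners fire
  }
}

CopyOnWriteArrayList keeps the eviction path lock-free at the cost of copying on each add, which is acceptable here since listeners are only registered during initialization.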
diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
index 5276ca9da30..bc27231b091 100644
--- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
+++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
@@ -23,6 +23,7 @@ import java.util.Map;
 import org.apache.lucene.document.BinaryDocValuesField; // javadocs
 import org.apache.lucene.document.NumericDocValuesField; // javadocs
 import org.apache.lucene.document.SortedDocValuesField; // javadocs
+import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField; // javadocs
 import org.apache.lucene.document.StringField; // javadocs
 import org.apache.lucene.index.BinaryDocValues;
@@ -169,7 +170,35 @@ public class UninvertingReader extends FilterLeafReader {
    * Fields with this type act as if they were indexed with
    * {@link SortedSetDocValuesField}.
    */
-  SORTED_SET_DOUBLE
+  SORTED_SET_DOUBLE,
+  /**
+   * Multi-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.IntPoint})
+   * <p>
+ * Fields with this type act as if they were indexed with + * {@link SortedNumericDocValuesField}. + */ + SORTED_INTEGER, + /** + * Multi-valued Float, (e.g. indexed with {@link org.apache.lucene.document.FloatPoint}) + *
<p>
+ * Fields with this type act as if they were indexed with + * {@link SortedNumericDocValuesField}. + */ + SORTED_FLOAT, + /** + * Multi-valued Long, (e.g. indexed with {@link org.apache.lucene.document.LongPoint}) + *
<p>
+ * Fields with this type act as if they were indexed with + * {@link SortedNumericDocValuesField}. + */ + SORTED_LONG, + /** + * Multi-valued Double, (e.g. indexed with {@link org.apache.lucene.document.DoublePoint}) + *
<p>
+ * Fields with this type act as if they were indexed with + * {@link SortedNumericDocValuesField}. + */ + SORTED_DOUBLE } /** @@ -255,6 +284,12 @@ public class UninvertingReader extends FilterLeafReader { case SORTED_SET_DOUBLE: type = DocValuesType.SORTED_SET; break; + case SORTED_INTEGER: + case SORTED_FLOAT: + case SORTED_LONG: + case SORTED_DOUBLE: + type = DocValuesType.SORTED_NUMERIC; + break; default: throw new AssertionError(); } diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-distrib-interval-faceting.xml b/solr/core/src/test-files/solr/collection1/conf/schema-distrib-interval-faceting.xml index ff73fdcd300..e843e0a607b 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-distrib-interval-faceting.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-distrib-interval-faceting.xml @@ -37,20 +37,20 @@ - + - + - + - + diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml index 673e7dd1968..597f2c30604 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml @@ -34,8 +34,9 @@ - + + @@ -45,16 +46,19 @@ + + + @@ -67,15 +71,18 @@ + - + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-point.xml b/solr/core/src/test-files/solr/collection1/conf/schema-point.xml index 053d39bd306..3561013284e 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-point.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-point.xml @@ -84,6 +84,12 @@ + + + + + + id diff --git a/solr/core/src/test-files/solr/collection1/conf/schema.xml b/solr/core/src/test-files/solr/collection1/conf/schema.xml index aef6c4c2950..ef7fc8df7bf 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema.xml @@ -571,18 +571,17 @@ - - + - + - + - - + @@ -612,14 +611,14 @@ - - + + - + - + @@ -679,10 +678,10 @@ - - - - + + + + @@ -693,21 +692,41 @@ useDocValuesAsStored="true"/> - - - - + + + + + + + + + + + + + + + + + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/schema11.xml b/solr/core/src/test-files/solr/collection1/conf/schema11.xml index 370f32159b3..24129aeed55 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema11.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema11.xml @@ -322,12 +322,12 @@ valued. --> - + - - + + @@ -346,14 +346,14 @@ valued. --> - + - + @@ -382,6 +382,24 @@ valued. 
--> + + + + + + + + + + + + + + + + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 206cd9ebacf..2d0615c10c3 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -458,8 +458,8 @@ - - + + @@ -554,20 +554,20 @@ - - + + - + - + - + diff --git a/solr/core/src/test-files/solr/collection1/conf/schema_latest.xml b/solr/core/src/test-files/solr/collection1/conf/schema_latest.xml index c6491eb4173..e4747d83651 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema_latest.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema_latest.xml @@ -109,7 +109,7 @@ - + - - + + - + - - - - - - + + + + + + - - - + + + - + @@ -228,15 +228,15 @@ - + - + - + - + @@ -396,10 +396,10 @@ - - - - + + + + - + diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java index 0a1c1b27848..1dcd529354e 100644 --- a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java +++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java @@ -2474,4 +2474,8 @@ public abstract class SolrTestCaseJ4 extends LuceneTestCase { protected static void systemClearPropertySolrTestsMergePolicyFactory() { System.clearProperty(SYSTEM_PROPERTY_SOLR_TESTS_MERGEPOLICYFACTORY); } + + protected T pickRandom(T... options) { + return options[random().nextInt(options.length)]; + } } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index 81e1f224ef2..7eb9b0d2e1e 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -457,7 +457,7 @@ public class MiniSolrCloudCluster { } } finally { executor.shutdown(); - executor.awaitTermination(2, TimeUnit.SECONDS); + executor.awaitTermination(15, TimeUnit.SECONDS); try { if (!externalZkServer) { zkServer.shutdown();
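SolrTestCaseJ4 above gains a small pickRandom(T...) helper for choosing one of several equivalent options per test run. A hypothetical use in a numeric-field test might look like the sketch below; the config, schema and field names are placeholders, and the Solr test framework is assumed on the classpath.

import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;

public class PickRandomUsageSketch extends SolrTestCaseJ4 {

  @BeforeClass
  public static void beforeTests() throws Exception {
    initCore("solrconfig.xml", "schema.xml"); // placeholder config/schema names
  }

  @Test
  public void testRandomNumericField() throws Exception {
    // Each run exercises one of several equivalent fields, broadening coverage over time.
    String field = pickRandom("number_i", "number_l", "number_f", "number_d");
    assertU(adoc("id", "1", field, "42"));
    assertU(commit());
    assertQ(req("q", field + ":42"), "//result[@numFound='1']");
  }
}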