From 325cbf00185e3fe2576720c83635cbed8ed12c0e Mon Sep 17 00:00:00 2001
From: Noble Paul
Date: Tue, 14 Feb 2017 17:32:14 -0800
Subject: [PATCH] Revert "Merge remote-tracking branch 'origin/master'"
This reverts commit 26298f35df118aad186e6eaf1ea5c2e5f50d607f, reversing
changes made to 65c6c576b720b19029a10bf14f81d4de23302863.
undoing the merge commit
---
lucene/CHANGES.txt | 10 -
.../pattern/SimplePatternSplitTokenizer.java | 258 -------
.../SimplePatternSplitTokenizerFactory.java | 76 ---
.../pattern/SimplePatternTokenizer.java | 242 -------
.../SimplePatternTokenizerFactory.java | 76 ---
...ache.lucene.analysis.util.TokenizerFactory | 2 -
.../analysis/core/TestRandomChains.java | 10 +-
.../TestSimplePatternSplitTokenizer.java | 273 --------
.../pattern/TestSimplePatternTokenizer.java | 218 ------
lucene/common-build.xml | 2 +-
.../apache/lucene/analysis/package-info.java | 12 +-
.../util/automaton/ByteRunAutomaton.java | 4 +-
.../util/automaton/CharacterRunAutomaton.java | 2 +-
.../lucene/util/automaton/Operations.java | 50 +-
.../lucene/util/automaton/RunAutomaton.java | 128 ++--
lucene/demo/ivy.xml | 2 +-
.../lucene/search/TermAutomatonScorer.java | 2 +-
lucene/test-framework/ivy.xml | 2 +-
.../index/BasePointsFormatTestCase.java | 41 +-
lucene/tools/ivy.xml | 2 +-
.../GetMavenDependenciesTask.java | 58 +-
solr/CHANGES.txt | 21 +-
solr/contrib/extraction/ivy.xml | 2 +-
.../java/org/apache/solr/cloud/Overseer.java | 2 +-
.../component/RealTimeGetComponent.java | 7 +-
.../component/SortedNumericStatsValues.java | 106 ---
.../solr/handler/component/StatsField.java | 2 +-
.../handler/component/StatsValuesFactory.java | 7 +-
.../apache/solr/request/IntervalFacets.java | 77 +--
.../apache/solr/request/NumericFacets.java | 173 +----
.../org/apache/solr/request/SimpleFacets.java | 8 +-
.../apache/solr/schema/DoublePointField.java | 13 +-
.../org/apache/solr/schema/FieldType.java | 2 +-
.../apache/solr/schema/FloatPointField.java | 16 +-
.../org/apache/solr/schema/IntPointField.java | 10 +-
.../apache/solr/schema/LongPointField.java | 10 +-
.../apache/solr/schema/NumericFieldType.java | 48 +-
.../org/apache/solr/schema/PointField.java | 31 +-
.../apache/solr/search/SolrIndexSearcher.java | 222 +++---
.../solr/store/blockcache/BlockCache.java | 10 +-
.../solr/uninverting/UninvertingReader.java | 37 +-
.../conf/schema-distrib-interval-faceting.xml | 8 +-
.../conf/schema-docValuesFaceting.xml | 11 +-
.../solr/collection1/conf/schema-point.xml | 6 -
.../solr/collection1/conf/schema.xml | 55 +-
.../solr/collection1/conf/schema11.xml | 28 +-
.../solr/collection1/conf/schema12.xml | 14 +-
.../solr/collection1/conf/schema_latest.xml | 50 +-
.../solrconfig-update-processor-chains.xml | 2 -
.../apache/solr/TestDistributedSearch.java | 8 +-
.../org/apache/solr/TestGroupingSearch.java | 1 -
.../org/apache/solr/TestRandomDVFaceting.java | 26 +-
.../cloud/SegmentTerminateEarlyTestState.java | 4 +-
.../apache/solr/cloud/TestSegmentSorting.java | 2 -
.../handler/component/StatsComponentTest.java | 7 +-
.../handler/component/TermsComponentTest.java | 2 -
.../apache/solr/request/TestFacetMethods.java | 11 +-
.../solr/request/TestIntervalFaceting.java | 12 +-
.../apache/solr/schema/TestPointFields.java | 631 +++++-------------
.../solr/search/TestSolrQueryParser.java | 2 +-
.../update/processor/AtomicUpdatesTest.java | 64 +-
solr/test-framework/ivy.xml | 2 +-
.../java/org/apache/solr/SolrTestCaseJ4.java | 4 -
.../solr/cloud/MiniSolrCloudCluster.java | 2 +-
64 files changed, 591 insertions(+), 2635 deletions(-)
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
delete mode 100644 solr/core/src/java/org/apache/solr/handler/component/SortedNumericStatsValues.java
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e4042dad5e1..f9c464b4388 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -107,11 +107,6 @@ New Features
SortedNumericSelector.Type can give a ValueSource view of a
SortedNumericDocValues field. (Tomás Fernández Löbbe)
-* LUCENE-7465: Add SimplePatternTokenizer and
- SimplePatternSplitTokenizer, using Lucene's regexp/automaton
- implementation for analysis/tokenization (Clinton Gormley, Mike
- McCandless)
-
Bug Fixes
* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
@@ -176,11 +171,6 @@ Build
Jars are not downloaded; compilation is not updated; and Clover is not enabled.
(Steve Rowe)
-* LUCENE-7694: Update forbiddenapis to version 2.3. (Uwe Schindler)
-
-* LUCENE-7693: Replace "org.apache." logic in GetMavenDependenciesTask.
- (Daniel Collins, Christine Poerschke)
-
Other
* LUCENE-7666: Fix typos in lucene-join package info javadoc.
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
deleted file mode 100644
index d2b10c1a0d2..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.pattern;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.RegExp;
-
-/**
- * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
- * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. This is just
- * like {@link SimplePatternTokenizer} except that the pattern should make valid token separator characters, like
- * {@code String.split}. Empty string tokens are never produced.
- *
- * @lucene.experimental
- */
-
-public final class SimplePatternSplitTokenizer extends Tokenizer {
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- private final CharacterRunAutomaton runDFA;
-
- // TODO: this is copied from SimplePatternTokenizer, but there are subtle differences e.g. we track sepUpto and tokenUpto;
- // find a clean way to share it:
-
- // TODO: we could likely use a single rolling buffer instead of two separate char buffers here. We could also use PushBackReader but I
- // suspect it's slowish:
-
- private char[] pendingChars = new char[8];
- private int tokenUpto;
- private int pendingLimit;
- private int pendingUpto;
- private int offset;
- private int sepUpto;
- private final char[] buffer = new char[1024];
- private int bufferLimit;
- private int bufferNextRead;
-
- /** See {@link RegExp} for the accepted syntax. */
- public SimplePatternSplitTokenizer(String regexp) {
- this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- }
-
- /** Runs a pre-built automaton. */
- public SimplePatternSplitTokenizer(Automaton dfa) {
- this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
- }
-
- /** See {@link RegExp} for the accepted syntax. */
- public SimplePatternSplitTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
- this(factory, new RegExp(regexp).toAutomaton());
- }
-
- /** Runs a pre-built automaton. */
- public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
- super(factory);
-
- // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
- // realizing this ctor is otherwise trappy
- if (dfa.isDeterministic() == false) {
- throw new IllegalArgumentException("please determinize the incoming automaton first");
- }
-
- runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- }
-
- private void fillToken(int offsetStart) {
- termAtt.setLength(tokenUpto);
- offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+tokenUpto));
- }
-
- @Override
- public boolean incrementToken() throws IOException {
-
- int offsetStart = offset;
-
- clearAttributes();
-
- tokenUpto = 0;
-
- while (true) {
- sepUpto = 0;
-
- // The runDFA operates in Unicode space, not UTF16 (java's char):
- int ch = nextCodePoint();
- if (ch == -1) {
- if (tokenUpto > 0) {
- fillToken(offsetStart);
- return true;
- } else {
- return false;
- }
- }
- int state = runDFA.step(0, ch);
-
- if (state != -1) {
- // a token separator just possibly started; keep scanning to see if the token is accepted:
- int lastAcceptLength = -1;
- do {
-
- if (runDFA.isAccept(state)) {
- // record that the token separator matches here, but keep scanning in case a longer match also works (greedy):
- lastAcceptLength = sepUpto;
- }
-
- ch = nextCodePoint();
- if (ch == -1) {
- break;
- }
- state = runDFA.step(state, ch);
- } while (state != -1);
-
- if (lastAcceptLength != -1) {
- // strip the trailing separator we just matched from the token:
- tokenUpto -= lastAcceptLength;
- // we found a token separator
- int extra = sepUpto - lastAcceptLength;
- if (extra != 0) {
- pushBack(extra);
- }
- if (tokenUpto > 0) {
- fillToken(offsetStart);
- return true;
- } else {
- // we matched one token separator immediately after another
- offsetStart = offset;
- }
- } else if (ch == -1) {
- if (tokenUpto > 0) {
- fillToken(offsetStart);
- return true;
- } else {
- return false;
- }
- } else {
- // false alarm: there was no token separator here; push back all but the first character we scanned
- pushBack(sepUpto-1);
- }
- }
- }
- }
-
- @Override
- public void end() throws IOException {
- super.end();
- final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
- offsetAtt.setOffset(ofs, ofs);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- offset = 0;
- pendingUpto = 0;
- pendingLimit = 0;
- sepUpto = 0;
- bufferNextRead = 0;
- bufferLimit = 0;
- }
-
- /** Pushes back the last {@code count} characters in current token's buffer. */
- private void pushBack(int count) {
- tokenUpto -= count;
- assert tokenUpto >= 0;
- if (pendingLimit == 0) {
- if (bufferNextRead >= count) {
- // optimize common case when the chars we are pushing back are still in the buffer
- bufferNextRead -= count;
- } else {
- if (count > pendingChars.length) {
- pendingChars = ArrayUtil.grow(pendingChars, count);
- }
- System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
- pendingLimit = count;
- }
- } else {
- // we are pushing back what is already in our pending buffer
- pendingUpto -= count;
- assert pendingUpto >= 0;
- }
- offset -= count;
- }
-
- private void appendToToken(char ch) {
- char[] buffer = termAtt.buffer();
- if (tokenUpto == buffer.length) {
- buffer = termAtt.resizeBuffer(tokenUpto + 1);
- }
- buffer[tokenUpto++] = ch;
- sepUpto++;
- }
-
- private int nextCodeUnit() throws IOException {
- int result;
- if (pendingUpto < pendingLimit) {
- result = pendingChars[pendingUpto++];
- if (pendingUpto == pendingLimit) {
- // We used up the pending buffer
- pendingUpto = 0;
- pendingLimit = 0;
- }
- appendToToken((char) result);
- offset++;
- } else if (bufferLimit == -1) {
- return -1;
- } else {
- assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
- if (bufferNextRead == bufferLimit) {
- bufferLimit = input.read(buffer, 0, buffer.length);
- if (bufferLimit == -1) {
- return -1;
- }
- bufferNextRead = 0;
- }
- result = buffer[bufferNextRead++];
- offset++;
- appendToToken((char) result);
- }
- return result;
- }
-
- private int nextCodePoint() throws IOException {
-
- int ch = nextCodeUnit();
- if (ch == -1) {
- return ch;
- }
- if (Character.isHighSurrogate((char) ch)) {
- return Character.toCodePoint((char) ch, (char) nextCodeUnit());
- } else {
- return ch;
- }
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java
deleted file mode 100644
index 4af6286c901..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.pattern;
-
-import java.util.Map;
-
-import org.apache.lucene.analysis.util.TokenizerFactory;
-import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.RegExp;
-
-/**
- * Factory for {@link SimplePatternSplitTokenizer}, for producing tokens by splitting according to the provided regexp.
- *
- * <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
- * for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the
- * tokenization is quite a bit faster. It takes two arguments:
- *
- * <ul>
- * <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
- * <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determined automaton computed from the regexp</li>
- * </ul>
- * <p>
- * The pattern matches the characters that should split tokens, like {@code String.split}, and the
- * matching is greedy such that the longest token separator matching at a given point is matched. Empty
- * tokens are never created.
- *
- * <p>For example, to match tokens delimited by simple whitespace characters:
- *
- *
- *
- * @lucene.experimental
- *
- * @see SimplePatternSplitTokenizer
- */
-public class SimplePatternSplitTokenizerFactory extends TokenizerFactory {
- public static final String PATTERN = "pattern";
- private final Automaton dfa;
- private final int maxDeterminizedStates;
-
- /** Creates a new SimplePatternSplitTokenizerFactory */
- public SimplePatternSplitTokenizerFactory(Map<String,String> args) {
- super(args);
- maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
- if (args.isEmpty() == false) {
- throw new IllegalArgumentException("Unknown parameters: " + args);
- }
- }
-
- @Override
- public SimplePatternSplitTokenizer create(final AttributeFactory factory) {
- return new SimplePatternSplitTokenizer(factory, dfa);
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
deleted file mode 100644
index 867b10a9d23..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.pattern;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.RegExp;
-
-/**
- * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
- * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. The provided
- * regex should match valid token characters (not token separator characters, like {@code String.split}). The matching is greedy:
- * the longest match at a given start point will be the next token. Empty string tokens are never produced.
- *
- * @lucene.experimental
- */
-
-// TODO: the matcher here is naive and does have N^2 adversarial cases that are unlikely to arise in practice, e.g. if the pattern is
-// aaaaaaaaaab and the input is aaaaaaaaaaa, the work we do here is N^2 where N is the number of a's. This is because on failing to match
-// a token, we skip one character forward and try again. A better approach would be to compile something like this regexp
- // instead: .* | <pattern>, because that automaton would not "forget" all the as it had already seen, and would be a single pass
-// through the input. I think this is the same thing as Aho/Corasick's algorithm (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm).
-// But we cannot implement this (I think?) until/unless Lucene regexps support sub-group capture, so we could know
-// which specific characters the pattern matched. SynonymFilter has this same limitation.
-
-public final class SimplePatternTokenizer extends Tokenizer {
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- private final CharacterRunAutomaton runDFA;
-
- // TODO: we could likely use a single rolling buffer instead of two separate char buffers here. We could also use PushBackReader but I
- // suspect it's slowish:
-
- private char[] pendingChars = new char[8];
- private int pendingLimit;
- private int pendingUpto;
- private int offset;
- private int tokenUpto;
- private final char[] buffer = new char[1024];
- private int bufferLimit;
- private int bufferNextRead;
-
- /** See {@link RegExp} for the accepted syntax. */
- public SimplePatternTokenizer(String regexp) {
- this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- }
-
- /** Runs a pre-built automaton. */
- public SimplePatternTokenizer(Automaton dfa) {
- this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
- }
-
- /** See {@link RegExp} for the accepted syntax. */
- public SimplePatternTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
- this(factory, new RegExp(regexp).toAutomaton());
- }
-
- /** Runs a pre-built automaton. */
- public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
- super(factory);
-
- // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
- // realizing this ctor is otherwise trappy
- if (dfa.isDeterministic() == false) {
- throw new IllegalArgumentException("please determinize the incoming automaton first");
- }
-
- runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
-
- clearAttributes();
- tokenUpto = 0;
-
- while (true) {
-
- int offsetStart = offset;
-
- // The runDFA operates in Unicode space, not UTF16 (java's char):
-
- int ch = nextCodePoint();
- if (ch == -1) {
- return false;
- }
-
- int state = runDFA.step(0, ch);
-
- if (state != -1) {
- // a token just possibly started; keep scanning to see if the token is accepted:
- int lastAcceptLength = -1;
- do {
-
- if (runDFA.isAccept(state)) {
- // record that the token matches here, but keep scanning in case a longer match also works (greedy):
- lastAcceptLength = tokenUpto;
- }
-
- ch = nextCodePoint();
- if (ch == -1) {
- break;
- }
- state = runDFA.step(state, ch);
- } while (state != -1);
-
- if (lastAcceptLength != -1) {
- // we found a token
- int extra = tokenUpto - lastAcceptLength;
- if (extra != 0) {
- pushBack(extra);
- }
- termAtt.setLength(lastAcceptLength);
- offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+lastAcceptLength));
- return true;
- } else if (ch == -1) {
- return false;
- } else {
- // false alarm: there was no token here; push back all but the first character we scanned
- pushBack(tokenUpto-1);
- tokenUpto = 0;
- }
- } else {
- tokenUpto = 0;
- }
- }
- }
-
- @Override
- public void end() throws IOException {
- super.end();
- final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
- offsetAtt.setOffset(ofs, ofs);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- offset = 0;
- pendingUpto = 0;
- pendingLimit = 0;
- tokenUpto = 0;
- bufferNextRead = 0;
- bufferLimit = 0;
- }
-
- /** Pushes back the last {@code count} characters in current token's buffer. */
- private void pushBack(int count) {
-
- if (pendingLimit == 0) {
- if (bufferNextRead >= count) {
- // optimize common case when the chars we are pushing back are still in the buffer
- bufferNextRead -= count;
- } else {
- if (count > pendingChars.length) {
- pendingChars = ArrayUtil.grow(pendingChars, count);
- }
- System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
- pendingLimit = count;
- }
- } else {
- // we are pushing back what is already in our pending buffer
- pendingUpto -= count;
- assert pendingUpto >= 0;
- }
- offset -= count;
- }
-
- private void appendToToken(char ch) {
- char[] buffer = termAtt.buffer();
- if (tokenUpto == buffer.length) {
- buffer = termAtt.resizeBuffer(tokenUpto + 1);
- }
- buffer[tokenUpto++] = ch;
- }
-
- private int nextCodeUnit() throws IOException {
- int result;
- if (pendingUpto < pendingLimit) {
- result = pendingChars[pendingUpto++];
- if (pendingUpto == pendingLimit) {
- // We used up the pending buffer
- pendingUpto = 0;
- pendingLimit = 0;
- }
- appendToToken((char) result);
- offset++;
- } else if (bufferLimit == -1) {
- return -1;
- } else {
- assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
- if (bufferNextRead == bufferLimit) {
- bufferLimit = input.read(buffer, 0, buffer.length);
- if (bufferLimit == -1) {
- return -1;
- }
- bufferNextRead = 0;
- }
- result = buffer[bufferNextRead++];
- offset++;
- appendToToken((char) result);
- }
- return result;
- }
-
- private int nextCodePoint() throws IOException {
-
- int ch = nextCodeUnit();
- if (ch == -1) {
- return ch;
- }
- if (Character.isHighSurrogate((char) ch)) {
- return Character.toCodePoint((char) ch, (char) nextCodeUnit());
- } else {
- return ch;
- }
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java
deleted file mode 100644
index 3e74d023b3a..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.pattern;
-
-import java.util.Map;
-
-import org.apache.lucene.analysis.util.TokenizerFactory;
-import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.RegExp;
-
-/**
- * Factory for {@link SimplePatternTokenizer}, for matching tokens based on the provided regexp.
- *
- * <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
- * for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the
- * tokenization is quite a bit faster. It takes two arguments:
- *
- * <ul>
- * <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
- * <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determined automaton computed from the regexp</li>
- * </ul>
- * <p>
- * The pattern matches the characters to include in a token (not the split characters), and the
- * matching is greedy such that the longest token matching at a given point is created. Empty
- * tokens are never created.
- *
- * <p>For example, to match tokens delimited by simple whitespace characters:
- *
- *
- *
- * @lucene.experimental
- *
- * @see SimplePatternTokenizer
- */
-public class SimplePatternTokenizerFactory extends TokenizerFactory {
- public static final String PATTERN = "pattern";
- private final Automaton dfa;
- private final int maxDeterminizedStates;
-
- /** Creates a new SimplePatternTokenizerFactory */
- public SimplePatternTokenizerFactory(Map<String,String> args) {
- super(args);
- maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
- if (args.isEmpty() == false) {
- throw new IllegalArgumentException("Unknown parameters: " + args);
- }
- }
-
- @Override
- public SimplePatternTokenizer create(final AttributeFactory factory) {
- return new SimplePatternTokenizer(factory, dfa);
- }
-}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
index 4b37eb868ea..be0b7d4082a 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
@@ -21,8 +21,6 @@ org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
org.apache.lucene.analysis.ngram.NGramTokenizerFactory
org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory
org.apache.lucene.analysis.pattern.PatternTokenizerFactory
-org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizerFactory
-org.apache.lucene.analysis.pattern.SimplePatternTokenizerFactory
org.apache.lucene.analysis.standard.ClassicTokenizerFactory
org.apache.lucene.analysis.standard.StandardTokenizerFactory
org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 3a58bdd4991..8953f9f2192 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -96,11 +96,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.RegExp;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.tartarus.snowball.SnowballProgram;
@@ -498,9 +494,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
if (random.nextBoolean()) return null;
return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
});
- put(Automaton.class, random -> {
- return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- });
}};
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
@@ -510,8 +503,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
allowedTokenizerArgs.add(Reader.class);
allowedTokenizerArgs.add(AttributeFactory.class);
allowedTokenizerArgs.add(AttributeSource.class);
- allowedTokenizerArgs.add(Automaton.class);
-
+
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenFilterArgs.addAll(argProducers.keySet());
allowedTokenFilterArgs.add(TokenStream.class);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
deleted file mode 100644
index 5642c2b68e4..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.pattern;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.charfilter.MappingCharFilter;
-import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
-
-public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
-
- public void testGreedy() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("(foo)+");
- t.setReader(new StringReader("bar foofoo baz"));
- assertTokenStreamContents(t,
- new String[] {"bar ", " baz"},
- new int[] {0, 10},
- new int[] {4, 14});
- }
-
- public void testBackToBack() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("foo");
- t.setReader(new StringReader("bar foofoo baz"));
- assertTokenStreamContents(t,
- new String[] {"bar ", " baz"},
- new int[] {0, 10},
- new int[] {4, 14});
- }
-
- public void testBigLookahead() throws Exception {
- StringBuilder b = new StringBuilder();
- for(int i=0;i<100;i++) {
- b.append('a');
- }
- b.append('b');
- Tokenizer t = new SimplePatternSplitTokenizer(b.toString());
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
-
- b = new StringBuilder();
- for(int i=0;i<200;i++) {
- b.append('a');
- }
- t.setReader(new StringReader(b.toString()));
- t.reset();
- assertTrue(t.incrementToken());
- assertEquals(b.toString(), termAtt.toString());
- assertFalse(t.incrementToken());
- }
-
- public void testNoTokens() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer(".*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- String s;
- while (true) {
- s = TestUtil.randomUnicodeString(random());
- if (s.length() > 0) {
- break;
- }
- }
- t.setReader(new StringReader(s));
- t.reset();
- assertFalse(t.incrementToken());
- }
-
- public void testEmptyStringPatternNoMatch() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("a*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- t.setReader(new StringReader("bbb"));
- t.reset();
- assertTrue(t.incrementToken());
- assertEquals("bbb", termAtt.toString());
- assertFalse(t.incrementToken());
- }
-
- public void testSplitSingleCharWhitespace() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- t.setReader(new StringReader("a \tb c"));
- assertTokenStreamContents(t,
- new String[] {"a", "b", "c"},
- new int[] {0, 3, 7},
- new int[] {1, 4, 8});
- }
-
- public void testSplitMultiCharWhitespace() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- t.setReader(new StringReader("a \tb c"));
- assertTokenStreamContents(t,
- new String[] {"a", "b", "c"},
- new int[] {0, 3, 7},
- new int[] {1, 4, 8});
- }
-
- public void testLeadingNonToken() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- t.setReader(new StringReader(" a c"));
- assertTokenStreamContents(t,
- new String[] {"a", "c"},
- new int[] {4, 6},
- new int[] {5, 7});
- }
-
- public void testTrailingNonToken() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- t.setReader(new StringReader("a c "));
- assertTokenStreamContents(t,
- new String[] {"a", "c"},
- new int[] {0, 2},
- new int[] {1, 3});
- }
-
- public void testEmptyStringPatternOneMatch() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("a*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- t.setReader(new StringReader("bbab"));
- assertTokenStreamContents(t,
- new String[] {"bb", "b"},
- new int[] {0, 3},
- new int[] {2, 4});
- }
-
- public void testEndOffset() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("a+");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
- t.setReader(new StringReader("aaabbb"));
- t.reset();
- assertTrue(t.incrementToken());
- assertEquals("bbb", termAtt.toString());
- assertFalse(t.incrementToken());
- t.end();
- assertEquals(6, offsetAtt.endOffset());
- }
-
- public void testFixedToken() throws Exception {
- Tokenizer t = new SimplePatternSplitTokenizer("aaaa");
-
- t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
- assertTokenStreamContents(t,
- new String[] {"aaa"},
- new int[] {12},
- new int[] {15});
- }
-
- public void testBasic() throws Exception
- {
- String[][] tests = {
- // pattern input output
- { "--", "aaa--bbb--ccc", "aaa bbb ccc" },
- { ":", "aaa:bbb:ccc", "aaa bbb ccc" },
- { ":", "boo:and:foo", "boo and foo" },
- { "o", "boo:and:foo", "b :and:f" },
- };
-
- for(String[] test : tests) {
- TokenStream stream = new SimplePatternSplitTokenizer(test[0]);
- ((Tokenizer)stream).setReader(new StringReader(test[1]));
- String out = tsToString(stream);
- assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out);
- }
- }
-
- public void testNotDeterminized() throws Exception {
- Automaton a = new Automaton();
- int start = a.createState();
- int mid1 = a.createState();
- int mid2 = a.createState();
- int end = a.createState();
- a.setAccept(end, true);
- a.addTransition(start, mid1, 'a', 'z');
- a.addTransition(start, mid2, 'a', 'z');
- a.addTransition(mid1, end, 'b');
- a.addTransition(mid2, end, 'b');
- expectThrows(IllegalArgumentException.class, () -> {new SimplePatternSplitTokenizer(a);});
- }
-
- public void testOffsetCorrection() throws Exception {
- final String INPUT = "Günther Günther is here";
-
- // create MappingCharFilter
- List<String> mappingRules = new ArrayList<>();
- mappingRules.add( "\"&uuml;\" => \"ü\"" );
- NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
- builder.add("&uuml;", "ü");
- NormalizeCharMap normMap = builder.build();
- CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));
-
- // create SimplePatternSplitTokenizer
- Tokenizer stream = new SimplePatternSplitTokenizer("Günther");
- stream.setReader(charStream);
- assertTokenStreamContents(stream,
- new String[] { " ", " is here" },
- new int[] { 12, 25 },
- new int[] { 13, 33 },
- INPUT.length());
- }
-
- /**
- * TODO: rewrite tests not to use string comparison.
- */
- private static String tsToString(TokenStream in) throws IOException {
- StringBuilder out = new StringBuilder();
- CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
- // extra safety to enforce, that the state is not preserved and also
- // assign bogus values
- in.clearAttributes();
- termAtt.setEmpty().append("bogusTerm");
- in.reset();
- while (in.incrementToken()) {
- if (out.length() > 0) {
- out.append(' ');
- }
- out.append(termAtt.toString());
- in.clearAttributes();
- termAtt.setEmpty().append("bogusTerm");
- }
-
- in.close();
- return out.toString();
- }
-
- /** blast some random strings through the analyzer */
- public void testRandomStrings() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new SimplePatternSplitTokenizer("a");
- return new TokenStreamComponents(tokenizer);
- }
- };
- checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
- a.close();
-
- Analyzer b = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new SimplePatternSplitTokenizer("a");
- return new TokenStreamComponents(tokenizer);
- }
- };
- checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
- b.close();
- }
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
deleted file mode 100644
index b566713312b..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.pattern;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.charfilter.MappingCharFilter;
-import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
-
-public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
-
- public void testGreedy() throws Exception {
- Tokenizer t = new SimplePatternTokenizer("(foo)+");
- t.setReader(new StringReader("bar foofoo baz"));
- assertTokenStreamContents(t,
- new String[] {"foofoo"},
- new int[] {4},
- new int[] {10});
- }
-
- public void testBigLookahead() throws Exception {
- StringBuilder b = new StringBuilder();
- for(int i=0;i<100;i++) {
- b.append('a');
- }
- b.append('b');
- Tokenizer t = new SimplePatternTokenizer(b.toString());
-
- b = new StringBuilder();
- for(int i=0;i<200;i++) {
- b.append('a');
- }
- t.setReader(new StringReader(b.toString()));
- t.reset();
- assertFalse(t.incrementToken());
- }
-
- public void testOneToken() throws Exception {
- Tokenizer t = new SimplePatternTokenizer(".*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- String s;
- while (true) {
- s = TestUtil.randomUnicodeString(random());
- if (s.length() > 0) {
- break;
- }
- }
- t.setReader(new StringReader(s));
- t.reset();
- assertTrue(t.incrementToken());
- assertEquals(s, termAtt.toString());
- }
-
- public void testEmptyStringPatternNoMatch() throws Exception {
- Tokenizer t = new SimplePatternTokenizer("a*");
- t.setReader(new StringReader("bbb"));
- t.reset();
- assertFalse(t.incrementToken());
- }
-
- public void testEmptyStringPatternOneMatch() throws Exception {
- Tokenizer t = new SimplePatternTokenizer("a*");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- t.setReader(new StringReader("bbab"));
- t.reset();
- assertTrue(t.incrementToken());
- assertEquals("a", termAtt.toString());
- assertFalse(t.incrementToken());
- }
-
- public void testEndOffset() throws Exception {
- Tokenizer t = new SimplePatternTokenizer("a+");
- CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
- OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
- t.setReader(new StringReader("aaabbb"));
- t.reset();
- assertTrue(t.incrementToken());
- assertEquals("aaa", termAtt.toString());
- assertFalse(t.incrementToken());
- t.end();
- assertEquals(6, offsetAtt.endOffset());
- }
-
- public void testFixedToken() throws Exception {
- Tokenizer t = new SimplePatternTokenizer("aaaa");
-
- t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
- assertTokenStreamContents(t,
- new String[] {"aaaa", "aaaa", "aaaa"},
- new int[] {0, 4, 8},
- new int[] {4, 8, 12});
- }
-
- public void testBasic() throws Exception {
- String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
- String[][] tests = {
- // pattern input output
- { ":", "boo:and:foo", ": :" },
- { qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
- };
-
- for(String[] test : tests) {
- TokenStream stream = new SimplePatternTokenizer(test[0]);
- ((Tokenizer)stream).setReader(new StringReader(test[1]));
- String out = tsToString(stream);
-
- assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out);
- }
- }
-
- public void testNotDeterminized() throws Exception {
- Automaton a = new Automaton();
- int start = a.createState();
- int mid1 = a.createState();
- int mid2 = a.createState();
- int end = a.createState();
- a.setAccept(end, true);
- a.addTransition(start, mid1, 'a', 'z');
- a.addTransition(start, mid2, 'a', 'z');
- a.addTransition(mid1, end, 'b');
- a.addTransition(mid2, end, 'b');
- expectThrows(IllegalArgumentException.class, () -> {new SimplePatternTokenizer(a);});
- }
-
- public void testOffsetCorrection() throws Exception {
- final String INPUT = "Günther Günther is here";
-
- // create MappingCharFilter
- List<String> mappingRules = new ArrayList<>();
- mappingRules.add( "\"&uuml;\" => \"ü\"" );
- NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
- builder.add("&uuml;", "ü");
- NormalizeCharMap normMap = builder.build();
- CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));
-
- // create SimplePatternTokenizer
- Tokenizer stream = new SimplePatternTokenizer("Günther");
- stream.setReader(charStream);
- assertTokenStreamContents(stream,
- new String[] { "Günther", "Günther" },
- new int[] { 0, 13 },
- new int[] { 12, 25 },
- INPUT.length());
- }
-
- /**
- * TODO: rewrite tests not to use string comparison.
- */
- private static String tsToString(TokenStream in) throws IOException {
- StringBuilder out = new StringBuilder();
- CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
- // extra safety to enforce, that the state is not preserved and also
- // assign bogus values
- in.clearAttributes();
- termAtt.setEmpty().append("bogusTerm");
- in.reset();
- while (in.incrementToken()) {
- if (out.length() > 0) {
- out.append(' ');
- }
- out.append(termAtt.toString());
- in.clearAttributes();
- termAtt.setEmpty().append("bogusTerm");
- }
-
- in.close();
- return out.toString();
- }
-
- /** blast some random strings through the analyzer */
- public void testRandomStrings() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new SimplePatternTokenizer("a");
- return new TokenStreamComponents(tokenizer);
- }
- };
- checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
- a.close();
-
- Analyzer b = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new SimplePatternTokenizer("a");
- return new TokenStreamComponents(tokenizer);
- }
- };
- checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
- b.close();
- }
-}
diff --git a/lucene/common-build.xml b/lucene/common-build.xml
index 7d64bc237b6..2a988eb9aaa 100644
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@@ -2348,7 +2348,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
-
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
index a536f73fc16..81858df198d 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
@@ -362,13 +362,11 @@
*
*
* <li>Inhibiting phrase and proximity matches in sentence boundaries – for this, a tokenizer that
- * identifies a new sentence can add 1 to the position increment of the first token of the new sentence.
- * <li>Injecting synonyms – synonyms of a token should be created at the same position as the
- * original token, and the output order of the original token and the injected synonym is undefined
- * as long as they both leave from the same position. As result, all synonyms of a token would be
- * considered to appear in exactly the same position as that token, and so would they be seen by
- * phrase and proximity searches. For multi-token synonyms to work correctly, you should use
- * {@code SynonymGraphFilter} at search time only.
+ * identifies a new sentence can add 1 to the position increment of the first token of the new sentence.
+ * <li>Injecting synonyms – here, synonyms of a token should be added after that token,
+ * and their position increment should be set to 0.
+ * As result, all synonyms of a token would be considered to appear in exactly the
+ * same position as that token, and so would they be seen by phrase and proximity searches.
*
*
* <h3>Token Position Length</h3>
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
index abd5109e655..ca14bc6dd3a 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
@@ -27,9 +27,9 @@ public class ByteRunAutomaton extends RunAutomaton {
this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
- /** expert: if isBinary is true, the input is already byte-based */
+ /** expert: if utf8 is true, the input is already byte-based */
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
- super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, maxDeterminizedStates);
+ super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
}
/**
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
index 1a9c1c92680..70ff9aa21f6 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
@@ -36,7 +36,7 @@ public class CharacterRunAutomaton extends RunAutomaton {
* it then a TooComplexToDeterminizeException is thrown.
*/
public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) {
- super(a, Character.MAX_CODE_POINT+1, maxDeterminizedStates);
+ super(a, Character.MAX_CODE_POINT, false, maxDeterminizedStates);
}
/**
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
index b673a82e974..718a9089ce2 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
@@ -29,17 +29,6 @@
package org.apache.lucene.util.automaton;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.BitSet;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@@ -47,6 +36,17 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
/**
* Automata operations.
*
@@ -335,7 +335,7 @@ final public class Operations {
Transition[][] transitions2 = a2.getSortedTransitions();
Automaton c = new Automaton();
c.createState();
- ArrayDeque<StatePair> worklist = new ArrayDeque<>();
+ LinkedList<StatePair> worklist = new LinkedList<>();
HashMap<StatePair,StatePair> newstates = new HashMap<>();
StatePair p = new StatePair(0, 0, 0);
worklist.add(p);
@@ -435,7 +435,7 @@ final public class Operations {
// TODO: cutover to iterators instead
Transition[][] transitions1 = a1.getSortedTransitions();
Transition[][] transitions2 = a2.getSortedTransitions();
- ArrayDeque<StatePair> worklist = new ArrayDeque<>();
+ LinkedList<StatePair> worklist = new LinkedList<>();
HashSet<StatePair> visited = new HashSet<>();
StatePair p = new StatePair(0, 0);
worklist.add(p);
@@ -682,7 +682,7 @@ final public class Operations {
// Create state 0:
b.createState();
- ArrayDeque<SortedIntSet.FrozenIntSet> worklist = new ArrayDeque<>();
+ LinkedList<SortedIntSet.FrozenIntSet> worklist = new LinkedList<>();
Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>();
worklist.add(initialset);
@@ -804,7 +804,7 @@ final public class Operations {
return false;
}
- ArrayDeque<Integer> workList = new ArrayDeque<>();
+ LinkedList<Integer> workList = new LinkedList<>();
BitSet seen = new BitSet(a.getNumStates());
workList.add(0);
seen.set(0);
@@ -907,7 +907,7 @@ final public class Operations {
if (numStates == 0) {
return live;
}
- ArrayDeque<Integer> workList = new ArrayDeque<>();
+ LinkedList<Integer> workList = new LinkedList<>();
live.set(0);
workList.add(0);
@@ -946,7 +946,7 @@ final public class Operations {
}
Automaton a2 = builder.finish();
- ArrayDeque<Integer> workList = new ArrayDeque<>();
+ LinkedList<Integer> workList = new LinkedList<>();
BitSet live = new BitSet(numStates);
BitSet acceptBits = a.getAcceptStates();
int s = 0;
@@ -1010,6 +1010,22 @@ final public class Operations {
return result;
}
+ /**
+ * Finds the largest entry whose value is less than or equal to c, or 0 if
+ * there is no such entry.
+ */
+ static int findIndex(int c, int[] points) {
+ int a = 0;
+ int b = points.length;
+ while (b - a > 1) {
+ int d = (a + b) >>> 1;
+ if (points[d] > c) b = d;
+ else if (points[d] < c) a = d;
+ else return d;
+ }
+ return a;
+ }
+
/**
* Returns true if the language of this automaton is finite. The
* automaton must not have any dead states.
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
index 4f539260450..1d640954e13 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
@@ -38,62 +38,13 @@ import java.util.Arrays;
*/
public abstract class RunAutomaton {
final Automaton automaton;
- final int alphabetSize;
+ final int maxInterval;
final int size;
final boolean[] accept;
final int[] transitions; // delta(state,c) = transitions[state*points.length +
// getCharClass(c)]
final int[] points; // char interval start points
- final int[] classmap; // map from char number to class
-
- /**
- * Constructs a new RunAutomaton from a deterministic
- * Automaton.
- *
- * @param a an automaton
- */
- protected RunAutomaton(Automaton a, int alphabetSize) {
- this(a, alphabetSize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- }
-
- /**
- * Constructs a new RunAutomaton from a deterministic
- * Automaton.
- *
- * @param a an automaton
- * @param maxDeterminizedStates maximum number of states that can be created
- * while determinizing a
- */
- protected RunAutomaton(Automaton a, int alphabetSize, int maxDeterminizedStates) {
- this.alphabetSize = alphabetSize;
- a = Operations.determinize(a, maxDeterminizedStates);
- this.automaton = a;
- points = a.getStartPoints();
- size = Math.max(1,a.getNumStates());
- accept = new boolean[size];
- transitions = new int[size * points.length];
- Arrays.fill(transitions, -1);
- for (int n=0;n<size;n++) {
- while (b - a > 1) {
- int d = (a + b) >>> 1;
- if (points[d] > c) b = d;
- else if (points[d] < c) a = d;
- else return d;
- }
- return a;
+ return Operations.findIndex(c, points);
}
+ /**
+ * Constructs a new RunAutomaton from a deterministic
+ * Automaton.
+ *
+ * @param a an automaton
+ */
+ public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
+ this(a, maxInterval, tableize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+ }
+
+ /**
+ * Constructs a new RunAutomaton from a deterministic
+ * Automaton.
+ *
+ * @param a an automaton
+ * @param maxDeterminizedStates maximum number of states that can be created
+ * while determinizing a
+ */
+ public RunAutomaton(Automaton a, int maxInterval, boolean tableize,
+ int maxDeterminizedStates) {
+ this.maxInterval = maxInterval;
+ a = Operations.determinize(a, maxDeterminizedStates);
+ this.automaton = a;
+ points = a.getStartPoints();
+ size = Math.max(1,a.getNumStates());
+ accept = new boolean[size];
+ transitions = new int[size * points.length];
+ Arrays.fill(transitions, -1);
+ for (int n=0;n<size;n++) {
- if (c >= classmap.length) {
+ if (classmap == null) {
return transitions[state * points.length + getCharClass(c)];
} else {
return transitions[state * points.length + classmap[c]];
@@ -185,7 +179,7 @@ public abstract class RunAutomaton {
public int hashCode() {
final int prime = 31;
int result = 1;
- result = prime * result + alphabetSize;
+ result = prime * result + maxInterval;
result = prime * result + points.length;
result = prime * result + size;
return result;
@@ -197,7 +191,7 @@ public abstract class RunAutomaton {
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
RunAutomaton other = (RunAutomaton) obj;
- if (alphabetSize != other.alphabetSize) return false;
+ if (maxInterval != other.maxInterval) return false;
if (size != other.size) return false;
if (!Arrays.equals(points, other.points)) return false;
if (!Arrays.equals(accept, other.accept)) return false;
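The reverted step() prefers a precomputed classmap and only falls back to the binary search when none was built. The standalone sketch below illustrates that table-building idea with hypothetical inputs; it is not the patch's constructor body, and it assumes points is sorted ascending and starts at 0:

    public class ClassmapSketch {
      // For a small alphabet, precompute every code point's char class once so
      // step() becomes a plain array lookup instead of a binary search.
      static int[] buildClassmap(int[] points, int maxInterval) {
        int[] classmap = new int[maxInterval];
        int i = 0;
        for (int j = 0; j < classmap.length; j++) {
          if (i + 1 < points.length && j == points[i + 1]) {
            i++;               // crossed into the next interval
          }
          classmap[j] = i;     // char class of code point j
        }
        return classmap;
      }

      public static void main(String[] args) {
        int[] points = {0, 'a', 'z' + 1};            // hypothetical interval start points
        int[] classmap = buildClassmap(points, 128);
        System.out.println(classmap['m']);           // 1
        System.out.println(classmap['~']);           // 2
      }
    }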
diff --git a/lucene/demo/ivy.xml b/lucene/demo/ivy.xml
index 5dd7e74fba5..050f0a58883 100644
--- a/lucene/demo/ivy.xml
+++ b/lucene/demo/ivy.xml
@@ -17,7 +17,7 @@
under the License.
-->
-
+
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java
index 776971286e2..0a1755c1675 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java
@@ -367,7 +367,7 @@ class TermAutomatonScorer extends Scorer {
static class TermRunAutomaton extends RunAutomaton {
public TermRunAutomaton(Automaton a, int termCount) {
- super(a, termCount);
+ super(a, termCount, true);
}
}
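With the revert, RunAutomaton subclasses supply the extra tableize flag, as TermRunAutomaton does above. A minimal sketch of such a subclass (the class name is illustrative, not from the patch):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.RunAutomaton;

    // tableize=true asks for the precomputed classmap, since the "alphabet"
    // here is just the number of terms.
    class TermCountRunAutomaton extends RunAutomaton {
      TermCountRunAutomaton(Automaton a, int termCount) {
        super(a, termCount, true);
      }
    }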
diff --git a/lucene/test-framework/ivy.xml b/lucene/test-framework/ivy.xml
index a51716c8698..a71c25a51ec 100644
--- a/lucene/test-framework/ivy.xml
+++ b/lucene/test-framework/ivy.xml
@@ -17,7 +17,7 @@
under the License.
-->
-
+
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java
index ca68d2e3610..4cd6534d64e 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java
@@ -40,7 +40,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.NumericUtils;
-import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
@@ -233,7 +232,16 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
dir.setRandomIOExceptionRateOnOpen(0.05);
verify(dir, docValues, null, numDims, numBytesPerDim, true);
} catch (IllegalStateException ise) {
- done = handlePossiblyFakeException(ise);
+ if (ise.getMessage().contains("this writer hit an unrecoverable error")) {
+ Throwable cause = ise.getCause();
+ if (cause != null && cause.getMessage().contains("a random IOException")) {
+ done = true;
+ } else {
+ throw ise;
+ }
+ } else {
+ throw ise;
+ }
} catch (AssertionError ae) {
if (ae.getMessage() != null && ae.getMessage().contains("does not exist; files=")) {
// OK: likely we threw the random IOExc when IW was asserting the commit files exist
@@ -245,28 +253,23 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
// This just means we got a too-small maxMB for the maxPointsInLeafNode; just retry w/ more heap
assertTrue(iae.getMessage().contains("either increase maxMBSortInHeap or decrease maxPointsInLeafNode"));
} catch (IOException ioe) {
- done = handlePossiblyFakeException(ioe);
+ Throwable ex = ioe;
+ while (ex != null) {
+ String message = ex.getMessage();
+ if (message != null && (message.contains("a random IOException") || message.contains("background merge hit exception"))) {
+ done = true;
+ break;
+ }
+ ex = ex.getCause();
+ }
+ if (done == false) {
+ throw ioe;
+ }
}
}
}
}
- // TODO: merge w/ BaseIndexFileFormatTestCase.handleFakeIOException
- private boolean handlePossiblyFakeException(Exception e) {
- Throwable ex = e;
- while (ex != null) {
- String message = ex.getMessage();
- if (message != null && (message.contains("a random IOException") || message.contains("background merge hit exception"))) {
- return true;
- }
- ex = ex.getCause();
- }
- Rethrow.rethrow(e);
-
- // dead code yet javac disagrees:
- return false;
- }
-
public void testMultiValued() throws Exception {
int numBytesPerDim = TestUtil.nextInt(random(), 2, PointValues.MAX_NUM_BYTES);
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
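The inlined loop above walks the exception's cause chain looking for the injected fake-IOException message. Restated as a standalone helper for readability (the method name is illustrative, not part of the test class):

    // Treat an exception as "expected" only if some cause carries the message
    // planted by the random IOException injection.
    static boolean causedByFakeIOException(Throwable t) {
      for (Throwable ex = t; ex != null; ex = ex.getCause()) {
        String message = ex.getMessage();
        if (message != null
            && (message.contains("a random IOException")
                || message.contains("background merge hit exception"))) {
          return true;
        }
      }
      return false;
    }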
diff --git a/lucene/tools/ivy.xml b/lucene/tools/ivy.xml
index 1fa2974f85b..614aa8eb45b 100644
--- a/lucene/tools/ivy.xml
+++ b/lucene/tools/ivy.xml
@@ -17,7 +17,7 @@
under the License.
-->
-
+
diff --git a/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java b/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java
index 5b2f0b80b00..45a9d1126ab 100644
--- a/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java
+++ b/lucene/tools/src/java/org/apache/lucene/dependencies/GetMavenDependenciesTask.java
@@ -54,7 +54,6 @@ import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
-import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -119,7 +118,6 @@ public class GetMavenDependenciesTask extends Task {
private final DocumentBuilder documentBuilder;
private File ivyCacheDir;
private Pattern internalJarPattern;
- private Map<String,String> ivyModuleInfo;
/**
@@ -191,8 +189,6 @@ public class GetMavenDependenciesTask extends Task {
internalJarPattern = Pattern.compile(".*(lucene|solr)([^/]*?)-"
+ Pattern.quote(getProject().getProperty("version")) + "\\.jar");
- ivyModuleInfo = getIvyModuleInfo(ivyXmlResources, documentBuilder, xpath);
-
setInternalDependencyProperties(); // side-effect: all modules' internal deps are recorded
setExternalDependencyProperties(); // side-effect: all modules' external deps are recorded
setGrandparentDependencyManagementProperty(); // uses deps recorded in above two methods
@@ -223,57 +219,11 @@ public class GetMavenDependenciesTask extends Task {
}
}
- /**
- * Visits all ivy.xml files and collects module and organisation attributes into a map.
- */
- private static Map<String,String> getIvyModuleInfo(Resources ivyXmlResources,
- DocumentBuilder documentBuilder, XPath xpath) {
- Map<String,String> ivyInfoModuleToOrganisation = new HashMap<>();
- traverseIvyXmlResources(ivyXmlResources, new Consumer<File>() {
- @Override
- public void accept(File f) {
- try {
- Document document = documentBuilder.parse(f);
- {
- String infoPath = "/ivy-module/info";
- NodeList infos = (NodeList)xpath.evaluate(infoPath, document, XPathConstants.NODESET);
- for (int infoNum = 0 ; infoNum < infos.getLength() ; ++infoNum) {
- Element infoElement = (Element)infos.item(infoNum);
- String infoOrg = infoElement.getAttribute("organisation");
- String infoOrgSuffix = infoOrg.substring(infoOrg.lastIndexOf('.')+1);
- String infoModule = infoElement.getAttribute("module");
- String module = infoOrgSuffix+"-"+infoModule;
- ivyInfoModuleToOrganisation.put(module, infoOrg);
- }
- }
- } catch (XPathExpressionException | IOException | SAXException e) {
- throw new RuntimeException(e);
- }
- }
- });
- return ivyInfoModuleToOrganisation;
- }
-
/**
* Collects external dependencies from each ivy.xml file and sets
* external dependency properties to be inserted into modules' POMs.
*/
private void setExternalDependencyProperties() {
- traverseIvyXmlResources(ivyXmlResources, new Consumer<File>() {
- @Override
- public void accept(File f) {
- try {
- collectExternalDependenciesFromIvyXmlFile(f);
- } catch (XPathExpressionException | IOException | SAXException e) {
- throw new RuntimeException(e);
- }
- }
- });
- addSharedExternalDependencies();
- setExternalDependencyXmlProperties();
- }
-
- private static void traverseIvyXmlResources(Resources ivyXmlResources, Consumer<File> ivyXmlFileConsumer) {
@SuppressWarnings("unchecked")
Iterator<Resource> iter = (Iterator<Resource>)ivyXmlResources.iterator();
while (iter.hasNext()) {
@@ -288,13 +238,15 @@ public class GetMavenDependenciesTask extends Task {
File ivyXmlFile = ((FileResource)resource).getFile();
try {
- ivyXmlFileConsumer.accept(ivyXmlFile);
+ collectExternalDependenciesFromIvyXmlFile(ivyXmlFile);
} catch (BuildException e) {
throw e;
} catch (Exception e) {
throw new BuildException("Exception reading file " + ivyXmlFile.getPath() + ": " + e, e);
}
}
+ addSharedExternalDependencies();
+ setExternalDependencyXmlProperties();
}
/**
@@ -444,7 +396,7 @@ public class GetMavenDependenciesTask extends Task {
}
}
}
- String groupId = ivyModuleInfo.get(artifactId);
+ String groupId = "org.apache." + artifactId.substring(0, artifactId.indexOf('-'));
appendDependencyXml(builder, groupId, artifactId, " ", "${project.version}", false, false, null, exclusions);
}
}
@@ -629,7 +581,7 @@ public class GetMavenDependenciesTask extends Task {
continue; // skip external (/(test-)lib/), and non-jar and unwanted (self) internal deps
}
String artifactId = dependencyToArtifactId(newPropertyKey, dependency);
- String groupId = ivyModuleInfo.get(artifactId);
+ String groupId = "org.apache." + artifactId.substring(0, artifactId.indexOf('-'));
String coordinate = groupId + ':' + artifactId;
sortedDeps.add(coordinate);
}
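The reverted groupId derivation simply prefixes the product name taken from the artifactId with "org.apache.". With illustrative inputs:

    // e.g. "lucene-analyzers-common" -> "org.apache.lucene",
    //      "solr-core"               -> "org.apache.solr"
    String artifactId = "lucene-analyzers-common";
    String groupId = "org.apache." + artifactId.substring(0, artifactId.indexOf('-'));
    // groupId == "org.apache.lucene"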
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 6cd52911487..4a8766edc27 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -74,12 +74,20 @@ Optimizations
* SOLR-9584: Support Solr being proxied at an endpoint other than the default /solr, by using relative links
in AdminUI javascripts (Yun Jie Zhou via janhoy)
+* SOLR-9996: Unstored IntPointField returns Long type (Ishan Chattopadhyaya)
+
* SOLR-5944: In-place updates of Numeric DocValues. To leverage this, the _version_ field and the updated
field must both be stored=false, indexed=false, docValues=true. (Ishan Chattopadhyaya, hossman, noble,
shalin, yonik)
Other Changes
----------------------
+* SOLR-8396: Add support for PointFields in Solr (Ishan Chattopadhyaya, Tomás Fernández Löbbe)
+
+* SOLR-10011: Refactor PointField & TrieField to now have a common base class, NumericFieldType. The
+ TrieField.TrieTypes and PointField.PointTypes are now consolidated to NumericFieldType.NumberType. This
+ refactoring also fixes a bug whereby PointFields were not using DocValues for range queries for
+ indexed=false, docValues=true fields. (Ishan Chattopadhyaya, Tomás Fernández Löbbe)
================== 6.5.0 ==================
@@ -131,10 +139,6 @@ New Features
* SOLR-9903: Stop interrupting the update executor on shutdown; interrupting it can cause graceful shutdowns to put replicas into Leader
Initiated Recovery, among other undesirable things. (Mark Miller)
-* SOLR-8396: Add support for PointFields in Solr (Ishan Chattopadhyaya, Tomás Fernández Löbbe)
-
-* SOLR-9987: Add support for MultiValued DocValues in PointFields using SortedNumericDocValues (Tomás Fernández Löbbe)
-
Bug Fixes
----------------------
@@ -157,8 +161,6 @@ Bug Fixes
* SOLR-10063: CoreContainer shutdown has race condition that can cause a hang on shutdown. (Mark Miller)
-* SOLR-10104: BlockDirectoryCache release hooks do not work with multiple directories. (Mike Drob, Mark Miller)
-
Optimizations
----------------------
@@ -195,13 +197,6 @@ Other Changes
* SOLR-10072: The test TestSelectiveWeightCreation appears to be unreliable. (Michael Nilsson via Mark Miller)
-* SOLR-9996: Unstored IntPointField returns Long type (Ishan Chattopadhyaya)
-
-* SOLR-10011: Refactor PointField & TrieField to now have a common base class, NumericFieldType. The
- TrieField.TrieTypes and PointField.PointTypes are now consolidated to NumericFieldType.NumberType. This
- refactoring also fixes a bug whereby PointFields were not using DocValues for range queries for
- indexed=false, docValues=true fields. (Ishan Chattopadhyaya, Tomás Fernández Löbbe)
-
================== 6.4.1 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
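The SOLR-5944 entry above describes in-place updates of numeric DocValues fields declared stored=false, indexed=false, docValues=true. A hedged SolrJ sketch of the kind of atomic update that can then be applied in place; the collection, field name and URL are illustrative only:

    import java.util.Collections;
    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.common.SolrInputDocument;

    public class InPlaceUpdateSketch {
      public static void main(String[] args) throws Exception {
        try (SolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr").build()) {
          SolrInputDocument doc = new SolrInputDocument();
          doc.addField("id", "doc1");
          // "popularity" is assumed to be a numeric field with stored=false,
          // indexed=false, docValues=true, making the increment an in-place candidate.
          doc.addField("popularity", Collections.singletonMap("inc", 1));
          client.add("techproducts", doc);
          client.commit("techproducts");
        }
      }
    }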
diff --git a/solr/contrib/extraction/ivy.xml b/solr/contrib/extraction/ivy.xml
index 42cee8af805..5cf19a17f36 100644
--- a/solr/contrib/extraction/ivy.xml
+++ b/solr/contrib/extraction/ivy.xml
@@ -17,7 +17,7 @@
under the License.
-->
-
+
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index 3a8aa3edfca..0b74ccba180 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -490,6 +490,7 @@ public class Overseer implements Closeable {
this.zkController = zkController;
this.stats = new Stats();
this.config = config;
+ assert ObjectReleaseTracker.track(this);
}
public synchronized void start(String id) {
@@ -520,7 +521,6 @@ public class Overseer implements Closeable {
updaterThread.start();
ccThread.start();
arfoThread.start();
- assert ObjectReleaseTracker.track(this);
}
public Stats getStats() {
diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
index 123abeacc33..4be643e3bfe 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
@@ -688,12 +688,7 @@ public class RealTimeGetComponent extends SearchComponent
if (sf != null && sf.multiValued()) {
List