From 4888adf08a70c25b4d931a2bdfdcfe3979ba3936 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 1 Jun 2018 17:12:21 -0400 Subject: [PATCH 01/38] SOLR-12337: Remove the obsolete QueryWrapperFilter --- solr/CHANGES.txt | 4 + .../field/AbstractAnalyticsFieldTest.java | 41 ++- .../handler/component/ExpandComponent.java | 27 +- .../transform/ChildDocTransformerFactory.java | 11 +- .../apache/solr/schema/CurrencyFieldType.java | 18 +- .../solr/search/QueryWrapperFilter.java | 106 -------- .../solr/search/TermsQParserPlugin.java | 22 +- .../solr/search/TestQueryWrapperFilter.java | 241 ------------------ 8 files changed, 57 insertions(+), 413 deletions(-) delete mode 100644 solr/core/src/java/org/apache/solr/search/QueryWrapperFilter.java delete mode 100644 solr/core/src/test/org/apache/solr/search/TestQueryWrapperFilter.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index c1f821b05fb..80f21a539ab 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -321,6 +321,10 @@ Optimizations * SOLR-12366: A slow "live docs" implementation was being used instead of a bitset. Affects classic faceting enum method, JSON Facets enum method, UnInvertedField faceting, GraphTermsQParser, JoinQParser. (David Smiley) +* SOLR-12337: Remove the obsolete QueryWrapperFilter intermediate wrapper, which also removed some needless uses of + SolrConstantScoreQuery as well. QWF since v5.4.0 sometimes needlessly internally executed and cached the query. + Affects ExpandComponent, ChildDocTransformer, CurrencyFieldType, TermsQParser. (David Smiley) + Other Changes ---------------------- diff --git a/solr/contrib/analytics/src/test/org/apache/solr/analytics/function/field/AbstractAnalyticsFieldTest.java b/solr/contrib/analytics/src/test/org/apache/solr/analytics/function/field/AbstractAnalyticsFieldTest.java index 299d21e8c10..ae4b88152e9 100644 --- a/solr/contrib/analytics/src/test/org/apache/solr/analytics/function/field/AbstractAnalyticsFieldTest.java +++ b/solr/contrib/analytics/src/test/org/apache/solr/analytics/function/field/AbstractAnalyticsFieldTest.java @@ -27,14 +27,10 @@ import java.util.Set; import java.util.function.Predicate; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.util.Bits; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.analytics.ExpressionFactory; import org.apache.solr.schema.IndexSchema; -import org.apache.solr.search.Filter; -import org.apache.solr.search.QueryWrapperFilter; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.util.RefCounted; import org.junit.AfterClass; @@ -266,31 +262,24 @@ public class AbstractAnalyticsFieldTest extends SolrTestCaseJ4 { protected Set collectFieldValues(AnalyticsField testField, Predicate valuesFiller) throws IOException { StringField idField = new StringField("id"); - Filter filter = new QueryWrapperFilter(new MatchAllDocsQuery()); Set missing = new HashSet<>(); List contexts = searcher.getTopReaderContext().leaves(); - for (int leafNum = 0; leafNum < contexts.size(); leafNum++) { - LeafReaderContext context = contexts.get(leafNum); - DocIdSet dis = filter.getDocIdSet(context, null); // solr docsets already exclude any deleted docs - if (dis == null) { - continue; - } - DocIdSetIterator disi = dis.iterator(); - if (disi != null) { - testField.doSetNextReader(context); - idField.doSetNextReader(context); - int doc = disi.nextDoc(); - while( doc 
!= DocIdSetIterator.NO_MORE_DOCS){ - // Add a document to the statistics being generated - testField.collect(doc); - idField.collect(doc); + for (LeafReaderContext context : contexts) { + testField.doSetNextReader(context); + idField.doSetNextReader(context); + Bits liveDocs = context.reader().getLiveDocs(); + for (int doc = 0; doc < context.reader().maxDoc(); doc++) { + if (liveDocs != null && !liveDocs.get(doc)) { + continue; + } + // Add a document to the statistics being generated + testField.collect(doc); + idField.collect(doc); - String id = idField.getString(); - if (!valuesFiller.test(id)) { - missing.add(id); - } - doc = disi.nextDoc(); + String id = idField.getString(); + if (!valuesFiller.test(id)) { + missing.add(id); } } } diff --git a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java index f6e29e4e796..82a62d56d3e 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java @@ -24,6 +24,15 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import com.carrotsearch.hppc.IntHashSet; +import com.carrotsearch.hppc.IntObjectHashMap; +import com.carrotsearch.hppc.LongHashSet; +import com.carrotsearch.hppc.LongObjectHashMap; +import com.carrotsearch.hppc.LongObjectMap; +import com.carrotsearch.hppc.cursors.IntObjectCursor; +import com.carrotsearch.hppc.cursors.LongCursor; +import com.carrotsearch.hppc.cursors.LongObjectCursor; +import com.carrotsearch.hppc.cursors.ObjectCursor; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; @@ -73,24 +82,12 @@ import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocList; import org.apache.solr.search.DocSlice; import org.apache.solr.search.QParser; -import org.apache.solr.search.QueryWrapperFilter; -import org.apache.solr.search.SolrConstantScoreQuery; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SortSpecParsing; import org.apache.solr.uninverting.UninvertingReader; import org.apache.solr.util.plugin.PluginInfoInitialized; import org.apache.solr.util.plugin.SolrCoreAware; -import com.carrotsearch.hppc.IntHashSet; -import com.carrotsearch.hppc.IntObjectHashMap; -import com.carrotsearch.hppc.LongHashSet; -import com.carrotsearch.hppc.LongObjectHashMap; -import com.carrotsearch.hppc.LongObjectMap; -import com.carrotsearch.hppc.cursors.IntObjectCursor; -import com.carrotsearch.hppc.cursors.LongCursor; -import com.carrotsearch.hppc.cursors.LongObjectCursor; -import com.carrotsearch.hppc.cursors.ObjectCursor; - /** * The ExpandComponent is designed to work with the CollapsingPostFilter. * The CollapsingPostFilter collapses a result set on a field. 
@@ -705,7 +702,7 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia bytesRefs[++index] = term.toBytesRef(); } - return new SolrConstantScoreQuery(new QueryWrapperFilter(new TermInSetQuery(fname, bytesRefs))); + return new TermInSetQuery(fname, bytesRefs); } private Query getPointGroupQuery(SchemaField sf, @@ -720,7 +717,7 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia values.add(numericToString(ft, cursor.value)); } - return new SolrConstantScoreQuery(new QueryWrapperFilter(sf.getType().getSetQuery(null, sf, values))); + return sf.getType().getSetQuery(null, sf, values); } private String numericToString(FieldType fieldType, long val) { @@ -750,7 +747,7 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia IntObjectCursor cursor = it.next(); bytesRefs[++index] = cursor.value; } - return new SolrConstantScoreQuery(new QueryWrapperFilter(new TermInSetQuery(fname, bytesRefs))); + return new TermInSetQuery(fname, bytesRefs); } diff --git a/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java b/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java index 2d28d91ce76..a414dc9edd7 100644 --- a/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java +++ b/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java @@ -27,8 +27,8 @@ import org.apache.lucene.search.join.BitSetProducer; import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.search.join.ToChildBlockJoinQuery; import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.params.SolrParams; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.DocsStreamer; @@ -38,10 +38,9 @@ import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocList; import org.apache.solr.search.QParser; -import org.apache.solr.search.QueryWrapperFilter; -import org.apache.solr.search.SyntaxError; import org.apache.solr.search.SolrDocumentFetcher; import org.apache.solr.search.SolrReturnFields; +import org.apache.solr.search.SyntaxError; /** * @@ -81,7 +80,11 @@ public class ChildDocTransformerFactory extends TransformerFactory { BitSetProducer parentsFilter = null; try { Query parentFilterQuery = QParser.getParser( parentFilter, req).getQuery(); - parentsFilter = new QueryBitSetProducer(new QueryWrapperFilter(parentFilterQuery)); + //TODO shouldn't we try to use the Solr filter cache, and then ideally implement + // BitSetProducer over that? 
+ // DocSet parentDocSet = req.getSearcher().getDocSet(parentFilterQuery); + // then return BitSetProducer with custom BitSet impl accessing the docSet + parentsFilter = new QueryBitSetProducer(parentFilterQuery); } catch (SyntaxError syntaxError) { throw new SolrException( ErrorCode.BAD_REQUEST, "Failed to create correct parent filter query" ); } diff --git a/solr/core/src/java/org/apache/solr/schema/CurrencyFieldType.java b/solr/core/src/java/org/apache/solr/schema/CurrencyFieldType.java index 97195da243a..481db59c6a7 100644 --- a/solr/core/src/java/org/apache/solr/schema/CurrencyFieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/CurrencyFieldType.java @@ -33,16 +33,14 @@ import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SortField; -import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.response.TextResponseWriter; -import org.apache.solr.search.Filter; import org.apache.solr.search.QParser; -import org.apache.solr.search.QueryWrapperFilter; -import org.apache.solr.search.SolrConstantScoreQuery; import org.apache.solr.search.function.ValueSourceRangeFilter; import org.apache.solr.uninverting.UninvertingReader.Type; import org.slf4j.Logger; @@ -337,17 +335,15 @@ public class CurrencyFieldType extends FieldType implements SchemaAware, Resourc (p2 != null) ? p2.getCurrencyCode() : defaultCurrency; // ValueSourceRangeFilter doesn't check exists(), so we have to - final Filter docsWithValues = new QueryWrapperFilter(new DocValuesFieldExistsQuery(getAmountField(field).getName())); - final Filter vsRangeFilter = new ValueSourceRangeFilter + final Query docsWithValues = new DocValuesFieldExistsQuery(getAmountField(field).getName()); + final Query vsRangeFilter = new ValueSourceRangeFilter (new RawCurrencyValueSource(field, currencyCode, parser), p1 == null ? null : p1.getAmount() + "", p2 == null ? null : p2.getAmount() + "", minInclusive, maxInclusive); - final BooleanQuery.Builder docsInRange = new BooleanQuery.Builder(); - docsInRange.add(docsWithValues, Occur.FILTER); - docsInRange.add(vsRangeFilter, Occur.FILTER); - - return new SolrConstantScoreQuery(new QueryWrapperFilter(docsInRange.build())); + return new ConstantScoreQuery(new BooleanQuery.Builder() + .add(docsWithValues, Occur.FILTER) + .add(vsRangeFilter, Occur.FILTER).build()); } @Override diff --git a/solr/core/src/java/org/apache/solr/search/QueryWrapperFilter.java b/solr/core/src/java/org/apache/solr/search/QueryWrapperFilter.java deleted file mode 100644 index 1d9de70b405..00000000000 --- a/solr/core/src/java/org/apache/solr/search/QueryWrapperFilter.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.search; - -import java.io.IOException; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.BoostQuery; -import org.apache.lucene.search.ConstantScoreQuery; -import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; -import org.apache.lucene.util.Bits; - -/** - * Constrains search results to only match those which also match a provided - * query. - * - *

This could be used, for example, with a {@link org.apache.solr.legacy.LegacyNumericRangeQuery} on a suitably - * formatted date field to implement date filtering. One could re-use a single - * CachingWrapperFilter(QueryWrapperFilter) that matches, e.g., only documents modified - * within the last week. This would only need to be reconstructed once per day. - */ -public class QueryWrapperFilter extends Filter { - private final Query query; - - /** Constructs a filter which only matches documents matching - * query. - */ - public QueryWrapperFilter(Query query) { - if (query == null) - throw new NullPointerException("Query may not be null"); - this.query = query; - } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - return new BoostQuery(new ConstantScoreQuery(query), 0f); - } - - /** returns the inner Query */ - public final Query getQuery() { - return query; - } - - @Override - public DocIdSet getDocIdSet(final LeafReaderContext context, final Bits acceptDocs) throws IOException { - // get a private context that is used to rewrite, createWeight and score eventually - final LeafReaderContext privateContext = context.reader().getContext(); - final IndexSearcher searcher = new IndexSearcher(privateContext); - final Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1); - - DocIdSet set = new DocIdSet() { - @Override - public DocIdSetIterator iterator() throws IOException { - Scorer scorer = weight.scorer(privateContext); - return scorer == null ? null : scorer.iterator(); - } - - @Override - public long ramBytesUsed() { - return 0L; - } - }; - return BitsFilteredDocIdSet.wrap(set, acceptDocs); - } - - @Override - public String toString(String field) { - return "QueryWrapperFilter(" + query.toString(field) + ")"; - } - - @Override - public boolean equals(Object o) { - return sameClassAs(o) && - equalsTo(getClass().cast(o)); - } - - private boolean equalsTo(QueryWrapperFilter other) { - return query.equals(other.query); - } - - @Override - public int hashCode() { - return query.hashCode(); - } -} diff --git a/solr/core/src/java/org/apache/solr/search/TermsQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/TermsQParserPlugin.java index c4073539cc4..45bb13fc310 100644 --- a/solr/core/src/java/org/apache/solr/search/TermsQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/TermsQParserPlugin.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DocValuesTermsQuery; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; @@ -61,35 +62,35 @@ public class TermsQParserPlugin extends QParserPlugin { private static enum Method { termsFilter { @Override - Filter makeFilter(String fname, BytesRef[] bytesRefs) { - return new QueryWrapperFilter(new TermInSetQuery(fname, bytesRefs)); + Query makeFilter(String fname, BytesRef[] bytesRefs) { + return new TermInSetQuery(fname, bytesRefs);// constant scores } }, booleanQuery { @Override - Filter makeFilter(String fname, BytesRef[] byteRefs) { + Query makeFilter(String fname, BytesRef[] byteRefs) { BooleanQuery.Builder bq = new BooleanQuery.Builder(); for (BytesRef byteRef : byteRefs) { bq.add(new TermQuery(new Term(fname, byteRef)), BooleanClause.Occur.SHOULD); } - return new 
QueryWrapperFilter(bq.build()); + return new ConstantScoreQuery(bq.build()); } }, automaton { @Override - Filter makeFilter(String fname, BytesRef[] byteRefs) { + Query makeFilter(String fname, BytesRef[] byteRefs) { Automaton union = Automata.makeStringUnion(Arrays.asList(byteRefs)); - return new QueryWrapperFilter(new AutomatonQuery(new Term(fname), union)); + return new AutomatonQuery(new Term(fname), union);//constant scores } }, docValuesTermsFilter {//on 4x this is FieldCacheTermsFilter but we use the 5x name any way @Override - Filter makeFilter(String fname, BytesRef[] byteRefs) { - return new QueryWrapperFilter(new DocValuesTermsQuery(fname, byteRefs)); + Query makeFilter(String fname, BytesRef[] byteRefs) { + return new DocValuesTermsQuery(fname, byteRefs);//constant scores } }; - abstract Filter makeFilter(String fname, BytesRef[] byteRefs); + abstract Query makeFilter(String fname, BytesRef[] byteRefs); } @Override @@ -103,6 +104,7 @@ public class TermsQParserPlugin extends QParserPlugin { String qstr = localParams.get(QueryParsing.V);//never null Method method = Method.valueOf(localParams.get(METHOD, Method.termsFilter.name())); //TODO pick the default method based on various heuristics from benchmarks + //TODO pick the default using FieldType.getSetQuery //if space then split on all whitespace & trim, otherwise strictly interpret final boolean sepIsSpace = separator.equals(" "); @@ -134,7 +136,7 @@ public class TermsQParserPlugin extends QParserPlugin { bytesRefs[i] = term.toBytesRef(); } - return new SolrConstantScoreQuery(method.makeFilter(fname, bytesRefs)); + return method.makeFilter(fname, bytesRefs); } }; } diff --git a/solr/core/src/test/org/apache/solr/search/TestQueryWrapperFilter.java b/solr/core/src/test/org/apache/solr/search/TestQueryWrapperFilter.java deleted file mode 100644 index 72a7606a411..00000000000 --- a/solr/core/src/test/org/apache/solr/search/TestQueryWrapperFilter.java +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.solr.search; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Objects; -import java.util.Set; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.Field.Store; -import org.apache.lucene.document.StringField; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.FuzzyQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.RandomApproximationQuery; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.Weight; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.English; -import org.apache.lucene.util.LuceneTestCase; - -import com.carrotsearch.randomizedtesting.generators.RandomPicks; - -public class TestQueryWrapperFilter extends LuceneTestCase { - - // a filter for which other queries don't have special rewrite rules - private static class FilterWrapper extends Filter { - final Filter in; - - FilterWrapper(Filter in) { - this.in = in; - } - - @Override - public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException { - return in.getDocIdSet(context, acceptDocs); - } - - @Override - public String toString(String field) { - return in.toString(field); - } - - @Override - public boolean equals(Object other) { - return sameClassAs(other) && - Objects.equals(in, getClass().cast(other).in); - } - - @Override - public int hashCode() { - return 31 * classHash() + in.hashCode(); - } - } - - public void testBasic() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "value", Field.Store.NO)); - writer.addDocument(doc); - IndexReader reader = writer.getReader(); - writer.close(); - - TermQuery termQuery = new TermQuery(new Term("field", "value")); - - // should not throw exception with primitive query - QueryWrapperFilter qwf = new QueryWrapperFilter(termQuery); - - IndexSearcher searcher = newSearcher(reader); - TopDocs hits = searcher.search(qwf, 10); - assertEquals(1, hits.totalHits); - hits = searcher.search(new FilterWrapper(qwf), 10); - assertEquals(1, hits.totalHits); - - // should not throw exception with complex primitive query - BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); - booleanQuery.add(termQuery, Occur.MUST); - booleanQuery.add(new TermQuery(new Term("field", "missing")), - Occur.MUST_NOT); - qwf = new QueryWrapperFilter(termQuery); - - hits = searcher.search(qwf, 10); - assertEquals(1, hits.totalHits); - hits = searcher.search(new FilterWrapper(qwf), 10); - assertEquals(1, hits.totalHits); - - // should not throw exception with non primitive Query (doesn't implement - // Query#createWeight) - qwf = new QueryWrapperFilter(new FuzzyQuery(new Term("field", "valu"))); - - hits = searcher.search(qwf, 10); - assertEquals(1, hits.totalHits); - hits = searcher.search(new 
FilterWrapper(qwf), 10); - assertEquals(1, hits.totalHits); - - // test a query with no hits - termQuery = new TermQuery(new Term("field", "not_exist")); - qwf = new QueryWrapperFilter(termQuery); - hits = searcher.search(qwf, 10); - assertEquals(0, hits.totalHits); - hits = searcher.search(new FilterWrapper(qwf), 10); - assertEquals(0, hits.totalHits); - reader.close(); - dir.close(); - } - - public void testRandom() throws Exception { - final Directory d = newDirectory(); - final RandomIndexWriter w = new RandomIndexWriter(random(), d); - w.w.getConfig().setMaxBufferedDocs(17); - final int numDocs = atLeast(100); - final Set aDocs = new HashSet<>(); - for(int i=0;i Date: Sat, 2 Jun 2018 00:16:28 -0500 Subject: [PATCH 02/38] SOLR-12290,SOLR-12391: Do not close any servlet streams and improve our servlet stream closing prevention code for users and devs. --- solr/CHANGES.txt | 2 +- .../java/org/apache/solr/servlet/LoadAdminUiServlet.java | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 80f21a539ab..66d885362fd 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -242,7 +242,7 @@ Bug Fixes non-boolean queries. (James Dyer) * SOLR-12290: Do not close any servlet streams and improve our servlet stream closing prevention code for users - and devs. (Mark Miller) + and devs. (Mark Miller, janhoy, Andrzej Bialecki) * SOLR-12293: Updates need to use their own connection pool to maintain connection reuse and prevent spurious recoveries. (Mark Miller) diff --git a/solr/core/src/java/org/apache/solr/servlet/LoadAdminUiServlet.java b/solr/core/src/java/org/apache/solr/servlet/LoadAdminUiServlet.java index 1aa11379023..f5c301e8de5 100644 --- a/solr/core/src/java/org/apache/solr/servlet/LoadAdminUiServlet.java +++ b/solr/core/src/java/org/apache/solr/servlet/LoadAdminUiServlet.java @@ -17,6 +17,7 @@ package org.apache.solr.servlet; import org.apache.commons.io.IOUtils; +import org.apache.commons.io.output.CloseShieldOutputStream; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.StringUtils; import org.apache.solr.common.params.CommonParams; @@ -58,8 +59,8 @@ public final class LoadAdminUiServlet extends BaseSolrServlet { response.setCharacterEncoding("UTF-8"); response.setContentType("text/html"); - // Don't close this! - see SOLR-8933 - out = new OutputStreamWriter(response.getOutputStream(), StandardCharsets.UTF_8); + // We have to close this to flush OutputStreamWriter buffer + out = new OutputStreamWriter(new CloseShieldOutputStream(response.getOutputStream()), StandardCharsets.UTF_8); String html = IOUtils.toString(in, "UTF-8"); Package pack = SolrCore.class.getPackage(); @@ -78,6 +79,7 @@ public final class LoadAdminUiServlet extends BaseSolrServlet { out.write( StringUtils.replaceEach(html, search, replace) ); } finally { IOUtils.closeQuietly(in); + IOUtils.closeQuietly(out); } } else { response.sendError(404); From a2d927667418d17a1f5f31a193092d5b04a4219e Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sat, 2 Jun 2018 12:30:02 +0200 Subject: [PATCH 03/38] LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked as such once it's introduced and can't be changed after the fact. 
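For readers skimming the diff that follows, here is a minimal sketch (not part of the patch) of the behavior this commit enforces, built only from calls that appear in the new tests below (IndexWriterConfig.setSoftDeletesField, IndexWriter.softUpdateDocument, NumericDocValuesField tombstones). The field names and the RAMDirectory choice are illustrative assumptions, not the committed code.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class SoftDeletesFieldSketch {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();  // any Directory; RAMDirectory just keeps the sketch self-contained
        IndexWriterConfig conf = new IndexWriterConfig().setSoftDeletesField("my_deletes");
        try (IndexWriter writer = new IndexWriter(dir, conf)) {
          Document v1 = new Document();
          v1.add(new StringField("id", "1", Field.Store.YES));
          writer.addDocument(v1);

          Document v2 = new Document();
          v2.add(new StringField("id", "1", Field.Store.YES));
          // Soft-delete the old version by attaching a doc-values "tombstone"
          // instead of hard-deleting it.
          writer.softUpdateDocument(new Term("id", "1"), v2,
              new NumericDocValuesField("my_deletes", 1));
          writer.commit();
        }
        // With this change the soft-deletes field is persisted in the per-field metadata
        // (see the new SOFT_DELETES_FIELD bit in Lucene60FieldInfosFormat below), so
        // reopening the same index with a different or missing soft-deletes field fails,
        // e.g. new IndexWriter(dir, new IndexWriterConfig().setSoftDeletesField("your_deletes"))
        // throws IllegalArgumentException.
      }
    }

The diff below carries that flag from IndexWriterConfig through FieldInfos/FieldInfo into the codec layer, which is why the FieldInfo constructor and Lucene60FieldInfosFormat format version change.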
--- lucene/CHANGES.txt | 3 + .../SimpleTextFieldInfosFormat.java | 11 +- .../lucene50/Lucene50FieldInfosFormat.java | 2 +- .../lucene60/Lucene60FieldInfosFormat.java | 8 +- .../org/apache/lucene/index/FieldInfo.java | 18 ++- .../org/apache/lucene/index/FieldInfos.java | 33 ++++-- .../org/apache/lucene/index/IndexWriter.java | 8 +- .../test/org/apache/lucene/index/TestDoc.java | 2 +- .../apache/lucene/index/TestIndexWriter.java | 104 ++++++++++++++++++ .../lucene/index/TestPendingSoftDeletes.java | 10 +- .../lucene/index/TestSegmentMerger.java | 2 +- .../highlight/TermVectorLeafReader.java | 2 +- .../lucene/index/memory/MemoryIndex.java | 4 +- .../index/BaseIndexFileFormatTestCase.java | 2 +- .../lucene/index/MismatchedLeafReader.java | 3 +- .../lucene/index/RandomPostingsTester.java | 4 +- .../handler/component/ExpandComponent.java | 3 +- .../solr/search/CollapsingQParserPlugin.java | 2 +- .../java/org/apache/solr/search/Insanity.java | 2 +- .../solr/uninverting/UninvertingReader.java | 2 +- 20 files changed, 187 insertions(+), 38 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index cd11e7dbeb5..83f9ea29c71 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -203,6 +203,9 @@ New Features now use to also take pending deletes into account which ensures that all file generations per segment always go forward. (Simon Willnauer) +* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked + as such once it's introduced and can't be changed after the fact. (Nhat Nguyen via Simon Willnauer) + Bug Fixes * LUCENE-8221: MoreLikeThis.setMaxDocFreqPct can easily int-overflow on larger diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java index 0ace1534d4e..1c40cbd4255 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java @@ -66,6 +66,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { static final BytesRef ATT_VALUE = new BytesRef(" value "); static final BytesRef DIM_COUNT = new BytesRef(" dimensional count "); static final BytesRef DIM_NUM_BYTES = new BytesRef(" dimensional num bytes "); + static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes "); @Override public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext iocontext) throws IOException { @@ -140,9 +141,13 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { assert StringHelper.startsWith(scratch.get(), DIM_NUM_BYTES); int dimensionalNumBytes = Integer.parseInt(readString(DIM_NUM_BYTES.length, scratch)); + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SOFT_DELETES); + boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch)); + infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, Collections.unmodifiableMap(atts), - dimensionalCount, dimensionalNumBytes); + dimensionalCount, dimensionalNumBytes, isSoftDeletesField); } SimpleTextUtil.checkFooter(input); @@ -238,6 +243,10 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { SimpleTextUtil.write(out, DIM_NUM_BYTES); SimpleTextUtil.write(out, Integer.toString(fi.getPointNumBytes()), scratch); 
SimpleTextUtil.writeNewline(out); + + SimpleTextUtil.write(out, SOFT_DELETES); + SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch); + SimpleTextUtil.writeNewline(out); } SimpleTextUtil.writeChecksum(out, scratch); success = true; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java index a76bfeb6e7a..30dca7041f8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java @@ -148,7 +148,7 @@ public final class Lucene50FieldInfosFormat extends FieldInfosFormat { lastAttributes = attributes; try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, - indexOptions, docValuesType, dvGen, attributes, 0, 0); + indexOptions, docValuesType, dvGen, attributes, 0, 0, false); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java index a35461e3ef7..522a73f1d27 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java @@ -136,6 +136,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; + boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); @@ -159,7 +160,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, attributes, - pointDimensionCount, pointNumBytes); + pointDimensionCount, pointNumBytes, isSoftDeletesField); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); @@ -277,6 +278,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { if (fi.hasVectors()) bits |= STORE_TERMVECTOR; if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; + if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); @@ -301,10 +303,12 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { // Codec header static final String CODEC_NAME = "Lucene60FieldInfos"; static final int FORMAT_START = 0; - static final int FORMAT_CURRENT = FORMAT_START; + static final int FORMAT_SOFT_DELETES = 1; + static final int FORMAT_CURRENT = FORMAT_SOFT_DELETES; // Field flags static final byte STORE_TERMVECTOR = 0x1; static final byte OMIT_NORMS = 0x2; static final byte STORE_PAYLOADS = 0x4; + static final byte SOFT_DELETES_FIELD = 0x8; } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 037fe5c1bc7..b50cb12cd5e 100644 
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -53,14 +53,17 @@ public final class FieldInfo { private int pointDimensionCount; private int pointNumBytes; + // whether this field is used as the soft-deletes field + private final boolean softDeletesField; + /** * Sole constructor. * * @lucene.experimental */ - public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, - boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, - long dvGen, Map attributes, int pointDimensionCount, int pointNumBytes) { + public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, boolean storePayloads, + IndexOptions indexOptions, DocValuesType docValues, long dvGen, Map attributes, + int pointDimensionCount, int pointNumBytes, boolean softDeletesField) { this.name = Objects.requireNonNull(name); this.number = number; this.docValuesType = Objects.requireNonNull(docValues, "DocValuesType must not be null (field: \"" + name + "\")"); @@ -78,6 +81,7 @@ public final class FieldInfo { this.attributes = Objects.requireNonNull(attributes); this.pointDimensionCount = pointDimensionCount; this.pointNumBytes = pointNumBytes; + this.softDeletesField = softDeletesField; assert checkConsistency(); } @@ -332,4 +336,12 @@ public final class FieldInfo { public Map attributes() { return attributes; } + + /** + * Returns true if this field is configured and used as the soft-deletes field. + * See {@link IndexWriterConfig#softDeletesField} + */ + public boolean isSoftDeletesField() { + return softDeletesField; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 4b472a55503..244333678a3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -221,13 +221,17 @@ public class FieldInfos implements Iterable { // norms back on after they were already ommitted; today // we silently discard the norm but this is badly trappy private int lowestUnassignedFieldNumber = -1; + + // The soft-deletes field from IWC to enforce a single soft-deletes field + private final String softDeletesFieldName; - FieldNumbers() { + FieldNumbers(String softDeletesFieldName) { this.nameToNumber = new HashMap<>(); this.numberToName = new HashMap<>(); this.indexOptions = new HashMap<>(); this.docValuesType = new HashMap<>(); this.dimensions = new HashMap<>(); + this.softDeletesFieldName = softDeletesFieldName; } /** @@ -236,7 +240,7 @@ public class FieldInfos implements Iterable { * number assigned if possible otherwise the first unassigned field number * is used as the field number. 
*/ - synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes) { + synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { if (indexOptions != IndexOptions.NONE) { IndexOptions currentOpts = this.indexOptions.get(fieldName); if (currentOpts == null) { @@ -284,6 +288,16 @@ public class FieldInfos implements Iterable { nameToNumber.put(fieldName, fieldNumber); } + if (isSoftDeletesField) { + if (softDeletesFieldName == null) { + throw new IllegalArgumentException("this index has [" + fieldName + "] as soft-deletes already but soft-deletes field is not configured in IWC"); + } else if (fieldName.equals(softDeletesFieldName) == false) { + throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as soft-deletes already"); + } + } else if (fieldName.equals(softDeletesFieldName)) { + throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as non-soft-deletes already"); + } + return fieldNumber.intValue(); } @@ -385,7 +399,7 @@ public class FieldInfos implements Iterable { private boolean finished; Builder() { - this(new FieldNumbers()); + this(new FieldNumbers(null)); } /** @@ -413,8 +427,9 @@ public class FieldInfos implements Iterable { // number for this field. If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0); - fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0); + final boolean isSoftDeletesField = name.equals(globalFieldNumbers.softDeletesFieldName); + final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, DocValuesType.NONE); byName.put(fi.name, fi); @@ -427,7 +442,7 @@ public class FieldInfos implements Iterable { boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, long dvGen, - int dimensionCount, int dimensionNumBytes) { + int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { assert assertNotFinished(); if (docValues == null) { throw new NullPointerException("DocValuesType must not be null"); @@ -439,8 +454,8 @@ public class FieldInfos implements Iterable { // number for this field. 
If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes); - fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes); + final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType()); byName.put(fi.name, fi); @@ -473,7 +488,7 @@ public class FieldInfos implements Iterable { return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), dvGen, - fi.getPointDimensionCount(), fi.getPointNumBytes()); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } public FieldInfo fieldInfo(String fieldName) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index bc2264b7eab..5efba70ad01 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -960,12 +960,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * If this {@link SegmentInfos} has no global field number map the returned instance is empty */ private FieldNumbers getFieldNumberMap() throws IOException { - final FieldNumbers map = new FieldNumbers(); + final FieldNumbers map = new FieldNumbers(config.softDeletesField); for(SegmentCommitInfo info : segmentInfos) { FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { - map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); + map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } } @@ -1787,7 +1787,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (globalFieldNumberMap.contains(f.name(), dvType) == false) { // if this field doesn't exists we try to add it. if it exists and the DV type doesn't match we // get a consistent error message as if you try to do that during an indexing operation. 
- globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0); + globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, f.name().equals(config.softDeletesField)); assert globalFieldNumberMap.contains(f.name(), dvType); } if (config.getIndexSortFields().contains(f.name())) { @@ -2824,7 +2824,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { // This will throw exceptions if any of the incoming fields have an illegal schema change: - globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); + globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } infos.add(copySegmentAsIs(info, newSegName, context)); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java index 37761d3f681..b3262588e65 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java @@ -222,7 +222,7 @@ public class TestDoc extends LuceneTestCase { SegmentMerger merger = new SegmentMerger(Arrays.asList(r1, r2), si, InfoStream.getDefault(), trackingDir, - new FieldInfos.FieldNumbers(), context); + new FieldInfos.FieldNumbers(null), context); MergeState mergeState = merger.merge(); r1.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 5e394d560fc..b9f13c88e0f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -3376,4 +3376,108 @@ public class TestIndexWriter extends LuceneTestCase { IOUtils.close(reader, writer, dir); } + public void testPreventChangingSoftDeletesField() throws Exception { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("my_deletes")); + Document v1 = new Document(); + v1.add(new StringField("id", "1", Field.Store.YES)); + v1.add(new StringField("version", "1", Field.Store.YES)); + writer.addDocument(v1); + Document v2 = new Document(); + v2.add(new StringField("id", "1", Field.Store.YES)); + v2.add(new StringField("version", "2", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "1"), v2, new NumericDocValuesField("my_deletes", 1)); + writer.commit(); + writer.close(); + for (SegmentCommitInfo si : SegmentInfos.readLatestCommit(dir)) { + FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("my_deletes"); + assertTrue(softDeleteField.isSoftDeletesField()); + } + + IllegalArgumentException illegalError = expectThrows(IllegalArgumentException.class, () -> { + new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("your_deletes")); + }); + assertEquals("cannot configure [your_deletes] as soft-deletes; " + + "this index uses [my_deletes] as soft-deletes already", illegalError.getMessage()); + + IndexWriterConfig softDeleteConfig = newIndexWriterConfig().setSoftDeletesField("my_deletes") + .setMergePolicy(new SoftDeletesRetentionMergePolicy("my_deletes", () -> new MatchAllDocsQuery(), newMergePolicy())); + writer = new IndexWriter(dir, softDeleteConfig); + Document tombstone = new Document(); + tombstone.add(new 
StringField("id", "tombstone", Field.Store.YES)); + tombstone.add(new NumericDocValuesField("my_deletes", 1)); + writer.addDocument(tombstone); + writer.flush(); + for (SegmentCommitInfo si : writer.segmentInfos) { + FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("my_deletes"); + assertTrue(softDeleteField.isSoftDeletesField()); + } + writer.close(); + // reopen writer without soft-deletes field should be prevented + IllegalArgumentException reopenError = expectThrows(IllegalArgumentException.class, () -> { + new IndexWriter(dir, newIndexWriterConfig()); + }); + assertEquals("this index has [my_deletes] as soft-deletes already" + + " but soft-deletes field is not configured in IWC", reopenError.getMessage()); + dir.close(); + } + + public void testPreventAddingIndexesWithDifferentSoftDeletesField() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig().setSoftDeletesField("soft_deletes_1")); + for (int i = 0; i < 2; i++) { + Document d = new Document(); + d.add(new StringField("id", "1", Field.Store.YES)); + d.add(new StringField("version", Integer.toString(i), Field.Store.YES)); + w1.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("soft_deletes_1", 1)); + } + w1.commit(); + w1.close(); + + Directory dir2 = newDirectory(); + IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig().setSoftDeletesField("soft_deletes_2")); + IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> w2.addIndexes(dir1)); + assertEquals("cannot configure [soft_deletes_2] as soft-deletes; this index uses [soft_deletes_1] as soft-deletes already", + error.getMessage()); + w2.close(); + + Directory dir3 = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("soft_deletes_1"); + IndexWriter w3 = new IndexWriter(dir3, config); + w3.addIndexes(dir1); + for (SegmentCommitInfo si : w3.segmentInfos) { + FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("soft_deletes_1"); + assertTrue(softDeleteField.isSoftDeletesField()); + } + w3.close(); + IOUtils.close(dir1, dir2, dir3); + } + + public void testNotAllowUsingExistingFieldAsSoftDeletes() throws Exception { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + for (int i = 0; i < 2; i++) { + Document d = new Document(); + d.add(new StringField("id", "1", Field.Store.YES)); + if (random().nextBoolean()) { + d.add(new NumericDocValuesField("dv_field", 1)); + w.updateDocument(new Term("id", "1"), d); + } else { + w.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("dv_field", 1)); + } + } + w.commit(); + w.close(); + String softDeletesField = random().nextBoolean() ? 
"id" : "dv_field"; + IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> { + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField(softDeletesField); + new IndexWriter(dir, config); + }); + assertEquals("cannot configure [" + softDeletesField + "] as soft-deletes;" + + " this index uses [" + softDeletesField + "] as non-soft-deletes already", error.getMessage()); + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("non-existing-field"); + w = new IndexWriter(dir, config); + w.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 5fadd3f10cd..3047364781e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -120,7 +120,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { deletes.onNewReader(segmentReader, commitInfo); reader.close(); writer.close(); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, true); List docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); for (DocValuesFieldUpdates update : updates) { @@ -140,7 +140,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS); updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, true); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } @@ -182,7 +182,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); List docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 3, true)); for (DocValuesFieldUpdates update : updates) { @@ -228,7 +228,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, 
false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); List updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); @@ -247,7 +247,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { assertEquals(0, deletes.numPendingDeletes()); segmentInfo.advanceDocValuesGen(); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index 6d0e04bbb2c..1171b906b98 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -88,7 +88,7 @@ public class TestSegmentMerger extends LuceneTestCase { SegmentMerger merger = new SegmentMerger(Arrays.asList(reader1, reader2), si, InfoStream.getDefault(), mergedDir, - new FieldInfos.FieldNumbers(), + new FieldInfos.FieldNumbers(null), newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1)))); MergeState mergeState = merger.merge(); int docsMerged = mergeState.segmentInfo.maxDoc(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index 144209dcceb..1eef95fdd6d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -81,7 +81,7 @@ public class TermVectorLeafReader extends LeafReader { } FieldInfo fieldInfo = new FieldInfo(field, 0, true, true, terms.hasPayloads(), - indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0); + indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, false); fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo}); } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index ff248c34538..11913d1cbee 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -501,7 +501,7 @@ public class MemoryIndex { IndexOptions indexOptions = storeOffsets ? 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads, indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(), - fieldType.pointDimensionCount(), fieldType.pointNumBytes()); + fieldType.pointDimensionCount(), fieldType.pointNumBytes(), false); } private void storePointValues(Info info, BytesRef pointValue) { @@ -520,7 +520,7 @@ public class MemoryIndex { info.fieldInfo = new FieldInfo( info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(), info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(), - info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes() + info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes(), info.fieldInfo.isSoftDeletesField() ); } else if (existingDocValuesType != docValuesType) { throw new IllegalArgumentException("Can't add [" + docValuesType + "] doc values field [" + fieldName + "], because [" + existingDocValuesType + "] doc values field already exists"); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java index f5b52239057..83419de52e2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java @@ -323,7 +323,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field"); FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(), - proto.getPointDimensionCount(), proto.getPointNumBytes()); + proto.getPointDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField()); FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } ); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java index 7dd6ba89bd0..2c746773f94 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java @@ -77,7 +77,8 @@ public class MismatchedLeafReader extends FilterLeafReader { oldInfo.getDocValuesGen(), // dvGen oldInfo.attributes(), // attributes oldInfo.getPointDimensionCount(), // dimension count - oldInfo.getPointNumBytes()); // dimension numBytes + oldInfo.getPointNumBytes(), // dimension numBytes + oldInfo.isSoftDeletesField()); // used as soft-deletes field shuffled.set(i, newInfo); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java index 29962e609a7..9f2d9b7adc0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java @@ -130,7 +130,7 @@ public class RandomPostingsTester { fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true, 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, -1, new HashMap<>(), - 0, 0); + 0, 0, false); fieldUpto++; SortedMap postings = new TreeMap<>(); @@ -651,7 +651,7 @@ public class RandomPostingsTester { DocValuesType.NONE, -1, new HashMap<>(), - 0, 0); + 0, 0, false); } FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); diff --git a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java index 82a62d56d3e..9ffea4bc9c8 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java @@ -797,7 +797,8 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia fieldInfo.getDocValuesGen(), fieldInfo.attributes(), fieldInfo.getPointDimensionCount(), - fieldInfo.getPointNumBytes()); + fieldInfo.getPointNumBytes(), + fieldInfo.isSoftDeletesField()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java index 76a52583e32..d0f8cd4633e 100644 --- a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java @@ -425,7 +425,7 @@ public class CollapsingQParserPlugin extends QParserPlugin { DocValuesType.NONE, fieldInfo.getDocValuesGen(), fieldInfo.attributes(), - 0, 0); + 0, 0, fieldInfo.isSoftDeletesField()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/Insanity.java b/solr/core/src/java/org/apache/solr/search/Insanity.java index aa366521e88..8fe081f947b 100644 --- a/solr/core/src/java/org/apache/solr/search/Insanity.java +++ b/solr/core/src/java/org/apache/solr/search/Insanity.java @@ -66,7 +66,7 @@ public class Insanity { if (fi.name.equals(insaneField)) { filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(), - fi.getPointDimensionCount(), fi.getPointNumBytes())); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); } else { filteredInfos.add(fi); } diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java index 967db541414..9f0f5271c67 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java +++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -282,7 +282,7 @@ public class UninvertingReader extends FilterLeafReader { } filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(), - fi.getPointDimensionCount(), fi.getPointNumBytes())); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); } fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()])); } From 3dc4fa199c175ed6351f66bac5c23c73b4e3f89a Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sat, 2 Jun 2018 13:47:24 +0200 Subject: [PATCH 04/38] Revert "LUCENE-8335: Enforce soft-deletes field up-front." This reverts commit a2d927667418d17a1f5f31a193092d5b04a4219e. 
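For context, the check being reverted tied the soft-deletes field configured on IndexWriterConfig to the per-field metadata (FieldInfo/FieldInfos), so that an index could not later be reopened with a different or missing soft-deletes field. The sketch below is distilled from the TestIndexWriter cases deleted in this diff and shows the write path that the check guarded; the RAMDirectory, class name and main() scaffolding are illustrative assumptions, not part of this patch.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class SoftDeletesEnforcementSketch {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriterConfig cfg = new IndexWriterConfig().setSoftDeletesField("my_deletes");
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
          Document doc = new Document();
          doc.add(new StringField("id", "1", Field.Store.YES));
          writer.addDocument(doc);
          // Replace the document without a hard delete: the previous version is marked
          // deleted via a docvalues update on the configured soft-deletes field.
          writer.softUpdateDocument(new Term("id", "1"), doc,
              new NumericDocValuesField("my_deletes", 1));
          writer.commit();
        }
        // With LUCENE-8335 applied, reopening the same index with a different
        // soft-deletes field threw IllegalArgumentException; after this revert the
        // mismatch is no longer detected up-front.
        IndexWriterConfig other = new IndexWriterConfig().setSoftDeletesField("your_deletes");
        new IndexWriter(dir, other).close();
      }
    }

The hunks that follow remove exactly this enforcement: FieldInfo drops its isSoftDeletesField flag, FieldInfos.FieldNumbers no longer records the configured field, and the three TestIndexWriter cases covering the mismatch scenarios are deleted.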
--- lucene/CHANGES.txt | 3 - .../SimpleTextFieldInfosFormat.java | 11 +- .../lucene50/Lucene50FieldInfosFormat.java | 2 +- .../lucene60/Lucene60FieldInfosFormat.java | 8 +- .../org/apache/lucene/index/FieldInfo.java | 18 +-- .../org/apache/lucene/index/FieldInfos.java | 33 ++---- .../org/apache/lucene/index/IndexWriter.java | 8 +- .../test/org/apache/lucene/index/TestDoc.java | 2 +- .../apache/lucene/index/TestIndexWriter.java | 104 ------------------ .../lucene/index/TestPendingSoftDeletes.java | 10 +- .../lucene/index/TestSegmentMerger.java | 2 +- .../highlight/TermVectorLeafReader.java | 2 +- .../lucene/index/memory/MemoryIndex.java | 4 +- .../index/BaseIndexFileFormatTestCase.java | 2 +- .../lucene/index/MismatchedLeafReader.java | 3 +- .../lucene/index/RandomPostingsTester.java | 4 +- .../handler/component/ExpandComponent.java | 3 +- .../solr/search/CollapsingQParserPlugin.java | 2 +- .../java/org/apache/solr/search/Insanity.java | 2 +- .../solr/uninverting/UninvertingReader.java | 2 +- 20 files changed, 38 insertions(+), 187 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 83f9ea29c71..cd11e7dbeb5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -203,9 +203,6 @@ New Features now use to also take pending deletes into account which ensures that all file generations per segment always go forward. (Simon Willnauer) -* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked - as such once it's introduced and can't be changed after the fact. (Nhat Nguyen via Simon Willnauer) - Bug Fixes * LUCENE-8221: MoreLikeThis.setMaxDocFreqPct can easily int-overflow on larger diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java index 1c40cbd4255..0ace1534d4e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java @@ -66,7 +66,6 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { static final BytesRef ATT_VALUE = new BytesRef(" value "); static final BytesRef DIM_COUNT = new BytesRef(" dimensional count "); static final BytesRef DIM_NUM_BYTES = new BytesRef(" dimensional num bytes "); - static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes "); @Override public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext iocontext) throws IOException { @@ -141,13 +140,9 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { assert StringHelper.startsWith(scratch.get(), DIM_NUM_BYTES); int dimensionalNumBytes = Integer.parseInt(readString(DIM_NUM_BYTES.length, scratch)); - SimpleTextUtil.readLine(input, scratch); - assert StringHelper.startsWith(scratch.get(), SOFT_DELETES); - boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch)); - infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, Collections.unmodifiableMap(atts), - dimensionalCount, dimensionalNumBytes, isSoftDeletesField); + dimensionalCount, dimensionalNumBytes); } SimpleTextUtil.checkFooter(input); @@ -243,10 +238,6 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { SimpleTextUtil.write(out, DIM_NUM_BYTES); SimpleTextUtil.write(out, Integer.toString(fi.getPointNumBytes()), scratch); 
SimpleTextUtil.writeNewline(out); - - SimpleTextUtil.write(out, SOFT_DELETES); - SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch); - SimpleTextUtil.writeNewline(out); } SimpleTextUtil.writeChecksum(out, scratch); success = true; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java index 30dca7041f8..a76bfeb6e7a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java @@ -148,7 +148,7 @@ public final class Lucene50FieldInfosFormat extends FieldInfosFormat { lastAttributes = attributes; try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, - indexOptions, docValuesType, dvGen, attributes, 0, 0, false); + indexOptions, docValuesType, dvGen, attributes, 0, 0); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java index 522a73f1d27..a35461e3ef7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java @@ -136,7 +136,6 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; - boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); @@ -160,7 +159,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, attributes, - pointDimensionCount, pointNumBytes, isSoftDeletesField); + pointDimensionCount, pointNumBytes); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); @@ -278,7 +277,6 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { if (fi.hasVectors()) bits |= STORE_TERMVECTOR; if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; - if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); @@ -303,12 +301,10 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { // Codec header static final String CODEC_NAME = "Lucene60FieldInfos"; static final int FORMAT_START = 0; - static final int FORMAT_SOFT_DELETES = 1; - static final int FORMAT_CURRENT = FORMAT_SOFT_DELETES; + static final int FORMAT_CURRENT = FORMAT_START; // Field flags static final byte STORE_TERMVECTOR = 0x1; static final byte OMIT_NORMS = 0x2; static final byte STORE_PAYLOADS = 0x4; - static final byte SOFT_DELETES_FIELD = 0x8; } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index b50cb12cd5e..037fe5c1bc7 100644 
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -53,17 +53,14 @@ public final class FieldInfo { private int pointDimensionCount; private int pointNumBytes; - // whether this field is used as the soft-deletes field - private final boolean softDeletesField; - /** * Sole constructor. * * @lucene.experimental */ - public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, boolean storePayloads, - IndexOptions indexOptions, DocValuesType docValues, long dvGen, Map attributes, - int pointDimensionCount, int pointNumBytes, boolean softDeletesField) { + public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, + boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, + long dvGen, Map attributes, int pointDimensionCount, int pointNumBytes) { this.name = Objects.requireNonNull(name); this.number = number; this.docValuesType = Objects.requireNonNull(docValues, "DocValuesType must not be null (field: \"" + name + "\")"); @@ -81,7 +78,6 @@ public final class FieldInfo { this.attributes = Objects.requireNonNull(attributes); this.pointDimensionCount = pointDimensionCount; this.pointNumBytes = pointNumBytes; - this.softDeletesField = softDeletesField; assert checkConsistency(); } @@ -336,12 +332,4 @@ public final class FieldInfo { public Map attributes() { return attributes; } - - /** - * Returns true if this field is configured and used as the soft-deletes field. - * See {@link IndexWriterConfig#softDeletesField} - */ - public boolean isSoftDeletesField() { - return softDeletesField; - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 244333678a3..4b472a55503 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -221,17 +221,13 @@ public class FieldInfos implements Iterable { // norms back on after they were already ommitted; today // we silently discard the norm but this is badly trappy private int lowestUnassignedFieldNumber = -1; - - // The soft-deletes field from IWC to enforce a single soft-deletes field - private final String softDeletesFieldName; - FieldNumbers(String softDeletesFieldName) { + FieldNumbers() { this.nameToNumber = new HashMap<>(); this.numberToName = new HashMap<>(); this.indexOptions = new HashMap<>(); this.docValuesType = new HashMap<>(); this.dimensions = new HashMap<>(); - this.softDeletesFieldName = softDeletesFieldName; } /** @@ -240,7 +236,7 @@ public class FieldInfos implements Iterable { * number assigned if possible otherwise the first unassigned field number * is used as the field number. 
*/ - synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { + synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes) { if (indexOptions != IndexOptions.NONE) { IndexOptions currentOpts = this.indexOptions.get(fieldName); if (currentOpts == null) { @@ -288,16 +284,6 @@ public class FieldInfos implements Iterable { nameToNumber.put(fieldName, fieldNumber); } - if (isSoftDeletesField) { - if (softDeletesFieldName == null) { - throw new IllegalArgumentException("this index has [" + fieldName + "] as soft-deletes already but soft-deletes field is not configured in IWC"); - } else if (fieldName.equals(softDeletesFieldName) == false) { - throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as soft-deletes already"); - } - } else if (fieldName.equals(softDeletesFieldName)) { - throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as non-soft-deletes already"); - } - return fieldNumber.intValue(); } @@ -399,7 +385,7 @@ public class FieldInfos implements Iterable { private boolean finished; Builder() { - this(new FieldNumbers(null)); + this(new FieldNumbers()); } /** @@ -427,9 +413,8 @@ public class FieldInfos implements Iterable { // number for this field. If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final boolean isSoftDeletesField = name.equals(globalFieldNumbers.softDeletesFieldName); - final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, isSoftDeletesField); - fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, isSoftDeletesField); + final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0); + fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, DocValuesType.NONE); byName.put(fi.name, fi); @@ -442,7 +427,7 @@ public class FieldInfos implements Iterable { boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, long dvGen, - int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { + int dimensionCount, int dimensionNumBytes) { assert assertNotFinished(); if (docValues == null) { throw new NullPointerException("DocValuesType must not be null"); @@ -454,8 +439,8 @@ public class FieldInfos implements Iterable { // number for this field. 
If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes, isSoftDeletesField); - fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes, isSoftDeletesField); + final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes); + fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType()); byName.put(fi.name, fi); @@ -488,7 +473,7 @@ public class FieldInfos implements Iterable { return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), dvGen, - fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); + fi.getPointDimensionCount(), fi.getPointNumBytes()); } public FieldInfo fieldInfo(String fieldName) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 5efba70ad01..bc2264b7eab 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -960,12 +960,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * If this {@link SegmentInfos} has no global field number map the returned instance is empty */ private FieldNumbers getFieldNumberMap() throws IOException { - final FieldNumbers map = new FieldNumbers(config.softDeletesField); + final FieldNumbers map = new FieldNumbers(); for(SegmentCommitInfo info : segmentInfos) { FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { - map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); + map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); } } @@ -1787,7 +1787,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (globalFieldNumberMap.contains(f.name(), dvType) == false) { // if this field doesn't exists we try to add it. if it exists and the DV type doesn't match we // get a consistent error message as if you try to do that during an indexing operation. 
- globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, f.name().equals(config.softDeletesField)); + globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0); assert globalFieldNumberMap.contains(f.name(), dvType); } if (config.getIndexSortFields().contains(f.name())) { @@ -2824,7 +2824,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { // This will throw exceptions if any of the incoming fields have an illegal schema change: - globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); + globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); } infos.add(copySegmentAsIs(info, newSegName, context)); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java index b3262588e65..37761d3f681 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java @@ -222,7 +222,7 @@ public class TestDoc extends LuceneTestCase { SegmentMerger merger = new SegmentMerger(Arrays.asList(r1, r2), si, InfoStream.getDefault(), trackingDir, - new FieldInfos.FieldNumbers(null), context); + new FieldInfos.FieldNumbers(), context); MergeState mergeState = merger.merge(); r1.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index b9f13c88e0f..5e394d560fc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -3376,108 +3376,4 @@ public class TestIndexWriter extends LuceneTestCase { IOUtils.close(reader, writer, dir); } - public void testPreventChangingSoftDeletesField() throws Exception { - Directory dir = newDirectory(); - IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("my_deletes")); - Document v1 = new Document(); - v1.add(new StringField("id", "1", Field.Store.YES)); - v1.add(new StringField("version", "1", Field.Store.YES)); - writer.addDocument(v1); - Document v2 = new Document(); - v2.add(new StringField("id", "1", Field.Store.YES)); - v2.add(new StringField("version", "2", Field.Store.YES)); - writer.softUpdateDocument(new Term("id", "1"), v2, new NumericDocValuesField("my_deletes", 1)); - writer.commit(); - writer.close(); - for (SegmentCommitInfo si : SegmentInfos.readLatestCommit(dir)) { - FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("my_deletes"); - assertTrue(softDeleteField.isSoftDeletesField()); - } - - IllegalArgumentException illegalError = expectThrows(IllegalArgumentException.class, () -> { - new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("your_deletes")); - }); - assertEquals("cannot configure [your_deletes] as soft-deletes; " + - "this index uses [my_deletes] as soft-deletes already", illegalError.getMessage()); - - IndexWriterConfig softDeleteConfig = newIndexWriterConfig().setSoftDeletesField("my_deletes") - .setMergePolicy(new SoftDeletesRetentionMergePolicy("my_deletes", () -> new MatchAllDocsQuery(), newMergePolicy())); - writer = new IndexWriter(dir, softDeleteConfig); - Document tombstone = new Document(); - tombstone.add(new 
StringField("id", "tombstone", Field.Store.YES)); - tombstone.add(new NumericDocValuesField("my_deletes", 1)); - writer.addDocument(tombstone); - writer.flush(); - for (SegmentCommitInfo si : writer.segmentInfos) { - FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("my_deletes"); - assertTrue(softDeleteField.isSoftDeletesField()); - } - writer.close(); - // reopen writer without soft-deletes field should be prevented - IllegalArgumentException reopenError = expectThrows(IllegalArgumentException.class, () -> { - new IndexWriter(dir, newIndexWriterConfig()); - }); - assertEquals("this index has [my_deletes] as soft-deletes already" + - " but soft-deletes field is not configured in IWC", reopenError.getMessage()); - dir.close(); - } - - public void testPreventAddingIndexesWithDifferentSoftDeletesField() throws Exception { - Directory dir1 = newDirectory(); - IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig().setSoftDeletesField("soft_deletes_1")); - for (int i = 0; i < 2; i++) { - Document d = new Document(); - d.add(new StringField("id", "1", Field.Store.YES)); - d.add(new StringField("version", Integer.toString(i), Field.Store.YES)); - w1.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("soft_deletes_1", 1)); - } - w1.commit(); - w1.close(); - - Directory dir2 = newDirectory(); - IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig().setSoftDeletesField("soft_deletes_2")); - IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> w2.addIndexes(dir1)); - assertEquals("cannot configure [soft_deletes_2] as soft-deletes; this index uses [soft_deletes_1] as soft-deletes already", - error.getMessage()); - w2.close(); - - Directory dir3 = newDirectory(); - IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("soft_deletes_1"); - IndexWriter w3 = new IndexWriter(dir3, config); - w3.addIndexes(dir1); - for (SegmentCommitInfo si : w3.segmentInfos) { - FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("soft_deletes_1"); - assertTrue(softDeleteField.isSoftDeletesField()); - } - w3.close(); - IOUtils.close(dir1, dir2, dir3); - } - - public void testNotAllowUsingExistingFieldAsSoftDeletes() throws Exception { - Directory dir = newDirectory(); - IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); - for (int i = 0; i < 2; i++) { - Document d = new Document(); - d.add(new StringField("id", "1", Field.Store.YES)); - if (random().nextBoolean()) { - d.add(new NumericDocValuesField("dv_field", 1)); - w.updateDocument(new Term("id", "1"), d); - } else { - w.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("dv_field", 1)); - } - } - w.commit(); - w.close(); - String softDeletesField = random().nextBoolean() ? 
"id" : "dv_field"; - IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> { - IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField(softDeletesField); - new IndexWriter(dir, config); - }); - assertEquals("cannot configure [" + softDeletesField + "] as soft-deletes;" + - " this index uses [" + softDeletesField + "] as non-soft-deletes already", error.getMessage()); - IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("non-existing-field"); - w = new IndexWriter(dir, config); - w.close(); - dir.close(); - } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 3047364781e..5fadd3f10cd 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -120,7 +120,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { deletes.onNewReader(segmentReader, commitInfo); reader.close(); writer.close(); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, true); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0); List docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); for (DocValuesFieldUpdates update : updates) { @@ -140,7 +140,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS); updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, true); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } @@ -182,7 +182,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); List docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 3, true)); for (DocValuesFieldUpdates update : updates) { @@ -228,7 +228,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, 
false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); List updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); @@ -247,7 +247,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { assertEquals(0, deletes.numPendingDeletes()); segmentInfo.advanceDocValuesGen(); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index 1171b906b98..6d0e04bbb2c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -88,7 +88,7 @@ public class TestSegmentMerger extends LuceneTestCase { SegmentMerger merger = new SegmentMerger(Arrays.asList(reader1, reader2), si, InfoStream.getDefault(), mergedDir, - new FieldInfos.FieldNumbers(null), + new FieldInfos.FieldNumbers(), newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1)))); MergeState mergeState = merger.merge(); int docsMerged = mergeState.segmentInfo.maxDoc(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index 1eef95fdd6d..144209dcceb 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -81,7 +81,7 @@ public class TermVectorLeafReader extends LeafReader { } FieldInfo fieldInfo = new FieldInfo(field, 0, true, true, terms.hasPayloads(), - indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, false); + indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0); fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo}); } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 11913d1cbee..ff248c34538 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -501,7 +501,7 @@ public class MemoryIndex { IndexOptions indexOptions = storeOffsets ? 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads, indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(), - fieldType.pointDimensionCount(), fieldType.pointNumBytes(), false); + fieldType.pointDimensionCount(), fieldType.pointNumBytes()); } private void storePointValues(Info info, BytesRef pointValue) { @@ -520,7 +520,7 @@ public class MemoryIndex { info.fieldInfo = new FieldInfo( info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(), info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(), - info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes(), info.fieldInfo.isSoftDeletesField() + info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes() ); } else if (existingDocValuesType != docValuesType) { throw new IllegalArgumentException("Can't add [" + docValuesType + "] doc values field [" + fieldName + "], because [" + existingDocValuesType + "] doc values field already exists"); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java index 83419de52e2..f5b52239057 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java @@ -323,7 +323,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field"); FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(), - proto.getPointDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField()); + proto.getPointDimensionCount(), proto.getPointNumBytes()); FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } ); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java index 2c746773f94..7dd6ba89bd0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java @@ -77,8 +77,7 @@ public class MismatchedLeafReader extends FilterLeafReader { oldInfo.getDocValuesGen(), // dvGen oldInfo.attributes(), // attributes oldInfo.getPointDimensionCount(), // dimension count - oldInfo.getPointNumBytes(), // dimension numBytes - oldInfo.isSoftDeletesField()); // used as soft-deletes field + oldInfo.getPointNumBytes()); // dimension numBytes shuffled.set(i, newInfo); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java index 9f2d9b7adc0..29962e609a7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java @@ -130,7 +130,7 @@ public class RandomPostingsTester { fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true, 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, -1, new HashMap<>(), - 0, 0, false); + 0, 0); fieldUpto++; SortedMap postings = new TreeMap<>(); @@ -651,7 +651,7 @@ public class RandomPostingsTester { DocValuesType.NONE, -1, new HashMap<>(), - 0, 0, false); + 0, 0); } FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); diff --git a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java index 9ffea4bc9c8..82a62d56d3e 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java @@ -797,8 +797,7 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia fieldInfo.getDocValuesGen(), fieldInfo.attributes(), fieldInfo.getPointDimensionCount(), - fieldInfo.getPointNumBytes(), - fieldInfo.isSoftDeletesField()); + fieldInfo.getPointNumBytes()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java index d0f8cd4633e..76a52583e32 100644 --- a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java @@ -425,7 +425,7 @@ public class CollapsingQParserPlugin extends QParserPlugin { DocValuesType.NONE, fieldInfo.getDocValuesGen(), fieldInfo.attributes(), - 0, 0, fieldInfo.isSoftDeletesField()); + 0, 0); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/Insanity.java b/solr/core/src/java/org/apache/solr/search/Insanity.java index 8fe081f947b..aa366521e88 100644 --- a/solr/core/src/java/org/apache/solr/search/Insanity.java +++ b/solr/core/src/java/org/apache/solr/search/Insanity.java @@ -66,7 +66,7 @@ public class Insanity { if (fi.name.equals(insaneField)) { filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(), - fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); + fi.getPointDimensionCount(), fi.getPointNumBytes())); } else { filteredInfos.add(fi); } diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java index 9f0f5271c67..967db541414 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java +++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -282,7 +282,7 @@ public class UninvertingReader extends FilterLeafReader { } filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(), - fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); + fi.getPointDimensionCount(), fi.getPointNumBytes())); } fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()])); } From ab316bbc91c273b13c851a38ad5d14ef64ab3eec Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Mon, 4 Jun 2018 11:32:31 +0700 Subject: [PATCH 05/38] SOLR-9922: Write buffering updates to another tlog --- SOLR-9922.patch | 1294 +++++++++++++++++ solr/CHANGES.txt | 2 + .../apache/solr/cloud/RecoveryStrategy.java | 29 +- .../solr/cloud/ReplicateFromLeader.java | 2 +- 
.../solr/update/CdcrTransactionLog.java | 20 +- .../org/apache/solr/update/CdcrUpdateLog.java | 3 - .../solr/update/HdfsTransactionLog.java | 18 +- .../org/apache/solr/update/HdfsUpdateLog.java | 84 +- .../apache/solr/update/TransactionLog.java | 56 +- .../org/apache/solr/update/UpdateLog.java | 259 ++-- .../org/apache/solr/search/TestRecovery.java | 58 +- .../apache/solr/search/TestRecoveryHdfs.java | 46 +- .../solr/update/TransactionLogTest.java | 2 +- 13 files changed, 1555 insertions(+), 318 deletions(-) create mode 100644 SOLR-9922.patch diff --git a/SOLR-9922.patch b/SOLR-9922.patch new file mode 100644 index 00000000000..052abf4041a --- /dev/null +++ b/SOLR-9922.patch @@ -0,0 +1,1294 @@ +diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +index c8f5ae8..966497b 100644 +--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java ++++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +@@ -449,7 +449,6 @@ public class RecoveryStrategy implements Runnable, Closeable { + + // TODO: perhaps make this grab a new core each time through the loop to handle core reloads? + final public void doSyncOrReplicateRecovery(SolrCore core) throws Exception { +- boolean replayed = false; + boolean successfulRecovery = false; + + UpdateLog ulog; +@@ -500,8 +499,7 @@ public class RecoveryStrategy implements Runnable, Closeable { + // when we went down. We may have received updates since then. + recentVersions = startingVersions; + try { +- if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) { +- // last operation at the time of startup had the GAP flag set... ++ if (ulog.existOldBufferLog()) { + // this means we were previously doing a full index replication + // that probably didn't complete and buffering updates in the + // meantime. +@@ -542,9 +540,9 @@ public class RecoveryStrategy implements Runnable, Closeable { + } + + LOG.info("Begin buffering updates. core=[{}]", coreName); ++ // recalling buffer updates will drop the old buffer tlog + ulog.bufferUpdates(); +- replayed = false; +- ++ + LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leader.getCoreUrl(), + ourUrl); + zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING); +@@ -603,8 +601,7 @@ public class RecoveryStrategy implements Runnable, Closeable { + + LOG.info("Replaying updates buffered during PeerSync."); + replay(core); +- replayed = true; +- ++ + // sync success + successfulRecovery = true; + return; +@@ -630,8 +627,7 @@ public class RecoveryStrategy implements Runnable, Closeable { + } + + replayFuture = replay(core); +- replayed = true; +- ++ + if (isClosed()) { + LOG.info("RecoveryStrategy has been closed"); + break; +@@ -650,21 +646,6 @@ public class RecoveryStrategy implements Runnable, Closeable { + } catch (Exception e) { + SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e); + } finally { +- if (!replayed) { +- // dropBufferedUpdate()s currently only supports returning to ACTIVE state, which risks additional updates +- // being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date. +- // For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will +- // reset our starting point for playback. +- LOG.info("Replay not started, or was not successful... 
still buffering updates."); +- +- /** this prev code is retained in case we want to switch strategies. +- try { +- ulog.dropBufferedUpdates(); +- } catch (Exception e) { +- SolrException.log(log, "", e); +- } +- **/ +- } + if (successfulRecovery) { + LOG.info("Registering as Active after recovery."); + try { +diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +index 0a742e3..aa648dd 100644 +--- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java ++++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +@@ -97,7 +97,7 @@ public class ReplicateFromLeader { + new ModifiableSolrParams()); + CommitUpdateCommand cuc = new CommitUpdateCommand(req, false); + cuc.setVersion(Long.parseLong(commitVersion)); +- updateLog.copyOverOldUpdates(cuc); ++ updateLog.commitAndSwitchToNewTlog(cuc); + lastVersion = Long.parseLong(commitVersion); + } + }); +diff --git a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java +index 3534f62..f668540 100644 +--- a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java ++++ b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java +@@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory; + * methods {@link #incref()}, {@link #close()} and {@link #reopenOutputStream()}. + *

<li>encode the number of records in the tlog file in the last commit record. The number of records will be + * decoded and reuse if the tlog file is reopened. This is achieved by extending the constructor, and the +- * methods {@link #writeCommit(CommitUpdateCommand, int)} and {@link #getReader(long)}.
  • ++ * methods {@link #writeCommit(CommitUpdateCommand)} and {@link #getReader(long)}. + * + */ + public class CdcrTransactionLog extends TransactionLog { +@@ -108,7 +108,7 @@ public class CdcrTransactionLog extends TransactionLog { + } + + @Override +- public long write(AddUpdateCommand cmd, long prevPointer, int flags) { ++ public long write(AddUpdateCommand cmd, long prevPointer) { + assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); + + LogCodec codec = new LogCodec(resolver); +@@ -125,7 +125,7 @@ public class CdcrTransactionLog extends TransactionLog { + codec.init(out); + if (cmd.isInPlaceUpdate()) { + codec.writeTag(JavaBinCodec.ARR, 6); +- codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte ++ codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeLong(prevPointer); + codec.writeLong(cmd.prevVersion); +@@ -141,7 +141,7 @@ public class CdcrTransactionLog extends TransactionLog { + + } else { + codec.writeTag(JavaBinCodec.ARR, 4); +- codec.writeInt(UpdateLog.ADD | flags); // should just take one byte ++ codec.writeInt(UpdateLog.ADD); // should just take one byte + codec.writeLong(cmd.getVersion()); + if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { + // if the update is received via cdcr source; add extra boolean entry +@@ -179,7 +179,7 @@ public class CdcrTransactionLog extends TransactionLog { + } + + @Override +- public long writeDelete(DeleteUpdateCommand cmd, int flags) { ++ public long writeDelete(DeleteUpdateCommand cmd) { + LogCodec codec = new LogCodec(resolver); + + try { +@@ -190,7 +190,7 @@ public class CdcrTransactionLog extends TransactionLog { + MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); + codec.init(out); + codec.writeTag(JavaBinCodec.ARR, 4); +- codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte ++ codec.writeInt(UpdateLog.DELETE); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeByteArray(br.bytes, br.offset, br.length); + if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { +@@ -217,7 +217,7 @@ public class CdcrTransactionLog extends TransactionLog { + } + + @Override +- public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { ++ public long writeDeleteByQuery(DeleteUpdateCommand cmd) { + LogCodec codec = new LogCodec(resolver); + try { + checkWriteHeader(codec, null); +@@ -225,7 +225,7 @@ public class CdcrTransactionLog extends TransactionLog { + MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); + codec.init(out); + codec.writeTag(JavaBinCodec.ARR, 4); +- codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte ++ codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeStr(cmd.query); + if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { +@@ -249,7 +249,7 @@ public class CdcrTransactionLog extends TransactionLog { + } + + @Override +- public long writeCommit(CommitUpdateCommand cmd, int flags) { ++ public long writeCommit(CommitUpdateCommand cmd) { + LogCodec codec = new LogCodec(resolver); + synchronized (this) { + try { +@@ -261,7 +261,7 @@ public class CdcrTransactionLog extends TransactionLog { + } + codec.init(fos); + codec.writeTag(JavaBinCodec.ARR, 4); +- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte ++ 
codec.writeInt(UpdateLog.COMMIT); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeTag(JavaBinCodec.INT); // Enforce the encoding of a plain integer, to simplify decoding + fos.writeInt(numRecords + 1); // the number of records in the file - +1 to account for the commit operation being written +diff --git a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java +index 6b20204..bff1612 100644 +--- a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java ++++ b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java +@@ -352,7 +352,6 @@ public class CdcrUpdateLog extends UpdateLog { + long latestVersion = startingUpdates.getMaxRecentVersion(); + try { + startingVersions = startingUpdates.getVersions(numRecordsToKeep); +- startingOperation = startingUpdates.getLatestOperation(); + + // populate recent deletes list (since we can't get that info from the index) + for (int i=startingUpdates.deleteList.size()-1; i>=0; i--) { +@@ -389,9 +388,7 @@ public class CdcrUpdateLog extends UpdateLog { + */ + private void copyBufferedUpdates(File tlogSrc, long offsetSrc, long latestVersion) { + recoveryInfo = new RecoveryInfo(); +- recoveryInfo.positionOfStart = tlog == null ? 0 : tlog.snapshot(); + state = State.BUFFERING; +- operationFlags |= FLAG_GAP; + + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set(DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM, DistributedUpdateProcessor.DistribPhase.FROMLEADER.toString()); +diff --git a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java +index 0f89016..8ed7d7a 100644 +--- a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java ++++ b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java +@@ -166,20 +166,6 @@ public class HdfsTransactionLog extends TransactionLog { + } + return true; + } +- +- // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. +- // This should only be used to roll back buffered updates, not actually applied updates. +- @Override +- public void rollback(long pos) throws IOException { +- synchronized (this) { +- assert snapshot_size == pos; +- ensureFlushed(); +- // TODO: how do we rollback with hdfs?? 
We need HDFS-3107 +- fos.setWritten(pos); +- assert fos.size() == pos; +- numRecords = snapshot_numRecords; +- } +- } + + private void readHeader(FastInputStream fis) throws IOException { + // read existing header +@@ -210,7 +196,7 @@ public class HdfsTransactionLog extends TransactionLog { + } + + @Override +- public long writeCommit(CommitUpdateCommand cmd, int flags) { ++ public long writeCommit(CommitUpdateCommand cmd) { + LogCodec codec = new LogCodec(resolver); + synchronized (this) { + try { +@@ -223,7 +209,7 @@ public class HdfsTransactionLog extends TransactionLog { + + codec.init(fos); + codec.writeTag(JavaBinCodec.ARR, 3); +- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte ++ codec.writeInt(UpdateLog.COMMIT); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file + +diff --git a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java +index 7bb74d0..8ca4b1c 100644 +--- a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java ++++ b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java +@@ -65,37 +65,6 @@ public class HdfsUpdateLog extends UpdateLog { + this.confDir = confDir; + } + +- // HACK +- // while waiting for HDFS-3107, instead of quickly +- // dropping, we slowly apply +- // This is somewhat brittle, but current usage +- // allows for it +- @Override +- public boolean dropBufferedUpdates() { +- versionInfo.blockUpdates(); +- try { +- if (state != State.BUFFERING) return false; +- +- if (log.isInfoEnabled()) { +- log.info("Dropping buffered updates " + this); +- } +- +- // since we blocked updates, this synchronization shouldn't strictly be +- // necessary. +- synchronized (this) { +- if (tlog != null) { +- // tlog.rollback(recoveryInfo.positionOfStart); +- } +- } +- +- state = State.ACTIVE; +- operationFlags &= ~FLAG_GAP; +- } finally { +- versionInfo.unblockUpdates(); +- } +- return true; +- } +- + @Override + public void init(PluginInfo info) { + super.init(info); +@@ -186,6 +155,11 @@ public class HdfsUpdateLog extends UpdateLog { + throw new RuntimeException("Problem creating directory: " + tlogDir, e); + } + } ++ ++ String[] oldBufferTlog = getBufferLogList(fs, tlogDir); ++ if (oldBufferTlog != null && oldBufferTlog.length != 0) { ++ existOldBufferLog = true; ++ } + + tlogFiles = getLogList(fs, tlogDir); + id = getLastLogId() + 1; // add 1 since we will create a new log for the +@@ -241,7 +215,6 @@ public class HdfsUpdateLog extends UpdateLog { + // non-complete tlogs. 
+ try (RecentUpdates startingUpdates = getRecentUpdates()) { + startingVersions = startingUpdates.getVersions(getNumRecordsToKeep()); +- startingOperation = startingUpdates.getLatestOperation(); + + // populate recent deletes list (since we can't get that info from the + // index) +@@ -269,6 +242,23 @@ public class HdfsUpdateLog extends UpdateLog { + public String getLogDir() { + return tlogDir.toUri().toString(); + } ++ ++ public static String[] getBufferLogList(FileSystem fs, Path tlogDir) { ++ final String prefix = BUFFER_TLOG_NAME+'.'; ++ assert fs != null; ++ FileStatus[] fileStatuses; ++ try { ++ fileStatuses = fs.listStatus(tlogDir, path -> path.getName().startsWith(prefix)); ++ } catch (IOException e) { ++ throw new SolrException(ErrorCode.SERVER_ERROR, "Failed on listing old buffer tlog", e); ++ } ++ ++ String[] names = new String[fileStatuses.length]; ++ for (int i = 0; i < fileStatuses.length; i++) { ++ names[i] = fileStatuses[i].getPath().getName(); ++ } ++ return names; ++ } + + public static String[] getLogList(FileSystem fs, Path tlogDir) { + final String prefix = TLOG_NAME + '.'; +@@ -307,7 +297,35 @@ public class HdfsUpdateLog extends UpdateLog { + IOUtils.closeQuietly(fs); + } + } +- ++ ++ @Override ++ protected void ensureBufferTlog() { ++ if (bufferTlog != null) return; ++ String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); ++ bufferTlog = new HdfsTransactionLog(fs, new Path(tlogDir, newLogName), ++ globalStrings, tlogDfsReplication); ++ } ++ ++ @Override ++ protected void deleteBufferLogs() { ++ // Delete old buffer logs ++ String[] oldBufferTlog = getBufferLogList(fs, tlogDir); ++ if (oldBufferTlog != null && oldBufferTlog.length != 0) { ++ for (String oldBufferLogName : oldBufferTlog) { ++ Path f = new Path(tlogDir, oldBufferLogName); ++ try { ++ boolean s = fs.delete(f, false); ++ if (!s) { ++ log.error("Could not remove old buffer tlog file:" + f); ++ } ++ } catch (IOException e) { ++ // No need to bubble up this exception, because it won't cause any problems on recovering ++ log.error("Could not remove old buffer tlog file:" + f, e); ++ } ++ } ++ } ++ } ++ + @Override + protected void ensureLog() { + if (tlog == null) { +diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java +index 96a928c..2a23896 100644 +--- a/solr/core/src/java/org/apache/solr/update/TransactionLog.java ++++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java +@@ -85,9 +85,6 @@ public class TransactionLog implements Closeable { + Map globalStringMap = new HashMap<>(); + List globalStringList = new ArrayList<>(); + +- long snapshot_size; +- int snapshot_numRecords; +- + // write a BytesRef as a byte array + static final JavaBinCodec.ObjectResolver resolver = new JavaBinCodec.ObjectResolver() { + @Override +@@ -153,7 +150,7 @@ public class TransactionLog implements Closeable { + + // Parse tlog id from the filename + String filename = tlogFile.getName(); +- id = Long.parseLong(filename.substring(filename.indexOf('.') + 1, filename.indexOf('.') + 20)); ++ id = Long.parseLong(filename.substring(filename.lastIndexOf('.')+1)); + + this.tlogFile = tlogFile; + raf = new RandomAccessFile(this.tlogFile, "rw"); +@@ -233,29 +230,6 @@ public class TransactionLog implements Closeable { + return true; + } + +- /** takes a snapshot of the current position and number of records +- * for later possible rollback, and returns the position */ +- public long 
snapshot() { +- synchronized (this) { +- snapshot_size = fos.size(); +- snapshot_numRecords = numRecords; +- return snapshot_size; +- } +- } +- +- // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. +- // This should only be used to roll back buffered updates, not actually applied updates. +- public void rollback(long pos) throws IOException { +- synchronized (this) { +- assert snapshot_size == pos; +- fos.flush(); +- raf.setLength(pos); +- fos.setWritten(pos); +- assert fos.size() == pos; +- numRecords = snapshot_numRecords; +- } +- } +- + public long writeData(Object o) { + @SuppressWarnings("resource") final LogCodec codec = new LogCodec(resolver); + try { +@@ -346,17 +320,16 @@ public class TransactionLog implements Closeable { + + /** + * Writes an add update command to the transaction log. This is not applicable for +- * in-place updates; use {@link #write(AddUpdateCommand, long, int)}. ++ * in-place updates; use {@link #write(AddUpdateCommand, long)}. + * (The previous pointer (applicable for in-place updates) is set to -1 while writing + * the command to the transaction log.) + * @param cmd The add update command to be written +- * @param flags Options for writing the command to the transaction log + * @return Returns the position pointer of the written update command + * +- * @see #write(AddUpdateCommand, long, int) ++ * @see #write(AddUpdateCommand, long) + */ +- public long write(AddUpdateCommand cmd, int flags) { +- return write(cmd, -1, flags); ++ public long write(AddUpdateCommand cmd) { ++ return write(cmd, -1); + } + + /** +@@ -365,10 +338,9 @@ public class TransactionLog implements Closeable { + * @param cmd The add update command to be written + * @param prevPointer The pointer in the transaction log which this update depends + * on (applicable for in-place updates) +- * @param flags Options for writing the command to the transaction log + * @return Returns the position pointer of the written update command + */ +- public long write(AddUpdateCommand cmd, long prevPointer, int flags) { ++ public long write(AddUpdateCommand cmd, long prevPointer) { + assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); + + LogCodec codec = new LogCodec(resolver); +@@ -386,14 +358,14 @@ public class TransactionLog implements Closeable { + codec.init(out); + if (cmd.isInPlaceUpdate()) { + codec.writeTag(JavaBinCodec.ARR, 5); +- codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte ++ codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeLong(prevPointer); + codec.writeLong(cmd.prevVersion); + codec.writeSolrInputDocument(cmd.getSolrInputDocument()); + } else { + codec.writeTag(JavaBinCodec.ARR, 3); +- codec.writeInt(UpdateLog.ADD | flags); // should just take one byte ++ codec.writeInt(UpdateLog.ADD); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeSolrInputDocument(cmd.getSolrInputDocument()); + } +@@ -422,7 +394,7 @@ public class TransactionLog implements Closeable { + } + } + +- public long writeDelete(DeleteUpdateCommand cmd, int flags) { ++ public long writeDelete(DeleteUpdateCommand cmd) { + LogCodec codec = new LogCodec(resolver); + + try { +@@ -433,7 +405,7 @@ public class TransactionLog implements Closeable { + MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); + codec.init(out); + codec.writeTag(JavaBinCodec.ARR, 3); +- codec.writeInt(UpdateLog.DELETE | 
flags); // should just take one byte ++ codec.writeInt(UpdateLog.DELETE); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeByteArray(br.bytes, br.offset, br.length); + +@@ -452,7 +424,7 @@ public class TransactionLog implements Closeable { + + } + +- public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { ++ public long writeDeleteByQuery(DeleteUpdateCommand cmd) { + LogCodec codec = new LogCodec(resolver); + try { + checkWriteHeader(codec, null); +@@ -460,7 +432,7 @@ public class TransactionLog implements Closeable { + MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); + codec.init(out); + codec.writeTag(JavaBinCodec.ARR, 3); +- codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte ++ codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeStr(cmd.query); + +@@ -478,7 +450,7 @@ public class TransactionLog implements Closeable { + } + + +- public long writeCommit(CommitUpdateCommand cmd, int flags) { ++ public long writeCommit(CommitUpdateCommand cmd) { + LogCodec codec = new LogCodec(resolver); + synchronized (this) { + try { +@@ -490,7 +462,7 @@ public class TransactionLog implements Closeable { + } + codec.init(fos); + codec.writeTag(JavaBinCodec.ARR, 3); +- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte ++ codec.writeInt(UpdateLog.COMMIT); // should just take one byte + codec.writeLong(cmd.getVersion()); + codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file + +diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java +index 7f821ea..1bda23f 100644 +--- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java ++++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java +@@ -96,6 +96,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + private static final long STATUS_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); + public static String LOG_FILENAME_PATTERN = "%s.%019d"; + public static String TLOG_NAME="tlog"; ++ public static String BUFFER_TLOG_NAME="buffer.tlog"; + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private boolean debug = log.isDebugEnabled(); +@@ -139,11 +140,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + public static final int DELETE_BY_QUERY = 0x03; + public static final int COMMIT = 0x04; + public static final int UPDATE_INPLACE = 0x08; +- // Flag indicating that this is a buffered operation, and that a gap exists before buffering started. +- // for example, if full index replication starts and we are buffering updates, then this flag should +- // be set to indicate that replaying the log would not bring us into sync (i.e. peersync should +- // fail if this flag is set on the last update in the tlog). +- public static final int FLAG_GAP = 0x10; ++ // For backward-compatibility, we should delete this field in 9.0 + public static final int OPERATION_MASK = 0x0f; // mask off flags to get the operation + + /** +@@ -186,8 +183,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + + long id = -1; + protected State state = State.ACTIVE; +- protected int operationFlags; // flags to write in the transaction log with operations (i.e. 
FLAG_GAP) + ++ protected TransactionLog bufferTlog; + protected TransactionLog tlog; + protected TransactionLog prevTlog; + protected final Deque logs = new LinkedList<>(); // list of recent logs, newest first +@@ -206,6 +203,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + protected int maxNumLogsToKeep; + protected int numVersionBuckets; // This should only be used to initialize VersionInfo... the actual number of buckets may be rounded up to a power of two. + protected Long maxVersionFromIndex = null; ++ protected boolean existOldBufferLog = false; + + // keep track of deletes only... this is not updated on an add + protected LinkedHashMap oldDeletes = new LinkedHashMap(numDeletesToKeep) { +@@ -244,7 +242,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + volatile UpdateHandler uhandler; // a core reload can change this reference! + protected volatile boolean cancelApplyBufferUpdate; + List startingVersions; +- int startingOperation; // last operation in the logs on startup + + // metrics + protected Gauge bufferedOpsGauge; +@@ -378,6 +375,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + log.debug("UpdateHandler init: tlogDir=" + tlogDir + ", existing tlogs=" + Arrays.asList(tlogFiles) + ", next id=" + id); + } + ++ String[] oldBufferTlog = getBufferLogList(tlogDir); ++ if (oldBufferTlog != null && oldBufferTlog.length != 0) { ++ existOldBufferLog = true; ++ } + TransactionLog oldLog = null; + for (String oldLogName : tlogFiles) { + File f = new File(tlogDir, oldLogName); +@@ -408,7 +409,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + // TODO: these startingVersions assume that we successfully recover from all non-complete tlogs. 
+ try (RecentUpdates startingUpdates = getRecentUpdates()) { + startingVersions = startingUpdates.getVersions(numRecordsToKeep); +- startingOperation = startingUpdates.getLatestOperation(); + + // populate recent deletes list (since we can't get that info from the index) + for (int i = startingUpdates.deleteList.size() - 1; i >= 0; i--) { +@@ -434,14 +434,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + this.metricManager = manager; + this.registryName = registry; + bufferedOpsGauge = () -> { ++ if (state == State.BUFFERING) { ++ if (bufferTlog == null) return 0; ++ // numRecords counts header as a record ++ return bufferTlog.numRecords() - 1; ++ } + if (tlog == null) { + return 0; + } else if (state == State.APPLYING_BUFFERED) { + // numRecords counts header as a record + return tlog.numRecords() - 1 - recoveryInfo.adds - recoveryInfo.deleteByQuery - recoveryInfo.deletes - recoveryInfo.errors; +- } else if (state == State.BUFFERING) { +- // numRecords counts header as a record +- return tlog.numRecords() - 1; + } else { + return 0; + } +@@ -472,8 +474,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + return startingVersions; + } + +- public int getStartingOperation() { +- return startingOperation; ++ public boolean existOldBufferLog() { ++ return existOldBufferLog; + } + + /* Takes over ownership of the log, keeping it until no longer needed +@@ -509,6 +511,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + logs.addFirst(oldLog); + } + ++ public String[] getBufferLogList(File directory) { ++ final String prefix = BUFFER_TLOG_NAME+'.'; ++ return directory.list((dir, name) -> name.startsWith(prefix)); ++ } ++ ++ /** ++ * Does update from old tlogs (not from buffer tlog)? ++ * If yes we must skip writing {@code cmd} to current tlog ++ */ ++ private boolean updateFromOldTlogs(UpdateCommand cmd) { ++ return (cmd.getFlags() & UpdateCommand.REPLAY) != 0 && state == State.REPLAYING; ++ } ++ + public String[] getLogList(File directory) { + final String prefix = TLOG_NAME+'.'; + String[] names = directory.list(new FilenameFilter() { +@@ -541,14 +556,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + // if ((cmd.getFlags() & UpdateCommand.REPLAY) != 0) return; + + synchronized (this) { +- long pos = -1; ++ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { ++ ensureBufferTlog(); ++ bufferTlog.write(cmd); ++ return; ++ } + ++ long pos = -1; + long prevPointer = getPrevPointerForUpdate(cmd); + + // don't log if we are replaying from another log +- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { ++ if (!updateFromOldTlogs(cmd)) { + ensureLog(); +- pos = tlog.write(cmd, prevPointer, operationFlags); ++ pos = tlog.write(cmd, prevPointer); + } + + if (!clearCaches) { +@@ -556,10 +576,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + // Only currently would be useful for RTG while in recovery mode though. 
+ LogPtr ptr = new LogPtr(pos, cmd.getVersion(), prevPointer); + +- // only update our map if we're not buffering +- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { +- map.put(cmd.getIndexedId(), ptr); +- } ++ map.put(cmd.getIndexedId(), ptr); + + if (trace) { + log.trace("TLOG: added id " + cmd.getPrintableId() + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); +@@ -606,22 +623,21 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + BytesRef br = cmd.getIndexedId(); + + synchronized (this) { +- long pos = -1; ++ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { ++ ensureBufferTlog(); ++ bufferTlog.writeDelete(cmd); ++ return; ++ } + +- // don't log if we are replaying from another log +- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { ++ long pos = -1; ++ if (!updateFromOldTlogs(cmd)) { + ensureLog(); +- pos = tlog.writeDelete(cmd, operationFlags); ++ pos = tlog.writeDelete(cmd); + } + + LogPtr ptr = new LogPtr(pos, cmd.version); +- +- // only update our map if we're not buffering +- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { +- map.put(br, ptr); +- +- oldDeletes.put(br, ptr); +- } ++ map.put(br, ptr); ++ oldDeletes.put(br, ptr); + + if (trace) { + log.trace("TLOG: added delete for id " + cmd.id + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); +@@ -631,15 +647,20 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + + public void deleteByQuery(DeleteUpdateCommand cmd) { + synchronized (this) { ++ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { ++ ensureBufferTlog(); ++ bufferTlog.writeDeleteByQuery(cmd); ++ return; ++ } ++ + long pos = -1; +- // don't log if we are replaying from another log +- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { ++ if (!updateFromOldTlogs(cmd)) { + ensureLog(); +- pos = tlog.writeDeleteByQuery(cmd, operationFlags); ++ pos = tlog.writeDeleteByQuery(cmd); + } + +- // only change our caches if we are not buffering +- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0 && (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { ++ // skip purge our caches in case of tlog replica ++ if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { + // given that we just did a delete-by-query, we don't know what documents were + // affected and hence we must purge our caches. + openRealtimeSearcher(); +@@ -802,7 +823,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + if (prevTlog != null) { + // if we made it through the commit, write a commit command to the log + // TODO: check that this works to cap a tlog we were using to buffer so we don't replay on startup. 
+- prevTlog.writeCommit(cmd, operationFlags); ++ prevTlog.writeCommit(cmd); + + addOldLog(prevTlog, true); + // the old log list will decref when no longer needed +@@ -1152,9 +1173,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + public void copyOverBufferingUpdates(CommitUpdateCommand cuc) { + versionInfo.blockUpdates(); + try { +- operationFlags &= ~FLAG_GAP; +- state = State.ACTIVE; +- copyAndSwitchToNewTlog(cuc); ++ synchronized (this) { ++ state = State.ACTIVE; ++ if (bufferTlog == null) { ++ return; ++ } ++ // by calling this, we won't switch to new tlog (compared to applyBufferedUpdates()) ++ // if we switch to new tlog we can possible lose updates on the next fetch ++ copyOverOldUpdates(cuc.getVersion(), bufferTlog); ++ dropBufferTlog(); ++ } + } finally { + versionInfo.unblockUpdates(); + } +@@ -1165,33 +1193,25 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + * So any updates which hasn't made it to the index is preserved in the current tlog + * @param cuc any updates that have version larger than the version of cuc will be copied over + */ +- public void copyOverOldUpdates(CommitUpdateCommand cuc) { ++ public void commitAndSwitchToNewTlog(CommitUpdateCommand cuc) { + versionInfo.blockUpdates(); + try { +- copyAndSwitchToNewTlog(cuc); ++ synchronized (this) { ++ if (tlog == null) { ++ return; ++ } ++ preCommit(cuc); ++ try { ++ copyOverOldUpdates(cuc.getVersion()); ++ } finally { ++ postCommit(cuc); ++ } ++ } + } finally { + versionInfo.unblockUpdates(); + } + } + +- protected void copyAndSwitchToNewTlog(CommitUpdateCommand cuc) { +- synchronized (this) { +- if (tlog == null) { +- return; +- } +- preCommit(cuc); +- try { +- copyOverOldUpdates(cuc.getVersion()); +- } finally { +- postCommit(cuc); +- } +- } +- } +- +- /** +- * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog +- * @param commitVersion any updates that have version larger than the commitVersion will be copied over +- */ + public void copyOverOldUpdates(long commitVersion) { + TransactionLog oldTlog = prevTlog; + if (oldTlog == null && !logs.isEmpty()) { +@@ -1207,6 +1227,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + log.warn("Exception reading log", e); + return; + } ++ copyOverOldUpdates(commitVersion, oldTlog); ++ } ++ ++ /** ++ * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog ++ * @param commitVersion any updates that have version larger than the commitVersion will be copied over ++ */ ++ public void copyOverOldUpdates(long commitVersion, TransactionLog oldTlog) { + copyOverOldUpdatesMeter.mark(); + + SolrQueryRequest req = new LocalSolrQueryRequest(uhandler.core, +@@ -1270,6 +1298,22 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + } + } + ++ protected void ensureBufferTlog() { ++ if (bufferTlog != null) return; ++ String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); ++ bufferTlog = newTransactionLog(new File(tlogDir, newLogName), globalStrings, false); ++ } ++ ++ // Cleanup old buffer tlogs ++ protected void deleteBufferLogs() { ++ String[] oldBufferTlog = getBufferLogList(tlogDir); ++ if (oldBufferTlog != null && oldBufferTlog.length != 0) { ++ for (String oldBufferLogName : oldBufferTlog) { ++ deleteFile(new File(tlogDir, oldBufferLogName)); ++ } ++ } ++ } ++ + + protected void ensureLog() { + if (tlog == null) { +@@ -1285,7 +1329,7 @@ public class UpdateLog 
implements PluginInfoInitialized, SolrMetricProducer { + // record a commit + log.info("Recording current closed for " + uhandler.core + " log=" + theLog); + CommitUpdateCommand cmd = new CommitUpdateCommand(new LocalSolrQueryRequest(uhandler.core, new ModifiableSolrParams((SolrParams)null)), false); +- theLog.writeCommit(cmd, operationFlags); ++ theLog.writeCommit(cmd); + } + + theLog.deleteOnClose = false; +@@ -1314,6 +1358,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + log.forceClose(); + } + ++ if (bufferTlog != null) { ++ // should not delete bufferTlog on close, existing bufferTlog is a sign for skip peerSync ++ bufferTlog.deleteOnClose = false; ++ bufferTlog.decref(); ++ bufferTlog.forceClose(); ++ } ++ + try { + ExecutorUtil.shutdownAndAwaitTermination(recoveryExecutor); + } catch (Exception e) { +@@ -1347,7 +1398,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + HashMap updates; + List deleteByQueryList; + List deleteList; +- int latestOperation; + + public RecentUpdates(Deque logList) { + this.logList = logList; +@@ -1401,11 +1451,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + return result; + } + +- public int getLatestOperation() { +- return latestOperation; +- } +- +- + private void update() { + int numUpdates = 0; + updateList = new ArrayList<>(logList.size()); +@@ -1431,9 +1476,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + + // TODO: refactor this out so we get common error handling + int opAndFlags = (Integer)entry.get(UpdateLog.FLAGS_IDX); +- if (latestOperation == 0) { +- latestOperation = opAndFlags; +- } + int oper = opAndFlags & UpdateLog.OPERATION_MASK; + long version = (Long) entry.get(UpdateLog.VERSION_IDX); + +@@ -1525,6 +1567,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + tlog.incref(); + logList.addFirst(tlog); + } ++ if (bufferTlog != null) { ++ bufferTlog.incref(); ++ logList.addFirst(bufferTlog); ++ } + } + + // TODO: what if I hand out a list of updates, then do an update, then hand out another list (and +@@ -1542,13 +1588,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + // reading state and acting on it in the distributed update processor + versionInfo.blockUpdates(); + try { +- if (state == State.BUFFERING) { +- log.info("Restarting buffering. previous=" + recoveryInfo); +- } else if (state != State.ACTIVE) { ++ if (state != State.ACTIVE && state != State.BUFFERING) { + // we don't currently have support for handling other states + log.warn("Unexpected state for bufferUpdates: " + state + ", Ignoring request."); + return; + } ++ dropBufferTlog(); ++ deleteBufferLogs(); + + recoveryInfo = new RecoveryInfo(); + +@@ -1556,15 +1602,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + log.info("Starting to buffer updates. " + this); + } + +- // since we blocked updates, this synchronization shouldn't strictly be necessary. +- synchronized (this) { +- recoveryInfo.positionOfStart = tlog == null ? 
0 : tlog.snapshot(); +- } +- + state = State.BUFFERING; +- +- // currently, buffering is only called by recovery, meaning that there is most likely a gap in updates +- operationFlags |= FLAG_GAP; + } finally { + versionInfo.unblockUpdates(); + } +@@ -1580,25 +1618,24 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + log.info("Dropping buffered updates " + this); + } + +- // since we blocked updates, this synchronization shouldn't strictly be necessary. +- synchronized (this) { +- if (tlog != null) { +- tlog.rollback(recoveryInfo.positionOfStart); +- } +- } ++ dropBufferTlog(); + + state = State.ACTIVE; +- operationFlags &= ~FLAG_GAP; +- } catch (IOException e) { +- SolrException.log(log,"Error attempting to roll back log", e); +- return false; +- } +- finally { ++ } finally { + versionInfo.unblockUpdates(); + } + return true; + } + ++ private void dropBufferTlog() { ++ synchronized (this) { ++ if (bufferTlog != null) { ++ bufferTlog.decref(); ++ bufferTlog = null; ++ } ++ } ++ } ++ + + /** Returns the Future to wait on, or null if no replay was needed */ + public Future applyBufferedUpdates() { +@@ -1612,27 +1649,30 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + try { + cancelApplyBufferUpdate = false; + if (state != State.BUFFERING) return null; +- operationFlags &= ~FLAG_GAP; + +- // handle case when no log was even created because no updates +- // were received. +- if (tlog == null) { +- state = State.ACTIVE; +- return null; ++ synchronized (this) { ++ // handle case when no updates were received. ++ if (bufferTlog == null) { ++ state = State.ACTIVE; ++ return null; ++ } ++ bufferTlog.incref(); + } +- tlog.incref(); ++ + state = State.APPLYING_BUFFERED; + } finally { + versionInfo.unblockUpdates(); + } + + if (recoveryExecutor.isShutdown()) { +- tlog.decref(); + throw new RuntimeException("executor is not running..."); + } + ExecutorCompletionService cs = new ExecutorCompletionService<>(recoveryExecutor); +- LogReplayer replayer = new LogReplayer(Arrays.asList(new TransactionLog[]{tlog}), true); +- return cs.submit(replayer, recoveryInfo); ++ LogReplayer replayer = new LogReplayer(Collections.singletonList(bufferTlog), true); ++ return cs.submit(() -> { ++ replayer.run(); ++ dropBufferTlog(); ++ }, recoveryInfo); + } + + public State getState() { +@@ -1903,10 +1943,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + if (!activeLog) { + // if we are replaying an old tlog file, we need to add a commit to the end + // so we don't replay it again if we restart right after. +- +- // if the last operation we replayed had FLAG_GAP set, we want to use that again so we don't lose it +- // as the flag on the last operation. 
+- translog.writeCommit(cmd, operationFlags | (operationAndFlags & ~OPERATION_MASK)); ++ translog.writeCommit(cmd); + } + + try { +@@ -2037,10 +2074,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { + return cmd; + } + +- public void cancelApplyBufferedUpdates() { +- this.cancelApplyBufferUpdate = true; +- } +- + ThreadPoolExecutor recoveryExecutor = new ExecutorUtil.MDCAwareThreadPoolExecutor(0, + Integer.MAX_VALUE, 1, TimeUnit.SECONDS, new SynchronousQueue(), + new DefaultSolrThreadFactory("recoveryExecutor")); +diff --git a/solr/core/src/test/org/apache/solr/search/TestRecovery.java b/solr/core/src/test/org/apache/solr/search/TestRecovery.java +index 1d62207..1b79cee 100644 +--- a/solr/core/src/test/org/apache/solr/search/TestRecovery.java ++++ b/solr/core/src/test/org/apache/solr/search/TestRecovery.java +@@ -24,7 +24,9 @@ import com.codahale.metrics.Gauge; + import com.codahale.metrics.Meter; + import com.codahale.metrics.Metric; + import com.codahale.metrics.MetricRegistry; ++import org.apache.solr.common.util.TimeSource; + import org.apache.solr.metrics.SolrMetricManager; ++import org.apache.solr.util.TimeOut; + import org.noggit.ObjectBuilder; + + import org.slf4j.Logger; +@@ -820,6 +822,7 @@ public class TestRecovery extends SolrTestCaseJ4 { + +"]" + ); + ++ // Note that the v101->v103 are dropped, therefore it does not present in RTG + assertJQ(req("qt","/get", "getVersions","6") + ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" + ); +@@ -929,7 +932,6 @@ public class TestRecovery extends SolrTestCaseJ4 { + ,"=={'versions':["+v105+","+v104+"]}" + ); + +- // this time add some docs first before buffering starts (so tlog won't be at pos 0) + updateJ(jsonAdd(sdoc("id","c100", "_version_",v200)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); + updateJ(jsonAdd(sdoc("id","c101", "_version_",v201)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); + +@@ -957,10 +959,8 @@ public class TestRecovery extends SolrTestCaseJ4 { + +"" +"]" + ); + +- // The updates that were buffered (but never applied) still appear in recent versions! +- // This is good for some uses, but may not be good for others. 
+- assertJQ(req("qt","/get", "getVersions","11") +- ,"=={'versions':["+String.join(",",v206,v205,v204,v203,v201,v200,v105,v104,v103,v102,v101)+"]}" ++ assertJQ(req("qt","/get", "getVersions","6") ++ ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" + ); + + assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state +@@ -1008,13 +1008,9 @@ public class TestRecovery extends SolrTestCaseJ4 { + + + @Test +- public void testBufferingFlags() throws Exception { ++ public void testExistOldBufferLog() throws Exception { + + DirectUpdateHandler2.commitOnClose = false; +- final Semaphore logReplayFinish = new Semaphore(0); +- +- UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); +- + + SolrQueryRequest req = req(); + UpdateHandler uhandler = req.getCore().getUpdateHandler(); +@@ -1024,9 +1020,6 @@ public class TestRecovery extends SolrTestCaseJ4 { + String v101 = getNextVersion(); + String v102 = getNextVersion(); + String v103 = getNextVersion(); +- String v114 = getNextVersion(); +- String v115 = getNextVersion(); +- String v116 = getNextVersion(); + String v117 = getNextVersion(); + + clearIndex(); +@@ -1049,30 +1042,10 @@ public class TestRecovery extends SolrTestCaseJ4 { + uhandler = req.getCore().getUpdateHandler(); + ulog = uhandler.getUpdateLog(); + +- logReplayFinish.acquire(); // wait for replay to finish +- +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last +- +- // +- // Try again to ensure that the previous log replay didn't wipe out our flags +- // +- +- req.close(); +- h.close(); +- createCore(); +- +- req = req(); +- uhandler = req.getCore().getUpdateHandler(); +- ulog = uhandler.getUpdateLog(); +- +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); +- +- // now do some normal non-buffered adds +- updateJ(jsonAdd(sdoc("id","Q4", "_version_",v114)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); +- updateJ(jsonAdd(sdoc("id","Q5", "_version_",v115)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); +- updateJ(jsonAdd(sdoc("id","Q6", "_version_",v116)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); +- assertU(commit()); ++ // the core does not replay updates from buffer tlog on startup ++ assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last + ++ // buffer tlog won't be removed on restart + req.close(); + h.close(); + createCore(); +@@ -1081,10 +1054,9 @@ public class TestRecovery extends SolrTestCaseJ4 { + uhandler = req.getCore().getUpdateHandler(); + ulog = uhandler.getUpdateLog(); + +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); ++ assertTrue(ulog.existOldBufferLog()); + + ulog.bufferUpdates(); +- // simulate receiving no updates + ulog.applyBufferedUpdates(); + updateJ(jsonAdd(sdoc("id","Q7", "_version_",v117)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); // do another add to make sure flags are back to normal + +@@ -1096,10 +1068,12 @@ public class TestRecovery extends SolrTestCaseJ4 { + uhandler = req.getCore().getUpdateHandler(); + ulog = uhandler.getUpdateLog(); + +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7 +- +- logReplayFinish.acquire(); +- assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state ++ assertFalse(ulog.existOldBufferLog()); ++ // Timeout for Q7 get replayed, because it was added on tlog, therefore it will be replayed on restart ++ TimeOut timeout 
= new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); ++ timeout.waitFor("Timeout waiting for finish replay updates", ++ () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); ++ assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); + } finally { + DirectUpdateHandler2.commitOnClose = true; + UpdateLog.testing_logReplayHook = null; +diff --git a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java +index e6bb9a6..1796319 100644 +--- a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java ++++ b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java +@@ -44,6 +44,7 @@ import org.apache.hadoop.hdfs.MiniDFSCluster; + import org.apache.solr.SolrTestCaseJ4; + import org.apache.solr.cloud.hdfs.HdfsTestUtil; + import org.apache.solr.common.util.IOUtils; ++import org.apache.solr.common.util.TimeSource; + import org.apache.solr.request.SolrQueryRequest; + import org.apache.solr.update.DirectUpdateHandler2; + import org.apache.solr.update.HdfsUpdateLog; +@@ -51,6 +52,7 @@ import org.apache.solr.update.UpdateHandler; + import org.apache.solr.update.UpdateLog; + import org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase; + import org.apache.solr.util.BadHdfsThreadsFilter; ++import org.apache.solr.util.TimeOut; + import org.junit.AfterClass; + import org.junit.BeforeClass; + import org.junit.Ignore; +@@ -515,13 +517,9 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { + + + @Test +- public void testBufferingFlags() throws Exception { ++ public void testExistOldBufferLog() throws Exception { + + DirectUpdateHandler2.commitOnClose = false; +- final Semaphore logReplayFinish = new Semaphore(0); +- +- UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); +- + + SolrQueryRequest req = req(); + UpdateHandler uhandler = req.getCore().getUpdateHandler(); +@@ -548,14 +546,10 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { + uhandler = req.getCore().getUpdateHandler(); + ulog = uhandler.getUpdateLog(); + +- logReplayFinish.acquire(); // wait for replay to finish +- +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last +- +- // +- // Try again to ensure that the previous log replay didn't wipe out our flags +- // ++ // the core no longer replay updates from buffer tlog on startup ++ assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last + ++ // buffer tlog won't be removed on restart + req.close(); + h.close(); + createCore(); +@@ -564,23 +558,7 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { + uhandler = req.getCore().getUpdateHandler(); + ulog = uhandler.getUpdateLog(); + +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); +- +- // now do some normal non-buffered adds +- updateJ(jsonAdd(sdoc("id","Q4", "_version_","114")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); +- updateJ(jsonAdd(sdoc("id","Q5", "_version_","115")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); +- updateJ(jsonAdd(sdoc("id","Q6", "_version_","116")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); +- assertU(commit()); +- +- req.close(); +- h.close(); +- createCore(); +- +- req = req(); +- uhandler = req.getCore().getUpdateHandler(); +- ulog = uhandler.getUpdateLog(); +- +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); ++ assertTrue(ulog.existOldBufferLog()); + + ulog.bufferUpdates(); + // 
simulate receiving no updates +@@ -595,10 +573,12 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { + uhandler = req.getCore().getUpdateHandler(); + ulog = uhandler.getUpdateLog(); + +- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7 +- +- logReplayFinish.acquire(); +- assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state ++ assertFalse(ulog.existOldBufferLog()); ++ // Timeout for Q7 get replayed, because it was added on tlog, therefore it will be replayed on restart ++ TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); ++ timeout.waitFor("Timeout waiting for finish replay updates", ++ () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); ++ assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); + } finally { + DirectUpdateHandler2.commitOnClose = true; + UpdateLog.testing_logReplayHook = null; +diff --git a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java +index 1bf4ad4..d2b4b26 100644 +--- a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java ++++ b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java +@@ -35,7 +35,7 @@ public class TransactionLogTest extends LuceneTestCase { + transactionLog.lastAddSize = 2000000000; + AddUpdateCommand updateCommand = new AddUpdateCommand(null); + updateCommand.solrDoc = new SolrInputDocument(); +- transactionLog.write(updateCommand, 0); ++ transactionLog.write(updateCommand); + } + } + diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 66d885362fd..2c2191eda5b 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -325,6 +325,8 @@ Optimizations SolrConstantScoreQuery as well. QWF since v5.4.0 sometimes needlessly internally executed and cached the query. Affects ExpandComponent, ChildDocTransformer, CurrencyFieldType, TermsQParser. (David Smiley) +* SOLR-9922: Write buffering updates to another tlog. (Cao Manh Dat) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index c8f5ae89fbe..966497b0938 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -449,7 +449,6 @@ public class RecoveryStrategy implements Runnable, Closeable { // TODO: perhaps make this grab a new core each time through the loop to handle core reloads? final public void doSyncOrReplicateRecovery(SolrCore core) throws Exception { - boolean replayed = false; boolean successfulRecovery = false; UpdateLog ulog; @@ -500,8 +499,7 @@ public class RecoveryStrategy implements Runnable, Closeable { // when we went down. We may have received updates since then. recentVersions = startingVersions; try { - if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) { - // last operation at the time of startup had the GAP flag set... + if (ulog.existOldBufferLog()) { // this means we were previously doing a full index replication // that probably didn't complete and buffering updates in the // meantime. @@ -542,9 +540,9 @@ public class RecoveryStrategy implements Runnable, Closeable { } LOG.info("Begin buffering updates. 
core=[{}]", coreName); + // recalling buffer updates will drop the old buffer tlog ulog.bufferUpdates(); - replayed = false; - + LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leader.getCoreUrl(), ourUrl); zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING); @@ -603,8 +601,7 @@ public class RecoveryStrategy implements Runnable, Closeable { LOG.info("Replaying updates buffered during PeerSync."); replay(core); - replayed = true; - + // sync success successfulRecovery = true; return; @@ -630,8 +627,7 @@ public class RecoveryStrategy implements Runnable, Closeable { } replayFuture = replay(core); - replayed = true; - + if (isClosed()) { LOG.info("RecoveryStrategy has been closed"); break; @@ -650,21 +646,6 @@ public class RecoveryStrategy implements Runnable, Closeable { } catch (Exception e) { SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e); } finally { - if (!replayed) { - // dropBufferedUpdate()s currently only supports returning to ACTIVE state, which risks additional updates - // being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date. - // For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will - // reset our starting point for playback. - LOG.info("Replay not started, or was not successful... still buffering updates."); - - /** this prev code is retained in case we want to switch strategies. - try { - ulog.dropBufferedUpdates(); - } catch (Exception e) { - SolrException.log(log, "", e); - } - **/ - } if (successfulRecovery) { LOG.info("Registering as Active after recovery."); try { diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index 0a742e3a5ae..aa648dd8869 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -97,7 +97,7 @@ public class ReplicateFromLeader { new ModifiableSolrParams()); CommitUpdateCommand cuc = new CommitUpdateCommand(req, false); cuc.setVersion(Long.parseLong(commitVersion)); - updateLog.copyOverOldUpdates(cuc); + updateLog.commitAndSwitchToNewTlog(cuc); lastVersion = Long.parseLong(commitVersion); } }); diff --git a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java index 3534f622908..f668540325e 100644 --- a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java +++ b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java @@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory; * methods {@link #incref()}, {@link #close()} and {@link #reopenOutputStream()}. *
  • encode the number of records in the tlog file in the last commit record. The number of records will be * decoded and reused if the tlog file is reopened. This is achieved by extending the constructor, and the - * methods {@link #writeCommit(CommitUpdateCommand, int)} and {@link #getReader(long)}.
  • + * methods {@link #writeCommit(CommitUpdateCommand)} and {@link #getReader(long)}. * */ public class CdcrTransactionLog extends TransactionLog { @@ -108,7 +108,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long write(AddUpdateCommand cmd, long prevPointer, int flags) { + public long write(AddUpdateCommand cmd, long prevPointer) { assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); LogCodec codec = new LogCodec(resolver); @@ -125,7 +125,7 @@ public class CdcrTransactionLog extends TransactionLog { codec.init(out); if (cmd.isInPlaceUpdate()) { codec.writeTag(JavaBinCodec.ARR, 6); - codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte + codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeLong(prevPointer); codec.writeLong(cmd.prevVersion); @@ -141,7 +141,7 @@ public class CdcrTransactionLog extends TransactionLog { } else { codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.ADD | flags); // should just take one byte + codec.writeInt(UpdateLog.ADD); // should just take one byte codec.writeLong(cmd.getVersion()); if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { // if the update is received via cdcr source; add extra boolean entry @@ -179,7 +179,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long writeDelete(DeleteUpdateCommand cmd, int flags) { + public long writeDelete(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { @@ -190,7 +190,7 @@ public class CdcrTransactionLog extends TransactionLog { MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeByteArray(br.bytes, br.offset, br.length); if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { @@ -217,7 +217,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { + public long writeDeleteByQuery(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { checkWriteHeader(codec, null); @@ -225,7 +225,7 @@ public class CdcrTransactionLog extends TransactionLog { MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(cmd.query); if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { @@ -249,7 +249,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long writeCommit(CommitUpdateCommand cmd, int flags) { + public long writeCommit(CommitUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); synchronized (this) { try { @@ -261,7 +261,7 @@ public class CdcrTransactionLog extends TransactionLog { } codec.init(fos); codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte + codec.writeInt(UpdateLog.COMMIT); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeTag(JavaBinCodec.INT); // Enforce the encoding of a 
plain integer, to simplify decoding fos.writeInt(numRecords + 1); // the number of records in the file - +1 to account for the commit operation being written diff --git a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java index 6b202044d76..bff16122ecf 100644 --- a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java +++ b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java @@ -352,7 +352,6 @@ public class CdcrUpdateLog extends UpdateLog { long latestVersion = startingUpdates.getMaxRecentVersion(); try { startingVersions = startingUpdates.getVersions(numRecordsToKeep); - startingOperation = startingUpdates.getLatestOperation(); // populate recent deletes list (since we can't get that info from the index) for (int i=startingUpdates.deleteList.size()-1; i>=0; i--) { @@ -389,9 +388,7 @@ public class CdcrUpdateLog extends UpdateLog { */ private void copyBufferedUpdates(File tlogSrc, long offsetSrc, long latestVersion) { recoveryInfo = new RecoveryInfo(); - recoveryInfo.positionOfStart = tlog == null ? 0 : tlog.snapshot(); state = State.BUFFERING; - operationFlags |= FLAG_GAP; ModifiableSolrParams params = new ModifiableSolrParams(); params.set(DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM, DistributedUpdateProcessor.DistribPhase.FROMLEADER.toString()); diff --git a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java index 0f89016a107..8ed7d7ad65a 100644 --- a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java +++ b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java @@ -166,20 +166,6 @@ public class HdfsTransactionLog extends TransactionLog { } return true; } - - // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. - // This should only be used to roll back buffered updates, not actually applied updates. - @Override - public void rollback(long pos) throws IOException { - synchronized (this) { - assert snapshot_size == pos; - ensureFlushed(); - // TODO: how do we rollback with hdfs?? 
We need HDFS-3107 - fos.setWritten(pos); - assert fos.size() == pos; - numRecords = snapshot_numRecords; - } - } private void readHeader(FastInputStream fis) throws IOException { // read existing header @@ -210,7 +196,7 @@ public class HdfsTransactionLog extends TransactionLog { } @Override - public long writeCommit(CommitUpdateCommand cmd, int flags) { + public long writeCommit(CommitUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); synchronized (this) { try { @@ -223,7 +209,7 @@ public class HdfsTransactionLog extends TransactionLog { codec.init(fos); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte + codec.writeInt(UpdateLog.COMMIT); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file diff --git a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java index 7bb74d05bf9..8ca4b1cb3e5 100644 --- a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java +++ b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java @@ -65,37 +65,6 @@ public class HdfsUpdateLog extends UpdateLog { this.confDir = confDir; } - // HACK - // while waiting for HDFS-3107, instead of quickly - // dropping, we slowly apply - // This is somewhat brittle, but current usage - // allows for it - @Override - public boolean dropBufferedUpdates() { - versionInfo.blockUpdates(); - try { - if (state != State.BUFFERING) return false; - - if (log.isInfoEnabled()) { - log.info("Dropping buffered updates " + this); - } - - // since we blocked updates, this synchronization shouldn't strictly be - // necessary. - synchronized (this) { - if (tlog != null) { - // tlog.rollback(recoveryInfo.positionOfStart); - } - } - - state = State.ACTIVE; - operationFlags &= ~FLAG_GAP; - } finally { - versionInfo.unblockUpdates(); - } - return true; - } - @Override public void init(PluginInfo info) { super.init(info); @@ -186,6 +155,11 @@ public class HdfsUpdateLog extends UpdateLog { throw new RuntimeException("Problem creating directory: " + tlogDir, e); } } + + String[] oldBufferTlog = getBufferLogList(fs, tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + existOldBufferLog = true; + } tlogFiles = getLogList(fs, tlogDir); id = getLastLogId() + 1; // add 1 since we will create a new log for the @@ -241,7 +215,6 @@ public class HdfsUpdateLog extends UpdateLog { // non-complete tlogs. 
try (RecentUpdates startingUpdates = getRecentUpdates()) { startingVersions = startingUpdates.getVersions(getNumRecordsToKeep()); - startingOperation = startingUpdates.getLatestOperation(); // populate recent deletes list (since we can't get that info from the // index) @@ -269,6 +242,23 @@ public class HdfsUpdateLog extends UpdateLog { public String getLogDir() { return tlogDir.toUri().toString(); } + + public static String[] getBufferLogList(FileSystem fs, Path tlogDir) { + final String prefix = BUFFER_TLOG_NAME+'.'; + assert fs != null; + FileStatus[] fileStatuses; + try { + fileStatuses = fs.listStatus(tlogDir, path -> path.getName().startsWith(prefix)); + } catch (IOException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Failed on listing old buffer tlog", e); + } + + String[] names = new String[fileStatuses.length]; + for (int i = 0; i < fileStatuses.length; i++) { + names[i] = fileStatuses[i].getPath().getName(); + } + return names; + } public static String[] getLogList(FileSystem fs, Path tlogDir) { final String prefix = TLOG_NAME + '.'; @@ -307,7 +297,35 @@ public class HdfsUpdateLog extends UpdateLog { IOUtils.closeQuietly(fs); } } - + + @Override + protected void ensureBufferTlog() { + if (bufferTlog != null) return; + String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); + bufferTlog = new HdfsTransactionLog(fs, new Path(tlogDir, newLogName), + globalStrings, tlogDfsReplication); + } + + @Override + protected void deleteBufferLogs() { + // Delete old buffer logs + String[] oldBufferTlog = getBufferLogList(fs, tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + for (String oldBufferLogName : oldBufferTlog) { + Path f = new Path(tlogDir, oldBufferLogName); + try { + boolean s = fs.delete(f, false); + if (!s) { + log.error("Could not remove old buffer tlog file:" + f); + } + } catch (IOException e) { + // No need to bubble up this exception, because it won't cause any problems on recovering + log.error("Could not remove old buffer tlog file:" + f, e); + } + } + } + } + @Override protected void ensureLog() { if (tlog == null) { diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java index 96a928cc1a8..2a23896d491 100644 --- a/solr/core/src/java/org/apache/solr/update/TransactionLog.java +++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java @@ -85,9 +85,6 @@ public class TransactionLog implements Closeable { Map globalStringMap = new HashMap<>(); List globalStringList = new ArrayList<>(); - long snapshot_size; - int snapshot_numRecords; - // write a BytesRef as a byte array static final JavaBinCodec.ObjectResolver resolver = new JavaBinCodec.ObjectResolver() { @Override @@ -153,7 +150,7 @@ public class TransactionLog implements Closeable { // Parse tlog id from the filename String filename = tlogFile.getName(); - id = Long.parseLong(filename.substring(filename.indexOf('.') + 1, filename.indexOf('.') + 20)); + id = Long.parseLong(filename.substring(filename.lastIndexOf('.')+1)); this.tlogFile = tlogFile; raf = new RandomAccessFile(this.tlogFile, "rw"); @@ -233,29 +230,6 @@ public class TransactionLog implements Closeable { return true; } - /** takes a snapshot of the current position and number of records - * for later possible rollback, and returns the position */ - public long snapshot() { - synchronized (this) { - snapshot_size = fos.size(); - snapshot_numRecords = numRecords; - return snapshot_size; - } 
- } - - // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. - // This should only be used to roll back buffered updates, not actually applied updates. - public void rollback(long pos) throws IOException { - synchronized (this) { - assert snapshot_size == pos; - fos.flush(); - raf.setLength(pos); - fos.setWritten(pos); - assert fos.size() == pos; - numRecords = snapshot_numRecords; - } - } - public long writeData(Object o) { @SuppressWarnings("resource") final LogCodec codec = new LogCodec(resolver); try { @@ -346,17 +320,16 @@ public class TransactionLog implements Closeable { /** * Writes an add update command to the transaction log. This is not applicable for - * in-place updates; use {@link #write(AddUpdateCommand, long, int)}. + * in-place updates; use {@link #write(AddUpdateCommand, long)}. * (The previous pointer (applicable for in-place updates) is set to -1 while writing * the command to the transaction log.) * @param cmd The add update command to be written - * @param flags Options for writing the command to the transaction log * @return Returns the position pointer of the written update command * - * @see #write(AddUpdateCommand, long, int) + * @see #write(AddUpdateCommand, long) */ - public long write(AddUpdateCommand cmd, int flags) { - return write(cmd, -1, flags); + public long write(AddUpdateCommand cmd) { + return write(cmd, -1); } /** @@ -365,10 +338,9 @@ public class TransactionLog implements Closeable { * @param cmd The add update command to be written * @param prevPointer The pointer in the transaction log which this update depends * on (applicable for in-place updates) - * @param flags Options for writing the command to the transaction log * @return Returns the position pointer of the written update command */ - public long write(AddUpdateCommand cmd, long prevPointer, int flags) { + public long write(AddUpdateCommand cmd, long prevPointer) { assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); LogCodec codec = new LogCodec(resolver); @@ -386,14 +358,14 @@ public class TransactionLog implements Closeable { codec.init(out); if (cmd.isInPlaceUpdate()) { codec.writeTag(JavaBinCodec.ARR, 5); - codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte + codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeLong(prevPointer); codec.writeLong(cmd.prevVersion); codec.writeSolrInputDocument(cmd.getSolrInputDocument()); } else { codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.ADD | flags); // should just take one byte + codec.writeInt(UpdateLog.ADD); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeSolrInputDocument(cmd.getSolrInputDocument()); } @@ -422,7 +394,7 @@ public class TransactionLog implements Closeable { } } - public long writeDelete(DeleteUpdateCommand cmd, int flags) { + public long writeDelete(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { @@ -433,7 +405,7 @@ public class TransactionLog implements Closeable { MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeByteArray(br.bytes, br.offset, br.length); @@ -452,7 +424,7 @@ public class TransactionLog implements Closeable { } - 
public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { + public long writeDeleteByQuery(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { checkWriteHeader(codec, null); @@ -460,7 +432,7 @@ public class TransactionLog implements Closeable { MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(cmd.query); @@ -478,7 +450,7 @@ public class TransactionLog implements Closeable { } - public long writeCommit(CommitUpdateCommand cmd, int flags) { + public long writeCommit(CommitUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); synchronized (this) { try { @@ -490,7 +462,7 @@ public class TransactionLog implements Closeable { } codec.init(fos); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte + codec.writeInt(UpdateLog.COMMIT); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java index 7f821eafc0e..1bda23fc038 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java @@ -96,6 +96,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { private static final long STATUS_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); public static String LOG_FILENAME_PATTERN = "%s.%019d"; public static String TLOG_NAME="tlog"; + public static String BUFFER_TLOG_NAME="buffer.tlog"; private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private boolean debug = log.isDebugEnabled(); @@ -139,11 +140,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { public static final int DELETE_BY_QUERY = 0x03; public static final int COMMIT = 0x04; public static final int UPDATE_INPLACE = 0x08; - // Flag indicating that this is a buffered operation, and that a gap exists before buffering started. - // for example, if full index replication starts and we are buffering updates, then this flag should - // be set to indicate that replaying the log would not bring us into sync (i.e. peersync should - // fail if this flag is set on the last update in the tlog). - public static final int FLAG_GAP = 0x10; + // For backward-compatibility, we should delete this field in 9.0 public static final int OPERATION_MASK = 0x0f; // mask off flags to get the operation /** @@ -186,8 +183,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { long id = -1; protected State state = State.ACTIVE; - protected int operationFlags; // flags to write in the transaction log with operations (i.e. FLAG_GAP) + protected TransactionLog bufferTlog; protected TransactionLog tlog; protected TransactionLog prevTlog; protected final Deque logs = new LinkedList<>(); // list of recent logs, newest first @@ -206,6 +203,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { protected int maxNumLogsToKeep; protected int numVersionBuckets; // This should only be used to initialize VersionInfo... 
the actual number of buckets may be rounded up to a power of two. protected Long maxVersionFromIndex = null; + protected boolean existOldBufferLog = false; // keep track of deletes only... this is not updated on an add protected LinkedHashMap oldDeletes = new LinkedHashMap(numDeletesToKeep) { @@ -244,7 +242,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { volatile UpdateHandler uhandler; // a core reload can change this reference! protected volatile boolean cancelApplyBufferUpdate; List startingVersions; - int startingOperation; // last operation in the logs on startup // metrics protected Gauge bufferedOpsGauge; @@ -378,6 +375,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.debug("UpdateHandler init: tlogDir=" + tlogDir + ", existing tlogs=" + Arrays.asList(tlogFiles) + ", next id=" + id); } + String[] oldBufferTlog = getBufferLogList(tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + existOldBufferLog = true; + } TransactionLog oldLog = null; for (String oldLogName : tlogFiles) { File f = new File(tlogDir, oldLogName); @@ -408,7 +409,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // TODO: these startingVersions assume that we successfully recover from all non-complete tlogs. try (RecentUpdates startingUpdates = getRecentUpdates()) { startingVersions = startingUpdates.getVersions(numRecordsToKeep); - startingOperation = startingUpdates.getLatestOperation(); // populate recent deletes list (since we can't get that info from the index) for (int i = startingUpdates.deleteList.size() - 1; i >= 0; i--) { @@ -434,14 +434,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { this.metricManager = manager; this.registryName = registry; bufferedOpsGauge = () -> { + if (state == State.BUFFERING) { + if (bufferTlog == null) return 0; + // numRecords counts header as a record + return bufferTlog.numRecords() - 1; + } if (tlog == null) { return 0; } else if (state == State.APPLYING_BUFFERED) { // numRecords counts header as a record return tlog.numRecords() - 1 - recoveryInfo.adds - recoveryInfo.deleteByQuery - recoveryInfo.deletes - recoveryInfo.errors; - } else if (state == State.BUFFERING) { - // numRecords counts header as a record - return tlog.numRecords() - 1; } else { return 0; } @@ -472,8 +474,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { return startingVersions; } - public int getStartingOperation() { - return startingOperation; + public boolean existOldBufferLog() { + return existOldBufferLog; } /* Takes over ownership of the log, keeping it until no longer needed @@ -509,6 +511,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { logs.addFirst(oldLog); } + public String[] getBufferLogList(File directory) { + final String prefix = BUFFER_TLOG_NAME+'.'; + return directory.list((dir, name) -> name.startsWith(prefix)); + } + + /** + * Does update from old tlogs (not from buffer tlog)? 
+ * If yes we must skip writing {@code cmd} to current tlog + */ + private boolean updateFromOldTlogs(UpdateCommand cmd) { + return (cmd.getFlags() & UpdateCommand.REPLAY) != 0 && state == State.REPLAYING; + } + public String[] getLogList(File directory) { final String prefix = TLOG_NAME+'.'; String[] names = directory.list(new FilenameFilter() { @@ -541,14 +556,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // if ((cmd.getFlags() & UpdateCommand.REPLAY) != 0) return; synchronized (this) { - long pos = -1; + if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { + ensureBufferTlog(); + bufferTlog.write(cmd); + return; + } + long pos = -1; long prevPointer = getPrevPointerForUpdate(cmd); // don't log if we are replaying from another log - if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (!updateFromOldTlogs(cmd)) { ensureLog(); - pos = tlog.write(cmd, prevPointer, operationFlags); + pos = tlog.write(cmd, prevPointer); } if (!clearCaches) { @@ -556,10 +576,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // Only currently would be useful for RTG while in recovery mode though. LogPtr ptr = new LogPtr(pos, cmd.getVersion(), prevPointer); - // only update our map if we're not buffering - if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { - map.put(cmd.getIndexedId(), ptr); - } + map.put(cmd.getIndexedId(), ptr); if (trace) { log.trace("TLOG: added id " + cmd.getPrintableId() + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); @@ -606,22 +623,21 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { BytesRef br = cmd.getIndexedId(); synchronized (this) { - long pos = -1; + if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { + ensureBufferTlog(); + bufferTlog.writeDelete(cmd); + return; + } - // don't log if we are replaying from another log - if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + long pos = -1; + if (!updateFromOldTlogs(cmd)) { ensureLog(); - pos = tlog.writeDelete(cmd, operationFlags); + pos = tlog.writeDelete(cmd); } LogPtr ptr = new LogPtr(pos, cmd.version); - - // only update our map if we're not buffering - if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { - map.put(br, ptr); - - oldDeletes.put(br, ptr); - } + map.put(br, ptr); + oldDeletes.put(br, ptr); if (trace) { log.trace("TLOG: added delete for id " + cmd.id + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); @@ -631,15 +647,20 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { public void deleteByQuery(DeleteUpdateCommand cmd) { synchronized (this) { - long pos = -1; - // don't log if we are replaying from another log - if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { - ensureLog(); - pos = tlog.writeDeleteByQuery(cmd, operationFlags); + if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { + ensureBufferTlog(); + bufferTlog.writeDeleteByQuery(cmd); + return; } - // only change our caches if we are not buffering - if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0 && (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { + long pos = -1; + if (!updateFromOldTlogs(cmd)) { + ensureLog(); + pos = tlog.writeDeleteByQuery(cmd); + } + + // skip purge our caches in case of tlog replica + if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { // given that we just did a delete-by-query, we don't know what documents were // affected and hence we must purge our caches. 
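The hunks above rewire the write path so that buffered commands bypass the regular tlog (and the id-to-pointer map) entirely, while replays of old tlogs are recognized by flag plus state rather than by flag alone. As a rough stand-alone model of that routing (none of this is Solr code; the class names, flag constants and string payloads are illustrative stand-ins for UpdateCommand, TransactionLog and the serialized records):

    // Minimal stand-alone model of the buffered-update routing in the add()/delete() hunks above.
    // All names here (MiniUpdateLog, Cmd, BUFFERING, REPLAY) are illustrative, not Solr's.
    public class BufferRoutingSketch {
      static final int BUFFERING = 0x01;
      static final int REPLAY = 0x02;

      enum State { ACTIVE, BUFFERING_STATE, REPLAYING }

      static class Cmd {
        final int flags;
        Cmd(int flags) { this.flags = flags; }
      }

      static class MiniUpdateLog {
        State state = State.ACTIVE;
        final StringBuilder tlog = new StringBuilder();       // stands in for the regular tlog
        final StringBuilder bufferTlog = new StringBuilder(); // stands in for buffer.tlog

        void add(Cmd cmd, String payload) {
          // Buffered commands go straight to the buffer tlog and return early,
          // so they never reach the regular tlog or the lookup maps.
          if ((cmd.flags & BUFFERING) != 0) {
            bufferTlog.append(payload).append('\n');
            return;
          }
          // A replay of an old tlog is only skipped when the log is actually in REPLAYING state.
          boolean updateFromOldTlogs = (cmd.flags & REPLAY) != 0 && state == State.REPLAYING;
          if (!updateFromOldTlogs) {
            tlog.append(payload).append('\n');
          }
        }
      }

      public static void main(String[] args) {
        MiniUpdateLog ulog = new MiniUpdateLog();
        ulog.add(new Cmd(0), "doc1");         // normal update -> regular tlog
        ulog.add(new Cmd(BUFFERING), "doc2"); // buffered update -> buffer tlog only
        System.out.println("tlog:\n" + ulog.tlog);
        System.out.println("buffer:\n" + ulog.bufferTlog);
      }
    }

The early return is what makes the old "only update our map if we're not buffering" checks unnecessary: buffered commands never get that far.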
openRealtimeSearcher(); @@ -802,7 +823,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { if (prevTlog != null) { // if we made it through the commit, write a commit command to the log // TODO: check that this works to cap a tlog we were using to buffer so we don't replay on startup. - prevTlog.writeCommit(cmd, operationFlags); + prevTlog.writeCommit(cmd); addOldLog(prevTlog, true); // the old log list will decref when no longer needed @@ -1152,9 +1173,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { public void copyOverBufferingUpdates(CommitUpdateCommand cuc) { versionInfo.blockUpdates(); try { - operationFlags &= ~FLAG_GAP; - state = State.ACTIVE; - copyAndSwitchToNewTlog(cuc); + synchronized (this) { + state = State.ACTIVE; + if (bufferTlog == null) { + return; + } + // by calling this, we won't switch to new tlog (compared to applyBufferedUpdates()) + // if we switch to new tlog we can possible lose updates on the next fetch + copyOverOldUpdates(cuc.getVersion(), bufferTlog); + dropBufferTlog(); + } } finally { versionInfo.unblockUpdates(); } @@ -1165,33 +1193,25 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { * So any updates which hasn't made it to the index is preserved in the current tlog * @param cuc any updates that have version larger than the version of cuc will be copied over */ - public void copyOverOldUpdates(CommitUpdateCommand cuc) { + public void commitAndSwitchToNewTlog(CommitUpdateCommand cuc) { versionInfo.blockUpdates(); try { - copyAndSwitchToNewTlog(cuc); + synchronized (this) { + if (tlog == null) { + return; + } + preCommit(cuc); + try { + copyOverOldUpdates(cuc.getVersion()); + } finally { + postCommit(cuc); + } + } } finally { versionInfo.unblockUpdates(); } } - protected void copyAndSwitchToNewTlog(CommitUpdateCommand cuc) { - synchronized (this) { - if (tlog == null) { - return; - } - preCommit(cuc); - try { - copyOverOldUpdates(cuc.getVersion()); - } finally { - postCommit(cuc); - } - } - } - - /** - * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog - * @param commitVersion any updates that have version larger than the commitVersion will be copied over - */ public void copyOverOldUpdates(long commitVersion) { TransactionLog oldTlog = prevTlog; if (oldTlog == null && !logs.isEmpty()) { @@ -1207,6 +1227,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.warn("Exception reading log", e); return; } + copyOverOldUpdates(commitVersion, oldTlog); + } + + /** + * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog + * @param commitVersion any updates that have version larger than the commitVersion will be copied over + */ + public void copyOverOldUpdates(long commitVersion, TransactionLog oldTlog) { copyOverOldUpdatesMeter.mark(); SolrQueryRequest req = new LocalSolrQueryRequest(uhandler.core, @@ -1270,6 +1298,22 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { } } + protected void ensureBufferTlog() { + if (bufferTlog != null) return; + String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); + bufferTlog = newTransactionLog(new File(tlogDir, newLogName), globalStrings, false); + } + + // Cleanup old buffer tlogs + protected void deleteBufferLogs() { + String[] oldBufferTlog = getBufferLogList(tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + for (String oldBufferLogName : 
oldBufferTlog) { + deleteFile(new File(tlogDir, oldBufferLogName)); + } + } + } + protected void ensureLog() { if (tlog == null) { @@ -1285,7 +1329,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // record a commit log.info("Recording current closed for " + uhandler.core + " log=" + theLog); CommitUpdateCommand cmd = new CommitUpdateCommand(new LocalSolrQueryRequest(uhandler.core, new ModifiableSolrParams((SolrParams)null)), false); - theLog.writeCommit(cmd, operationFlags); + theLog.writeCommit(cmd); } theLog.deleteOnClose = false; @@ -1314,6 +1358,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.forceClose(); } + if (bufferTlog != null) { + // should not delete bufferTlog on close, existing bufferTlog is a sign for skip peerSync + bufferTlog.deleteOnClose = false; + bufferTlog.decref(); + bufferTlog.forceClose(); + } + try { ExecutorUtil.shutdownAndAwaitTermination(recoveryExecutor); } catch (Exception e) { @@ -1347,7 +1398,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { HashMap updates; List deleteByQueryList; List deleteList; - int latestOperation; public RecentUpdates(Deque logList) { this.logList = logList; @@ -1401,11 +1451,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { return result; } - public int getLatestOperation() { - return latestOperation; - } - - private void update() { int numUpdates = 0; updateList = new ArrayList<>(logList.size()); @@ -1431,9 +1476,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // TODO: refactor this out so we get common error handling int opAndFlags = (Integer)entry.get(UpdateLog.FLAGS_IDX); - if (latestOperation == 0) { - latestOperation = opAndFlags; - } int oper = opAndFlags & UpdateLog.OPERATION_MASK; long version = (Long) entry.get(UpdateLog.VERSION_IDX); @@ -1525,6 +1567,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { tlog.incref(); logList.addFirst(tlog); } + if (bufferTlog != null) { + bufferTlog.incref(); + logList.addFirst(bufferTlog); + } } // TODO: what if I hand out a list of updates, then do an update, then hand out another list (and @@ -1542,13 +1588,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // reading state and acting on it in the distributed update processor versionInfo.blockUpdates(); try { - if (state == State.BUFFERING) { - log.info("Restarting buffering. previous=" + recoveryInfo); - } else if (state != State.ACTIVE) { + if (state != State.ACTIVE && state != State.BUFFERING) { // we don't currently have support for handling other states log.warn("Unexpected state for bufferUpdates: " + state + ", Ignoring request."); return; } + dropBufferTlog(); + deleteBufferLogs(); recoveryInfo = new RecoveryInfo(); @@ -1556,15 +1602,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.info("Starting to buffer updates. " + this); } - // since we blocked updates, this synchronization shouldn't strictly be necessary. - synchronized (this) { - recoveryInfo.positionOfStart = tlog == null ? 
0 : tlog.snapshot(); - } - state = State.BUFFERING; - - // currently, buffering is only called by recovery, meaning that there is most likely a gap in updates - operationFlags |= FLAG_GAP; } finally { versionInfo.unblockUpdates(); } @@ -1580,25 +1618,24 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.info("Dropping buffered updates " + this); } - // since we blocked updates, this synchronization shouldn't strictly be necessary. - synchronized (this) { - if (tlog != null) { - tlog.rollback(recoveryInfo.positionOfStart); - } - } + dropBufferTlog(); state = State.ACTIVE; - operationFlags &= ~FLAG_GAP; - } catch (IOException e) { - SolrException.log(log,"Error attempting to roll back log", e); - return false; - } - finally { + } finally { versionInfo.unblockUpdates(); } return true; } + private void dropBufferTlog() { + synchronized (this) { + if (bufferTlog != null) { + bufferTlog.decref(); + bufferTlog = null; + } + } + } + /** Returns the Future to wait on, or null if no replay was needed */ public Future applyBufferedUpdates() { @@ -1612,27 +1649,30 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { try { cancelApplyBufferUpdate = false; if (state != State.BUFFERING) return null; - operationFlags &= ~FLAG_GAP; - // handle case when no log was even created because no updates - // were received. - if (tlog == null) { - state = State.ACTIVE; - return null; + synchronized (this) { + // handle case when no updates were received. + if (bufferTlog == null) { + state = State.ACTIVE; + return null; + } + bufferTlog.incref(); } - tlog.incref(); + state = State.APPLYING_BUFFERED; } finally { versionInfo.unblockUpdates(); } if (recoveryExecutor.isShutdown()) { - tlog.decref(); throw new RuntimeException("executor is not running..."); } ExecutorCompletionService cs = new ExecutorCompletionService<>(recoveryExecutor); - LogReplayer replayer = new LogReplayer(Arrays.asList(new TransactionLog[]{tlog}), true); - return cs.submit(replayer, recoveryInfo); + LogReplayer replayer = new LogReplayer(Collections.singletonList(bufferTlog), true); + return cs.submit(() -> { + replayer.run(); + dropBufferTlog(); + }, recoveryInfo); } public State getState() { @@ -1903,10 +1943,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { if (!activeLog) { // if we are replaying an old tlog file, we need to add a commit to the end // so we don't replay it again if we restart right after. - - // if the last operation we replayed had FLAG_GAP set, we want to use that again so we don't lose it - // as the flag on the last operation. 
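One detail worth noting in the rewritten applyBufferedUpdates() above is how the replay is submitted: rather than handing the replayer to the executor directly, the task wraps the replay plus the buffer-drop step, and the caller still receives recoveryInfo through the returned Future. A minimal sketch of that pattern with plain JDK types only (the replayer and result object below are placeholders, not Solr's LogReplayer or RecoveryInfo):

    import java.util.concurrent.ExecutorCompletionService;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    public class ReplayThenCleanupSketch {
      public static void main(String[] args) throws Exception {
        ExecutorService recoveryExecutor = Executors.newSingleThreadExecutor();
        ExecutorCompletionService<String> cs = new ExecutorCompletionService<>(recoveryExecutor);

        Runnable replayer = () -> System.out.println("replaying buffered updates...");
        String recoveryInfo = "recoveryInfo"; // stands in for the RecoveryInfo result object

        // Run the replayer, then perform the cleanup step, inside a single submitted task;
        // the fixed result object is handed back through the Future on completion.
        Future<String> future = cs.submit(() -> {
          replayer.run();
          System.out.println("dropping buffer tlog");
        }, recoveryInfo);

        System.out.println("result: " + future.get());
        recoveryExecutor.shutdown();
      }
    }

ExecutorCompletionService.submit(Runnable, V) is what lets the cleanup ride inside the task while the caller still gets a typed result, which matches the shape of the cs.submit(...) call in the hunk.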
- translog.writeCommit(cmd, operationFlags | (operationAndFlags & ~OPERATION_MASK)); + translog.writeCommit(cmd); } try { @@ -2037,10 +2074,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { return cmd; } - public void cancelApplyBufferedUpdates() { - this.cancelApplyBufferUpdate = true; - } - ThreadPoolExecutor recoveryExecutor = new ExecutorUtil.MDCAwareThreadPoolExecutor(0, Integer.MAX_VALUE, 1, TimeUnit.SECONDS, new SynchronousQueue(), new DefaultSolrThreadFactory("recoveryExecutor")); diff --git a/solr/core/src/test/org/apache/solr/search/TestRecovery.java b/solr/core/src/test/org/apache/solr/search/TestRecovery.java index 1d622076c99..1b79cee61c1 100644 --- a/solr/core/src/test/org/apache/solr/search/TestRecovery.java +++ b/solr/core/src/test/org/apache/solr/search/TestRecovery.java @@ -24,7 +24,9 @@ import com.codahale.metrics.Gauge; import com.codahale.metrics.Meter; import com.codahale.metrics.Metric; import com.codahale.metrics.MetricRegistry; +import org.apache.solr.common.util.TimeSource; import org.apache.solr.metrics.SolrMetricManager; +import org.apache.solr.util.TimeOut; import org.noggit.ObjectBuilder; import org.slf4j.Logger; @@ -820,6 +822,7 @@ public class TestRecovery extends SolrTestCaseJ4 { +"]" ); + // Note that the v101->v103 are dropped, therefore it does not present in RTG assertJQ(req("qt","/get", "getVersions","6") ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" ); @@ -929,7 +932,6 @@ public class TestRecovery extends SolrTestCaseJ4 { ,"=={'versions':["+v105+","+v104+"]}" ); - // this time add some docs first before buffering starts (so tlog won't be at pos 0) updateJ(jsonAdd(sdoc("id","c100", "_version_",v200)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); updateJ(jsonAdd(sdoc("id","c101", "_version_",v201)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); @@ -957,10 +959,8 @@ public class TestRecovery extends SolrTestCaseJ4 { +"" +"]" ); - // The updates that were buffered (but never applied) still appear in recent versions! - // This is good for some uses, but may not be good for others. 
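For orientation while reading the test changes that follow: the buffer tlog introduced earlier in this patch is an ordinary file named with the same "%s.%019d" pattern as regular tlogs, just with the buffer.tlog prefix, and startup merely checks whether any such file exists. A self-contained sketch of that naming and discovery, using a throwaway temp directory in place of the core's tlog dir (the helper and constant names mirror the patch; the temp-dir handling is only for the demo):

    import java.io.File;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.util.Arrays;
    import java.util.Locale;

    public class BufferTlogNamingSketch {
      static final String LOG_FILENAME_PATTERN = "%s.%019d";
      static final String BUFFER_TLOG_NAME = "buffer.tlog";

      // Mirrors getBufferLogList(File): any file starting with "buffer.tlog." counts as an old buffer log.
      static String[] getBufferLogList(File directory) {
        final String prefix = BUFFER_TLOG_NAME + '.';
        return directory.list((dir, name) -> name.startsWith(prefix));
      }

      public static void main(String[] args) throws IOException {
        File tlogDir = Files.createTempDirectory("tlog").toFile();

        // Same scheme as ensureBufferTlog(): a zero-padded nanoTime suffix keeps the names sortable.
        String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime());
        Files.createFile(new File(tlogDir, newLogName).toPath());

        String[] found = getBufferLogList(tlogDir);
        boolean existOldBufferLog = found != null && found.length != 0;
        System.out.println("found " + Arrays.toString(found) + ", existOldBufferLog=" + existOldBufferLog);
      }
    }

Because the buffer tlog is deliberately left on disk at close (deleteOnClose = false), its mere presence on the next startup is enough to set existOldBufferLog and make the replica skip PeerSync, which is what testExistOldBufferLog below asserts.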
- assertJQ(req("qt","/get", "getVersions","11") - ,"=={'versions':["+String.join(",",v206,v205,v204,v203,v201,v200,v105,v104,v103,v102,v101)+"]}" + assertJQ(req("qt","/get", "getVersions","6") + ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" ); assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state @@ -1008,13 +1008,9 @@ public class TestRecovery extends SolrTestCaseJ4 { @Test - public void testBufferingFlags() throws Exception { + public void testExistOldBufferLog() throws Exception { DirectUpdateHandler2.commitOnClose = false; - final Semaphore logReplayFinish = new Semaphore(0); - - UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); - SolrQueryRequest req = req(); UpdateHandler uhandler = req.getCore().getUpdateHandler(); @@ -1024,9 +1020,6 @@ public class TestRecovery extends SolrTestCaseJ4 { String v101 = getNextVersion(); String v102 = getNextVersion(); String v103 = getNextVersion(); - String v114 = getNextVersion(); - String v115 = getNextVersion(); - String v116 = getNextVersion(); String v117 = getNextVersion(); clearIndex(); @@ -1049,14 +1042,10 @@ public class TestRecovery extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - logReplayFinish.acquire(); // wait for replay to finish - - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last - - // - // Try again to ensure that the previous log replay didn't wipe out our flags - // + // the core does not replay updates from buffer tlog on startup + assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last + // buffer tlog won't be removed on restart req.close(); h.close(); createCore(); @@ -1065,26 +1054,9 @@ public class TestRecovery extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); - - // now do some normal non-buffered adds - updateJ(jsonAdd(sdoc("id","Q4", "_version_",v114)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q5", "_version_",v115)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q6", "_version_",v116)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - assertU(commit()); - - req.close(); - h.close(); - createCore(); - - req = req(); - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); + assertTrue(ulog.existOldBufferLog()); ulog.bufferUpdates(); - // simulate receiving no updates ulog.applyBufferedUpdates(); updateJ(jsonAdd(sdoc("id","Q7", "_version_",v117)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); // do another add to make sure flags are back to normal @@ -1096,10 +1068,12 @@ public class TestRecovery extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7 - - logReplayFinish.acquire(); - assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state + assertFalse(ulog.existOldBufferLog()); + // Timeout for Q7 get replayed, because it was added on tlog, therefore it will be replayed on restart + TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); + timeout.waitFor("Timeout waiting for finish replay updates", + 
() -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); + assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); } finally { DirectUpdateHandler2.commitOnClose = true; UpdateLog.testing_logReplayHook = null; diff --git a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java index e6bb9a6edb0..1796319295d 100644 --- a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java +++ b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java @@ -44,6 +44,7 @@ import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.cloud.hdfs.HdfsTestUtil; import org.apache.solr.common.util.IOUtils; +import org.apache.solr.common.util.TimeSource; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.update.DirectUpdateHandler2; import org.apache.solr.update.HdfsUpdateLog; @@ -51,6 +52,7 @@ import org.apache.solr.update.UpdateHandler; import org.apache.solr.update.UpdateLog; import org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase; import org.apache.solr.util.BadHdfsThreadsFilter; +import org.apache.solr.util.TimeOut; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Ignore; @@ -515,13 +517,9 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { @Test - public void testBufferingFlags() throws Exception { + public void testExistOldBufferLog() throws Exception { DirectUpdateHandler2.commitOnClose = false; - final Semaphore logReplayFinish = new Semaphore(0); - - UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); - SolrQueryRequest req = req(); UpdateHandler uhandler = req.getCore().getUpdateHandler(); @@ -548,14 +546,10 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - logReplayFinish.acquire(); // wait for replay to finish - - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last - - // - // Try again to ensure that the previous log replay didn't wipe out our flags - // + // the core no longer replay updates from buffer tlog on startup + assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last + // buffer tlog won't be removed on restart req.close(); h.close(); createCore(); @@ -564,23 +558,7 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); - - // now do some normal non-buffered adds - updateJ(jsonAdd(sdoc("id","Q4", "_version_","114")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q5", "_version_","115")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q6", "_version_","116")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - assertU(commit()); - - req.close(); - h.close(); - createCore(); - - req = req(); - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); + assertTrue(ulog.existOldBufferLog()); ulog.bufferUpdates(); // simulate receiving no updates @@ -595,10 +573,12 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & 
UpdateLog.FLAG_GAP) == 0); // check flags on Q7 - - logReplayFinish.acquire(); - assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state + assertFalse(ulog.existOldBufferLog()); + // Timeout for Q7 get replayed, because it was added on tlog, therefore it will be replayed on restart + TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); + timeout.waitFor("Timeout waiting for finish replay updates", + () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); + assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); } finally { DirectUpdateHandler2.commitOnClose = true; UpdateLog.testing_logReplayHook = null; diff --git a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java index 1bf4ad41978..d2b4b26df01 100644 --- a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java +++ b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java @@ -35,7 +35,7 @@ public class TransactionLogTest extends LuceneTestCase { transactionLog.lastAddSize = 2000000000; AddUpdateCommand updateCommand = new AddUpdateCommand(null); updateCommand.solrDoc = new SolrInputDocument(); - transactionLog.write(updateCommand, 0); + transactionLog.write(updateCommand); } } From 7720d7307446ced74f35199b6c1a85d408c4c4d9 Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Mon, 4 Jun 2018 11:37:47 +0700 Subject: [PATCH 06/38] Remove file that get accident committed from previous commit --- SOLR-9922.patch | 1294 ----------------------------------------------- 1 file changed, 1294 deletions(-) delete mode 100644 SOLR-9922.patch diff --git a/SOLR-9922.patch b/SOLR-9922.patch deleted file mode 100644 index 052abf4041a..00000000000 --- a/SOLR-9922.patch +++ /dev/null @@ -1,1294 +0,0 @@ -diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java -index c8f5ae8..966497b 100644 ---- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java -+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java -@@ -449,7 +449,6 @@ public class RecoveryStrategy implements Runnable, Closeable { - - // TODO: perhaps make this grab a new core each time through the loop to handle core reloads? - final public void doSyncOrReplicateRecovery(SolrCore core) throws Exception { -- boolean replayed = false; - boolean successfulRecovery = false; - - UpdateLog ulog; -@@ -500,8 +499,7 @@ public class RecoveryStrategy implements Runnable, Closeable { - // when we went down. We may have received updates since then. - recentVersions = startingVersions; - try { -- if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) { -- // last operation at the time of startup had the GAP flag set... -+ if (ulog.existOldBufferLog()) { - // this means we were previously doing a full index replication - // that probably didn't complete and buffering updates in the - // meantime. -@@ -542,9 +540,9 @@ public class RecoveryStrategy implements Runnable, Closeable { - } - - LOG.info("Begin buffering updates. 
core=[{}]", coreName); -+ // recalling buffer updates will drop the old buffer tlog - ulog.bufferUpdates(); -- replayed = false; -- -+ - LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leader.getCoreUrl(), - ourUrl); - zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING); -@@ -603,8 +601,7 @@ public class RecoveryStrategy implements Runnable, Closeable { - - LOG.info("Replaying updates buffered during PeerSync."); - replay(core); -- replayed = true; -- -+ - // sync success - successfulRecovery = true; - return; -@@ -630,8 +627,7 @@ public class RecoveryStrategy implements Runnable, Closeable { - } - - replayFuture = replay(core); -- replayed = true; -- -+ - if (isClosed()) { - LOG.info("RecoveryStrategy has been closed"); - break; -@@ -650,21 +646,6 @@ public class RecoveryStrategy implements Runnable, Closeable { - } catch (Exception e) { - SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e); - } finally { -- if (!replayed) { -- // dropBufferedUpdate()s currently only supports returning to ACTIVE state, which risks additional updates -- // being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date. -- // For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will -- // reset our starting point for playback. -- LOG.info("Replay not started, or was not successful... still buffering updates."); -- -- /** this prev code is retained in case we want to switch strategies. -- try { -- ulog.dropBufferedUpdates(); -- } catch (Exception e) { -- SolrException.log(log, "", e); -- } -- **/ -- } - if (successfulRecovery) { - LOG.info("Registering as Active after recovery."); - try { -diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java -index 0a742e3..aa648dd 100644 ---- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java -+++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java -@@ -97,7 +97,7 @@ public class ReplicateFromLeader { - new ModifiableSolrParams()); - CommitUpdateCommand cuc = new CommitUpdateCommand(req, false); - cuc.setVersion(Long.parseLong(commitVersion)); -- updateLog.copyOverOldUpdates(cuc); -+ updateLog.commitAndSwitchToNewTlog(cuc); - lastVersion = Long.parseLong(commitVersion); - } - }); -diff --git a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java -index 3534f62..f668540 100644 ---- a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java -+++ b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java -@@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory; - * methods {@link #incref()}, {@link #close()} and {@link #reopenOutputStream()}. - *
  • encode the number of records in the tlog file in the last commit record. The number of records will be - * decoded and reuse if the tlog file is reopened. This is achieved by extending the constructor, and the -- * methods {@link #writeCommit(CommitUpdateCommand, int)} and {@link #getReader(long)}.
  • -+ * methods {@link #writeCommit(CommitUpdateCommand)} and {@link #getReader(long)}. - * - */ - public class CdcrTransactionLog extends TransactionLog { -@@ -108,7 +108,7 @@ public class CdcrTransactionLog extends TransactionLog { - } - - @Override -- public long write(AddUpdateCommand cmd, long prevPointer, int flags) { -+ public long write(AddUpdateCommand cmd, long prevPointer) { - assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); - - LogCodec codec = new LogCodec(resolver); -@@ -125,7 +125,7 @@ public class CdcrTransactionLog extends TransactionLog { - codec.init(out); - if (cmd.isInPlaceUpdate()) { - codec.writeTag(JavaBinCodec.ARR, 6); -- codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte -+ codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeLong(prevPointer); - codec.writeLong(cmd.prevVersion); -@@ -141,7 +141,7 @@ public class CdcrTransactionLog extends TransactionLog { - - } else { - codec.writeTag(JavaBinCodec.ARR, 4); -- codec.writeInt(UpdateLog.ADD | flags); // should just take one byte -+ codec.writeInt(UpdateLog.ADD); // should just take one byte - codec.writeLong(cmd.getVersion()); - if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { - // if the update is received via cdcr source; add extra boolean entry -@@ -179,7 +179,7 @@ public class CdcrTransactionLog extends TransactionLog { - } - - @Override -- public long writeDelete(DeleteUpdateCommand cmd, int flags) { -+ public long writeDelete(DeleteUpdateCommand cmd) { - LogCodec codec = new LogCodec(resolver); - - try { -@@ -190,7 +190,7 @@ public class CdcrTransactionLog extends TransactionLog { - MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); - codec.init(out); - codec.writeTag(JavaBinCodec.ARR, 4); -- codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte -+ codec.writeInt(UpdateLog.DELETE); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeByteArray(br.bytes, br.offset, br.length); - if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { -@@ -217,7 +217,7 @@ public class CdcrTransactionLog extends TransactionLog { - } - - @Override -- public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { -+ public long writeDeleteByQuery(DeleteUpdateCommand cmd) { - LogCodec codec = new LogCodec(resolver); - try { - checkWriteHeader(codec, null); -@@ -225,7 +225,7 @@ public class CdcrTransactionLog extends TransactionLog { - MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); - codec.init(out); - codec.writeTag(JavaBinCodec.ARR, 4); -- codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte -+ codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeStr(cmd.query); - if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { -@@ -249,7 +249,7 @@ public class CdcrTransactionLog extends TransactionLog { - } - - @Override -- public long writeCommit(CommitUpdateCommand cmd, int flags) { -+ public long writeCommit(CommitUpdateCommand cmd) { - LogCodec codec = new LogCodec(resolver); - synchronized (this) { - try { -@@ -261,7 +261,7 @@ public class CdcrTransactionLog extends TransactionLog { - } - codec.init(fos); - codec.writeTag(JavaBinCodec.ARR, 4); -- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte -+ 
codec.writeInt(UpdateLog.COMMIT); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeTag(JavaBinCodec.INT); // Enforce the encoding of a plain integer, to simplify decoding - fos.writeInt(numRecords + 1); // the number of records in the file - +1 to account for the commit operation being written -diff --git a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java -index 6b20204..bff1612 100644 ---- a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java -+++ b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java -@@ -352,7 +352,6 @@ public class CdcrUpdateLog extends UpdateLog { - long latestVersion = startingUpdates.getMaxRecentVersion(); - try { - startingVersions = startingUpdates.getVersions(numRecordsToKeep); -- startingOperation = startingUpdates.getLatestOperation(); - - // populate recent deletes list (since we can't get that info from the index) - for (int i=startingUpdates.deleteList.size()-1; i>=0; i--) { -@@ -389,9 +388,7 @@ public class CdcrUpdateLog extends UpdateLog { - */ - private void copyBufferedUpdates(File tlogSrc, long offsetSrc, long latestVersion) { - recoveryInfo = new RecoveryInfo(); -- recoveryInfo.positionOfStart = tlog == null ? 0 : tlog.snapshot(); - state = State.BUFFERING; -- operationFlags |= FLAG_GAP; - - ModifiableSolrParams params = new ModifiableSolrParams(); - params.set(DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM, DistributedUpdateProcessor.DistribPhase.FROMLEADER.toString()); -diff --git a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java -index 0f89016..8ed7d7a 100644 ---- a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java -+++ b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java -@@ -166,20 +166,6 @@ public class HdfsTransactionLog extends TransactionLog { - } - return true; - } -- -- // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. -- // This should only be used to roll back buffered updates, not actually applied updates. -- @Override -- public void rollback(long pos) throws IOException { -- synchronized (this) { -- assert snapshot_size == pos; -- ensureFlushed(); -- // TODO: how do we rollback with hdfs?? 
We need HDFS-3107 -- fos.setWritten(pos); -- assert fos.size() == pos; -- numRecords = snapshot_numRecords; -- } -- } - - private void readHeader(FastInputStream fis) throws IOException { - // read existing header -@@ -210,7 +196,7 @@ public class HdfsTransactionLog extends TransactionLog { - } - - @Override -- public long writeCommit(CommitUpdateCommand cmd, int flags) { -+ public long writeCommit(CommitUpdateCommand cmd) { - LogCodec codec = new LogCodec(resolver); - synchronized (this) { - try { -@@ -223,7 +209,7 @@ public class HdfsTransactionLog extends TransactionLog { - - codec.init(fos); - codec.writeTag(JavaBinCodec.ARR, 3); -- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte -+ codec.writeInt(UpdateLog.COMMIT); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file - -diff --git a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java -index 7bb74d0..8ca4b1c 100644 ---- a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java -+++ b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java -@@ -65,37 +65,6 @@ public class HdfsUpdateLog extends UpdateLog { - this.confDir = confDir; - } - -- // HACK -- // while waiting for HDFS-3107, instead of quickly -- // dropping, we slowly apply -- // This is somewhat brittle, but current usage -- // allows for it -- @Override -- public boolean dropBufferedUpdates() { -- versionInfo.blockUpdates(); -- try { -- if (state != State.BUFFERING) return false; -- -- if (log.isInfoEnabled()) { -- log.info("Dropping buffered updates " + this); -- } -- -- // since we blocked updates, this synchronization shouldn't strictly be -- // necessary. -- synchronized (this) { -- if (tlog != null) { -- // tlog.rollback(recoveryInfo.positionOfStart); -- } -- } -- -- state = State.ACTIVE; -- operationFlags &= ~FLAG_GAP; -- } finally { -- versionInfo.unblockUpdates(); -- } -- return true; -- } -- - @Override - public void init(PluginInfo info) { - super.init(info); -@@ -186,6 +155,11 @@ public class HdfsUpdateLog extends UpdateLog { - throw new RuntimeException("Problem creating directory: " + tlogDir, e); - } - } -+ -+ String[] oldBufferTlog = getBufferLogList(fs, tlogDir); -+ if (oldBufferTlog != null && oldBufferTlog.length != 0) { -+ existOldBufferLog = true; -+ } - - tlogFiles = getLogList(fs, tlogDir); - id = getLastLogId() + 1; // add 1 since we will create a new log for the -@@ -241,7 +215,6 @@ public class HdfsUpdateLog extends UpdateLog { - // non-complete tlogs. 
- try (RecentUpdates startingUpdates = getRecentUpdates()) { - startingVersions = startingUpdates.getVersions(getNumRecordsToKeep()); -- startingOperation = startingUpdates.getLatestOperation(); - - // populate recent deletes list (since we can't get that info from the - // index) -@@ -269,6 +242,23 @@ public class HdfsUpdateLog extends UpdateLog { - public String getLogDir() { - return tlogDir.toUri().toString(); - } -+ -+ public static String[] getBufferLogList(FileSystem fs, Path tlogDir) { -+ final String prefix = BUFFER_TLOG_NAME+'.'; -+ assert fs != null; -+ FileStatus[] fileStatuses; -+ try { -+ fileStatuses = fs.listStatus(tlogDir, path -> path.getName().startsWith(prefix)); -+ } catch (IOException e) { -+ throw new SolrException(ErrorCode.SERVER_ERROR, "Failed on listing old buffer tlog", e); -+ } -+ -+ String[] names = new String[fileStatuses.length]; -+ for (int i = 0; i < fileStatuses.length; i++) { -+ names[i] = fileStatuses[i].getPath().getName(); -+ } -+ return names; -+ } - - public static String[] getLogList(FileSystem fs, Path tlogDir) { - final String prefix = TLOG_NAME + '.'; -@@ -307,7 +297,35 @@ public class HdfsUpdateLog extends UpdateLog { - IOUtils.closeQuietly(fs); - } - } -- -+ -+ @Override -+ protected void ensureBufferTlog() { -+ if (bufferTlog != null) return; -+ String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); -+ bufferTlog = new HdfsTransactionLog(fs, new Path(tlogDir, newLogName), -+ globalStrings, tlogDfsReplication); -+ } -+ -+ @Override -+ protected void deleteBufferLogs() { -+ // Delete old buffer logs -+ String[] oldBufferTlog = getBufferLogList(fs, tlogDir); -+ if (oldBufferTlog != null && oldBufferTlog.length != 0) { -+ for (String oldBufferLogName : oldBufferTlog) { -+ Path f = new Path(tlogDir, oldBufferLogName); -+ try { -+ boolean s = fs.delete(f, false); -+ if (!s) { -+ log.error("Could not remove old buffer tlog file:" + f); -+ } -+ } catch (IOException e) { -+ // No need to bubble up this exception, because it won't cause any problems on recovering -+ log.error("Could not remove old buffer tlog file:" + f, e); -+ } -+ } -+ } -+ } -+ - @Override - protected void ensureLog() { - if (tlog == null) { -diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java -index 96a928c..2a23896 100644 ---- a/solr/core/src/java/org/apache/solr/update/TransactionLog.java -+++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java -@@ -85,9 +85,6 @@ public class TransactionLog implements Closeable { - Map globalStringMap = new HashMap<>(); - List globalStringList = new ArrayList<>(); - -- long snapshot_size; -- int snapshot_numRecords; -- - // write a BytesRef as a byte array - static final JavaBinCodec.ObjectResolver resolver = new JavaBinCodec.ObjectResolver() { - @Override -@@ -153,7 +150,7 @@ public class TransactionLog implements Closeable { - - // Parse tlog id from the filename - String filename = tlogFile.getName(); -- id = Long.parseLong(filename.substring(filename.indexOf('.') + 1, filename.indexOf('.') + 20)); -+ id = Long.parseLong(filename.substring(filename.lastIndexOf('.')+1)); - - this.tlogFile = tlogFile; - raf = new RandomAccessFile(this.tlogFile, "rw"); -@@ -233,29 +230,6 @@ public class TransactionLog implements Closeable { - return true; - } - -- /** takes a snapshot of the current position and number of records -- * for later possible rollback, and returns the position */ -- public long 
snapshot() { -- synchronized (this) { -- snapshot_size = fos.size(); -- snapshot_numRecords = numRecords; -- return snapshot_size; -- } -- } -- -- // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. -- // This should only be used to roll back buffered updates, not actually applied updates. -- public void rollback(long pos) throws IOException { -- synchronized (this) { -- assert snapshot_size == pos; -- fos.flush(); -- raf.setLength(pos); -- fos.setWritten(pos); -- assert fos.size() == pos; -- numRecords = snapshot_numRecords; -- } -- } -- - public long writeData(Object o) { - @SuppressWarnings("resource") final LogCodec codec = new LogCodec(resolver); - try { -@@ -346,17 +320,16 @@ public class TransactionLog implements Closeable { - - /** - * Writes an add update command to the transaction log. This is not applicable for -- * in-place updates; use {@link #write(AddUpdateCommand, long, int)}. -+ * in-place updates; use {@link #write(AddUpdateCommand, long)}. - * (The previous pointer (applicable for in-place updates) is set to -1 while writing - * the command to the transaction log.) - * @param cmd The add update command to be written -- * @param flags Options for writing the command to the transaction log - * @return Returns the position pointer of the written update command - * -- * @see #write(AddUpdateCommand, long, int) -+ * @see #write(AddUpdateCommand, long) - */ -- public long write(AddUpdateCommand cmd, int flags) { -- return write(cmd, -1, flags); -+ public long write(AddUpdateCommand cmd) { -+ return write(cmd, -1); - } - - /** -@@ -365,10 +338,9 @@ public class TransactionLog implements Closeable { - * @param cmd The add update command to be written - * @param prevPointer The pointer in the transaction log which this update depends - * on (applicable for in-place updates) -- * @param flags Options for writing the command to the transaction log - * @return Returns the position pointer of the written update command - */ -- public long write(AddUpdateCommand cmd, long prevPointer, int flags) { -+ public long write(AddUpdateCommand cmd, long prevPointer) { - assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); - - LogCodec codec = new LogCodec(resolver); -@@ -386,14 +358,14 @@ public class TransactionLog implements Closeable { - codec.init(out); - if (cmd.isInPlaceUpdate()) { - codec.writeTag(JavaBinCodec.ARR, 5); -- codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte -+ codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeLong(prevPointer); - codec.writeLong(cmd.prevVersion); - codec.writeSolrInputDocument(cmd.getSolrInputDocument()); - } else { - codec.writeTag(JavaBinCodec.ARR, 3); -- codec.writeInt(UpdateLog.ADD | flags); // should just take one byte -+ codec.writeInt(UpdateLog.ADD); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeSolrInputDocument(cmd.getSolrInputDocument()); - } -@@ -422,7 +394,7 @@ public class TransactionLog implements Closeable { - } - } - -- public long writeDelete(DeleteUpdateCommand cmd, int flags) { -+ public long writeDelete(DeleteUpdateCommand cmd) { - LogCodec codec = new LogCodec(resolver); - - try { -@@ -433,7 +405,7 @@ public class TransactionLog implements Closeable { - MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); - codec.init(out); - codec.writeTag(JavaBinCodec.ARR, 3); -- codec.writeInt(UpdateLog.DELETE | 
flags); // should just take one byte -+ codec.writeInt(UpdateLog.DELETE); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeByteArray(br.bytes, br.offset, br.length); - -@@ -452,7 +424,7 @@ public class TransactionLog implements Closeable { - - } - -- public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { -+ public long writeDeleteByQuery(DeleteUpdateCommand cmd) { - LogCodec codec = new LogCodec(resolver); - try { - checkWriteHeader(codec, null); -@@ -460,7 +432,7 @@ public class TransactionLog implements Closeable { - MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); - codec.init(out); - codec.writeTag(JavaBinCodec.ARR, 3); -- codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte -+ codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeStr(cmd.query); - -@@ -478,7 +450,7 @@ public class TransactionLog implements Closeable { - } - - -- public long writeCommit(CommitUpdateCommand cmd, int flags) { -+ public long writeCommit(CommitUpdateCommand cmd) { - LogCodec codec = new LogCodec(resolver); - synchronized (this) { - try { -@@ -490,7 +462,7 @@ public class TransactionLog implements Closeable { - } - codec.init(fos); - codec.writeTag(JavaBinCodec.ARR, 3); -- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte -+ codec.writeInt(UpdateLog.COMMIT); // should just take one byte - codec.writeLong(cmd.getVersion()); - codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file - -diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java -index 7f821ea..1bda23f 100644 ---- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java -+++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java -@@ -96,6 +96,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - private static final long STATUS_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); - public static String LOG_FILENAME_PATTERN = "%s.%019d"; - public static String TLOG_NAME="tlog"; -+ public static String BUFFER_TLOG_NAME="buffer.tlog"; - - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private boolean debug = log.isDebugEnabled(); -@@ -139,11 +140,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - public static final int DELETE_BY_QUERY = 0x03; - public static final int COMMIT = 0x04; - public static final int UPDATE_INPLACE = 0x08; -- // Flag indicating that this is a buffered operation, and that a gap exists before buffering started. -- // for example, if full index replication starts and we are buffering updates, then this flag should -- // be set to indicate that replaying the log would not bring us into sync (i.e. peersync should -- // fail if this flag is set on the last update in the tlog). -- public static final int FLAG_GAP = 0x10; -+ // For backward-compatibility, we should delete this field in 9.0 - public static final int OPERATION_MASK = 0x0f; // mask off flags to get the operation - - /** -@@ -186,8 +183,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - - long id = -1; - protected State state = State.ACTIVE; -- protected int operationFlags; // flags to write in the transaction log with operations (i.e. 
FLAG_GAP) - -+ protected TransactionLog bufferTlog; - protected TransactionLog tlog; - protected TransactionLog prevTlog; - protected final Deque logs = new LinkedList<>(); // list of recent logs, newest first -@@ -206,6 +203,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - protected int maxNumLogsToKeep; - protected int numVersionBuckets; // This should only be used to initialize VersionInfo... the actual number of buckets may be rounded up to a power of two. - protected Long maxVersionFromIndex = null; -+ protected boolean existOldBufferLog = false; - - // keep track of deletes only... this is not updated on an add - protected LinkedHashMap oldDeletes = new LinkedHashMap(numDeletesToKeep) { -@@ -244,7 +242,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - volatile UpdateHandler uhandler; // a core reload can change this reference! - protected volatile boolean cancelApplyBufferUpdate; - List startingVersions; -- int startingOperation; // last operation in the logs on startup - - // metrics - protected Gauge bufferedOpsGauge; -@@ -378,6 +375,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - log.debug("UpdateHandler init: tlogDir=" + tlogDir + ", existing tlogs=" + Arrays.asList(tlogFiles) + ", next id=" + id); - } - -+ String[] oldBufferTlog = getBufferLogList(tlogDir); -+ if (oldBufferTlog != null && oldBufferTlog.length != 0) { -+ existOldBufferLog = true; -+ } - TransactionLog oldLog = null; - for (String oldLogName : tlogFiles) { - File f = new File(tlogDir, oldLogName); -@@ -408,7 +409,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - // TODO: these startingVersions assume that we successfully recover from all non-complete tlogs. 
- try (RecentUpdates startingUpdates = getRecentUpdates()) { - startingVersions = startingUpdates.getVersions(numRecordsToKeep); -- startingOperation = startingUpdates.getLatestOperation(); - - // populate recent deletes list (since we can't get that info from the index) - for (int i = startingUpdates.deleteList.size() - 1; i >= 0; i--) { -@@ -434,14 +434,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - this.metricManager = manager; - this.registryName = registry; - bufferedOpsGauge = () -> { -+ if (state == State.BUFFERING) { -+ if (bufferTlog == null) return 0; -+ // numRecords counts header as a record -+ return bufferTlog.numRecords() - 1; -+ } - if (tlog == null) { - return 0; - } else if (state == State.APPLYING_BUFFERED) { - // numRecords counts header as a record - return tlog.numRecords() - 1 - recoveryInfo.adds - recoveryInfo.deleteByQuery - recoveryInfo.deletes - recoveryInfo.errors; -- } else if (state == State.BUFFERING) { -- // numRecords counts header as a record -- return tlog.numRecords() - 1; - } else { - return 0; - } -@@ -472,8 +474,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - return startingVersions; - } - -- public int getStartingOperation() { -- return startingOperation; -+ public boolean existOldBufferLog() { -+ return existOldBufferLog; - } - - /* Takes over ownership of the log, keeping it until no longer needed -@@ -509,6 +511,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - logs.addFirst(oldLog); - } - -+ public String[] getBufferLogList(File directory) { -+ final String prefix = BUFFER_TLOG_NAME+'.'; -+ return directory.list((dir, name) -> name.startsWith(prefix)); -+ } -+ -+ /** -+ * Does update from old tlogs (not from buffer tlog)? -+ * If yes we must skip writing {@code cmd} to current tlog -+ */ -+ private boolean updateFromOldTlogs(UpdateCommand cmd) { -+ return (cmd.getFlags() & UpdateCommand.REPLAY) != 0 && state == State.REPLAYING; -+ } -+ - public String[] getLogList(File directory) { - final String prefix = TLOG_NAME+'.'; - String[] names = directory.list(new FilenameFilter() { -@@ -541,14 +556,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - // if ((cmd.getFlags() & UpdateCommand.REPLAY) != 0) return; - - synchronized (this) { -- long pos = -1; -+ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { -+ ensureBufferTlog(); -+ bufferTlog.write(cmd); -+ return; -+ } - -+ long pos = -1; - long prevPointer = getPrevPointerForUpdate(cmd); - - // don't log if we are replaying from another log -- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { -+ if (!updateFromOldTlogs(cmd)) { - ensureLog(); -- pos = tlog.write(cmd, prevPointer, operationFlags); -+ pos = tlog.write(cmd, prevPointer); - } - - if (!clearCaches) { -@@ -556,10 +576,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - // Only currently would be useful for RTG while in recovery mode though. 
- LogPtr ptr = new LogPtr(pos, cmd.getVersion(), prevPointer); - -- // only update our map if we're not buffering -- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { -- map.put(cmd.getIndexedId(), ptr); -- } -+ map.put(cmd.getIndexedId(), ptr); - - if (trace) { - log.trace("TLOG: added id " + cmd.getPrintableId() + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); -@@ -606,22 +623,21 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - BytesRef br = cmd.getIndexedId(); - - synchronized (this) { -- long pos = -1; -+ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { -+ ensureBufferTlog(); -+ bufferTlog.writeDelete(cmd); -+ return; -+ } - -- // don't log if we are replaying from another log -- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { -+ long pos = -1; -+ if (!updateFromOldTlogs(cmd)) { - ensureLog(); -- pos = tlog.writeDelete(cmd, operationFlags); -+ pos = tlog.writeDelete(cmd); - } - - LogPtr ptr = new LogPtr(pos, cmd.version); -- -- // only update our map if we're not buffering -- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { -- map.put(br, ptr); -- -- oldDeletes.put(br, ptr); -- } -+ map.put(br, ptr); -+ oldDeletes.put(br, ptr); - - if (trace) { - log.trace("TLOG: added delete for id " + cmd.id + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); -@@ -631,15 +647,20 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - - public void deleteByQuery(DeleteUpdateCommand cmd) { - synchronized (this) { -+ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { -+ ensureBufferTlog(); -+ bufferTlog.writeDeleteByQuery(cmd); -+ return; -+ } -+ - long pos = -1; -- // don't log if we are replaying from another log -- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { -+ if (!updateFromOldTlogs(cmd)) { - ensureLog(); -- pos = tlog.writeDeleteByQuery(cmd, operationFlags); -+ pos = tlog.writeDeleteByQuery(cmd); - } - -- // only change our caches if we are not buffering -- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0 && (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { -+ // skip purge our caches in case of tlog replica -+ if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { - // given that we just did a delete-by-query, we don't know what documents were - // affected and hence we must purge our caches. - openRealtimeSearcher(); -@@ -802,7 +823,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - if (prevTlog != null) { - // if we made it through the commit, write a commit command to the log - // TODO: check that this works to cap a tlog we were using to buffer so we don't replay on startup. 
-- prevTlog.writeCommit(cmd, operationFlags); -+ prevTlog.writeCommit(cmd); - - addOldLog(prevTlog, true); - // the old log list will decref when no longer needed -@@ -1152,9 +1173,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - public void copyOverBufferingUpdates(CommitUpdateCommand cuc) { - versionInfo.blockUpdates(); - try { -- operationFlags &= ~FLAG_GAP; -- state = State.ACTIVE; -- copyAndSwitchToNewTlog(cuc); -+ synchronized (this) { -+ state = State.ACTIVE; -+ if (bufferTlog == null) { -+ return; -+ } -+ // by calling this, we won't switch to new tlog (compared to applyBufferedUpdates()) -+ // if we switch to new tlog we can possible lose updates on the next fetch -+ copyOverOldUpdates(cuc.getVersion(), bufferTlog); -+ dropBufferTlog(); -+ } - } finally { - versionInfo.unblockUpdates(); - } -@@ -1165,33 +1193,25 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - * So any updates which hasn't made it to the index is preserved in the current tlog - * @param cuc any updates that have version larger than the version of cuc will be copied over - */ -- public void copyOverOldUpdates(CommitUpdateCommand cuc) { -+ public void commitAndSwitchToNewTlog(CommitUpdateCommand cuc) { - versionInfo.blockUpdates(); - try { -- copyAndSwitchToNewTlog(cuc); -+ synchronized (this) { -+ if (tlog == null) { -+ return; -+ } -+ preCommit(cuc); -+ try { -+ copyOverOldUpdates(cuc.getVersion()); -+ } finally { -+ postCommit(cuc); -+ } -+ } - } finally { - versionInfo.unblockUpdates(); - } - } - -- protected void copyAndSwitchToNewTlog(CommitUpdateCommand cuc) { -- synchronized (this) { -- if (tlog == null) { -- return; -- } -- preCommit(cuc); -- try { -- copyOverOldUpdates(cuc.getVersion()); -- } finally { -- postCommit(cuc); -- } -- } -- } -- -- /** -- * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog -- * @param commitVersion any updates that have version larger than the commitVersion will be copied over -- */ - public void copyOverOldUpdates(long commitVersion) { - TransactionLog oldTlog = prevTlog; - if (oldTlog == null && !logs.isEmpty()) { -@@ -1207,6 +1227,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - log.warn("Exception reading log", e); - return; - } -+ copyOverOldUpdates(commitVersion, oldTlog); -+ } -+ -+ /** -+ * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog -+ * @param commitVersion any updates that have version larger than the commitVersion will be copied over -+ */ -+ public void copyOverOldUpdates(long commitVersion, TransactionLog oldTlog) { - copyOverOldUpdatesMeter.mark(); - - SolrQueryRequest req = new LocalSolrQueryRequest(uhandler.core, -@@ -1270,6 +1298,22 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - } - } - -+ protected void ensureBufferTlog() { -+ if (bufferTlog != null) return; -+ String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); -+ bufferTlog = newTransactionLog(new File(tlogDir, newLogName), globalStrings, false); -+ } -+ -+ // Cleanup old buffer tlogs -+ protected void deleteBufferLogs() { -+ String[] oldBufferTlog = getBufferLogList(tlogDir); -+ if (oldBufferTlog != null && oldBufferTlog.length != 0) { -+ for (String oldBufferLogName : oldBufferTlog) { -+ deleteFile(new File(tlogDir, oldBufferLogName)); -+ } -+ } -+ } -+ - - protected void ensureLog() { - if (tlog == null) { -@@ -1285,7 +1329,7 @@ public class UpdateLog 
implements PluginInfoInitialized, SolrMetricProducer { - // record a commit - log.info("Recording current closed for " + uhandler.core + " log=" + theLog); - CommitUpdateCommand cmd = new CommitUpdateCommand(new LocalSolrQueryRequest(uhandler.core, new ModifiableSolrParams((SolrParams)null)), false); -- theLog.writeCommit(cmd, operationFlags); -+ theLog.writeCommit(cmd); - } - - theLog.deleteOnClose = false; -@@ -1314,6 +1358,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - log.forceClose(); - } - -+ if (bufferTlog != null) { -+ // should not delete bufferTlog on close, existing bufferTlog is a sign for skip peerSync -+ bufferTlog.deleteOnClose = false; -+ bufferTlog.decref(); -+ bufferTlog.forceClose(); -+ } -+ - try { - ExecutorUtil.shutdownAndAwaitTermination(recoveryExecutor); - } catch (Exception e) { -@@ -1347,7 +1398,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - HashMap updates; - List deleteByQueryList; - List deleteList; -- int latestOperation; - - public RecentUpdates(Deque logList) { - this.logList = logList; -@@ -1401,11 +1451,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - return result; - } - -- public int getLatestOperation() { -- return latestOperation; -- } -- -- - private void update() { - int numUpdates = 0; - updateList = new ArrayList<>(logList.size()); -@@ -1431,9 +1476,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - - // TODO: refactor this out so we get common error handling - int opAndFlags = (Integer)entry.get(UpdateLog.FLAGS_IDX); -- if (latestOperation == 0) { -- latestOperation = opAndFlags; -- } - int oper = opAndFlags & UpdateLog.OPERATION_MASK; - long version = (Long) entry.get(UpdateLog.VERSION_IDX); - -@@ -1525,6 +1567,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - tlog.incref(); - logList.addFirst(tlog); - } -+ if (bufferTlog != null) { -+ bufferTlog.incref(); -+ logList.addFirst(bufferTlog); -+ } - } - - // TODO: what if I hand out a list of updates, then do an update, then hand out another list (and -@@ -1542,13 +1588,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - // reading state and acting on it in the distributed update processor - versionInfo.blockUpdates(); - try { -- if (state == State.BUFFERING) { -- log.info("Restarting buffering. previous=" + recoveryInfo); -- } else if (state != State.ACTIVE) { -+ if (state != State.ACTIVE && state != State.BUFFERING) { - // we don't currently have support for handling other states - log.warn("Unexpected state for bufferUpdates: " + state + ", Ignoring request."); - return; - } -+ dropBufferTlog(); -+ deleteBufferLogs(); - - recoveryInfo = new RecoveryInfo(); - -@@ -1556,15 +1602,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - log.info("Starting to buffer updates. " + this); - } - -- // since we blocked updates, this synchronization shouldn't strictly be necessary. -- synchronized (this) { -- recoveryInfo.positionOfStart = tlog == null ? 
0 : tlog.snapshot(); -- } -- - state = State.BUFFERING; -- -- // currently, buffering is only called by recovery, meaning that there is most likely a gap in updates -- operationFlags |= FLAG_GAP; - } finally { - versionInfo.unblockUpdates(); - } -@@ -1580,25 +1618,24 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - log.info("Dropping buffered updates " + this); - } - -- // since we blocked updates, this synchronization shouldn't strictly be necessary. -- synchronized (this) { -- if (tlog != null) { -- tlog.rollback(recoveryInfo.positionOfStart); -- } -- } -+ dropBufferTlog(); - - state = State.ACTIVE; -- operationFlags &= ~FLAG_GAP; -- } catch (IOException e) { -- SolrException.log(log,"Error attempting to roll back log", e); -- return false; -- } -- finally { -+ } finally { - versionInfo.unblockUpdates(); - } - return true; - } - -+ private void dropBufferTlog() { -+ synchronized (this) { -+ if (bufferTlog != null) { -+ bufferTlog.decref(); -+ bufferTlog = null; -+ } -+ } -+ } -+ - - /** Returns the Future to wait on, or null if no replay was needed */ - public Future applyBufferedUpdates() { -@@ -1612,27 +1649,30 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - try { - cancelApplyBufferUpdate = false; - if (state != State.BUFFERING) return null; -- operationFlags &= ~FLAG_GAP; - -- // handle case when no log was even created because no updates -- // were received. -- if (tlog == null) { -- state = State.ACTIVE; -- return null; -+ synchronized (this) { -+ // handle case when no updates were received. -+ if (bufferTlog == null) { -+ state = State.ACTIVE; -+ return null; -+ } -+ bufferTlog.incref(); - } -- tlog.incref(); -+ - state = State.APPLYING_BUFFERED; - } finally { - versionInfo.unblockUpdates(); - } - - if (recoveryExecutor.isShutdown()) { -- tlog.decref(); - throw new RuntimeException("executor is not running..."); - } - ExecutorCompletionService cs = new ExecutorCompletionService<>(recoveryExecutor); -- LogReplayer replayer = new LogReplayer(Arrays.asList(new TransactionLog[]{tlog}), true); -- return cs.submit(replayer, recoveryInfo); -+ LogReplayer replayer = new LogReplayer(Collections.singletonList(bufferTlog), true); -+ return cs.submit(() -> { -+ replayer.run(); -+ dropBufferTlog(); -+ }, recoveryInfo); - } - - public State getState() { -@@ -1903,10 +1943,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - if (!activeLog) { - // if we are replaying an old tlog file, we need to add a commit to the end - // so we don't replay it again if we restart right after. -- -- // if the last operation we replayed had FLAG_GAP set, we want to use that again so we don't lose it -- // as the flag on the last operation. 
-- translog.writeCommit(cmd, operationFlags | (operationAndFlags & ~OPERATION_MASK)); -+ translog.writeCommit(cmd); - } - - try { -@@ -2037,10 +2074,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { - return cmd; - } - -- public void cancelApplyBufferedUpdates() { -- this.cancelApplyBufferUpdate = true; -- } -- - ThreadPoolExecutor recoveryExecutor = new ExecutorUtil.MDCAwareThreadPoolExecutor(0, - Integer.MAX_VALUE, 1, TimeUnit.SECONDS, new SynchronousQueue(), - new DefaultSolrThreadFactory("recoveryExecutor")); -diff --git a/solr/core/src/test/org/apache/solr/search/TestRecovery.java b/solr/core/src/test/org/apache/solr/search/TestRecovery.java -index 1d62207..1b79cee 100644 ---- a/solr/core/src/test/org/apache/solr/search/TestRecovery.java -+++ b/solr/core/src/test/org/apache/solr/search/TestRecovery.java -@@ -24,7 +24,9 @@ import com.codahale.metrics.Gauge; - import com.codahale.metrics.Meter; - import com.codahale.metrics.Metric; - import com.codahale.metrics.MetricRegistry; -+import org.apache.solr.common.util.TimeSource; - import org.apache.solr.metrics.SolrMetricManager; -+import org.apache.solr.util.TimeOut; - import org.noggit.ObjectBuilder; - - import org.slf4j.Logger; -@@ -820,6 +822,7 @@ public class TestRecovery extends SolrTestCaseJ4 { - +"]" - ); - -+ // Note that the v101->v103 are dropped, therefore it does not present in RTG - assertJQ(req("qt","/get", "getVersions","6") - ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" - ); -@@ -929,7 +932,6 @@ public class TestRecovery extends SolrTestCaseJ4 { - ,"=={'versions':["+v105+","+v104+"]}" - ); - -- // this time add some docs first before buffering starts (so tlog won't be at pos 0) - updateJ(jsonAdd(sdoc("id","c100", "_version_",v200)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","c101", "_version_",v201)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - -@@ -957,10 +959,8 @@ public class TestRecovery extends SolrTestCaseJ4 { - +"" +"]" - ); - -- // The updates that were buffered (but never applied) still appear in recent versions! -- // This is good for some uses, but may not be good for others. 
-- assertJQ(req("qt","/get", "getVersions","11") -- ,"=={'versions':["+String.join(",",v206,v205,v204,v203,v201,v200,v105,v104,v103,v102,v101)+"]}" -+ assertJQ(req("qt","/get", "getVersions","6") -+ ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" - ); - - assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state -@@ -1008,13 +1008,9 @@ public class TestRecovery extends SolrTestCaseJ4 { - - - @Test -- public void testBufferingFlags() throws Exception { -+ public void testExistOldBufferLog() throws Exception { - - DirectUpdateHandler2.commitOnClose = false; -- final Semaphore logReplayFinish = new Semaphore(0); -- -- UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); -- - - SolrQueryRequest req = req(); - UpdateHandler uhandler = req.getCore().getUpdateHandler(); -@@ -1024,9 +1020,6 @@ public class TestRecovery extends SolrTestCaseJ4 { - String v101 = getNextVersion(); - String v102 = getNextVersion(); - String v103 = getNextVersion(); -- String v114 = getNextVersion(); -- String v115 = getNextVersion(); -- String v116 = getNextVersion(); - String v117 = getNextVersion(); - - clearIndex(); -@@ -1049,30 +1042,10 @@ public class TestRecovery extends SolrTestCaseJ4 { - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - -- logReplayFinish.acquire(); // wait for replay to finish -- -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last -- -- // -- // Try again to ensure that the previous log replay didn't wipe out our flags -- // -- -- req.close(); -- h.close(); -- createCore(); -- -- req = req(); -- uhandler = req.getCore().getUpdateHandler(); -- ulog = uhandler.getUpdateLog(); -- -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); -- -- // now do some normal non-buffered adds -- updateJ(jsonAdd(sdoc("id","Q4", "_version_",v114)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); -- updateJ(jsonAdd(sdoc("id","Q5", "_version_",v115)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); -- updateJ(jsonAdd(sdoc("id","Q6", "_version_",v116)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); -- assertU(commit()); -+ // the core does not replay updates from buffer tlog on startup -+ assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last - -+ // buffer tlog won't be removed on restart - req.close(); - h.close(); - createCore(); -@@ -1081,10 +1054,9 @@ public class TestRecovery extends SolrTestCaseJ4 { - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); -+ assertTrue(ulog.existOldBufferLog()); - - ulog.bufferUpdates(); -- // simulate receiving no updates - ulog.applyBufferedUpdates(); - updateJ(jsonAdd(sdoc("id","Q7", "_version_",v117)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); // do another add to make sure flags are back to normal - -@@ -1096,10 +1068,12 @@ public class TestRecovery extends SolrTestCaseJ4 { - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7 -- -- logReplayFinish.acquire(); -- assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state -+ assertFalse(ulog.existOldBufferLog()); -+ // Timeout for Q7 get replayed, because it was added on tlog, therefore it will be replayed on restart -+ TimeOut timeout 
= new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); -+ timeout.waitFor("Timeout waiting for finish replay updates", -+ () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); -+ assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); - } finally { - DirectUpdateHandler2.commitOnClose = true; - UpdateLog.testing_logReplayHook = null; -diff --git a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java -index e6bb9a6..1796319 100644 ---- a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java -+++ b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java -@@ -44,6 +44,7 @@ import org.apache.hadoop.hdfs.MiniDFSCluster; - import org.apache.solr.SolrTestCaseJ4; - import org.apache.solr.cloud.hdfs.HdfsTestUtil; - import org.apache.solr.common.util.IOUtils; -+import org.apache.solr.common.util.TimeSource; - import org.apache.solr.request.SolrQueryRequest; - import org.apache.solr.update.DirectUpdateHandler2; - import org.apache.solr.update.HdfsUpdateLog; -@@ -51,6 +52,7 @@ import org.apache.solr.update.UpdateHandler; - import org.apache.solr.update.UpdateLog; - import org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase; - import org.apache.solr.util.BadHdfsThreadsFilter; -+import org.apache.solr.util.TimeOut; - import org.junit.AfterClass; - import org.junit.BeforeClass; - import org.junit.Ignore; -@@ -515,13 +517,9 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { - - - @Test -- public void testBufferingFlags() throws Exception { -+ public void testExistOldBufferLog() throws Exception { - - DirectUpdateHandler2.commitOnClose = false; -- final Semaphore logReplayFinish = new Semaphore(0); -- -- UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); -- - - SolrQueryRequest req = req(); - UpdateHandler uhandler = req.getCore().getUpdateHandler(); -@@ -548,14 +546,10 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - -- logReplayFinish.acquire(); // wait for replay to finish -- -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last -- -- // -- // Try again to ensure that the previous log replay didn't wipe out our flags -- // -+ // the core no longer replay updates from buffer tlog on startup -+ assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last - -+ // buffer tlog won't be removed on restart - req.close(); - h.close(); - createCore(); -@@ -564,23 +558,7 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); -- -- // now do some normal non-buffered adds -- updateJ(jsonAdd(sdoc("id","Q4", "_version_","114")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); -- updateJ(jsonAdd(sdoc("id","Q5", "_version_","115")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); -- updateJ(jsonAdd(sdoc("id","Q6", "_version_","116")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); -- assertU(commit()); -- -- req.close(); -- h.close(); -- createCore(); -- -- req = req(); -- uhandler = req.getCore().getUpdateHandler(); -- ulog = uhandler.getUpdateLog(); -- -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); -+ assertTrue(ulog.existOldBufferLog()); - - ulog.bufferUpdates(); - // 
simulate receiving no updates -@@ -595,10 +573,12 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - -- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7 -- -- logReplayFinish.acquire(); -- assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state -+ assertFalse(ulog.existOldBufferLog()); -+ // Timeout for Q7 get replayed, because it was added on tlog, therefore it will be replayed on restart -+ TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); -+ timeout.waitFor("Timeout waiting for finish replay updates", -+ () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); -+ assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); - } finally { - DirectUpdateHandler2.commitOnClose = true; - UpdateLog.testing_logReplayHook = null; -diff --git a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java -index 1bf4ad4..d2b4b26 100644 ---- a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java -+++ b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java -@@ -35,7 +35,7 @@ public class TransactionLogTest extends LuceneTestCase { - transactionLog.lastAddSize = 2000000000; - AddUpdateCommand updateCommand = new AddUpdateCommand(null); - updateCommand.solrDoc = new SolrInputDocument(); -- transactionLog.write(updateCommand, 0); -+ transactionLog.write(updateCommand); - } - } - From e7a0a12926c399758a4021715a7419e22e59dab6 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 4 Jun 2018 08:23:31 +0200 Subject: [PATCH 07/38] LUCENE-8335: Enforce soft-deletes field up-front Soft deletes field must be marked as such once it's introduced and can't be changed after the fact. 
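A minimal sketch of the behavior this commit enforces, modeled on the new TestIndexWriter#testPreventChangingSoftDeletesField included further below in this patch; the RAMDirectory, field names, and class name here are illustrative only and are not part of the change itself:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class SoftDeletesFieldExample {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory(); // any Directory would do; illustrative only

        // Introduce "my_deletes" as this index's soft-deletes field.
        IndexWriterConfig cfg = new IndexWriterConfig().setSoftDeletesField("my_deletes");
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
          Document doc = new Document();
          doc.add(new StringField("id", "1", Field.Store.YES));
          writer.addDocument(doc);
          // Soft-delete the previous version by writing a doc-values "tombstone".
          writer.softUpdateDocument(new Term("id", "1"), doc,
              new NumericDocValuesField("my_deletes", 1));
          writer.commit();
        }

        // With this change, reopening the same index with a different soft-deletes
        // field (or with none configured in the IndexWriterConfig) is rejected.
        try {
          new IndexWriter(dir, new IndexWriterConfig().setSoftDeletesField("your_deletes"));
        } catch (IllegalArgumentException expected) {
          // "cannot configure [your_deletes] as soft-deletes;
          //  this index uses [my_deletes] as soft-deletes already"
        }
      }
    }

The field-info format now persists an is-soft-deletes bit per field, which is what allows the check above to fire at writer construction time rather than only when a mismatched update arrives.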
Co-authored-by: Nhat Nguyen --- lucene/CHANGES.txt | 4 + .../SimpleTextFieldInfosFormat.java | 11 +- .../lucene50/Lucene50FieldInfosFormat.java | 2 +- .../lucene60/Lucene60FieldInfosFormat.java | 8 +- .../org/apache/lucene/index/FieldInfo.java | 18 ++- .../org/apache/lucene/index/FieldInfos.java | 51 ++++++--- .../org/apache/lucene/index/IndexWriter.java | 8 +- .../org/apache/lucene/index/MultiFields.java | 6 +- .../lucene/index/ParallelLeafReader.java | 7 +- .../org/apache/lucene/index/TestCodecs.java | 4 +- .../test/org/apache/lucene/index/TestDoc.java | 2 +- .../apache/lucene/index/TestFieldsReader.java | 2 +- .../apache/lucene/index/TestIndexWriter.java | 106 ++++++++++++++++++ .../lucene/index/TestPendingSoftDeletes.java | 10 +- .../lucene/index/TestSegmentMerger.java | 2 +- .../highlight/TermVectorLeafReader.java | 2 +- .../lucene/index/memory/MemoryIndex.java | 4 +- .../index/BaseFieldInfoFormatTestCase.java | 14 +-- .../index/BaseIndexFileFormatTestCase.java | 2 +- .../lucene/index/MismatchedLeafReader.java | 3 +- .../lucene/index/RandomPostingsTester.java | 4 +- .../handler/component/ExpandComponent.java | 3 +- .../solr/search/CollapsingQParserPlugin.java | 2 +- .../java/org/apache/solr/search/Insanity.java | 2 +- .../solr/uninverting/UninvertingReader.java | 2 +- 25 files changed, 224 insertions(+), 55 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index cd11e7dbeb5..6644453e1a4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -203,6 +203,10 @@ New Features now use to also take pending deletes into account which ensures that all file generations per segment always go forward. (Simon Willnauer) +* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked + as such once it's introduced and can't be changed after the fact. 
+ (Nhat Nguyen via Simon Willnauer) + Bug Fixes * LUCENE-8221: MoreLikeThis.setMaxDocFreqPct can easily int-overflow on larger diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java index 0ace1534d4e..1c40cbd4255 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java @@ -66,6 +66,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { static final BytesRef ATT_VALUE = new BytesRef(" value "); static final BytesRef DIM_COUNT = new BytesRef(" dimensional count "); static final BytesRef DIM_NUM_BYTES = new BytesRef(" dimensional num bytes "); + static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes "); @Override public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext iocontext) throws IOException { @@ -140,9 +141,13 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { assert StringHelper.startsWith(scratch.get(), DIM_NUM_BYTES); int dimensionalNumBytes = Integer.parseInt(readString(DIM_NUM_BYTES.length, scratch)); + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SOFT_DELETES); + boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch)); + infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, Collections.unmodifiableMap(atts), - dimensionalCount, dimensionalNumBytes); + dimensionalCount, dimensionalNumBytes, isSoftDeletesField); } SimpleTextUtil.checkFooter(input); @@ -238,6 +243,10 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { SimpleTextUtil.write(out, DIM_NUM_BYTES); SimpleTextUtil.write(out, Integer.toString(fi.getPointNumBytes()), scratch); SimpleTextUtil.writeNewline(out); + + SimpleTextUtil.write(out, SOFT_DELETES); + SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch); + SimpleTextUtil.writeNewline(out); } SimpleTextUtil.writeChecksum(out, scratch); success = true; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java index a76bfeb6e7a..30dca7041f8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java @@ -148,7 +148,7 @@ public final class Lucene50FieldInfosFormat extends FieldInfosFormat { lastAttributes = attributes; try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, - indexOptions, docValuesType, dvGen, attributes, 0, 0); + indexOptions, docValuesType, dvGen, attributes, 0, 0, false); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java index a35461e3ef7..522a73f1d27 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java @@ -136,6 +136,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; + boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); @@ -159,7 +160,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, attributes, - pointDimensionCount, pointNumBytes); + pointDimensionCount, pointNumBytes, isSoftDeletesField); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); @@ -277,6 +278,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { if (fi.hasVectors()) bits |= STORE_TERMVECTOR; if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; + if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); @@ -301,10 +303,12 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { // Codec header static final String CODEC_NAME = "Lucene60FieldInfos"; static final int FORMAT_START = 0; - static final int FORMAT_CURRENT = FORMAT_START; + static final int FORMAT_SOFT_DELETES = 1; + static final int FORMAT_CURRENT = FORMAT_SOFT_DELETES; // Field flags static final byte STORE_TERMVECTOR = 0x1; static final byte OMIT_NORMS = 0x2; static final byte STORE_PAYLOADS = 0x4; + static final byte SOFT_DELETES_FIELD = 0x8; } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 037fe5c1bc7..b50cb12cd5e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -53,14 +53,17 @@ public final class FieldInfo { private int pointDimensionCount; private int pointNumBytes; + // whether this field is used as the soft-deletes field + private final boolean softDeletesField; + /** * Sole constructor. 
* * @lucene.experimental */ - public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, - boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, - long dvGen, Map attributes, int pointDimensionCount, int pointNumBytes) { + public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, boolean storePayloads, + IndexOptions indexOptions, DocValuesType docValues, long dvGen, Map attributes, + int pointDimensionCount, int pointNumBytes, boolean softDeletesField) { this.name = Objects.requireNonNull(name); this.number = number; this.docValuesType = Objects.requireNonNull(docValues, "DocValuesType must not be null (field: \"" + name + "\")"); @@ -78,6 +81,7 @@ public final class FieldInfo { this.attributes = Objects.requireNonNull(attributes); this.pointDimensionCount = pointDimensionCount; this.pointNumBytes = pointNumBytes; + this.softDeletesField = softDeletesField; assert checkConsistency(); } @@ -332,4 +336,12 @@ public final class FieldInfo { public Map attributes() { return attributes; } + + /** + * Returns true if this field is configured and used as the soft-deletes field. + * See {@link IndexWriterConfig#softDeletesField} + */ + public boolean isSoftDeletesField() { + return softDeletesField; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 4b472a55503..0a0ff5ee605 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -43,6 +43,7 @@ public class FieldInfos implements Iterable { private final boolean hasNorms; private final boolean hasDocValues; private final boolean hasPointValues; + private final String softDeletesField; // used only by fieldInfo(int) private final FieldInfo[] byNumber; @@ -62,6 +63,7 @@ public class FieldInfos implements Iterable { boolean hasNorms = false; boolean hasDocValues = false; boolean hasPointValues = false; + String softDeletesField = null; int size = 0; // number of elements in byNumberTemp, number of used array slots FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10 @@ -92,6 +94,12 @@ public class FieldInfos implements Iterable { hasDocValues |= info.getDocValuesType() != DocValuesType.NONE; hasPayloads |= info.hasPayloads(); hasPointValues |= (info.getPointDimensionCount() != 0); + if (info.isSoftDeletesField()) { + if (softDeletesField != null && softDeletesField.equals(info.name) == false) { + throw new IllegalArgumentException("multiple soft-deletes fields [" + info.name + ", " + softDeletesField + "]"); + } + softDeletesField = info.name; + } } this.hasVectors = hasVectors; @@ -102,6 +110,7 @@ public class FieldInfos implements Iterable { this.hasNorms = hasNorms; this.hasDocValues = hasDocValues; this.hasPointValues = hasPointValues; + this.softDeletesField = softDeletesField; List valuesTemp = new ArrayList<>(); byNumber = new FieldInfo[size]; @@ -153,6 +162,11 @@ public class FieldInfos implements Iterable { public boolean hasPointValues() { return hasPointValues; } + + /** Returns the soft-deletes field name if exists; otherwise returns null */ + public String getSoftDeletesField() { + return softDeletesField; + } /** Returns the number of fields */ public int size() { @@ -221,13 +235,17 @@ public class FieldInfos implements Iterable { // norms back on after they were already ommitted; today // we silently discard the norm but this is badly trappy private int 
lowestUnassignedFieldNumber = -1; + + // The soft-deletes field from IWC to enforce a single soft-deletes field + private final String softDeletesFieldName; - FieldNumbers() { + FieldNumbers(String softDeletesFieldName) { this.nameToNumber = new HashMap<>(); this.numberToName = new HashMap<>(); this.indexOptions = new HashMap<>(); this.docValuesType = new HashMap<>(); this.dimensions = new HashMap<>(); + this.softDeletesFieldName = softDeletesFieldName; } /** @@ -236,7 +254,7 @@ public class FieldInfos implements Iterable { * number assigned if possible otherwise the first unassigned field number * is used as the field number. */ - synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes) { + synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { if (indexOptions != IndexOptions.NONE) { IndexOptions currentOpts = this.indexOptions.get(fieldName); if (currentOpts == null) { @@ -284,6 +302,16 @@ public class FieldInfos implements Iterable { nameToNumber.put(fieldName, fieldNumber); } + if (isSoftDeletesField) { + if (softDeletesFieldName == null) { + throw new IllegalArgumentException("this index has [" + fieldName + "] as soft-deletes already but soft-deletes field is not configured in IWC"); + } else if (fieldName.equals(softDeletesFieldName) == false) { + throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as soft-deletes already"); + } + } else if (fieldName.equals(softDeletesFieldName)) { + throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as non-soft-deletes already"); + } + return fieldNumber.intValue(); } @@ -383,11 +411,7 @@ public class FieldInfos implements Iterable { private final HashMap byName = new HashMap<>(); final FieldNumbers globalFieldNumbers; private boolean finished; - - Builder() { - this(new FieldNumbers()); - } - + /** * Creates a new instance with the given {@link FieldNumbers}. */ @@ -413,8 +437,9 @@ public class FieldInfos implements Iterable { // number for this field. 
If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0); - fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0); + final boolean isSoftDeletesField = name.equals(globalFieldNumbers.softDeletesFieldName); + final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, DocValuesType.NONE); byName.put(fi.name, fi); @@ -427,7 +452,7 @@ public class FieldInfos implements Iterable { boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, long dvGen, - int dimensionCount, int dimensionNumBytes) { + int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { assert assertNotFinished(); if (docValues == null) { throw new NullPointerException("DocValuesType must not be null"); @@ -439,8 +464,8 @@ public class FieldInfos implements Iterable { // number for this field. If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes); - fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes); + final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType()); byName.put(fi.name, fi); @@ -473,7 +498,7 @@ public class FieldInfos implements Iterable { return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), dvGen, - fi.getPointDimensionCount(), fi.getPointNumBytes()); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } public FieldInfo fieldInfo(String fieldName) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index bc2264b7eab..5efba70ad01 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -960,12 +960,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * If this {@link SegmentInfos} has no global field number map the returned instance is empty */ private FieldNumbers getFieldNumberMap() throws IOException { - final FieldNumbers map = new FieldNumbers(); + final FieldNumbers map = new FieldNumbers(config.softDeletesField); for(SegmentCommitInfo info : segmentInfos) { FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { - map.addOrGet(fi.name, 
fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); + map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } } @@ -1787,7 +1787,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (globalFieldNumberMap.contains(f.name(), dvType) == false) { // if this field doesn't exists we try to add it. if it exists and the DV type doesn't match we // get a consistent error message as if you try to do that during an indexing operation. - globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0); + globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, f.name().equals(config.softDeletesField)); assert globalFieldNumberMap.contains(f.name(), dvType); } if (config.getIndexSortFields().contains(f.name())) { @@ -2824,7 +2824,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { // This will throw exceptions if any of the incoming fields have an illegal schema change: - globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); + globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } infos.add(copySegmentAsIs(info, newSegName, context)); } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java index 1a7b15bd81c..19078a83c15 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java @@ -25,6 +25,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.util.Bits; @@ -263,7 +264,10 @@ public final class MultiFields extends Fields { * will be unavailable. 
*/ public static FieldInfos getMergedFieldInfos(IndexReader reader) { - final FieldInfos.Builder builder = new FieldInfos.Builder(); + final String softDeletesField = reader.leaves().stream() + .map(l -> l.reader().getFieldInfos().getSoftDeletesField()) + .filter(Objects::nonNull).findAny().orElse(null); + final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField)); for(final LeafReaderContext ctx : reader.leaves()) { builder.add(ctx.reader().getFieldInfos()); } diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index 492b6e7bc97..25f200a4243 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.IdentityHashMap; import java.util.Iterator; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; @@ -101,9 +102,11 @@ public class ParallelLeafReader extends LeafReader { throw new IllegalArgumentException("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); } } - + final String softDeletesField = completeReaderSet.stream() + .map(r -> r.getFieldInfos().getSoftDeletesField()) + .filter(Objects::nonNull).findAny().orElse(null); // TODO: make this read-only in a cleaner way? - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField)); Sort indexSort = null; int createdVersionMajor = -1; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index acc6506f4f6..5ff0dde4515 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -212,7 +212,7 @@ public class TestCodecs extends LuceneTestCase { terms[i] = new TermData(text, docs, null); } - final FieldInfos.Builder builder = new FieldInfos.Builder(); + final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); final FieldData field = new FieldData("field", builder, terms, true, false); final FieldData[] fields = new FieldData[] {field}; @@ -259,7 +259,7 @@ public class TestCodecs extends LuceneTestCase { } public void testRandomPostings() throws Throwable { - final FieldInfos.Builder builder = new FieldInfos.Builder(); + final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); final FieldData[] fields = new FieldData[NUM_FIELDS]; for(int i=0;iasList(r1, r2), si, InfoStream.getDefault(), trackingDir, - new FieldInfos.FieldNumbers(), context); + new FieldInfos.FieldNumbers(null), context); MergeState mergeState = merger.merge(); r1.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java index 48d69ec4149..ce24b7f19a6 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java @@ -44,7 +44,7 @@ public class TestFieldsReader extends LuceneTestCase { @BeforeClass public static void beforeClass() throws Exception { testDoc = new Document(); - fieldInfos = new FieldInfos.Builder(); + fieldInfos = new FieldInfos.Builder(new 
FieldInfos.FieldNumbers(null)); DocHelper.setupDoc(testDoc); for (IndexableField field : testDoc.getFields()) { FieldInfo fieldInfo = fieldInfos.getOrAdd(field.name()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 5e394d560fc..8eac2fa4923 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -3376,4 +3376,110 @@ public class TestIndexWriter extends LuceneTestCase { IOUtils.close(reader, writer, dir); } + public void testPreventChangingSoftDeletesField() throws Exception { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("my_deletes")); + Document v1 = new Document(); + v1.add(new StringField("id", "1", Field.Store.YES)); + v1.add(new StringField("version", "1", Field.Store.YES)); + writer.addDocument(v1); + Document v2 = new Document(); + v2.add(new StringField("id", "1", Field.Store.YES)); + v2.add(new StringField("version", "2", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "1"), v2, new NumericDocValuesField("my_deletes", 1)); + writer.commit(); + writer.close(); + for (SegmentCommitInfo si : SegmentInfos.readLatestCommit(dir)) { + FieldInfos fieldInfos = IndexWriter.readFieldInfos(si); + assertEquals("my_deletes", fieldInfos.getSoftDeletesField()); + assertTrue(fieldInfos.fieldInfo("my_deletes").isSoftDeletesField()); + } + + IllegalArgumentException illegalError = expectThrows(IllegalArgumentException.class, () -> { + new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("your_deletes")); + }); + assertEquals("cannot configure [your_deletes] as soft-deletes; " + + "this index uses [my_deletes] as soft-deletes already", illegalError.getMessage()); + + IndexWriterConfig softDeleteConfig = newIndexWriterConfig().setSoftDeletesField("my_deletes") + .setMergePolicy(new SoftDeletesRetentionMergePolicy("my_deletes", () -> new MatchAllDocsQuery(), newMergePolicy())); + writer = new IndexWriter(dir, softDeleteConfig); + Document tombstone = new Document(); + tombstone.add(new StringField("id", "tombstone", Field.Store.YES)); + tombstone.add(new NumericDocValuesField("my_deletes", 1)); + writer.addDocument(tombstone); + writer.flush(); + for (SegmentCommitInfo si : writer.segmentInfos) { + FieldInfos fieldInfos = IndexWriter.readFieldInfos(si); + assertEquals("my_deletes", fieldInfos.getSoftDeletesField()); + assertTrue(fieldInfos.fieldInfo("my_deletes").isSoftDeletesField()); + } + writer.close(); + // reopen writer without soft-deletes field should be prevented + IllegalArgumentException reopenError = expectThrows(IllegalArgumentException.class, () -> { + new IndexWriter(dir, newIndexWriterConfig()); + }); + assertEquals("this index has [my_deletes] as soft-deletes already" + + " but soft-deletes field is not configured in IWC", reopenError.getMessage()); + dir.close(); + } + + public void testPreventAddingIndexesWithDifferentSoftDeletesField() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig().setSoftDeletesField("soft_deletes_1")); + for (int i = 0; i < 2; i++) { + Document d = new Document(); + d.add(new StringField("id", "1", Field.Store.YES)); + d.add(new StringField("version", Integer.toString(i), Field.Store.YES)); + w1.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("soft_deletes_1", 1)); + } + 
w1.commit(); + w1.close(); + + Directory dir2 = newDirectory(); + IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig().setSoftDeletesField("soft_deletes_2")); + IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> w2.addIndexes(dir1)); + assertEquals("cannot configure [soft_deletes_2] as soft-deletes; this index uses [soft_deletes_1] as soft-deletes already", + error.getMessage()); + w2.close(); + + Directory dir3 = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("soft_deletes_1"); + IndexWriter w3 = new IndexWriter(dir3, config); + w3.addIndexes(dir1); + for (SegmentCommitInfo si : w3.segmentInfos) { + FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("soft_deletes_1"); + assertTrue(softDeleteField.isSoftDeletesField()); + } + w3.close(); + IOUtils.close(dir1, dir2, dir3); + } + + public void testNotAllowUsingExistingFieldAsSoftDeletes() throws Exception { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + for (int i = 0; i < 2; i++) { + Document d = new Document(); + d.add(new StringField("id", "1", Field.Store.YES)); + if (random().nextBoolean()) { + d.add(new NumericDocValuesField("dv_field", 1)); + w.updateDocument(new Term("id", "1"), d); + } else { + w.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("dv_field", 1)); + } + } + w.commit(); + w.close(); + String softDeletesField = random().nextBoolean() ? "id" : "dv_field"; + IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> { + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField(softDeletesField); + new IndexWriter(dir, config); + }); + assertEquals("cannot configure [" + softDeletesField + "] as soft-deletes;" + + " this index uses [" + softDeletesField + "] as non-soft-deletes already", error.getMessage()); + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("non-existing-field"); + w = new IndexWriter(dir, config); + w.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 5fadd3f10cd..3047364781e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -120,7 +120,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { deletes.onNewReader(segmentReader, commitInfo); reader.close(); writer.close(); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, true); List docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); for (DocValuesFieldUpdates update : updates) { @@ -140,7 +140,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS); updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, 
DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, true); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } @@ -182,7 +182,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); List docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 3, true)); for (DocValuesFieldUpdates update : updates) { @@ -228,7 +228,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); List updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); @@ -247,7 +247,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { assertEquals(0, deletes.numPendingDeletes()); segmentInfo.advanceDocValuesGen(); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index 6d0e04bbb2c..1171b906b98 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -88,7 +88,7 @@ public class TestSegmentMerger extends LuceneTestCase { SegmentMerger merger = new SegmentMerger(Arrays.asList(reader1, reader2), si, InfoStream.getDefault(), mergedDir, - new FieldInfos.FieldNumbers(), + new FieldInfos.FieldNumbers(null), newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1)))); MergeState mergeState = merger.merge(); int docsMerged = mergeState.segmentInfo.maxDoc(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index 144209dcceb..1eef95fdd6d 100644 --- 
a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -81,7 +81,7 @@ public class TermVectorLeafReader extends LeafReader { } FieldInfo fieldInfo = new FieldInfo(field, 0, true, true, terms.hasPayloads(), - indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0); + indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, false); fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo}); } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index ff248c34538..11913d1cbee 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -501,7 +501,7 @@ public class MemoryIndex { IndexOptions indexOptions = storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads, indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(), - fieldType.pointDimensionCount(), fieldType.pointNumBytes()); + fieldType.pointDimensionCount(), fieldType.pointNumBytes(), false); } private void storePointValues(Info info, BytesRef pointValue) { @@ -520,7 +520,7 @@ public class MemoryIndex { info.fieldInfo = new FieldInfo( info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(), info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(), - info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes() + info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes(), info.fieldInfo.isSoftDeletesField() ); } else if (existingDocValuesType != docValuesType) { throw new IllegalArgumentException("Can't add [" + docValuesType + "] doc values field [" + fieldName + "], because [" + existingDocValuesType + "] doc values field already exists"); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java index 9363ce63fe3..3515b9a9c97 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java @@ -53,7 +53,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes Directory dir = newDirectory(); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -75,7 +75,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes Directory dir = newDirectory(); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); 
addAttributes(fi); @@ -115,7 +115,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -150,7 +150,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -185,7 +185,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -221,7 +221,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -251,7 +251,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes for (int i = 0; i < numFields; i++) { fieldNames.add(TestUtil.randomUnicodeString(random())); } - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); for (String field : fieldNames) { IndexableFieldType fieldType = randomFieldType(random()); FieldInfo fi = builder.getOrAdd(field); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java index f5b52239057..83419de52e2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java @@ -323,7 +323,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field"); FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(), - proto.getPointDimensionCount(), proto.getPointNumBytes()); + proto.getPointDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField()); FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } ); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java index 
7dd6ba89bd0..2c746773f94 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java @@ -77,7 +77,8 @@ public class MismatchedLeafReader extends FilterLeafReader { oldInfo.getDocValuesGen(), // dvGen oldInfo.attributes(), // attributes oldInfo.getPointDimensionCount(), // dimension count - oldInfo.getPointNumBytes()); // dimension numBytes + oldInfo.getPointNumBytes(), // dimension numBytes + oldInfo.isSoftDeletesField()); // used as soft-deletes field shuffled.set(i, newInfo); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java index 29962e609a7..9f2d9b7adc0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java @@ -130,7 +130,7 @@ public class RandomPostingsTester { fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, -1, new HashMap<>(), - 0, 0); + 0, 0, false); fieldUpto++; SortedMap postings = new TreeMap<>(); @@ -651,7 +651,7 @@ public class RandomPostingsTester { DocValuesType.NONE, -1, new HashMap<>(), - 0, 0); + 0, 0, false); } FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); diff --git a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java index 82a62d56d3e..9ffea4bc9c8 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java @@ -797,7 +797,8 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia fieldInfo.getDocValuesGen(), fieldInfo.attributes(), fieldInfo.getPointDimensionCount(), - fieldInfo.getPointNumBytes()); + fieldInfo.getPointNumBytes(), + fieldInfo.isSoftDeletesField()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java index 76a52583e32..d0f8cd4633e 100644 --- a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java @@ -425,7 +425,7 @@ public class CollapsingQParserPlugin extends QParserPlugin { DocValuesType.NONE, fieldInfo.getDocValuesGen(), fieldInfo.attributes(), - 0, 0); + 0, 0, fieldInfo.isSoftDeletesField()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/Insanity.java b/solr/core/src/java/org/apache/solr/search/Insanity.java index aa366521e88..8fe081f947b 100644 --- a/solr/core/src/java/org/apache/solr/search/Insanity.java +++ b/solr/core/src/java/org/apache/solr/search/Insanity.java @@ -66,7 +66,7 @@ public class Insanity { if (fi.name.equals(insaneField)) { filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(), - fi.getPointDimensionCount(), fi.getPointNumBytes())); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); } else { filteredInfos.add(fi); } diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java 
b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java index 967db541414..9f0f5271c67 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java +++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -282,7 +282,7 @@ public class UninvertingReader extends FilterLeafReader { } filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(), - fi.getPointDimensionCount(), fi.getPointNumBytes())); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); } fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()])); } From 727da63f4affca3a7001c19322dfa936ab507bc3 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Mon, 4 Jun 2018 22:14:37 +1000 Subject: [PATCH 08/38] SOLR-12354: Register the /admin/info/key end-point at the startup time to avoid 404 --- solr/CHANGES.txt | 2 + .../org/apache/solr/core/CoreContainer.java | 39 +++++++-------- .../security/PKIAuthenticationPlugin.java | 40 ++++------------ .../solr/security/PublicKeyHandler.java | 47 +++++++++++++++++++ .../org/apache/solr/servlet/HttpSolrCall.java | 4 +- .../solr/servlet/SolrDispatchFilter.java | 43 +++++++++-------- .../apache/solr/cloud/RollingRestartTest.java | 10 ++-- .../security/TestPKIAuthenticationPlugin.java | 7 ++- 8 files changed, 110 insertions(+), 82 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 2c2191eda5b..c88de1061ed 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -290,6 +290,8 @@ Bug Fixes * SOLR-12433: Recovering flag of a replica is set equals to leader even it failed to receive update on recovering. 
(Cao Manh Dat) +* SOLR-12354: Register the /admin/info/key end-point at the startup time to avoid 404 (noble) + Optimizations ---------------------- diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index e108ae19f89..d546dd29b9c 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -16,20 +16,6 @@ */ package org.apache.solr.core; -import static java.util.Objects.requireNonNull; -import static org.apache.solr.common.params.CommonParams.AUTHC_PATH; -import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH; -import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH; -import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH; -import static org.apache.solr.common.params.CommonParams.METRICS_PATH; -import static org.apache.solr.common.params.CommonParams.ZK_PATH; -import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP; - import java.io.IOException; import java.lang.invoke.MethodHandles; import java.nio.file.Path; @@ -64,15 +50,15 @@ import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder; import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder.AuthSchemeRegistryProvider; import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder.CredentialsProviderProvider; import org.apache.solr.client.solrj.util.SolrIdentifierValidator; -import org.apache.solr.cloud.autoscaling.AutoScalingHandler; import org.apache.solr.cloud.CloudDescriptor; import org.apache.solr.cloud.Overseer; import org.apache.solr.cloud.ZkController; +import org.apache.solr.cloud.autoscaling.AutoScalingHandler; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Replica.State; -import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.util.ExecutorUtil; import org.apache.solr.common.util.IOUtils; import org.apache.solr.common.util.Utils; @@ -106,6 +92,7 @@ import org.apache.solr.security.AuthenticationPlugin; import org.apache.solr.security.AuthorizationPlugin; import org.apache.solr.security.HttpClientBuilderPlugin; import org.apache.solr.security.PKIAuthenticationPlugin; +import org.apache.solr.security.PublicKeyHandler; import org.apache.solr.security.SecurityPluginHolder; import org.apache.solr.update.SolrCoreState; import org.apache.solr.update.UpdateShardHandler; @@ -116,7 +103,20 @@ import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static java.util.Objects.requireNonNull; +import static org.apache.solr.common.params.CommonParams.AUTHC_PATH; +import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH; +import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH; +import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH; +import 
static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH; +import static org.apache.solr.common.params.CommonParams.METRICS_PATH; +import static org.apache.solr.common.params.CommonParams.ZK_PATH; import static org.apache.solr.core.CorePropertiesLocator.PROPERTIES_FILENAME; +import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP; /** * @@ -301,6 +301,7 @@ public class CoreContainer { public CoreContainer(NodeConfig config, Properties properties, CoresLocator locator, boolean asyncSolrCoreLoad) { this.loader = config.getSolrResourceLoader(); this.solrHome = loader.getInstancePath().toString(); + containerHandlers.put(PublicKeyHandler.PATH, new PublicKeyHandler()); this.cfg = requireNonNull(config); this.coresLocator = locator; this.containerProperties = new Properties(properties); @@ -548,7 +549,8 @@ public class CoreContainer { hostName = cfg.getNodeName(); zkSys.initZooKeeper(this, solrHome, cfg.getCloudConfig()); - if(isZooKeeperAware()) pkiAuthenticationPlugin = new PKIAuthenticationPlugin(this, zkSys.getZkController().getNodeName()); + if(isZooKeeperAware()) pkiAuthenticationPlugin = new PKIAuthenticationPlugin(this, zkSys.getZkController().getNodeName(), + (PublicKeyHandler) containerHandlers.get(PublicKeyHandler.PATH)); MDCLoggingContext.setNode(this); @@ -592,8 +594,7 @@ public class CoreContainer { containerHandlers.put(AUTHZ_PATH, securityConfHandler); securityConfHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, AUTHZ_PATH); containerHandlers.put(AUTHC_PATH, securityConfHandler); - if(pkiAuthenticationPlugin != null) - containerHandlers.put(PKIAuthenticationPlugin.PATH, pkiAuthenticationPlugin.getRequestHandler()); + PluginInfo[] metricReporters = cfg.getMetricsConfig().getMetricReporters(); metricManager.loadReporters(metricReporters, loader, this, null, null, SolrInfoBean.Group.node); diff --git a/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java b/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java index 877e4f16cd6..43dac480168 100644 --- a/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java +++ b/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java @@ -47,11 +47,7 @@ import org.apache.solr.common.util.StrUtils; import org.apache.solr.common.util.SuppressForbidden; import org.apache.solr.common.util.Utils; import org.apache.solr.core.CoreContainer; -import org.apache.solr.handler.RequestHandlerBase; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.request.SolrRequestHandler; import org.apache.solr.request.SolrRequestInfo; -import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.util.CryptoKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,7 +58,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; public class PKIAuthenticationPlugin extends AuthenticationPlugin implements HttpClientBuilderPlugin { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private final Map keyCache = new ConcurrentHashMap<>(); - private final CryptoKeys.RSAKeyPair keyPair = new CryptoKeys.RSAKeyPair(); + 
private final PublicKeyHandler publicKeyHandler; private final CoreContainer cores; private final int MAX_VALIDITY = Integer.parseInt(System.getProperty("pkiauth.ttl", "10000")); private final String myNodeName; @@ -77,7 +73,8 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt return interceptorRegistered; } - public PKIAuthenticationPlugin(CoreContainer cores, String nodeName) { + public PKIAuthenticationPlugin(CoreContainer cores, String nodeName, PublicKeyHandler publicKeyHandler) { + this.publicKeyHandler = publicKeyHandler; this.cores = cores; myNodeName = nodeName; } @@ -92,7 +89,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt public boolean doAuthenticate(ServletRequest request, ServletResponse response, FilterChain filterChain) throws Exception { String requestURI = ((HttpServletRequest) request).getRequestURI(); - if (requestURI.endsWith(PATH)) { + if (requestURI.endsWith(PublicKeyHandler.PATH)) { filterChain.doFilter(request, response); return true; } @@ -198,7 +195,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt String url = cores.getZkController().getZkStateReader().getBaseUrlForNodeName(nodename); HttpEntity entity = null; try { - String uri = url + PATH + "?wt=json&omitHeader=true"; + String uri = url + PublicKeyHandler.PATH + "?wt=json&omitHeader=true"; log.debug("Fetching fresh public key from : {}",uri); HttpResponse rsp = cores.getUpdateShardHandler().getDefaultHttpClient() .execute(new HttpGet(uri), HttpClientUtil.createNewHttpClientRequestContext()); @@ -207,7 +204,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt Map m = (Map) Utils.fromJSON(bytes); String key = (String) m.get("key"); if (key == null) { - log.error("No key available from " + url + PATH); + log.error("No key available from " + url + PublicKeyHandler.PATH); return null; } else { log.info("New Key obtained from node: {} / {}", nodename, key); @@ -230,26 +227,6 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt return builder; } - public SolrRequestHandler getRequestHandler() { - return new RequestHandlerBase() { - @Override - public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { - rsp.add("key", keyPair.getPublicKeyStr()); - } - - @Override - public String getDescription() { - return "Return the public key of this server"; - } - - @Override - public Category getCategory() { - return Category.ADMIN; - } - - }; - } - public boolean needsAuthorization(HttpServletRequest req) { return req.getUserPrincipal() != SU; } @@ -292,7 +269,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt String s = usr + " " + System.currentTimeMillis(); byte[] payload = s.getBytes(UTF_8); - byte[] payloadCipher = keyPair.encrypt(ByteBuffer.wrap(payload)); + byte[] payloadCipher = publicKeyHandler.keyPair.encrypt(ByteBuffer.wrap(payload)); String base64Cipher = Base64.byteArrayToBase64(payloadCipher); httpRequest.setHeader(HEADER, myNodeName + " " + base64Cipher); } @@ -316,11 +293,10 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt } public String getPublicKey() { - return keyPair.getPublicKeyStr(); + return publicKeyHandler.getPublicKey(); } public static final String HEADER = "SolrAuth"; - public static final String PATH = "/admin/info/key"; public static final String NODE_IS_USER = "$"; // special principal to denote the cluster member private 
static final Principal SU = new BasicUserPrincipal("$"); diff --git a/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java b/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java new file mode 100644 index 00000000000..ad835782a74 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.security; + +import org.apache.solr.handler.RequestHandlerBase; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.CryptoKeys; + +public class PublicKeyHandler extends RequestHandlerBase { + public static final String PATH = "/admin/info/key"; + final CryptoKeys.RSAKeyPair keyPair = new CryptoKeys.RSAKeyPair(); + + public String getPublicKey() { + return keyPair.getPublicKeyStr(); + } + + @Override + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { + rsp.add("key", keyPair.getPublicKeyStr()); + } + + @Override + public String getDescription() { + return "Return the public key of this server"; + } + + @Override + public Category getCategory() { + return Category.ADMIN; + } +} diff --git a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java index d1347298505..b297a4430a1 100644 --- a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java +++ b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java @@ -97,7 +97,7 @@ import org.apache.solr.security.AuthorizationContext; import org.apache.solr.security.AuthorizationContext.CollectionRequest; import org.apache.solr.security.AuthorizationContext.RequestType; import org.apache.solr.security.AuthorizationResponse; -import org.apache.solr.security.PKIAuthenticationPlugin; +import org.apache.solr.security.PublicKeyHandler; import org.apache.solr.servlet.SolrDispatchFilter.Action; import org.apache.solr.servlet.cache.HttpCacheHeaderUtil; import org.apache.solr.servlet.cache.Method; @@ -547,7 +547,7 @@ public class HttpSolrCall { } private boolean shouldAuthorize() { - if(PKIAuthenticationPlugin.PATH.equals(path)) return false; + if(PublicKeyHandler.PATH.equals(path)) return false; //admin/info/key is the path where public key is exposed . 
it is always unsecured if (cores.getPkiAuthenticationPlugin() != null && req.getUserPrincipal() != null) { boolean b = cores.getPkiAuthenticationPlugin().needsAuthorization(req); diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java index c7fdd57f90d..78e58d000aa 100644 --- a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java +++ b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java @@ -16,6 +16,20 @@ */ package org.apache.solr.servlet; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ReadListener; +import javax.servlet.ServletException; +import javax.servlet.ServletInputStream; +import javax.servlet.ServletOutputStream; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.UnavailableException; +import javax.servlet.WriteListener; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletRequestWrapper; +import javax.servlet.http.HttpServletResponse; +import javax.servlet.http.HttpServletResponseWrapper; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; @@ -35,21 +49,10 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.regex.Matcher; import java.util.regex.Pattern; -import javax.servlet.FilterChain; -import javax.servlet.FilterConfig; -import javax.servlet.ReadListener; -import javax.servlet.ServletException; -import javax.servlet.ServletInputStream; -import javax.servlet.ServletOutputStream; -import javax.servlet.ServletRequest; -import javax.servlet.ServletResponse; -import javax.servlet.UnavailableException; -import javax.servlet.WriteListener; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletRequestWrapper; -import javax.servlet.http.HttpServletResponse; -import javax.servlet.http.HttpServletResponseWrapper; - +import com.codahale.metrics.jvm.ClassLoadingGaugeSet; +import com.codahale.metrics.jvm.GarbageCollectorMetricSet; +import com.codahale.metrics.jvm.MemoryUsageGaugeSet; +import com.codahale.metrics.jvm.ThreadStatesGaugeSet; import org.apache.commons.io.FileCleaningTracker; import org.apache.commons.lang.StringUtils; import org.apache.http.client.HttpClient; @@ -72,17 +75,13 @@ import org.apache.solr.metrics.SolrMetricManager; import org.apache.solr.request.SolrRequestInfo; import org.apache.solr.security.AuthenticationPlugin; import org.apache.solr.security.PKIAuthenticationPlugin; +import org.apache.solr.security.PublicKeyHandler; import org.apache.solr.util.SolrFileCleaningTracker; import org.apache.solr.util.StartupLoggingUtils; import org.apache.solr.util.configuration.SSLConfigurationsFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.codahale.metrics.jvm.ClassLoadingGaugeSet; -import com.codahale.metrics.jvm.GarbageCollectorMetricSet; -import com.codahale.metrics.jvm.MemoryUsageGaugeSet; -import com.codahale.metrics.jvm.ThreadStatesGaugeSet; - /** * This filter looks at the incoming URL maps them to handlers defined in solrconfig.xml * @@ -441,8 +440,8 @@ public class SolrDispatchFilter extends BaseSolrFilter { // /admin/info/key must be always open. 
see SOLR-9188 // tests work only w/ getPathInfo //otherwise it's just enough to have getServletPath() - if (PKIAuthenticationPlugin.PATH.equals(request.getServletPath()) || - PKIAuthenticationPlugin.PATH.equals(request.getPathInfo())) return true; + if (PublicKeyHandler.PATH.equals(request.getServletPath()) || + PublicKeyHandler.PATH.equals(request.getPathInfo())) return true; String header = request.getHeader(PKIAuthenticationPlugin.HEADER); if (header != null && cores.getPkiAuthenticationPlugin() != null) authenticationPlugin = cores.getPkiAuthenticationPlugin(); diff --git a/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java b/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java index 14586664ec0..addf732a6df 100644 --- a/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java @@ -16,6 +16,11 @@ */ package org.apache.solr.cloud; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + import org.apache.commons.collections.CollectionUtils; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.common.cloud.SolrZkClient; @@ -24,11 +29,6 @@ import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.invoke.MethodHandles; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.TimeUnit; - public class RollingRestartTest extends AbstractFullDistribZkTestBase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); diff --git a/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java b/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java index a664cc04205..2d324cbd534 100644 --- a/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java +++ b/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java @@ -35,7 +35,10 @@ import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrRequestInfo; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.util.CryptoKeys; -import static org.mockito.Mockito.*; + +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; public class TestPKIAuthenticationPlugin extends SolrTestCaseJ4 { @@ -45,7 +48,7 @@ public class TestPKIAuthenticationPlugin extends SolrTestCaseJ4 { Map remoteKeys = new HashMap<>(); public MockPKIAuthenticationPlugin(CoreContainer cores, String node) { - super(cores, node); + super(cores, node, new PublicKeyHandler()); } @Override From 2ef3d07561654dc53b9d14644e63e6a64e1eae4a Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Mon, 4 Jun 2018 12:59:53 +0200 Subject: [PATCH 09/38] SOLR-11911: Fix a number of synchronization issues in the simulator. Enable this test for now. 
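A recurring change in this patch is that the simulator's shared node-value maps are now mutated only while holding a ReentrantLock acquired with lockInterruptibly(), which is why the sim* setters below now declare throws InterruptedException and their callers either propagate or restore the interrupt flag. A minimal sketch of that pattern follows, using a hypothetical NodeValueStore class rather than the actual SimNodeStateProvider code:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;

// Illustrative sketch only (not the Solr simulator source): lock-guarded updates of shared node values.
class NodeValueStore {
  private final Map<String, Map<String, Object>> nodeValues = new ConcurrentHashMap<>();
  private final ReentrantLock lock = new ReentrantLock();

  // Replace all values for a node atomically; interruptible so blocked test threads can be shut down.
  void setNodeValues(String node, Map<String, Object> values) throws InterruptedException {
    lock.lockInterruptibly();
    try {
      Map<String, Object> existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>());
      existing.clear();
      if (values != null) {
        existing.putAll(values);
      }
    } finally {
      lock.unlock(); // always release, even if the update above throws
    }
  }
}

Using lockInterruptibly() rather than lock() keeps blocked test threads responsive to interruption during teardown, at the cost of forcing callers to handle InterruptedException.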
--- .../cloud/autoscaling/ComputePlanAction.java | 8 +- .../autoscaling/sim/SimCloudManager.java | 2 +- .../sim/SimClusterStateProvider.java | 50 +++++-- .../autoscaling/sim/SimNodeStateProvider.java | 128 +++++++++++------- .../autoscaling/sim/SimSolrCloudTestCase.java | 8 +- .../sim/TestClusterStateProvider.java | 6 +- .../autoscaling/sim/TestLargeCluster.java | 17 ++- .../solrj/cloud/autoscaling/Policy.java | 7 +- 8 files changed, 149 insertions(+), 77 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java index 4a9c7442774..22e3ef5e77e 100644 --- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java +++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java @@ -168,7 +168,13 @@ public class ComputePlanAction extends TriggerActionBase { // estimate a maximum default limit that should be sufficient for most purposes: // number of nodes * total number of replicas * 3 AtomicInteger totalRF = new AtomicInteger(); - clusterState.forEachCollection(coll -> totalRF.addAndGet(coll.getReplicationFactor() * coll.getSlices().size())); + clusterState.forEachCollection(coll -> { + Integer rf = coll.getReplicationFactor(); + if (rf == null) { + rf = coll.getReplicas().size() / coll.getSlices().size(); + } + totalRF.addAndGet(rf * coll.getSlices().size()); + }); int totalMax = clusterState.getLiveNodes().size() * totalRF.get() * 3; int maxOp = (Integer) autoScalingConfig.getProperties().getOrDefault(AutoScalingParams.MAX_COMPUTE_OPERATIONS, totalMax); Object o = event.getProperty(AutoScalingParams.MAX_COMPUTE_OPERATIONS, maxOp); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java index c09d4a48c35..234eaea29a1 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java @@ -392,8 +392,8 @@ public class SimCloudManager implements SolrCloudManager { public String simAddNode() throws Exception { Map values = createNodeValues(null); String nodeId = (String)values.get(ImplicitSnitch.NODE); - clusterStateProvider.simAddNode(nodeId); nodeStateProvider.simSetNodeValues(nodeId, values); + clusterStateProvider.simAddNode(nodeId); LOG.trace("-- added node " + nodeId); // initialize history handler if this is the first node if (historyHandler == null && liveNodesSet.size() == 1) { diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java index ca2dd48858d..20ffca92fe3 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java @@ -111,11 +111,11 @@ import static org.apache.solr.common.params.CommonParams.NAME; public class SimClusterStateProvider implements ClusterStateProvider { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final Map> nodeReplicaMap = new ConcurrentHashMap<>(); private final LiveNodesSet liveNodes; private final SimDistribStateManager stateManager; private final SimCloudManager cloudManager; + private final Map> nodeReplicaMap = new ConcurrentHashMap<>(); private final Map 
clusterProperties = new ConcurrentHashMap<>(); private final Map> collProperties = new ConcurrentHashMap<>(); private final Map>> sliceProperties = new ConcurrentHashMap<>(); @@ -257,8 +257,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { try { Set collections = new HashSet<>(); // mark every replica on that node as down - setReplicaStates(nodeId, Replica.State.DOWN, collections); boolean res = liveNodes.remove(nodeId); + setReplicaStates(nodeId, Replica.State.DOWN, collections); if (!collections.isEmpty()) { collectionsStatesRef.set(null); } @@ -279,6 +279,20 @@ public class SimClusterStateProvider implements ClusterStateProvider { } } + /** + * Remove all replica information related to dead nodes. + */ + public void simRemoveDeadNodes() throws Exception { + lock.lockInterruptibly(); + try { + Set myNodes = new HashSet<>(nodeReplicaMap.keySet()); + myNodes.removeAll(liveNodes.get()); + collectionsStatesRef.set(null); + } finally { + lock.unlock(); + } + } + private synchronized void updateOverseerLeader() throws Exception { if (overseerLeader != null && liveNodes.contains(overseerLeader)) { return; @@ -436,6 +450,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { opDelay(replicaInfo.getCollection(), CollectionParams.CollectionAction.ADDREPLICA.name()); + // at this point nuke our cached DocCollection state + collectionsStatesRef.set(null); List replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>()); // mark replica as active replicaInfo.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString()); @@ -445,8 +461,6 @@ public class SimClusterStateProvider implements ClusterStateProvider { replicaInfo.getVariables().put(Suggestion.coreidxsize, 1); replicas.add(replicaInfo); - // at this point nuke our cached DocCollection state - collectionsStatesRef.set(null); LOG.trace("-- simAddReplica {}", replicaInfo); Map values = cloudManager.getSimNodeStateProvider().simGetAllNodeValues() @@ -483,8 +497,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { * @param coreNodeName coreNodeName */ public void simRemoveReplica(String nodeId, String coreNodeName) throws Exception { - List replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>()); lock.lockInterruptibly(); + List replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>()); try { for (int i = 0; i < replicas.size(); i++) { if (coreNodeName.equals(replicas.get(i).getName())) { @@ -572,7 +586,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { }); } - private void simRunLeaderElection(String collection, Slice s, boolean saveClusterState) throws Exception { + private void simRunLeaderElection(String collection, Slice s, boolean saveState) throws Exception { AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE); Replica leader = s.getLeader(); if (leader == null || !liveNodes.contains(leader.getNodeName())) { @@ -636,8 +650,9 @@ public class SimClusterStateProvider implements ClusterStateProvider { } else { LOG.trace("-- already has leader for {} / {}", collection, s.getName()); } - if (stateChanged.get()) { + if (stateChanged.get() || saveState) { collectionsStatesRef.set(null); + saveClusterState.set(true); } } @@ -654,6 +669,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { List nodeList = new ArrayList<>(); List shardNames = new ArrayList<>(); final String collectionName = props.getStr(NAME); + // always force getting fresh state + 
collectionsStatesRef.set(null); ClusterState clusterState = getClusterState(); ZkWriteCommand cmd = new ClusterStateMutator(cloudManager).createCollection(clusterState, props); if (cmd.noop) { @@ -758,12 +775,18 @@ public class SimClusterStateProvider implements ClusterStateProvider { if (cores == 0) { throw new RuntimeException("Unexpected value of 'cores' (" + cores + ") on node: " + n); } - cloudManager.getSimNodeStateProvider().simSetNodeValue(n, "cores", cores - 1); + try { + cloudManager.getSimNodeStateProvider().simSetNodeValue(n, "cores", cores - 1); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("interrupted"); + } } } } }); collectionsStatesRef.set(null); + saveClusterState.set(true); results.add("success", ""); } catch (Exception e) { LOG.warn("Exception", e); @@ -787,6 +810,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { values.put(ImplicitSnitch.DISK, 1000); }); collectionsStatesRef.set(null); + saveClusterState.set(true); } finally { lock.unlock(); } @@ -1057,7 +1081,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { } } - public synchronized void createSystemCollection() throws IOException { + public void createSystemCollection() throws IOException { try { if (simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) { return; @@ -1065,7 +1089,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { ZkNodeProps props = new ZkNodeProps( NAME, CollectionAdminParams.SYSTEM_COLL, REPLICATION_FACTOR, "1", - OverseerCollectionMessageHandler.NUM_SLICES, "1" + OverseerCollectionMessageHandler.NUM_SLICES, "1", + CommonAdminParams.WAIT_FOR_FINAL_STATE, "true" ); simCreateCollection(props, new NamedList()); } catch (Exception e) { @@ -1389,7 +1414,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { }); }); if (infos.isEmpty()) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection " + collection + " doesn't exist."); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection " + collection + " doesn't exist (shard=" + shard + ")."); } if (divide && value != null && (value instanceof Number)) { if ((value instanceof Long) || (value instanceof Integer)) { @@ -1455,6 +1480,9 @@ public class SimClusterStateProvider implements ClusterStateProvider { nodeReplicaMap.forEach((n, replicas) -> { replicas.forEach(ri -> collections.add(ri.getCollection())); }); + // check collProps and sliceProps too + collProperties.forEach((coll, props) -> collections.add(coll)); + sliceProperties.forEach((coll, slices) -> collections.add(coll)); return new ArrayList<>(collections); } finally { lock.unlock(); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java index b9169eb2263..cb8640c155e 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java @@ -29,6 +29,7 @@ import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; import org.apache.solr.client.solrj.cloud.NodeStateProvider; @@ -50,6 +51,7 @@ public class SimNodeStateProvider implements NodeStateProvider { private final 
SimClusterStateProvider clusterStateProvider; private final SimDistribStateManager stateManager; private final LiveNodesSet liveNodesSet; + private final ReentrantLock lock = new ReentrantLock(); public SimNodeStateProvider(LiveNodesSet liveNodesSet, SimDistribStateManager stateManager, SimClusterStateProvider clusterStateProvider, @@ -84,14 +86,19 @@ public class SimNodeStateProvider implements NodeStateProvider { * @param node node id * @param values values. */ - public void simSetNodeValues(String node, Map values) { - Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); - existing.clear(); - if (values != null) { - existing.putAll(values); - } - if (values == null || values.isEmpty() || values.containsKey("nodeRole")) { - saveRoles(); + public void simSetNodeValues(String node, Map values) throws InterruptedException { + lock.lockInterruptibly(); + try { + Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); + existing.clear(); + if (values != null) { + existing.putAll(values); + } + if (values == null || values.isEmpty() || values.containsKey("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -102,15 +109,20 @@ public class SimNodeStateProvider implements NodeStateProvider { * @param key property name * @param value property value */ - public void simSetNodeValue(String node, String key, Object value) { - Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); - if (value == null) { - existing.remove(key); - } else { - existing.put(key, value); - } - if (key.equals("nodeRole")) { - saveRoles(); + public void simSetNodeValue(String node, String key, Object value) throws InterruptedException { + lock.lockInterruptibly(); + try { + Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); + if (value == null) { + existing.remove(key); + } else { + existing.put(key, value); + } + if (key.equals("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -121,21 +133,26 @@ public class SimNodeStateProvider implements NodeStateProvider { * @param key property name * @param value property value. */ - public void simAddNodeValue(String node, String key, Object value) { - Map values = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); - Object existing = values.get(key); - if (existing == null) { - values.put(key, value); - } else if (existing instanceof Set) { - ((Set)existing).add(value); - } else { - Set vals = new HashSet<>(); - vals.add(existing); - vals.add(value); - values.put(key, vals); - } - if (key.equals("nodeRole")) { - saveRoles(); + public void simAddNodeValue(String node, String key, Object value) throws InterruptedException { + lock.lockInterruptibly(); + try { + Map values = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); + Object existing = values.get(key); + if (existing == null) { + values.put(key, value); + } else if (existing instanceof Set) { + ((Set)existing).add(value); + } else { + Set vals = new HashSet<>(); + vals.add(existing); + vals.add(value); + values.put(key, vals); + } + if (key.equals("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -144,10 +161,16 @@ public class SimNodeStateProvider implements NodeStateProvider { * /roles.json is updated. 
* @param node node id */ - public void simRemoveNodeValues(String node) { - Map values = nodeValues.remove(node); - if (values != null && values.containsKey("nodeRole")) { - saveRoles(); + public void simRemoveNodeValues(String node) throws InterruptedException { + LOG.debug("--removing value for " + node); + lock.lockInterruptibly(); + try { + Map values = nodeValues.remove(node); + if (values != null && values.containsKey("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -155,19 +178,24 @@ public class SimNodeStateProvider implements NodeStateProvider { * Remove values that correspond to dead nodes. If values contained a 'nodeRole' * key then /roles.json is updated. */ - public void simRemoveDeadNodes() { + public void simRemoveDeadNodes() throws InterruptedException { Set myNodes = new HashSet<>(nodeValues.keySet()); myNodes.removeAll(liveNodesSet.get()); - AtomicBoolean updateRoles = new AtomicBoolean(false); - myNodes.forEach(n -> { - LOG.debug("- removing dead node values: " + n); - Map vals = nodeValues.remove(n); - if (vals.containsKey("nodeRole")) { - updateRoles.set(true); + lock.lockInterruptibly(); + try { + AtomicBoolean updateRoles = new AtomicBoolean(false); + myNodes.forEach(n -> { + LOG.debug("- removing dead node values: " + n); + Map vals = nodeValues.remove(n); + if (vals.containsKey("nodeRole")) { + updateRoles.set(true); + } + }); + if (updateRoles.get()) { + saveRoles(); } - }); - if (updateRoles.get()) { - saveRoles(); + } finally { + lock.unlock(); } } @@ -187,7 +215,7 @@ public class SimNodeStateProvider implements NodeStateProvider { return nodeValues; } - private synchronized void saveRoles() { + private void saveRoles() { final Map> roles = new HashMap<>(); nodeValues.forEach((n, values) -> { String nodeRole = (String)values.get("nodeRole"); @@ -211,6 +239,9 @@ public class SimNodeStateProvider implements NodeStateProvider { * @return map of metrics names / values */ public Map getReplicaMetricsValues(String node, Collection tags) { + if (!liveNodesSet.contains(node)) { + throw new RuntimeException("non-live node " + node); + } List replicas = clusterStateProvider.simGetReplicaInfos(node); if (replicas == null || replicas.isEmpty()) { return Collections.emptyMap(); @@ -258,8 +289,7 @@ public class SimNodeStateProvider implements NodeStateProvider { public Map getNodeValues(String node, Collection tags) { LOG.trace("-- requested values for " + node + ": " + tags); if (!liveNodesSet.contains(node)) { - nodeValues.remove(node); - return Collections.emptyMap(); + throw new RuntimeException("non-live node " + node); } if (tags.isEmpty()) { return Collections.emptyMap(); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java index 757e2975cd9..e83f72f5712 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java @@ -84,6 +84,10 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 { // clear any persisted configuration cluster.getDistribStateManager().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(new ZkNodeProps()), -1); cluster.getDistribStateManager().setData(ZkStateReader.ROLES, Utils.toJSON(new HashMap<>()), -1); + cluster.getSimClusterStateProvider().simDeleteAllCollections(); + cluster.simClearSystemCollection(); + cluster.getSimNodeStateProvider().simRemoveDeadNodes(); + 
cluster.getSimClusterStateProvider().simRemoveDeadNodes(); // restore the expected number of nodes int currentSize = cluster.getLiveNodesSet().size(); if (currentSize < clusterNodeCount) { @@ -99,10 +103,6 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 { removeChildren(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH); removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH); removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH); - cluster.getSimClusterStateProvider().simDeleteAllCollections(); - cluster.simClearSystemCollection(); - // clear any dead nodes - cluster.getSimNodeStateProvider().simRemoveDeadNodes(); cluster.getSimClusterStateProvider().simResetLeaderThrottles(); cluster.simRestartOverseer(null); cluster.getTimeSource().sleep(5000); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java index 71106452ffb..e395985d027 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java @@ -109,7 +109,11 @@ public class TestClusterStateProvider extends SolrCloudTestCase { simCloudManager.getSimClusterStateProvider().simSetClusterProperties(clusterProperties); simCloudManager.getSimDistribStateManager().simSetAutoScalingConfig(autoScalingConfig); nodeValues.forEach((n, values) -> { - simCloudManager.getSimNodeStateProvider().simSetNodeValues(n, values); + try { + simCloudManager.getSimNodeStateProvider().simSetNodeValues(n, values); + } catch (InterruptedException e) { + fail("Interrupted:" + e); + } }); simCloudManager.getSimClusterStateProvider().simSetClusterState(realState); ClusterState simState = simCloudManager.getClusterStateProvider().getClusterState(); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java index 6d53363a078..6e6b4aa3bd8 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java @@ -33,7 +33,6 @@ import java.util.concurrent.atomic.AtomicInteger; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; -import org.apache.lucene.util.LuceneTestCase; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig; @@ -54,6 +53,7 @@ import org.apache.solr.cloud.autoscaling.CapturedEvent; import org.apache.solr.cloud.autoscaling.TriggerValidationException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.Pair; @@ -74,7 +74,7 @@ import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAut @TimeoutSuite(millis = 4 * 3600 * 1000) @LogLevel("org.apache.solr.cloud.autoscaling=DEBUG") @ThreadLeakLingering(linger = 20000) // ComputePlanAction may take significant time to complete -@LuceneTestCase.BadApple(bugUrl = 
"https://issues.apache.org/jira/browse/SOLR-12075") +//@LuceneTestCase.BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12075") public class TestLargeCluster extends SimSolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -94,7 +94,6 @@ public class TestLargeCluster extends SimSolrCloudTestCase { @Before public void setupTest() throws Exception { - waitForSeconds = 5; triggerFiredCount.set(0); triggerFiredLatch = new CountDownLatch(1); @@ -107,6 +106,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); NamedList response = solrClient.request(req); assertEquals(response.get("result").toString(), "success"); + + // do this in advance if missing + if (!cluster.getSimClusterStateProvider().simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) { + cluster.getSimClusterStateProvider().createSystemCollection(); + CloudTestUtils.waitForState(cluster, CollectionAdminParams.SYSTEM_COLL, 120, TimeUnit.SECONDS, + CloudTestUtils.clusterShape(1, 1)); + } } public static class TestTriggerListener extends TriggerListenerBase { @@ -520,8 +526,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } @Test - // JIRA closed 24-Feb-2018. Still apparently a problem. - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11714") + //@BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11714") public void testSearchRate() throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); String collectionName = "testSearchRate"; @@ -575,7 +580,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { assertEquals(response.get("result").toString(), "success"); - boolean await = triggerFiredLatch.await(40000 / SPEED, TimeUnit.MILLISECONDS); + boolean await = triggerFiredLatch.await(waitForSeconds * 20000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("The trigger did not fire at all", await); // wait for listener to capture the SUCCEEDED stage cluster.getTimeSource().sleep(2000); diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java index fb01cc5e962..60ff0c929be 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java @@ -383,11 +383,10 @@ public class Policy implements MapWriter { return p.compare(r1, r2, false); }); } catch (Exception e) { - LOG.error("Exception! prefs = {}, recent r1 = {}, r2 = {}, compare : {} matrix = {}", + LOG.error("Exception! prefs = {}, recent r1 = {}, r2 = {}, matrix = {}", clusterPreferences, - lastComparison[0].node, - lastComparison[1].node, - p.compare(lastComparison[0],lastComparison[1], false ), + lastComparison[0], + lastComparison[1], Utils.toJSONString(Utils.getDeepCopy(tmpMatrix, 6, false))); throw e; } From fe83838ec3768f25964a04510cd10772cf034d34 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sat, 26 May 2018 21:35:46 +0200 Subject: [PATCH 10/38] LUCENE-8341: Record soft deletes in SegmentCommitInfo This change add the number of documents that are soft deletes but not hard deleted to the segment commit info. This is the last step towards making soft deletes as powerful as hard deltes since now the number of document can be read from commit points without opening a full blown reader. 
This also allows merge posliies to make decisions without requiring an NRT reader to get the relevant statistics. This change doesn't enforce any field to be used as soft deletes and the statistic is maintained per segment. --- .../lucene/index/BinaryDocValuesWriter.java | 5 ++ .../lucene/index/BufferedUpdatesStream.java | 2 +- .../org/apache/lucene/index/CheckIndex.java | 27 +++++++- .../lucene/index/DefaultIndexingChain.java | 16 +++++ .../org/apache/lucene/index/DocConsumer.java | 9 +++ .../apache/lucene/index/DocValuesWriter.java | 3 + .../index/DocumentsWriterPerThread.java | 19 +++++- .../org/apache/lucene/index/IndexWriter.java | 39 +++++++---- .../lucene/index/NumericDocValuesWriter.java | 5 ++ .../apache/lucene/index/PendingDeletes.java | 8 ++- .../lucene/index/PendingSoftDeletes.java | 42 ++++++++++-- .../org/apache/lucene/index/ReaderPool.java | 4 +- .../lucene/index/ReadersAndUpdates.java | 6 -- .../lucene/index/SegmentCommitInfo.java | 34 +++++++++- .../org/apache/lucene/index/SegmentInfos.java | 19 +++++- .../apache/lucene/index/SegmentMerger.java | 1 + .../lucene/index/SegmentWriteState.java | 4 +- .../SoftDeletesDirectoryReaderWrapper.java | 68 ++++++++++++++++++- .../SoftDeletesRetentionMergePolicy.java | 2 +- .../lucene/index/SortedDocValuesWriter.java | 5 ++ .../index/SortedNumericDocValuesWriter.java | 5 ++ .../index/SortedSetDocValuesWriter.java | 4 ++ .../apache/lucene/index/TestAddIndexes.java | 46 +++++++++++++ .../test/org/apache/lucene/index/TestDoc.java | 2 +- .../apache/lucene/index/TestIndexWriter.java | 20 +++++- .../TestIndexWriterThreadsToSegments.java | 2 +- .../TestOneMergeWrappingMergePolicy.java | 2 +- .../lucene/index/TestPendingDeletes.java | 6 +- .../lucene/index/TestPendingSoftDeletes.java | 65 +++++++++++++++--- .../apache/lucene/index/TestSegmentInfos.java | 6 +- .../lucene/index/TestSegmentMerger.java | 2 +- ...TestSoftDeletesDirectoryReaderWrapper.java | 3 +- .../apache/lucene/index/IndexSplitter.java | 2 +- .../index/BaseLiveDocsFormatTestCase.java | 4 +- .../lucene/index/BaseMergePolicyTestCase.java | 2 +- 35 files changed, 418 insertions(+), 71 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 2701a622a3f..1aeab4c9802 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -200,4 +200,9 @@ class BinaryDocValuesWriter extends DocValuesWriter { return value.get(); } } + + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java b/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java index dcc8bbfb985..91e590c3171 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java +++ b/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java @@ -259,10 +259,10 @@ final class BufferedUpdatesStream implements Accountable { SegmentState(ReadersAndUpdates rld, IOUtils.IOConsumer onClose, SegmentCommitInfo info) throws IOException { this.rld = rld; + reader = rld.getReader(IOContext.READ); startDelCount = rld.getDelCount(); delGen = info.getBufferedDeletesGen(); this.onClose = onClose; - reader = rld.getReader(IOContext.READ); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java 
b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 0f7871acd23..9a4e3e50d93 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -46,6 +46,7 @@ import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -411,7 +412,7 @@ public final class CheckIndex implements Closeable { * that would otherwise be more complicated to debug if they had to close the writer * for each check. */ - public CheckIndex(Directory dir, Lock writeLock) throws IOException { + public CheckIndex(Directory dir, Lock writeLock) { this.dir = dir; this.writeLock = writeLock; this.infoStream = null; @@ -781,7 +782,10 @@ public final class CheckIndex implements Closeable { throw new RuntimeException("Points test failed"); } } - + final String softDeletesField = reader.getFieldInfos().getSoftDeletesField(); + if (softDeletesField != null) { + checkSoftDeletes(softDeletesField, info, reader, infoStream, failFast); + } msg(infoStream, ""); if (verbose) { @@ -3049,6 +3053,25 @@ public final class CheckIndex implements Closeable { } } + private static void checkSoftDeletes(String softDeletesField, SegmentCommitInfo info, SegmentReader reader, PrintStream infoStream, boolean failFast) throws IOException { + if (infoStream != null) + infoStream.print(" test: check soft deletes....."); + try { + int softDeletes = PendingSoftDeletes.countSoftDeletes(DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(softDeletesField, reader), reader.getLiveDocs()); + if (softDeletes != info.getSoftDelCount()) { + throw new RuntimeException("actual soft deletes: " + softDeletes + " but expected: " +info.getSoftDelCount()); + } + } catch (Exception e) { + if (failFast) { + throw IOUtils.rethrowAlways(e); + } + msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]"); + if (infoStream != null) { + e.printStackTrace(infoStream); + } + } + } + private static double nsToSec(long ns) { return ns/1000000000.0; } diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java index 705d7bc6de4..e55251696e9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java @@ -36,6 +36,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsWriter; import org.apache.lucene.document.FieldType; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.similarities.Similarity; @@ -841,4 +842,19 @@ final class DefaultIndexingChain extends DocConsumer { } } } + + @Override + DocIdSetIterator getHasDocValues(String field) { + PerField perField = getPerField(field); + if (perField != null) { + if (perField.docValuesWriter != null) { + if (perField.fieldInfo.getDocValuesType() == DocValuesType.NONE) { + return null; + } + + return perField.docValuesWriter.getDocIdSet(); + } + } + return null; + } } diff --git 
a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java index a64f13c5ba2..d124434a5f9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java @@ -19,8 +19,17 @@ package org.apache.lucene.index; import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; + abstract class DocConsumer { abstract void processDocument() throws IOException; abstract Sorter.DocMap flush(final SegmentWriteState state) throws IOException; abstract void abort() throws IOException; + + /** + * Returns a {@link DocIdSetIterator} for the given field or null if the field doesn't have + * doc values. + */ + abstract DocIdSetIterator getHasDocValues(String field); + } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java index 9dde81728f2..b739b14a2a7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java @@ -20,10 +20,13 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.SortField; abstract class DocValuesWriter { abstract void finish(int numDoc); abstract void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer consumer) throws IOException; abstract Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException; + abstract DocIdSetIterator getDocIdSet(); + } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 094afc5a568..c8ebc4db491 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -28,6 +28,7 @@ import java.util.concurrent.atomic.AtomicLong; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FlushInfo; @@ -460,14 +461,27 @@ final class DocumentsWriterPerThread { } final Sorter.DocMap sortMap; try { + DocIdSetIterator softDeletedDocs; + if (indexWriterConfig.getSoftDeletesField() != null) { + softDeletedDocs = consumer.getHasDocValues(indexWriterConfig.getSoftDeletesField()); + } else { + softDeletedDocs = null; + } sortMap = consumer.flush(flushState); + if (softDeletedDocs == null) { + flushState.softDelCountOnFlush = 0; + } else { + flushState.softDelCountOnFlush = PendingSoftDeletes.countSoftDeletes(softDeletedDocs, flushState.liveDocs); + assert flushState.segmentInfo.maxDoc() >= flushState.softDelCountOnFlush + flushState.delCountOnFlush; + } // We clear this here because we already resolved them (private to this segment) when writing postings: pendingUpdates.clearDeleteTerms(); segmentInfo.setFiles(new HashSet<>(directory.getCreatedFiles())); - final SegmentCommitInfo segmentInfoPerCommit = new SegmentCommitInfo(segmentInfo, 0, -1L, -1L, -1L); + final SegmentCommitInfo segmentInfoPerCommit = new SegmentCommitInfo(segmentInfo, 0, 
flushState.softDelCountOnFlush, -1L, -1L, -1L); if (infoStream.isEnabled("DWPT")) { infoStream.message("DWPT", "new segment has " + (flushState.liveDocs == null ? 0 : flushState.delCountOnFlush) + " deleted docs"); + infoStream.message("DWPT", "new segment has " + flushState.softDelCountOnFlush + " soft-deleted docs"); infoStream.message("DWPT", "new segment has " + (flushState.fieldInfos.hasVectors() ? "vectors" : "no vectors") + "; " + (flushState.fieldInfos.hasNorms() ? "norms" : "no norms") + "; " + @@ -497,8 +511,7 @@ final class DocumentsWriterPerThread { assert segmentInfo != null; FlushedSegment fs = new FlushedSegment(infoStream, segmentInfoPerCommit, flushState.fieldInfos, - segmentDeletes, flushState.liveDocs, flushState.delCountOnFlush, - sortMap); + segmentDeletes, flushState.liveDocs, flushState.delCountOnFlush, sortMap); sealFlushedSegment(fs, sortMap, flushNotifications); if (infoStream.isEnabled("DWPT")) { infoStream.message("DWPT", "flush time " + ((System.nanoTime() - t0) / 1000000.0) + " msec"); diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 5efba70ad01..037ff7230b6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -49,6 +49,7 @@ import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate; import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate; import org.apache.lucene.index.FieldInfos.FieldNumbers; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; @@ -347,6 +348,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * much like how hotels place an "authorization hold" on your credit * card to make sure they can later charge you when you check out. */ final AtomicLong pendingNumDocs = new AtomicLong(); + final boolean softDeletesEnabled; private final DocumentsWriter.FlushNotifications flushNotifications = new DocumentsWriter.FlushNotifications() { @Override @@ -639,7 +641,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (rld != null) { return rld.getDelCount(); // get the full count from here since SCI might change concurrently } else { - int delCount = info.getDelCount(); + final int delCount = info.getDelCount(softDeletesEnabled); assert delCount <= info.info.maxDoc(): "delCount: " + delCount + " maxDoc: " + info.info.maxDoc(); return delCount; } @@ -703,7 +705,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, conf.setIndexWriter(this); // prevent reuse by other instances config = conf; infoStream = config.getInfoStream(); - + softDeletesEnabled = config.getSoftDeletesField() != null; // obtain the write.lock. If the user configured a timeout, // we wrap with a sleeper and this might take some time. 
writeLock = d.obtainLock(WRITE_LOCK_NAME); @@ -1154,7 +1156,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (docWriter.anyDeletions()) { return true; } - if (readerPool.anyPendingDeletes()) { + if (readerPool.anyDeletions()) { return true; } for (final SegmentCommitInfo info : segmentInfos) { @@ -2939,11 +2941,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, // long so we can detect int overflow: long numDocs = 0; - - Sort indexSort = config.getIndexSort(); - long seqNo; - try { if (infoStream.isEnabled("IW")) { infoStream.message("IW", "flush at addIndexes(CodecReader...)"); @@ -2951,10 +2949,15 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, flush(false, true); String mergedName = newSegmentName(); - + int numSoftDeleted = 0; for (CodecReader leaf : readers) { numDocs += leaf.numDocs(); validateMergeReader(leaf); + if (softDeletesEnabled) { + Bits liveDocs = leaf.getLiveDocs(); + numSoftDeleted += PendingSoftDeletes.countSoftDeletes( + DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(config.getSoftDeletesField(), leaf), liveDocs); + } } // Best-effort up front check: @@ -2979,8 +2982,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, } merger.merge(); // merge 'em - - SegmentCommitInfo infoPerCommit = new SegmentCommitInfo(info, 0, -1L, -1L, -1L); + SegmentCommitInfo infoPerCommit = new SegmentCommitInfo(info, 0, numSoftDeleted, -1L, -1L, -1L); info.setFiles(new HashSet<>(trackingDir.getCreatedFiles())); trackingDir.clearCreatedFiles(); @@ -3057,7 +3059,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, SegmentInfo newInfo = new SegmentInfo(directoryOrig, info.info.getVersion(), info.info.getMinVersion(), segName, info.info.maxDoc(), info.info.getUseCompoundFile(), info.info.getCodec(), info.info.getDiagnostics(), info.info.getId(), info.info.getAttributes(), info.info.getIndexSort()); - SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo, info.getDelCount(), info.getDelGen(), + SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo, info.getDelCount(), info.getSoftDelCount(), info.getDelGen(), info.getFieldInfosGen(), info.getDocValuesGen()); newInfo.setFiles(info.info.files()); @@ -4249,7 +4251,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, details.put("mergeMaxNumSegments", "" + merge.maxNumSegments); details.put("mergeFactor", Integer.toString(merge.segments.size())); setDiagnostics(si, SOURCE_MERGE, details); - merge.setMergeInfo(new SegmentCommitInfo(si, 0, -1L, -1L, -1L)); + merge.setMergeInfo(new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L)); if (infoStream.isEnabled("IW")) { infoStream.message("IW", "merge seg=" + merge.info.info.name + " " + segString(merge.segments)); @@ -4373,16 +4375,25 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, // Let the merge wrap readers List mergeReaders = new ArrayList<>(); + int numSoftDeleted = 0; for (SegmentReader reader : merge.readers) { CodecReader wrappedReader = merge.wrapForMerge(reader); validateMergeReader(wrappedReader); mergeReaders.add(wrappedReader); + if (softDeletesEnabled) { + if (reader != wrappedReader) { // if we don't have a wrapped reader we won't preserve any soft-deletes + Bits liveDocs = wrappedReader.getLiveDocs(); + numSoftDeleted += PendingSoftDeletes.countSoftDeletes( + DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(config.getSoftDeletesField(), 
wrappedReader), + liveDocs); + } + } } final SegmentMerger merger = new SegmentMerger(mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context); - + merge.info.setSoftDelCount(numSoftDeleted); merge.checkAborted(); merge.mergeStartNS = System.nanoTime(); @@ -4604,7 +4615,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * * @lucene.internal */ private synchronized String segString(SegmentCommitInfo info) { - return info.toString(numDeletedDocs(info) - info.getDelCount()); + return info.toString(numDeletedDocs(info) - info.getDelCount(softDeletesEnabled)); } private synchronized void doWait() { diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java index 0a58f0d5021..980849fb58c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java @@ -82,6 +82,11 @@ class NumericDocValuesWriter extends DocValuesWriter { return Sorter.getDocComparator(maxDoc, sortField, () -> null, () -> docValues); } + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } + static SortingLeafReader.CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException { FixedBitSet docsWithField = new FixedBitSet(maxDoc); long[] values = new long[maxDoc]; diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java index f19b05391d3..4ab037c9501 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java @@ -220,7 +220,7 @@ class PendingDeletes { * Returns true iff the segment represented by this {@link PendingDeletes} is fully deleted */ boolean isFullyDeleted(IOSupplier readerIOSupplier) throws IOException { - return info.getDelCount() + numPendingDeletes() == info.info.maxDoc(); + return getDelCount() == info.info.maxDoc(); } /** @@ -246,7 +246,8 @@ class PendingDeletes { * Returns the number of deleted docs in the segment. 
*/ final int getDelCount() { - return info.getDelCount() + numPendingDeletes(); + int delCount = info.getDelCount() + info.getSoftDelCount() + numPendingDeletes(); + return delCount; } /** @@ -270,7 +271,8 @@ class PendingDeletes { count = info.info.maxDoc(); } assert numDocs() == count: "info.maxDoc=" + info.info.maxDoc() + " info.getDelCount()=" + info.getDelCount() + - " pendingDeletes=" + toString() + " count=" + count; + " info.getSoftDelCount()=" + info.getSoftDelCount() + + " pendingDeletes=" + toString() + " count=" + count + " numDocs: " + numDocs(); assert reader.numDocs() == numDocs() : "reader.numDocs() = " + reader.numDocs() + " numDocs() " + numDocs(); assert reader.numDeletedDocs() <= info.info.maxDoc(): "delCount=" + reader.numDeletedDocs() + " info.maxDoc=" + info.info.maxDoc() + " rld.pendingDeleteCount=" + numPendingDeletes() + diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java index 1c32e4fa92e..4074903a363 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java @@ -58,7 +58,7 @@ final class PendingSoftDeletes extends PendingDeletes { } else { // if it was deleted subtract the delCount pendingDeleteCount--; - assert pendingDeleteCount >= 0 : " illegal pending delete count: " + pendingDeleteCount; + assert assertPendingDeletes(); } return true; } @@ -76,11 +76,15 @@ final class PendingSoftDeletes extends PendingDeletes { hardDeletes.onNewReader(reader, info); if (dvGeneration < info.getDocValuesGen()) { // only re-calculate this if we haven't seen this generation final DocIdSetIterator iterator = DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); + int newDelCount; if (iterator != null) { // nothing is deleted we don't have a soft deletes field in this segment assert info.info.maxDoc() > 0 : "maxDoc is 0"; - pendingDeleteCount += applySoftDeletes(iterator, getMutableBits()); - assert pendingDeleteCount >= 0 : " illegal pending delete count: " + pendingDeleteCount; + newDelCount = applySoftDeletes(iterator, getMutableBits()); + assert newDelCount >= 0 : " illegal pending delete count: " + newDelCount; + } else { + newDelCount = 0; } + assert info.getSoftDelCount() == newDelCount : "softDeleteCount doesn't match " + info.getSoftDelCount() + " != " + newDelCount; dvGeneration = info.getDocValuesGen(); } assert getDelCount() <= info.info.maxDoc() : getDelCount() + " > " + info.info.maxDoc(); @@ -88,8 +92,15 @@ final class PendingSoftDeletes extends PendingDeletes { @Override boolean writeLiveDocs(Directory dir) throws IOException { + // we need to set this here to make sure our stats in SCI are up-to-date otherwise we might hit an assertion + // when the hard deletes are set since we need to account for docs that used to be only soft-delete but now hard-deleted + this.info.setSoftDelCount(this.info.getSoftDelCount() + pendingDeleteCount); + super.dropChanges(); // delegate the write to the hard deletes - it will only write if somebody used it. 
- return hardDeletes.writeLiveDocs(dir); + if (hardDeletes.writeLiveDocs(dir)) { + return true; + } + return false; } @Override @@ -134,13 +145,21 @@ final class PendingSoftDeletes extends PendingDeletes { void onDocValuesUpdate(FieldInfo info, DocValuesFieldUpdates.Iterator iterator) throws IOException { if (this.field.equals(info.name)) { pendingDeleteCount += applySoftDeletes(iterator, getMutableBits()); - assert pendingDeleteCount >= 0 : " illegal pending delete count: " + pendingDeleteCount; + assert assertPendingDeletes(); assert dvGeneration < info.getDocValuesGen() : "we have seen this generation update already: " + dvGeneration + " vs. " + info.getDocValuesGen(); assert dvGeneration != -2 : "docValues generation is still uninitialized"; dvGeneration = info.getDocValuesGen(); + this.info.setSoftDelCount(this.info.getSoftDelCount() + pendingDeleteCount); + super.dropChanges(); } } + private boolean assertPendingDeletes() { + assert pendingDeleteCount + info.getSoftDelCount() >= 0 : " illegal pending delete count: " + pendingDeleteCount + info.getSoftDelCount(); + assert info.info.maxDoc() >= getDelCount(); + return true; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -210,4 +229,17 @@ final class PendingSoftDeletes extends PendingDeletes { Bits getHardLiveDocs() { return hardDeletes.getLiveDocs(); } + + static int countSoftDeletes(DocIdSetIterator softDeletedDocs, Bits hardDeletes) throws IOException { + int count = 0; + if (softDeletedDocs != null) { + int doc; + while ((doc = softDeletedDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (hardDeletes == null || hardDeletes.get(doc)) { + count++; + } + } + } + return count; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java index 45f58a602ca..5f62c3724d6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java @@ -130,9 +130,9 @@ final class ReaderPool implements Closeable { /** * Returns true iff any of the buffered readers and updates has at least one pending delete */ - synchronized boolean anyPendingDeletes() { + synchronized boolean anyDeletions() { for(ReadersAndUpdates rld : readerMap.values()) { - if (rld.anyPendingDeletes()) { + if (rld.getDelCount() > 0) { return true; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 710b74876cd..3453447ecce 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -406,10 +406,6 @@ final class ReadersAndUpdates { } } - synchronized boolean anyPendingDeletes() { - return pendingDeletes.numPendingDeletes() != 0; - } - /** * This class merges the current on-disk DV with an incoming update DV instance and merges the two instances * giving the incoming update precedence in terms of values, in other words the values of the update always @@ -713,8 +709,6 @@ final class ReadersAndUpdates { reader = createNewReaderWithLatestLiveDocs(reader); } assert pendingDeletes.verifyDocCounts(reader); - - return new MergeReader(reader, pendingDeletes.getHardLiveDocs()); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java index 661283b4203..954a1382a48 100644 --- 
a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java @@ -38,6 +38,9 @@ public class SegmentCommitInfo { // How many deleted docs in the segment: private int delCount; + // How many soft-deleted docs in the segment that are not also hard-deleted: + private int softDelCount; + // Generation number of the live docs file (-1 if there // are no deletes yet): private long delGen; @@ -73,7 +76,7 @@ public class SegmentCommitInfo { // NOTE: only used in-RAM by IW to track buffered deletes; // this is never written to/read from the Directory private long bufferedDeletesGen = -1; - + /** * Sole constructor. * @@ -88,9 +91,10 @@ public class SegmentCommitInfo { * @param docValuesGen * DocValues generation number (used to name doc-values updates files) */ - public SegmentCommitInfo(SegmentInfo info, int delCount, long delGen, long fieldInfosGen, long docValuesGen) { + public SegmentCommitInfo(SegmentInfo info, int delCount, int softDelCount, long delGen, long fieldInfosGen, long docValuesGen) { this.info = info; this.delCount = delCount; + this.softDelCount = softDelCount; this.delGen = delGen; this.nextWriteDelGen = delGen == -1 ? 1 : delGen + 1; this.fieldInfosGen = fieldInfosGen; @@ -313,13 +317,29 @@ public class SegmentCommitInfo { return delCount; } + /** + * Returns the number of only soft-deleted docs. + */ + public int getSoftDelCount() { + return softDelCount; + } + void setDelCount(int delCount) { if (delCount < 0 || delCount > info.maxDoc()) { throw new IllegalArgumentException("invalid delCount=" + delCount + " (maxDoc=" + info.maxDoc() + ")"); } + assert softDelCount + delCount <= info.maxDoc(); this.delCount = delCount; } + void setSoftDelCount(int softDelCount) { + if (softDelCount < 0 || softDelCount > info.maxDoc()) { + throw new IllegalArgumentException("invalid softDelCount=" + softDelCount + " (maxDoc=" + info.maxDoc() + ")"); + } + assert softDelCount + delCount <= info.maxDoc(); + this.softDelCount = softDelCount; + } + /** Returns a description of this segment. */ public String toString(int pendingDelCount) { String s = info.toString(delCount + pendingDelCount); @@ -332,6 +352,10 @@ public class SegmentCommitInfo { if (docValuesGen != -1) { s += ":dvGen=" + docValuesGen; } + if (softDelCount > 0) { + s += " :softDel=" + softDelCount; + } + return s; } @@ -342,7 +366,7 @@ public class SegmentCommitInfo { @Override public SegmentCommitInfo clone() { - SegmentCommitInfo other = new SegmentCommitInfo(info, delCount, delGen, fieldInfosGen, docValuesGen); + SegmentCommitInfo other = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, docValuesGen); // Not clear that we need to carry over nextWriteDelGen // (i.e. do we ever clone after a failed write and // before the next successful write?), but just do it to @@ -360,4 +384,8 @@ public class SegmentCommitInfo { return other; } + + final int getDelCount(boolean includeSoftDeletes) { + return includeSoftDeletes ? getDelCount() + getSoftDelCount() : getDelCount(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java index ec88fef6bc2..5697eed3cd2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java @@ -122,8 +122,9 @@ public final class SegmentInfos implements Cloneable, Iterable VERSION_72 ? 
input.readInt() : 0; + if (softDelCount < 0 || softDelCount > info.maxDoc()) { + throw new CorruptIndexException("invalid deletion count: " + softDelCount + " vs maxDoc=" + info.maxDoc(), input); + } + if (softDelCount + delCount > info.maxDoc()) { + throw new CorruptIndexException("invalid deletion count: " + softDelCount + delCount + " vs maxDoc=" + info.maxDoc(), input); + } + SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, dvGen); siPerCommit.setFieldInfosFiles(input.readSetOfStrings()); final Map> dvUpdateFiles; final int numDVFields = input.readInt(); @@ -517,6 +525,11 @@ public final class SegmentInfos implements Cloneable, Iterable si.maxDoc()) { + throw new IllegalStateException("cannot write segment: invalid maxDoc segment=" + si.name + " maxDoc=" + si.maxDoc() + " softDelCount=" + softDelCount); + } + out.writeInt(softDelCount); out.writeSetOfStrings(siPerCommit.getFieldInfosFiles()); final Map> dvUpdatesFiles = siPerCommit.getDocValuesUpdatesFiles(); out.writeInt(dvUpdatesFiles.size()); diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index ad60a94298d..6554cc59da1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -71,6 +71,7 @@ final class SegmentMerger { if (minVersion.onOrAfter(leafMinVersion)) { minVersion = leafMinVersion; } + } assert segmentInfo.minVersion == null : "The min version should be set by SegmentMerger for merged segments"; segmentInfo.minVersion = minVersion; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java b/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java index f56970109e8..d00a19edf52 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java @@ -47,7 +47,9 @@ public class SegmentWriteState { /** Number of deleted documents set while flushing the * segment. */ public int delCountOnFlush; - + /** Number of only soft deleted documents set while flushing the + * segment. */ + public int softDelCountOnFlush; /** * Deletes and updates to apply while we are flushing the segment. 
A Term is * enrolled in here if it was deleted/updated at one point, and it's mapped to diff --git a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java index 36568f6d5f1..dc350115c19 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java @@ -62,6 +62,8 @@ public final class SoftDeletesDirectoryReaderWrapper extends FilterDirectoryRead // we try to reuse the life docs instances here if the reader cache key didn't change if (reader instanceof SoftDeletesFilterLeafReader && reader.getReaderCacheHelper() != null) { readerCache.put(((SoftDeletesFilterLeafReader) reader).reader.getReaderCacheHelper().getKey(), reader); + } else if (reader instanceof SoftDeletesFilterCodecReader && reader.getReaderCacheHelper() != null) { + readerCache.put(((SoftDeletesFilterCodecReader) reader).reader.getReaderCacheHelper().getKey(), reader); } } @@ -112,9 +114,35 @@ public final class SoftDeletesDirectoryReaderWrapper extends FilterDirectoryRead bits = new FixedBitSet(reader.maxDoc()); bits.set(0, reader.maxDoc()); } - int numDeletes = reader.numDeletedDocs() + PendingSoftDeletes.applySoftDeletes(iterator, bits); + int numSoftDeletes = PendingSoftDeletes.applySoftDeletes(iterator, bits); + int numDeletes = reader.numDeletedDocs() + numSoftDeletes; int numDocs = reader.maxDoc() - numDeletes; - return new SoftDeletesFilterLeafReader(reader, bits, numDocs); + assert assertDocCounts(numDocs, numSoftDeletes, reader); + return reader instanceof CodecReader ? new SoftDeletesFilterCodecReader((CodecReader) reader, bits, numDocs) + : new SoftDeletesFilterLeafReader(reader, bits, numDocs); + } + + private static boolean assertDocCounts(int expectedNumDocs, int numSoftDeletes, LeafReader reader) { + if (reader instanceof SegmentReader) { + SegmentReader segmentReader = (SegmentReader) reader; + SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); + if (segmentReader.isNRT == false) { + int numDocs = segmentInfo.info.maxDoc() - segmentInfo.getSoftDelCount() - segmentInfo.getDelCount(); + assert numDocs == expectedNumDocs : "numDocs: " + numDocs + " expected: " + expectedNumDocs + + " maxDoc: " + segmentInfo.info.maxDoc() + + " getDelCount: " + segmentInfo.getDelCount() + + " getSoftDelCount: " + segmentInfo.getSoftDelCount() + + " numSoftDeletes: " + numSoftDeletes + + " reader.numDeletedDocs(): " + reader.numDeletedDocs(); + } + // in the NRT case we don't have accurate numbers for getDelCount and getSoftDelCount since they might not be + // flushed to disk when this reader is opened. We don't necessarily flush deleted doc on reopen but + // we do for docValues. + + + } + + return true; } static final class SoftDeletesFilterLeafReader extends FilterLeafReader { @@ -153,6 +181,42 @@ public final class SoftDeletesDirectoryReaderWrapper extends FilterDirectoryRead } } + final static class SoftDeletesFilterCodecReader extends FilterCodecReader { + private final LeafReader reader; + private final FixedBitSet bits; + private final int numDocs; + private final CacheHelper readerCacheHelper; + + private SoftDeletesFilterCodecReader(CodecReader reader, FixedBitSet bits, int numDocs) { + super(reader); + this.reader = reader; + this.bits = bits; + this.numDocs = numDocs; + this.readerCacheHelper = reader.getReaderCacheHelper() == null ? 
null : + new DelegatingCacheHelper(reader.getReaderCacheHelper()); + } + + @Override + public Bits getLiveDocs() { + return bits; + } + + @Override + public int numDocs() { + return numDocs; + } + + @Override + public CacheHelper getCoreCacheHelper() { + return reader.getCoreCacheHelper(); + } + + @Override + public CacheHelper getReaderCacheHelper() { + return readerCacheHelper; + } + } + private static class DelegatingCacheHelper implements CacheHelper { private final CacheHelper delegate; private final CacheKey cacheKey = new CacheKey(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java index ad725ff02e0..515068c207a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java @@ -175,7 +175,7 @@ public final class SoftDeletesRetentionMergePolicy extends OneMergeWrappingMerge @Override public int numDeletesToMerge(SegmentCommitInfo info, int delCount, IOSupplier readerSupplier) throws IOException { final int numDeletesToMerge = super.numDeletesToMerge(info, delCount, readerSupplier); - if (numDeletesToMerge != 0) { + if (numDeletesToMerge != 0 && info.getSoftDelCount() > 0) { final CodecReader reader = readerSupplier.get(); if (reader.getLiveDocs() != null) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java index be7f4886588..86d0f0bab33 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java @@ -244,4 +244,9 @@ class SortedDocValuesWriter extends DocValuesWriter { return valueCount; } } + + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java index 8f58014f69c..bdc65cc8057 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java @@ -231,4 +231,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter { return docsWithField.cost(); } } + + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java index 22be7e50ba9..700090a48fd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java @@ -315,4 +315,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter { return scratch; } } + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 0df7ac8c74a..13073186abc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1410,4 +1410,50 @@ public class TestAddIndexes extends 
LuceneTestCase { dir1.close(); dir2.close(); } + + public void testAddIndicesWithSoftDeletes() throws IOException { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())).setSoftDeletesField("soft_delete"); + IndexWriter writer = new IndexWriter(dir1, iwc1); + for (int i = 0; i < 30; i++) { + Document doc = new Document(); + int docID = random().nextInt(5); + doc.add(new StringField("id", "" + docID, Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "" + docID), doc, new NumericDocValuesField("soft_delete", 1)); + if (random().nextBoolean()) { + writer.flush(); + } + } + writer.commit(); + writer.close(); + DirectoryReader reader = DirectoryReader.open(dir1); + DirectoryReader wrappedReader = new SoftDeletesDirectoryReaderWrapper(reader, "soft_delete"); + Directory dir2 = newDirectory(); + int numDocs = reader.numDocs(); + int maxDoc = reader.maxDoc(); + assertEquals(numDocs, maxDoc); + iwc1 = newIndexWriterConfig(new MockAnalyzer(random())).setSoftDeletesField("soft_delete"); + writer = new IndexWriter(dir2, iwc1); + CodecReader[] readers = new CodecReader[reader.leaves().size()]; + for (int i = 0; i < readers.length; i++) { + readers[i] = (CodecReader)reader.leaves().get(i).reader(); + } + writer.addIndexes(readers); + assertEquals(wrappedReader.numDocs(), writer.numDocs()); + assertEquals(maxDoc, writer.maxDoc()); + writer.commit(); + SegmentCommitInfo commitInfo = writer.segmentInfos.asList().get(0); + assertEquals(maxDoc-wrappedReader.numDocs(), commitInfo.getSoftDelCount()); + writer.close(); + Directory dir3 = newDirectory(); + iwc1 = newIndexWriterConfig(new MockAnalyzer(random())).setSoftDeletesField("soft_delete"); + writer = new IndexWriter(dir3, iwc1); + for (int i = 0; i < readers.length; i++) { + readers[i] = (CodecReader)wrappedReader.leaves().get(i).reader(); + } + writer.addIndexes(readers); + assertEquals(wrappedReader.numDocs(), writer.numDocs()); + assertEquals(wrappedReader.numDocs(), writer.maxDoc()); + IOUtils.close(reader, writer, dir3, dir2, dir1); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java index b3262588e65..d7eea7a1ad3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java @@ -238,7 +238,7 @@ public class TestDoc extends LuceneTestCase { } } - return new SegmentCommitInfo(si, 0, -1L, -1L, -1L); + return new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 8eac2fa4923..967055ed852 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -3137,7 +3137,11 @@ public class TestIndexWriter extends LuceneTestCase { searcher = new IndexSearcher(reader); topDocs = searcher.search(new TermQuery(new Term("id", "1")), 10); assertEquals(0, topDocs.totalHits); - + int numSoftDeleted = 0; + for (SegmentCommitInfo info : writer.segmentInfos) { + numSoftDeleted += info.getSoftDelCount(); + } + assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); writer.close(); reader.close(); dir.close(); @@ -3267,6 +3271,20 @@ public class TestIndexWriter extends LuceneTestCase { assertEquals(1, reader.docFreq(new Term("id", id))); } } + int numSoftDeleted = 0; + for (SegmentCommitInfo info : 
writer.segmentInfos) { + numSoftDeleted += info.getSoftDelCount() + info.getDelCount(); + } + assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); + writer.commit(); + try (DirectoryReader dirReader = DirectoryReader.open(dir)) { + int delCount = 0; + for (LeafReaderContext ctx : dirReader.leaves()) { + SegmentCommitInfo segmentInfo = ((SegmentReader) ctx.reader()).getSegmentInfo(); + delCount += segmentInfo.getSoftDelCount() + segmentInfo.getDelCount(); + } + assertEquals(numSoftDeleted, delCount); + } IOUtils.close(reader, writer, dir); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java index 359e7d02739..4339d3e5b86 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java @@ -331,7 +331,7 @@ public class TestIndexWriterThreadsToSegments extends LuceneTestCase { byte id[] = readSegmentInfoID(dir, fileName); SegmentInfo si = TestUtil.getDefaultCodec().segmentInfoFormat().read(dir, segName, id, IOContext.DEFAULT); si.setCodec(codec); - SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); SegmentReader sr = new SegmentReader(sci, Version.LATEST.major, IOContext.DEFAULT); try { thread0Count += sr.docFreq(new Term("field", "threadID0")); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java index 219c7770d83..e240f549ecd 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java @@ -137,7 +137,7 @@ public class TestOneMergeWrappingMergePolicy extends LuceneTestCase { Collections.emptyMap(), // attributes null /* indexSort */); final List segments = new LinkedList(); - segments.add(new SegmentCommitInfo(si, 0, 0, 0, 0)); + segments.add(new SegmentCommitInfo(si, 0, 0, 0, 0, 0)); ms.add(new MergePolicy.OneMerge(segments)); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java index ecc2d4de51e..d4530344adf 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java @@ -40,7 +40,7 @@ public class TestPendingDeletes extends LuceneTestCase { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "test", 10, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); PendingDeletes deletes = newPendingDeletes(commitInfo); assertNull(deletes.getLiveDocs()); int docToDelete = TestUtil.nextInt(random(), 0, 7); @@ -74,7 +74,7 @@ public class TestPendingDeletes extends LuceneTestCase { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "test", 6, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, 
-1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); PendingDeletes deletes = newPendingDeletes(commitInfo); assertFalse(deletes.writeLiveDocs(dir)); assertEquals(0, dir.listAll().length); @@ -131,7 +131,7 @@ public class TestPendingDeletes extends LuceneTestCase { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "test", 3, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]); si.getCodec().fieldInfosFormat().write(dir, si, "", fieldInfos, IOContext.DEFAULT); PendingDeletes deletes = newPendingDeletes(commitInfo); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 3047364781e..b6438552976 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -44,6 +44,45 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { return new PendingSoftDeletes("_soft_deletes", commitInfo); } + public void testHardDeleteSoftDeleted() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig() + .setSoftDeletesField("_soft_deletes") + // make sure all docs will end up in the same segment + .setMaxBufferedDocs(10) + .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)); + Document doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "1"), doc, + new NumericDocValuesField("_soft_deletes", 1)); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "2"), doc, + new NumericDocValuesField("_soft_deletes", 1)); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "2"), doc, + new NumericDocValuesField("_soft_deletes", 1)); + writer.commit(); + DirectoryReader reader = writer.getReader(); + assertEquals(1, reader.leaves().size()); + SegmentReader segmentReader = (SegmentReader) reader.leaves().get(0).reader(); + SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); + PendingSoftDeletes pendingSoftDeletes = newPendingDeletes(segmentInfo); + pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(1, pendingSoftDeletes.getDelCount()); + assertTrue(pendingSoftDeletes.getLiveDocs().get(0)); + assertFalse(pendingSoftDeletes.getLiveDocs().get(1)); + assertTrue(pendingSoftDeletes.getLiveDocs().get(2)); + assertNull(pendingSoftDeletes.getHardLiveDocs()); + assertTrue(pendingSoftDeletes.delete(1)); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(-1, pendingSoftDeletes.pendingDeleteCount); // transferred the delete + assertEquals(1, pendingSoftDeletes.getDelCount()); + IOUtils.close(reader, writer, dir); + } + public void testDeleteSoft() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig() @@ -70,7 +109,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = 
segmentReader.getSegmentInfo(); PendingSoftDeletes pendingSoftDeletes = newPendingDeletes(segmentInfo); pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); - assertEquals(1, pendingSoftDeletes.numPendingDeletes()); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(1, pendingSoftDeletes.getDelCount()); assertTrue(pendingSoftDeletes.getLiveDocs().get(0)); assertFalse(pendingSoftDeletes.getLiveDocs().get(1)); assertTrue(pendingSoftDeletes.getLiveDocs().get(2)); @@ -78,7 +118,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { // pass reader again Bits liveDocs = pendingSoftDeletes.getLiveDocs(); pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); - assertEquals(1, pendingSoftDeletes.numPendingDeletes()); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(1, pendingSoftDeletes.getDelCount()); assertSame(liveDocs, pendingSoftDeletes.getLiveDocs()); // now apply a hard delete @@ -91,7 +132,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { segmentInfo = segmentReader.getSegmentInfo(); pendingSoftDeletes = newPendingDeletes(segmentInfo); pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); - assertEquals(1, pendingSoftDeletes.numPendingDeletes()); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(2, pendingSoftDeletes.getDelCount()); assertFalse(pendingSoftDeletes.getLiveDocs().get(0)); assertFalse(pendingSoftDeletes.getLiveDocs().get(1)); assertTrue(pendingSoftDeletes.getLiveDocs().get(2)); @@ -106,7 +148,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "test", 10, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig()); for (int i = 0; i < si.maxDoc(); i++) { writer.addDocument(new Document()); @@ -126,7 +168,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } - assertEquals(4, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(4, deletes.getDelCount()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); @@ -144,7 +187,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } - assertEquals(5, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(5, deletes.getDelCount()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertFalse(deletes.getLiveDocs().get(2)); @@ -188,7 +232,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } - assertEquals(1, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(1, deletes.getDelCount()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); @@ -199,7 +244,8 @@ public class 
TestPendingSoftDeletes extends TestPendingDeletes { assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); - assertEquals(1, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(1, deletes.getDelCount()); IOUtils.close(reader, writer, dir); } @@ -257,7 +303,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); - assertEquals(1, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(1, deletes.getDelCount()); IOUtils.close(reader, writer, dir); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java index e9edf4e3ce9..de78ffc50d0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java @@ -60,7 +60,7 @@ public class TestSegmentInfos extends LuceneTestCase { Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, 0, -1, -1, -1); sis.add(commitInfo); sis.commit(dir); @@ -82,14 +82,14 @@ public class TestSegmentInfos extends LuceneTestCase { Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, 0, -1, -1, -1); sis.add(commitInfo); info = new SegmentInfo(dir, Version.LUCENE_8_0_0, Version.LUCENE_8_0_0, "_1", 1, false, Codec.getDefault(), Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); + commitInfo = new SegmentCommitInfo(info, 0, 0,-1, -1, -1); sis.add(commitInfo); sis.commit(dir); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index 1171b906b98..610523a9fd4 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -96,7 +96,7 @@ public class TestSegmentMerger extends LuceneTestCase { //Should be able to open a new SegmentReader against the new directory SegmentReader mergedReader = new SegmentReader(new SegmentCommitInfo( mergeState.segmentInfo, - 0, -1L, -1L, -1L), + 0, 0, -1L, -1L, -1L), Version.LATEST.major, newIOContext(random())); assertTrue(mergedReader != null); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java index dea7bc977be..d7a79997dc1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java @@ -104,7 +104,8 @@ public class TestSoftDeletesDirectoryReaderWrapper extends 
LuceneTestCase { } private boolean isWrapped(LeafReader reader) { - return reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterLeafReader; + return reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterLeafReader + || reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterCodecReader; } public void testMixSoftAndHardDeletes() throws IOException { diff --git a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java index 892564826f3..a586f838170 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java +++ b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java @@ -141,7 +141,7 @@ public class IndexSplitter { // Same info just changing the dir: SegmentInfo newInfo = new SegmentInfo(destFSDir, info.getVersion(), info.getMinVersion(), info.name, info.maxDoc(), info.getUseCompoundFile(), info.getCodec(), info.getDiagnostics(), info.getId(), new HashMap<>(), null); - destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(), + destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(), infoPerCommit.getSoftDelCount(), infoPerCommit.getDelGen(), infoPerCommit.getFieldInfosGen(), infoPerCommit.getDocValuesGen())); // now copy files over diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java index b4799f86fdb..9c01990b195 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java @@ -125,10 +125,10 @@ public abstract class BaseLiveDocsFormatTestCase extends LuceneTestCase { final Directory dir = newDirectory(); final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "foo", maxDoc, random().nextBoolean(), codec, Collections.emptyMap(), StringHelper.randomId(), Collections.emptyMap(), null); - SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, -1, -1); + SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, 0, -1, -1); format.writeLiveDocs(bits, dir, sci, maxDoc - numLiveDocs, IOContext.DEFAULT); - sci = new SegmentCommitInfo(si, maxDoc - numLiveDocs, 1, -1, -1); + sci = new SegmentCommitInfo(si, maxDoc - numLiveDocs, 0, 1, -1, -1); final Bits bits2 = format.readLiveDocs(dir, sci, IOContext.READONCE); assertEquals(maxDoc, bits2.length()); for (int i = 0; i < maxDoc; ++i) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java index 8f986277f5d..477b0a3c548 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java @@ -116,7 +116,7 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase { Collections.emptyMap(), // attributes null /* indexSort */); info.setFiles(Collections.emptyList()); - infos.add(new SegmentCommitInfo(info, random().nextInt(1), -1, -1, -1)); + infos.add(new SegmentCommitInfo(info, random().nextInt(1), 0, -1, -1, -1)); } MergePolicy.MergeSpecification forcedDeletesMerges = mp.findForcedDeletesMerges(infos, context); if (forcedDeletesMerges != null) { From 59087d148ac186930c4a51917e361c599e7314c1 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: 
Mon, 4 Jun 2018 17:28:32 +0200 Subject: [PATCH 11/38] [TEST] Ensure MDW.assertNoUnreferencedFilesOnClose is threadsafe --- .../src/java/org/apache/lucene/store/MockDirectoryWrapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java index 60f671c25fc..019417771ed 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java @@ -804,7 +804,7 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { } // NOTE: This is off by default; see LUCENE-5574 - private boolean assertNoUnreferencedFilesOnClose; + private volatile boolean assertNoUnreferencedFilesOnClose; public void setAssertNoUnrefencedFilesOnClose(boolean v) { assertNoUnreferencedFilesOnClose = v; From 2c1ab31b4e5595595cf0f1549eb61b33c8555000 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 4 Jun 2018 21:24:20 -0400 Subject: [PATCH 12/38] LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters --- lucene/CHANGES.txt | 3 + .../ngram/EdgeNGramFilterFactory.java | 6 +- .../analysis/ngram/EdgeNGramTokenFilter.java | 112 +++++++++++---- .../analysis/ngram/NGramFilterFactory.java | 6 +- .../analysis/ngram/NGramTokenFilter.java | 132 ++++++++++++++---- .../analysis/core/TestBugInSomething.java | 2 +- .../ngram/EdgeNGramTokenFilterTest.java | 122 +++++++++++----- .../analysis/ngram/NGramTokenFilterTest.java | 118 ++++++++++++---- .../analysis/ngram/TestNGramFilters.java | 18 ++- .../classification/BM25NBClassifierTest.java | 2 +- .../CachingNaiveBayesClassifierTest.java | 2 +- .../SimpleNaiveBayesClassifierTest.java | 2 +- .../analyzing/AnalyzingInfixSuggester.java | 2 +- 13 files changed, 392 insertions(+), 135 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6644453e1a4..3466d773f16 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -202,6 +202,9 @@ New Features IndexFileDeleter already accounts for that for existing files which we can now use to also take pending deletes into account which ensures that all file generations per segment always go forward. (Simon Willnauer) + +* LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters. + (Ingomar Wesp, Shawn Heisey via Robert Muir) * LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked as such once it's introduced and can't be changed after the fact. 
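A minimal usage sketch of the new option (illustrative only, not taken from this patch; the class name and the sample input are invented): it feeds a WhitespaceTokenizer into the four-argument EdgeNGramTokenFilter constructor introduced below and prints the emitted terms.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EdgeNGramPreserveOriginalExample {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("apache lucene"));
    // minGram=1, maxGram=2, preserveOriginal=true: terms longer than maxGram are
    // additionally emitted unchanged, at the same position as their edge n-grams.
    TokenStream ts = new EdgeNGramTokenFilter(tokenizer, 1, 2, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // a, ap, apache, l, lu, lucene
    }
    ts.end();
    ts.close();
  }
}

With preserveOriginal=false (or the deprecated three-argument constructor) the full terms "apache" and "lucene" would not appear in the output, since both exceed maxGram.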
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java index 020b85bb5e9..bd7ca1f84e8 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java @@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/> + * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/> * </analyzer> * </fieldType> */ public class EdgeNGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean preserveOriginal; /** Creates a new EdgeNGramFilterFactory */ public EdgeNGramFilterFactory(Map args) { super(args); minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory { @Override public TokenFilter create(TokenStream input) { - return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize); + return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 56efd897d17..154f075e716 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -32,29 +32,46 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * supplementary characters. */ public final class EdgeNGramTokenFilter extends TokenFilter { + /** + * @deprecated since 7.4 - this value will be required. + */ + @Deprecated public static final int DEFAULT_MAX_GRAM_SIZE = 1; + /** + * @deprecated since 7.4 - this value will be required. + */ + @Deprecated public static final int DEFAULT_MIN_GRAM_SIZE = 1; + public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; private final int minGram; private final int maxGram; + private final boolean preserveOriginal; + private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; - private int savePosIncr; + private int curPosIncr; private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range - * + * Creates an EdgeNGramTokenFilter that, for a given input term, produces all + * edge n-grams with lengths >= minGram and <= maxGram. 
Will + * optionally preserve the original term when its length is outside of the + * defined range. + * * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * @param preserveOriginal Whether or not to keep the original term when it + * is outside the min/max size range. */ - public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { + public EdgeNGramTokenFilter( + TokenStream input, int minGram, int maxGram, boolean preserveOriginal) { super(input); if (minGram < 1) { @@ -67,6 +84,39 @@ public final class EdgeNGramTokenFilter extends TokenFilter { this.minGram = minGram; this.maxGram = maxGram; + this.preserveOriginal = preserveOriginal; + } + + /** + * Creates an EdgeNGramTokenFilter that produces edge n-grams of the given + * size. + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param gramSize the n-gram size to generate. + */ + public EdgeNGramTokenFilter(TokenStream input, int gramSize) { + this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL); + } + + /** + * Creates an EdgeNGramTokenFilter that, for a given input term, produces all + * edge n-grams with lengths >= minGram and <= maxGram. + * + *
+   * <p>
    + * Behaves the same as + * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean) + * NGramTokenFilter(input, minGram, maxGram, false)} + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * + * @deprecated since 7.4. Use + * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead. + */ + @Deprecated + public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { + this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL); } @Override @@ -75,32 +125,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - state = captureState(); - savePosIncr += posIncrAtt.getPositionIncrement(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength); + curPosIncr += posIncrAtt.getPositionIncrement(); + + if (preserveOriginal && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit - if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams - // grab gramSize chars from front or back + + if (curGramSize <= curTermCodePointCount) { + if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram restoreState(state); // first ngram gets increment, others don't - if (curGramSize == minGram) { - posIncrAtt.setPositionIncrement(savePosIncr); - savePosIncr = 0; - } else { - posIncrAtt.setPositionIncrement(0); - } + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize); termAtt.copyBuffer(curTermBuffer, 0, charLength); curGramSize++; return true; } + else if (preserveOriginal) { + // Token is longer than maxGram, but we'd still like to keep it. + restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } } + // Done with this input token, get next token on the next iteration. 
curTermBuffer = null; } } @@ -109,6 +173,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter { public void reset() throws IOException { super.reset(); curTermBuffer = null; - savePosIncr = 0; + curPosIncr = 0; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java index 2064716b78b..0a7e77dcb43 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java @@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/> + * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/> * </analyzer> * </fieldType> */ public class NGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean preserveOriginal; /** Creates a new NGramFilterFactory */ public NGramFilterFactory(Map args) { super(args); minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); + preserveOriginal = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory { @Override public TokenFilter create(TokenStream input) { - return new NGramTokenFilter(input, minGramSize, maxGramSize); + return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index a2e0aa7e588..8e1a7e40936 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -21,7 +21,6 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -40,30 +39,52 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization. */ public final class NGramTokenFilter extends TokenFilter { + /** + * @deprecated since 7.4 - this value will be required. + */ + @Deprecated public static final int DEFAULT_MIN_NGRAM_SIZE = 1; - public static final int DEFAULT_MAX_NGRAM_SIZE = 2; - private final int minGram, maxGram; + /** + * @deprecated since 7.4 - this value will be required. 
+ */ + @Deprecated + public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; + + private final int minGram; + private final int maxGram; + private final boolean preserveOriginal; private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; private int curPos; - private int curPosInc; + private int curPosIncr; private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncAtt; + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** - * Creates NGramTokenFilter with given min and max n-grams. + * Creates an NGramTokenFilter that, for a given input term, produces all + * contained n-grams with lengths >= minGram and <= maxGram. Will + * optionally preserve the original term when its length is outside of the + * defined range. + * + * Note: Care must be taken when choosing minGram and maxGram; depending + * on the input token size, this filter potentially produces a huge number + * of terms. + * * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * @param preserveOriginal Whether or not to keep the original term when it + * is shorter than minGram or longer than maxGram */ - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { - super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE)); + public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) { + super(input); if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -72,51 +93,107 @@ public final class NGramTokenFilter extends TokenFilter { } this.minGram = minGram; this.maxGram = maxGram; + this.preserveOriginal = preserveOriginal; + } + + /** + * Creates an NGramTokenFilter that produces n-grams of the indicated size. + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param gramSize the size of n-grams to generate. + */ + public NGramTokenFilter(TokenStream input, int gramSize) { + this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL); + } - posIncAtt = addAttribute(PositionIncrementAttribute.class); + /** + * Creates an NGramTokenFilter that, for a given input term, produces all + * contained n-grams with lengths >= minGram and <= maxGram. + * + *
+   * <p>
    + * Behaves the same as + * {@link #NGramTokenFilter(TokenStream, int, int, boolean) + * NGramTokenFilter(input, minGram, maxGram, false)} + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * + * @deprecated since 7.4. Use + * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead. + */ + @Deprecated + public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { + this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL); } /** * Creates NGramTokenFilter with default min and max n-grams. + * + *
+   * <p>
    + * Behaves the same as + * {@link #NGramTokenFilter(TokenStream, int, int, boolean) + * NGramTokenFilter(input, 1, 2, false)} + * * @param input {@link TokenStream} holding the input to be tokenized + * @deprecated since 7.4. Use + * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead. */ + @Deprecated public NGramTokenFilter(TokenStream input) { - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL); } - /** Returns the next token in the stream, or null at EOS. */ @Override public final boolean incrementToken() throws IOException { while (true) { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - curPos = 0; - curPosInc = posIncAtt.getPositionIncrement(); - state = captureState(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); + curPosIncr += posIncrAtt.getPositionIncrement(); + curPos = 0; + + if (preserveOriginal && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) { + if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) { ++curPos; curGramSize = minGram; } - if ((curPos + curGramSize) <= curCodePointCount) { + if ((curPos + curGramSize) <= curTermCodePointCount) { restoreState(state); final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); termAtt.copyBuffer(curTermBuffer, start, end - start); - posIncAtt.setPositionIncrement(curPosInc); - curPosInc = 0; + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; curGramSize++; return true; } - curTermBuffer = null; + else if (preserveOriginal && curTermCodePointCount > maxGram) { + // Token is longer than maxGram, but we'd still like to keep it. + restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } + + // Done with this input token, get next token on next iteration. 
+ curTermBuffer = null; } } @@ -124,5 +201,6 @@ public final class NGramTokenFilter extends TokenFilter { public void reset() throws IOException { super.reset(); curTermBuffer = null; + curPosIncr = 0; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java index 1d17237fe3f..6cdff4bd5c9 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java @@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase { //TokenStream stream = new SopTokenFilter(tokenizer); TokenStream stream = new ShingleFilter(tokenizer, 5); //stream = new SopTokenFilter(stream); - stream = new NGramTokenFilter(stream, 55, 83); + stream = new NGramTokenFilter(stream, 55, 83, false); //stream = new SopTokenFilter(stream); return new TokenStreamComponents(tokenizer, stream); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java index d7536e7050f..fd1949a0359 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java @@ -50,49 +50,73 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, 0, 0); + new EdgeNGramTokenFilter(input, 0, 0, false); }); } public void testInvalidInput2() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, 2, 1); + new EdgeNGramTokenFilter(input, 2, 1, false); }); } public void testInvalidInput3() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, -1, 2); + new EdgeNGramTokenFilter(input, -1, 2, false); }); } public void testFrontUnigram() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false); assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5}); } public void testOversizedNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]); } + public void testOversizedNgramsPreserveOriginal() throws Exception { + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + + public void testPreserveOriginal() throws Exception { + final String inputString = "a bcd efghi jk"; + + { // preserveOriginal = false + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "ef", "efg", "jk" }, + new int[] { 2, 2, 6, 6, 12 }, + new int[] { 5, 5, 11, 11, 14 }, + new int[] { 2, 0, 1, 0, 1 }); + } + + { // preserveOriginal = true + TokenStream ts = 
whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" }, + new int[] { 0, 2, 2, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 11, 11, 11, 14 }, + new int[] { 1, 1, 0, 1, 0, 0, 1 }); + } + } + public void testFrontRangeOfNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); } public void testFilterPositions() throws Exception { TokenStream ts = whitespaceMockTokenizer("abcde vwxyz"); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false); assertTokenStreamContents(tokenizer, - new String[]{"a","ab","abc","v","vw","vwx"}, - new int[]{0,0,0,6,6,6}, - new int[]{5,5,5,11,11,11}, - null, - new int[]{1,0,0,1,0,0}, - null, - null, - false); + new String[] {"a","ab","abc","v","vw","vwx"}, + new int[] {0, 0, 0, 6, 6, 6}, + new int[] {5, 5, 5, 11, 11, 11}); } private static class PositionFilter extends TokenFilter { @@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testFirstTokenPositionIncrement() throws Exception { TokenStream ts = whitespaceMockTokenizer("a abc"); ts = new PositionFilter(ts); // All but first token will get 0 position increment - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false); // The first token "a" will not be output, since it's smaller than the mingram size of 2. // The second token on input to EdgeNGramTokenFilter will have position increment of 0, // which should be increased to 1, since this is the first output token in the stream. 
@@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testSmallTokenInStream() throws Exception { input = whitespaceMockTokenizer("abc de fgh"); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false); assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}); } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("abcde")); - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); @@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(tokenizer, min, max)); + new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal)); } }; checkRandomData(random(), a, 100*RANDOM_MULTIPLIER); @@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(tokenizer, 2, 15)); + new EdgeNGramTokenFilter(tokenizer, 2, 15, false)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); @@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { TokenStream tk = new LetterTokenizer(); ((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q")); tk = new ShingleFilter(tk); - tk = new EdgeNGramTokenFilter(tk, 7, 10); + tk = new EdgeNGramTokenFilter(tk, 7, 10, false); assertTokenStreamContents(tk, new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6,11,11,14 }, @@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new EdgeNGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) { - assertTrue(tk.incrementToken()); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(s.length(), offsetAtt.endOffset()); - final int end = Character.offsetByCodePoints(s, 
0, i); - assertEquals(s.substring(0, end), termAtt.toString()); + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + ((Tokenizer)tk).setReader(new StringReader(s)); + tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int end = Character.offsetByCodePoints(s, 0, j); + assertEquals(s.substring(0, end), termAtt.toString()); + } + + if (codePointCount > maxGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java index d8591a9726e..2a473961c06 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java @@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new NGramTokenFilter(input, 2, 1); + new NGramTokenFilter(input, 2, 1, false); }); } public void testInvalidInput2() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new NGramTokenFilter(input, 0, 1); + new NGramTokenFilter(input, 0, 1, false); }); } public void testUnigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } public void testBigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2); + NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false); assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0}); } public void testNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, @@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends 
BaseTokenStreamTestCase { } public void testNgramsNoIncrement() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, @@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testOversizedNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7); + NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false); assertTokenStreamContents(filter, new String[0], new int[0], new int[0]); } + public void testOversizedNgramsPreserveOriginal() throws Exception { + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + public void testSmallTokenInStream() throws Exception { input = whitespaceMockTokenizer("abc de fgh"); - NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3); - assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false); + assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + } + + public void testSmallTokenInStreamPreserveOriginal() throws Exception { + input = whitespaceMockTokenizer("abc de fgh"); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true); + assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1}); + } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("abcde")); - NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1); + NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } + public void testKeepShortTermKeepLongTerm() throws Exception { + final String inputString = "a bcd efghi jk"; + + { // preserveOriginal = false + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" }, + new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 }); + } + + { // preserveOriginal = true + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" }, + new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }); + } + } + // LUCENE-3642 // EdgeNgram blindly adds term length to offset, but this can take things out of bounds // wrt original text if a previous 
filter increases the length of the word (in this case æ -> ae) @@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); TokenFilter filters = new ASCIIFoldingFilter(tokenizer); - filters = new NGramTokenFilter(filters, 2, 2); + filters = new NGramTokenFilter(filters, 2, 2, false); return new TokenStreamComponents(tokenizer, filters); } }; @@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, min, max)); + new NGramTokenFilter(tokenizer, min, max, preserveOriginal)); } }; checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20); @@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, 2, 15)); + new NGramTokenFilter(tokenizer, 2, 15, false)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); @@ -167,27 +205,47 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new NGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int start = 0; start < codePointCount; ++start) { - for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + ((Tokenizer)tk).setReader(new StringReader(s)); + tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && preserveOriginal) { assertTrue(tk.incrementToken()); assertEquals(0, offsetAtt.startOffset()); assertEquals(s.length(), offsetAtt.endOffset()); - final int startIndex = Character.offsetByCodePoints(s, 0, start); - final int endIndex = Character.offsetByCodePoints(s, 0, end); - assertEquals(s.substring(startIndex, endIndex), 
termAtt.toString()); + assertEquals(s, termAtt.toString()); } + + for (int start = 0; start < codePointCount; ++start) { + for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int startIndex = Character.offsetByCodePoints(s, 0, start); + final int endIndex = Character.offsetByCodePoints(s, 0, end); + assertEquals(s.substring(startIndex, endIndex), termAtt.toString()); + } + } + + if (codePointCount > maxGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } - } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index 5de532f4c09..aa98f403644 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { } /** - * Test the NGramFilterFactory + * Test the NGramFilterFactory with old defaults */ public void testNGramFilter() throws Exception { Reader reader = new StringReader("test"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("NGram").create(stream); + stream = tokenFilterFactory("NGram", + "minGramSize", "1", + "maxGramSize", "2").create(stream); assertTokenStreamContents(stream, new String[] { "t", "te", "e", "es", "s", "st", "t" }); } @@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { } /** - * Test EdgeNGramFilterFactory + * Test EdgeNGramFilterFactory with old defaults */ public void testEdgeNGramFilter() throws Exception { Reader reader = new StringReader("test"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("EdgeNGram").create(stream); + stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", + "maxGramSize", "1").create(stream); assertTokenStreamContents(stream, new String[] { "t" }); } @@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { - IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + IllegalArgumentException expected = null; + expected = expectThrows(IllegalArgumentException.class, () -> { tokenizerFactory("NGram", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); @@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { assertTrue(expected.getMessage().contains("Unknown parameters")); expected = expectThrows(IllegalArgumentException.class, () -> { - tokenFilterFactory("NGram", "bogusArg", "bogusValue"); + tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); expected = expectThrows(IllegalArgumentException.class, () -> { - tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue"); + tokenFilterFactory("EdgeNGram", 
"minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); } diff --git a/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java index 237c53fd643..050073c32fe 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java @@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase { @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20))); + return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false))); } } diff --git a/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java index 00fa4fe3505..8669df4e05a 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java @@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase - * Behaves the same as - * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean) - * NGramTokenFilter(input, minGram, maxGram, false)} - * - * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the minimum length of the generated n-grams - * @param maxGram the maximum length of the generated n-grams - * - * @deprecated since 7.4. Use - * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead. 
- */ - @Deprecated - public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { - this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL); - } - @Override public final boolean incrementToken() throws IOException { while (true) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java index 0a7e77dcb43..9a681dfaf3e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java @@ -41,8 +41,8 @@ public class NGramFilterFactory extends TokenFilterFactory { /** Creates a new NGramFilterFactory */ public NGramFilterFactory(Map args) { super(args); - minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); - maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); + minGramSize = requireInt(args, "minGramSize"); + maxGramSize = requireInt(args, "maxGramSize"); preserveOriginal = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index 8e1a7e40936..5b6147b8ea0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -39,17 +39,6 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization. */ public final class NGramTokenFilter extends TokenFilter { - /** - * @deprecated since 7.4 - this value will be required. - */ - @Deprecated - public static final int DEFAULT_MIN_NGRAM_SIZE = 1; - - /** - * @deprecated since 7.4 - this value will be required. - */ - @Deprecated - public static final int DEFAULT_MAX_NGRAM_SIZE = 2; public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; private final int minGram; @@ -106,44 +95,6 @@ public final class NGramTokenFilter extends TokenFilter { this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL); } - /** - * Creates an NGramTokenFilter that, for a given input term, produces all - * contained n-grams with lengths >= minGram and <= maxGram. - * - *
-   * <p>
    - * Behaves the same as - * {@link #NGramTokenFilter(TokenStream, int, int, boolean) - * NGramTokenFilter(input, minGram, maxGram, false)} - * - * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the minimum length of the generated n-grams - * @param maxGram the maximum length of the generated n-grams - * - * @deprecated since 7.4. Use - * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead. - */ - @Deprecated - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { - this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL); - } - - /** - * Creates NGramTokenFilter with default min and max n-grams. - * - *
-   * <p>
-   * Behaves the same as
-   * {@link #NGramTokenFilter(TokenStream, int, int, boolean)
-   *        NGramTokenFilter(input, 1, 2, false)}
-   *
-   * @param input {@link TokenStream} holding the input to be tokenized
-   * @deprecated since 7.4. Use
-   * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
-   */
-  @Deprecated
-  public NGramTokenFilter(TokenStream input) {
-    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL);
-  }
-
   @Override
   public final boolean incrementToken() throws IOException {
     while (true) {

From f27d8a2dbfee4ba75b7bada786328a4077865d5b Mon Sep 17 00:00:00 2001
From: Noble Paul
Date: Tue, 5 Jun 2018 12:53:22 +1000
Subject: [PATCH 15/38] SOLR-12387: added documentation

---
 solr/solr-ref-guide/src/collections-api.adoc | 42 ++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/solr/solr-ref-guide/src/collections-api.adoc b/solr/solr-ref-guide/src/collections-api.adoc
index 1e895b287fb..53b6395d3fe 100644
--- a/solr/solr-ref-guide/src/collections-api.adoc
+++ b/solr/solr-ref-guide/src/collections-api.adoc
@@ -1085,6 +1085,48 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERPROP&name=urlScheme&v
 ----
+=== Deeply Nested Cluster Properties ===
+
+==== `collectionDefaults` ====
+It is possible to set cluster-wide default values for certain attributes of a collection.
+
+
+*Example 1: Set/update default values*
+[source]
+----
+curl -X POST -H 'Content-type:application/json' --data-binary '
+{ "set-obj-property" : {
+    "collectionDefaults" : {
+      "numShards" : 2,
+      "nrtReplicas" : 1,
+      "tlogReplicas" : 1,
+      "pullReplicas" : 1
+
+    }
+}}' http://localhost:8983/api/cluster
+----
+
+*Example 2: Unset the value of `nrtReplicas` alone*
+[source]
+----
+curl -X POST -H 'Content-type:application/json' --data-binary '
+{ "set-obj-property" : {
+    "collectionDefaults" : {
+      "nrtReplicas" : null
+    }
+}}' http://localhost:8983/api/cluster
+----
+
+*Example 3: Unset all values in `collectionDefaults`*
+[source]
+----
+curl -X POST -H 'Content-type:application/json' --data-binary '
+{ "set-obj-property" : {
+    "collectionDefaults" : null
+}}' http://localhost:8983/api/cluster
+----
+
+
 [[collectionprop]]
 == COLLECTIONPROP: Collection Properties

From f9f5e837450e082ae7e1a82a0693760af7485a1b Mon Sep 17 00:00:00 2001
From: David Smiley
Date: Mon, 4 Jun 2018 23:07:31 -0400
Subject: [PATCH 16/38] LUCENE-8332: New ConcatenateGraphFilter (from
 CompletionTokenStream).

* Added a test for FingerprintFilter and clarified FF's end condition.
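As an illustrative sketch only (not part of this commit): a minimal consumer of the new filter, based on the ConcatenateGraphFilter sources and tests added below; the whitespace-tokenizer setup and the printed output are assumptions.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConcatenateGraphFilterSketch {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar baz"));
    // Wrap the tokenizer; with the defaults (preserveSep=true) the three terms
    // come back as a single token joined by the SEP_LABEL separator byte.
    TokenStream ts = new ConcatenateGraphFilter(tokenizer);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString()); // one concatenated token for this simple chain
    }
    ts.end();
    ts.close();
  }
}

Because the filter consumes its whole input on the first incrementToken() call, a graph input (e.g. with synonyms) yields one concatenated token per path, each one after the first with a position increment of 0.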
--- lucene/CHANGES.txt | 5 + .../miscellaneous/ConcatenateGraphFilter.java | 375 ++++++++++++++++++ .../ConcatenateGraphFilterFactory.java | 70 ++++ .../miscellaneous/FingerprintFilter.java | 4 +- ...he.lucene.analysis.util.TokenFilterFactory | 1 + .../analysis/core/TestRandomChains.java | 7 +- .../TestConcatenateGraphFilter.java} | 135 +++---- .../TestConcatenateGraphFilterFactory.java | 83 ++++ .../miscellaneous/TestFingerprintFilter.java | 9 + .../suggest/document/CompletionAnalyzer.java | 21 +- .../suggest/document/CompletionQuery.java | 2 +- .../document/CompletionTokenStream.java | 297 +------------- .../search/suggest/document/ContextQuery.java | 5 +- .../suggest/document/ContextSuggestField.java | 1 + .../document/FuzzyCompletionQuery.java | 7 +- .../suggest/document/NRTSuggesterBuilder.java | 3 +- .../document/PrefixCompletionQuery.java | 5 +- .../search/suggest/document/SuggestField.java | 3 +- .../document/TestContextSuggestField.java | 13 +- .../suggest/document/TestSuggestField.java | 29 +- 20 files changed, 688 insertions(+), 387 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java rename lucene/{suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java => analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java} (52%) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 55e03671f56..9eecc4206b0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -210,6 +210,11 @@ New Features as such once it's introduced and can't be changed after the fact. (Nhat Nguyen via Simon Willnauer) +* LUCENE-8332: New ConcatenateGraphFilter for concatenating all tokens into one (or more + in the event of a graph input). This is useful for fast analyzed exact-match lookup, + suggesters, and as a component of a named entity recognition system. This was excised + out of CompletionTokenStream in the NRT doc suggester. (David Smiley, Jim Ferenczi) + Bug Fixes * LUCENE-8221: MoreLikeThis.setMaxDocFreqPct can easily int-overflow on larger diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java new file mode 100644 index 00000000000..b6c4f223eb4 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java @@ -0,0 +1,375 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.fst.Util; + +/** + * Concatenates/Joins every incoming token with a separator into one output token for every path through the + * token stream (which is a graph). In simple cases this yields one token, but in the presence of any tokens with + * a zero positionIncrmeent (e.g. synonyms) it will be more. This filter uses the token bytes, position increment, + * and position length of the incoming stream. Other attributes are not used or manipulated. + * + * @lucene.experimental + */ +public final class ConcatenateGraphFilter extends TokenStream { + + /* + * Token stream which converts a provided token stream to an automaton. + * The accepted strings enumeration from the automaton are available through the + * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute + * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store + * a completion's payload (see {@link ConcatenateGraphFilter#setPayload(org.apache.lucene.util.BytesRef)}) + */ + + /** + * Represents the separation between tokens, if + * preserveSep is true. + */ + public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP; + public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES; + public final static boolean DEFAULT_PRESERVE_SEP = true; + public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true; + + private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + private final TokenStream inputTokenStream; + private final boolean preserveSep; + private final boolean preservePositionIncrements; + private final int maxGraphExpansions; + + private LimitedFiniteStringsIterator finiteStrings; + private CharTermAttribute charTermAttribute; + private boolean wasReset = false; + private int endOffset; + + /** + * Creates a token stream to convert input to a token stream + * of accepted strings by its token stream graph. + *
+   * <p>
    + * This constructor uses the default settings of the constants in this class. + */ + public ConcatenateGraphFilter(TokenStream inputTokenStream) { + this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); + } + + /** + * Creates a token stream to convert input to a token stream + * of accepted strings by its token stream graph. + * + * @param inputTokenStream The input/incoming TokenStream + * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token + * @param preservePositionIncrements Whether to add an empty token for missing positions. + * The effect is a consecutive {@link #SEP_LABEL}. + * When false, it's as if there were no missing positions + * (we pretend the surrounding tokens were adjacent). + * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths through, then we'll throw + * {@link TooComplexToDeterminizeException} to preserve the stability and memory of the + * machine. + * @throws TooComplexToDeterminizeException if the tokenStream graph has more than {@code maxGraphExpansions} + * expansions + * + */ + public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { + // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume + // the input stream entirely in the first call to incrementToken + this.inputTokenStream = inputTokenStream; + this.preserveSep = preserveSep; + this.preservePositionIncrements = preservePositionIncrements; + this.maxGraphExpansions = maxGraphExpansions; + } + + @Override + public void reset() throws IOException { + super.reset(); + // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion + charTermAttribute = getAttribute(CharTermAttribute.class); // may return null + wasReset = true; + } + + @Override + public boolean incrementToken() throws IOException { + if (finiteStrings == null) { + if (wasReset == false) { + throw new IllegalStateException("reset() missing before incrementToken"); + } + // lazy init/consume + Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream + finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); + //note: would be nice to know the startOffset but toAutomaton doesn't capture it. We'll assume 0 + endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset(); + } + + IntsRef string = finiteStrings.next(); + if (string == null) { + return false; + } + + clearAttributes(); + + if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one... + posIncrAtt.setPositionIncrement(0); // stacked + } + + offsetAtt.setOffset(0, endOffset); + + Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8 + if (charTermAttribute != null) { + charTermAttribute.setLength(0); + charTermAttribute.append(bytesAtt.toUTF16()); + } + + return true; + } + + @Override + public void end() throws IOException { + super.end(); + if (finiteStrings == null) { // thus inputTokenStream hasn't yet received end() + inputTokenStream.end(); // the input TS may really want to see "end()" called even if incrementToken hasn't. 
+ } // else we already eagerly consumed inputTokenStream including end() + if (endOffset != -1) { + offsetAtt.setOffset(0, endOffset); + } + } + + @Override + public void close() throws IOException { + super.close(); + //delegate lifecycle. Note toAutomaton does not close the stream + inputTokenStream.close(); + finiteStrings = null; + wasReset = false;//reset + endOffset = -1;//reset + } + + /** + * Converts the tokenStream to an automaton, treating the transition labels as utf-8. Does *not* close it. + */ + public Automaton toAutomaton() throws IOException { + return toAutomaton(false); + } + + /** + * Converts the tokenStream to an automaton. Does *not* close it. + */ + public Automaton toAutomaton(boolean unicodeAware) throws IOException { + // TODO refactor this + // maybe we could hook up a modified automaton from TermAutomatonQuery here? + + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + final TokenStreamToAutomaton tsta; + if (preserveSep) { + tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL); + } else { + // When we're not preserving sep, we don't steal 0xff + // byte, so we don't need to do any escaping: + tsta = new TokenStreamToAutomaton(); + } + tsta.setPreservePositionIncrements(preservePositionIncrements); + tsta.setUnicodeArcs(unicodeAware); + + Automaton automaton = tsta.toAutomaton(inputTokenStream); + + // TODO: we can optimize this somewhat by determinizing + // while we convert + automaton = replaceSep(automaton, preserveSep, SEP_LABEL); + // This automaton should not blow up during determinize: + return Operations.determinize(automaton, maxGraphExpansions); + } + + /** + * Just escapes the {@link #SEP_LABEL} byte with an extra. + */ + private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { + + final BytesRefBuilder spare = new BytesRefBuilder(); + final byte sepLabel; + + public EscapingTokenStreamToAutomaton(int sepLabel) { + assert sepLabel <= Byte.MAX_VALUE; + this.sepLabel = (byte) sepLabel; + } + + @Override + protected BytesRef changeToken(BytesRef in) { + int upto = 0; + for (int i = 0; i < in.length; i++) { + byte b = in.bytes[in.offset + i]; + if (b == sepLabel) { + spare.grow(upto + 2); + spare.setByteAt(upto++, sepLabel); + spare.setByteAt(upto++, b); + } else { + spare.grow(upto + 1); + spare.setByteAt(upto++, b); + } + } + spare.setLength(upto); + return spare.get(); + } + } + + // Replaces SEP with epsilon or remaps them if + // we were asked to preserve them: + private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) { + + Automaton result = new Automaton(); + + // Copy all states over + int numStates = a.getNumStates(); + for (int s = 0; s < numStates; s++) { + result.createState(); + result.setAccept(s, a.isAccept(s)); + } + + // Go in reverse topo sort so we know we only have to + // make one pass: + Transition t = new Transition(); + int[] topoSortStates = Operations.topoSortStates(a); + for (int i = 0; i < topoSortStates.length; i++) { + int state = topoSortStates[topoSortStates.length - 1 - i]; + int count = a.initTransition(state, t); + for (int j = 0; j < count; j++) { + a.getNextTransition(t); + if (t.min == TokenStreamToAutomaton.POS_SEP) { + assert t.max == TokenStreamToAutomaton.POS_SEP; + if (preserveSep) { + // Remap to SEP_LABEL: + result.addTransition(state, t.dest, sepLabel); + } else { + result.addEpsilon(state, t.dest); + } + } else if (t.min == TokenStreamToAutomaton.HOLE) { + 
assert t.max == TokenStreamToAutomaton.HOLE; + + // Just remove the hole: there will then be two + // SEP tokens next to each other, which will only + // match another hole at search time. Note that + // it will also match an empty-string token ... if + // that's somehow a problem we can always map HOLE + // to a dedicated byte (and escape it in the + // input). + result.addEpsilon(state, t.dest); + } else { + result.addTransition(state, t.dest, t.min, t.max); + } + } + } + + result.finishState(); + + return result; + } + + /** + * Attribute providing access to the term builder and UTF-16 conversion + * @lucene.internal + */ + public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute { + /** + * Returns the builder from which the term is derived. + */ + BytesRefBuilder builder(); + + /** + * Returns the term represented as UTF-16 + */ + CharSequence toUTF16(); + } + + /** + * Implementation of {@link BytesRefBuilderTermAttribute} + * @lucene.internal + */ + public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute { + private final BytesRefBuilder bytes = new BytesRefBuilder(); + private transient CharsRefBuilder charsRef; + + /** + * Sole constructor + * no-op + */ + public BytesRefBuilderTermAttributeImpl() { + } + + @Override + public BytesRefBuilder builder() { + return bytes; + } + + @Override + public BytesRef getBytesRef() { + return bytes.get(); + } + + @Override + public void clear() { + bytes.clear(); + } + + @Override + public void copyTo(AttributeImpl target) { + BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target; + other.bytes.copyBytes(bytes); + } + + @Override + public AttributeImpl clone() { + BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl(); + copyTo(other); + return other; + } + + @Override + public void reflectWith(AttributeReflector reflector) { + reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef()); + } + + @Override + public CharSequence toUTF16() { + if (charsRef == null) { + charsRef = new CharsRefBuilder(); + } + charsRef.copyUTF8Bytes(getBytesRef()); + return charsRef.get(); + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java new file mode 100644 index 00000000000..5d8ccbacf3d --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.miscellaneous; + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; + +/** + * Factory for {@link ConcatenateGraphFilter}. + * + *

+ * <ul>
+ *   <li>preserveSep:
+ *       Whether {@link ConcatenateGraphFilter#SEP_LABEL}
+ *       should separate the input tokens in the concatenated token
+ *   </li>
+ *   <li>preservePositionIncrements:
+ *       Whether to add an empty token for missing positions.
+ *       The effect is a consecutive {@link ConcatenateGraphFilter#SEP_LABEL}.
+ *       When false, it's as if there were no missing positions
+ *       (we pretend the surrounding tokens were adjacent).
+ *   </li>
+ *   <li>maxGraphExpansions:
+ *       If the tokenStream graph has more than this many possible paths through, then we'll throw
+ *       {@link TooComplexToDeterminizeException} to preserve the stability and memory of the
+ *       machine.
+ *   </li>
+ * </ul>
+ *
    + * @see ConcatenateGraphFilter + * @since 7.4.0 + */ +public class ConcatenateGraphFilterFactory extends TokenFilterFactory { + + private boolean preserveSep; + private boolean preservePositionIncrements; + private int maxGraphExpansions; + + public ConcatenateGraphFilterFactory(Map args) { + super(args); + + preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP); + preservePositionIncrements = getBoolean(args, "preservePositionIncrements", ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS); + maxGraphExpansions = getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); + + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new ConcatenateGraphFilter(input, preserveSep, preservePositionIncrements, maxGraphExpansions); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java index dfe06c88fbf..71dab429191 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java @@ -81,8 +81,7 @@ public class FingerprintFilter extends TokenFilter { @Override public final boolean incrementToken() throws IOException { - if (uniqueTerms != null) { - // We have already built the single output token - there's no more + if (inputEnded) { return false; } boolean result = buildSingleOutputToken(); @@ -177,6 +176,7 @@ public class FingerprintFilter extends TokenFilter { } }); + //TODO lets append directly to termAttribute? 
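Since the factory exposes the same three options by name, the chain can also be assembled through the analysis SPI. The sketch below is a hypothetical wiring example, not part of this patch; it assumes the lookup name "ConcatenateGraph" exercised by the factory tests and the standard "whitespace" tokenizer factory.

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    public class ConcatenateGraphAnalyzerSketch {
      // builds an analyzer whose token streams emit the concatenated form shown in the tests below
      public static Analyzer build() throws IOException {
        return CustomAnalyzer.builder()
            .withTokenizer("whitespace")
            .addTokenFilter("ConcatenateGraph",
                "preserveSep", "true",
                "preservePositionIncrements", "true",
                "maxGraphExpansions", "10000")
            .build();
      }
    }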
StringBuilder sb = new StringBuilder(); for (Object item : items) { if (sb.length() >= 1) { diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 18119202179..df868a0a1e6 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -64,6 +64,7 @@ org.apache.lucene.analysis.minhash.MinHashFilterFactory org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory +org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 8cb159129ef..d94b39607bb 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -72,6 +72,7 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.hunspell.Dictionary; import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter; import org.apache.lucene.analysis.minhash.MinHashFilter; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; @@ -119,10 +120,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { private static final Set> avoidConditionals = new HashSet<>(); static { - // Fingerprint filter needs to consume the whole tokenstream, so conditionals don't make sense here + // These filters needs to consume the whole tokenstream, so conditionals don't make sense here avoidConditionals.add(FingerprintFilter.class); - // Ditto MinHashFilter avoidConditionals.add(MinHashFilter.class); + avoidConditionals.add(ConcatenateGraphFilter.class); } private static final Map,Predicate> brokenConstructors = new HashMap<>(); @@ -156,7 +157,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return !((Boolean) args[2]); // args are broken if consumeAllTokens is false }); for (Class c : Arrays.>asList( - // doesn't actual reset itself! + // doesn't actual reset itself! TODO this statement is probably obsolete as of LUCENE-6121 ? 
CachingTokenFilter.class, // LUCENE-8092: doesn't handle graph inputs CJKBigramFilter.class, diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java similarity index 52% rename from lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java index 6f558d1985d..453dcbf9dab 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java @@ -14,50 +14,42 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.search.suggest.document; +package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; import org.junit.Test; -public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { +public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase { + private static final char SEP_LABEL = (char) ConcatenateGraphFilter.SEP_LABEL; + @Test public void testBasic() throws Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword"; - BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - assertTokenStreamContents(stream, new String[] {input}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream); + assertTokenStreamContents(stream, new String[] {input}, null, null, new int[] { 1 }); } @Test public void testWithNoPreserveSep() throws Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword another keyword"; - BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream, false, false, 100); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); 
+ ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, false, false, 100); + assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 }); } @Test @@ -65,17 +57,14 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword another keyword"; tokenStream.setReader(new StringReader(input)); - BytesRef payload = new BytesRef("payload"); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream); CharsRefBuilder builder = new CharsRefBuilder(); builder.append("mykeyword"); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append(SEP_LABEL); builder.append("another"); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append(SEP_LABEL); builder.append("keyword"); - assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); + assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new int[]{1}); } @Test @@ -85,11 +74,8 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader("mykeyword")); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter); - BytesRef payload = new BytesRef("payload"); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new String[] {payload.utf8ToString(), payload.utf8ToString()}, new int[] { 1, 1 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter); + assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new int[] { 1, 0 }); } @Test @@ -100,26 +86,48 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { String input = "mykeyword another keyword"; tokenStream.setReader(new StringReader(input)); SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true); - BytesRef payload = new BytesRef("payload"); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter, true, false, 100); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, true, false, 100); String[] expectedOutputs = new String[2]; CharsRefBuilder expectedOutput = new CharsRefBuilder(); expectedOutput.append("mykeyword"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); expectedOutput.append("another"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); expectedOutput.append("keyword"); expectedOutputs[0] = expectedOutput.toCharsRef().toString(); expectedOutput.clear(); expectedOutput.append("mysynonym"); - expectedOutput.append(((char) 
CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); expectedOutput.append("another"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); expectedOutput.append("keyword"); expectedOutputs[1] = expectedOutput.toCharsRef().toString(); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + assertTokenStreamContents(stream, expectedOutputs, null, null, new int[]{1, 0}); + } + + @Test + public void testWithStopword() throws Exception { + for (boolean preservePosInc : new boolean[]{true, false}) { + Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); + String input = "a mykeyword a keyword"; //LUCENE-8344 add "a" + tokenStream.setReader(new StringReader(input)); + TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a")); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10); + CharsRefBuilder builder = new CharsRefBuilder(); + if (preservePosInc) { + builder.append(SEP_LABEL); + } + builder.append("mykeyword"); + builder.append(SEP_LABEL); + if (preservePosInc) { + builder.append(SEP_LABEL); + } + builder.append("keyword"); +// if (preservePosInc) { LUCENE-8344 uncomment +// builder.append(SEP_LABEL); +// } + assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()}); + } } @Test @@ -137,41 +145,24 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { tokenizer.setReader(new StringReader(valueBuilder.toString())); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter); - completionTokenStream.setPayload(new BytesRef()); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - stream.reset(); - CompletionTokenStream.BytesRefBuilderTermAttribute attr = stream.addAttribute(CompletionTokenStream.BytesRefBuilderTermAttribute.class); - PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class); - int maxPos = 0; - int count = 0; - while(stream.incrementToken()) { - count++; - assertNotNull(attr.getBytesRef()); - assertTrue(attr.getBytesRef().length > 0); - maxPos += posAttr.getPositionIncrement(); - } - stream.close(); - assertEquals(count, 256); - assertEquals(count, maxPos); - } - - public final static class PayloadAttrToTypeAttrFilter extends TokenFilter { - private PayloadAttribute payload = addAttribute(PayloadAttribute.class); - private TypeAttribute type = addAttribute(TypeAttribute.class); - - protected PayloadAttrToTypeAttrFilter(TokenStream input) { - super(input); - } - - @Override - public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - // we move them over so we can assert them more easily in the tests - type.setType(payload.getPayload().utf8ToString()); - return true; + int count; + try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) { + stream.reset(); + ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class); + count = 0; + while (stream.incrementToken()) { + count++; + assertNotNull(attr.getBytesRef()); + assertTrue(attr.getBytesRef().length > 0); } - return false; } + assertEquals(count, 256); } + + public void testEmpty() throws IOException { + Tokenizer tokenizer = 
whitespaceMockTokenizer(""); + ConcatenateGraphFilter filter = new ConcatenateGraphFilter(tokenizer); + assertTokenStreamContents(filter, new String[0]); + } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java new file mode 100644 index 00000000000..1e149f03b1b --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; + +public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase { + public void test() throws Exception { + for (final boolean consumeAll : new boolean[]{true, false}) { + final String input = "A1 B2 A1 D4 C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + tokenizer.setEnableChecks(consumeAll); + TokenStream stream = tokenizer; + stream = tokenFilterFactory("ConcatenateGraph").create(stream); + assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)}); + } + } + + public void testPreserveSep() throws Exception { + final String input = "A1 B2 A1 D4 C3"; + final String output = "A1A1D4C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + TokenStream stream = tokenizer; + stream = new StopFilter(stream, StopFilter.makeStopSet("B2")); + stream = tokenFilterFactory("ConcatenateGraph", + "preserveSep", "false" + ).create(stream); + assertTokenStreamContents(stream, new String[]{output}); + } + + public void testPreservePositionIncrements() throws Exception { + final String input = "A1 B2 A1 D4 C3"; + final String output = "A1 A1 D4 C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + TokenStream stream = tokenizer; + stream = new StopFilter(stream, StopFilter.makeStopSet("B2")); + stream = tokenFilterFactory("ConcatenateGraph", + "preservePositionIncrements", "false" + ).create(stream); + assertTokenStreamContents(stream, new String[]{output.replace(' ', (char) 
ConcatenateGraphFilter.SEP_LABEL)}); + } + + public void testRequired() throws Exception { + // no params are required + tokenFilterFactory("ConcatenateGraph"); + } + + /** + * Test that bogus arguments result in exception + */ + public void testBogusArguments() throws Exception { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> + tokenFilterFactory("ConcatenateGraph", "bogusArg", "bogusValue")); + assertTrue(expected.getMessage().contains("Unknown parameters")); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java index 450447ac9ba..76bd617f408 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java @@ -69,4 +69,13 @@ public class TestFingerprintFilter extends BaseTokenStreamTestCase { } } + public void testEmpty() throws Exception { + for (final boolean consumeAll : new boolean[] { true, false }) { + MockTokenizer tokenizer = whitespaceMockTokenizer(""); + tokenizer.setEnableChecks(consumeAll); + TokenStream stream = new FingerprintFilter(tokenizer); + assertTokenStreamContents(stream, new String[0]); + } + } + } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java index 13bd392aa9d..8888382a5ca 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java @@ -19,7 +19,7 @@ package org.apache.lucene.search.suggest.document; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; /** * Wraps an {@link org.apache.lucene.analysis.Analyzer} @@ -37,24 +37,11 @@ import org.apache.lucene.util.automaton.Operations; */ public final class CompletionAnalyzer extends AnalyzerWrapper { - /** - * Represents the separation between tokens, if - * preserveSep is true - *

    - * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream} - * payload - */ - final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP; - /** * Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton} */ final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE; - final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES; - final static boolean DEFAULT_PRESERVE_SEP = true; - final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true; - private final Analyzer analyzer; /** @@ -101,7 +88,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper { * preserving token separation, position increments and no limit on graph expansions */ public CompletionAnalyzer(Analyzer analyzer) { - this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } /** @@ -109,7 +96,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper { * with no limit on graph expansions */ public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) { - this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(analyzer, preserveSep, preservePositionIncrements, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } /** @@ -117,7 +104,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper { * preserving token separation and position increments */ public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) { - this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions); + this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions); } /** diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java index 49fe7d08dff..6be0c91117f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java @@ -27,7 +27,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.suggest.BitsProducer; import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.HOLE_CHARACTER; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL; +import static org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter.SEP_LABEL; /** * Abstract {@link Query} that match documents containing terms with a specified prefix diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java index 7308e65acc9..d3bec8e50c9 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java @@ -14,71 +14,43 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.lucene.search.suggest.document; import java.io.IOException; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; -import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.AttributeReflector; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.FiniteStringsIterator; -import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; -import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.Util; - -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL; /** - * Token stream which converts a provided token stream to an automaton. - * The accepted strings enumeration from the automaton are available through the - * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute - * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store - * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}) - * + * A {@link ConcatenateGraphFilter} but we can set the payload and provide access to config options. * @lucene.experimental */ -public final class CompletionTokenStream extends TokenStream { +public final class CompletionTokenStream extends TokenFilter { private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class); - private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class); + // package accessible on purpose final TokenStream inputTokenStream; final boolean preserveSep; final boolean preservePositionIncrements; final int maxGraphExpansions; - private FiniteStringsIterator finiteStrings; - private BytesRef payload; - private CharTermAttribute charTermAttribute; + private BytesRef payload; // note doesn't participate in TokenStream lifecycle; it's effectively constant - /** - * Creates a token stream to convert input to a token stream - * of accepted strings by its automaton. - *

    - * The token stream input is converted to an automaton - * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer} - */ CompletionTokenStream(TokenStream inputTokenStream) { - this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(inputTokenStream, + ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, + ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, + ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } CompletionTokenStream(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { - // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume - // the input stream entirely in the first call to incrementToken + super(new ConcatenateGraphFilter(inputTokenStream, preserveSep, preservePositionIncrements, maxGraphExpansions)); this.inputTokenStream = inputTokenStream; this.preserveSep = preserveSep; this.preservePositionIncrements = preservePositionIncrements; @@ -94,248 +66,23 @@ public final class CompletionTokenStream extends TokenStream { @Override public boolean incrementToken() throws IOException { - clearAttributes(); - if (finiteStrings == null) { - Automaton automaton = toAutomaton(); - finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); - } - - IntsRef string = finiteStrings.next(); - if (string == null) { + if (input.incrementToken()) { + payloadAttr.setPayload(payload); + return true; + } else { return false; } - - Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8 - if (charTermAttribute != null) { - charTermAttribute.setLength(0); - charTermAttribute.append(bytesAtt.toUTF16()); - } - if (payload != null) { - payloadAttr.setPayload(this.payload); - } - - return true; } - @Override - public void end() throws IOException { - super.end(); - if (finiteStrings == null) { - inputTokenStream.end(); - } - } - - @Override - public void close() throws IOException { - if (finiteStrings == null) { - inputTokenStream.close(); - } - } - - @Override - public void reset() throws IOException { - super.reset(); - if (hasAttribute(CharTermAttribute.class)) { - // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion - charTermAttribute = getAttribute(CharTermAttribute.class); - } - finiteStrings = null; - } - - /** - * Converts the token stream to an automaton, - * treating the transition labels as utf-8 - */ + /** Delegates to...At + * @see ConcatenateGraphFilter#toAutomaton() */ public Automaton toAutomaton() throws IOException { - return toAutomaton(false); + return ((ConcatenateGraphFilter)input).toAutomaton(); } - /** - * Converts the tokenStream to an automaton - */ + /** Delegates to... + * @see ConcatenateGraphFilter#toAutomaton(boolean) */ public Automaton toAutomaton(boolean unicodeAware) throws IOException { - // TODO refactor this - // maybe we could hook up a modified automaton from TermAutomatonQuery here? 
- Automaton automaton = null; - try { - // Create corresponding automaton: labels are bytes - // from each analyzed token, with byte 0 used as - // separator between tokens: - final TokenStreamToAutomaton tsta; - if (preserveSep) { - tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL); - } else { - // When we're not preserving sep, we don't steal 0xff - // byte, so we don't need to do any escaping: - tsta = new TokenStreamToAutomaton(); - } - tsta.setPreservePositionIncrements(preservePositionIncrements); - tsta.setUnicodeArcs(unicodeAware); - - automaton = tsta.toAutomaton(inputTokenStream); - } finally { - IOUtils.closeWhileHandlingException(inputTokenStream); - } - - // TODO: we can optimize this somewhat by determinizing - // while we convert - automaton = replaceSep(automaton, preserveSep, SEP_LABEL); - // This automaton should not blow up during determinize: - return Operations.determinize(automaton, maxGraphExpansions); - } - - /** - * Just escapes the 0xff byte (which we still for SEP). - */ - private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { - - final BytesRefBuilder spare = new BytesRefBuilder(); - private char sepLabel; - - public EscapingTokenStreamToAutomaton(char sepLabel) { - this.sepLabel = sepLabel; - } - - @Override - protected BytesRef changeToken(BytesRef in) { - int upto = 0; - for (int i = 0; i < in.length; i++) { - byte b = in.bytes[in.offset + i]; - if (b == (byte) sepLabel) { - spare.grow(upto + 2); - spare.setByteAt(upto++, (byte) sepLabel); - spare.setByteAt(upto++, b); - } else { - spare.grow(upto + 1); - spare.setByteAt(upto++, b); - } - } - spare.setLength(upto); - return spare.get(); - } - } - - // Replaces SEP with epsilon or remaps them if - // we were asked to preserve them: - private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) { - - Automaton result = new Automaton(); - - // Copy all states over - int numStates = a.getNumStates(); - for (int s = 0; s < numStates; s++) { - result.createState(); - result.setAccept(s, a.isAccept(s)); - } - - // Go in reverse topo sort so we know we only have to - // make one pass: - Transition t = new Transition(); - int[] topoSortStates = Operations.topoSortStates(a); - for (int i = 0; i < topoSortStates.length; i++) { - int state = topoSortStates[topoSortStates.length - 1 - i]; - int count = a.initTransition(state, t); - for (int j = 0; j < count; j++) { - a.getNextTransition(t); - if (t.min == TokenStreamToAutomaton.POS_SEP) { - assert t.max == TokenStreamToAutomaton.POS_SEP; - if (preserveSep) { - // Remap to SEP_LABEL: - result.addTransition(state, t.dest, sepLabel); - } else { - result.addEpsilon(state, t.dest); - } - } else if (t.min == TokenStreamToAutomaton.HOLE) { - assert t.max == TokenStreamToAutomaton.HOLE; - - // Just remove the hole: there will then be two - // SEP tokens next to each other, which will only - // match another hole at search time. Note that - // it will also match an empty-string token ... if - // that's somehow a problem we can always map HOLE - // to a dedicated byte (and escape it in the - // input). - result.addEpsilon(state, t.dest); - } else { - result.addTransition(state, t.dest, t.min, t.max); - } - } - } - - result.finishState(); - - return result; - } - - /** - * Attribute providing access to the term builder and UTF-16 conversion - */ - public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute { - /** - * Returns the builder from which the term is derived. 
- */ - BytesRefBuilder builder(); - - /** - * Returns the term represented as UTF-16 - */ - CharSequence toUTF16(); - } - - /** - * Custom attribute implementation for completion token stream - */ - public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute { - private final BytesRefBuilder bytes = new BytesRefBuilder(); - private transient CharsRefBuilder charsRef; - - /** - * Sole constructor - * no-op - */ - public BytesRefBuilderTermAttributeImpl() { - } - - @Override - public BytesRefBuilder builder() { - return bytes; - } - - @Override - public BytesRef getBytesRef() { - return bytes.get(); - } - - @Override - public void clear() { - bytes.clear(); - } - - @Override - public void copyTo(AttributeImpl target) { - BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target; - other.bytes.copyBytes(bytes); - } - - @Override - public AttributeImpl clone() { - BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl(); - copyTo(other); - return other; - } - - @Override - public void reflectWith(AttributeReflector reflector) { - reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef()); - } - - @Override - public CharSequence toUTF16() { - if (charsRef == null) { - charsRef = new CharsRefBuilder(); - } - charsRef.copyUTF8Bytes(getBytesRef()); - return charsRef.get(); - } + return ((ConcatenateGraphFilter)input).toAutomaton(unicodeAware); } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java index 6217ca38f85..1a2680cb553 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java @@ -22,6 +22,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Weight; @@ -178,7 +179,7 @@ public class ContextQuery extends CompletionQuery { // if separators are preserved the fst contains a SEP_LABEL // behind each gap. 
To have a matching automaton, we need to // include the SEP_LABEL in the query as well - Automaton optionalSepLabel = Operations.optional(Automata.makeChar(CompletionAnalyzer.SEP_LABEL)); + Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL)); Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton); Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton); contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES); @@ -302,7 +303,7 @@ public class ContextQuery extends CompletionQuery { } ref.offset = ++i; assert ref.offset < ref.length : "input should not end with the context separator"; - if (ref.ints[i] == CompletionAnalyzer.SEP_LABEL) { + if (ref.ints[i] == ConcatenateGraphFilter.SEP_LABEL) { ref.offset++; assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_LABEL"; } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java index 4cb91b8053c..cf462e1dbc8 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java @@ -90,6 +90,7 @@ public class ContextSuggestField extends SuggestField { } CompletionTokenStream completionTokenStream; if (stream instanceof CompletionTokenStream) { + //TODO this is awkward; is there a better way avoiding re-creating the chain? completionTokenStream = (CompletionTokenStream) stream; PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(completionTokenStream.inputTokenStream, (char) CONTEXT_SEPARATOR, contexts); completionTokenStream = new CompletionTokenStream(prefixTokenFilter, diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java index b243f4ede83..14479fecd12 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java @@ -144,9 +144,12 @@ public class FuzzyCompletionQuery extends PrefixCompletionQuery { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); + final Automaton originalAutomata; + try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()) ) { + originalAutomata = stream.toAutomaton(unicodeAware); + } Set refs = new HashSet<>(); - Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs); + Automaton automaton = toLevenshteinAutomata(originalAutomata, refs); if (unicodeAware) { Automaton utf8automaton = new UTF32ToUTF8().convert(automaton); utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java index 270463175d7..5ca4993396f 100644 --- 
a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java @@ -19,6 +19,7 @@ package org.apache.lucene.search.suggest.document; import java.io.IOException; import java.util.PriorityQueue; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -42,7 +43,7 @@ final class NRTSuggesterBuilder { * Label used to separate surface form and docID * in the output */ - public static final int PAYLOAD_SEP = '\u001F'; + public static final int PAYLOAD_SEP = ConcatenateGraphFilter.SEP_LABEL; /** * Marks end of the analyzed input and start of dedup diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java index 7bb75e9261c..a8da150f504 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java @@ -68,8 +68,9 @@ public class PrefixCompletionQuery extends CompletionQuery { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); - return new CompletionWeight(this, stream.toAutomaton()); + try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) { + return new CompletionWeight(this, stream.toAutomaton()); + } } /** diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java index 7f06328ee1b..b2d24c2c84e 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; @@ -140,7 +141,7 @@ public class SuggestField extends Field { private boolean isReserved(char c) { switch (c) { - case CompletionAnalyzer.SEP_LABEL: + case ConcatenateGraphFilter.SEP_LABEL: case CompletionAnalyzer.HOLE_CHARACTER: case NRTSuggesterBuilder.END_BYTE: return true; diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java index 0c3b254c132..8beea129622 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java @@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import 
org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; @@ -109,21 +110,21 @@ public class TestContextSuggestField extends LuceneTestCase { CharsRefBuilder builder = new CharsRefBuilder(); builder.append("context1"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append((char) ConcatenateGraphFilter.SEP_LABEL); builder.append("input"); expectedOutputs[0] = builder.toCharsRef().toString(); builder.clear(); builder.append("context2"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append((char) ConcatenateGraphFilter.SEP_LABEL); builder.append("input"); expectedOutputs[1] = builder.toCharsRef().toString(); - TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null)); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null)); + assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer); - stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null)); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null)); + assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); } @Test diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index a6659e082d5..e6d7062c925 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -32,7 +32,11 @@ import java.util.concurrent.CyclicBarrier; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene70.Lucene70Codec; @@ -99,7 +103,7 @@ public class TestSuggestField extends LuceneTestCase { public void testReservedChars() throws Exception { CharsRefBuilder charsRefBuilder = new CharsRefBuilder(); charsRefBuilder.append("sugg"); - charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.SEP_LABEL); + charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL); IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { new SuggestField("name", charsRefBuilder.toString(), 1); }); @@ 
-144,11 +148,11 @@ public class TestSuggestField extends LuceneTestCase { output.writeByte(SuggestField.TYPE); } BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray()); - TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null)); + TokenStream stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null)); assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer); - stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null)); + stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null)); assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); } @@ -894,4 +898,23 @@ public class TestSuggestField extends LuceneTestCase { iwc.setCodec(filterCodec); return iwc; } + + public final static class PayloadAttrToTypeAttrFilter extends TokenFilter { + private PayloadAttribute payload = addAttribute(PayloadAttribute.class); + private TypeAttribute type = addAttribute(TypeAttribute.class); + + protected PayloadAttrToTypeAttrFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + // we move them over so we can assert them more easily in the tests + type.setType(payload.getPayload().utf8ToString()); + return true; + } + return false; + } + } } From 662477361369e2c5c8117ee9b535bd19256eccc5 Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Tue, 5 Jun 2018 12:21:36 +0200 Subject: [PATCH 17/38] SOLR-11911: Make sure all processing is completed before asserting. 
--- .../autoscaling/sim/TestLargeCluster.java | 103 +++++++++++++----- 1 file changed, 77 insertions(+), 26 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java index 6e6b4aa3bd8..934d2ea77cb 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java @@ -60,6 +60,7 @@ import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.TimeSource; import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.util.LogLevel; +import org.apache.solr.util.TimeOut; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -83,8 +84,9 @@ public class TestLargeCluster extends SimSolrCloudTestCase { public static final int NUM_NODES = 100; static Map> listenerEvents = new ConcurrentHashMap<>(); - static AtomicInteger triggerFiredCount = new AtomicInteger(); - static CountDownLatch triggerFiredLatch; + static AtomicInteger triggerFinishedCount = new AtomicInteger(); + static AtomicInteger triggerStartedCount = new AtomicInteger(); + static CountDownLatch triggerFinishedLatch; static int waitForSeconds; @BeforeClass @@ -95,8 +97,9 @@ public class TestLargeCluster extends SimSolrCloudTestCase { @Before public void setupTest() throws Exception { waitForSeconds = 5; - triggerFiredCount.set(0); - triggerFiredLatch = new CountDownLatch(1); + triggerStartedCount.set(0); + triggerFinishedCount.set(0); + triggerFinishedLatch = new CountDownLatch(1); listenerEvents.clear(); // disable .scheduled_maintenance String suspendTriggerCommand = "{" + @@ -129,11 +132,18 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } } - public static class TestTriggerAction extends TriggerActionBase { + public static class FinishTriggerAction extends TriggerActionBase { @Override public void process(TriggerEvent event, ActionContext context) throws Exception { - triggerFiredCount.incrementAndGet(); - triggerFiredLatch.countDown(); + triggerFinishedCount.incrementAndGet(); + triggerFinishedLatch.countDown(); + } + } + + public static class StartTriggerAction extends TriggerActionBase { + @Override + public void process(TriggerEvent event, ActionContext context) throws Exception { + triggerStartedCount.incrementAndGet(); } } @@ -142,14 +152,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); String setTriggerCommand = "{" + "'set-trigger' : {" + - "'name' : 'node_lost_trigger'," + + "'name' : 'node_lost_trigger1'," + "'event' : 'nodeLost'," + "'waitFor' : '" + waitForSeconds + "s'," + "'enabled' : true," + "'actions' : [" + + "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -160,7 +171,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { "'set-listener' : " + "{" + "'name' : 'foo'," + - "'trigger' : 'node_lost_trigger'," + + "'trigger' : 'node_lost_trigger1'," + "'stage' : ['STARTED','ABORTED','SUCCEEDED', 'FAILED']," + 
"'beforeAction' : ['compute', 'execute']," + "'afterAction' : ['compute', 'execute']," + @@ -223,6 +234,19 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } } + // wait until started == finished + TimeOut timeOut = new TimeOut(20 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); + while (!timeOut.hasTimedOut()) { + if (triggerStartedCount.get() == triggerFinishedCount.get()) { + break; + } + timeOut.sleep(1000); + } + if (timeOut.hasTimedOut()) { + fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); + } + + log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS, CloudTestUtils.clusterShape(5, 15)) + "ms"); long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name()); @@ -238,14 +262,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); String setTriggerCommand = "{" + "'set-trigger' : {" + - "'name' : 'node_added_trigger'," + + "'name' : 'node_added_trigger2'," + "'event' : 'nodeAdded'," + "'waitFor' : '" + waitForSeconds + "s'," + "'enabled' : true," + "'actions' : [" + + "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -263,20 +288,34 @@ public class TestLargeCluster extends SimSolrCloudTestCase { log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS, CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms"); + // start adding nodes int numAddNode = NUM_NODES / 5; List addNodesList = new ArrayList<>(numAddNode); for (int i = 0; i < numAddNode; i++) { addNodesList.add(cluster.simAddNode()); cluster.getTimeSource().sleep(5000); } - boolean await = triggerFiredLatch.await(1000000 / SPEED, TimeUnit.MILLISECONDS); + // wait until at least one event is generated + boolean await = triggerFinishedLatch.await(10000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("trigger did not fire", await); + // wait until started == finished + TimeOut timeOut = new TimeOut(20 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); + while (!timeOut.hasTimedOut()) { + if (triggerStartedCount.get() == triggerFinishedCount.get()) { + break; + } + timeOut.sleep(1000); + } + if (timeOut.hasTimedOut()) { + fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); + } + List systemColl = cluster.simGetSystemCollection(); int startedEventPos = -1; for (int i = 0; i < systemColl.size(); i++) { SolrInputDocument d = systemColl.get(i); - if (!"node_added_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_added_trigger2".equals(d.getFieldValue("event.source_s"))) { continue; } if ("NODEADDED".equals(d.getFieldValue("event.type_s")) && @@ -298,13 +337,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrInputDocument finishedEvent = null; long lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); while (count-- > 0) { - 
cluster.getTimeSource().sleep(150000); + cluster.getTimeSource().sleep(10000); long currentNumOps = cluster.simGetOpCount("MOVEREPLICA"); if (currentNumOps == lastNumOps) { int size = systemColl.size() - 1; for (int i = size; i > lastIgnoredPos; i--) { SolrInputDocument d = systemColl.get(i); - if (!"node_added_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_added_trigger2".equals(d.getFieldValue("event.source_s"))) { continue; } if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) { @@ -407,14 +446,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); String setTriggerCommand = "{" + "'set-trigger' : {" + - "'name' : 'node_lost_trigger'," + + "'name' : 'node_lost_trigger3'," + "'event' : 'nodeLost'," + "'waitFor' : '" + waitFor + "s'," + "'enabled' : true," + "'actions' : [" + + "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -441,8 +481,8 @@ public class TestLargeCluster extends SimSolrCloudTestCase { cluster.simRemoveNode(nodes.get(i), false); cluster.getTimeSource().sleep(killDelay); } - // wait for the trigger to fire - boolean await = triggerFiredLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS); + // wait for the trigger to fire at least once + boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("trigger did not fire within timeout, " + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, await); @@ -450,7 +490,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { int startedEventPos = -1; for (int i = 0; i < systemColl.size(); i++) { SolrInputDocument d = systemColl.get(i); - if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) { continue; } if ("NODELOST".equals(d.getFieldValue("event.type_s")) && @@ -463,11 +503,22 @@ public class TestLargeCluster extends SimSolrCloudTestCase { "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, startedEventPos > -1); SolrInputDocument startedEvent = systemColl.get(startedEventPos); + // wait until started == finished + TimeOut timeOut = new TimeOut(20 * waitFor * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); + while (!timeOut.hasTimedOut()) { + if (triggerStartedCount.get() == triggerFinishedCount.get()) { + break; + } + timeOut.sleep(1000); + } + if (timeOut.hasTimedOut()) { + fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); + } int ignored = 0; int lastIgnoredPos = startedEventPos; for (int i = startedEventPos + 1; i < systemColl.size(); i++) { SolrInputDocument d = systemColl.get(i); - if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) { continue; } if ("NODELOST".equals(d.getFieldValue("event.type_s"))) { @@ -492,13 +543,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrInputDocument finishedEvent = null; 
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); while (count-- > 0) { - cluster.getTimeSource().sleep(150000); + cluster.getTimeSource().sleep(waitFor * 10000); long currentNumOps = cluster.simGetOpCount("MOVEREPLICA"); if (currentNumOps == lastNumOps) { int size = systemColl.size() - 1; for (int i = size; i > lastIgnoredPos; i--) { SolrInputDocument d = systemColl.get(i); - if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) { continue; } if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) { @@ -560,7 +611,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { "'actions' : [" + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -580,7 +631,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { assertEquals(response.get("result").toString(), "success"); - boolean await = triggerFiredLatch.await(waitForSeconds * 20000 / SPEED, TimeUnit.MILLISECONDS); + boolean await = triggerFinishedLatch.await(waitForSeconds * 20000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("The trigger did not fire at all", await); // wait for listener to capture the SUCCEEDED stage cluster.getTimeSource().sleep(2000); From add77d272582e909e9a7cf008d35ea72c3914230 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Tue, 5 Jun 2018 22:29:06 +1000 Subject: [PATCH 18/38] SOLR-12444: Updating a cluster policy fails --- solr/CHANGES.txt | 2 ++ .../autoscaling/AutoScalingHandlerTest.java | 21 +++++++++++++++++++ .../solrj/cloud/autoscaling/Clause.java | 2 +- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index c88de1061ed..1b454a004d6 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -1312,6 +1312,8 @@ Bug Fixes * SOLR-11477: Disallow resolving of external entities in the XML query parser (defType=xmlparser). 
(Michael Stepankin, Olga Barinova, Uwe Schindler, Christine Poerschke) +* SOLR-12444: Updating a cluster policy fails (noble) + Optimizations ---------------------- diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java index 483b60c14ff..cf119535e12 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java @@ -17,6 +17,7 @@ package org.apache.solr.cloud.autoscaling; +import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.List; import java.util.Map; @@ -26,6 +27,7 @@ import java.util.concurrent.TimeUnit; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrResponse; +import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.cloud.autoscaling.Policy; import org.apache.solr.client.solrj.embedded.JettySolrRunner; import org.apache.solr.client.solrj.impl.CloudSolrClient; @@ -1011,6 +1013,25 @@ public class AutoScalingHandlerTest extends SolrCloudTestCase { assertEquals(5L, properties.get(AutoScalingParams.ACTION_THROTTLE_PERIOD_SECONDS)); } + public void testUpdatePolicy() throws IOException, SolrServerException { + CloudSolrClient solrClient = cluster.getSolrClient(); + String setPropertiesCommand = "{'set-cluster-policy': [" + + "{'cores': '<4','node': '#ANY'}]}"; + solrClient.request(createAutoScalingRequest(SolrRequest.METHOD.POST, setPropertiesCommand)); + SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.GET, null); + NamedList response = solrClient.request(req); + assertEquals("<4", Utils.getObjectByPath(response,false,"cluster-policy[0]/cores")); + assertEquals("#ANY", Utils.getObjectByPath(response,false,"cluster-policy[0]/node")); + setPropertiesCommand = "{'set-cluster-policy': [" + + "{'cores': '<3','node': '#ANY'}]}"; + solrClient.request(createAutoScalingRequest(SolrRequest.METHOD.POST, setPropertiesCommand)); + req = createAutoScalingRequest(SolrRequest.METHOD.GET, null); + response = solrClient.request(req); + System.out.println(response); + + + } + static class AutoScalingRequest extends SolrRequest { protected final String message; diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java index c739588d354..8f198bd8bbc 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java @@ -140,7 +140,7 @@ public class Clause implements MapWriter, Comparable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Clause that = (Clause)o; - return compareTo(that) == 0; + return Objects.equals(this.original, that.original); } void addTags(Collection params) { From f9d807af755f33dea1384cb8b9c591875076dd1a Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Tue, 5 Jun 2018 23:14:11 +1000 Subject: [PATCH 19/38] SOLR-12387: fixing a test failure --- .../solr/handler/admin/CollectionsHandler.java | 5 +++-- .../solr/common/cloud/ClusterProperties.java | 18 ++++++++++++++++++ .../solr/common/cloud/ZkStateReader.java | 6 ++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git 
a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index 01d2fe89884..269bb50641d 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -206,9 +206,10 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission return this.coreContainer; } - protected void copyFromClusterProp(Map props, String prop) { + protected void copyFromClusterProp(Map props, String prop) throws IOException { if (props.get(prop) != null) return;//if it's already specified , return - Object defVal = coreContainer.getZkController().getZkStateReader().getClusterProperty(ImmutableList.of(COLLECTION_DEF, prop), null); + Object defVal = new ClusterProperties(coreContainer.getZkController().getZkStateReader().getZkClient()) + .getClusterProperty(ImmutableList.of(COLLECTION_DEF, prop), null); if (defVal != null) props.put(prop, String.valueOf(defVal)); } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java index 87896daad5b..446923b81de 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Collections; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; import org.apache.solr.common.SolrException; @@ -67,6 +68,23 @@ public class ClusterProperties { return value; } + /** + * Read the value of a cluster property, returning a default if it is not set + * + * @param key the property name or the full path to the property as a list of parts. 
+ * @param defaultValue the default value + * @param the type of the property + * @return the property value + * @throws IOException if there is an error reading the value from the cluster + */ + @SuppressWarnings("unchecked") + public T getClusterProperty(List key, T defaultValue) throws IOException { + T value = (T) Utils.getObjectByPath(getClusterProperties(), false, key); + if (value == null) + return defaultValue; + return value; + } + /** * Return the cluster properties * @throws IOException if there is an error reading properties from the cluster diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index 6b65c344d4c..a86c5e28448 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -961,6 +961,12 @@ public class ZkStateReader implements Closeable { return value; } + /**Same as the above but allows a full json path as a list of parts + * + * @param keyPath path to the property example ["collectionDefauls", "numShards"] + * @param defaultValue a default value to use if no such property exists + * @return the cluster property, or a default if the property is not set + */ public T getClusterProperty(List keyPath, T defaultValue) { T value = (T) Utils.getObjectByPath( clusterProperties, false, keyPath); if (value == null) From 7d0b64f9d5484c1f03ccf422bdee50c31443344c Mon Sep 17 00:00:00 2001 From: yonik Date: Tue, 5 Jun 2018 10:55:53 -0400 Subject: [PATCH 20/38] SOLR-12417: doc: fix CHANGES credit --- solr/CHANGES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1b454a004d6..66d20269722 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -283,7 +283,7 @@ Bug Fixes * SOLR-12374: SnapShooter.getIndexCommit can forget to decref the searcher; though it's not clear in practice when. (David Smiley) -* SOLR-12417: velocity response writer should enforce valid function name for v.json parameter (yonik) +* SOLR-12417: velocity response writer should enforce valid function name for v.json parameter (Mano Kovacs, yonik) * SOLR-12271: Fixed bug in how Analytics component reads negative values from float and double fields. 
(Houston Putman) From c587598096cde769c299594fb26d0a23b7bd5930 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Tue, 5 Jun 2018 12:30:34 -0400 Subject: [PATCH 21/38] LUCENE-7960: fix Solr test to include mandatory args --- .../src/test/org/apache/solr/core/ResourceLoaderTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java b/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java index 8ab9606a7b8..785195acea5 100644 --- a/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java +++ b/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java @@ -72,7 +72,8 @@ public class ResourceLoaderTest extends SolrTestCaseJ4 { Class clazz = ResourceLoaderAware.class; // Check ResourceLoaderAware valid objects - assertAwareCompatibility(clazz, new NGramFilterFactory(new HashMap<>())); + //noinspection unchecked + assertAwareCompatibility(clazz, new NGramFilterFactory(map("minGramSize", "1", "maxGramSize", "2"))); assertAwareCompatibility(clazz, new KeywordTokenizerFactory(new HashMap<>())); // Make sure it throws an error for invalid objects @@ -98,8 +99,9 @@ public class ResourceLoaderTest extends SolrTestCaseJ4 { assertAwareCompatibility(clazz, new JSONResponseWriter()); // Make sure it throws an error for invalid objects + //noinspection unchecked invalid = new Object[] { - new NGramFilterFactory(new HashMap<>()), + new NGramFilterFactory(map("minGramSize", "1", "maxGramSize", "2")), "hello", new Float( 12.3f ), new KeywordTokenizerFactory(new HashMap<>()) }; From cf63392183ffc96428fc4c52f546fec2cdf766d5 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Tue, 5 Jun 2018 14:04:55 -0400 Subject: [PATCH 22/38] SOLR-12376: New TaggerRequestHandler (SolrTextTagger). 
--- solr/CHANGES.txt | 3 + solr/NOTICE.txt | 14 + .../apache/solr/core/SolrResourceLoader.java | 12 +- .../solr/handler/tagger/OffsetCorrector.java | 178 ++++++++ .../handler/tagger/TagClusterReducer.java | 103 +++++ .../org/apache/solr/handler/tagger/TagLL.java | 176 ++++++++ .../apache/solr/handler/tagger/Tagger.java | 230 ++++++++++ .../handler/tagger/TaggerRequestHandler.java | 397 ++++++++++++++++++ .../solr/handler/tagger/TaggingAttribute.java | 65 +++ .../handler/tagger/TaggingAttributeImpl.java | 79 ++++ .../solr/handler/tagger/TermPrefixCursor.java | 189 +++++++++ .../handler/tagger/XmlOffsetCorrector.java | 113 +++++ .../solr/handler/tagger/package-info.java | 27 ++ .../solr/collection1/conf/schema-tagger.xml | 187 +++++++++ .../collection1/conf/solrconfig-tagger.xml | 59 +++ .../tagger/EmbeddedSolrNoSerializeTest.java | 153 +++++++ .../handler/tagger/RandomizedTaggerTest.java | 150 +++++++ .../solr/handler/tagger/Tagger2Test.java | 175 ++++++++ .../solr/handler/tagger/TaggerTest.java | 296 +++++++++++++ .../solr/handler/tagger/TaggerTestCase.java | 251 +++++++++++ .../handler/tagger/TaggingAttributeTest.java | 73 ++++ .../tagger/WordLengthTaggingFilter.java | 110 +++++ .../WordLengthTaggingFilterFactory.java | 67 +++ .../handler/tagger/XmlInterpolationTest.java | 224 ++++++++++ solr/solr-ref-guide/src/searching.adoc | 33 +- .../src/the-tagger-handler.adoc | 265 ++++++++++++ 26 files changed, 3622 insertions(+), 7 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java create mode 100644 solr/core/src/java/org/apache/solr/handler/tagger/package-info.java create mode 100644 solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml create mode 100644 solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java create mode 100644 solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java create mode 100644 solr/solr-ref-guide/src/the-tagger-handler.adoc diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 66d20269722..479406fb0c2 100644 --- a/solr/CHANGES.txt 
+++ b/solr/CHANGES.txt @@ -154,6 +154,9 @@ New Features * SOLR-12389: support deeply nested json objects in clusterprops.json (noble) +* SOLR-12376: Added the TaggerRequestHandler (AKA SolrTextTagger) for tagging text. It's used as a component of + NER/ERD systems including query-understanding. See the ref guide for more info. (David Smiley + Bug Fixes ---------------------- diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt index fd954f4ef4f..a5b2070a39b 100644 --- a/solr/NOTICE.txt +++ b/solr/NOTICE.txt @@ -537,3 +537,17 @@ See http://www.restlet.org/ Protocol Buffers - Google's data interchange format Copyright 2008 Google Inc. http://code.google.com/apis/protocolbuffers/ + +========================================================================= +== SolrTextTagger Notice == +========================================================================= + +The TaggerRequestHandler and related classes in its package came from the +OpenSextant Solr Text Tagger, +Copyright 2013 The MITRE Corporation. All Rights Reserved. + + This software was produced for the U. S. Government + under Contract No. W15P7T-11-C-F600, and is + subject to the Rights in Noncommercial Computer Software + and Noncommercial Computer Software Documentation + Clause 252.227-7014 (JUN 1995) \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java index 22753dd0c6a..0ff5c7b362c 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java +++ b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java @@ -16,6 +16,10 @@ */ package org.apache.solr.core; +import javax.naming.Context; +import javax.naming.InitialContext; +import javax.naming.NamingException; +import javax.naming.NoInitialContextException; import java.io.Closeable; import java.io.File; import java.io.FileOutputStream; @@ -47,10 +51,6 @@ import java.util.concurrent.ConcurrentSkipListSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; -import javax.naming.NoInitialContextException; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.util.CharFilterFactory; @@ -88,9 +88,9 @@ public class SolrResourceLoader implements ResourceLoader,Closeable static final String project = "solr"; static final String base = "org.apache" + "." + project; static final String[] packages = { - "", "analysis.", "schema.", "handler.", "search.", "update.", "core.", "response.", "request.", + "", "analysis.", "schema.", "handler.", "handler.tagger.", "search.", "update.", "core.", "response.", "request.", "update.processor.", "util.", "spelling.", "handler.component.", "handler.dataimport.", - "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.","handler.admin.", + "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.", "handler.admin.", "cloud.autoscaling." }; private static final java.lang.String SOLR_CORE_NAME = "solr.core.name"; diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java new file mode 100644 index 00000000000..1fb4911195d --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java @@ -0,0 +1,178 @@ +/* + * This software was produced for the U. S. 
Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.util.Arrays; + +import com.carrotsearch.hppc.IntArrayList; + +public abstract class OffsetCorrector { + + //TODO support a streaming style of consuming input text so that we need not take a + // String. Trickier because we need to keep more information as we parse to know when tags + // are adjacent with/without whitespace + + //Data structure requirements: + // Given a character offset: + // * determine what tagId is it's parent. + // * determine if it is adjacent to the parent open tag, ignoring whitespace + // * determine if it is adjacent to the parent close tag, ignoring whitespace + // Given a tagId: + // * What is it's parent tagId + // * What's the char offset of the start and end of the open tag + // * What's the char offset of the start and end of the close tag + + /** Document text. */ + protected final String docText; + + /** Array of tag info comprised of 5 int fields: + * [int parentTag, int openStartOff, int openEndOff, int closeStartOff, int closeEndOff]. + * It's size indicates how many tags there are. Tag's are ID'ed sequentially from 0. */ + protected final IntArrayList tagInfo; + + /** offsets of parent tag id change (ascending order) */ + protected final IntArrayList parentChangeOffsets; + /** tag id; parallel array to parentChangeOffsets */ + protected final IntArrayList parentChangeIds; + + protected final int[] offsetPair = new int[] { -1, -1};//non-thread-safe state + + /** Disjoint start and end span offsets (inclusive) of non-taggable sections. Null if none. */ + protected final IntArrayList nonTaggableOffsets; + + /** + * Initialize based on the document text. + * @param docText non-null structured content. + * @param hasNonTaggable if there may be "non-taggable" tags to track + */ + protected OffsetCorrector(String docText, boolean hasNonTaggable) { + this.docText = docText; + final int guessNumElements = Math.max(docText.length() / 20, 4); + + tagInfo = new IntArrayList(guessNumElements * 5); + parentChangeOffsets = new IntArrayList(guessNumElements * 2); + parentChangeIds = new IntArrayList(guessNumElements * 2); + nonTaggableOffsets = hasNonTaggable ? new IntArrayList(guessNumElements / 5) : null; + } + + /** Corrects the start and end offset pair. It will return null if it can't + * due to a failure to keep the offsets balance-able, or if it spans "non-taggable" tags. + * The start (left) offset is pulled left as needed over whitespace and opening tags. The end + * (right) offset is pulled right as needed over whitespace and closing tags. It's returned as + * a 2-element array. + *

    Note that the returned array is internally reused; just use it to examine the response. + */ + public int[] correctPair(int leftOffset, int rightOffset) { + rightOffset = correctEndOffsetForCloseElement(rightOffset); + if (spansNonTaggable(leftOffset, rightOffset)) + return null; + + int startTag = lookupTag(leftOffset); + //offsetPair[0] = Math.max(offsetPair[0], getOpenStartOff(startTag)); + int endTag = lookupTag(rightOffset-1); + //offsetPair[1] = Math.min(offsetPair[1], getCloseStartOff(endTag)); + + // Find the ancestor tag enclosing offsetPair. And bump out left offset along the way. + int iTag = startTag; + for (; !tagEnclosesOffset(iTag, rightOffset); iTag = getParentTag(iTag)) { + //Ensure there is nothing except whitespace thru OpenEndOff + int tagOpenEndOff = getOpenEndOff(iTag); + if (hasNonWhitespace(tagOpenEndOff, leftOffset)) + return null; + leftOffset = getOpenStartOff(iTag); + } + final int ancestorTag = iTag; + // Bump out rightOffset until we get to ancestorTag. + for (iTag = endTag; iTag != ancestorTag; iTag = getParentTag(iTag)) { + //Ensure there is nothing except whitespace thru CloseStartOff + int tagCloseStartOff = getCloseStartOff(iTag); + if (hasNonWhitespace(rightOffset, tagCloseStartOff)) + return null; + rightOffset = getCloseEndOff(iTag); + } + + offsetPair[0] = leftOffset; + offsetPair[1] = rightOffset; + return offsetPair; + } + + /** Correct endOffset for adjacent element at the right side. E.g. offsetPair might point to: + *

+   * <pre>
+   *   foo&lt;/tag&gt;
+   * </pre>
    + * and this method pulls the end offset left to the '<'. This is necessary for use with + * {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}. + * + * See https://issues.apache.org/jira/browse/LUCENE-5734 */ + protected int correctEndOffsetForCloseElement(int endOffset) { + if (docText.charAt(endOffset-1) == '>') { + final int newEndOffset = docText.lastIndexOf('<', endOffset - 2); + if (newEndOffset > offsetPair[0])//just to be sure + return newEndOffset; + } + return endOffset; + } + + protected boolean hasNonWhitespace(int start, int end) { + for (int i = start; i < end; i++) { + if (!Character.isWhitespace(docText.charAt(i))) + return true; + } + return false; + } + + protected boolean tagEnclosesOffset(int tag, int off) { + return off >= getOpenStartOff(tag) && off < getCloseEndOff(tag); + } + + protected int getParentTag(int tag) { return tagInfo.get(tag * 5 + 0); } + protected int getOpenStartOff(int tag) { return tagInfo.get(tag * 5 + 1); } + protected int getOpenEndOff(int tag) { return tagInfo.get(tag * 5 + 2); } + protected int getCloseStartOff(int tag) { return tagInfo.get(tag * 5 + 3); } + protected int getCloseEndOff(int tag) { return tagInfo.get(tag * 5 + 4); } + + protected int lookupTag(int off) { + int idx = Arrays.binarySearch(parentChangeOffsets.buffer, 0, parentChangeOffsets.size(), off); + if (idx < 0) + idx = (-idx - 1) - 1;//round down + return parentChangeIds.get(idx); + } + + protected boolean spansNonTaggable(int startOff, int endOff) { + if (nonTaggableOffsets == null) + return false; + int idx = Arrays.binarySearch(nonTaggableOffsets.buffer, 0, nonTaggableOffsets.size(), startOff); + //if tag start coincides with first or last char of non-taggable span then result is true. + // (probably never happens since those characters are actual element markup) + if (idx >= 0) + return true; + idx = -idx - 1;//modify for where we would insert + //if idx is odd then our span intersects a non-taggable span; return true + if ((idx & 1) == 1) + return true; + //it's non-taggable if the next non-taggable start span is before our endOff + if (idx == nonTaggableOffsets.size()) + return false; + return nonTaggableOffsets.get(idx) < endOff; + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java new file mode 100644 index 00000000000..9310a0429e1 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java @@ -0,0 +1,103 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +public interface TagClusterReducer { + /** + * Reduces the linked-list to only those tags that should be emitted + * @param head not null; 1-element array to head which isn't null either + */ + void reduce(TagLL[] head); + + static final TagClusterReducer ALL = new TagClusterReducer() { + @Override + public void reduce(TagLL[] head) { + } + }; + + static final TagClusterReducer NO_SUB = new TagClusterReducer() { + @Override + public void reduce(TagLL[] head) { + //loop forward over all tags + for (TagLL tag = head[0].nextTag; tag != null; tag = tag.nextTag) { + //loop backwards over prev tags from this tag + for (TagLL tPrev = tag.prevTag; tPrev != null; tPrev = tPrev.prevTag) { + assert tPrev.startOffset <= tag.startOffset; + //if a previous tag's endOffset is <= this one's, tForward can be removed + if (tPrev.endOffset >= tag.endOffset) { + tag.removeLL(); + break; + } else if (tPrev.startOffset == tag.startOffset) { + tPrev.removeLL(); + //continue; 'tag' is still valid + } + } + } + } + }; + + static final TagClusterReducer LONGEST_DOMINANT_RIGHT = new TagClusterReducer() { + @Override + public void reduce(TagLL[] head) { + + //--Optimize for common single-tag case + if (head[0].nextTag == null) + return; + + while (true) { + //--Find longest not already marked + TagLL longest = null; + for (TagLL t = head[0]; t != null; t = t.nextTag) { + if (!t.mark && (longest == null || t.charLen() >= longest.charLen())) + longest = t; + } + if (longest == null) + break; + //--Mark longest (so we return it eventually) + longest.mark = true; + //--Remove tags overlapping this longest + for (TagLL t = head[0]; t != null; t = t.nextTag) { + if (t.mark) + continue; + + if (t.overlaps(longest)) { + t.removeLL(); + } else if (t.startOffset >= longest.endOffset) { + break;//no subsequent can possibly overlap + } + } + }//loop + + //all-remaining should be marked +// for (TagLL t = head; t != null; t = t.nextTag) { +// assert t.mark; +//// if (!t.mark) { +//// t.removeLL(); +//// if (head == t) +//// head = t.nextTag; +//// } +// } + assert head[0].mark; + } + }; +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java new file mode 100644 index 00000000000..e8bb0a3bc9b --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java @@ -0,0 +1,176 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; + +import org.apache.lucene.util.BytesRef; + +/** + * This is a Tag -- a startOffset, endOffset and value. + *

    + * A Tag starts without a value in an + * "advancing" state. {@link #advance(org.apache.lucene.util.BytesRef, int)} + * is called with subsequent words and then eventually it won't advance any + * more, and value is set (could be null). + *

    + * A Tag is also a doubly-linked-list (hence the LL in the name). All tags share + * a reference to the head via a 1-element array, which is potentially modified + * if any of the linked-list methods are called. Tags in the list should have + * equal or increasing start offsets. + */ +public class TagLL{ + + private final TagLL[] head;//a shared pointer to the head; 1 element + TagLL prevTag, nextTag; // linked list + + private TermPrefixCursor cursor; + + final int startOffset;//inclusive + int endOffset;//exclusive + Object value;//null means unset + + /** optional boolean used by some TagClusterReducer's */ + boolean mark = false; + + TagLL(TagLL[] head, TermPrefixCursor cursor, int startOffset, int endOffset, Object value) { + this.head = head; + this.cursor = cursor; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.value = value; + } + + /** + * Advances this tag with "word" at offset "offset". If this tag is not in + * an advancing state then it does nothing. If it is advancing and prior to + * advancing further it sees a value, then a non-advancing tag may be inserted + * into the LL as side-effect. If this returns false (it didn't advance) and + * if there is no value, then it will also be removed. + * + * + * @param word The next word or null if at an end + * @param offset The last character in word's offset in the underlying + * stream. If word is null then it's meaningless. + * + * @return Whether it advanced or not. + */ + boolean advance(BytesRef word, int offset) throws IOException { + if (!isAdvancing()) + return false; + + Object iVal = cursor.getDocIds(); + + if (word != null && cursor.advance(word)) { + + if (iVal != null) { + addBeforeLL(new TagLL(head, null, startOffset, endOffset, iVal)); + } + + assert offset >= endOffset; + endOffset = offset; + return true; + } else { + this.value = iVal; + this.cursor = null; + if (iVal == null) + removeLL(); + return false; + } + } + + /** Removes this tag from the chain, connecting prevTag and nextTag. Does not + * modify "this" object's pointers, so the caller can refer to nextTag after + * removing it. */ + public void removeLL() { + if (head[0] == this) + head[0] = nextTag; + if (prevTag != null) { + prevTag.nextTag = nextTag; + } + if (nextTag != null) { + nextTag.prevTag = prevTag; + } + } + + void addBeforeLL(TagLL tag) { + assert tag.startOffset <= startOffset; + if (prevTag != null) { + assert prevTag.startOffset <= tag.startOffset; + prevTag.nextTag = tag; + tag.prevTag = prevTag; + } else { + assert head[0] == this; + head[0] = tag; + } + prevTag = tag; + tag.nextTag = this; + } + + void addAfterLL(TagLL tag) { + assert tag.startOffset >= startOffset; + if (nextTag != null) { + assert nextTag.startOffset >= tag.startOffset; + nextTag.prevTag = tag; + tag.nextTag = nextTag; + } + nextTag = tag; + tag.prevTag = this; + } + + public int charLen() { + return endOffset - startOffset; + } + + public TagLL getNextTag() { + return nextTag; + } + + public TagLL getPrevTag() { + return prevTag; + } + + public int getStartOffset() { + return startOffset; + } + public int getEndOffset() { + return endOffset; + } + public boolean overlaps(TagLL other) { + //don't use >= or <= because startOffset is inclusive while endOffset is exclusive + if (startOffset < other.startOffset) + return endOffset > other.startOffset; + else + return startOffset < other.endOffset; + } + + boolean isAdvancing() { + return cursor != null; + } + + @Override + public String toString() { + return (prevTag != null ? 
'*' : '-') + "|" + (nextTag != null ? '*' : '-') + + " " + startOffset + " to " + endOffset + (isAdvancing() ? '+' : " #" + value); + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java new file mode 100644 index 00000000000..12a4cf0a035 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java @@ -0,0 +1,230 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.index.Terms; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Tags maximum string of words in a corpus. This is a callback-style API + * in which you implement {@link #tagCallback(int, int, Object)}. + * + * This class should be independently usable outside Solr. + */ +public abstract class Tagger { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final TokenStream tokenStream; + private final TermToBytesRefAttribute byteRefAtt; + private final PositionIncrementAttribute posIncAtt; + private final OffsetAttribute offsetAtt; + private final TaggingAttribute taggingAtt; + + private final TagClusterReducer tagClusterReducer; + private final Terms terms; + private final Bits liveDocs; + private final boolean skipAltTokens; + private final boolean ignoreStopWords; + + private Map docIdsCache; + + /** Whether the WARNING about skipped tokens was already logged. 
*/ + private boolean loggedSkippedAltTokenWarning = false; + + public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream, + TagClusterReducer tagClusterReducer, boolean skipAltTokens, + boolean ignoreStopWords) throws IOException { + this.terms = terms; + this.liveDocs = liveDocs; + this.tokenStream = tokenStream; + this.skipAltTokens = skipAltTokens; + this.ignoreStopWords = ignoreStopWords; + byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class); + posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); + offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); + taggingAtt = tokenStream.addAttribute(TaggingAttribute.class); + tokenStream.reset(); + + this.tagClusterReducer = tagClusterReducer; + } + + public void enableDocIdsCache(int initSize) { + if (initSize > 0) + docIdsCache = new HashMap<>(initSize); + } + + public void process() throws IOException { + if (terms == null) + return; + + //a shared pointer to the head used by this method and each Tag instance. + final TagLL[] head = new TagLL[1]; + + TermPrefixCursor cursor = null;//re-used + + //boolean switch used to log warnings in case tokens where skipped during tagging. + boolean skippedTokens = false; + + while (tokenStream.incrementToken()) { + if (log.isTraceEnabled()) { + log.trace("Token: {}, posInc: {}, offset: [{},{}]", + byteRefAtt, posIncAtt.getPositionIncrement(), + offsetAtt.startOffset(), offsetAtt.endOffset()); + } + //check for posInc < 1 (alternate Tokens, such as expanded Synonyms) + if (posIncAtt.getPositionIncrement() < 1) { + //(a) Deal with this as a configuration issue and throw an exception + if (!skipAltTokens) { + //TODO throw UnsupportedTokenException when PhraseBuilder is ported + throw new IllegalStateException("Query Analyzer generates alternate " + + "Tokens (posInc == 0). Please adapt your Analyzer configuration or " + + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such " + + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS + + "' might result in wrong tagging results if the index time analyzer " + + "is not configured accordingly. For detailed information see " + + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225"); + } else { + //(b) In case the index time analyser had indexed all variants (users + // need to ensure that) processing of alternate tokens can be skipped + // as anyways all alternatives will be contained in the FST. + skippedTokens = true; + log.trace(" ... ignored token"); + continue; + } + } + //-- If PositionIncrement > 1 (stopwords) + if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) { + log.trace(" - posInc > 1 ... mark cluster as done"); + advanceTagsAndProcessClusterIfDone(head, null); + } + + final BytesRef term; + //NOTE: we need to lookup tokens if + // * the LookupAtt is true OR + // * there are still advancing tags (to find the longest possible match) + if(taggingAtt.isTaggable() || head[0] != null){ + //-- Lookup the term id from the next token + term = byteRefAtt.getBytesRef(); + if (term.length == 0) { + throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token"); + } + } else { //no current cluster AND lookup == false ... 
+ term = null; //skip this token + } + + //-- Process tag + advanceTagsAndProcessClusterIfDone(head, term); + + //-- only create new Tags for Tokens we need to lookup + if (taggingAtt.isTaggable() && term != null) { + + //determine if the terms index has a term starting with the provided term + // TODO create a pool of these cursors to reuse them more? could be trivial impl + if (cursor == null)// (else the existing cursor will be re-used) + cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache); + if (cursor.advance(term)) { + TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null); + cursor = null;//because the new tag now "owns" this instance + //and add it to the end + if (head[0] == null) { + head[0] = newTail; + } else { + for (TagLL t = head[0]; true; t = t.nextTag) { + if (t.nextTag == null) { + t.addAfterLL(newTail); + break; + } + } + } + } + }//if termId >= 0 + }//end while(incrementToken()) + + //-- Finish all tags + advanceTagsAndProcessClusterIfDone(head, null); + assert head[0] == null; + + if(!loggedSkippedAltTokenWarning && skippedTokens){ + loggedSkippedAltTokenWarning = true; //only log once + log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) " + + "while processing text. This may cause problems with some Analyzer " + + "configurations (e.g. query time synonym expansion). For details see " + + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225"); + } + + tokenStream.end(); + //tokenStream.close(); caller closes because caller acquired it + } + + private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException { + //-- Advance tags + final int endOffset = term != null ? offsetAtt.endOffset() : -1; + boolean anyAdvance = false; + for (TagLL t = head[0]; t != null; t = t.nextTag) { + anyAdvance |= t.advance(term, endOffset); + } + + //-- Process cluster if done + if (!anyAdvance && head[0] != null) { + tagClusterReducer.reduce(head); + for (TagLL t = head[0]; t != null; t = t.nextTag) { + assert t.value != null; + tagCallback(t.startOffset, t.endOffset, t.value); + } + head[0] = null; + } + } + + /** + * Invoked by {@link #process()} for each tag found. endOffset is always >= the endOffset + * given in the previous call. + * + * @param startOffset The character offset of the original stream where the tag starts. + * @param endOffset One more than the character offset of the original stream where the tag ends. + * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}. + */ + protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey); + + /** + * Returns a sorted array of integer docIds given the corresponding key. + * @param docIdsKey The lookup key. + * @return Not null + */ + protected IntsRef lookupDocIds(Object docIdsKey) { + return (IntsRef) docIdsKey; + } +} + diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java new file mode 100644 index 00000000000..a972e47165a --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java @@ -0,0 +1,397 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. 
W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; + +import com.google.common.io.CharStreams; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.StopFilterFactory; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.Terms; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; +import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.RequestHandlerBase; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.search.BitDocSet; +import org.apache.solr.search.DocList; +import org.apache.solr.search.DocSet; +import org.apache.solr.search.DocSlice; +import org.apache.solr.search.QParser; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.search.SolrReturnFields; +import org.apache.solr.search.SyntaxError; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Scans posted text, looking for matching strings in the Solr index. + * The public static final String members are request parameters. + * This handler is also called the "SolrTextTagger". + * + * @since 7.4.0 + */ +public class TaggerRequestHandler extends RequestHandlerBase { + + /** Request parameter. */ + public static final String OVERLAPS = "overlaps"; + /** Request parameter. */ + public static final String TAGS_LIMIT = "tagsLimit"; + /** Request parameter. */ + public static final String MATCH_TEXT = "matchText"; + /** Request parameter. 
*/ + public static final String SKIP_ALT_TOKENS = "skipAltTokens"; + /** Request parameter. */ + public static final String IGNORE_STOPWORDS = "ignoreStopwords"; + /** Request parameter. */ + public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust"; + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + @Override + public String getDescription() { + return "Processes input text to find matching tokens stored in the index."; + } + + @Override + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { + + //--Read params + final String indexedField = req.getParams().get("field"); + if (indexedField == null) + throw new RuntimeException("required param 'field'"); + + final TagClusterReducer tagClusterReducer = + chooseTagClusterReducer(req.getParams().get(OVERLAPS)); + final int rows = req.getParams().getInt(CommonParams.ROWS, 10000); + final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000); + final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false); + final SchemaField idSchemaField = req.getSchema().getUniqueKeyField(); + if (idSchemaField == null) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a" + + "uniqueKey in the schema.");//TODO this could be relaxed + } + final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false); + final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS, + fieldHasIndexedStopFilter(indexedField, req)); + + //--Get posted data + Reader inputReader = null; + Iterable streams = req.getContentStreams(); + if (streams != null) { + Iterator iter = streams.iterator(); + if (iter.hasNext()) { + inputReader = iter.next().getReader(); + } + if (iter.hasNext()) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + getClass().getSimpleName()+" does not support multiple ContentStreams"); //TODO support bulk tagging? + } + } + if (inputReader == null) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + getClass().getSimpleName()+" requires text to be POSTed to it"); + } + + // We may or may not need to read the input into a string + final InputStringLazy inputStringFuture = new InputStringLazy(inputReader); + + final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture); + + final String inputString;//only populated if needed + if (addMatchText || inputStringFuture.inputString != null) { + //Read the input fully into a String buffer that we'll need later, + // then replace the input with a reader wrapping the buffer. 
+ inputString = inputStringFuture.call(); + inputReader.close(); + inputReader = new StringReader(inputString); + } else { + inputString = null;//not used + } + + final SolrIndexSearcher searcher = req.getSearcher(); + final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc()); + final List tags = new ArrayList(2000); + + try { + Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer(); + try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) { + Terms terms = searcher.getSlowAtomicReader().terms(indexedField); + if (terms == null) + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "field " + indexedField + " has no indexed data"); + Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer, + skipAltTokens, ignoreStopWords) { + @SuppressWarnings("unchecked") + @Override + protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) { + if (tags.size() >= tagsLimit) + return; + if (offsetCorrector != null) { + int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset); + if (offsetPair == null) { + log.debug("Discarded offsets [{}, {}] because couldn't balance XML.", + startOffset, endOffset); + return; + } + startOffset = offsetPair[0]; + endOffset = offsetPair[1]; + } + + NamedList tag = new NamedList(); + tag.add("startOffset", startOffset); + tag.add("endOffset", endOffset); + if (addMatchText) + tag.add("matchText", inputString.substring(startOffset, endOffset)); + //below caches, and also flags matchDocIdsBS + tag.add("ids", lookupSchemaDocIds(docIdsKey)); + tags.add(tag); + } + + Map docIdsListCache = new HashMap<>(2000); + + ValueSourceAccessor uniqueKeyCache = new ValueSourceAccessor(searcher, + idSchemaField.getType().getValueSource(idSchemaField, null)); + + @SuppressWarnings("unchecked") + private List lookupSchemaDocIds(Object docIdsKey) { + List schemaDocIds = docIdsListCache.get(docIdsKey); + if (schemaDocIds != null) + return schemaDocIds; + IntsRef docIds = lookupDocIds(docIdsKey); + //translate lucene docIds to schema ids + schemaDocIds = new ArrayList(docIds.length); + for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) { + int docId = docIds.ints[i]; + assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?"; + matchDocIdsBS.set(docId);//also, flip docid in bitset + try { + schemaDocIds.add(uniqueKeyCache.objectVal(docId));//translates here + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + assert !schemaDocIds.isEmpty(); + + docIdsListCache.put(docIds, schemaDocIds); + return schemaDocIds; + } + + }; + tagger.enableDocIdsCache(2000);//TODO configurable + tagger.process(); + } + } finally { + inputReader.close(); + } + rsp.add("tagsCount",tags.size()); + rsp.add("tags", tags); + + rsp.setReturnFields(new SolrReturnFields( req )); + + //Solr's standard name for matching docs in response + rsp.add("response", getDocList(rows, matchDocIdsBS)); + } + + private static class InputStringLazy implements Callable { + final Reader inputReader; + String inputString; + + InputStringLazy(Reader inputReader) { + this.inputReader = inputReader; + } + + @Override + public String call() throws IOException { + if (inputString == null) { + inputString = CharStreams.toString(inputReader); + } + return inputString; + } + } + + protected OffsetCorrector getOffsetCorrector(SolrParams params, Callable inputStringProvider) throws Exception { + final boolean xmlOffsetAdjust = 
params.getBool(XML_OFFSET_ADJUST, false); + if (!xmlOffsetAdjust) { + return null; + } + try { + return new XmlOffsetCorrector(inputStringProvider.call()); + } catch (XMLStreamException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Expecting XML but wasn't: " + e, e); + } + } + + private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException { + //Now we must supply a Solr DocList and add it to the response. + // Typically this is gotten via a SolrIndexSearcher.search(), but in this case we + // know exactly what documents to return, the order doesn't matter nor does + // scoring. + // Ideally an implementation of DocList could be directly implemented off + // of a BitSet, but there are way too many methods to implement for a minor + // payoff. + int matchDocs = matchDocIdsBS.cardinality(); + int[] docIds = new int[ Math.min(rows, matchDocs) ]; + DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1); + for (int i = 0; i < docIds.length; i++) { + docIds[i] = docIdIter.nextDoc(); + } + return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f); + } + + private TagClusterReducer chooseTagClusterReducer(String overlaps) { + TagClusterReducer tagClusterReducer; + if (overlaps == null || overlaps.equals("NO_SUB")) { + tagClusterReducer = TagClusterReducer.NO_SUB; + } else if (overlaps.equals("ALL")) { + tagClusterReducer = TagClusterReducer.ALL; + } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) { + tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT; + } else { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "unknown tag overlap mode: "+overlaps); + } + return tagClusterReducer; + } + + /** + * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs + * either. If null is returned, then all docs are available. + */ + private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException { + final String[] corpusFilterQueries = req.getParams().getParams("fq"); + final SolrIndexSearcher searcher = req.getSearcher(); + final Bits docBits; + if (corpusFilterQueries != null && corpusFilterQueries.length > 0) { + List filterQueries = new ArrayList(corpusFilterQueries.length); + for (String corpusFilterQuery : corpusFilterQueries) { + QParser qParser = QParser.getParser(corpusFilterQuery, null, req); + try { + filterQueries.add(qParser.parse()); + } catch (SyntaxError e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); + } + } + + final DocSet docSet = searcher.getDocSet(filterQueries);//hopefully in the cache + //note: before Solr 4.7 we could call docSet.getBits() but no longer. 
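      // Descriptive note: when the filter DocSet is not bitset-backed, the fallback below adapts it
      // to the Bits interface by delegating each get(index) call to DocSet.exists(index).
      // Effect of the fq parameters handled above (parameter values are examples only):
      //   /tag?...&fq=type:city&fq=country:US  ->  only names whose documents match ALL of the
      //   filter queries may be emitted as tags (see TaggerTest.testMultipleFilterQueries).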
+ if (docSet instanceof BitDocSet) { + docBits = ((BitDocSet)docSet).getBits(); + } else { + docBits = new Bits() { + + @Override + public boolean get(int index) { + return docSet.exists(index); + } + + @Override + public int length() { + return searcher.maxDoc(); + } + }; + } + } else { + docBits = searcher.getSlowAtomicReader().getLiveDocs(); + } + return docBits; + } + + private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) { + FieldType fieldType = req.getSchema().getFieldType(field); + Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer + if (analyzer instanceof TokenizerChain) { + TokenizerChain tokenizerChain = (TokenizerChain) analyzer; + TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories(); + for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { + if (tokenFilterFactory instanceof StopFilterFactory) + return true; + } + } + return false; + } + + /** See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}. */ + static class ValueSourceAccessor { + private final List readerContexts; + private final ValueSource valueSource; + private final Map fContext; + private final FunctionValues[] functionValuesPerSeg; + private final int[] functionValuesDocIdPerSeg; + + ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) { + readerContexts = searcher.getIndexReader().leaves(); + this.valueSource = valueSource; + fContext = ValueSource.newContext(searcher); + functionValuesPerSeg = new FunctionValues[readerContexts.size()]; + functionValuesDocIdPerSeg = new int[readerContexts.size()]; + } + + Object objectVal(int topDocId) throws IOException { + // lookup segment level stuff: + int segIdx = ReaderUtil.subIndex(topDocId, readerContexts); + LeafReaderContext rcontext = readerContexts.get(segIdx); + int segDocId = topDocId - rcontext.docBase; + // unfortunately Lucene 7.0 requires forward only traversal (with no reset method). + // So we need to track our last docId (per segment) and re-fetch the FunctionValues. :-( + FunctionValues functionValues = functionValuesPerSeg[segIdx]; + if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) { + functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext); + } + functionValuesDocIdPerSeg[segIdx] = segDocId; + + // get value: + return functionValues.objectVal(segDocId); + } + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java new file mode 100644 index 00000000000..b7803e4f31a --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java @@ -0,0 +1,65 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.Attribute; + +/** + * Attribute used by the {@link Tagger} to decide if a token can start a + * new {@link TagLL tag}. + *
+ * By default this Attribute will return true, but it might be
+ * reset by some {@link TokenFilter} added to the {@link TokenStream} used
+ * to analyze the parsed text. Typically this will be done based on NLP
+ * processing results (e.g. to only lookup Named Entities).
+ *
    + * NOTE: that all Tokens are used to advance existing {@link TagLL tags}. + */ +public interface TaggingAttribute extends Attribute { + + /** + * By default this Attribute will be initialised with true. + * This ensures that all tokens are taggable by default (especially if + * the {@link TaggingAttribute} is not set by any component in the configured + * {@link TokenStream} + */ + public static final boolean DEFAULT_TAGGABLE = true; + + /** + * Getter for the taggable state of the current Token + * + * @return the state + */ + public boolean isTaggable(); + + /** + * Setter for the taggable state. Typically called by code within + * {@link TokenFilter#incrementToken()}. + * + * @param lookup the state + */ + public void setTaggable(boolean lookup); + +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java new file mode 100644 index 00000000000..55ecfbc6ef2 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java @@ -0,0 +1,79 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; + +/** + * Implementation of the {@link TaggingAttribute} + */ +public class TaggingAttributeImpl extends AttributeImpl implements TaggingAttribute { + + /** + * the private field initialised with {@link TaggingAttribute#DEFAULT_TAGGABLE} + */ + private boolean taggable = TaggingAttribute.DEFAULT_TAGGABLE; + + /* + * (non-Javadoc) + * @see org.opensextant.solrtexttagger.LookupAttribute#isLookup() + */ + @Override + public boolean isTaggable() { + return taggable; + } + + /* + * (non-Javadoc) + * @see org.opensextant.solrtexttagger.LookupAttribute#setLookup(boolean) + */ + @Override + public void setTaggable(boolean lookup) { + this.taggable = lookup; + } + + /* + * (non-Javadoc) + * @see org.apache.lucene.util.AttributeImpl#clear() + */ + @Override + public void clear() { + taggable = DEFAULT_TAGGABLE; + } + + /* + * (non-Javadoc) + * @see org.apache.lucene.util.AttributeImpl#copyTo(org.apache.lucene.util.AttributeImpl) + */ + @Override + public void copyTo(AttributeImpl target) { + ((TaggingAttribute) target).setTaggable(taggable); + } + + @Override + public void reflectWith(AttributeReflector reflector) { + reflector.reflect(TaggingAttribute.class, "taggable", isTaggable()); + } + +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java new file mode 100644 index 00000000000..1e82dbe4b5b --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java @@ -0,0 +1,189 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IntsRef; + +/** + * Cursor into the terms that advances by prefix. + */ +class TermPrefixCursor { + + //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup. + // Maybe that could be added to Lucene. + + // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict? + + private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable? 
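  // Illustrative sketch of driving the cursor (the example name and values are assumptions; the
  // dictionary field is assumed to be indexed through ConcatenateGraphFilter, so a multi-word
  // name such as "San Francisco" is stored as the single term "San" + SEP_LABEL + "Francisco"):
  //   TermPrefixCursor cursor = new TermPrefixCursor(termsEnum, liveDocs, null); // null = no doc-id cache
  //   cursor.advance(new BytesRef("San"));       // true if "San" exists or some term starts with "San" + SEP_LABEL
  //   cursor.advance(new BytesRef("Francisco")); // true if the prefix "San" + SEP_LABEL + "Francisco" still matches
  //   IntsRef docs = cursor.getDocIds();         // postings of the exact concatenated term, or null if only
  //                                              // longer names share this prefix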
+ private static final IntsRef EMPTY_INTSREF = new IntsRef(); + + private final TermsEnum termsEnum; + private final Bits liveDocs; + private final Map docIdsCache; + + private BytesRef prefixBuf;//we append to this + private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder(); + private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied + private PostingsEnum postingsEnum; + private IntsRef docIds; + + TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map docIdsCache) { + this.termsEnum = termsEnum; + this.liveDocs = liveDocs; + this.docIdsCache = docIdsCache; + } + + /** Appends the separator char (if not the first) plus the given word to the prefix buffer, + * then seeks to it. If the seek fails, false is returned and this cursor + * can be re-used as if in a new state. The {@code word} BytesRef is considered temporary, + * and is not saved within this class. */ + boolean advance(BytesRef word) throws IOException { + if (prefixBuf == null) { // first advance + //set prefixBuf to word temporary. When advance() completes, we either null out or copy. + prefixBuf = word; + prefixBufOnLoan = true; + if (seekPrefix()) {//... and we have to + ensureBufIsACopy(); + return true; + } else { + prefixBuf = null;//just to be darned sure 'word' isn't referenced here + return false; + } + + } else { // subsequent advance + //append to existing + assert !prefixBufOnLoan; + + prefixBufBuilder.append(SEPARATOR_CHAR); + prefixBufBuilder.append(word); + prefixBuf = prefixBufBuilder.get(); + if (seekPrefix()) { + return true; + } else { + prefixBuf = null; + return false; + } + } + } + + private void ensureBufIsACopy() { + if (!prefixBufOnLoan) + return; + + prefixBufBuilder.clear(); + prefixBufBuilder.copyBytes(prefixBuf); + prefixBuf = prefixBufBuilder.get(); + prefixBufOnLoan = false; + } + + /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char. + * Sets docIds. **/ + private boolean seekPrefix() throws IOException { + TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf); + + docIds = null;//invalidate + switch (seekStatus) { + case END: + return false; + + case FOUND: + postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); + docIds = postingsEnumToIntsRef(postingsEnum, liveDocs); + if (docIds.length > 0) { + return true; + } + + //Pretend we didn't find it; go to next term + docIds = null; + if (termsEnum.next() == null) { // case END + return false; + } + //fall through to NOT_FOUND + + case NOT_FOUND: + //termsEnum must start with prefixBuf to continue + BytesRef teTerm = termsEnum.term(); + + if (teTerm.length > prefixBuf.length) { + for (int i = 0; i < prefixBuf.length; i++) { + if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i]) + return false; + } + if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR) + return false; + return true; + } + return false; + } + throw new IllegalStateException(seekStatus.toString()); + } + + /** Returns an IntsRef either cached or reading postingsEnum. Not null. 
*/ + private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException { + // (The cache can have empty IntsRefs) + + //lookup prefixBuf in a cache + if (docIdsCache != null) { + docIds = docIdsCache.get(prefixBuf); + if (docIds != null) { + return docIds; + } + } + + //read postingsEnum + docIds = new IntsRef(termsEnum.docFreq()); + int docId; + while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { + if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) { + continue; + } + docIds.ints[docIds.length++] = docId; + } + if (docIds.length == 0) + docIds = EMPTY_INTSREF; + + //cache + if (docIdsCache != null) { + ensureBufIsACopy(); + //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to + docIdsCache.put(prefixBuf.clone(), docIds); + } + return docIds; + } + + /** The docIds of the last call to advance, if it returned true. It might be null, but + * its length won't be 0. Treat as immutable. */ + IntsRef getDocIds() { + assert docIds == null || docIds.length != 0; + return docIds; + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java new file mode 100644 index 00000000000..576328f65be --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java @@ -0,0 +1,113 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import javax.xml.stream.XMLResolver; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.events.XMLEvent; +import java.io.InputStream; +import java.io.StringReader; + +import com.ctc.wstx.stax.WstxInputFactory; +import org.apache.commons.io.input.ClosedInputStream; +import org.codehaus.stax2.LocationInfo; +import org.codehaus.stax2.XMLInputFactory2; +import org.codehaus.stax2.XMLStreamReader2; + +/** + * Corrects offsets to adjust for XML formatted data. The goal is such that the caller should be + * able to insert a start XML tag at the start offset and a corresponding end XML tag at the end + * offset of the tagger, and have it be valid XML. See {@link #correctPair(int, int)}. + * + * This will not work on invalid XML. + * + * Not thread-safe. + */ +public class XmlOffsetCorrector extends OffsetCorrector { + + //TODO use StAX without hard requirement on woodstox. 
xmlStreamReader.getLocation().getCharacterOffset() + + private static final XMLInputFactory2 XML_INPUT_FACTORY; + static { + // note: similar code in Solr's EmptyEntityResolver + XML_INPUT_FACTORY = new WstxInputFactory(); + XML_INPUT_FACTORY.setXMLResolver(new XMLResolver() { + @Override + public InputStream resolveEntity(String publicId, String systemId, String baseURI, String namespace) { + return ClosedInputStream.CLOSED_INPUT_STREAM; + } + }); + // TODO disable DTD? + // XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE) + XML_INPUT_FACTORY.configureForSpeed(); + } + + /** + * Initialize based on the document text. + * @param docText non-null XML content. + * @throws XMLStreamException If there's a problem parsing the XML. + */ + public XmlOffsetCorrector(String docText) throws XMLStreamException { + super(docText, false); + + int tagCounter = 0; + int thisTag = -1; + + //note: we *could* add a virtual outer tag to guarantee all text is in the context of a tag, + // but we shouldn't need to because there is no findable text outside the top element. + + final XMLStreamReader2 xmlStreamReader = + (XMLStreamReader2) XML_INPUT_FACTORY.createXMLStreamReader(new StringReader(docText)); + + while (xmlStreamReader.hasNext()) { + int eventType = xmlStreamReader.next(); + switch (eventType) { + case XMLEvent.START_ELEMENT: { + tagInfo.ensureCapacity(tagInfo.size() + 5); + final int parentTag = thisTag; + final LocationInfo info = xmlStreamReader.getLocationInfo(); + tagInfo.add(parentTag); + tagInfo.add((int) info.getStartingCharOffset(), (int) info.getEndingCharOffset()); + tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag + thisTag = tagCounter++; + + parentChangeOffsets.add((int) info.getStartingCharOffset()); + parentChangeIds.add(thisTag); + break; + } + case XMLEvent.END_ELEMENT: { + final LocationInfo info = xmlStreamReader.getLocationInfo(); + tagInfo.set(5 * thisTag + 3, (int) info.getStartingCharOffset()); + tagInfo.set(5 * thisTag + 4, (int) info.getEndingCharOffset()); + thisTag = getParentTag(thisTag); + + parentChangeOffsets.add((int) info.getEndingCharOffset()); + parentChangeIds.add(thisTag); + break; + } + default: //do nothing + } + } + } + +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java new file mode 100644 index 00000000000..c2055b308e5 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java @@ -0,0 +1,27 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * The {@link org.apache.solr.handler.tagger.TaggerRequestHandler} and supporting classes. 
+ * This was formerly known as OpenSextant's SolrTextTagger. + */ +package org.apache.solr.handler.tagger; \ No newline at end of file diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml new file mode 100644 index 00000000000..051cd10c7a5 --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml @@ -0,0 +1,187 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml new file mode 100644 index 00000000000..e0d367731d5 --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml @@ -0,0 +1,59 @@ + + + + + + ${tests.luceneMatchVersion:LUCENE_CURRENT} + ${solr.data.dir:} + + + + + + + + + + + + + name_tag:[* TO *] + + + + + name_tag:[* TO *] + + + + + + + + + name_tag + + + + diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java new file mode 100644 index 00000000000..8d31ad007ef --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java @@ -0,0 +1,153 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.BiFunction; + +import org.apache.lucene.document.Field; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.StreamingResponseCallback; +import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Tests that we can skip serialization of the documents when embedding + * Solr. + */ +public class EmbeddedSolrNoSerializeTest extends SolrTestCaseJ4 { + + static EmbeddedSolrServer solrServer; + + @BeforeClass + public static void init() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + solrServer = new EmbeddedSolrServer(h.getCoreContainer(), "collection1"); + //we don't need to close the EmbeddedSolrServer because SolrTestCaseJ4 closes the core + } + + @Before + public void setUp() throws Exception { + super.setUp(); + clearIndex(); + assertU(adoc("id", "9999", "name", "Boston")); + assertU(commit()); + } + + @Test + public void testTag() throws SolrServerException, IOException { + ModifiableSolrParams params = params(); + String input = "foo boston bar";//just one tag; + QueryRequest req = new SolrTaggerRequest(params, input); + req.setPath("/tag"); + + QueryResponse rsp = req.process(solrServer); + SolrDocumentList results= (SolrDocumentList) rsp.getResponse().get("response"); + assertNotNull(rsp.getResponse().get("tags")); + assertNotNull(results.get(0)); + } + + @SuppressWarnings("serial") + public static class SolrTaggerRequest extends QueryRequest { + + private final String input; + + public SolrTaggerRequest(SolrParams p, String input) { + super(p, METHOD.POST); + this.input = input; + } + + // Deprecated in 7.2 but should live on until 8.x + @SuppressWarnings("deprecation") + @Override + public Collection getContentStreams() { + return Collections.singleton(new ContentStreamBase.StringStream(input)); + } + + // As of 7.2. 
But won't work until: https://issues.apache.org/jira/browse/SOLR-12142 +// @Override +// public RequestWriter.ContentWriter getContentWriter(String expectedType) { +// return new RequestWriter.StringPayloadContentWriter(input, "text/plain; charset=UTF8"); +// } + } + + @Test + public void testSearch() throws Exception { + QueryResponse rsp = solrServer.query(params("q", "name:Boston")); + assertNotNull(rsp.getResults().get(0)); + } + + @Test + public void testAssertTagStreamingWithSolrTaggerRequest() throws Exception { + doTestAssertTagStreaming(SolrTaggerRequest::new); + } + + @Test + @Ignore("As of Solr 7, stream.body is disabled by default for security ") // DWS: dubious, IMO + // and it can't be enabled with EmbeddedSolrServer until SOLR-12126 + public void testAssertTagStreamingWithStreamBodyParam() throws Exception { + doTestAssertTagStreaming((params, input) -> { + params.set("stream.body", input); + return new QueryRequest(params); + }); + } + + public void doTestAssertTagStreaming(BiFunction newQueryRequest) throws IOException, SolrServerException { + ModifiableSolrParams params = params(); + String input = "foo boston bar";//just one tag; + QueryRequest req = newQueryRequest.apply(params, input); + req.setPath("/tag"); + + final AtomicReference refDoc = new AtomicReference<>(); + req.setStreamingResponseCallback(new StreamingResponseCallback() { + @Override + public void streamSolrDocument(SolrDocument doc) { + refDoc.set(doc); + } + + @Override + public void streamDocListInfo(long numFound, long start, Float maxScore) { + + } + }); + QueryResponse rsp = req.process(solrServer); + assertNotNull(rsp.getResponse().get("tags")); + assertNotNull(refDoc.get()); + assertEquals("Boston", ((Field)refDoc.get().getFieldValue("name")).stringValue()); + } +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java new file mode 100644 index 00000000000..cb742a87a8c --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java @@ -0,0 +1,150 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Random; +import java.util.Set; + +import com.carrotsearch.randomizedtesting.annotations.Repeat; +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Randomly generate taggable text and verify via simple tag algorithm. + */ +@Repeat(iterations = 10) +public class RandomizedTaggerTest extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + @Test + public void test() throws Exception { + final Random R = random(); + + Set names = new HashSet<>(); + //random list of single-word names + final int NUM_SINGLES = 4;//RandomInts.randomIntBetween(R, 1, 5); + for (int i = 0; i < NUM_SINGLES; i++) { + if (i == 0)//first is a big string (perhaps triggers bugs related to growing buffers) + names.add(randomStringOfLength(16, 32)); + else + names.add(randomString()); + } + + //add random list of multi-word names, partially including existing names + final int NUM_MULTI = 10; + for (int i = 0; i < NUM_MULTI; i++) { + final int numWords = RandomNumbers.randomIntBetween(R, 2, 4); + StringBuilder buf = new StringBuilder(); + for (int j = 0; j < numWords; j++) { + if (j != 0) + buf.append(' '); + if (R.nextBoolean()) {//new likely non-existent word + buf.append(randomString()); + } else {//existing word (possible multi-word from prev iteration) + buf.append(RandomPicks.randomFrom(R, names)); + } + } + names.add(buf.toString()); + } + + // BUILD NAMES + buildNames(names.toArray(new String[names.size()])); + + // QUERY LOOP + for (int tTries = 0; tTries < 10 * RANDOM_MULTIPLIER; tTries++) { + // Build up random input, similar to multi-word random names above + StringBuilder input = new StringBuilder(); + final int INPUT_WORD_LEN = 20; + input.append(' ');//must start with space based on assertBruteForce logic + for (int i = 0; i < INPUT_WORD_LEN; i++) { + if (R.nextBoolean()) {//new likely non-existent word + input.append(randomString()); + } else {//existing word (possible multi-word from prev iteration) + input.append(RandomPicks.randomFrom(R, NAMES)); + } + input.append(' ');//must end with a space + } + + boolean madeIt = false; + try { + assertBruteForce(input.toString()); + madeIt = true; + } finally { + if (!madeIt) { + System.out.println("Reproduce with:"); + System.out.print(" buildNames("); + for (int i = 0; i < NAMES.size(); i++) { + if (i != 0) + System.out.print(','); + System.out.print('"'); + System.out.print(NAMES.get(i)); + System.out.print('"'); + } + System.out.println(");"); + System.out.println(" assertBruteForce(\"" + input+"\");"); + } + } + } + + } + + private void assertBruteForce(String input) throws Exception { + assert input.matches(" .* "); + baseParams.set("overlaps", "ALL"); + + //loop through NAMES and find all tag offsets + List testTags = new ArrayList<>(); + for (String name : NAMES) { + String spaceName = " "+name+" "; + int off = 0; + while (true) { + int idx = input.indexOf(spaceName, off); + if (idx < 0) + break; + testTags.add(new TestTag(idx + 1, idx + 1 + name.length(), name, name)); + off = idx + 1; + } + } + + //assert + assertTags(reqDoc(input), testTags.toArray(new 
TestTag[testTags.size()])); + } + + private String randomString() { return randomStringOfLength(1, 1); } + + private String randomStringOfLength(int min, int max) { + return RandomStrings.randomAsciiLettersOfLengthBetween(random(), min, max).toLowerCase(Locale.ROOT); + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java new file mode 100644 index 00000000000..c7580e1f729 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java @@ -0,0 +1,175 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.nio.charset.StandardCharsets; + +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Test the {@link TaggerRequestHandler}. + */ +public class Tagger2Test extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); + } + + /** whole matching, no sub-tags */ + @Test + public void testLongestDominantRight() throws Exception { + buildNames("in", "San", "in San", "Francisco", "San Francisco", + "San Francisco State College", "College of California", + "Clayton", "Clayton North", "North Carolina"); + + assertTags("He lived in San Francisco.", + "in", "San Francisco"); + + assertTags("He enrolled in San Francisco State College of California", + "in", "San Francisco State College"); + + assertTags("He lived in Clayton North Carolina", + "in", "Clayton", "North Carolina"); + + } + + // As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to Lucene dev-list, + // Jan 26th 2015. Honestly it's not particularly important to us but it renders this test + // pointless. 
+ /** Orig issue https://github.com/OpenSextant/SolrTextTagger/issues/2 related: #13 */ + @Test + @Ignore + public void testVeryLongWord() throws Exception { + String SANFRAN = "San Francisco"; + buildNames(SANFRAN); + + // exceeds default 255 max token length which means it in-effect becomes a stop-word + StringBuilder STOP = new StringBuilder(260);//>255 + for (int i = 0; i < STOP.capacity(); i++) { + STOP.append((char) ('0' + (i % 10))); + } + + String doc = "San " + STOP + " Francisco"; + assertTags(doc);//no match due to default stop word handling + //and we find it when we ignore stop words + assertTags(reqDoc(doc, "ignoreStopwords", "true"), new TestTag(0, doc.length(), doc, lookupByName(SANFRAN))); + } + + /** Support for stopwords (posInc > 1); + * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */ + @Test + public void testStopWords() throws Exception { + baseParams.set("field", "name_tagStop");//stop filter (pos inc enabled) index & query + + String SOUTHOFWALES = "South of Wales";//'of' is stop word index time & query + String ACITYA = "A City A"; + + buildNames(SOUTHOFWALES, ACITYA); + + //round-trip works + assertTags(reqDoc(SOUTHOFWALES), new TestTag(0, SOUTHOFWALES.length(), SOUTHOFWALES, + lookupByName(SOUTHOFWALES))); + // but offsets doesn't include stopword when leading or trailing... + assertTags(reqDoc(ACITYA), new TestTag(2, 6, "City", + lookupByName(ACITYA))); + //break on stop words + assertTags(reqDoc(SOUTHOFWALES, "ignoreStopwords", "false"));//match nothing + } + + /** Tests WordDelimiterGraphFilter, stacked/synonymous tokens at index time (catenate options) */ + @Test + public void testWDF() throws Exception { + baseParams.set("field", "name_tagWDF"); + + final String WINSTONSALEM = "City of Winston-Salem";//hyphen + final String BOSTONHARBOR = "Boston Harbor";//space + buildNames(WINSTONSALEM, BOSTONHARBOR); + + //round-trip works + assertTags(reqDoc(WINSTONSALEM), new TestTag(0, WINSTONSALEM.length(), WINSTONSALEM, + lookupByName(WINSTONSALEM))); + + // space separated works + final String WS_SPACE = WINSTONSALEM.replace('-', ' '); + assertTags(reqDoc(WS_SPACE), + new TestTag(0, WS_SPACE.length(), WS_SPACE, + lookupByName(WINSTONSALEM))); + + //must be full match + assertTags(reqDoc("Winston"));//match nothing + assertTags(reqDoc("Salem"));//match nothing + + // round-trip works + assertTags(reqDoc(BOSTONHARBOR), new TestTag(0, BOSTONHARBOR.length(), BOSTONHARBOR, + lookupByName(BOSTONHARBOR))); + + // hyphen separated works + final String BH_HYPHEN = BOSTONHARBOR.replace(' ', '-'); + assertTags(reqDoc(BH_HYPHEN), + new TestTag(0, BH_HYPHEN.length(), BH_HYPHEN, + lookupByName(BOSTONHARBOR))); + //must be full match + assertTags(reqDoc("Boston"));//match nothing + assertTags(reqDoc("Harbor"));//match nothing + } + + /** Ensure character offsets work for multi-byte characters */ + @Test + public void testMultibyteChar() throws Exception { + // https://unicode-table.com/en/2019/ + // 0 1 2 3 4 + // 01234567890123456789012345678901234567890 + String TEXT = "He mentionned ’Obama’ in the White House"; + assertEquals(40, TEXT.length()); // char length (in Java, UTF16) + + String QUOTE = TEXT.substring(14, 15); + assertEquals(8217, QUOTE.codePointAt(0)); + + //UTF8 + assertEquals(3, QUOTE.getBytes(StandardCharsets.UTF_8).length); + assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length); + assertEquals(40 + 2*2, TEXT.getBytes(StandardCharsets.UTF_8).length); + + //UTF16 big endian (by specifying big/little endian, there is no "byte order 
mark") + assertEquals(2, QUOTE.getBytes(StandardCharsets.UTF_16BE).length); + assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length); + assertEquals(40 * 2, TEXT.getBytes(StandardCharsets.UTF_16BE).length); + + + buildNames("Obama"); + + assertTags(TEXT, "Obama"); + + // TODO test surrogate pairs (i.e. code points not in the BMP) + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java new file mode 100644 index 00000000000..93b11b50a28 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java @@ -0,0 +1,296 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.util.Arrays; +import java.util.stream.Collectors; + +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.junit.BeforeClass; +import org.junit.Ignore; + +/** + * The original test for {@link TaggerRequestHandler}. 
+ */ +public class TaggerTest extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + private void indexAndBuild() throws Exception { + N[] names = N.values(); + String[] namesStrs = new String[names.length]; + for (int i = 0; i < names.length; i++) { + namesStrs[i] = names[i].getName(); + } + buildNames(namesStrs); + } + + /** Name corpus */ + enum N { + //keep order to retain ord() + London, London_Business_School, Boston, City_of_London, + of, the//filtered out of the corpus by a custom query + ; + + String getName() { return name().replace('_',' '); } + static N lookupByName(String name) { return N.valueOf(name.replace(' ', '_')); } + int getId() { return ordinal(); } + } + + public void testFormat() throws Exception { + baseParams.set("overlaps", "NO_SUB"); + indexAndBuild(); + + String rspStr = _testFormatRequest(false); + String expected = "\n" + + "\n" + + "\n" + + "1\n" + + "\n" + + " \n" + + " 0\n" + + " 22\n" + + " \n" + + " 1\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " 1\n" + + " London Business School\n" + + "\n" + + "\n"; + assertEquals(expected, rspStr); + } + + public void testFormatMatchText() throws Exception { + baseParams.set("overlaps", "NO_SUB"); + indexAndBuild(); + + String rspStr = _testFormatRequest(true); + String expected = "\n" + + "\n" + + "\n" + + "1\n" + + "\n" + + " \n" + + " 0\n" + + " 22\n" + + " london business school\n" + + " \n" + + " 1\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " 1\n" + + " London Business School\n" + + "\n" + + "\n"; + assertEquals(expected, rspStr); + } + + private String _testFormatRequest(boolean matchText) throws Exception { + String doc = "london business school";//just one tag + SolrQueryRequest req = reqDoc(doc, "indent", "on", "omitHeader", "on", "matchText", ""+matchText); + String rspStr = h.query(req); + req.close(); + return rspStr; + } + + /** Partial matching, no sub-tags */ + @Ignore //TODO ConcatenateGraphFilter uses a special separator char that we can't put into XML (invalid char) + public void testPartialMatching() throws Exception { + baseParams.set("field", "name_tagPartial"); + baseParams.set("overlaps", "NO_SUB"); + baseParams.set("fq", "NOT name:(of the)");//test filtering + indexAndBuild(); + + //these match nothing + assertTags(reqDoc("") ); + assertTags(reqDoc(" ") ); + assertTags(reqDoc("the") ); + + String doc; + + //just London Business School via "school" substring + doc = "school"; + assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); + + doc = "a school"; + assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); + + doc = "school a"; + assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); + + //More interesting + + doc = "school City"; + assertTags(reqDoc(doc), + tt(doc, "school", 0, N.London_Business_School), + tt(doc, "City", 0, N.City_of_London) ); + + doc = "City of London Business School"; + assertTags(reqDoc(doc), //no plain London (sub-tag) + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London Business School", 0, N.London_Business_School)); + } + + /** whole matching, no sub-tags */ + public void testWholeMatching() throws Exception { + baseParams.set("overlaps", "NO_SUB"); + baseParams.set("fq", "NOT name:(of the)");//test filtering + indexAndBuild(); + + //these match nothing + assertTags(reqDoc("")); + assertTags(reqDoc(" ") ); + assertTags(reqDoc("the") ); + + //partial on 
N.London_Business_School matches nothing + assertTags(reqDoc("school") ); + assertTags(reqDoc("a school") ); + assertTags(reqDoc("school a") ); + assertTags(reqDoc("school City") ); + + String doc; + + doc = "school business london";//backwards + assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); + + doc = "of London Business School"; + assertTags(reqDoc(doc), //no plain London (sub-tag) + tt(doc, "London Business School", 0, N.London_Business_School)); + + //More interesting + doc = "City of London Business School"; + assertTags(reqDoc(doc), //no plain London (sub-tag) + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London Business School", 0, N.London_Business_School)); + + doc = "City of London Business"; + assertTags(reqDoc(doc), //no plain London (sub-tag) no Business (partial-match) + tt(doc, "City of London", 0, N.City_of_London)); + + doc = "London Business magazine"; + assertTags(reqDoc(doc), //Just London; L.B.S. fails + tt(doc, "London", 0, N.London)); + } + + /** whole matching, with sub-tags */ + public void testSubTags() throws Exception { + baseParams.set("overlaps", "ALL"); + baseParams.set("fq", "NOT name:(of the)");//test filtering + indexAndBuild(); + + //these match nothing + assertTags(reqDoc("")); + assertTags(reqDoc(" ") ); + assertTags(reqDoc("the") ); + + //partial on N.London_Business_School matches nothing + assertTags(reqDoc("school") ); + assertTags(reqDoc("a school") ); + assertTags(reqDoc("school a") ); + assertTags(reqDoc("school City") ); + + String doc; + + doc = "school business london";//backwards + assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); + + //More interesting + doc = "City of London Business School"; + assertTags(reqDoc(doc), + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London", 0, N.London), + tt(doc, "London Business School", 0, N.London_Business_School)); + + doc = "City of London Business"; + assertTags(reqDoc(doc), + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London", 0, N.London)); + } + + public void testMultipleFilterQueries() throws Exception { + baseParams.set("overlaps", "ALL"); + + // build up the corpus with some additional fields for filtering purposes + deleteByQueryAndGetVersion("*:*", null); + + int i = 0; + assertU(adoc("id", ""+i++, "name", N.London.getName(), "type", "city", "country", "UK")); + assertU(adoc("id", ""+i++, "name", N.London_Business_School.getName(), "type", "school", "country", "UK")); + assertU(adoc("id", ""+i++, "name", N.Boston.getName(), "type", "city", "country", "US")); + assertU(adoc("id", ""+i++, "name", N.City_of_London.getName(), "type", "org", "country", "UK")); + assertU(commit()); + + // not calling buildNames so that we can bring along extra attributes for filtering + NAMES = Arrays.stream(N.values()).map(N::getName).collect(Collectors.toList()); + + // phrase that matches everything + String doc = "City of London Business School in Boston"; + + // first do no filtering + ModifiableSolrParams p = new ModifiableSolrParams(); + p.add(CommonParams.Q, "*:*"); + assertTags(reqDoc(doc, p), + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London", 0, N.London), + tt(doc, "London Business School", 0, N.London_Business_School), + tt(doc, "Boston", 0, N.Boston)); + + // add a single fq + p.add(CommonParams.FQ, "type:city"); + assertTags(reqDoc(doc, p), + tt(doc, "London", 0, N.London), + tt(doc, "Boston", 0, N.Boston)); + + // add another fq + p.add(CommonParams.FQ, "country:US"); + assertTags(reqDoc(doc, p), + tt(doc, "Boston", 0, 
N.Boston)); + } + + private TestTag tt(String doc, String substring, int substringIndex, N name) { + assert substringIndex == 0; + + //little bit of copy-paste code from super.tt() + int startOffset = -1, endOffset; + int substringIndex1 = 0; + for(int i = 0; i <= substringIndex1; i++) { + startOffset = doc.indexOf(substring, ++startOffset); + assert startOffset >= 0 : "The test itself is broken"; + } + endOffset = startOffset+ substring.length();//1 greater (exclusive) + return new TestTag(startOffset, endOffset, substring, lookupByName(name.getName())); + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java new file mode 100644 index 00000000000..e525ce9265a --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java @@ -0,0 +1,251 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.EqualsBuilder; +import org.apache.lucene.document.Document; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.request.SolrQueryRequestBase; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocList; +import org.apache.solr.search.SolrIndexSearcher; +import org.junit.Rule; +import org.junit.rules.TestWatcher; +import org.junit.runner.Description; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class TaggerTestCase extends SolrTestCaseJ4 { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + @Rule + public TestWatcher watchman = new TestWatcher() { + @Override + protected void starting(Description description) { + log.info("{} being run...", description.getDisplayName()); + } + }; + + protected final ModifiableSolrParams baseParams = new ModifiableSolrParams(); + + //populated in buildNames; tested in assertTags + protected 
static List NAMES; + + @Override + public void setUp() throws Exception { + super.setUp(); + baseParams.clear(); + baseParams.set(CommonParams.QT, "/tag"); + baseParams.set(CommonParams.WT, "xml"); + } + + protected void assertTags(String doc, String... tags) throws Exception { + TestTag[] tts = new TestTag[tags.length]; + for (int i = 0; i < tags.length; i++) { + tts[i] = tt(doc, tags[i]); + } + assertTags(reqDoc(doc), tts); + } + + protected static void buildNames(String... names) throws Exception { + deleteByQueryAndGetVersion("*:*", null); + NAMES = Arrays.asList(names); + //Collections.sort(NAMES); + int i = 0; + for (String n : NAMES) { + assertU(adoc("id", ""+(i++), "name", n)); + } + assertU(commit()); + } + + protected String lookupByName(String name) { + for (String n : NAMES) { + if (n.equalsIgnoreCase(name)) + return n; + } + return null; + } + + protected TestTag tt(String doc, String substring) { + int startOffset = -1, endOffset; + int substringIndex = 0; + for(int i = 0; i <= substringIndex; i++) { + startOffset = doc.indexOf(substring,++startOffset); + assert startOffset >= 0 : "The test itself is broken"; + } + endOffset = startOffset+substring.length();//1 greater (exclusive) + return new TestTag(startOffset, endOffset, substring, lookupByName(substring)); + } + + /** Asserts the tags. Will call req.close(). */ + protected void assertTags(SolrQueryRequest req, TestTag... eTags) throws Exception { + try { + SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get(CommonParams.QT), req); + TestTag[] aTags = pullTagsFromResponse(req, rsp); + + String message; + if (aTags.length > 10) + message = null; + else + message = Arrays.asList(aTags).toString(); + Arrays.sort(eTags); + assertSortedArrayEquals(message, eTags, aTags); + + } finally { + req.close(); + } + } + + @SuppressWarnings("unchecked") + protected TestTag[] pullTagsFromResponse(SolrQueryRequest req, SolrQueryResponse rsp ) throws IOException { + NamedList rspValues = rsp.getValues(); + Map matchingNames = new HashMap<>(); + SolrIndexSearcher searcher = req.getSearcher(); + DocList docList = (DocList) rspValues.get("response"); + DocIterator iter = docList.iterator(); + while (iter.hasNext()) { + int docId = iter.next(); + Document doc = searcher.doc(docId); + String id = doc.getField("id").stringValue(); + String name = lookupByName(doc.get("name")); + assertEquals("looking for "+name, NAMES.indexOf(name)+"", id); + matchingNames.put(id, name); + } + + //build TestTag[] aTags from response ('a' is actual) + List mTagsList = (List) rspValues.get("tags"); + List aTags = new ArrayList<>(); + for (NamedList map : mTagsList) { + List foundIds = (List) map.get("ids"); + for (String id : foundIds) { + aTags.add(new TestTag( + ((Number)map.get("startOffset")).intValue(), + ((Number)map.get("endOffset")).intValue(), + null, + matchingNames.get(id))); + } + } + return aTags.toArray(new TestTag[0]); + } + + /** REMEMBER to close() the result req object. */ + protected SolrQueryRequest reqDoc(String doc, String... moreParams) { + return reqDoc(doc, params(moreParams)); + } + + /** REMEMBER to close() the result req object. 
*/ + protected SolrQueryRequest reqDoc(String doc, SolrParams moreParams) { + log.debug("Test doc: "+doc); + SolrParams params = SolrParams.wrapDefaults(moreParams, baseParams); + SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), params) {}; + Iterable stream = Collections.singleton((ContentStream)new ContentStreamBase.StringStream(doc)); + req.setContentStreams(stream); + return req; + } + + /** Asserts the sorted arrays are equals, with a helpful error message when not.*/ + public void assertSortedArrayEquals(String message, Object[] expecteds, Object[] actuals) { + AssertionError error = null; + try { + assertArrayEquals(null, expecteds, actuals); + } catch (AssertionError e) { + error = e; + } + if (error == null) + return; + TreeSet expectedRemaining = new TreeSet<>(Arrays.asList(expecteds)); + expectedRemaining.removeAll(Arrays.asList(actuals)); + if (!expectedRemaining.isEmpty()) + fail(message+": didn't find expected "+expectedRemaining.first()+" (of "+expectedRemaining.size()+"); "+ error); + TreeSet actualsRemaining = new TreeSet<>(Arrays.asList(actuals)); + actualsRemaining.removeAll(Arrays.asList(expecteds)); + fail(message+": didn't expect "+actualsRemaining.first()+" (of "+actualsRemaining.size()+"); "+ error); + } + + class TestTag implements Comparable { + final int startOffset, endOffset; + final String substring; + final String docName; + + TestTag(int startOffset, int endOffset, String substring, String docName) { + this.startOffset = startOffset; + this.endOffset = endOffset; + this.substring = substring; + this.docName = docName; + } + + @Override + public String toString() { + return "TestTag{" + + "[" + startOffset + "-" + endOffset + "]" + + " doc=" + NAMES.indexOf(docName) + ":'" + docName + "'" + + (docName.equals(substring) || substring == null ? "" : " substr="+substring)+ + '}'; + } + + @Override + public boolean equals(Object obj) { + TestTag that = (TestTag) obj; + return new EqualsBuilder() + .append(this.startOffset, that.startOffset) + .append(this.endOffset, that.endOffset) + .append(this.docName, that.docName) + .isEquals(); + } + + @Override + public int hashCode() { + return startOffset;//cheesy but acceptable + } + + @Override + public int compareTo(Object o) { + TestTag that = (TestTag) o; + return new CompareToBuilder() + .append(this.startOffset, that.startOffset) + .append(this.endOffset, that.endOffset) + .append(this.docName,that.docName) + .toComparison(); + } + } +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java new file mode 100644 index 00000000000..39c78286713 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java @@ -0,0 +1,73 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Test the {@link TaggerRequestHandler} with + * a Analyzer chain that does use the {@link TaggingAttribute}. See the test + * configuration under 'taggingattribute'. + */ +public class TaggingAttributeTest extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + /** + * Whole matching, no sub-tags. Links only words with > 3 letters. + * Because of that "San" is not used to start tags + * + */ + @Test + public void testTaggingAttribute() throws Exception { + baseParams.set("field", "name_tagAttribute"); // has WordLengthTaggingFilter using the TaggingAttribute + // this test is based on the longest dominant right test, so we use the + // the same TagClusterReducer setting + baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); + + buildNames("in", "San", "in San", "Francisco", "San Francisco", + "San Francisco State College", "College of California", + "Clayton", "Clayton North", "North Carolina"); + + assertTags("He lived in San Francisco.", + //"in", "San Francisco"); //whis would be expected without taggable + "Francisco");// this are the expected results with taggable + + assertTags("He enrolled in San Francisco State College of California", + //"in", "San Francisco State College"); //without taggable enabled + "Francisco", "College of California");// With taggable + //NOTE this also tests that started tags are advanced for non-taggable + // tokens, as otherwise 'College of California' would not be + // suggested. + + assertTags("He lived in Clayton North Carolina", + //"in", "Clayton", "North Carolina"); + "Clayton", "North Carolina"); + + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java new file mode 100644 index 00000000000..237a8b82c39 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java @@ -0,0 +1,110 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Simple TokenFilter that lookup only Tokens with more as the parsed number + * of chars.

+ * <p>
+ * NOTE: This implementation is only intended to be used as an example
+ * and for unit testing the {@link TaggingAttribute} feature. Typically,
+ * implementations will be based on NLP results (e.g. using POS tags or
+ * detected Named Entities).
+ * <p>
+ * Example Usage:
+ * <p>
+ * Currently the usage requires modifying the Analyzer as defined by the
+ * indexedField. An alternative would be to allow the configuration
+ * of a special FieldType in the schema.xml and to use that Analyzer for processing
+ * the text sent to the request.
+ * <p>
+ * While the current solution is fine for direct API usage, defining the
+ * Analyzer in the schema.xml would be better suited for using this feature
+ * with the {@link TaggerRequestHandler}.
+ *
+ * <pre class="prettyprint">
+ *     Analyzer analyzer = req.getSchema().getField(indexedField).getType().getAnalyzer();
+ *     //get the TokenStream from the Analyzer
+ *     TokenStream baseStream = analyzer.tokenStream("", reader);
+ *     //add a TokenFilter that sets the TaggingAttribute for each token
+ *     TokenStream filterStream = new WordLengthTaggingFilter(baseStream);
+ *     //create the Tagger using the modified analyzer chain
+ *     new Tagger(corpus, filterStream, tagClusterReducer) {
+ *
+ *         protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
+ *             //implement the callback
+ *         }
+ *
+ *     }.process();
+ * </pre>
    + */ +public class WordLengthTaggingFilter extends TokenFilter { + + /** + * The default minimum length is 3 + */ + public static final int DEFAULT_MIN_LENGTH = 3; + private final TaggingAttribute lookupAtt = addAttribute(TaggingAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private int minLength; + + /** + * TokenFilter only marks tokens to be looked up with equals or more as + * {@link #DEFAULT_MIN_LENGTH} characters + */ + public WordLengthTaggingFilter(TokenStream input) { + this(input, null); + } + + /** + * TokenFilter only marks tokens to be looked up with equals or more characters + * as the parsed minimum. + * + * @param input the TokenStream to consume tokens from + * @param minLength The minimum length to lookup a Token. null + * or <= 0 to use the #DEFAULT_MIN_LENGTH + */ + public WordLengthTaggingFilter(TokenStream input, Integer minLength) { + super(input); + if (minLength == null || minLength <= 0) { + this.minLength = DEFAULT_MIN_LENGTH; + } else { + this.minLength = minLength; + } + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int size = offsetAtt.endOffset() - offsetAtt.startOffset(); + lookupAtt.setTaggable(size >= minLength); + return true; + } else { + return false; + } + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java new file mode 100644 index 00000000000..dbfc5381bb6 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java @@ -0,0 +1,67 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.lang.invoke.MethodHandles; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class WordLengthTaggingFilterFactory extends TokenFilterFactory { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public static final String MIN_LENGTH = "minLength"; + + private final Integer minLength; + + public WordLengthTaggingFilterFactory(Map args) { + super(args); + int minLength = -1; + Object value = args.get(MIN_LENGTH); + if (value != null) { + try { + minLength = Integer.parseInt(value.toString()); + } catch (NumberFormatException e) { + log.warn("Unable to parse minLength from value 'minLength=\"{}\"'", value); + + } + } + if (minLength <= 0) { + log.info("use default minLength={}", WordLengthTaggingFilter.DEFAULT_MIN_LENGTH); + this.minLength = null; + } else { + log.info("set minLength={}", minLength); + this.minLength = minLength; + } + } + + @Override + public TokenStream create(TokenStream input) { + return new WordLengthTaggingFilter(input, minLength); + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java new file mode 100644 index 00000000000..d7dd5dff213 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java @@ -0,0 +1,224 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.io.IOUtils; +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.solr.common.SolrException; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.junit.BeforeClass; +import org.junit.Test; +import org.xml.sax.InputSource; + +public class XmlInterpolationTest extends TaggerTestCase { + + private static DocumentBuilder xmlDocBuilder; + + + @BeforeClass + public static void beforeClass() throws Exception { + DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance(); + xmlDocBuilderFactory.setValidating(true); + xmlDocBuilderFactory.setNamespaceAware(true); + xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder(); + + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + baseParams.set("field", "name_tagXml"); + baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); + baseParams.set("xmlOffsetAdjust", "true"); + } + + @Test + public void test() throws Exception { + buildNames("start end"); + + assertXmlTag("before start end after", true); + assertXmlTag("before start
    end after
    ", true); + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true);//adjacent tags + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true); + + assertXmlTag("

    before start

    end after
    ", false); + assertXmlTag("before start

    end after

    ", false); + + assertXmlTag("before start end after", true); + } + + @Test(expected = SolrException.class) + public void testInvalidXml() throws Exception { + assertXmlTag("notXml", false); + } + + @Test(expected = Exception.class) + public void testValidatingXml() throws Exception { + validateXml("foo"); + } + + protected void assertXmlTag(String docText, boolean expected) throws Exception { + final SolrQueryRequest req = reqDoc(docText); + try { // 5.4 and beyond we can use try-with-resources + final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req); + final TestTag[] testTags = pullTagsFromResponse(req, rsp); + if (!expected) { + assertEquals(0, testTags.length); + } else { + assertEquals(1, testTags.length); + final TestTag tag = testTags[0]; + validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName)); + } + } finally { + req.close(); + } + } + + protected void validateXml(String xml) throws Exception { + // the "parse" method also validates XML, will throw an exception if mis-formatted + xmlDocBuilder.parse(new InputSource(new StringReader(xml))); + } + + + @Test + public void testLuceneHtmlFilterBehavior() { + String docText; + + //Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734 + docText = "start end"; + assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); + + //Space after "end" means offset doesn't include + docText = "start end "; + assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); + + //Matches entity at end + final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd'); + docText = "start " + endStr + ""; + assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end")); + //... and at start + final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's'); + docText = "" + startStr + " end"; + assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end")); + + //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start". 
+ docText = "" + + "]>&start;"; + assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start")); + + //Test entity behavior + docText = " — – & &foo;   a b"; + assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"}, + analyzeReturnTokens(docText)); + + //Observe offset adjustment of trailing entity to end tag + docText = "foo bar"; + assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo")); + } + + private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) { + String insertStart = "";// (normally we'd escape id) + String insertEnd = ""; + return docText.substring(0, startOffset) + + insertStart + + docText.substring(startOffset, endOffset) + + insertEnd + + docText.substring(endOffset); + } + + private int[] tagExpect(String docText, String start, String end) { + return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()}; + } + + private int[] analyzeTagOne(String docText, String start, String end) { + int[] result = {-1, -1}; + + Reader filter = new HTMLStripCharFilter(new StringReader(docText)); + + WhitespaceTokenizer ts = new WhitespaceTokenizer(); + final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); + try { + ts.setReader(filter); + ts.reset(); + while (ts.incrementToken()) { + final String termString = termAttribute.toString(); + if (termString.equals(start)) + result[0] = offsetAttribute.startOffset(); + if (termString.equals(end)) { + result[1] = offsetAttribute.endOffset(); + return result; + } + } + ts.end(); + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + IOUtils.closeQuietly(ts); + } + return result; + } + + private String[] analyzeReturnTokens(String docText) { + List result = new ArrayList<>(); + + Reader filter = new HTMLStripCharFilter(new StringReader(docText), + Collections.singleton("unescaped")); + WhitespaceTokenizer ts = new WhitespaceTokenizer(); + final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); + try { + ts.setReader(filter); + ts.reset(); + while (ts.incrementToken()) { + result.add(termAttribute.toString()); + } + ts.end(); + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + IOUtils.closeQuietly(ts); + } + return result.toArray(new String[result.size()]); + } + +} diff --git a/solr/solr-ref-guide/src/searching.adoc b/solr/solr-ref-guide/src/searching.adoc index 145c1a4dcd5..753c2d88038 100644 --- a/solr/solr-ref-guide/src/searching.adoc +++ b/solr/solr-ref-guide/src/searching.adoc @@ -1,5 +1,35 @@ = Searching -:page-children: overview-of-searching-in-solr, velocity-search-ui, relevance, query-syntax-and-parsing, json-request-api, json-facet-api, faceting, highlighting, spell-checking, query-re-ranking, transforming-result-documents, suggester, morelikethis, pagination-of-results, collapse-and-expand-results, result-grouping, result-clustering, spatial-search, the-terms-component, the-term-vector-component, the-stats-component, the-query-elevation-component, response-writers, near-real-time-searching, realtime-get, exporting-result-sets, streaming-expressions, parallel-sql-interface, analytics +:page-children: overview-of-searching-in-solr, + + velocity-search-ui, + + relevance, + + query-syntax-and-parsing, + + json-request-api, + + json-facet-api, + + faceting, + + highlighting, + + spell-checking, + + query-re-ranking, + + 
transforming-result-documents, + + suggester, + + morelikethis, + + pagination-of-results, + + collapse-and-expand-results, + + result-grouping, + + result-clustering, + + spatial-search, + + the-terms-component, + + the-term-vector-component, + + the-stats-component, + + the-query-elevation-component, + + the-tagger-handler, + + response-writers, + + near-real-time-searching, + + realtime-get, + + exporting-result-sets, + + streaming-expressions, + + parallel-sql-interface, + + analytics + // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -50,6 +80,7 @@ This section describes how Solr works with search requests. It covers the follow * <>: How to get term information about specific documents. * <>: How to return information from numeric fields within a document set. * <>: How to force documents to the top of the results for certain queries. +* <>: The SolrTextTagger, for basic named entity tagging in text. * <>: Detailed information about configuring and using Solr's response writers. * <>: How to include documents in search results nearly immediately after they are indexed. * <>: How to get the latest version of a document without opening a searcher. diff --git a/solr/solr-ref-guide/src/the-tagger-handler.adoc b/solr/solr-ref-guide/src/the-tagger-handler.adoc new file mode 100644 index 00000000000..14ba8ed6f9f --- /dev/null +++ b/solr/solr-ref-guide/src/the-tagger-handler.adoc @@ -0,0 +1,265 @@ +[[the-tagger-handler]] += The Tagger Handler + +The "Tagger" Request Handler, AKA the "SolrTextTagger" is a "text tagger". +Given a dictionary (a Solr index) with a name-like field, + you post text to this request handler and it will return every occurrence of one of those names with offsets and other document metadata desired. +It's used for named entity recognition (NER). +It doesn't do any NLP (outside of Lucene text analysis) so it's said to be a "naive tagger", + but it's definitely useful as-is and a more complete NER or ERD (entity recognition and disambiguation) + system can be built with this as a key component. +The SolrTextTagger might be used on queries for query-understanding or large documents as well. + +To get a sense of how to use it, jump to the tutorial below. + +The tagger does not yet support a sharded index. +Tens, perhaps hundreds of millions of names (documents) are supported, mostly limited by memory. + +[[tagger-configuration]] +== Configuration + +The Solr schema needs 2 things: + +* A unique key field (see ``). + Recommended field settings: set `docValues=true` +* A tag field, a TextField, with `ConcatenateGraphFilterFactory` at the end of the index chain (not the query chain): + Set `preservePositionIncrements=false` on that filter. + Recommended field settings: `omitNorms=true`, `omitTermFreqAndPositions=true` and `postingsFormat=FST50` + +The text field's _index analysis chain_, aside from needing ConcatenateGraphFilterFactory at the end, + can otherwise have whatever tokenizer and filters suit your matching preferences. +It can have multi-word synonyms and use WordDelimiterGraphFilterFactory for example. +However, do _not_ use FlattenGraphFilterFactory as it will interfere with ConcatenateGraphFilterFactory. +Position gaps (e.g. stop words) get ignored; it's not (yet) supported for the gap to be significant. + +The text field's _query analysis chain_, on the other hand, is more limited. 
+There should not be tokens at the same position, thus no synonym expansion -- do that at index time instead. +Stop words (or any other filter introducing a position gap) are supported. +At runtime the tagger can be configured to either treat it as a tag break or to ignore it. + +The Solr config needs the `solr.TagRequestHandler` defined, which supports `defaults`, `invariants`, and `appends` +sections just like the search handler. + +[[tagger-parameters]] +== Tagger Parameters + +The tagger's execution is completely configurable with request parameters. Only `field` is required. + +`field`:: + The tag field that serves as the dictionary. + This is required; you'll probably specify it in the request handler. + +`fq`:: + You can specify some number of _filter queries_ to limit the dictionary used for tagging. + This parameter is the same as is used by the `solr.SearchHandler`. + +`rows`:: + The maximum number of documents to return, but defaulting to 10000 for a tag request. + This parameter is the same as is used by the `solr.SearchHandler`. + +`fl`:: + Solr's standard param for listing the fields to return. + This parameter is the same as is used by the `solr.SearchHandler`. + +`overlaps`:: + Choose the algorithm to determine which tags in an overlapping set should be retained, versus being pruned away. + Options are: + + * `ALL`: Emit all tags. + * `NO_SUB`: Don't emit a tag that is completely within another tag (i.e. no subtag). + * `LONGEST_DOMINANT_RIGHT`: Given a cluster of overlapping tags, emit the longest one (by character length). + If there is a tie, pick the right-most. + Remove any tags overlapping with this tag then repeat the algorithm to potentially find other tags + that can be emitted in the cluster. + +`matchText`:: + A boolean indicating whether to return the matched text in the tag response. + This will trigger the tagger to fully buffer the input before tagging. + +`tagsLimit`:: + The maximum number of tags to return in the response. + Tagging effectively stops after this point. + By default this is 1000. + +`skipAltTokens`:: + A boolean flag used to suppress errors that can occur if, for example, + you enable synonym expansion at query time in the analyzer, which you normally shouldn't do. + Let this default to false unless you know that such tokens can't be avoided. + +`ignoreStopwords`:: + A boolean flag that causes stopwords (or any condition causing positions to skip like >255 char words) + to be ignored as if it wasn't there. + Otherwise, the behavior is to treat them as breaks in tagging on the presumption your indexed text-analysis + configuration doesn't have a StopWordFilter. + By default the indexed analysis chain is checked for the presence of a StopWordFilter and if found + then ignoreStopWords is true if unspecified. + You probably shouldn't have a StopWordFilter configured and probably won't need to set this param either. + +`xmlOffsetAdjust`:: + A boolean indicating that the input is XML and furthermore that the offsets of returned tags should be adjusted as + necessary to allow for the client to insert an openening and closing element at the tag offset pair. + If it isn't possible to do so then the tag will be omitted. + You are expected to configure `HTMLStripCharFilterFactory` in the schema when using this option. + This will trigger the tagger to fully buffer the input before tagging. + +Solr's parameters for controlling the response format are supported, like: + `echoParams`, `wt`, `indent`, etc. 
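+
+As a quick illustration of these parameters (a sketch only; the handler path `/tag` and the collection name `mycollection` below are placeholders for whatever you configure), a request that keeps only the longest dominant tags and returns the matched text might look like this:
+
+....
+curl -X POST \
+  'http://localhost:8983/solr/mycollection/tag?overlaps=LONGEST_DOMINANT_RIGHT&matchText=true&fl=id&wt=json&indent=on' \
+  -H 'Content-Type:text/plain' -d 'Free text mentioning names from your tag dictionary'
+....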
+ +[[tagger-tutorial-with-geonames]] +== Tutorial with Geonames + +This is a tutorial that demonstrates how to configure and use the text +tagger with the popular Geonames data set. It's more than a tutorial; +it's a how-to with information that wasn't described above. + +[[tagger-create-and-configure-a-solr-collection]] +=== Create and Configure a Solr Collection + +Create a Solr collection named "geonames". For the tutorial, we'll +assume the default "data-driven" configuration. It's good for +experimentation and getting going fast but not for production or being +optimal. + +.... +bin/solr create -c geonames +.... + +[[tagger-configuring]] +==== Configuring + +We need to configure the schema first. The "data driven" mode we're +using allows us to keep this step fairly minimal -- we just need to +declare a field type, 2 fields, and a copy-field. The critical part +up-front is to define the "tag" field type. There are many many ways to +configure text analysis; and we're not going to get into those choices +here. But an important bit is the `ConcatenateGraphFilterFactory` at the +end of the index analyzer chain. Another important bit for performance +is postingsFormat=FST50 resulting in a compact FST based in-memory data +structure that is especially beneficial for the text tagger. + +Schema configuration: + +.... +curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/schema -d '{ + "add-field-type":{ + "name":"tag", + "class":"solr.TextField", + "postingsFormat":"FST50", + "omitNorms":true, + "omitTermFreqAndPositions":true, + "indexAnalyzer":{ + "tokenizer":{ + "class":"solr.StandardTokenizerFactory" }, + "filters":[ + {"class":"solr.EnglishPossessiveFilterFactory"}, + {"class":"solr.ASCIIFoldingFilterFactory"}, + {"class":"solr.LowerCaseFilterFactory"}, + {"class":"solr.ConcatenateGraphFilterFactory", "preservePositionIncrements":false } + ]}, + "queryAnalyzer":{ + "tokenizer":{ + "class":"solr.StandardTokenizerFactory" }, + "filters":[ + {"class":"solr.EnglishPossessiveFilterFactory"}, + {"class":"solr.ASCIIFoldingFilterFactory"}, + {"class":"solr.LowerCaseFilterFactory"} + ]} + }, + + "add-field":{ "name":"name", "type":"text_general"}, + + "add-field":{ "name":"name_tag", "type":"tag", "stored":false }, + + "add-copy-field":{ "source":"name", "dest":[ "name_tag" ]} +}' +.... + +Configure a custom Solr Request Handler: + +.... +curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/config -d '{ + "add-requesthandler" : { + "name": "/tag", + "class":"solr.TaggerRequestHandler", + "defaults":{ "field":"name_tag" } + } +}' +.... + +[[tagger-load-some-sample-data]] +=== Load Some Sample Data + +We'll go with some Geonames.org data in CSV format. Solr is quite +flexible in loading data in a variety of formats. This +http://download.geonames.org/export/dump/cities1000.zip[cities1000.zip] +should be almost 7MB file expanding to a cities1000.txt file around +22.2MB containing 145k lines, each a city in the world of at least 1000 +population. + +Using bin/post: +.... +bin/post -c geonames -type text/csv \ + -params 'optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' \ + /tmp/cities1000.txt +.... +or using curl: +.... 
+curl -X POST --data-binary @/path/to/cities1000.txt -H 'Content-type:application/csv' \ + 'http://localhost:8983/solr/geonames/update?commit=true&optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' +.... + +That might take around 35 seconds; it depends. It can be a lot faster if +the schema were tuned to only have what we truly need (no text search if +not needed). + +In that command we said optimize=true to put the index in a state that +will make tagging faster. The encapsulator=%00 is a bit of a hack to +disable the default double-quote. + +[[tagger-tag-time]] +=== Tag Time! + +This is a trivial example tagging a small piece of text. For more +options, see the earlier documentation. + +.... +curl -X POST \ + 'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \ + -H 'Content-Type:text/plain' -d 'Hello New York City' +.... + +The response should be this (the QTime may vary): + +.... +{ + "responseHeader":{ + "status":0, + "QTime":1}, + "tagsCount":1, + "tags":[[ + "startOffset",6, + "endOffset",19, + "ids",["5128581"]]], + "response":{"numFound":1,"start":0,"docs":[ + { + "id":"5128581", + "name":["New York City"], + "countrycode":["US"]}] + }} +.... + +[[tagger-tips]] +== Tips + +Performance Tips: + +* Follow the recommended configuration field settings, especially `postingsFormat=FST50`. +* "optimize" after loading your dictionary down to 1 Lucene segment, or at least to as few as possible. +* For bulk tagging lots of documents, there are some strategies, not mutually exclusive: +** Batch them. + The tagger doesn't directly support batching but as a hack you can send a bunch of documents concatenated with + a nonsense word that is not in the dictionary like "ZZYYXXAABBCC" between them. + You'll need to keep track of the character offsets of these so you can subtract them from the results. +** For reducing tagging latency even further, consider embedding Solr with `EmbeddedSolrServer`. + See `EmbeddedSolrNoSerializeTest`. +** Use more than one thread -- perhaps as many as there are CPU cores available to Solr. \ No newline at end of file From 0c6f38a315d0df5abd01e7d4efe481bc53444a49 Mon Sep 17 00:00:00 2001 From: Jeff Date: Tue, 5 Jun 2018 15:11:32 -0400 Subject: [PATCH 23/38] SOLR-12233: QParserPlugin's static registry of builtins can be optimized to avoid needless ClassLoader activity on SolrCore load. --- solr/CHANGES.txt | 6 +- .../java/org/apache/solr/core/SolrCore.java | 2 +- .../org/apache/solr/search/QParserPlugin.java | 80 +++++++++---------- .../solr/search/TestStandardQParsers.java | 10 +-- 4 files changed, 51 insertions(+), 47 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 479406fb0c2..5adc6770f38 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -155,7 +155,7 @@ New Features * SOLR-12389: support deeply nested json objects in clusterprops.json (noble) * SOLR-12376: Added the TaggerRequestHandler (AKA SolrTextTagger) for tagging text. It's used as a component of - NER/ERD systems including query-understanding. See the ref guide for more info. (David Smiley + NER/ERD systems including query-understanding. See the ref guide for more info. (David Smiley) Bug Fixes ---------------------- @@ -332,6 +332,10 @@ Optimizations * SOLR-9922: Write buffering updates to another tlog. 
(Cao Manh Dat) +* SOLR-12233: QParserPlugin's built-in static registry now holds actual QParserPlugin instances instead of class + references. This is consistent with other plugin registries and allows a SolrCore to load faster. + (Jeff Miller, David Smiley) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java index 99c0cca0669..feab22dce1c 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrCore.java +++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java @@ -958,7 +958,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab initIndex(prev != null, reload); initWriters(); - qParserPlugins.init(createInstances(QParserPlugin.standardPlugins), this); + qParserPlugins.init(QParserPlugin.standardPlugins, this); valueSourceParsers.init(ValueSourceParser.standardValueSourceParsers, this); transformerFactories.init(TransformerFactory.defaultFactories, this); loadSearchComponents(); diff --git a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java index f80bc9c3cda..b20c3c87ac9 100644 --- a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java @@ -37,53 +37,53 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI public static final String DEFAULT_QTYPE = LuceneQParserPlugin.NAME; /** - * Internal use - name to class mappings of builtin parsers. + * Internal use - name to parser for the builtin parsers. * Each query parser plugin extending {@link QParserPlugin} has own instance of standardPlugins. * This leads to cyclic dependencies of static fields and to case when NAME field is not yet initialized. * This result to NPE during initialization. * For every plugin, listed here, NAME field has to be final and static. 
*/ - public static final Map> standardPlugins; + public static final Map standardPlugins; static { - HashMap> map = new HashMap<>(30, 1); - map.put(LuceneQParserPlugin.NAME, LuceneQParserPlugin.class); - map.put(FunctionQParserPlugin.NAME, FunctionQParserPlugin.class); - map.put(PrefixQParserPlugin.NAME, PrefixQParserPlugin.class); - map.put(BoostQParserPlugin.NAME, BoostQParserPlugin.class); - map.put(DisMaxQParserPlugin.NAME, DisMaxQParserPlugin.class); - map.put(ExtendedDismaxQParserPlugin.NAME, ExtendedDismaxQParserPlugin.class); - map.put(FieldQParserPlugin.NAME, FieldQParserPlugin.class); - map.put(RawQParserPlugin.NAME, RawQParserPlugin.class); - map.put(TermQParserPlugin.NAME, TermQParserPlugin.class); - map.put(TermsQParserPlugin.NAME, TermsQParserPlugin.class); - map.put(NestedQParserPlugin.NAME, NestedQParserPlugin.class); - map.put(FunctionRangeQParserPlugin.NAME, FunctionRangeQParserPlugin.class); - map.put(SpatialFilterQParserPlugin.NAME, SpatialFilterQParserPlugin.class); - map.put(SpatialBoxQParserPlugin.NAME, SpatialBoxQParserPlugin.class); - map.put(JoinQParserPlugin.NAME, JoinQParserPlugin.class); - map.put(SurroundQParserPlugin.NAME, SurroundQParserPlugin.class); - map.put(SwitchQParserPlugin.NAME, SwitchQParserPlugin.class); - map.put(MaxScoreQParserPlugin.NAME, MaxScoreQParserPlugin.class); - map.put(BlockJoinParentQParserPlugin.NAME, BlockJoinParentQParserPlugin.class); - map.put(BlockJoinChildQParserPlugin.NAME, BlockJoinChildQParserPlugin.class); - map.put(FiltersQParserPlugin.NAME, FiltersQParserPlugin.class); - map.put(CollapsingQParserPlugin.NAME, CollapsingQParserPlugin.class); - map.put(SimpleQParserPlugin.NAME, SimpleQParserPlugin.class); - map.put(ComplexPhraseQParserPlugin.NAME, ComplexPhraseQParserPlugin.class); - map.put(ReRankQParserPlugin.NAME, ReRankQParserPlugin.class); - map.put(ExportQParserPlugin.NAME, ExportQParserPlugin.class); - map.put(MLTQParserPlugin.NAME, MLTQParserPlugin.class); - map.put(HashQParserPlugin.NAME, HashQParserPlugin.class); - map.put(GraphQParserPlugin.NAME, GraphQParserPlugin.class); - map.put(XmlQParserPlugin.NAME, XmlQParserPlugin.class); - map.put(GraphTermsQParserPlugin.NAME, GraphTermsQParserPlugin.class); - map.put(IGainTermsQParserPlugin.NAME, IGainTermsQParserPlugin.class); - map.put(TextLogisticRegressionQParserPlugin.NAME, TextLogisticRegressionQParserPlugin.class); - map.put(SignificantTermsQParserPlugin.NAME, SignificantTermsQParserPlugin.class); - map.put(PayloadScoreQParserPlugin.NAME, PayloadScoreQParserPlugin.class); - map.put(PayloadCheckQParserPlugin.NAME, PayloadCheckQParserPlugin.class); - map.put(BoolQParserPlugin.NAME, BoolQParserPlugin.class); + HashMap map = new HashMap<>(30, 1); + map.put(LuceneQParserPlugin.NAME, new LuceneQParserPlugin()); + map.put(FunctionQParserPlugin.NAME, new FunctionQParserPlugin()); + map.put(PrefixQParserPlugin.NAME, new PrefixQParserPlugin()); + map.put(BoostQParserPlugin.NAME, new BoostQParserPlugin()); + map.put(DisMaxQParserPlugin.NAME, new DisMaxQParserPlugin()); + map.put(ExtendedDismaxQParserPlugin.NAME, new ExtendedDismaxQParserPlugin()); + map.put(FieldQParserPlugin.NAME, new FieldQParserPlugin()); + map.put(RawQParserPlugin.NAME, new RawQParserPlugin()); + map.put(TermQParserPlugin.NAME, new TermQParserPlugin()); + map.put(TermsQParserPlugin.NAME, new TermsQParserPlugin()); + map.put(NestedQParserPlugin.NAME, new NestedQParserPlugin()); + map.put(FunctionRangeQParserPlugin.NAME, new FunctionRangeQParserPlugin()); + map.put(SpatialFilterQParserPlugin.NAME, new 
SpatialFilterQParserPlugin()); + map.put(SpatialBoxQParserPlugin.NAME, new SpatialBoxQParserPlugin()); + map.put(JoinQParserPlugin.NAME, new JoinQParserPlugin()); + map.put(SurroundQParserPlugin.NAME, new SurroundQParserPlugin()); + map.put(SwitchQParserPlugin.NAME, new SwitchQParserPlugin()); + map.put(MaxScoreQParserPlugin.NAME, new MaxScoreQParserPlugin()); + map.put(BlockJoinParentQParserPlugin.NAME, new BlockJoinParentQParserPlugin()); + map.put(BlockJoinChildQParserPlugin.NAME, new BlockJoinChildQParserPlugin()); + map.put(FiltersQParserPlugin.NAME, new FiltersQParserPlugin()); + map.put(CollapsingQParserPlugin.NAME, new CollapsingQParserPlugin()); + map.put(SimpleQParserPlugin.NAME, new SimpleQParserPlugin()); + map.put(ComplexPhraseQParserPlugin.NAME, new ComplexPhraseQParserPlugin()); + map.put(ReRankQParserPlugin.NAME, new ReRankQParserPlugin()); + map.put(ExportQParserPlugin.NAME, new ExportQParserPlugin()); + map.put(MLTQParserPlugin.NAME, new MLTQParserPlugin()); + map.put(HashQParserPlugin.NAME, new HashQParserPlugin()); + map.put(GraphQParserPlugin.NAME, new GraphQParserPlugin()); + map.put(XmlQParserPlugin.NAME, new XmlQParserPlugin()); + map.put(GraphTermsQParserPlugin.NAME, new GraphTermsQParserPlugin()); + map.put(IGainTermsQParserPlugin.NAME, new IGainTermsQParserPlugin()); + map.put(TextLogisticRegressionQParserPlugin.NAME, new TextLogisticRegressionQParserPlugin()); + map.put(SignificantTermsQParserPlugin.NAME, new SignificantTermsQParserPlugin()); + map.put(PayloadScoreQParserPlugin.NAME, new PayloadScoreQParserPlugin()); + map.put(PayloadCheckQParserPlugin.NAME, new PayloadCheckQParserPlugin()); + map.put(BoolQParserPlugin.NAME, new BoolQParserPlugin()); standardPlugins = Collections.unmodifiableMap(map); } diff --git a/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java b/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java index ff9ffffcdfa..cab9026602a 100644 --- a/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java +++ b/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java @@ -16,15 +16,15 @@ */ package org.apache.solr.search; -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Test; - import java.lang.reflect.Field; import java.lang.reflect.Modifier; import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + /** * Check standard query parsers for class loading problems during initialization (NAME field is final and static). 
* Because every query plugin extend {@link org.apache.solr.search.QParserPlugin} and contains own instance of {@link org.apache.solr.search.QParserPlugin#standardPlugins}, @@ -50,9 +50,9 @@ public class TestStandardQParsers extends LuceneTestCase { List notFinal = new ArrayList<>(QParserPlugin.standardPlugins.size()); List mismatch = new ArrayList<>(QParserPlugin.standardPlugins.size()); - for (Map.Entry> pair : QParserPlugin.standardPlugins.entrySet()) { + for (Map.Entry pair : QParserPlugin.standardPlugins.entrySet()) { String regName = pair.getKey(); - Class clazz = pair.getValue(); + Class clazz = pair.getValue().getClass();; Field nameField = clazz.getField(FIELD_NAME); int modifiers = nameField.getModifiers(); From 124b5e047df697bfd61f1d14f87a0ee5d24f68d8 Mon Sep 17 00:00:00 2001 From: Varun Thacker Date: Tue, 5 Jun 2018 15:16:12 -0700 Subject: [PATCH 24/38] SOLR-11453: fix typos in the CHANGES entry --- solr/CHANGES.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 5adc6770f38..6dd4889300e 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -70,8 +70,8 @@ Upgrade Notes To return the previous behavior pass false to skipCommitOnMasterVersionZero in slave section of replication handler configuration, or pass it to the fetchindex command. -* SOLR-11453: Configuring slowQueryThresholdMillis now logs slow requests to a separate file - solr_slow_requests.log . - Previously they would get logged in the solr.xml file +* SOLR-11453: Configuring slowQueryThresholdMillis now logs slow requests to a separate file - solr_slow_requests.log. + Previously they would get logged in the solr.log file. New Features ---------------------- @@ -140,7 +140,7 @@ New Features * SOLR-12328: JSON Facet API: Domain change with graph query. (Daniel Meehl, Kevin Watters, yonik) -* SOLR-11453: Configuring slowQueryThresholdMillis logs slow requests to a separate file - solr_slow_requests.log . +* SOLR-11453: Configuring slowQueryThresholdMillis logs slow requests to a separate file - solr_slow_requests.log. 
(Shawn Heisey, Remko Popma, Varun Thacker) * SOLR-12401: Add getValue() and setValue() Stream Evaluators (Joel Bernstein, janhoy) From 9ff3f5a13614c9aefe50c84b9abc653490f536c4 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Wed, 6 Jun 2018 13:13:20 +1000 Subject: [PATCH 25/38] SOLR-12444: added more assertions to the test --- .../apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java index cf119535e12..cf57c48f18f 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java @@ -1027,8 +1027,8 @@ public class AutoScalingHandlerTest extends SolrCloudTestCase { solrClient.request(createAutoScalingRequest(SolrRequest.METHOD.POST, setPropertiesCommand)); req = createAutoScalingRequest(SolrRequest.METHOD.GET, null); response = solrClient.request(req); - System.out.println(response); - + assertEquals("<3", Utils.getObjectByPath(response,false,"cluster-policy[0]/cores")); + assertEquals("#ANY", Utils.getObjectByPath(response,false,"cluster-policy[0]/node")); } From 2b406a57c41c28a95db1deefc17527e600fa244d Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 6 Jun 2018 09:18:25 +0200 Subject: [PATCH 26/38] LUCENE-8346: Remove final modifier on QueryBuilder#createSpanQuery to allow override --- lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 2cb066bd208..697e3bb6657 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -346,9 +346,9 @@ public class QueryBuilder { /** * Creates a span query from the tokenstream. In the case of a single token, a simple SpanTermQuery is - * returned. When multiple tokens, an ordered SpanNearQuery with slop of 0 is returned. + * returned. When multiple tokens, an ordered SpanNearQuery with slop 0 is returned. 
*/ - protected final SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { + protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); if (termAtt == null) { return null; From 2f19ae1907e63864e19eda5b6325083f61f9cf66 Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Wed, 6 Jun 2018 15:50:19 +0700 Subject: [PATCH 27/38] SOLR-12250: Create the temporary tlog file properly --- .../org/apache/solr/update/TransactionLogTest.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java index d2b4b26df01..100d4d9a5c9 100644 --- a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java +++ b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java @@ -17,8 +17,10 @@ package org.apache.solr.update; +import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Locale; @@ -29,9 +31,11 @@ import org.junit.Test; public class TransactionLogTest extends LuceneTestCase { @Test - public void testBigLastAddSize() throws IOException { - String tlogFileName = String.format(Locale.ROOT, UpdateLog.LOG_FILENAME_PATTERN, UpdateLog.TLOG_NAME, 0); - try (TransactionLog transactionLog = new TransactionLog(Files.createTempFile(tlogFileName, "").toFile(), new ArrayList<>())) { + public void testBigLastAddSize() { + String tlogFileName = String.format(Locale.ROOT, UpdateLog.LOG_FILENAME_PATTERN, UpdateLog.TLOG_NAME, Long.MAX_VALUE); + Path path = createTempDir(); + File logFile = new File(path.toFile(), tlogFileName); + try (TransactionLog transactionLog = new TransactionLog(logFile, new ArrayList<>())) { transactionLog.lastAddSize = 2000000000; AddUpdateCommand updateCommand = new AddUpdateCommand(null); updateCommand.solrDoc = new SolrInputDocument(); From 0358fcb1759038f76d91c124703c5d3244125f75 Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Wed, 6 Jun 2018 15:52:15 +0700 Subject: [PATCH 28/38] SOLR-12250: Fix precommit --- .../src/test/org/apache/solr/update/TransactionLogTest.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java index 100d4d9a5c9..66ecbc65ff6 100644 --- a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java +++ b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java @@ -18,8 +18,6 @@ package org.apache.solr.update; import java.io.File; -import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Locale; From af7dfb182ebc175df44b2d0a846a7d11f58bf84b Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Wed, 6 Jun 2018 11:39:39 +0200 Subject: [PATCH 29/38] LUCENE-8299: Geo3D wrapper uses new polygon method factory that gives better support for polygons with many points (>100) --- lucene/CHANGES.txt | 3 +++ .../spatial/spatial4j/Geo3dShapeFactory.java | 10 ++++---- .../spatial/spatial4j/Geo3dRptTest.java | 23 +++++++++++++++++-- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9eecc4206b0..8f307fe45ab 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -267,6 +267,9 @@ Other * LUCENE-8301: Update randomizedtesting to 
2.6.0. (Dawid Weiss) +* LUCENE-8299: Geo3D wrapper uses new polygon method factory that gives better + support for polygons with many points (>100). (Ignacio vera) + * LUCENE-8261: InterpolatedProperties.interpolate and recursive property references. (Steve Rowe, Dawid Weiss) diff --git a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/spatial4j/Geo3dShapeFactory.java b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/spatial4j/Geo3dShapeFactory.java index a6147dfdf41..ccbd6df2e43 100644 --- a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/spatial4j/Geo3dShapeFactory.java +++ b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/spatial4j/Geo3dShapeFactory.java @@ -311,7 +311,7 @@ public class Geo3dShapeFactory implements S2ShapeFactory { */ private class Geo3dPolygonBuilder extends Geo3dPointBuilder implements PolygonBuilder { - List polyHoles; + List polyHoles = new ArrayList<>(); @Override public HoleBuilder hole() { @@ -321,10 +321,7 @@ public class Geo3dShapeFactory implements S2ShapeFactory { class Geo3dHoleBuilder extends Geo3dPointBuilder implements PolygonBuilder.HoleBuilder { @Override public PolygonBuilder endHole() { - if (polyHoles == null) { - polyHoles = new ArrayList<>(); - } - polyHoles.add(GeoPolygonFactory.makeGeoPolygon(planetModel, points)); + polyHoles.add(new GeoPolygonFactory.PolygonDescription(points)); return Geo3dPolygonBuilder.this; } } @@ -332,7 +329,8 @@ public class Geo3dShapeFactory implements S2ShapeFactory { @SuppressWarnings("unchecked") @Override public Shape build() { - GeoPolygon polygon = GeoPolygonFactory.makeGeoPolygon(planetModel, points, polyHoles); + GeoPolygonFactory.PolygonDescription description = new GeoPolygonFactory.PolygonDescription(points, polyHoles); + GeoPolygon polygon = GeoPolygonFactory.makeGeoPolygon(planetModel, description); return new Geo3dShape<>(polygon, context); } diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/spatial4j/Geo3dRptTest.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/spatial4j/Geo3dRptTest.java index d3b144f154c..eb6ed5bc69f 100644 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/spatial4j/Geo3dRptTest.java +++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/spatial4j/Geo3dRptTest.java @@ -18,9 +18,11 @@ package org.apache.lucene.spatial.spatial4j; import java.io.IOException; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import com.carrotsearch.randomizedtesting.annotations.Repeat; +import org.apache.lucene.spatial.SpatialTestData; import org.apache.lucene.spatial.composite.CompositeSpatialStrategy; import org.apache.lucene.spatial.prefix.RandomSpatialOpStrategyTestCase; import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy; @@ -96,7 +98,7 @@ public class Geo3dRptTest extends RandomSpatialOpStrategyTestCase { points.add(new GeoPoint(planetModel, 14 * DEGREES_TO_RADIANS, -180 * DEGREES_TO_RADIANS)); points.add(new GeoPoint(planetModel, -15 * DEGREES_TO_RADIANS, 153 * DEGREES_TO_RADIANS)); - final Shape triangle = new Geo3dShape(GeoPolygonFactory.makeGeoPolygon(planetModel, points),ctx); + final Shape triangle = new Geo3dShape<>(GeoPolygonFactory.makeGeoPolygon(planetModel, points),ctx); final Rectangle rect = ctx.makeRectangle(-49, -45, 73, 86); testOperation(rect,SpatialOperation.Intersects,triangle, false); } @@ -116,7 +118,7 @@ public class Geo3dRptTest extends RandomSpatialOpStrategyTestCase { new GeoPoint(planetModel, 54.0 * DEGREES_TO_RADIANS, 165.0 * 
DEGREES_TO_RADIANS), new GeoPoint(planetModel, -90.0 * DEGREES_TO_RADIANS, 0.0)}; final GeoPath path = GeoPathFactory.makeGeoPath(planetModel, 29 * DEGREES_TO_RADIANS, pathPoints); - final Shape shape = new Geo3dShape(path,ctx); + final Shape shape = new Geo3dShape<>(path,ctx); final Rectangle rect = ctx.makeRectangle(131, 143, 39, 54); testOperation(rect,SpatialOperation.Intersects,shape,true); } @@ -146,6 +148,23 @@ public class Geo3dRptTest extends RandomSpatialOpStrategyTestCase { return new Geo3dShape<>(areaShape, ctx); } + @Test + public void testOperationsFromFile() throws IOException { + setupStrategy(); + final Iterator indexedSpatialData = getSampleData( "states-poly.txt"); + final List indexedShapes = new ArrayList<>(); + while(indexedSpatialData.hasNext()) { + indexedShapes.add(indexedSpatialData.next().shape); + } + final Iterator querySpatialData = getSampleData( "states-bbox.txt"); + final List queryShapes = new ArrayList<>(); + while(querySpatialData.hasNext()) { + queryShapes.add(querySpatialData.next().shape); + queryShapes.add(randomQueryShape()); + } + testOperation(SpatialOperation.Intersects, indexedShapes, queryShapes, random().nextBoolean()); + } + //TODO move to a new test class? @Test public void testWKT() throws Exception { From c1f42906ef1e0929f3887736dd9bb5bd0f9975c8 Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Wed, 6 Jun 2018 11:06:50 +0200 Subject: [PATCH 30/38] SOLR-11911: Move simulator reset to @Before method. --- .../autoscaling/sim/SimSolrCloudTestCase.java | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java index e83f72f5712..270e7e72e4f 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java @@ -80,40 +80,38 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 { @Override public void setUp() throws Exception { super.setUp(); - if (cluster != null) { - // clear any persisted configuration - cluster.getDistribStateManager().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(new ZkNodeProps()), -1); - cluster.getDistribStateManager().setData(ZkStateReader.ROLES, Utils.toJSON(new HashMap<>()), -1); - cluster.getSimClusterStateProvider().simDeleteAllCollections(); - cluster.simClearSystemCollection(); - cluster.getSimNodeStateProvider().simRemoveDeadNodes(); - cluster.getSimClusterStateProvider().simRemoveDeadNodes(); - // restore the expected number of nodes - int currentSize = cluster.getLiveNodesSet().size(); - if (currentSize < clusterNodeCount) { - int addCnt = clusterNodeCount - currentSize; - while (addCnt-- > 0) { - cluster.simAddNode(); - } - } else if (currentSize > clusterNodeCount) { - cluster.simRemoveRandomNodes(currentSize - clusterNodeCount, true, random()); - } - // clean any persisted trigger state or events - removeChildren(ZkStateReader.SOLR_AUTOSCALING_EVENTS_PATH); - removeChildren(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH); - removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH); - removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH); - cluster.getSimClusterStateProvider().simResetLeaderThrottles(); - cluster.simRestartOverseer(null); - cluster.getTimeSource().sleep(5000); - cluster.simResetOpCounts(); - } } @Before - public void checkClusterConfiguration() { + public 
void checkClusterConfiguration() throws Exception { if (cluster == null) throw new RuntimeException("SimCloudManager not configured - have you called configureCluster()?"); + // clear any persisted configuration + cluster.getDistribStateManager().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(new ZkNodeProps()), -1); + cluster.getDistribStateManager().setData(ZkStateReader.ROLES, Utils.toJSON(new HashMap<>()), -1); + cluster.getSimClusterStateProvider().simDeleteAllCollections(); + cluster.simClearSystemCollection(); + cluster.getSimNodeStateProvider().simRemoveDeadNodes(); + cluster.getSimClusterStateProvider().simRemoveDeadNodes(); + // restore the expected number of nodes + int currentSize = cluster.getLiveNodesSet().size(); + if (currentSize < clusterNodeCount) { + int addCnt = clusterNodeCount - currentSize; + while (addCnt-- > 0) { + cluster.simAddNode(); + } + } else if (currentSize > clusterNodeCount) { + cluster.simRemoveRandomNodes(currentSize - clusterNodeCount, true, random()); + } + // clean any persisted trigger state or events + removeChildren(ZkStateReader.SOLR_AUTOSCALING_EVENTS_PATH); + removeChildren(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH); + removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH); + removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH); + cluster.getSimClusterStateProvider().simResetLeaderThrottles(); + cluster.simRestartOverseer(null); + cluster.getTimeSource().sleep(5000); + cluster.simResetOpCounts(); } protected void removeChildren(String path) throws Exception { From 3022bbce2ec493c759eceb4fac7eeab0fb908b59 Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Wed, 6 Jun 2018 15:14:22 +0200 Subject: [PATCH 31/38] SOLR-12445: Upgrade Dropwizard Metrics to version 3.2.6. --- lucene/ivy-versions.properties | 2 +- solr/licenses/metrics-core-3.2.2.jar.sha1 | 1 - solr/licenses/metrics-core-3.2.6.jar.sha1 | 1 + solr/licenses/metrics-ganglia-3.2.2.jar.sha1 | 1 - solr/licenses/metrics-ganglia-3.2.6.jar.sha1 | 1 + solr/licenses/metrics-graphite-3.2.2.jar.sha1 | 1 - solr/licenses/metrics-graphite-3.2.6.jar.sha1 | 1 + solr/licenses/metrics-jetty9-3.2.2.jar.sha1 | 1 - solr/licenses/metrics-jetty9-3.2.6.jar.sha1 | 1 + solr/licenses/metrics-jvm-3.2.2.jar.sha1 | 1 - solr/licenses/metrics-jvm-3.2.6.jar.sha1 | 1 + 11 files changed, 6 insertions(+), 6 deletions(-) delete mode 100644 solr/licenses/metrics-core-3.2.2.jar.sha1 create mode 100644 solr/licenses/metrics-core-3.2.6.jar.sha1 delete mode 100644 solr/licenses/metrics-ganglia-3.2.2.jar.sha1 create mode 100644 solr/licenses/metrics-ganglia-3.2.6.jar.sha1 delete mode 100644 solr/licenses/metrics-graphite-3.2.2.jar.sha1 create mode 100644 solr/licenses/metrics-graphite-3.2.6.jar.sha1 delete mode 100644 solr/licenses/metrics-jetty9-3.2.2.jar.sha1 create mode 100644 solr/licenses/metrics-jetty9-3.2.6.jar.sha1 delete mode 100644 solr/licenses/metrics-jvm-3.2.2.jar.sha1 create mode 100644 solr/licenses/metrics-jvm-3.2.6.jar.sha1 diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 981bbe69fd6..49a119096fa 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -60,7 +60,7 @@ com.sun.jersey.version = 1.9 /dom4j/dom4j = 1.6.1 /info.ganglia.gmetric4j/gmetric4j = 1.0.7 -io.dropwizard.metrics.version = 3.2.2 +io.dropwizard.metrics.version = 3.2.6 /io.dropwizard.metrics/metrics-core = ${io.dropwizard.metrics.version} /io.dropwizard.metrics/metrics-ganglia = ${io.dropwizard.metrics.version} /io.dropwizard.metrics/metrics-graphite = 
${io.dropwizard.metrics.version} diff --git a/solr/licenses/metrics-core-3.2.2.jar.sha1 b/solr/licenses/metrics-core-3.2.2.jar.sha1 deleted file mode 100644 index d14a04ef7a7..00000000000 --- a/solr/licenses/metrics-core-3.2.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -cd9886f498ee2ab2d994f0c779e5553b2c450416 diff --git a/solr/licenses/metrics-core-3.2.6.jar.sha1 b/solr/licenses/metrics-core-3.2.6.jar.sha1 new file mode 100644 index 00000000000..13fae6d0b21 --- /dev/null +++ b/solr/licenses/metrics-core-3.2.6.jar.sha1 @@ -0,0 +1 @@ +62fe170cffeded1cef60e9e3402a93b45ce14327 diff --git a/solr/licenses/metrics-ganglia-3.2.2.jar.sha1 b/solr/licenses/metrics-ganglia-3.2.2.jar.sha1 deleted file mode 100644 index e5d8496ca50..00000000000 --- a/solr/licenses/metrics-ganglia-3.2.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -d5bb1883e9b0daf0e4187e558746f5058f4585c1 diff --git a/solr/licenses/metrics-ganglia-3.2.6.jar.sha1 b/solr/licenses/metrics-ganglia-3.2.6.jar.sha1 new file mode 100644 index 00000000000..32c9d30a57d --- /dev/null +++ b/solr/licenses/metrics-ganglia-3.2.6.jar.sha1 @@ -0,0 +1 @@ +a44039835eafd2dad8842a9ed16a60c088c5b7ef diff --git a/solr/licenses/metrics-graphite-3.2.2.jar.sha1 b/solr/licenses/metrics-graphite-3.2.2.jar.sha1 deleted file mode 100644 index 5d11db4d518..00000000000 --- a/solr/licenses/metrics-graphite-3.2.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -908e8cbec1bbdb2f4023334e424c7de2832a95af diff --git a/solr/licenses/metrics-graphite-3.2.6.jar.sha1 b/solr/licenses/metrics-graphite-3.2.6.jar.sha1 new file mode 100644 index 00000000000..26a1bbce30c --- /dev/null +++ b/solr/licenses/metrics-graphite-3.2.6.jar.sha1 @@ -0,0 +1 @@ +ecbc470e9097bb3d7ff0232cca47f3badde2e20b diff --git a/solr/licenses/metrics-jetty9-3.2.2.jar.sha1 b/solr/licenses/metrics-jetty9-3.2.2.jar.sha1 deleted file mode 100644 index 92d35089db6..00000000000 --- a/solr/licenses/metrics-jetty9-3.2.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -3fc94d99f41dc3f5be5483c81828138104df4449 diff --git a/solr/licenses/metrics-jetty9-3.2.6.jar.sha1 b/solr/licenses/metrics-jetty9-3.2.6.jar.sha1 new file mode 100644 index 00000000000..5d7bfa49dd4 --- /dev/null +++ b/solr/licenses/metrics-jetty9-3.2.6.jar.sha1 @@ -0,0 +1 @@ +5dae1c13d8607663fbc7b22cf8c05aacd22f802e diff --git a/solr/licenses/metrics-jvm-3.2.2.jar.sha1 b/solr/licenses/metrics-jvm-3.2.2.jar.sha1 deleted file mode 100644 index 0c02f93213b..00000000000 --- a/solr/licenses/metrics-jvm-3.2.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -9cbf2030242f7ffb97fae23f8a81421eb8d4ad45 diff --git a/solr/licenses/metrics-jvm-3.2.6.jar.sha1 b/solr/licenses/metrics-jvm-3.2.6.jar.sha1 new file mode 100644 index 00000000000..219d02bb0b5 --- /dev/null +++ b/solr/licenses/metrics-jvm-3.2.6.jar.sha1 @@ -0,0 +1 @@ +a7a475393fe47dfee2042415430da3f01d4fe94e From d1631593f3a0ff9f58468ff0f32d7c1e8a404567 Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Wed, 6 Jun 2018 15:30:32 +0200 Subject: [PATCH 32/38] SOLR-12445: Update CHANGES. --- solr/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 6dd4889300e..c93ccd8e696 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -295,6 +295,8 @@ Bug Fixes * SOLR-12354: Register the /admin/info/key end-point at the startup time to avoid 404 (noble) +* SOLR-12445: Upgrade Dropwizard Metrics to version 3.2.6. 
(ab) + Optimizations ---------------------- From 7c6d74376a784224963b57cb8380a07279fd7608 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Wed, 6 Jun 2018 14:45:17 -0400 Subject: [PATCH 33/38] SOLR-12376: AwaitsFix testStopWords pending LUCENE-8344 --- .../src/test/org/apache/solr/handler/tagger/Tagger2Test.java | 1 + 1 file changed, 1 insertion(+) diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java index c7580e1f729..cafda46ade4 100644 --- a/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java +++ b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java @@ -86,6 +86,7 @@ public class Tagger2Test extends TaggerTestCase { /** Support for stopwords (posInc > 1); * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */ + @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8344") @Test public void testStopWords() throws Exception { baseParams.set("field", "name_tagStop");//stop filter (pos inc enabled) index & query From f8131e4661058707f5ed11f2b04932d18f41cfff Mon Sep 17 00:00:00 2001 From: Joel Bernstein Date: Wed, 6 Jun 2018 21:53:22 -0400 Subject: [PATCH 34/38] SOLR-10651, SOLR-10784: Add new statistical and machine learning functions to CHANGES.txt for 7.4 release --- solr/CHANGES.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index c93ccd8e696..1a88c5b0419 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -157,6 +157,20 @@ New Features * SOLR-12376: Added the TaggerRequestHandler (AKA SolrTextTagger) for tagging text. It's used as a component of NER/ERD systems including query-understanding. See the ref guide for more info. (David Smiley) +* SOLR-12266: Add discrete Fourier transform Stream Evaluators (Joel Bernstein) + +* SOLR-12158: Allow the monteCarlo Stream Evaluator to support variables (Joel Bernstein) + +* SOLR-11734: Add ones and zeros Stream Evaluators (Joel Bernstein) + +* SOLR-12273: Create Stream Evaluators for distance measures (Joel Bernstein) + +* SOLR-12159: Add memset Stream Evaluator (Joel Bernstein) + +* SOLR-12221: Add valueAt Stream Evaluator (Joel Bernstein) + +* SOLR-12175: Add random field type and dynamic field to the default managed-schema (Joel Bernstein) + Bug Fixes ---------------------- @@ -338,6 +352,8 @@ Optimizations references. This is consistent with other plugin registries and allows a SolrCore to load faster. (Jeff Miller, David Smiley) +* SOLR-12198: Stream Evaluators should not copy matrices needlessly (Joel Bernstein) + Other Changes ---------------------- @@ -400,6 +416,8 @@ Other Changes * SOLR-12374: Added SolrCore.withSearcher(lambda) to make grabbing the searcher easier than the more awkward RefCounted API. (David Smiley) +* SOLR-12183: Refactor Streaming Expression test cases (Joel Bernstein) + ================== 7.3.1 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. From 8a6f1bf5ada407ce75ce7b12a88e8a681d529825 Mon Sep 17 00:00:00 2001 From: Nhat Nguyen Date: Thu, 7 Jun 2018 09:33:44 +0200 Subject: [PATCH 35/38] LUCENE-8165: Ban copyOf and copyOfRange. These methods are lenient with out-of-bounds indices. 
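For readers outside the patch context, here is a minimal sketch of the leniency this commit bans, using only the JDK and the ArrayUtil.copyOfSubArray helper that the patch itself introduces; the demo class name and printed messages are illustrative and not part of the commit. Arrays.copyOfRange silently zero-pads when the requested end index runs past the source array, whereas the new helper delegates to System.arraycopy and fails fast:

import java.util.Arrays;
import org.apache.lucene.util.ArrayUtil; // copyOfSubArray is added by this patch

public class CopyLeniencyDemo {
  public static void main(String[] args) {
    byte[] src = {1, 2, 3};

    // JDK leniency: a 'to' index beyond src.length is accepted and the tail is
    // zero-padded, which can hide a length bug instead of surfacing it.
    byte[] padded = Arrays.copyOfRange(src, 1, 10);
    System.out.println(padded.length); // 9, of which 7 trailing bytes are zeros

    // Strict replacement: the same request is rejected because System.arraycopy
    // checks the source bounds before copying.
    try {
      ArrayUtil.copyOfSubArray(src, 1, 10);
    } catch (IndexOutOfBoundsException expected) {
      System.out.println("out-of-bounds copy rejected");
    }
  }
}

The entries added to lucene/tools/forbiddenApis/lucene.txt in this commit enforce the ban at build time, so the strict helpers are not merely a convention.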
Signed-off-by: Adrien Grand --- .../lucene/analysis/hunspell/Dictionary.java | 8 +- .../miscellaneous/WordDelimiterFilter.java | 7 +- .../analysis/synonym/SynonymFilter.java | 4 +- .../synonym/WordnetSynonymParser.java | 7 +- .../analysis/ngram/NGramTokenizerTest.java | 3 +- .../codecs/blockterms/BlockTermsWriter.java | 10 +- .../simpletext/SimpleTextBKDWriter.java | 10 +- .../SimpleTextSegmentInfoFormat.java | 3 +- .../CompressingStoredFieldsIndexReader.java | 25 ++- .../CompressingStoredFieldsWriter.java | 5 +- .../CompressingTermVectorsWriter.java | 8 +- .../index/SortedSetDocValuesWriter.java | 2 +- .../lucene/index/SortingLeafReader.java | 6 +- .../lucene/search/BlendedTermQuery.java | 8 +- .../lucene/search/CachingCollector.java | 9 +- .../apache/lucene/search/MaxScoreCache.java | 2 +- .../org/apache/lucene/search/PhraseQuery.java | 2 +- .../apache/lucene/search/PointInSetQuery.java | 2 +- .../apache/lucene/search/PointRangeQuery.java | 5 +- .../lucene/search/spans/SpanWeight.java | 4 +- .../org/apache/lucene/util/ArrayUtil.java | 200 +++++++++++++++++- .../java/org/apache/lucene/util/BytesRef.java | 9 +- .../apache/lucene/util/BytesRefBuilder.java | 4 +- .../java/org/apache/lucene/util/CharsRef.java | 3 +- .../apache/lucene/util/CharsRefBuilder.java | 4 +- .../apache/lucene/util/DocIdSetBuilder.java | 5 +- .../java/org/apache/lucene/util/IntsRef.java | 4 +- .../java/org/apache/lucene/util/LongsRef.java | 4 +- .../org/apache/lucene/util/PagedBytes.java | 7 +- .../apache/lucene/util/RoaringDocIdSet.java | 3 +- .../apache/lucene/util/SparseFixedBitSet.java | 3 +- .../org/apache/lucene/util/StringHelper.java | 2 +- .../DaciukMihovAutomatonBuilder.java | 4 +- .../org/apache/lucene/util/bkd/BKDWriter.java | 10 +- .../lucene/util/bkd/HeapPointWriter.java | 7 +- .../util/packed/DeltaPackedLongValues.java | 9 +- .../util/packed/MonotonicLongValues.java | 11 +- .../lucene/util/packed/PackedLongValues.java | 6 +- .../lucene/analysis/TestCharacterUtils.java | 4 +- .../AbstractTestCompressionMode.java | 11 +- .../lucene/codecs/lucene50/TestForUtil.java | 14 +- .../lucene/index/TestPerSegmentDeletes.java | 3 +- .../apache/lucene/search/TestBoolean2.java | 4 +- .../lucene/search/TestDoubleValuesSource.java | 3 +- .../lucene/search/TestLongValuesSource.java | 3 +- .../apache/lucene/search/TestPhraseQuery.java | 3 +- .../TestSimpleExplanationsWithFillerDocs.java | 5 +- .../apache/lucene/util/BaseSortTestCase.java | 4 +- .../lucene/util/StressRamUsageEstimator.java | 5 +- .../org/apache/lucene/util/TestArrayUtil.java | 87 ++++++++ .../org/apache/lucene/util/TestBytesRef.java | 8 + .../org/apache/lucene/util/TestCharsRef.java | 8 + .../org/apache/lucene/util/TestIntsRef.java | 8 + .../lucene/util/TestLSBRadixSorter.java | 4 +- .../org/apache/lucene/util/TestLongsRef.java | 47 ++++ .../lucene/util/TestMSBRadixSorter.java | 4 +- .../lucene/util/TestStringMSBRadixSorter.java | 4 +- .../lucene/util/packed/TestPackedInts.java | 5 +- .../expressions/TestExpressionSorts.java | 3 +- .../search/intervals/IntervalQuery.java | 4 +- .../lucene/document/TestHalfFloatPoint.java | 2 +- .../spatial/prefix/tree/QuadPrefixTree.java | 4 +- .../lucene/spatial/DistanceStrategyTest.java | 4 +- .../suggest/document/TestContextQuery.java | 3 +- .../index/BaseStoredFieldsFormatTestCase.java | 2 +- .../lucene/search/BlockScoreQueryWrapper.java | 10 +- .../util/automaton/AutomatonTestUtil.java | 4 +- lucene/tools/forbiddenApis/lucene.txt | 24 +++ 68 files changed, 517 insertions(+), 202 deletions(-) create mode 100644 
lucene/core/src/test/org/apache/lucene/util/TestLongsRef.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 7885daab8d8..f42aac3e474 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -62,7 +62,6 @@ import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.OfflineSorter; -import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.fst.Builder; @@ -927,10 +926,7 @@ public class Dictionary { if (hasStemExceptions && end+1 < line.length()) { String stemException = parseStemException(line.substring(end+1)); if (stemException != null) { - if (stemExceptionCount == stemExceptions.length) { - int newSize = ArrayUtil.oversize(stemExceptionCount+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); - stemExceptions = Arrays.copyOf(stemExceptions, newSize); - } + stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount+1); stemExceptionID = stemExceptionCount+1; // we use '0' to indicate no exception for the form stemExceptions[stemExceptionCount++] = stemException; } @@ -1125,7 +1121,7 @@ public class Dictionary { } if (upto < flags.length) { - flags = Arrays.copyOf(flags, upto); + flags = ArrayUtil.copyOfSubArray(flags, 0, upto); } return flags; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index 16edb3dbef7..4124e846c46 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; @@ -427,9 +426,9 @@ public final class WordDelimiterFilter extends TokenFilter { private void buffer() { if (bufferedLen == buffered.length) { int newSize = ArrayUtil.oversize(bufferedLen+1, 8); - buffered = Arrays.copyOf(buffered, newSize); - startOff = Arrays.copyOf(startOff, newSize); - posInc = Arrays.copyOf(posInc, newSize); + buffered = ArrayUtil.growExact(buffered, newSize); + startOff = ArrayUtil.growExact(startOff, newSize); + posInc = ArrayUtil.growExact(posInc, newSize); } startOff[bufferedLen] = offsetAttribute.startOffset(); posInc[bufferedLen] = posIncAttribute.getPositionIncrement(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index ec2676f7804..a51edb5dba0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.synonym; import java.io.IOException; -import java.util.Arrays; import 
org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -33,7 +32,6 @@ import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.FST; /** @@ -206,7 +204,7 @@ public final class SynonymFilter extends TokenFilter { public void add(char[] output, int offset, int len, int endOffset, int posLength) { if (count == outputs.length) { - outputs = Arrays.copyOf(outputs, ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)); + outputs = ArrayUtil.grow(outputs, count+1); } if (count == endOffsets.length) { final int[] next = new int[ArrayUtil.oversize(1+count, Integer.BYTES)]; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java index b74e37185d3..a4183d711b3 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java @@ -21,9 +21,9 @@ import java.io.IOException; import java.io.LineNumberReader; import java.io.Reader; import java.text.ParseException; -import java.util.Arrays; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; @@ -59,10 +59,7 @@ public class WordnetSynonymParser extends SynonymMap.Parser { synsetSize = 0; } - if (synset.length <= synsetSize+1) { - synset = Arrays.copyOf(synset, synset.length * 2); - } - + synset = ArrayUtil.grow(synset, synsetSize + 1); synset[synsetSize] = parseSynonym(line, new CharsRefBuilder()); synsetSize++; lastSynSetID = synSetID; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java index 46a0c1c143e..cb54fa2fc44 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.TestUtil; import com.carrotsearch.randomizedtesting.generators.RandomStrings; @@ -179,7 +180,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { } } assertTrue(grams.incrementToken()); - assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt)); + assertArrayEquals(ArrayUtil.copyOfSubArray(codePoints, start, end), toCodePoints(termAtt)); assertEquals(1, posIncAtt.getPositionIncrement()); assertEquals(1, posLenAtt.getPositionLength()); assertEquals(offsets[start], offsetAtt.startOffset()); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java index 9ed87b57a90..9de9d732a25 100644 --- 
a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java @@ -20,7 +20,6 @@ package org.apache.lucene.codecs.blockterms; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import org.apache.lucene.codecs.BlockTermState; @@ -44,7 +43,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.RamUsageEstimator; // TODO: currently we encode all terms between two indexed // terms as a block; but, we could decouple the two, ie @@ -260,11 +258,9 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { //System.out.println(" index term!"); } - if (pendingTerms.length == pendingCount) { - pendingTerms = Arrays.copyOf(pendingTerms, ArrayUtil.oversize(pendingCount+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)); - for(int i=pendingCount;i 0) { // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end: - leafBlockStartValues.add(Arrays.copyOf(leafValues, packedBytesLength)); + leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength)); } leafBlockFPs.add(out.getFilePointer()); checkMaxLeafNodeCount(leafBlockFPs.size()); @@ -539,8 +539,8 @@ final class SimpleTextBKDWriter implements Closeable { return scratch; } }; - assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength), - Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength), + assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength), + ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength), packedValues, leafDocs, 0); writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues); } @@ -1206,8 +1206,8 @@ final class SimpleTextBKDWriter implements Closeable { reader.getValue(mid, scratchBytesRef1); System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim); - byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength); - byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength); + byte[] minSplitPackedValue = ArrayUtil.copyOfSubArray(minPackedValue, 0, packedBytesLength); + byte[] maxSplitPackedValue = ArrayUtil.copyOfSubArray(maxPackedValue, 0, packedBytesLength); System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim); System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index 8a71c6df7dc..6d5bfe4c3b0 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -42,6 +42,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import 
org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.StringHelper; @@ -158,7 +159,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_ID); - final byte[] id = Arrays.copyOfRange(scratch.bytes(), SI_ID.length, scratch.length()); + final byte[] id = ArrayUtil.copyOfSubArray(scratch.bytes(), SI_ID.length, scratch.length()); if (!Arrays.equals(segmentID, id)) { throw new CorruptIndexException("file mismatch, expected: " + StringHelper.idToString(segmentID) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java index 0685d799497..61410b68e5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java @@ -21,7 +21,6 @@ import static org.apache.lucene.util.BitUtil.zigZagDecode; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -73,12 +72,12 @@ public final class CompressingStoredFieldsIndexReader implements Cloneable, Acco } if (blockCount == docBases.length) { final int newSize = ArrayUtil.oversize(blockCount + 1, 8); - docBases = Arrays.copyOf(docBases, newSize); - startPointers = Arrays.copyOf(startPointers, newSize); - avgChunkDocs = Arrays.copyOf(avgChunkDocs, newSize); - avgChunkSizes = Arrays.copyOf(avgChunkSizes, newSize); - docBasesDeltas = Arrays.copyOf(docBasesDeltas, newSize); - startPointersDeltas = Arrays.copyOf(startPointersDeltas, newSize); + docBases = ArrayUtil.growExact(docBases, newSize); + startPointers = ArrayUtil.growExact(startPointers, newSize); + avgChunkDocs = ArrayUtil.growExact(avgChunkDocs, newSize); + avgChunkSizes = ArrayUtil.growExact(avgChunkSizes, newSize); + docBasesDeltas = ArrayUtil.growExact(docBasesDeltas, newSize); + startPointersDeltas = ArrayUtil.growExact(startPointersDeltas, newSize); } // doc bases @@ -102,12 +101,12 @@ public final class CompressingStoredFieldsIndexReader implements Cloneable, Acco ++blockCount; } - this.docBases = Arrays.copyOf(docBases, blockCount); - this.startPointers = Arrays.copyOf(startPointers, blockCount); - this.avgChunkDocs = Arrays.copyOf(avgChunkDocs, blockCount); - this.avgChunkSizes = Arrays.copyOf(avgChunkSizes, blockCount); - this.docBasesDeltas = Arrays.copyOf(docBasesDeltas, blockCount); - this.startPointersDeltas = Arrays.copyOf(startPointersDeltas, blockCount); + this.docBases = ArrayUtil.copyOfSubArray(docBases, 0, blockCount); + this.startPointers = ArrayUtil.copyOfSubArray(startPointers, 0, blockCount); + this.avgChunkDocs = ArrayUtil.copyOfSubArray(avgChunkDocs, 0, blockCount); + this.avgChunkSizes = ArrayUtil.copyOfSubArray(avgChunkSizes, 0, blockCount); + this.docBasesDeltas = ArrayUtil.copyOfSubArray(docBasesDeltas, 0, blockCount); + this.startPointersDeltas = ArrayUtil.copyOfSubArray(startPointersDeltas, 0, blockCount); } private int block(int docID) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index 8cd8ccbc279..5b8eb9e1249 
100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -19,7 +19,6 @@ package org.apache.lucene.codecs.compressing; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import org.apache.lucene.codecs.CodecUtil; @@ -158,8 +157,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { public void finishDocument() throws IOException { if (numBufferedDocs == this.numStoredFields.length) { final int newLength = ArrayUtil.oversize(numBufferedDocs + 1, 4); - this.numStoredFields = Arrays.copyOf(this.numStoredFields, newLength); - endOffsets = Arrays.copyOf(endOffsets, newLength); + this.numStoredFields = ArrayUtil.growExact(this.numStoredFields, newLength); + endOffsets = ArrayUtil.growExact(endOffsets, newLength); } this.numStoredFields[numBufferedDocs] = numStoredFieldsInDoc; numStoredFieldsInDoc = 0; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java index ee948c3e3e7..4f8d004e200 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java @@ -176,8 +176,8 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter { if (hasOffsets) { if (offStart + totalPositions == startOffsetsBuf.length) { final int newLength = ArrayUtil.oversize(offStart + totalPositions, 4); - startOffsetsBuf = Arrays.copyOf(startOffsetsBuf, newLength); - lengthsBuf = Arrays.copyOf(lengthsBuf, newLength); + startOffsetsBuf = ArrayUtil.growExact(startOffsetsBuf, newLength); + lengthsBuf = ArrayUtil.growExact(lengthsBuf, newLength); } startOffsetsBuf[offStart + totalPositions] = startOffset; lengthsBuf[offStart + totalPositions] = length; @@ -705,8 +705,8 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter { final int offStart = curField.offStart + curField.totalPositions; if (offStart + numProx > startOffsetsBuf.length) { final int newLength = ArrayUtil.oversize(offStart + numProx, 4); - startOffsetsBuf = Arrays.copyOf(startOffsetsBuf, newLength); - lengthsBuf = Arrays.copyOf(lengthsBuf, newLength); + startOffsetsBuf = ArrayUtil.growExact(startOffsetsBuf, newLength); + lengthsBuf = ArrayUtil.growExact(lengthsBuf, newLength); } int lastOffset = 0, startOffset, endOffset; for (int i = 0; i < numProx; ++i) { diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java index 700090a48fd..71a14a5cb7c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java @@ -164,7 +164,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter { } docOrds[upto++] = ord; } - ords[newDocID] = Arrays.copyOfRange(docOrds, 0, upto); + ords[newDocID] = ArrayUtil.copyOfSubArray(docOrds, 0, upto); } return ords; } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java index eb9f7ed8a52..55e4d200164 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java +++ 
b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java @@ -890,8 +890,8 @@ class SortingLeafReader extends FilterLeafReader { while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (i == docs.length) { final int newLength = ArrayUtil.oversize(i + 1, 4); - docs = Arrays.copyOf(docs, newLength); - offsets = Arrays.copyOf(offsets, newLength); + docs = ArrayUtil.growExact(docs, newLength); + offsets = ArrayUtil.growExact(offsets, newLength); } docs[i] = docMap.oldToNew(doc); offsets[i] = out.getFilePointer(); @@ -1230,7 +1230,7 @@ class SortingLeafReader extends FilterLeafReader { } docOrds[upto++] = ord; } - ords[newDocID] = Arrays.copyOfRange(docOrds, 0, upto); + ords[newDocID] = ArrayUtil.copyOfSubArray(docOrds, 0, upto); } cachedSortedSetDVs.put(field, ords); } diff --git a/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java index cca667575a4..8f85e25b442 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java @@ -102,9 +102,9 @@ public final class BlendedTermQuery extends Query { /** Build the {@link BlendedTermQuery}. */ public BlendedTermQuery build() { return new BlendedTermQuery( - Arrays.copyOf(terms, numTerms), - Arrays.copyOf(boosts, numTerms), - Arrays.copyOf(contexts, numTerms), + ArrayUtil.copyOfSubArray(terms, 0, numTerms), + ArrayUtil.copyOfSubArray(boosts, 0, numTerms), + ArrayUtil.copyOfSubArray(contexts, 0, numTerms), rewriteMethod); } @@ -263,7 +263,7 @@ public final class BlendedTermQuery extends Query { @Override public final Query rewrite(IndexReader reader) throws IOException { - final TermStates[] contexts = Arrays.copyOf(this.contexts, this.contexts.length); + final TermStates[] contexts = ArrayUtil.copyOfSubArray(this.contexts, 0, this.contexts.length); for (int i = 0; i < contexts.length; ++i) { if (contexts[i] == null || contexts[i].wasBuiltFor(reader.getContext()) == false) { contexts[i] = TermStates.build(reader.getContext(), terms[i], true); diff --git a/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java b/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java index 3bed88dd998..6681c5997aa 100644 --- a/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java @@ -18,7 +18,6 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import org.apache.lucene.index.LeafReaderContext; @@ -213,7 +212,7 @@ public abstract class CachingCollector extends FilterCollector { } protected void grow(int newLen) { - docs = Arrays.copyOf(docs, newLen); + docs = ArrayUtil.growExact(docs, newLen); } protected void invalidate() { @@ -250,7 +249,7 @@ public abstract class CachingCollector extends FilterCollector { } int[] cachedDocs() { - return docs == null ? null : Arrays.copyOf(docs, docCount); + return docs == null ? null : ArrayUtil.copyOfSubArray(docs, 0, docCount); } } @@ -274,7 +273,7 @@ public abstract class CachingCollector extends FilterCollector { @Override protected void grow(int newLen) { super.grow(newLen); - scores = Arrays.copyOf(scores, newLen); + scores = ArrayUtil.growExact(scores, newLen); } @Override @@ -290,7 +289,7 @@ public abstract class CachingCollector extends FilterCollector { } float[] cachedScores() { - return docs == null ? 
null : Arrays.copyOf(scores, docCount); + return docs == null ? null : ArrayUtil.copyOfSubArray(scores, 0, docCount); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java index 17e4efc0c16..51fee1acf4e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java @@ -53,7 +53,7 @@ final class MaxScoreCache { if (maxScoreCache.length < size) { int oldLength = maxScoreCache.length; maxScoreCache = ArrayUtil.grow(maxScoreCache, size); - maxScoreCacheUpTo = Arrays.copyOf(maxScoreCacheUpTo, maxScoreCache.length); + maxScoreCacheUpTo = ArrayUtil.growExact(maxScoreCacheUpTo, maxScoreCache.length); Arrays.fill(maxScoreCacheUpTo, oldLength, maxScoreCacheUpTo.length, -1); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 16642e51f11..70d2e09ef3b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -413,7 +413,7 @@ public class PhraseQuery extends Query { } } if (termUpTo > 0) { - return similarity.scorer(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo)); + return similarity.scorer(boost, searcher.collectionStatistics(field), ArrayUtil.copyOfSubArray(termStats, 0, termUpTo)); } else { return null; // no terms at all, we won't use similarity } diff --git a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java index 689d64a50d7..149fa2080ae 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java @@ -333,7 +333,7 @@ public abstract class PointInSetQuery extends Query { upto++; BytesRef next = iterator.next(); - return Arrays.copyOfRange(next.bytes, next.offset, next.length); + return BytesRef.deepCopyOf(next).bytes; } }; } diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index 7e48383b472..3af001b4443 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; @@ -392,9 +393,9 @@ public abstract class PointRangeQuery extends Query { int startOffset = bytesPerDim * i; sb.append('['); - sb.append(toString(i, Arrays.copyOfRange(lowerPoint, startOffset, startOffset + bytesPerDim))); + sb.append(toString(i, ArrayUtil.copyOfSubArray(lowerPoint, startOffset, startOffset + bytesPerDim))); sb.append(" TO "); - sb.append(toString(i, Arrays.copyOfRange(upperPoint, startOffset, startOffset + bytesPerDim))); + sb.append(toString(i, ArrayUtil.copyOfSubArray(upperPoint, startOffset, startOffset + bytesPerDim))); sb.append(']'); } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java 
b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java index 35c36d77c73..0313d56510c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -18,7 +18,6 @@ package org.apache.lucene.search.spans; import java.io.IOException; -import java.util.Arrays; import java.util.Map; import org.apache.lucene.index.LeafReaderContext; @@ -32,6 +31,7 @@ import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.Weight; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.util.ArrayUtil; /** * Expert-only. Public for use by other weight implementations @@ -103,7 +103,7 @@ public abstract class SpanWeight extends Weight { } CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField()); if (termUpTo > 0) { - return similarity.scorer(boost, collectionStats, Arrays.copyOf(termStats, termUpTo)); + return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(termStats, 0, termUpTo)); } else { return null; // no terms at all exist, we won't use similarity } diff --git a/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java b/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java index 3c5897f6c85..f6bab105911 100644 --- a/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java @@ -16,7 +16,7 @@ */ package org.apache.lucene.util; -import java.util.Arrays; +import java.lang.reflect.Array; import java.util.Comparator; /** @@ -211,94 +211,172 @@ public final class ArrayUtil { } } + /** Returns a new array whose size is exact the specified {@code newLength} without over-allocating */ + public static T[] growExact(T[] array, int newLength) { + Class type = array.getClass(); + @SuppressWarnings("unchecked") + T[] copy = (type == Object[].class) + ? 
(T[]) new Object[newLength] + : (T[]) Array.newInstance(type.getComponentType(), newLength); + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static T[] grow(T[] array, int minSize) { - assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; + assert minSize >= 0 : "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)); + final int newLength = oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF); + return growExact(array, newLength); } else return array; } + /** Returns a new array whose size is exact the specified {@code newLength} without over-allocating */ + public static short[] growExact(short[] array, int newLength) { + short[] copy = new short[newLength]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static short[] grow(short[] array, int minSize) { assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, Short.BYTES)); + return growExact(array, oversize(minSize, Short.BYTES)); } else return array; } + /** Returns a larger array, generally over-allocating exponentially */ public static short[] grow(short[] array) { return grow(array, 1 + array.length); } - + + /** Returns a new array whose size is exact the specified {@code newLength} without over-allocating */ + public static float[] growExact(float[] array, int newLength) { + float[] copy = new float[newLength]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static float[] grow(float[] array, int minSize) { assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, Float.BYTES)); + float[] copy = new float[oversize(minSize, Float.BYTES)]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; } else return array; } + /** Returns a larger array, generally over-allocating exponentially */ public static float[] grow(float[] array) { return grow(array, 1 + array.length); } + /** Returns a new array whose size is exact the specified {@code newLength} without over-allocating */ + public static double[] growExact(double[] array, int newLength) { + double[] copy = new double[newLength]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static double[] grow(double[] array, int minSize) { assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, Double.BYTES)); + return growExact(array, oversize(minSize, Double.BYTES)); } else return array; } + /** Returns a larger array, generally over-allocating exponentially */ public static double[] grow(double[] array) { return grow(array, 1 + array.length); } + /** Returns a new array whose size is exact the specified {@code 
newLength} without over-allocating */ + public static int[] growExact(int[] array, int newLength) { + int[] copy = new int[newLength]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static int[] grow(int[] array, int minSize) { assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, Integer.BYTES)); + return growExact(array, oversize(minSize, Integer.BYTES)); } else return array; } + /** Returns a larger array, generally over-allocating exponentially */ public static int[] grow(int[] array) { return grow(array, 1 + array.length); } + /** Returns a new array whose size is exact the specified {@code newLength} without over-allocating */ + public static long[] growExact(long[] array, int newLength) { + long[] copy = new long[newLength]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static long[] grow(long[] array, int minSize) { assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, Long.BYTES)); + return growExact(array, oversize(minSize, Long.BYTES)); } else return array; } + /** Returns a larger array, generally over-allocating exponentially */ public static long[] grow(long[] array) { return grow(array, 1 + array.length); } + /** Returns a new array whose size is exact the specified {@code newLength} without over-allocating */ + public static byte[] growExact(byte[] array, int newLength) { + byte[] copy = new byte[newLength]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static byte[] grow(byte[] array, int minSize) { assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, Byte.BYTES)); + return growExact(array, oversize(minSize, Byte.BYTES)); } else return array; } + /** Returns a larger array, generally over-allocating exponentially */ public static byte[] grow(byte[] array) { return grow(array, 1 + array.length); } + /** Returns a new array whose size is exact the specified {@code newLength} without over-allocating */ + public static char[] growExact(char[] array, int newLength) { + char[] copy = new char[newLength]; + System.arraycopy(array, 0, copy, 0, array.length); + return copy; + } + + /** Returns an array whose size is at least {@code minSize}, generally over-allocating exponentially */ public static char[] grow(char[] array, int minSize) { assert minSize >= 0: "size must be positive (got " + minSize + "): likely integer overflow?"; if (array.length < minSize) { - return Arrays.copyOf(array, oversize(minSize, Character.BYTES)); + return growExact(array, oversize(minSize, Character.BYTES)); } else return array; } + /** Returns a larger array, generally over-allocating exponentially */ public static char[] grow(char[] array) { return grow(array, 1 + array.length); } @@ -429,4 +507,104 @@ public final class ArrayUtil { }.select(from, to, k); } + /** + * Copies the specified range of the given array into a new sub array. 
+ * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static byte[] copyOfSubArray(byte[] array, int from, int to) { + final byte[] copy = new byte[to-from]; + System.arraycopy(array, from, copy, 0, to-from); + return copy; + } + + /** + * Copies the specified range of the given array into a new sub array. + * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static char[] copyOfSubArray(char[] array, int from, int to) { + final char[] copy = new char[to-from]; + System.arraycopy(array, from, copy, 0, to-from); + return copy; + } + + /** + * Copies the specified range of the given array into a new sub array. + * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static short[] copyOfSubArray(short[] array, int from, int to) { + final short[] copy = new short[to-from]; + System.arraycopy(array, from, copy, 0, to-from); + return copy; + } + + /** + * Copies the specified range of the given array into a new sub array. + * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static int[] copyOfSubArray(int[] array, int from, int to) { + final int[] copy = new int[to-from]; + System.arraycopy(array, from, copy, 0, to-from); + return copy; + } + + /** + * Copies the specified range of the given array into a new sub array. + * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static long[] copyOfSubArray(long[] array, int from, int to) { + final long[] copy = new long[to-from]; + System.arraycopy(array, from, copy, 0, to-from); + return copy; + } + + /** + * Copies the specified range of the given array into a new sub array. + * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static float[] copyOfSubArray(float[] array, int from, int to) { + final float[] copy = new float[to-from]; + System.arraycopy(array, from, copy, 0, to-from); + return copy; + } + + /** + * Copies the specified range of the given array into a new sub array. + * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static double[] copyOfSubArray(double[] array, int from, int to) { + final double[] copy = new double[to-from]; + System.arraycopy(array, from, copy, 0, to-from); + return copy; + } + + /** + * Copies the specified range of the given array into a new sub array. + * @param array the input array + * @param from the initial index of range to be copied (inclusive) + * @param to the final index of range to be copied (exclusive) + */ + public static T[] copyOfSubArray(T[] array, int from, int to) { + final int subLength = to - from; + final Class type = array.getClass(); + @SuppressWarnings("unchecked") + final T[] copy = (type == Object[].class) + ? 
(T[]) new Object[subLength] + : (T[]) Array.newInstance(type.getComponentType(), subLength); + System.arraycopy(array, from, copy, 0, subLength); + return copy; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java index 42c4e87ad58..a9a05e6ce92 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java @@ -16,9 +16,6 @@ */ package org.apache.lucene.util; - -import java.util.Arrays; - /** Represents byte[], as a slice (offset + length) into an * existing byte[]. The {@link #bytes} member should never be null; * use {@link #EMPTY_BYTES} if necessary. @@ -172,11 +169,7 @@ public final class BytesRef implements Comparable,Cloneable { * and an offset of zero. */ public static BytesRef deepCopyOf(BytesRef other) { - BytesRef copy = new BytesRef(); - copy.bytes = Arrays.copyOfRange(other.bytes, other.offset, other.offset + other.length); - copy.offset = 0; - copy.length = other.length; - return copy; + return new BytesRef(ArrayUtil.copyOfSubArray(other.bytes, other.offset, other.offset + other.length), 0, other.length); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java index 08fda910a55..6abd8660791 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java @@ -17,8 +17,6 @@ package org.apache.lucene.util; -import java.util.Arrays; - /** * A builder for {@link BytesRef} instances. * @lucene.internal @@ -170,7 +168,7 @@ public class BytesRefBuilder { * Build a new {@link BytesRef} that has the same content as this buffer. */ public BytesRef toBytesRef() { - return new BytesRef(Arrays.copyOf(ref.bytes, ref.length)); + return new BytesRef(ArrayUtil.copyOfSubArray(ref.bytes, 0, ref.length)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java index eb839a85fa0..7cb8ce900c9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java +++ b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java @@ -17,7 +17,6 @@ package org.apache.lucene.util; -import java.util.Arrays; import java.util.Comparator; /** @@ -202,7 +201,7 @@ public final class CharsRef implements Comparable, CharSequence, Clone * and an offset of zero. */ public static CharsRef deepCopyOf(CharsRef other) { - return new CharsRef(Arrays.copyOfRange(other.chars, other.offset, other.offset + other.length), 0, other.length); + return new CharsRef(ArrayUtil.copyOfSubArray(other.chars, other.offset, other.offset + other.length), 0, other.length); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/CharsRefBuilder.java b/lucene/core/src/java/org/apache/lucene/util/CharsRefBuilder.java index 09830e692cf..fdc16f4f0c7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/CharsRefBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/CharsRefBuilder.java @@ -17,8 +17,6 @@ package org.apache.lucene.util; -import java.util.Arrays; - /** * A builder for {@link CharsRef} instances. * @lucene.internal @@ -153,7 +151,7 @@ public class CharsRefBuilder implements Appendable { /** Build a new {@link CharsRef} that has the same content as this builder. 
*/ public CharsRef toCharsRef() { - return new CharsRef(Arrays.copyOf(ref.chars, ref.length), 0, ref.length); + return new CharsRef(ArrayUtil.copyOfSubArray(ref.chars, 0, ref.length), 0, ref.length); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java index 954614b3ad5..e57292b08c9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java @@ -18,7 +18,6 @@ package org.apache.lucene.util; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import org.apache.lucene.index.PointValues; @@ -226,7 +225,7 @@ public final class DocIdSetBuilder { } private void growBuffer(Buffer buffer, int additionalCapacity) { - buffer.array = Arrays.copyOf(buffer.array, buffer.array.length + additionalCapacity); + buffer.array = ArrayUtil.growExact(buffer.array, buffer.array.length + additionalCapacity); totalAllocated += additionalCapacity; } @@ -297,7 +296,7 @@ public final class DocIdSetBuilder { } int[] docs = largestBuffer.array; if (docs.length < totalLength + 1) { - docs = Arrays.copyOf(docs, totalLength + 1); + docs = ArrayUtil.growExact(docs, totalLength + 1); } totalLength = largestBuffer.length; for (Buffer buffer : buffers) { diff --git a/lucene/core/src/java/org/apache/lucene/util/IntsRef.java b/lucene/core/src/java/org/apache/lucene/util/IntsRef.java index aa7bbcee11c..0c6cfa527c5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/IntsRef.java +++ b/lucene/core/src/java/org/apache/lucene/util/IntsRef.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.util; -import java.util.Arrays; - /** Represents int[], as a slice (offset + length) into an * existing int[]. The {@link #ints} member should never be null; use @@ -127,7 +125,7 @@ public final class IntsRef implements Comparable, Cloneable { * and an offset of zero. */ public static IntsRef deepCopyOf(IntsRef other) { - return new IntsRef(Arrays.copyOfRange(other.ints, other.offset, other.offset + other.length), 0, other.length); + return new IntsRef(ArrayUtil.copyOfSubArray(other.ints, other.offset, other.offset + other.length), 0, other.length); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/LongsRef.java b/lucene/core/src/java/org/apache/lucene/util/LongsRef.java index 952d189a2a3..e4ca3ef1206 100644 --- a/lucene/core/src/java/org/apache/lucene/util/LongsRef.java +++ b/lucene/core/src/java/org/apache/lucene/util/LongsRef.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.util; -import java.util.Arrays; - /** Represents long[], as a slice (offset + length) into an * existing long[]. The {@link #longs} member should never be null; use @@ -126,7 +124,7 @@ public final class LongsRef implements Comparable, Cloneable { * and an offset of zero. 
*/ public static LongsRef deepCopyOf(LongsRef other) { - return new LongsRef(Arrays.copyOfRange(other.longs, other.offset, other.offset + other.length), 0, other.length); + return new LongsRef(ArrayUtil.copyOfSubArray(other.longs, other.offset, other.offset + other.length), 0, other.length); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java index ad26f857e41..e07046c40ef 100644 --- a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java +++ b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java @@ -18,7 +18,6 @@ package org.apache.lucene.util; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -62,7 +61,7 @@ public final class PagedBytes implements Accountable { private final long bytesUsedPerBlock; private Reader(PagedBytes pagedBytes) { - blocks = Arrays.copyOf(pagedBytes.blocks, pagedBytes.numBlocks); + blocks = ArrayUtil.copyOfSubArray(pagedBytes.blocks, 0, pagedBytes.numBlocks); blockBits = pagedBytes.blockBits; blockMask = pagedBytes.blockMask; blockSize = pagedBytes.blockSize; @@ -154,9 +153,7 @@ public final class PagedBytes implements Accountable { } private void addBlock(byte[] block) { - if (blocks.length == numBlocks) { - blocks = Arrays.copyOf(blocks, ArrayUtil.oversize(numBlocks, RamUsageEstimator.NUM_BYTES_OBJECT_REF)); - } + blocks = ArrayUtil.grow(blocks, numBlocks + 1); blocks[numBlocks++] = block; } diff --git a/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java index 9709c300058..5f704bbe04f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java @@ -18,7 +18,6 @@ package org.apache.lucene.util; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; @@ -73,7 +72,7 @@ public class RoaringDocIdSet extends DocIdSet { // Use sparse encoding assert denseBuffer == null; if (currentBlockCardinality > 0) { - sets[currentBlock] = new ShortArrayDocIdSet(Arrays.copyOf(buffer, currentBlockCardinality)); + sets[currentBlock] = new ShortArrayDocIdSet(ArrayUtil.copyOfSubArray(buffer, 0, currentBlockCardinality)); } } else { assert denseBuffer != null; diff --git a/lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java index 0324291fc71..4fcbbef3e98 100644 --- a/lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java @@ -18,7 +18,6 @@ package org.apache.lucene.util; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.search.DocIdSetIterator; @@ -372,7 +371,7 @@ public class SparseFixedBitSet extends BitSet implements Bits, Accountable { // fast path: if we currently have nothing in the block, just copy the data // this especially happens all the time if you call OR on an empty set indices[i4096] = index; - this.bits[i4096] = Arrays.copyOf(bits, nonZeroLongCount); + this.bits[i4096] = ArrayUtil.copyOfSubArray(bits, 0, nonZeroLongCount); this.nonZeroLongCount += nonZeroLongCount; return; } diff --git a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java index 
4c6d4fac3a7..bdab07bf0f4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java +++ b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java @@ -307,7 +307,7 @@ public abstract class StringHelper { if (bits.length > ID_LENGTH) { assert bits.length == ID_LENGTH + 1; assert bits[0] == 0; - return Arrays.copyOfRange(bits, 1, bits.length); + return ArrayUtil.copyOfSubArray(bits, 1, bits.length); } else { byte[] result = new byte[ID_LENGTH]; System.arraycopy(bits, 0, result, result.length - bits.length, bits.length); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index 704a6c41243..60ec8659346 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -137,8 +137,8 @@ public final class DaciukMihovAutomatonBuilder { assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: " + label; - labels = Arrays.copyOf(labels, labels.length + 1); - states = Arrays.copyOf(states, states.length + 1); + labels = ArrayUtil.growExact(labels, labels.length + 1); + states = ArrayUtil.growExact(states, states.length + 1); labels[labels.length - 1] = label; return states[states.length - 1] = new State(); diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 7f55895842e..fb7e1cee7d8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -656,7 +656,7 @@ public class BKDWriter implements Closeable { if (leafBlockFPs.size() > 0) { // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end: - leafBlockStartValues.add(Arrays.copyOf(leafValues, packedBytesLength)); + leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength)); } leafBlockFPs.add(out.getFilePointer()); checkMaxLeafNodeCount(leafBlockFPs.size()); @@ -687,8 +687,8 @@ public class BKDWriter implements Closeable { return scratchBytesRef1; } }; - assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength), - Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength), + assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength), + ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength), packedValues, leafDocs, 0); writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues); out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition()); @@ -1591,8 +1591,8 @@ public class BKDWriter implements Closeable { reader.getValue(mid, scratchBytesRef1); System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim); - byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength); - byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength); + byte[] minSplitPackedValue = ArrayUtil.copyOfSubArray(minPackedValue, 0, packedBytesLength); + byte[] maxSplitPackedValue = ArrayUtil.copyOfSubArray(maxPackedValue, 0, packedBytesLength); System.arraycopy(scratchBytesRef1.bytes, 
scratchBytesRef1.offset + splitDim * bytesPerDim, minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim); System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java index e102651ceb2..eb1d48b9f12 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java @@ -18,7 +18,6 @@ package org.apache.lucene.util.bkd; import java.io.Closeable; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import org.apache.lucene.util.ArrayUtil; @@ -116,12 +115,12 @@ public final class HeapPointWriter implements PointWriter { if (docIDs.length == nextWrite) { int nextSize = Math.min(maxSize, ArrayUtil.oversize(nextWrite+1, Integer.BYTES)); assert nextSize > nextWrite: "nextSize=" + nextSize + " vs nextWrite=" + nextWrite; - docIDs = Arrays.copyOf(docIDs, nextSize); + docIDs = ArrayUtil.growExact(docIDs, nextSize); if (singleValuePerDoc == false) { if (ordsLong != null) { - ordsLong = Arrays.copyOf(ordsLong, nextSize); + ordsLong = ArrayUtil.growExact(ordsLong, nextSize); } else { - ords = Arrays.copyOf(ords, nextSize); + ords = ArrayUtil.growExact(ords, nextSize); } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DeltaPackedLongValues.java b/lucene/core/src/java/org/apache/lucene/util/packed/DeltaPackedLongValues.java index 80534da8902..6aabb079281 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/DeltaPackedLongValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/DeltaPackedLongValues.java @@ -17,8 +17,7 @@ package org.apache.lucene.util.packed; -import java.util.Arrays; - +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.packed.PackedInts.Reader; @@ -70,8 +69,8 @@ class DeltaPackedLongValues extends PackedLongValues { public DeltaPackedLongValues build() { finish(); pending = null; - final PackedInts.Reader[] values = Arrays.copyOf(this.values, valuesOff); - final long[] mins = Arrays.copyOf(this.mins, valuesOff); + final PackedInts.Reader[] values = ArrayUtil.copyOfSubArray(this.values, 0, valuesOff); + final long[] mins = ArrayUtil.copyOfSubArray(this.mins, 0, valuesOff); final long ramBytesUsed = DeltaPackedLongValues.BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(values) + RamUsageEstimator.sizeOf(mins); return new DeltaPackedLongValues(pageShift, pageMask, values, mins, size, ramBytesUsed); @@ -94,7 +93,7 @@ class DeltaPackedLongValues extends PackedLongValues { void grow(int newBlockCount) { super.grow(newBlockCount); ramBytesUsed -= RamUsageEstimator.sizeOf(mins); - mins = Arrays.copyOf(mins, newBlockCount); + mins = ArrayUtil.growExact(mins, newBlockCount); ramBytesUsed += RamUsageEstimator.sizeOf(mins); } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicLongValues.java b/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicLongValues.java index 09b3ecd5cb2..89ad5ab3ca2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicLongValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicLongValues.java @@ -17,10 +17,9 @@ package org.apache.lucene.util.packed; -import java.util.Arrays; - import static org.apache.lucene.util.packed.MonotonicBlockPackedReader.expected; +import org.apache.lucene.util.ArrayUtil; 
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.packed.PackedInts.Reader; @@ -72,9 +71,9 @@ class MonotonicLongValues extends DeltaPackedLongValues { public MonotonicLongValues build() { finish(); pending = null; - final PackedInts.Reader[] values = Arrays.copyOf(this.values, valuesOff); - final long[] mins = Arrays.copyOf(this.mins, valuesOff); - final float[] averages = Arrays.copyOf(this.averages, valuesOff); + final PackedInts.Reader[] values = ArrayUtil.copyOfSubArray(this.values, 0, valuesOff); + final long[] mins = ArrayUtil.copyOfSubArray(this.mins, 0, valuesOff); + final float[] averages = ArrayUtil.copyOfSubArray(this.averages, 0, valuesOff); final long ramBytesUsed = MonotonicLongValues.BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(values) + RamUsageEstimator.sizeOf(mins) + RamUsageEstimator.sizeOf(averages); @@ -95,7 +94,7 @@ class MonotonicLongValues extends DeltaPackedLongValues { void grow(int newBlockCount) { super.grow(newBlockCount); ramBytesUsed -= RamUsageEstimator.sizeOf(averages); - averages = Arrays.copyOf(averages, newBlockCount); + averages = ArrayUtil.growExact(averages, newBlockCount); ramBytesUsed += RamUsageEstimator.sizeOf(averages); } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java index 0daf0623f90..19788b76eed 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java @@ -19,8 +19,6 @@ package org.apache.lucene.util.packed; import static org.apache.lucene.util.packed.PackedInts.checkBlockSize; -import java.util.Arrays; - import org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.LongValues; @@ -194,7 +192,7 @@ public class PackedLongValues extends LongValues implements Accountable { public PackedLongValues build() { finish(); pending = null; - final PackedInts.Reader[] values = Arrays.copyOf(this.values, valuesOff); + final PackedInts.Reader[] values = ArrayUtil.copyOfSubArray(this.values, 0, valuesOff); final long ramBytesUsed = PackedLongValues.BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(values); return new PackedLongValues(pageShift, pageMask, values, size, ramBytesUsed); } @@ -273,7 +271,7 @@ public class PackedLongValues extends LongValues implements Accountable { void grow(int newBlockCount) { ramBytesUsed -= RamUsageEstimator.shallowSizeOf(values); - values = Arrays.copyOf(values, newBlockCount); + values = ArrayUtil.growExact(values, newBlockCount); ramBytesUsed += RamUsageEstimator.shallowSizeOf(values); } diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java index 53b3f568e88..438e5e3cdc5 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java @@ -20,9 +20,9 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; import java.io.StringReader; -import java.util.Arrays; import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.junit.Test; @@ -42,7 +42,7 @@ public class TestCharacterUtils extends LuceneTestCase { final int codePointCount = 
CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2); final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3); assertEquals(orig.length - o1, charCount); - assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount)); + assertArrayEquals(ArrayUtil.copyOfSubArray(orig, o1, o1 + charCount), ArrayUtil.copyOfSubArray(restored, o3, o3 + charCount)); } @Test diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java index 045b19ad802..62d06d8f0f6 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java @@ -22,6 +22,7 @@ import java.util.Arrays; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -60,7 +61,7 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase { ByteArrayDataOutput out = new ByteArrayDataOutput(compressed); compressor.compress(decompressed, off, len, out); final int compressedLen = out.getPosition(); - return Arrays.copyOf(compressed, compressedLen); + return ArrayUtil.copyOfSubArray(compressed, 0, compressedLen); } byte[] decompress(byte[] compressed, int originalLength) throws IOException { @@ -71,14 +72,14 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase { static byte[] decompress(Decompressor decompressor, byte[] compressed, int originalLength) throws IOException { final BytesRef bytes = new BytesRef(); decompressor.decompress(new ByteArrayDataInput(compressed), originalLength, 0, originalLength, bytes); - return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length); + return BytesRef.deepCopyOf(bytes).bytes; } byte[] decompress(byte[] compressed, int originalLength, int offset, int length) throws IOException { Decompressor decompressor = mode.newDecompressor(); final BytesRef bytes = new BytesRef(); decompressor.decompress(new ByteArrayDataInput(compressed), originalLength, offset, length, bytes); - return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length); + return BytesRef.deepCopyOf(bytes).bytes; } public void testDecompress() throws IOException { @@ -89,7 +90,7 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase { final int len = random().nextBoolean() ? 
decompressed.length - off : TestUtil.nextInt(random(), 0, decompressed.length - off); final byte[] compressed = compress(decompressed, off, len); final byte[] restored = decompress(compressed, len); - assertArrayEquals(Arrays.copyOfRange(decompressed, off, off+len), restored); + assertArrayEquals(ArrayUtil.copyOfSubArray(decompressed, off, off+len), restored); } } @@ -106,7 +107,7 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase { length = random().nextInt(decompressed.length - offset); } final byte[] restored = decompress(compressed, decompressed.length, offset, length); - assertArrayEquals(Arrays.copyOfRange(decompressed, offset, offset + length), restored); + assertArrayEquals(ArrayUtil.copyOfSubArray(decompressed, offset, offset + length), restored); } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestForUtil.java index 3fe003e1dce..e13645fbfad 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestForUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestForUtil.java @@ -22,13 +22,13 @@ import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_DATA_SIZE; import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_ENCODED_SIZE; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.packed.PackedInts; @@ -39,7 +39,7 @@ public class TestForUtil extends LuceneTestCase { public void testEncodeDecode() throws IOException { final int iterations = RandomNumbers.randomIntBetween(random(), 1, 1000); final float acceptableOverheadRatio = random().nextFloat(); - final int[] values = new int[(iterations - 1) * BLOCK_SIZE + ForUtil.MAX_DATA_SIZE]; + final int[] values = new int[iterations * BLOCK_SIZE]; for (int i = 0; i < iterations; ++i) { final int bpv = random().nextInt(32); if (bpv == 0) { @@ -64,9 +64,9 @@ public class TestForUtil extends LuceneTestCase { final ForUtil forUtil = new ForUtil(acceptableOverheadRatio, out); for (int i = 0; i < iterations; ++i) { - forUtil.writeBlock( - Arrays.copyOfRange(values, i * BLOCK_SIZE, values.length), - new byte[MAX_ENCODED_SIZE], out); + // Although values after BLOCK_SIZE are garbage, we need to allocate extra bytes to avoid AIOOBE. 
+ int[] block = ArrayUtil.grow(ArrayUtil.copyOfSubArray(values, i*BLOCK_SIZE, (i+1)*BLOCK_SIZE)); + forUtil.writeBlock(ArrayUtil.grow(block, MAX_DATA_SIZE), new byte[MAX_ENCODED_SIZE], out); } endPointer = out.getFilePointer(); out.close(); @@ -83,8 +83,8 @@ public class TestForUtil extends LuceneTestCase { } final int[] restored = new int[MAX_DATA_SIZE]; forUtil.readBlock(in, new byte[MAX_ENCODED_SIZE], restored); - assertArrayEquals(Arrays.copyOfRange(values, i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE), - Arrays.copyOf(restored, BLOCK_SIZE)); + assertArrayEquals(ArrayUtil.copyOfSubArray(values, i*BLOCK_SIZE, (i+1)*BLOCK_SIZE), + ArrayUtil.copyOfSubArray(restored, 0, BLOCK_SIZE)); } assertEquals(endPointer, in.getFilePointer()); in.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java index 4da5059487d..6e2bd13b0ce 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java @@ -18,7 +18,6 @@ package org.apache.lucene.index; import java.io.IOException; -import java.util.Arrays; import java.util.Map; import java.util.Random; @@ -238,7 +237,7 @@ public class TestPerSegmentDeletes extends LuceneTestCase { docs = ArrayUtil.grow(docs, numDocs + 1); docs[numDocs + 1] = docID; } - return Arrays.copyOf(docs, numDocs); + return ArrayUtil.copyOfSubArray(docs, 0, numDocs); } public static class RangeMergePolicy extends MergePolicy { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java index a9e2891140a..66b895a3b62 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java @@ -19,7 +19,6 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.Arrays; import java.util.Collections; import java.util.Random; @@ -38,6 +37,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.junit.AfterClass; @@ -227,7 +227,7 @@ public class TestBoolean2 extends LuceneTestCase { // adjust the expected doc numbers according to our filler docs if (0 < NUM_FILLER_DOCS) { - expDocNrs = Arrays.copyOf(expDocNrs, expDocNrs.length); + expDocNrs = ArrayUtil.copyOfSubArray(expDocNrs, 0, expDocNrs.length); for (int i=0; i < expDocNrs.length; i++) { expDocNrs[i] = PRE_FILLER_DOCS + ((NUM_FILLER_DOCS + 1) * expDocNrs[i]); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java index 5a5a3aee420..49b0e110563 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java @@ -31,6 +31,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -122,7 +123,7 @@ 
public class TestDoubleValuesSource extends LuceneTestCase { }; Collections.shuffle(Arrays.asList(fields), random()); int numSorts = TestUtil.nextInt(random(), 1, fields.length); - return new Sort(Arrays.copyOfRange(fields, 0, numSorts)); + return new Sort(ArrayUtil.copyOfSubArray(fields, 0, numSorts)); } // Take a Sort, and replace any field sorts with Sortables diff --git a/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java b/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java index 8b20be5169d..4c77e7e7d25 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -103,7 +104,7 @@ public class TestLongValuesSource extends LuceneTestCase { }; Collections.shuffle(Arrays.asList(fields), random()); int numSorts = TestUtil.nextInt(random(), 1, fields.length); - return new Sort(Arrays.copyOfRange(fields, 0, numSorts)); + return new Sort(ArrayUtil.copyOfSubArray(fields, 0, numSorts)); } // Take a Sort, and replace any field sorts with Sortables diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java index eb311284bcd..7bd235e0c00 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java @@ -45,6 +45,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.junit.AfterClass; @@ -731,7 +732,7 @@ public class TestPhraseQuery extends LuceneTestCase { public void testTopPhrases() throws IOException { Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); - String[] docs = Arrays.copyOf(DOCS, DOCS.length); + String[] docs = ArrayUtil.copyOfSubArray(DOCS, 0, DOCS.length); Collections.shuffle(Arrays.asList(docs), random()); for (String value : DOCS) { Document doc = new Document(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSimpleExplanationsWithFillerDocs.java b/lucene/core/src/test/org/apache/lucene/search/TestSimpleExplanationsWithFillerDocs.java index 9f506688c52..7e8a7614ae6 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSimpleExplanationsWithFillerDocs.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSimpleExplanationsWithFillerDocs.java @@ -16,12 +16,11 @@ */ package org.apache.lucene.search; -import java.util.Arrays; - import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.Term; import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.LuceneTestCase.Slow; @@ -101,7 +100,7 @@ public class TestSimpleExplanationsWithFillerDocs extends TestSimpleExplanations @Override public void 
qtest(Query q, int[] expDocNrs) throws Exception { - expDocNrs = Arrays.copyOf(expDocNrs, expDocNrs.length); + expDocNrs = ArrayUtil.copyOfSubArray(expDocNrs, 0, expDocNrs.length); for (int i=0; i < expDocNrs.length; i++) { expDocNrs[i] = PRE_FILLER_DOCS + ((NUM_FILLER_DOCS + 1) * expDocNrs[i]); } diff --git a/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java b/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java index 2db901baa1a..50532235c0d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java @@ -48,7 +48,7 @@ public abstract class BaseSortTestCase extends LuceneTestCase { public void assertSorted(Entry[] original, Entry[] sorted) { assertEquals(original.length, sorted.length); - Entry[] actuallySorted = Arrays.copyOf(original, original.length); + Entry[] actuallySorted = ArrayUtil.copyOfSubArray(original, 0, original.length); Arrays.sort(actuallySorted); for (int i = 0; i < original.length; ++i) { assertEquals(actuallySorted[i].value, sorted[i].value); @@ -64,7 +64,7 @@ public abstract class BaseSortTestCase extends LuceneTestCase { System.arraycopy(arr, 0, toSort, o, arr.length); final Sorter sorter = newSorter(toSort); sorter.sort(o, o + arr.length); - assertSorted(arr, Arrays.copyOfRange(toSort, o, o + arr.length)); + assertSorted(arr, ArrayUtil.copyOfSubArray(toSort, o, o + arr.length)); } enum Strategy { diff --git a/lucene/core/src/test/org/apache/lucene/util/StressRamUsageEstimator.java b/lucene/core/src/test/org/apache/lucene/util/StressRamUsageEstimator.java index 7a2712fcbd2..cb743f19561 100644 --- a/lucene/core/src/test/org/apache/lucene/util/StressRamUsageEstimator.java +++ b/lucene/core/src/test/org/apache/lucene/util/StressRamUsageEstimator.java @@ -16,9 +16,6 @@ */ package org.apache.lucene.util; - -import java.util.Arrays; - /** * Estimates how {@link RamUsageEstimator} estimates physical memory consumption * of Java objects. @@ -88,7 +85,7 @@ public class StressRamUsageEstimator extends LuceneTestCase { // Make another batch of objects. 
Object[] seg = new Object[10000]; - all = Arrays.copyOf(all, all.length + 1); + all = ArrayUtil.growExact(all, all.length + 1); all[all.length - 1] = seg; for (int i = 0; i < seg.length; i++) { seg[i] = new byte[random().nextInt(7)]; } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java index 0cda337d4aa..285df74f078 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java @@ -21,6 +21,9 @@ import java.util.Collections; import java.util.Comparator; import java.util.Random; +import static org.apache.lucene.util.ArrayUtil.copyOfSubArray; +import static org.apache.lucene.util.ArrayUtil.growExact; + public class TestArrayUtil extends LuceneTestCase { // Ensure ArrayUtil.getNextSize gives linear amortized cost of realloc/copy @@ -294,4 +297,88 @@ public class TestArrayUtil extends LuceneTestCase { } } } + + public void testGrowExact() { + assertArrayEquals(new short[]{1, 2, 3, 0}, growExact(new short[]{1, 2, 3}, 4)); + assertArrayEquals(new short[]{1, 2, 3, 0, 0}, growExact(new short[]{1, 2, 3}, 5)); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new short[]{1, 2, 3}, random().nextInt(3))); + + assertArrayEquals(new int[]{1, 2, 3, 0}, growExact(new int[]{1, 2, 3}, 4)); + assertArrayEquals(new int[]{1, 2, 3, 0, 0}, growExact(new int[]{1, 2, 3}, 5)); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new int[]{1, 2, 3}, random().nextInt(3))); + + assertArrayEquals(new long[]{1, 2, 3, 0}, growExact(new long[]{1, 2, 3}, 4)); + assertArrayEquals(new long[]{1, 2, 3, 0, 0}, growExact(new long[]{1, 2, 3}, 5)); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new long[]{1, 2, 3}, random().nextInt(3))); + + assertArrayEquals(new float[]{0.1f, 0.2f, 0.3f, 0}, growExact(new float[]{0.1f, 0.2f, 0.3f}, 4), 0.001f); + assertArrayEquals(new float[]{0.1f, 0.2f, 0.3f, 0, 0}, growExact(new float[]{0.1f, 0.2f, 0.3f}, 5), 0.001f); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new float[]{1, 2, 3}, random().nextInt(3))); + + assertArrayEquals(new double[]{0.1, 0.2, 0.3, 0.0}, growExact(new double[]{0.1, 0.2, 0.3}, 4), 0.001); + assertArrayEquals(new double[]{0.1, 0.2, 0.3, 0.0, 0.0}, growExact(new double[]{0.1, 0.2, 0.3}, 5), 0.001); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new double[]{0.1, 0.2, 0.3}, random().nextInt(3))); + + assertArrayEquals(new byte[]{1, 2, 3, 0}, growExact(new byte[]{1, 2, 3}, 4)); + assertArrayEquals(new byte[]{1, 2, 3, 0, 0}, growExact(new byte[]{1, 2, 3}, 5)); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new byte[]{1, 2, 3}, random().nextInt(3))); + + assertArrayEquals(new char[]{'a', 'b', 'c', '\0'}, growExact(new char[]{'a', 'b', 'c'}, 4)); + assertArrayEquals(new char[]{'a', 'b', 'c', '\0', '\0'}, growExact(new char[]{'a', 'b', 'c'}, 5)); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new char[]{'a', 'b', 'c'}, random().nextInt(3))); + + assertArrayEquals(new String[]{"a1", "b2", "c3", null}, growExact(new String[]{"a1", "b2", "c3"}, 4)); + assertArrayEquals(new String[]{"a1", "b2", "c3", null, null}, growExact(new String[]{"a1", "b2", "c3"}, 5)); + expectThrows(IndexOutOfBoundsException.class, () -> growExact(new String[]{"a", "b", "c"}, random().nextInt(3))); + } + + public void testCopyOfSubArray() { + short[] shortArray = {1, 2, 3}; + assertArrayEquals(new short[]{1},
copyOfSubArray(shortArray, 0, 1)); + assertArrayEquals(new short[]{1, 2, 3}, copyOfSubArray(shortArray, 0, 3)); + assertEquals(0, copyOfSubArray(shortArray, 0, 0).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(shortArray, 0, 4 + random().nextInt(10))); + + int[] intArray = {1, 2, 3}; + assertArrayEquals(new int[]{1, 2}, copyOfSubArray(intArray, 0, 2)); + assertArrayEquals(new int[]{1, 2, 3}, copyOfSubArray(intArray, 0, 3)); + assertEquals(0, copyOfSubArray(intArray, 1, 1).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(intArray, 1, 4 + random().nextInt(10))); + + long[] longArray = {1, 2, 3}; + assertArrayEquals(new long[]{2}, copyOfSubArray(longArray, 1, 2)); + assertArrayEquals(new long[]{1, 2, 3}, copyOfSubArray(longArray, 0, 3)); + assertEquals(0, copyOfSubArray(longArray, 2, 2).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(longArray, 2, 4 + random().nextInt(10))); + + float[] floatArray = {0.1f, 0.2f, 0.3f}; + assertArrayEquals(new float[]{0.2f, 0.3f}, copyOfSubArray(floatArray, 1, 3), 0.001f); + assertArrayEquals(new float[]{0.1f, 0.2f, 0.3f}, copyOfSubArray(floatArray, 0, 3), 0.001f); + assertEquals(0, copyOfSubArray(floatArray, 0, 0).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(floatArray, 3, 4 + random().nextInt(10))); + + double[] doubleArray = {0.1, 0.2, 0.3}; + assertArrayEquals(new double[]{0.3}, copyOfSubArray(doubleArray, 2, 3), 0.001); + assertArrayEquals(new double[]{0.1, 0.2, 0.3}, copyOfSubArray(doubleArray, 0, 3), 0.001); + assertEquals(0, copyOfSubArray(doubleArray, 1, 1).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(doubleArray, 0, 4 + random().nextInt(10))); + + byte[] byteArray = {1, 2, 3}; + assertArrayEquals(new byte[]{1}, copyOfSubArray(byteArray, 0, 1)); + assertArrayEquals(new byte[]{1, 2, 3}, copyOfSubArray(byteArray, 0, 3)); + assertEquals(0, copyOfSubArray(byteArray, 1, 1).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(byteArray, 1, 4 + random().nextInt(10))); + + char[] charArray = {'a', 'b', 'c'}; + assertArrayEquals(new char[]{'a', 'b'}, copyOfSubArray(charArray, 0, 2)); + assertArrayEquals(new char[]{'a', 'b', 'c'}, copyOfSubArray(charArray, 0, 3)); + assertEquals(0, copyOfSubArray(charArray, 1, 1).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(charArray, 3, 4)); + + String[] objectArray = {"a1", "b2", "c3"}; + assertArrayEquals(new String[]{"a1"}, copyOfSubArray(objectArray, 0, 1)); + assertArrayEquals(new String[]{"a1", "b2", "c3"}, copyOfSubArray(objectArray, 0, 3)); + assertEquals(0, copyOfSubArray(objectArray, 1, 1).length); + expectThrows(IndexOutOfBoundsException.class, () -> copyOfSubArray(objectArray, 2, 5)); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBytesRef.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRef.java index 3a5bb53c742..2a869adca37 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestBytesRef.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRef.java @@ -48,4 +48,12 @@ public class TestBytesRef extends LuceneTestCase { // only for 4.x assertEquals("\uFFFF", new BytesRef("\uFFFF").utf8ToString()); } + + public void testInvalidDeepCopy() { + BytesRef from = new BytesRef(new byte[] { 1, 2 }); + from.offset += 1; // now invalid + expectThrows(IndexOutOfBoundsException.class, () -> { + BytesRef.deepCopyOf(from); + }); + } } diff --git 
a/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java b/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java index 0a4c8844c85..079b3b774b5 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java @@ -125,4 +125,12 @@ public class TestCharsRef extends LuceneTestCase { c.subSequence(2, 1); }); } + + public void testInvalidDeepCopy() { + CharsRef from = new CharsRef(new char[] { 'a', 'b' }, 0, 2); + from.offset += 1; // now invalid + expectThrows(IndexOutOfBoundsException.class, () -> { + CharsRef.deepCopyOf(from); + }); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestIntsRef.java b/lucene/core/src/test/org/apache/lucene/util/TestIntsRef.java index b9976592fa0..654e77d133d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestIntsRef.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestIntsRef.java @@ -37,4 +37,12 @@ public class TestIntsRef extends LuceneTestCase { assertFalse(i.equals(i2)); } + + public void testInvalidDeepCopy() { + IntsRef from = new IntsRef(new int[] { 1, 2 }, 0, 2); + from.offset += 1; // now invalid + expectThrows(IndexOutOfBoundsException.class, () -> { + IntsRef.deepCopyOf(from); + }); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestLSBRadixSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestLSBRadixSorter.java index ba8bd021d65..b7696c28aa2 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestLSBRadixSorter.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestLSBRadixSorter.java @@ -37,7 +37,7 @@ public class TestLSBRadixSorter extends LuceneTestCase { } public void test(LSBRadixSorter sorter, int[] arr, int len) { - final int[] expected = Arrays.copyOf(arr, len); + final int[] expected = ArrayUtil.copyOfSubArray(arr, 0, len); Arrays.sort(expected); int numBits = 0; @@ -50,7 +50,7 @@ public class TestLSBRadixSorter extends LuceneTestCase { } sorter.sort(numBits, arr, len); - final int[] actual = Arrays.copyOf(arr, len); + final int[] actual = ArrayUtil.copyOfSubArray(arr, 0, len); assertArrayEquals(expected, actual); } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestLongsRef.java b/lucene/core/src/test/org/apache/lucene/util/TestLongsRef.java new file mode 100644 index 00000000000..ec4575fa48c --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestLongsRef.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util; + +public class TestLongsRef extends LuceneTestCase { + public void testEmpty() { + LongsRef i = new LongsRef(); + assertEquals(LongsRef.EMPTY_LONGS, i.longs); + assertEquals(0, i.offset); + assertEquals(0, i.length); + } + + public void testFromLongs() { + long longs[] = new long[] { 1, 2, 3, 4 }; + LongsRef i = new LongsRef(longs, 0, 4); + assertEquals(longs, i.longs); + assertEquals(0, i.offset); + assertEquals(4, i.length); + + LongsRef i2 = new LongsRef(longs, 1, 3); + assertEquals(new LongsRef(new long[] { 2, 3, 4 }, 0, 3), i2); + + assertFalse(i.equals(i2)); + } + + public void testInvalidDeepCopy() { + LongsRef from = new LongsRef(new long[] { 1, 2 }, 0, 2); + from.offset += 1; // now invalid + expectThrows(IndexOutOfBoundsException.class, () -> { + LongsRef.deepCopyOf(from); + }); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java index 52eb494711a..efd1f0328e9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java @@ -23,7 +23,7 @@ import java.util.Set; public class TestMSBRadixSorter extends LuceneTestCase { private void test(BytesRef[] refs, int len) { - BytesRef[] expected = Arrays.copyOf(refs, len); + BytesRef[] expected = ArrayUtil.copyOfSubArray(refs, 0, len); Arrays.sort(expected); int maxLength = 0; @@ -63,7 +63,7 @@ public class TestMSBRadixSorter extends LuceneTestCase { refs[j] = tmp; } }.sort(0, len); - BytesRef[] actual = Arrays.copyOf(refs, len); + BytesRef[] actual = ArrayUtil.copyOfSubArray(refs, 0, len); assertArrayEquals(expected, actual); } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestStringMSBRadixSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestStringMSBRadixSorter.java index c83ff676404..c4ee68b2d1a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestStringMSBRadixSorter.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestStringMSBRadixSorter.java @@ -21,7 +21,7 @@ import java.util.Arrays; public class TestStringMSBRadixSorter extends LuceneTestCase { private void test(BytesRef[] refs, int len) { - BytesRef[] expected = Arrays.copyOf(refs, len); + BytesRef[] expected = ArrayUtil.copyOfSubArray(refs, 0, len); Arrays.sort(expected); new StringMSBRadixSorter() { @@ -38,7 +38,7 @@ public class TestStringMSBRadixSorter extends LuceneTestCase { refs[j] = tmp; } }.sort(0, len); - BytesRef[] actual = Arrays.copyOf(refs, len); + BytesRef[] actual = ArrayUtil.copyOfSubArray(refs, 0, len); assertArrayEquals(expected, actual); } diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java index a675e0b8cd0..69c1b3f7720 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -34,6 +34,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.LongValues; import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.LuceneTestCase; @@ -903,8 +904,8 @@ public class TestPackedInts extends LuceneTestCase { // 3. 
re-encode final long[] blocks2 = new long[blocksOffset2 + blocksLen]; encoder.encode(values, valuesOffset, blocks2, blocksOffset2, longIterations); - assertArrayEquals(msg, Arrays.copyOfRange(blocks, blocksOffset, blocks.length), - Arrays.copyOfRange(blocks2, blocksOffset2, blocks2.length)); + assertArrayEquals(msg, ArrayUtil.copyOfSubArray(blocks, blocksOffset, blocks.length), + ArrayUtil.copyOfSubArray(blocks2, blocksOffset2, blocks2.length)); // test encoding from int[] if (bpv <= 32) { final long[] blocks3 = new long[blocks2.length]; diff --git a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java index cb24f6fe2a5..fe5317eb527 100644 --- a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java +++ b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java @@ -40,6 +40,7 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -109,7 +110,7 @@ public class TestExpressionSorts extends LuceneTestCase { }; Collections.shuffle(Arrays.asList(fields), random()); int numSorts = TestUtil.nextInt(random(), 1, fields.length); - assertQuery(query, new Sort(Arrays.copyOfRange(fields, 0, numSorts))); + assertQuery(query, new Sort(ArrayUtil.copyOfSubArray(fields, 0, numSorts))); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalQuery.java index 306c05973c6..c1125c2ff27 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalQuery.java @@ -18,7 +18,6 @@ package org.apache.lucene.search.intervals; import java.io.IOException; -import java.util.Arrays; import java.util.HashSet; import java.util.Objects; import java.util.Set; @@ -36,6 +35,7 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.Weight; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.util.ArrayUtil; /** * A query that retrieves documents containing intervals returned from an @@ -89,7 +89,7 @@ public final class IntervalQuery extends Query { return null; } CollectionStatistics collectionStats = searcher.collectionStatistics(field); - return searcher.getSimilarity().scorer(boost, collectionStats, Arrays.copyOf(termStats, termUpTo)); + return searcher.getSimilarity().scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(termStats, 0, termUpTo)); } @Override diff --git a/lucene/sandbox/src/test/org/apache/lucene/document/TestHalfFloatPoint.java b/lucene/sandbox/src/test/org/apache/lucene/document/TestHalfFloatPoint.java index 0bcb3f8b844..9f7080846b6 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/document/TestHalfFloatPoint.java +++ b/lucene/sandbox/src/test/org/apache/lucene/document/TestHalfFloatPoint.java @@ -89,7 +89,7 @@ public class TestHalfFloatPoint extends LuceneTestCase { values[o++] = v; } } - values = Arrays.copyOf(values, o); + values = ArrayUtil.copyOfSubArray(values, 0, o); int iters = atLeast(1000000); for (int iter = 0; iter < iters; ++iter) { diff 
--git a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/tree/QuadPrefixTree.java b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/tree/QuadPrefixTree.java index 3242e7e623f..9bc947fbbda 100644 --- a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/tree/QuadPrefixTree.java +++ b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/tree/QuadPrefixTree.java @@ -19,7 +19,6 @@ package org.apache.lucene.spatial.prefix.tree; import java.io.PrintStream; import java.text.NumberFormat; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Locale; @@ -245,7 +244,8 @@ public class QuadPrefixTree extends LegacyPrefixTree { protected BytesRef concat(BytesRef source, byte b) { //+2 for new char + potential leaf - final byte[] buffer = Arrays.copyOfRange(source.bytes, source.offset, source.offset + source.length + 2); + final byte[] buffer = new byte[source.length + 2]; + System.arraycopy(source.bytes, source.offset, buffer, 0, source.length); BytesRef target = new BytesRef(buffer); target.length = source.length; target.bytes[target.length++] = b; diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/DistanceStrategyTest.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/DistanceStrategyTest.java index 3e3b2e2abcb..989252e3c6d 100644 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/DistanceStrategyTest.java +++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/DistanceStrategyTest.java @@ -18,7 +18,6 @@ package org.apache.lucene.spatial; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; @@ -31,6 +30,7 @@ import org.apache.lucene.spatial.prefix.tree.QuadPrefixTree; import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree; import org.apache.lucene.spatial.serialized.SerializedDVStrategy; import org.apache.lucene.spatial.vector.PointVectorStrategy; +import org.apache.lucene.util.ArrayUtil; import org.junit.Test; import org.locationtech.spatial4j.context.SpatialContext; import org.locationtech.spatial4j.shape.Point; @@ -107,7 +107,7 @@ public class DistanceStrategyTest extends StrategyTestCase { void checkDistValueSource(Point pt, float... 
distances) throws IOException { float multiplier = random().nextFloat() * 100f; - float[] dists2 = Arrays.copyOf(distances, distances.length); + float[] dists2 = ArrayUtil.copyOfSubArray(distances, 0, distances.length); for (int i = 0; i < dists2.length; i++) { dists2[i] *= multiplier; } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java index 2c5dcd8e540..c25b44d5706 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java @@ -17,7 +17,6 @@ package org.apache.lucene.search.suggest.document; import java.util.ArrayList; -import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; import java.util.List; @@ -521,7 +520,7 @@ public class TestContextQuery extends LuceneTestCase { query.addContext(contexts.get(i), i + 1); } TopSuggestDocs suggest = suggestIndexSearcher.suggest(query, 4, false); - assertSuggestions(suggest, Arrays.copyOfRange(expectedResults, 0, 4)); + assertSuggestions(suggest, ArrayUtil.copyOfSubArray(expectedResults, 0, 4)); } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java index 60e2cca0249..82d8adb08e4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java @@ -559,7 +559,7 @@ public abstract class BaseStoredFieldsFormatTestCase extends BaseIndexFileFormat for (int j = 0; j < data[docId].length; ++j) { final byte[] arr = data[docId][j]; final BytesRef arr2Ref = doc.getBinaryValue("bytes" + j); - final byte[] arr2 = Arrays.copyOfRange(arr2Ref.bytes, arr2Ref.offset, arr2Ref.offset + arr2Ref.length); + final byte[] arr2 = BytesRef.deepCopyOf(arr2Ref).bytes; assertArrayEquals(arr, arr2); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/BlockScoreQueryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/search/BlockScoreQueryWrapper.java index 3b9a740a448..b15fa280fb6 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/BlockScoreQueryWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/BlockScoreQueryWrapper.java @@ -97,10 +97,8 @@ public final class BlockScoreQueryWrapper extends Query { DocIdSetIterator it = inScorer.iterator(); int i = 1; for (int doc = it.nextDoc(); ; doc = it.nextDoc()) { - if (i == tmpDocs.length) { - tmpDocs = ArrayUtil.grow(tmpDocs); - tmpScores = Arrays.copyOf(tmpScores, tmpDocs.length); - } + tmpDocs = ArrayUtil.grow(tmpDocs, i + 1); + tmpScores = ArrayUtil.grow(tmpScores, i + 1); tmpDocs[i] = doc; if (doc == DocIdSetIterator.NO_MORE_DOCS) { i++; @@ -109,8 +107,8 @@ public final class BlockScoreQueryWrapper extends Query { tmpScores[i] = inScorer.score(); i++; } - final int[] docs = Arrays.copyOf(tmpDocs, i); - final float[] scores = Arrays.copyOf(tmpScores, i); + final int[] docs = ArrayUtil.copyOfSubArray(tmpDocs, 0, i); + final float[] scores = ArrayUtil.copyOfSubArray(tmpScores, 0, i); return new Scorer(inWeight) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java 
b/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java index 8ef4febea9c..e3f26e46032 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java @@ -17,7 +17,6 @@ package org.apache.lucene.util.automaton; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; @@ -253,8 +252,7 @@ public class AutomatonTestUtil { codePoints[codepointCount++] = getRandomCodePoint(r, t.min, t.max); s = t.dest; } - - return Arrays.copyOf(codePoints, codepointCount); + return ArrayUtil.copyOfSubArray(codePoints, 0, codepointCount); } } diff --git a/lucene/tools/forbiddenApis/lucene.txt b/lucene/tools/forbiddenApis/lucene.txt index e02bd40ad01..2a1d88390bf 100644 --- a/lucene/tools/forbiddenApis/lucene.txt +++ b/lucene/tools/forbiddenApis/lucene.txt @@ -27,3 +27,27 @@ java.util.zip.ZipFile @defaultMessage Use home-grown methods instead java.lang.Math#toRadians(double) java.lang.Math#toDegrees(double) + +@defaultMessage Prefer using ArrayUtil as Arrays#copyOfRange fills zeros for bad bounds +java.util.Arrays#copyOfRange(byte[],int,int) +java.util.Arrays#copyOfRange(char[],int,int) +java.util.Arrays#copyOfRange(short[],int,int) +java.util.Arrays#copyOfRange(int[],int,int) +java.util.Arrays#copyOfRange(long[],int,int) +java.util.Arrays#copyOfRange(float[],int,int) +java.util.Arrays#copyOfRange(double[],int,int) +java.util.Arrays#copyOfRange(boolean[],int,int) +java.util.Arrays#copyOfRange(java.lang.Object[],int,int) +java.util.Arrays#copyOfRange(java.lang.Object[],int,int,java.lang.Class) + +@defaultMessage Prefer using ArrayUtil as Arrays#copyOf fills zeros for bad bounds +java.util.Arrays#copyOf(byte[],int) +java.util.Arrays#copyOf(char[],int) +java.util.Arrays#copyOf(short[],int) +java.util.Arrays#copyOf(int[],int) +java.util.Arrays#copyOf(long[],int) +java.util.Arrays#copyOf(float[],int) +java.util.Arrays#copyOf(double[],int) +java.util.Arrays#copyOf(boolean[],int) +java.util.Arrays#copyOf(java.lang.Object[],int) +java.util.Arrays#copyOf(java.lang.Object[],int,java.lang.Class) From a82c9cfcf49510c4e3d8d4de7f7a7947583d438f Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 7 Jun 2018 10:33:52 +0200 Subject: [PATCH 36/38] Add a 7.5.0 version. --- lucene/CHANGES.txt | 3 +++ .../java/org/apache/lucene/util/Version.java | 7 +++++++ solr/CHANGES.txt | 17 +++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8f307fe45ab..1b4bb2c0a8d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -107,6 +107,9 @@ Optimizations or phrase queries as sub queries, which know how to leverage this information to run faster. (Adrien Grand) +======================= Lucene 7.5.0 ======================= +(No Changes) + ======================= Lucene 7.4.0 ======================= API Changes diff --git a/lucene/core/src/java/org/apache/lucene/util/Version.java b/lucene/core/src/java/org/apache/lucene/util/Version.java index 1067f93a224..80368da9f62 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Version.java +++ b/lucene/core/src/java/org/apache/lucene/util/Version.java @@ -89,6 +89,13 @@ public final class Version { @Deprecated public static final Version LUCENE_7_4_0 = new Version(7, 4, 0); + /** + * Match settings and bugs in Lucene's 7.5.0 release. 
+ * @deprecated Use latest + */ + @Deprecated + public static final Version LUCENE_7_5_0 = new Version(7, 5, 0); + /** * Match settings and bugs in Lucene's 8.0.0 release. *

    diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1a88c5b0419..4499c4dfa66 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -45,6 +45,23 @@ Upgrade Notes MemoryDocValues). If you used postingsFormat="Memory" or docValuesFormat="Memory" switch to "Direct" instead. (Dawid Weiss) +================== 7.5.0 ================== + +Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. + +Versions of Major Components +--------------------- +Apache Tika 1.17 +Carrot2 3.16.0 +Velocity 1.7 and Velocity Tools 2.0 +Apache UIMA 2.3.1 +Apache ZooKeeper 3.4.11 +Jetty 9.4.10.v20180503 + + +(No Changes) + + ================== 7.4.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. From a4fa16896225e08b72bf64fba97a216bb6a83fbb Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 7 Jun 2018 12:53:43 +0100 Subject: [PATCH 37/38] LUCENE-8273: Don't wrap ShingleFilter in conditions in testRandomChains --- .../org/apache/lucene/analysis/core/TestRandomChains.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index d94b39607bb..fd6f4b50aa1 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -88,6 +88,7 @@ import org.apache.lucene.analysis.path.PathHierarchyTokenizer; import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; import org.apache.lucene.analysis.payloads.IdentityEncoder; import org.apache.lucene.analysis.payloads.PayloadEncoder; +import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.snowball.TestSnowball; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.synonym.SynonymMap; @@ -124,6 +125,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { avoidConditionals.add(FingerprintFilter.class); avoidConditionals.add(MinHashFilter.class); avoidConditionals.add(ConcatenateGraphFilter.class); + // ShingleFilter doesn't handle input graphs correctly, so wrapping it in a condition can + // expose inconsistent offsets + // https://issues.apache.org/jira/browse/LUCENE-4170 + avoidConditionals.add(ShingleFilter.class); } private static final Map,Predicate> brokenConstructors = new HashMap<>(); From 6e880352976e15f876fa2b3c578a563c4d26cd2d Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Thu, 7 Jun 2018 10:17:23 -0500 Subject: [PATCH 38/38] SOLR-12018: Remove comments.apache.org integration for the Ref Guide --- solr/CHANGES.txt | 36 ++--- solr/solr-ref-guide/src/_includes/head.html | 1 - solr/solr-ref-guide/src/_layouts/page.html | 12 -- solr/solr-ref-guide/src/css/comments.css | 160 -------------------- 4 files changed, 19 insertions(+), 190 deletions(-) delete mode 100644 solr/solr-ref-guide/src/css/comments.css diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4499c4dfa66..289203e1ff3 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -95,7 +95,7 @@ New Features * SOLR-12396: Upgrade Carrot2 to 3.16.0, HPPC to 0.8.1, morfologik to 2.1.5. (Dawid Weiss) -* SOLR-11200: A new CMS config option 'ioThrottle' to manually enable/disable +* SOLR-11200: A new CMS config option 'ioThrottle' to manually enable/disable ConcurrentMergeSchedule.doAutoIOThrottle. 
(Amrit Sarkar, Nawab Zada Asad iqbal via Dawid Weiss) * SOLR-11670: Implement a periodic house-keeping task. This uses a scheduled autoscaling trigger and @@ -109,7 +109,7 @@ New Features * SOLR-12139: The "eq" (equals) function query now works with string fields, string literals, and perhaps anything. (Andrey Kudryavtsev, David Smiley) - + * SOLR-10783: Add support for Hadoop Credential Provider as SSL/TLS store password source. (Mano Kovacs via Mark Miller) @@ -143,15 +143,15 @@ New Features * SOLR-12378: Support missing versionField on indexed docs in DocBasedVersionConstraintsURP. (Oliver Bates, Michael Braun via Mark Miller) - + * SOLR-12388: Enable a strict ZooKeeper-connected search request mode, in which search - requests will fail when the coordinating node can't communicate with ZooKeeper, + requests will fail when the coordinating node can't communicate with ZooKeeper, by setting the "shards.tolerant" param to "requireZkConnected". (Steve Rowe) * SOLR-9685: #Tagging queries in JSON Query DSL, equivalent to LocalParams based query/filter tagging. Multiple tags are comma separated. LocalParams Example : {!tag=colorfilt}color:blue - Equivalent JSON Example : { "#colorfilt" : "color:blue" } + Equivalent JSON Example : { "#colorfilt" : "color:blue" } (Dmitry Tikhonov, Mikhail Khludnev, yonik) * SOLR-12328: JSON Facet API: Domain change with graph query. @@ -192,7 +192,7 @@ Bug Fixes ---------------------- * SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with - content stream. The similarity is calculated between the content stream's value and all + content stream. The similarity is calculated between the content stream's value and all fields listed in mlt.fl. (Dawid Weiss) * SOLR-12103: Raise CryptoKeys.DEFAULT_KEYPAIR_LENGTH from 1024 to 2048. (Mark Miller) @@ -219,10 +219,10 @@ Bug Fixes * SOLR-12172: Fixed race condition that could cause an invalid set of collection properties to be kept in memory when multiple collection property changes are done in a short period of time. (Tomás Fernández Löbbe) - -* SOLR-11929: UpdateLog metrics are not initialized on core reload. (ab, Steve Rowe) -* SOLR-12199: TestReplicationHandler.doTestRepeater(): TEST_PORT interpolation failure: +* SOLR-11929: UpdateLog metrics are not initialized on core reload. (ab, Steve Rowe) + +* SOLR-12199: TestReplicationHandler.doTestRepeater(): TEST_PORT interpolation failure: Server refused connection at: http://127.0.0.1:TEST_PORT/solr (Mikhail Khludnev, Dawid Weiss, Steve Rowe) * SOLR-12096: Fixed inconsistent results format of subquery transformer for distributed search (multi-shard). @@ -271,8 +271,8 @@ Bug Fixes * SOLR-12275: wrong caching for {!filters} as well as for `filters` local param in {!parent} and {!child} (David Smiley, Mikhail Khluldnev) - -* SOLR-12284: WordBreakSolrSpellchecker will no longer add parenthesis in collations when breaking words in + +* SOLR-12284: WordBreakSolrSpellchecker will no longer add parenthesis in collations when breaking words in non-boolean queries. (James Dyer) * SOLR-12290: Do not close any servlet streams and improve our servlet stream closing prevention code for users @@ -295,9 +295,9 @@ Bug Fixes about "Invalid Date String". (yonik) * SOLR-12307: exiting OverseerTriggerThread without endless noise in log when Zookeeper session is expired - (Mikhail Khludnev) + (Mikhail Khludnev) -* SOLR-12200: abandon OverseerExitThread when ZkController is closed. 
(Mikhail Khludnev) +* SOLR-12200: abandon OverseerExitThread when ZkController is closed. (Mikhail Khludnev) * SOLR-12355: Fixes hash conflict in HashJoinStream and OuterHashJoinStream (Dennis Gove) @@ -321,7 +321,7 @@ Bug Fixes * SOLR-12271: Fixed bug in how Analytics component reads negative values from float and double fields. (Houston Putman) -* SOLR-12433: Recovering flag of a replica is set equals to leader even it failed to receive update +* SOLR-12433: Recovering flag of a replica is set equals to leader even it failed to receive update on recovering. (Cao Manh Dat) * SOLR-12354: Register the /admin/info/key end-point at the startup time to avoid 404 (noble) @@ -374,6 +374,8 @@ Optimizations Other Changes ---------------------- +* SOLR-12018: Remove comments.apache.org integration for the Ref Guide; the comments system has been down since December 2017 and there is no concrete plan to bring it back. + * SOLR-12076: Remove unnecessary printLayout usage in CDCR tests (Varun Thacker) * SOLR-12086: Fix format problem in FastLRUCache description string shown on Cache Statistics page. @@ -455,7 +457,7 @@ Bug Fixes * SOLR-12256: Fixed some eventual-consistency issues with collection aliases by using ZooKeeper.sync(). (David Smiley) -* SOLR-12087: Deleting replicas sometimes fails and causes the replicas to exist in the down +* SOLR-12087: Deleting replicas sometimes fails and causes the replicas to exist in the down state (Cao Manh Dat) * SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat) @@ -471,7 +473,7 @@ Bug Fixes * SOLR-12202: Fix errors in solr-exporter.cmd. (Minoru Osuka via koji) * SOLR-12316: Do not allow to use absolute URIs for including other files in solrconfig.xml - and schema parsing (CVE-2018-8010). (Ananthesh, Ishan Chattopadhyaya, Uwe Schindler) + and schema parsing (CVE-2018-8010). (Ananthesh, Ishan Chattopadhyaya, Uwe Schindler) ================== 7.3.0 ================== @@ -635,7 +637,7 @@ New Features * SOLR-12077: Add support for autoAddReplicas in the collection creation dialog in Admin UI. (shalin) -* SOLR-9510: introducing {!filters param=$fq excludeTags=f} query parser. +* SOLR-9510: introducing {!filters param=$fq excludeTags=f} query parser. Introducing {!.. filters=$fq excludeTags=t,q} in {!parent} and {!child} (Dr. Oleg Savrasov via Mikhail Khludnev) Bug Fixes diff --git a/solr/solr-ref-guide/src/_includes/head.html b/solr/solr-ref-guide/src/_includes/head.html index ac20a72f9eb..60d6e5d7292 100755 --- a/solr/solr-ref-guide/src/_includes/head.html +++ b/solr/solr-ref-guide/src/_includes/head.html @@ -12,7 +12,6 @@ - diff --git a/solr/solr-ref-guide/src/_layouts/page.html b/solr/solr-ref-guide/src/_layouts/page.html index 22f88e6909f..d9f56c9f1ea 100755 --- a/solr/solr-ref-guide/src/_layouts/page.html +++ b/solr/solr-ref-guide/src/_layouts/page.html @@ -68,16 +68,4 @@ layout: default - - -

-          Comments on this Page
-          We welcome feedback on Solr documentation. However, we cannot provide application support via comments.
-          If you need help, please send a message to the Solr User mailing list.
    - - - {% include footer.html %} diff --git a/solr/solr-ref-guide/src/css/comments.css b/solr/solr-ref-guide/src/css/comments.css deleted file mode 100644 index f59796affcf..00000000000 --- a/solr/solr-ref-guide/src/css/comments.css +++ /dev/null @@ -1,160 +0,0 @@ -/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * comments.css - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - -#comments_thread a:link { - color: #5A88B5; - background-color: inherit; -} - -#comments_thread a:visited { - color: #5A88B5; - background-color: inherit; -} - -#comments_thread a:link:hover, -#comments_thread a:link:active, -#comments_thread a:visited:hover, -#comments_thread a:visited:active { - color: #0073c7; - background-color: #f0f0f0; -} - - -/* in general */ - -#comments_thread h4 { - font-size: 14px; -} - -.apaste_menu { - float: right; - margin-right: 10px; - width: 80px; -} - -.apaste_comment { - background: #FEFEFE; - border: 1px solid #AAA; - border-radius: 2px; - display: block; - white-space: pre-wrap; - font-weight: normal; - padding-left: 20px; - padding-right: 20px; - padding-bottom: 16px; - padding-top: 5px; - margin: 15px; - font-size: 13px -} -.comment_header { - color: #000000; - border-radius: 3px; - border: 1px solid #999; - min-height: 24px; - text-indent: 5px; - font-size: 12pt; - background: #ffe9a3; /* Old browsers */ - background: -moz-linear-gradient(top, #ffe9a3 0%, #ffd08a 32%, #ff9d57 69%, #ff833d 100%); /* FF3.6-15 */ - background: -webkit-linear-gradient(top, #ffe9a3 0%,#ffd08a 32%,#ff9d57 69%,#ff833d 100%); /* Chrome10-25,Safari5.1-6 */ - background: linear-gradient(to bottom, #ffe9a3 0%,#ffd08a 32%,#ff9d57 69%,#ff833d 100%); /* W3C, IE10+, FF16+, Chrome26+, Opera12+, Safari7+ */ -} - -.comment_header_verified { - color: #000000; - border-radius: 3px; - border: 1px solid #999; - min-height: 24px; - text-indent: 5px; - font-size: 12pt; - background: #ffe9a3; /* Old browsers */ - background: -moz-linear-gradient(top, #ffe9a3 0%, #ffd08a 32%, #ff9d57 69%, #ff833d 100%); /* FF3.6-15 */ - background: -webkit-linear-gradient(top, #ffe9a3 0%,#ffd08a 32%,#ff9d57 69%,#ff833d 100%); /* Chrome10-25,Safari5.1-6 */ - background: linear-gradient(to bottom, #ffe9a3 0%,#ffd08a 32%,#ff9d57 69%,#ff833d 100%); /* W3C, IE10+, FF16+, Chrome26+, Opera12+, Safari7+ */ -} - -.comment_header_sticky { - color: #000000; - border-radius: 3px; - border: 1px solid #999; - min-height: 24px; - text-indent: 5px; - font-size: 12pt; - background: #ffe9a3; /* Old browsers */ - background: -moz-linear-gradient(top, #ffe9a3 0%, #ffd08a 32%, #ff9d57 69%, #ff833d 100%); /* FF3.6-15 */ - background: -webkit-linear-gradient(top, #ffe9a3 0%,#ffd08a 32%,#ff9d57 69%,#ff833d 100%); /* Chrome10-25,Safari5.1-6 */ - background: linear-gradient(to bottom, #ffe9a3 0%,#ffd08a 32%,#ff9d57 69%,#ff833d 100%); /* W3C, IE10+, FF16+, Chrome26+, Opera12+, Safari7+ */ -} - -.comment_header img { - padding-top: 3px; - padding-bottom: 2px; -} - -.comment_header_verified img { - padding-top: 3px; - padding-bottom: 2px; -} - -.comment_header_sticky img { - padding-top: 3px; - padding-bottom: 2px; -} - -.apaste_comment img { -/* border-radius: 5px;*/ - border: none; -} - -.apaste_comment_selected {background: #F8F4E9;} -.apaste_comment_notapproved {background: #F8E0E0;} -.apaste_comment_resolved {background: #FAFCFA;} -.apaste_comment_sticky {background: #FFFFF6;} -.apaste_comment_verified {background: #FAFBFA;} - -.apaste_comment_invalid { - color: #999; - background: #F8F8F8; -} - - 
-.apaste_comment textarea { - width: 480px; - height: 180px; -} - -#apaste { - margin: 5px; - font-weight: normal; - font-size: 14px; - color: #024; - -} -#apaste .section { - padding: 20px; - padding-left: 80px; -} - -.notapproved { - background-color: #FEE; - padding: 5px; -} - -#comments_thread textarea{ - background-color: #ffffff; - width: auto; - border: 1px solid #1c1c1c; - border-radius: 3px; - box-shadow: 0pt 1px 3px rgba(0, 0, 0, 0.16) inset; - position: relative; -} - -.apaste_honeypot { - display: none; -} - -//* Remove external link icons when they appear in comments *// -a[href^="http://"]:after, -a[href^="https://"]:after { - content: none !important; -}
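
Looking back at the LUCENE-8273 hunk earlier in this series (the ShingleFilter entry added to avoidConditionals in TestRandomChains): the filter does not handle input graphs correctly, so the test no longer wraps it in a conditional branch. Below is a minimal sketch of the kind of analysis chain involved, assuming a plain WhitespaceTokenizer feeding a ShingleFilter; the demo class, field name and sample text are illustrative only.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.shingle.ShingleFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // Illustrative sketch -- not part of the patches above.
    public class ShingleChainDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new WhitespaceTokenizer();
            // Emit 2- and 3-word shingles; ShingleFilter expects a linear (non-graph) token
            // stream, which is the limitation LUCENE-8273 works around in TestRandomChains.
            TokenStream sink = new ShingleFilter(source, 2, 3);
            return new TokenStreamComponents(source, sink);
          }
        };
        try (TokenStream ts = analyzer.tokenStream("body", "please divide this sentence")) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            System.out.println(term.toString());
          }
          ts.end();
        }
        analyzer.close();
      }
    }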