From 5b2594350df11ef54d52f417b34c6d082ad85e89 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Tue, 29 Nov 2016 08:05:47 +0530 Subject: [PATCH 01/53] SOLR-9784: added deprecation javadocs --- .../org/apache/solr/client/solrj/impl/CloudSolrClient.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index 9c59d4f69e1..3b694843995 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -645,6 +645,8 @@ public class CloudSolrClient extends SolrClient { * are allowing client access to zookeeper, you should protect the * /configs node against unauthorised write access. * + * @deprecated Please use {@link ZkClientClusterStateProvider#uploadConfig(Path, String)} instead + * * @param configPath {@link java.nio.file.Path} to the config files * @param configName the name of the config * @throws IOException if an IO error occurs @@ -665,6 +667,8 @@ public class CloudSolrClient extends SolrClient { /** * Download a named config from Zookeeper to a location on the filesystem + * + * @deprecated Please use {@link ZkClientClusterStateProvider#downloadConfig(String, Path)} instead * @param configName the name of the config * @param downloadPath the path to write config files to * @throws IOException if an I/O exception occurs From 70b358960dfe8a6da35991b2a84c93cc9370c3d8 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Tue, 29 Nov 2016 18:02:59 +0530 Subject: [PATCH 02/53] SOLR-9546: remove unnecessary boxing --- .../solr/search/mlt/CloudMLTQParser.java | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java index 0f85feb13c9..0f46725eb27 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java @@ -69,26 +69,19 @@ public class CloudMLTQParser extends QParser { Map boostFields = new HashMap<>(); MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); - if(localParams.getInt("mintf") != null) - mlt.setMinTermFreq(localParams.getInt("mintf")); + mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); mlt.setMinDocFreq(localParams.getInt("mindf", 0)); - if(localParams.get("minwl") != null) - mlt.setMinWordLen(localParams.getInt("minwl")); + mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); - if(localParams.get("maxwl") != null) - mlt.setMaxWordLen(localParams.getInt("maxwl")); + mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); - if(localParams.get("maxqt") != null) - mlt.setMaxQueryTerms(localParams.getInt("maxqt")); + mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); - if(localParams.get("maxntp") != null) - mlt.setMaxNumTokensParsed(localParams.getInt("maxntp")); + mlt.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); - if(localParams.get("maxdf") != null) { - mlt.setMaxDocFreq(localParams.getInt("maxdf")); - } + mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); if(localParams.get("boost") != null) { mlt.setBoost(localParams.getBool("boost")); From 
02c687758e904ab92c2b766b2ec837bcb99f484f Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Mon, 28 Nov 2016 19:58:25 +0100 Subject: [PATCH 03/53] SOLR-9783: (Search|Top)Group[s]ShardResponseProcessor.process: turned sortWithinGroup null check into assert. Also sort.equals tweak in (grouping) QueryCommand.create method. --- solr/CHANGES.txt | 3 +++ .../search/grouping/distributed/command/QueryCommand.java | 2 +- .../responseprocessor/SearchGroupShardResponseProcessor.java | 4 +--- .../responseprocessor/TopGroupsShardResponseProcessor.java | 4 +--- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 422f1c658e4..cead9af537e 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -245,6 +245,9 @@ Other Changes * SOLR-9801: Upgrade jetty to 9.3.14.v20161028 (shalin) +* SOLR-9783: (Search|Top)Group[s]ShardResponseProcessor.process: turned sortWithinGroup null check into assert. + (Christine Poerschke) + ================== 6.3.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/core/src/java/org/apache/solr/search/grouping/distributed/command/QueryCommand.java b/solr/core/src/java/org/apache/solr/search/grouping/distributed/command/QueryCommand.java index 86fe729447e..afb8ba78a9c 100644 --- a/solr/core/src/java/org/apache/solr/search/grouping/distributed/command/QueryCommand.java +++ b/solr/core/src/java/org/apache/solr/search/grouping/distributed/command/QueryCommand.java @@ -124,7 +124,7 @@ public class QueryCommand implements Command { @Override public List create() throws IOException { - if (sort == null || sort == Sort.RELEVANCE) { + if (sort == null || sort.equals(Sort.RELEVANCE)) { collector = TopScoreDocCollector.create(docsToCollect); } else { collector = TopFieldCollector.create(sort, docsToCollect, true, needScores, needScores); diff --git a/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/SearchGroupShardResponseProcessor.java b/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/SearchGroupShardResponseProcessor.java index 18b0de54a44..0acd6f90e27 100644 --- a/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/SearchGroupShardResponseProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/SearchGroupShardResponseProcessor.java @@ -52,9 +52,7 @@ public class SearchGroupShardResponseProcessor implements ShardResponseProcessor Sort groupSort = rb.getGroupingSpec().getGroupSort(); final String[] fields = rb.getGroupingSpec().getFields(); Sort sortWithinGroup = rb.getGroupingSpec().getSortWithinGroup(); - if (sortWithinGroup == null) { // TODO prevent it from being null in the first place - sortWithinGroup = Sort.RELEVANCE; - } + assert sortWithinGroup != null; final Map>>> commandSearchGroups = new HashMap<>(fields.length, 1.0f); final Map, Set>> tempSearchGroupToShards = new HashMap<>(fields.length, 1.0f); diff --git a/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java b/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java index 688a6c37011..3610a383ccb 100644 --- a/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java +++ 
b/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java @@ -61,9 +61,7 @@ public class TopGroupsShardResponseProcessor implements ShardResponseProcessor { String[] fields = rb.getGroupingSpec().getFields(); String[] queries = rb.getGroupingSpec().getQueries(); Sort sortWithinGroup = rb.getGroupingSpec().getSortWithinGroup(); - if (sortWithinGroup == null) { // TODO prevent it from being null in the first place - sortWithinGroup = Sort.RELEVANCE; - } + assert sortWithinGroup != null; // If group.format=simple group.offset doesn't make sense int groupOffsetDefault; From 590d31f311c092aa97bc64b1a28a9dbf934b0e52 Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Tue, 29 Nov 2016 21:11:40 +0100 Subject: [PATCH 04/53] SOLR-9768 RecordingJsonParser produces incomplete json (Wojciech Stryszyk via ab) --- solr/CHANGES.txt | 2 ++ .../apache/solr/util/RecordingJSONParser.java | 17 +++++++-- .../common/util/TestJsonRecordReader.java | 35 +++++++++++++++---- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index cead9af537e..448f2d7adbe 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -216,6 +216,8 @@ Bug Fixes * SOLR-5260: Facet search on a docvalue field in a multi shard collection (Trym Møller, Erick Erickson) +* SOLR-9768: RecordingJsonParser produces incomplete json (Wojciech Stryszyk via ab) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/util/RecordingJSONParser.java b/solr/core/src/java/org/apache/solr/util/RecordingJSONParser.java index 030913d349f..a85610b7cb2 100644 --- a/solr/core/src/java/org/apache/solr/util/RecordingJSONParser.java +++ b/solr/core/src/java/org/apache/solr/util/RecordingJSONParser.java @@ -29,7 +29,9 @@ public class RecordingJSONParser extends JSONParser { private StringBuilder sb = new StringBuilder(); private boolean objectStarted = false; - public long lastMarkedPosition = 0; + private long lastMarkedPosition = 0; + private long lastGlobalPosition = 0; + private static final int BUFFER_SIZE = 8192; public RecordingJSONParser(Reader in) { @@ -39,7 +41,7 @@ public class RecordingJSONParser extends JSONParser { } static char[] getChars() { - buf.set(new char[8192]); + buf.set(new char[BUFFER_SIZE]); return buf.get(); } @@ -68,11 +70,22 @@ public class RecordingJSONParser extends JSONParser { if(currPosition < 0){ System.out.println("ERROR"); } + if (currPosition > lastMarkedPosition) { for (long i = lastMarkedPosition; i < currPosition; i++) { recordChar(bufCopy[(int) i]); } + } else if (currPosition < lastMarkedPosition) { + for (long i = 0; i < currPosition; i++) { + recordChar(bufCopy[(int) i]); + } + } else if (currPosition == BUFFER_SIZE && lastGlobalPosition != globalPosition) { + for (long i = 0; i < currPosition; i++) { + recordChar(bufCopy[(int) i]); + } } + + lastGlobalPosition = globalPosition; lastMarkedPosition = currPosition; } diff --git a/solr/solrj/src/test/org/apache/solr/common/util/TestJsonRecordReader.java b/solr/solrj/src/test/org/apache/solr/common/util/TestJsonRecordReader.java index d59dea39146..da75a43cf15 100644 --- a/solr/solrj/src/test/org/apache/solr/common/util/TestJsonRecordReader.java +++ b/solr/solrj/src/test/org/apache/solr/common/util/TestJsonRecordReader.java @@ -16,11 +16,6 @@ */ package org.apache.solr.common.util; -import org.apache.solr.SolrTestCaseJ4; -import org.apache.solr.util.RecordingJSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import 
java.io.IOException; import java.io.StringReader; import java.lang.invoke.MethodHandles; @@ -31,6 +26,12 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicReference; +import org.apache.commons.lang.StringUtils; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.util.RecordingJSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class TestJsonRecordReader extends SolrTestCaseJ4 { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -129,18 +130,32 @@ public class TestJsonRecordReader extends SolrTestCaseJ4 { " \"nested_inside\" : \"check check check 1\"\n" + " }\n" + "}"; + String json2 = " {\n" + " \"id\" : \"345\",\n" + + " \"payload\": \""+ StringUtils.repeat("0123456789", 819) + + "\",\n" + " \"description\": \"Testing /json/docs srcField 2\",\n" + "\n" + " \"nested_data\" : {\n" + " \"nested_inside\" : \"check check check 2\"\n" + " }\n" + "}"; - JsonRecordReader streamer = JsonRecordReader.getInst("/", Arrays.asList("id:/id")); - RecordingJSONParser parser = new RecordingJSONParser(new StringReader(json + json2)); + String json3 = + " {\n" + + " \"id\" : \"678\",\n" + + " \"description\": \"Testing /json/docs srcField 3\",\n" + + "\n" + + " \"nested_data\" : {\n" + + " \"nested_inside\" : \"check check check 3\"\n" + + " }\n" + + "}"; + + + JsonRecordReader streamer = JsonRecordReader.getInst("/", Arrays.asList("id:/id")); + RecordingJSONParser parser = new RecordingJSONParser(new StringReader(json + json2 + json3)); streamer.streamRecords(parser, new JsonRecordReader.Handler() { int count = 0; @@ -162,6 +177,12 @@ public class TestJsonRecordReader extends SolrTestCaseJ4 { assertEquals(m.get("description"), "Testing /json/docs srcField 2"); assertEquals(((Map) m.get("nested_data")).get("nested_inside"), "check check check 2"); } + if (count++ == 3) { + assertEquals(m.get("id"), "678"); + assertEquals(m.get("description"), "Testing /json/docs srcField 3"); + assertEquals(((Map) m.get("nested_data")).get("nested_inside"), "check check check 3"); + } + } }); From a7fa920b52febb80be70210caad7db1eeaf0f97a Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Tue, 29 Nov 2016 19:54:47 +0100 Subject: [PATCH 05/53] SOLR-9660: in GroupingSpecification factor [group](sort|offset|limit) into [group](sortSpec) (Judith Silverman, Christine Poerschke) --- solr/CHANGES.txt | 3 + .../handler/component/QueryComponent.java | 34 ++++----- .../component/QueryElevationComponent.java | 28 +++----- .../apache/solr/search/SolrIndexSearcher.java | 26 +++++++ .../java/org/apache/solr/search/SortSpec.java | 10 +++ .../grouping/GroupingSpecification.java | 70 +++++++++++-------- .../TopGroupsShardResponseProcessor.java | 4 +- .../GroupedEndResultTransformer.java | 4 +- 8 files changed, 112 insertions(+), 67 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 448f2d7adbe..1584647e787 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -250,6 +250,9 @@ Other Changes * SOLR-9783: (Search|Top)Group[s]ShardResponseProcessor.process: turned sortWithinGroup null check into assert. (Christine Poerschke) +* SOLR-9660: in GroupingSpecification factor [group](sort|offset|limit) into [group](sortSpec) + (Judith Silverman, Christine Poerschke) + ================== 6.3.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. 
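As an aside on the SOLR-9660 change below: a minimal sketch of how a caller now wires sort, offset and limit through SortSpec objects instead of the separate GroupingSpecification fields. The class names, the four-argument SortSpec constructor and the setters are taken from the diff that follows; the searcher, params, requestSortSpec and groupingSpec variables are assumed to exist in the calling component, and this condenses the QueryComponent change rather than reproducing it.

    static void applyGroupSorts(SolrIndexSearcher searcher, SortSpec requestSortSpec,
                                SolrParams params, GroupingSpecification groupingSpec) throws IOException {
      // Weight the requested sort against the searcher; fall back to relevance when no sort was given.
      SortSpec groupSortSpec = searcher.weightSortSpec(requestSortSpec, Sort.RELEVANCE);

      // group.offset and group.limit now live on the within-group SortSpec,
      // not on GroupingSpecification itself.
      SortSpec withinGroupSortSpec = new SortSpec(
          groupSortSpec.getSort(),
          groupSortSpec.getSchemaFields(),
          params.getInt(GroupParams.GROUP_LIMIT, 1),    // count, formerly setGroupLimit
          params.getInt(GroupParams.GROUP_OFFSET, 0));  // offset, formerly setGroupOffset

      groupingSpec.setGroupSortSpec(groupSortSpec);
      groupingSpec.setSortSpecWithinGroup(withinGroupSortSpec);
    }
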
diff --git a/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java b/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java index fb6fec94111..84ade43c7a5 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java @@ -252,21 +252,27 @@ public class QueryComponent extends SearchComponent final SortSpec sortSpec = rb.getSortSpec(); //TODO: move weighting of sort - Sort groupSort = searcher.weightSort(sortSpec.getSort()); - if (groupSort == null) { - groupSort = Sort.RELEVANCE; - } + final SortSpec groupSortSpec = searcher.weightSortSpec(sortSpec, Sort.RELEVANCE); // groupSort defaults to sort String sortWithinGroupStr = params.get(GroupParams.GROUP_SORT); //TODO: move weighting of sort - Sort sortWithinGroup = sortWithinGroupStr == null ? groupSort : searcher.weightSort(SortSpecParsing.parseSortSpec(sortWithinGroupStr, req).getSort()); - if (sortWithinGroup == null) { - sortWithinGroup = Sort.RELEVANCE; + final SortSpec sortSpecWithinGroup; + if (sortWithinGroupStr != null) { + SortSpec parsedSortSpecWithinGroup = SortSpecParsing.parseSortSpec(sortWithinGroupStr, req); + sortSpecWithinGroup = searcher.weightSortSpec(parsedSortSpecWithinGroup, Sort.RELEVANCE); + } else { + sortSpecWithinGroup = new SortSpec( + groupSortSpec.getSort(), + groupSortSpec.getSchemaFields(), + groupSortSpec.getCount(), + groupSortSpec.getOffset()); } + sortSpecWithinGroup.setOffset(params.getInt(GroupParams.GROUP_OFFSET, 0)); + sortSpecWithinGroup.setCount(params.getInt(GroupParams.GROUP_LIMIT, 1)); - groupingSpec.setSortWithinGroup(sortWithinGroup); - groupingSpec.setGroupSort(groupSort); + groupingSpec.setSortSpecWithinGroup(sortSpecWithinGroup); + groupingSpec.setGroupSortSpec(groupSortSpec); String formatStr = params.get(GroupParams.GROUP_FORMAT, Grouping.Format.grouped.name()); Grouping.Format responseFormat; @@ -280,10 +286,6 @@ public class QueryComponent extends SearchComponent groupingSpec.setFields(params.getParams(GroupParams.GROUP_FIELD)); groupingSpec.setQueries(params.getParams(GroupParams.GROUP_QUERY)); groupingSpec.setFunctions(params.getParams(GroupParams.GROUP_FUNC)); - groupingSpec.setGroupOffset(params.getInt(GroupParams.GROUP_OFFSET, 0)); - groupingSpec.setGroupLimit(params.getInt(GroupParams.GROUP_LIMIT, 1)); - groupingSpec.setOffset(sortSpec.getOffset()); - groupingSpec.setLimit(sortSpec.getCount()); groupingSpec.setIncludeGroupCount(params.getBool(GroupParams.GROUP_TOTAL_COUNT, false)); groupingSpec.setMain(params.getBool(GroupParams.GROUP_MAIN, false)); groupingSpec.setNeedScore((rb.getFieldFlags() & SolrIndexSearcher.GET_SCORES) != 0); @@ -415,7 +417,7 @@ public class QueryComponent extends SearchComponent .setTruncateGroups(groupingSpec.isTruncateGroups() && groupingSpec.getFields().length > 0) .setSearcher(searcher); - int docsToCollect = Grouping.getMax(groupingSpec.getGroupOffset(), groupingSpec.getGroupLimit(), searcher.maxDoc()); + int docsToCollect = Grouping.getMax(groupingSpec.getWithinGroupOffset(), groupingSpec.getWithinGroupLimit(), searcher.maxDoc()); docsToCollect = Math.max(docsToCollect, 1); for (String field : groupingSpec.getFields()) { @@ -477,8 +479,8 @@ public class QueryComponent extends SearchComponent .setDefaultFormat(groupingSpec.getResponseFormat()) .setLimitDefault(limitDefault) .setDefaultTotalCount(defaultTotalCount) - .setDocsPerGroupDefault(groupingSpec.getGroupLimit()) - 
.setGroupOffsetDefault(groupingSpec.getGroupOffset()) + .setDocsPerGroupDefault(groupingSpec.getWithinGroupLimit()) + .setGroupOffsetDefault(groupingSpec.getWithinGroupOffset()) .setGetGroupedDocSet(groupingSpec.isTruncateGroups()); if (groupingSpec.getFields() != null) { diff --git a/solr/core/src/java/org/apache/solr/handler/component/QueryElevationComponent.java b/solr/core/src/java/org/apache/solr/handler/component/QueryElevationComponent.java index 4dde8ef539e..f72fc89a66a 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/QueryElevationComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/QueryElevationComponent.java @@ -460,15 +460,15 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore // alter the sorting in the grouping specification if there is one GroupingSpecification groupingSpec = rb.getGroupingSpec(); if(groupingSpec != null) { - SortField[] groupSort = groupingSpec.getGroupSort().getSort(); - Sort modGroupSort = this.modifySort(groupSort, force, comparator); - if(modGroupSort != null) { - groupingSpec.setGroupSort(modGroupSort); + SortSpec groupSortSpec = groupingSpec.getGroupSortSpec(); + SortSpec modGroupSortSpec = this.modifySortSpec(groupSortSpec, force, comparator); + if (modGroupSortSpec != null) { + groupingSpec.setGroupSortSpec(modGroupSortSpec); } - SortField[] withinGroupSort = groupingSpec.getSortWithinGroup().getSort(); - Sort modWithinGroupSort = this.modifySort(withinGroupSort, force, comparator); - if(modWithinGroupSort != null) { - groupingSpec.setSortWithinGroup(modWithinGroupSort); + SortSpec withinGroupSortSpec = groupingSpec.getSortSpecWithinGroup(); + SortSpec modWithinGroupSortSpec = this.modifySortSpec(withinGroupSortSpec, force, comparator); + if (modWithinGroupSortSpec != null) { + groupingSpec.setSortSpecWithinGroup(modWithinGroupSortSpec); } } } @@ -494,12 +494,6 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore } } - private Sort modifySort(SortField[] current, boolean force, ElevationComparatorSource comparator) { - SortSpec tmp = new SortSpec(new Sort(current), Arrays.asList(new SchemaField[current.length])); - tmp = modifySortSpec(tmp, force, comparator); - return null == tmp ? null : tmp.getSort(); - } - private SortSpec modifySortSpec(SortSpec current, boolean force, ElevationComparatorSource comparator) { boolean modify = false; SortField[] currentSorts = current.getSort().getSort(); @@ -526,9 +520,9 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore } if (modify) { SortSpec newSpec = new SortSpec(new Sort(sorts.toArray(new SortField[sorts.size()])), - fields); - newSpec.setOffset(current.getOffset()); - newSpec.setCount(current.getCount()); + fields, + current.getCount(), + current.getOffset()); return newSpec; } return null; diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java index bf381f4c332..6d13b515076 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -924,6 +924,32 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable, SolrI return (sort != null) ? 
sort.rewrite(this) : null; } + /** Returns a weighted sort spec according to this searcher */ + public SortSpec weightSortSpec(SortSpec originalSortSpec, Sort nullEquivalent) throws IOException { + return implWeightSortSpec( + originalSortSpec.getSort(), + originalSortSpec.getCount(), + originalSortSpec.getOffset(), + nullEquivalent); + } + + /** Returns a weighted sort spec according to this searcher */ + private SortSpec implWeightSortSpec(Sort originalSort, int num, int offset, Sort nullEquivalent) throws IOException { + Sort rewrittenSort = weightSort(originalSort); + if (rewrittenSort == null) { + rewrittenSort = nullEquivalent; + } + + final SortField[] rewrittenSortFields = rewrittenSort.getSort(); + final SchemaField[] rewrittenSchemaFields = new SchemaField[rewrittenSortFields.length]; + for (int ii = 0; ii < rewrittenSortFields.length; ++ii) { + final String fieldName = rewrittenSortFields[ii].getField(); + rewrittenSchemaFields[ii] = (fieldName == null ? null : schema.getFieldOrNull(fieldName)); + } + + return new SortSpec(rewrittenSort, rewrittenSchemaFields, num, offset); + } + /** * Returns the first document number containing the term t Returns -1 if no document was found. This * method is primarily intended for clients that want to fetch documents using a unique identifier." diff --git a/solr/core/src/java/org/apache/solr/search/SortSpec.java b/solr/core/src/java/org/apache/solr/search/SortSpec.java index 8cd954c0354..b79ed0a094f 100644 --- a/solr/core/src/java/org/apache/solr/search/SortSpec.java +++ b/solr/core/src/java/org/apache/solr/search/SortSpec.java @@ -34,9 +34,19 @@ public class SortSpec private int num = 10; private int offset = 0; + public SortSpec(Sort sort, List fields, int num, int offset) { + setSortAndFields(sort, fields); + this.num = num; + this.offset = offset; + } public SortSpec(Sort sort, List fields) { setSortAndFields(sort, fields); } + public SortSpec(Sort sort, SchemaField[] fields, int num, int offset) { + setSortAndFields(sort, Arrays.asList(fields)); + this.num = num; + this.offset = offset; + } public SortSpec(Sort sort, SchemaField[] fields) { setSortAndFields(sort, Arrays.asList(fields)); } diff --git a/solr/core/src/java/org/apache/solr/search/grouping/GroupingSpecification.java b/solr/core/src/java/org/apache/solr/search/grouping/GroupingSpecification.java index fbe0aced053..4194dd087f8 100644 --- a/solr/core/src/java/org/apache/solr/search/grouping/GroupingSpecification.java +++ b/solr/core/src/java/org/apache/solr/search/grouping/GroupingSpecification.java @@ -18,6 +18,7 @@ package org.apache.solr.search.grouping; import org.apache.lucene.search.Sort; import org.apache.solr.search.Grouping; +import org.apache.solr.search.SortSpec; /** * Encapsulates the grouping options like fields group sort and more specified by clients. 
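For illustration (not part of the patch): downstream code reads the same values back through the nested SortSpec objects introduced in the hunks below; the deprecated int/Sort getters merely delegate, so a hypothetical consumer with a populated groupingSpec migrates like this.

    Sort groupSort = groupingSpec.getGroupSortSpec().getSort();   // replaces getGroupSort()
    SortSpec withinGroup = groupingSpec.getSortSpecWithinGroup();
    int withinGroupLimit  = withinGroup.getCount();               // replaces getGroupLimit()
    int withinGroupOffset = withinGroup.getOffset();              // replaces getGroupOffset()
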
@@ -29,12 +30,8 @@ public class GroupingSpecification { private String[] fields = new String[]{}; private String[] queries = new String[]{}; private String[] functions = new String[]{}; - private int offset; - private int limit; - private int groupOffset; - private int groupLimit; - private Sort groupSort; - private Sort sortWithinGroup; + private SortSpec groupSortSpec; + private SortSpec sortSpecWithinGroup; private boolean includeGroupCount; private boolean main; private Grouping.Format responseFormat; @@ -77,53 +74,49 @@ public class GroupingSpecification { this.functions = functions; } + @Deprecated + public int getWithinGroupOffset() { + return sortSpecWithinGroup.getOffset(); + } + @Deprecated public int getGroupOffset() { - return groupOffset; + return getWithinGroupOffset(); } - public void setGroupOffset(int groupOffset) { - this.groupOffset = groupOffset; - } + @Deprecated + public int getWithinGroupLimit() { + return sortSpecWithinGroup.getCount(); + } + @Deprecated public int getGroupLimit() { - return groupLimit; + return getWithinGroupLimit(); } - public void setGroupLimit(int groupLimit) { - this.groupLimit = groupLimit; - } + @Deprecated public int getOffset() { - return offset; + return groupSortSpec.getOffset(); } - public void setOffset(int offset) { - this.offset = offset; - } + @Deprecated public int getLimit() { - return limit; + return groupSortSpec.getCount(); } - public void setLimit(int limit) { - this.limit = limit; - } + @Deprecated public Sort getGroupSort() { - return groupSort; + return groupSortSpec.getSort(); } - public void setGroupSort(Sort groupSort) { - this.groupSort = groupSort; - } + @Deprecated public Sort getSortWithinGroup() { - return sortWithinGroup; + return sortSpecWithinGroup.getSort(); } - public void setSortWithinGroup(Sort sortWithinGroup) { - this.sortWithinGroup = sortWithinGroup; - } public boolean isIncludeGroupCount() { return includeGroupCount; @@ -164,4 +157,21 @@ public class GroupingSpecification { public void setTruncateGroups(boolean truncateGroups) { this.truncateGroups = truncateGroups; } + + public SortSpec getGroupSortSpec() { + return groupSortSpec; + } + + public void setGroupSortSpec(SortSpec groupSortSpec) { + this.groupSortSpec = groupSortSpec; + } + + public SortSpec getSortSpecWithinGroup() { + return sortSpecWithinGroup; + } + + public void setSortSpecWithinGroup(SortSpec sortSpecWithinGroup) { + this.sortSpecWithinGroup = sortSpecWithinGroup; + } + } diff --git a/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java b/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java index 3610a383ccb..7e38e5dead2 100644 --- a/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/grouping/distributed/responseprocessor/TopGroupsShardResponseProcessor.java @@ -68,9 +68,9 @@ public class TopGroupsShardResponseProcessor implements ShardResponseProcessor { if (rb.getGroupingSpec().getResponseFormat() == Grouping.Format.simple || rb.getGroupingSpec().isMain()) { groupOffsetDefault = 0; } else { - groupOffsetDefault = rb.getGroupingSpec().getGroupOffset(); + groupOffsetDefault = rb.getGroupingSpec().getWithinGroupOffset(); } - int docsPerGroupDefault = rb.getGroupingSpec().getGroupLimit(); + int docsPerGroupDefault = rb.getGroupingSpec().getWithinGroupLimit(); Map>> commandTopGroups = new 
HashMap<>(); for (String field : fields) { diff --git a/solr/core/src/java/org/apache/solr/search/grouping/endresulttransformer/GroupedEndResultTransformer.java b/solr/core/src/java/org/apache/solr/search/grouping/endresulttransformer/GroupedEndResultTransformer.java index f8c9872a7a6..47b5276a1eb 100644 --- a/solr/core/src/java/org/apache/solr/search/grouping/endresulttransformer/GroupedEndResultTransformer.java +++ b/solr/core/src/java/org/apache/solr/search/grouping/endresulttransformer/GroupedEndResultTransformer.java @@ -79,7 +79,7 @@ public class GroupedEndResultTransformer implements EndResultTransformer { if (!Float.isNaN(group.maxScore)) { docList.setMaxScore(group.maxScore); } - docList.setStart(rb.getGroupingSpec().getGroupOffset()); + docList.setStart(rb.getGroupingSpec().getWithinGroupOffset()); for (ScoreDoc scoreDoc : group.scoreDocs) { docList.add(solrDocumentSource.retrieve(scoreDoc)); } @@ -97,7 +97,7 @@ public class GroupedEndResultTransformer implements EndResultTransformer { if (!Float.isNaN(queryCommandResult.getTopDocs().getMaxScore())) { docList.setMaxScore(queryCommandResult.getTopDocs().getMaxScore()); } - docList.setStart(rb.getGroupingSpec().getGroupOffset()); + docList.setStart(rb.getGroupingSpec().getWithinGroupOffset()); for (ScoreDoc scoreDoc :queryCommandResult.getTopDocs().scoreDocs){ docList.add(solrDocumentSource.retrieve(scoreDoc)); } From 44cce6bc4c5f3452d188cf4e8905a3ed7ef3e247 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Wed, 30 Nov 2016 18:47:52 +0530 Subject: [PATCH 06/53] typo in javadocs --- solr/solrj/src/java/org/apache/solr/common/PushWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/solrj/src/java/org/apache/solr/common/PushWriter.java b/solr/solrj/src/java/org/apache/solr/common/PushWriter.java index ddfac3cca52..7829d3762c8 100644 --- a/solr/solrj/src/java/org/apache/solr/common/PushWriter.java +++ b/solr/solrj/src/java/org/apache/solr/common/PushWriter.java @@ -33,7 +33,7 @@ public interface PushWriter extends Closeable { void writeMap(MapWriter mw) throws IOException; /**Write an array. The array is opened at the beginning of this method - * and closed at the end. All array entries must be returned before this + * and closed at the end. 
All array entries must be written before this * method returns * */ From e64bcb37ffe9ccbe1c88cb451ff147de774aec8e Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Thu, 1 Dec 2016 00:46:58 +0530 Subject: [PATCH 07/53] SOLR-9616 Solr throws exception when expand=true on empty index --- solr/CHANGES.txt | 2 ++ .../solr/handler/component/ExpandComponent.java | 6 ++++++ .../handler/component/TestExpandComponent.java | 15 +++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1584647e787..d09ae3be13b 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -218,6 +218,8 @@ Bug Fixes * SOLR-9768: RecordingJsonParser produces incomplete json (Wojciech Stryszyk via ab) +* SOLR-9616: Solr throws exception when expand=true on empty index (Timo Hund via Ishan Chattopadhyaya) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java index 8274d68aebd..366c4a9b7a8 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java @@ -265,6 +265,12 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia * This code gathers the group information for the current page. */ List contexts = searcher.getTopReaderContext().leaves(); + + if(contexts.size() == 0) { + //When no context is available we can skip the expanding + return; + } + int currentContext = 0; int currentDocBase = contexts.get(currentContext).docBase; int nextDocBase = (currentContext+1) Date: Wed, 30 Nov 2016 14:04:58 -0500 Subject: [PATCH 08/53] LUCENE-7542: Remove debug printing of parsed versions --- dev-tools/scripts/smokeTestRelease.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev-tools/scripts/smokeTestRelease.py b/dev-tools/scripts/smokeTestRelease.py index 2b1ff193e4d..f9c34990418 100644 --- a/dev-tools/scripts/smokeTestRelease.py +++ b/dev-tools/scripts/smokeTestRelease.py @@ -497,7 +497,6 @@ def versionToTuple(version, name): versionTuple = versionTuple[:-2] + ('100',) elif versionTuple[-1].lower()[:2] == 'rc': versionTuple = versionTuple[:-2] + (versionTuple[-1][2:],) - print('%s: %s' % (version, versionTuple)) return versionTuple From c61268f7cd2c47884f98513febee6bb5f33ea6dc Mon Sep 17 00:00:00 2001 From: Anshum Gupta Date: Fri, 2 Dec 2016 12:09:10 -0800 Subject: [PATCH 09/53] SOLR-9819: Upgrade Apache commons-fileupload to 1.3.2, fixing a security vulnerability --- lucene/ivy-versions.properties | 2 +- solr/CHANGES.txt | 2 ++ solr/licenses/commons-fileupload-1.3.1.jar.sha1 | 1 - solr/licenses/commons-fileupload-1.3.2.jar.sha1 | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) delete mode 100644 solr/licenses/commons-fileupload-1.3.1.jar.sha1 create mode 100644 solr/licenses/commons-fileupload-1.3.2.jar.sha1 diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 85261052a06..ffc54a89f3b 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -64,7 +64,7 @@ com.sun.jersey.version = 1.9 /commons-collections/commons-collections = 3.2.2 /commons-configuration/commons-configuration = 1.6 /commons-digester/commons-digester = 2.1 -/commons-fileupload/commons-fileupload = 1.3.1 +/commons-fileupload/commons-fileupload = 1.3.2 /commons-io/commons-io = 2.5 /commons-lang/commons-lang = 2.6 /commons-logging/commons-logging = 1.1.3 diff --git 
a/solr/CHANGES.txt b/solr/CHANGES.txt index d09ae3be13b..e76616942be 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -255,6 +255,8 @@ Other Changes * SOLR-9660: in GroupingSpecification factor [group](sort|offset|limit) into [group](sortSpec) (Judith Silverman, Christine Poerschke) +* SOLR-9819: Upgrade commons-fileupload to 1.3.2, fixing a potential vulnerability CVE-2016-3092 (Anshum Gupta) + ================== 6.3.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/licenses/commons-fileupload-1.3.1.jar.sha1 b/solr/licenses/commons-fileupload-1.3.1.jar.sha1 deleted file mode 100644 index 32f48724c86..00000000000 --- a/solr/licenses/commons-fileupload-1.3.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -c621b54583719ac0310404463d6d99db27e1052c diff --git a/solr/licenses/commons-fileupload-1.3.2.jar.sha1 b/solr/licenses/commons-fileupload-1.3.2.jar.sha1 new file mode 100644 index 00000000000..747b509acc2 --- /dev/null +++ b/solr/licenses/commons-fileupload-1.3.2.jar.sha1 @@ -0,0 +1 @@ +5d7491ed6ebd02b6a8d2305f8e6b7fe5dbd95f72 \ No newline at end of file From fcccd317ddb44a742a0b3265fcf32923649f38cd Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 2 Dec 2016 15:26:04 -0500 Subject: [PATCH 10/53] LUCENE-7576: detect when special case automaton is passed to Terms.intersect --- lucene/CHANGES.txt | 4 ++++ .../lucene/codecs/blocktree/FieldReader.java | 3 +++ .../java/org/apache/lucene/index/Terms.java | 8 ++++++-- .../org/apache/lucene/index/TestTermsEnum.java | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e62a99d1eb0..4afc5078fa2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -81,6 +81,10 @@ Bug Fixes * LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term component when preserveOriginal was set to true. (Adrien Grand) +* LUCENE-7576: Fix Terms.intersect in the default codec to detect when + the incoming automaton is a special case and throw a clearer + exception than NullPointerException (Tom Mortimer via Mike McCandless) + Improvements * LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java index 7f13a3264ce..4ee38262403 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java @@ -182,6 +182,9 @@ public final class FieldReader extends Terms implements Accountable { //System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton); // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum? // can we optimize knowing that...? 
+ if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm, compiled.sinkState); } diff --git a/lucene/core/src/java/org/apache/lucene/index/Terms.java b/lucene/core/src/java/org/apache/lucene/index/Terms.java index dd48ce9c189..7197e25e549 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Terms.java +++ b/lucene/core/src/java/org/apache/lucene/index/Terms.java @@ -49,8 +49,12 @@ public abstract class Terms { * provided startTerm must be accepted by * the automaton. * - *

NOTE: the returned TermsEnum cannot - * seek. + * This is an expert low-level API and will only work + * for {@code NORMAL} compiled automata. To handle any + * compiled automata you should use + * {@link CompiledAutomaton#getTermsEnum} instead. + * + * NOTE: the returned TermsEnum cannot seek. * *
NOTE: the terms dictionary is free to * return arbitrary terms as long as the resulted visited diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java index 3f15381e54c..a388d42ae30 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java @@ -998,4 +998,22 @@ public class TestTermsEnum extends LuceneTestCase { } dir.close(); } + + // LUCENE-7576 + public void testIntersectRegexp() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + doc.add(newStringField("field", "foobar", Field.Store.NO)); + w.addDocument(doc); + IndexReader r = w.getReader(); + Fields fields = MultiFields.getFields(r); + CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton()); + Terms terms = fields.terms("field"); + String message = expectThrows(IllegalArgumentException.class, () -> {terms.intersect(automaton, null);}).getMessage(); + assertEquals("please use CompiledAutomaton.getTermsEnum instead", message); + r.close(); + w.close(); + d.close(); + } } From 58476b1675befd88776c72fb7b178c294a39edae Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 2 Dec 2016 15:30:37 -0500 Subject: [PATCH 11/53] improve IW javadocs --- .../org/apache/lucene/index/IndexWriter.java | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 68f3b3b6b2d..98687855231 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -133,19 +133,24 @@ import org.apache.lucene.util.Version;

Expert: IndexWriter allows an optional - {@link IndexDeletionPolicy} implementation to be - specified. You can use this to control when prior commits - are deleted from the index. The default policy is {@link - KeepOnlyLastCommitDeletionPolicy} which removes all prior - commits as soon as a new commit is done (this matches - behavior before 2.2). Creating your own policy can allow - you to explicitly keep previous "point in time" commits - alive in the index for some time, to allow readers to - refresh to the new commit without having the old commit - deleted out from under them. This is necessary on - filesystems like NFS that do not support "delete on last - close" semantics, which Lucene's "point in time" search - normally relies on.

+ {@link IndexDeletionPolicy} implementation to be specified. You + can use this to control when prior commits are deleted from + the index. The default policy is {@link KeepOnlyLastCommitDeletionPolicy} + which removes all prior commits as soon as a new commit is + done. Creating your own policy can allow you to explicitly + keep previous "point in time" commits alive in the index for + some time, either because this is useful for your application, + or to give readers enough time to refresh to the new commit + without having the old commit deleted out from under them. + The latter is necessary when multiple computers take turns opening + their own {@code IndexWriter} and {@code IndexReader}s + against a single shared index mounted via remote filesystems + like NFS which do not support "delete on last close" semantics. + A single computer accessing an index via NFS is fine with the + default deletion policy since NFS clients emulate "delete on + last close" locally. That said, accessing an index via NFS + will likely result in poor performance compared to a local IO + device.
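To make the paragraph above concrete (a sketch only, not part of this patch): an application that wants to keep a "point in time" commit alive while remote readers use it could install a SnapshotDeletionPolicy on top of the default policy. Only standard Lucene APIs are used; the analyzer and dir variables are assumed to already exist.

    // Wrap the default policy so commits can be pinned explicitly instead of
    // being deleted as soon as the next commit lands.
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    SnapshotDeletionPolicy sdp = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
    iwc.setIndexDeletionPolicy(sdp);
    IndexWriter writer = new IndexWriter(dir, iwc);

    writer.commit();                      // ensure there is a commit to pin
    IndexCommit pinned = sdp.snapshot();  // this commit will not be deleted while pinned
    try {
      // readers (possibly on other machines over NFS) can keep using the pinned commit,
      // e.g. via DirectoryReader.open(pinned), and refresh to newer commits at their own pace
    } finally {
      sdp.release(pinned);                // allow the commit to be cleaned up again
    }
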

Expert: IndexWriter allows you to separately change From 8cbcbc9d956754de1fab2c626705aa6d6ab9f910 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 2 Dec 2016 17:42:27 -0500 Subject: [PATCH 12/53] LUCENE-7576: fix other codecs to detect when special case automaton is passed to Terms.intersect --- .../org/apache/lucene/codecs/memory/DirectPostingsFormat.java | 3 +++ .../org/apache/lucene/codecs/memory/FSTOrdTermsReader.java | 3 +++ .../java/org/apache/lucene/codecs/memory/FSTTermsReader.java | 3 +++ 3 files changed, 9 insertions(+) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java index 3ce2abe4358..00f25cf189c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java @@ -659,6 +659,9 @@ public final class DirectPostingsFormat extends PostingsFormat { @Override public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } return new DirectIntersectTermsEnum(compiled, startTerm); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java index 305c4194381..97bbea3ddef 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java @@ -270,6 +270,9 @@ public class FSTOrdTermsReader extends FieldsProducer { @Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } return new IntersectTermsEnum(compiled, startTerm); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java index 775f6929548..b120656688c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java @@ -250,6 +250,9 @@ public class FSTTermsReader extends FieldsProducer { @Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } return new IntersectTermsEnum(compiled, startTerm); } From 39c2f3d80fd585c7ae4a4a559d53a19a3f100061 Mon Sep 17 00:00:00 2001 From: Anshum Gupta Date: Fri, 2 Dec 2016 16:42:35 -0800 Subject: [PATCH 13/53] SOLR-9819: Add new line to the end of SHA --- solr/licenses/commons-fileupload-1.3.2.jar.sha1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/licenses/commons-fileupload-1.3.2.jar.sha1 b/solr/licenses/commons-fileupload-1.3.2.jar.sha1 index 747b509acc2..80f80fb6def 100644 --- a/solr/licenses/commons-fileupload-1.3.2.jar.sha1 +++ b/solr/licenses/commons-fileupload-1.3.2.jar.sha1 @@ -1 +1 @@ -5d7491ed6ebd02b6a8d2305f8e6b7fe5dbd95f72 \ No newline at end of file +5d7491ed6ebd02b6a8d2305f8e6b7fe5dbd95f72 From 
5e8db2e068f2549b9619d5ac48a50c8032fc292b Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sun, 4 Dec 2016 05:18:04 -0500 Subject: [PATCH 14/53] LUCENE-7563: use a compressed format for the in-heap BKD index --- lucene/CHANGES.txt | 4 + .../simpletext/SimpleTextBKDReader.java | 281 ++- .../simpletext/SimpleTextBKDWriter.java | 1661 +++++++++++++++++ .../simpletext/SimpleTextPointsReader.java | 5 +- .../simpletext/SimpleTextPointsWriter.java | 188 +- .../codecs/lucene60/Lucene60PointsFormat.java | 10 +- .../lucene/codecs/lucene60/package-info.java | 4 +- .../lucene/codecs/lucene62/package-info.java | 4 +- .../lucene/codecs/lucene70/package-info.java | 15 +- .../org/apache/lucene/index/CheckIndex.java | 314 ++-- .../org/apache/lucene/util/bkd/BKDReader.java | 717 ++++--- .../org/apache/lucene/util/bkd/BKDWriter.java | 293 ++- .../lucene/util/bkd/HeapPointReader.java | 7 +- .../lucene/util/bkd/HeapPointWriter.java | 22 +- .../util/bkd/MutablePointsReaderUtils.java | 21 +- .../lucene/util/bkd/OfflinePointReader.java | 8 +- .../lucene/util/bkd/OfflinePointWriter.java | 10 +- .../apache/lucene/util/bkd/PointReader.java | 14 +- .../apache/lucene/util/bkd/PointWriter.java | 6 +- .../lucene/search/TestPointQueries.java | 3 + .../lucene/util/bkd/Test2BBKDPoints.java | 11 +- .../org/apache/lucene/util/bkd/TestBKD.java | 54 + .../org/apache/lucene/util/fst/TestFSTs.java | 2 +- .../lucene/document/NearestNeighbor.java | 44 +- 24 files changed, 3030 insertions(+), 668 deletions(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4afc5078fa2..79e44e112c8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,6 +126,10 @@ Optimizations * LUCENE-7568: Optimize merging when index sorting is used but the index is already sorted (Jim Ferenczi via Mike McCandless) +* LUCENE-7563: The BKD in-memory index for dimensional points now uses + a compressed format, using substantially less RAM in some cases + (Adrien Grand, Mike McCandless) + Other * LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java index a2b784afd27..488547b4dea 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java @@ -16,13 +16,17 @@ */ package org.apache.lucene.codecs.simpletext; - import java.io.IOException; import java.nio.charset.StandardCharsets; +import org.apache.lucene.codecs.simpletext.SimpleTextUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.PointValues; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.bkd.BKDReader; @@ -30,15 +34,105 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_C import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID; import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE; -class SimpleTextBKDReader extends BKDReader { +/** Forked from {@link BKDReader} and 
simplified/specialized for SimpleText's usage */ - public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues, +final class SimpleTextBKDReader extends PointValues implements Accountable { + // Packed array of byte[] holding all split values in the full binary tree: + final private byte[] splitPackedValues; + final long[] leafBlockFPs; + final private int leafNodeOffset; + final int numDims; + final int bytesPerDim; + final int bytesPerIndexEntry; + final IndexInput in; + final int maxPointsInLeafNode; + final byte[] minPackedValue; + final byte[] maxPackedValue; + final long pointCount; + final int docCount; + final int version; + protected final int packedBytesLength; + + public SimpleTextBKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues, byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException { - super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue, pointCount, docCount); + this.in = in; + this.numDims = numDims; + this.maxPointsInLeafNode = maxPointsInLeafNode; + this.bytesPerDim = bytesPerDim; + // no version check here because callers of this API (SimpleText) have no back compat: + bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1; + packedBytesLength = numDims * bytesPerDim; + this.leafNodeOffset = leafBlockFPs.length; + this.leafBlockFPs = leafBlockFPs; + this.splitPackedValues = splitPackedValues; + this.minPackedValue = minPackedValue; + this.maxPackedValue = maxPackedValue; + this.pointCount = pointCount; + this.docCount = docCount; + this.version = SimpleTextBKDWriter.VERSION_CURRENT; + assert minPackedValue.length == packedBytesLength; + assert maxPackedValue.length == packedBytesLength; } - @Override - protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { + /** Used to track all state for a single call to {@link #intersect}. */ + public static final class IntersectState { + final IndexInput in; + final int[] scratchDocIDs; + final byte[] scratchPackedValue; + final int[] commonPrefixLengths; + + final IntersectVisitor visitor; + + public IntersectState(IndexInput in, int numDims, + int packedBytesLength, + int maxPointsInLeafNode, + IntersectVisitor visitor) { + this.in = in; + this.visitor = visitor; + this.commonPrefixLengths = new int[numDims]; + this.scratchDocIDs = new int[maxPointsInLeafNode]; + this.scratchPackedValue = new byte[packedBytesLength]; + } + } + + public void intersect(IntersectVisitor visitor) throws IOException { + intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue); + } + + /** Fast path: this is called when the query box fully encompasses all cells under this node. */ + private void addAll(IntersectState state, int nodeID) throws IOException { + //System.out.println("R: addAll nodeID=" + nodeID); + + if (nodeID >= leafNodeOffset) { + //System.out.println("ADDALL"); + visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor); + // TODO: we can assert that the first value here in fact matches what the index claimed? 
+ } else { + addAll(state, 2*nodeID); + addAll(state, 2*nodeID+1); + } + } + + /** Create a new {@link IntersectState} */ + public IntersectState getIntersectState(IntersectVisitor visitor) { + return new IntersectState(in.clone(), numDims, + packedBytesLength, + maxPointsInLeafNode, + visitor); + } + + /** Visits all docIDs and packed values in a single leaf block */ + public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException { + int leafID = nodeID - leafNodeOffset; + + // Leaf node; scan and filter all points in this block: + int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); + + // Again, this time reading values and checking with the visitor + visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor); + } + + void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { BytesRefBuilder scratch = new BytesRefBuilder(); in.seek(blockFP); readLine(in, scratch); @@ -50,8 +144,7 @@ class SimpleTextBKDReader extends BKDReader { } } - @Override - protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { + int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { BytesRefBuilder scratch = new BytesRefBuilder(); in.seek(blockFP); readLine(in, scratch); @@ -63,8 +156,7 @@ class SimpleTextBKDReader extends BKDReader { return count; } - @Override - protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { visitor.grow(count); // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths assert scratchPackedValue.length == packedBytesLength; @@ -79,6 +171,175 @@ class SimpleTextBKDReader extends BKDReader { } } + private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException { + // the byte at `compressedByteOffset` is compressed using run-length compression, + // other suffix bytes are stored verbatim + final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim]; + commonPrefixLengths[compressedDim]++; + int i; + for (i = 0; i < count; ) { + scratchPackedValue[compressedByteOffset] = in.readByte(); + final int runLen = Byte.toUnsignedInt(in.readByte()); + for (int j = 0; j < runLen; ++j) { + for(int dim=0;dim 1.1 MB with 128 points +// per leaf, and you can reduce that by putting more points per leaf +// - we could use threads while building; the higher nodes are very parallelizable + +/** Forked from {@link BKDWriter} and simplified/specialized for SimpleText's usage */ + +final class SimpleTextBKDWriter implements Closeable { + + public static final String CODEC_NAME = "BKD"; + public static final int VERSION_START = 0; + public static final int VERSION_COMPRESSED_DOC_IDS = 1; + public static final int VERSION_COMPRESSED_VALUES = 2; + public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3; + public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D; + + /** How many bytes each docs takes in the fixed-width offline format */ + private final int bytesPerDoc; + + /** Default maximum number of point in each leaf block */ + public static 
final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024; + + /** Default maximum heap to use, before spilling to (slower) disk */ + public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f; + + /** Maximum number of dimensions */ + public static final int MAX_DIMS = 8; + + /** How many dimensions we are indexing */ + protected final int numDims; + + /** How many bytes each value in each dimension takes. */ + protected final int bytesPerDim; + + /** numDims * bytesPerDim */ + protected final int packedBytesLength; + + final BytesRefBuilder scratch = new BytesRefBuilder(); + + final TrackingDirectoryWrapper tempDir; + final String tempFileNamePrefix; + final double maxMBSortInHeap; + + final byte[] scratchDiff; + final byte[] scratch1; + final byte[] scratch2; + final BytesRef scratchBytesRef1 = new BytesRef(); + final BytesRef scratchBytesRef2 = new BytesRef(); + final int[] commonPrefixLengths; + + protected final FixedBitSet docsSeen; + + private OfflinePointWriter offlinePointWriter; + private HeapPointWriter heapPointWriter; + + private IndexOutput tempInput; + protected final int maxPointsInLeafNode; + private final int maxPointsSortInHeap; + + /** Minimum per-dim values, packed */ + protected final byte[] minPackedValue; + + /** Maximum per-dim values, packed */ + protected final byte[] maxPackedValue; + + protected long pointCount; + + /** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */ + protected final boolean longOrds; + + /** An upper bound on how many points the caller will add (includes deletions) */ + private final long totalPointCount; + + /** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */ + protected final boolean singleValuePerDoc; + + /** How much heap OfflineSorter is allowed to use */ + protected final OfflineSorter.BufferSize offlineSorterBufferMB; + + /** How much heap OfflineSorter is allowed to use */ + protected final int offlineSorterMaxTempFiles; + + private final int maxDoc; + + public SimpleTextBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, + int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException { + this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc, + totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES); + } + + private SimpleTextBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, + int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, + boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException { + verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount); + // We use tracking dir to deal with removing files on exception, so each place that + // creates temp files doesn't need crazy try/finally/sucess logic: + this.tempDir = new TrackingDirectoryWrapper(tempDir); + this.tempFileNamePrefix = tempFileNamePrefix; + this.maxPointsInLeafNode = maxPointsInLeafNode; + this.numDims = numDims; + this.bytesPerDim = bytesPerDim; + this.totalPointCount = totalPointCount; + this.maxDoc = maxDoc; + this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB); + this.offlineSorterMaxTempFiles = 
offlineSorterMaxTempFiles; + docsSeen = new FixedBitSet(maxDoc); + packedBytesLength = numDims * bytesPerDim; + + scratchDiff = new byte[bytesPerDim]; + scratch1 = new byte[packedBytesLength]; + scratch2 = new byte[packedBytesLength]; + commonPrefixLengths = new int[numDims]; + + minPackedValue = new byte[packedBytesLength]; + maxPackedValue = new byte[packedBytesLength]; + + // If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes). + this.longOrds = longOrds; + + this.singleValuePerDoc = singleValuePerDoc; + + // dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int) + if (singleValuePerDoc) { + // Lucene only supports up to 2.1 docs, so we better not need longOrds in this case: + assert longOrds == false; + bytesPerDoc = packedBytesLength + Integer.BYTES; + } else if (longOrds) { + bytesPerDoc = packedBytesLength + Long.BYTES + Integer.BYTES; + } else { + bytesPerDoc = packedBytesLength + Integer.BYTES + Integer.BYTES; + } + + // As we recurse, we compute temporary partitions of the data, halving the + // number of points at each recursion. Once there are few enough points, + // we can switch to sorting in heap instead of offline (on disk). At any + // time in the recursion, we hold the number of points at that level, plus + // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X + // what that level would consume, so we multiply by 0.5 to convert from + // bytes to points here. Each dimension has its own sorted partition, so + // we must divide by numDims as wel. + + maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims)); + + // Finally, we must be able to hold at least the leaf node in heap during build: + if (maxPointsSortInHeap < maxPointsInLeafNode) { + throw new IllegalArgumentException("maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" + maxPointsInLeafNode + "; either increase maxMBSortInHeap or decrease maxPointsInLeafNode"); + } + + // We write first maxPointsSortInHeap in heap, then cutover to offline for additional points: + heapPointWriter = new HeapPointWriter(16, maxPointsSortInHeap, packedBytesLength, longOrds, singleValuePerDoc); + + this.maxMBSortInHeap = maxMBSortInHeap; + } + + public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount) { + // We encode dim in a single byte in the splitPackedValues, but we only expose 4 bits for it now, in case we want to use + // remaining 4 bits for another purpose later + if (numDims < 1 || numDims > MAX_DIMS) { + throw new IllegalArgumentException("numDims must be 1 .. 
" + MAX_DIMS + " (got: " + numDims + ")"); + } + if (maxPointsInLeafNode <= 0) { + throw new IllegalArgumentException("maxPointsInLeafNode must be > 0; got " + maxPointsInLeafNode); + } + if (maxPointsInLeafNode > ArrayUtil.MAX_ARRAY_LENGTH) { + throw new IllegalArgumentException("maxPointsInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxPointsInLeafNode); + } + if (maxMBSortInHeap < 0.0) { + throw new IllegalArgumentException("maxMBSortInHeap must be >= 0.0 (got: " + maxMBSortInHeap + ")"); + } + if (totalPointCount < 0) { + throw new IllegalArgumentException("totalPointCount must be >=0 (got: " + totalPointCount + ")"); + } + } + + /** If the current segment has too many points then we spill over to temp files / offline sort. */ + private void spillToOffline() throws IOException { + + // For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree: + offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, "spill", 0, singleValuePerDoc); + tempInput = offlinePointWriter.out; + PointReader reader = heapPointWriter.getReader(0, pointCount); + for(int i=0;i= maxPointsSortInHeap) { + if (offlinePointWriter == null) { + spillToOffline(); + } + offlinePointWriter.append(packedValue, pointCount, docID); + } else { + // Not too many points added yet, continue using heap: + heapPointWriter.append(packedValue, pointCount, docID); + } + + // TODO: we could specialize for the 1D case: + if (pointCount == 0) { + System.arraycopy(packedValue, 0, minPackedValue, 0, packedBytesLength); + System.arraycopy(packedValue, 0, maxPackedValue, 0, packedBytesLength); + } else { + for(int dim=0;dim 0) { + System.arraycopy(packedValue, offset, maxPackedValue, offset, bytesPerDim); + } + } + } + + pointCount++; + if (pointCount > totalPointCount) { + throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values"); + } + docsSeen.set(docID); + } + + /** How many points have been added so far */ + public long getPointCount() { + return pointCount; + } + + private static class MergeReader { + final SimpleTextBKDReader bkd; + final SimpleTextBKDReader.IntersectState state; + final MergeState.DocMap docMap; + + /** Current doc ID */ + public int docID; + + /** Which doc in this block we are up to */ + private int docBlockUpto; + + /** How many docs in the current block */ + private int docsInBlock; + + /** Which leaf block we are up to */ + private int blockID; + + private final byte[] packedValues; + + public MergeReader(SimpleTextBKDReader bkd, MergeState.DocMap docMap) throws IOException { + this.bkd = bkd; + state = new SimpleTextBKDReader.IntersectState(bkd.in.clone(), + bkd.numDims, + bkd.packedBytesLength, + bkd.maxPointsInLeafNode, + null); + this.docMap = docMap; + long minFP = Long.MAX_VALUE; + //System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length); + for(long fp : bkd.leafBlockFPs) { + minFP = Math.min(minFP, fp); + //System.out.println(" leaf fp=" + fp); + } + state.in.seek(minFP); + this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength]; + } + + public boolean next() throws IOException { + //System.out.println("MR.next this=" + this); + while (true) { + if (docBlockUpto == docsInBlock) { + if (blockID == bkd.leafBlockFPs.length) { + //System.out.println(" done!"); + return false; + } + 
//System.out.println(" new block @ fp=" + state.in.getFilePointer()); + docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs); + assert docsInBlock > 0; + docBlockUpto = 0; + bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() { + int i = 0; + + @Override + public void visit(int docID) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + assert docID == state.scratchDocIDs[i]; + System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength); + i++; + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + throw new UnsupportedOperationException(); + } + + }); + + blockID++; + } + + final int index = docBlockUpto++; + int oldDocID = state.scratchDocIDs[index]; + + int mappedDocID; + if (docMap == null) { + mappedDocID = oldDocID; + } else { + mappedDocID = docMap.get(oldDocID); + } + + if (mappedDocID != -1) { + // Not deleted! + docID = mappedDocID; + System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength); + return true; + } + } + } + } + + private static class BKDMergeQueue extends PriorityQueue { + private final int bytesPerDim; + + public BKDMergeQueue(int bytesPerDim, int maxSize) { + super(maxSize); + this.bytesPerDim = bytesPerDim; + } + + @Override + public boolean lessThan(MergeReader a, MergeReader b) { + assert a != b; + + int cmp = StringHelper.compare(bytesPerDim, a.state.scratchPackedValue, 0, b.state.scratchPackedValue, 0); + if (cmp < 0) { + return true; + } else if (cmp > 0) { + return false; + } + + // Tie break by sorting smaller docIDs earlier: + return a.docID < b.docID; + } + } + + /** Write a field from a {@link MutablePointValues}. This way of writing + * points is faster than regular writes with {@link BKDWriter#add} since + * there is opportunity for reordering points before writing them to + * disk. This method does not use transient disk in order to reorder points. + */ + public long writeField(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException { + if (numDims == 1) { + return writeField1Dim(out, fieldName, reader); + } else { + return writeFieldNDims(out, fieldName, reader); + } + } + + + /* In the 2+D case, we recursively pick the split dimension, compute the + * median value and partition other values around it. 
*/ + private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValues values) throws IOException { + if (pointCount != 0) { + throw new IllegalStateException("cannot mix add and writeField"); + } + + // Catch user silliness: + if (heapPointWriter == null && tempInput == null) { + throw new IllegalStateException("already finished"); + } + + // Mark that we already finished: + heapPointWriter = null; + + long countPerLeaf = pointCount = values.size(); + long innerNodeCount = 1; + + while (countPerLeaf > maxPointsInLeafNode) { + countPerLeaf = (countPerLeaf+1)/2; + innerNodeCount *= 2; + } + + int numLeaves = Math.toIntExact(innerNodeCount); + + checkMaxLeafNodeCount(numLeaves); + + final byte[] splitPackedValues = new byte[numLeaves * (bytesPerDim + 1)]; + final long[] leafBlockFPs = new long[numLeaves]; + + // compute the min/max for this slice + Arrays.fill(minPackedValue, (byte) 0xff); + Arrays.fill(maxPackedValue, (byte) 0); + for (int i = 0; i < Math.toIntExact(pointCount); ++i) { + values.getValue(i, scratchBytesRef1); + for(int dim=0;dim 0) { + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim); + } + } + + docsSeen.set(values.getDocID(i)); + } + + build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out, + minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, + new int[maxPointsInLeafNode]); + + long indexFP = out.getFilePointer(); + writeIndex(out, leafBlockFPs, splitPackedValues); + return indexFP; + } + + + /* In the 1D case, we can simply sort points in ascending order and use the + * same writing logic as we use at merge time. */ + private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException { + MutablePointsReaderUtils.sort(maxDoc, packedBytesLength, reader, 0, Math.toIntExact(reader.size())); + + final OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out); + + reader.intersect(new IntersectVisitor() { + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + oneDimWriter.add(packedValue, docID); + } + + @Override + public void visit(int docID) throws IOException { + throw new IllegalStateException(); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + return oneDimWriter.finish(); + } + + // TODO: remove this opto: SimpleText is supposed to be simple! + + /** More efficient bulk-add for incoming {@link SimpleTextBKDReader}s. This does a merge sort of the already + * sorted values and currently only works when numDims==1. This returns -1 if all documents containing + * dimensional values were deleted. */ + public long merge(IndexOutput out, List docMaps, List readers) throws IOException { + assert docMaps == null || readers.size() == docMaps.size(); + + BKDMergeQueue queue = new BKDMergeQueue(bytesPerDim, readers.size()); + + for(int i=0;i totalPointCount) { + throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values"); + } + + if (leafCount == maxPointsInLeafNode) { + // We write a block once we hit exactly the max count ... 
this is different from + // when we flush a new segment, where we write between max/2 and max per leaf block, + // so merged segments will behave differently from newly flushed segments: + writeLeafBlock(); + leafCount = 0; + } + + assert (lastDocID = docID) >= 0; // only assign when asserts are enabled + } + + public long finish() throws IOException { + if (leafCount > 0) { + writeLeafBlock(); + leafCount = 0; + } + + if (valueCount == 0) { + return -1; + } + + pointCount = valueCount; + + long indexFP = out.getFilePointer(); + + int numInnerNodes = leafBlockStartValues.size(); + + //System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts=" + leafBlockStartValues.size()); + + byte[] index = new byte[(1+numInnerNodes) * (1+bytesPerDim)]; + rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues); + long[] arr = new long[leafBlockFPs.size()]; + for(int i=0;i 0) { + // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end: + leafBlockStartValues.add(Arrays.copyOf(leafValues, packedBytesLength)); + } + leafBlockFPs.add(out.getFilePointer()); + checkMaxLeafNodeCount(leafBlockFPs.size()); + + Arrays.fill(commonPrefixLengths, bytesPerDim); + // Find per-dim common prefix: + for(int dim=0;dim packedValues = new IntFunction() { + final BytesRef scratch = new BytesRef(); + + { + scratch.length = packedBytesLength; + scratch.bytes = leafValues; + } + + @Override + public BytesRef apply(int i) { + scratch.offset = packedBytesLength * i; + return scratch; + } + }; + assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength), + Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength), + packedValues, leafDocs, 0); + writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues); + } + + } + + // TODO: there must be a simpler way? 
+ private void rotateToTree(int nodeID, int offset, int count, byte[] index, List leafBlockStartValues) { + //System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " bpd=" + bytesPerDim + " index.length=" + index.length); + if (count == 1) { + // Leaf index node + //System.out.println(" leaf index node"); + //System.out.println(" index[" + nodeID + "] = blockStartValues[" + offset + "]"); + System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim); + } else if (count > 1) { + // Internal index node: binary partition of count + int countAtLevel = 1; + int totalCount = 0; + while (true) { + int countLeft = count - totalCount; + //System.out.println(" cycle countLeft=" + countLeft + " coutAtLevel=" + countAtLevel); + if (countLeft <= countAtLevel) { + // This is the last level, possibly partially filled: + int lastLeftCount = Math.min(countAtLevel/2, countLeft); + assert lastLeftCount >= 0; + int leftHalf = (totalCount-1)/2 + lastLeftCount; + + int rootOffset = offset + leftHalf; + /* + System.out.println(" last left count " + lastLeftCount); + System.out.println(" leftHalf " + leftHalf + " rightHalf=" + (count-leftHalf-1)); + System.out.println(" rootOffset=" + rootOffset); + */ + + System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim); + //System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]"); + + // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree + // under here, to save this while loop on each recursion + + // Recurse left + rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues); + + // Recurse right + rotateToTree(2*nodeID+1, rootOffset+1, count-leftHalf-1, index, leafBlockStartValues); + return; + } + totalCount += countAtLevel; + countAtLevel *= 2; + } + } else { + assert count == 0; + } + } + + // TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length + // encoding and not have our own ByteSequencesReader/Writer + + /** Sort the heap writer by the specified dim */ + private void sortHeapPointWriter(final HeapPointWriter writer, int dim) { + final int pointCount = Math.toIntExact(this.pointCount); + // Tie-break by docID: + + // No need to tie break on ord, for the case where the same doc has the same value in a given dimension indexed more than once: it + // can't matter at search time since we don't write ords into the index: + new MSBRadixSorter(bytesPerDim + Integer.BYTES) { + + @Override + protected int byteAt(int i, int k) { + assert k >= 0; + if (k < bytesPerDim) { + // dim bytes + int block = i / writer.valuesPerBlock; + int index = i % writer.valuesPerBlock; + return writer.blocks.get(block)[index * packedBytesLength + dim * bytesPerDim + k] & 0xff; + } else { + // doc id + int s = 3 - (k - bytesPerDim); + return (writer.docIDs[i] >>> (s * 8)) & 0xff; + } + } + + @Override + protected void swap(int i, int j) { + int docID = writer.docIDs[i]; + writer.docIDs[i] = writer.docIDs[j]; + writer.docIDs[j] = docID; + + if (singleValuePerDoc == false) { + if (longOrds) { + long ord = writer.ordsLong[i]; + writer.ordsLong[i] = writer.ordsLong[j]; + writer.ordsLong[j] = ord; + } else { + int ord = writer.ords[i]; + writer.ords[i] = writer.ords[j]; + writer.ords[j] = ord; + } + } + + byte[] blockI = writer.blocks.get(i / writer.valuesPerBlock); + int indexI = (i % writer.valuesPerBlock) * 
packedBytesLength; + byte[] blockJ = writer.blocks.get(j / writer.valuesPerBlock); + int indexJ = (j % writer.valuesPerBlock) * packedBytesLength; + + // scratch1 = values[i] + System.arraycopy(blockI, indexI, scratch1, 0, packedBytesLength); + // values[i] = values[j] + System.arraycopy(blockJ, indexJ, blockI, indexI, packedBytesLength); + // values[j] = scratch1 + System.arraycopy(scratch1, 0, blockJ, indexJ, packedBytesLength); + } + + }.sort(0, pointCount); + } + + private PointWriter sort(int dim) throws IOException { + assert dim >= 0 && dim < numDims; + + if (heapPointWriter != null) { + + assert tempInput == null; + + // We never spilled the incoming points to disk, so now we sort in heap: + HeapPointWriter sorted; + + if (dim == 0) { + // First dim can re-use the current heap writer + sorted = heapPointWriter; + } else { + // Subsequent dims need a private copy + sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc); + sorted.copyFrom(heapPointWriter); + } + + //long t0 = System.nanoTime(); + sortHeapPointWriter(sorted, dim); + //long t1 = System.nanoTime(); + //System.out.println("BKD: sort took " + ((t1-t0)/1000000.0) + " msec"); + + sorted.close(); + return sorted; + } else { + + // Offline sort: + assert tempInput != null; + + final int offset = bytesPerDim * dim; + + Comparator cmp; + if (dim == numDims - 1) { + // in that case the bytes for the dimension and for the doc id are contiguous, + // so we don't need a branch + cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) { + @Override + protected int byteAt(BytesRef ref, int i) { + return ref.bytes[ref.offset + offset + i] & 0xff; + } + }; + } else { + cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) { + @Override + protected int byteAt(BytesRef ref, int i) { + if (i < bytesPerDim) { + return ref.bytes[ref.offset + offset + i] & 0xff; + } else { + return ref.bytes[ref.offset + packedBytesLength + i - bytesPerDim] & 0xff; + } + } + }; + } + + OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) { + + /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */ + @Override + protected ByteSequencesWriter getWriter(IndexOutput out) { + return new ByteSequencesWriter(out) { + @Override + public void write(byte[] bytes, int off, int len) throws IOException { + assert len == bytesPerDoc: "len=" + len + " bytesPerDoc=" + bytesPerDoc; + out.writeBytes(bytes, off, len); + } + }; + } + + /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. 
*/ + @Override + protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException { + return new ByteSequencesReader(in, name) { + final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]); + @Override + public BytesRef next() throws IOException { + if (in.getFilePointer() >= end) { + return null; + } + in.readBytes(scratch.bytes, 0, bytesPerDoc); + return scratch; + } + }; + } + }; + + String name = sorter.sort(tempInput.getName()); + + return new OfflinePointWriter(tempDir, name, packedBytesLength, pointCount, longOrds, singleValuePerDoc); + } + } + + private void checkMaxLeafNodeCount(int numLeaves) { + if ((1+bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) { + throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex"); + } + } + + /** Writes the BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */ + public long finish(IndexOutput out) throws IOException { + // System.out.println("\nBKDTreeWriter.finish pointCount=" + pointCount + " out=" + out + " heapWriter=" + heapPointWriter); + + // TODO: specialize the 1D case? it's much faster at indexing time (no partitioning on recurse...) + + // Catch user silliness: + if (heapPointWriter == null && tempInput == null) { + throw new IllegalStateException("already finished"); + } + + if (offlinePointWriter != null) { + offlinePointWriter.close(); + } + + if (pointCount == 0) { + throw new IllegalStateException("must index at least one point"); + } + + LongBitSet ordBitSet; + if (numDims > 1) { + if (singleValuePerDoc) { + ordBitSet = new LongBitSet(maxDoc); + } else { + ordBitSet = new LongBitSet(pointCount); + } + } else { + ordBitSet = null; + } + + long countPerLeaf = pointCount; + long innerNodeCount = 1; + + while (countPerLeaf > maxPointsInLeafNode) { + countPerLeaf = (countPerLeaf+1)/2; + innerNodeCount *= 2; + } + + int numLeaves = (int) innerNodeCount; + + checkMaxLeafNodeCount(numLeaves); + + // NOTE: we could save the 1+ here, to use a bit less heap at search time, but then we'd need a somewhat costly check at each + // step of the recursion to recompute the split dim: + + // Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each recursion says which dim we split on. + byte[] splitPackedValues = new byte[Math.toIntExact(numLeaves*(1+bytesPerDim))]; + + // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. 
7) + long[] leafBlockFPs = new long[numLeaves]; + + // Make sure the math above "worked": + assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode; + + // Sort all docs once by each dimension: + PathSlice[] sortedPointWriters = new PathSlice[numDims]; + + // This is only used on exception; on normal code paths we close all files we opened: + List toCloseHeroically = new ArrayList<>(); + + boolean success = false; + try { + //long t0 = System.nanoTime(); + for(int dim=0;dim packedValues) throws IOException { + for (int i = 0; i < count; ++i) { + BytesRef packedValue = packedValues.apply(i); + // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths + write(out, BLOCK_VALUE); + write(out, packedValue.toString()); + newline(out); + } + } + + private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction packedValues) throws IOException { + for (int i = start; i < end; ++i) { + BytesRef ref = packedValues.apply(i); + assert ref.length == packedBytesLength; + + for(int dim=0;dim packedValues, int start, int end, int byteOffset) { + BytesRef first = packedValues.apply(start); + byte b = first.bytes[first.offset + byteOffset]; + for (int i = start + 1; i < end; ++i) { + BytesRef ref = packedValues.apply(i); + byte b2 = ref.bytes[ref.offset + byteOffset]; + assert Byte.toUnsignedInt(b2) >= Byte.toUnsignedInt(b); + if (b != b2) { + return i - start; + } + } + return end - start; + } + + @Override + public void close() throws IOException { + if (tempInput != null) { + // NOTE: this should only happen on exception, e.g. caller calls close w/o calling finish: + try { + tempInput.close(); + } finally { + tempDir.deleteFile(tempInput.getName()); + tempInput = null; + } + } + } + + /** Sliced reference to points in an OfflineSorter.ByteSequencesWriter file. */ + private static final class PathSlice { + final PointWriter writer; + final long start; + final long count; + + public PathSlice(PointWriter writer, long start, long count) { + this.writer = writer; + this.start = start; + this.count = count; + } + + @Override + public String toString() { + return "PathSlice(start=" + start + " count=" + count + " writer=" + writer + ")"; + } + } + + /** Called on exception, to check whether the checksum is also corrupt in this source, and add that + * information (checksum matched or didn't) as a suppressed exception. */ + private void verifyChecksum(Throwable priorException, PointWriter writer) throws IOException { + // TODO: we could improve this, to always validate checksum as we recurse, if we shared left and + // right reader after recursing to children, and possibly within recursed children, + // since all together they make a single pass through the file. But this is a sizable re-org, + // and would mean leaving readers (IndexInputs) open for longer: + if (writer instanceof OfflinePointWriter) { + // We are reading from a temp file; go verify the checksum: + String tempFileName = ((OfflinePointWriter) writer).name; + try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) { + CodecUtil.checkFooter(in, priorException); + } + } else { + // We are reading from heap; nothing to add: + IOUtils.reThrow(priorException); + } + } + + /** Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue). 
*/ + private byte[] markRightTree(long rightCount, int splitDim, PathSlice source, LongBitSet ordBitSet) throws IOException { + + // Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim: + + // Read the split value, then mark all ords in the right tree (larger than the split value): + + // TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work? + try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) { + boolean result = reader.next(); + assert result; + System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim); + if (numDims > 1) { + assert ordBitSet.get(reader.ord()) == false; + ordBitSet.set(reader.ord()); + // Subtract 1 from rightCount because we already did the first value above (so we could record the split value): + reader.markOrds(rightCount-1, ordBitSet); + } + } catch (Throwable t) { + verifyChecksum(t, source.writer); + } + + return scratch1; + } + + /** Called only in assert */ + private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) { + for(int dim=0;dim 0) { + return false; + } + } + + return true; + } + + protected int split(byte[] minPackedValue, byte[] maxPackedValue) { + // Find which dim has the largest span so we can split on it: + int splitDim = -1; + for(int dim=0;dim 0) { + System.arraycopy(scratchDiff, 0, scratch1, 0, bytesPerDim); + splitDim = dim; + } + } + + //System.out.println("SPLIT: " + splitDim); + return splitDim; + } + + /** Pull a partition back into heap once the point count is low enough while recursing. */ + private PathSlice switchToHeap(PathSlice source, List toCloseHeroically) throws IOException { + int count = Math.toIntExact(source.count); + // Not inside the try because we don't want to close it here: + PointReader reader = source.writer.getSharedReader(source.start, source.count, toCloseHeroically); + try (PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds, singleValuePerDoc)) { + for(int i=0;i= leafNodeOffset) { + // leaf node + final int count = to - from; + assert count <= maxPointsInLeafNode; + + // Compute common prefixes + Arrays.fill(commonPrefixLengths, bytesPerDim); + reader.getValue(from, scratchBytesRef1); + for (int i = from + 1; i < to; ++i) { + reader.getValue(i, scratchBytesRef2); + for (int dim=0;dim packedValues = new IntFunction() { + @Override + public BytesRef apply(int i) { + reader.getValue(from + i, scratchBytesRef1); + return scratchBytesRef1; + } + }; + assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, + docIDs, 0); + writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues); + + } else { + // inner node + + // compute the split dimension and partition around it + final int splitDim = split(minPackedValue, maxPackedValue); + final int mid = (from + to + 1) >>> 1; + + int commonPrefixLen = bytesPerDim; + for (int i = 0; i < bytesPerDim; ++i) { + if (minPackedValue[splitDim * bytesPerDim + i] != maxPackedValue[splitDim * bytesPerDim + i]) { + commonPrefixLen = i; + break; + } + } + MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen, + reader, from, to, mid, scratchBytesRef1, scratchBytesRef2); + + // set the split value + final int address = nodeID * (1+bytesPerDim); + splitPackedValues[address] = (byte) splitDim; + reader.getValue(mid, 
scratchBytesRef1); + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim); + + byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength); + byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength); + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, + minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim); + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, + maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim); + + // recurse + build(nodeID * 2, leafNodeOffset, reader, from, mid, out, + minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); + build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out, + minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); + } + } + + /** The array (sized numDims) of PathSlice describe the cell we have currently recursed to. */ + private void build(int nodeID, int leafNodeOffset, + PathSlice[] slices, + LongBitSet ordBitSet, + IndexOutput out, + byte[] minPackedValue, byte[] maxPackedValue, + byte[] splitPackedValues, + long[] leafBlockFPs, + List toCloseHeroically) throws IOException { + + for(PathSlice slice : slices) { + assert slice.count == slices[0].count; + } + + if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) { + // Special case for 1D, to cutover to heap once we recurse deeply enough: + slices[0] = switchToHeap(slices[0], toCloseHeroically); + } + + if (nodeID >= leafNodeOffset) { + + // Leaf node: write block + // We can write the block in any order so by default we write it sorted by the dimension that has the + // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient + int sortedDim = 0; + int sortedDimCardinality = Integer.MAX_VALUE; + + for (int dim=0;dim= maxPointsInLeafNode, so we better be in heap at this point: + HeapPointWriter heapSource = (HeapPointWriter) source.writer; + + // Save the block file pointer: + leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer(); + //System.out.println(" write leaf block @ fp=" + out.getFilePointer()); + + // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o + // loading the values: + int count = Math.toIntExact(source.count); + assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset; + writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count); + + // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us + // from the index, much like how terms dict does so from the FST: + + // Write the full values: + IntFunction packedValues = new IntFunction() { + final BytesRef scratch = new BytesRef(); + + { + scratch.length = packedBytesLength; + } + + @Override + public BytesRef apply(int i) { + heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch); + return scratch; + } + }; + assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, + heapSource.docIDs, Math.toIntExact(source.start)); + writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues); + + } else { + // Inner node: partition/recurse + + int splitDim; + if (numDims > 1) { + splitDim = split(minPackedValue, maxPackedValue); + } else { 
+ splitDim = 0; + } + + PathSlice source = slices[splitDim]; + + assert nodeID < splitPackedValues.length: "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length; + + // How many points will be in the left tree: + long rightCount = source.count / 2; + long leftCount = source.count - rightCount; + + byte[] splitValue = markRightTree(rightCount, splitDim, source, ordBitSet); + int address = nodeID * (1+bytesPerDim); + splitPackedValues[address] = (byte) splitDim; + System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim); + + // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse: + + PathSlice[] leftSlices = new PathSlice[numDims]; + PathSlice[] rightSlices = new PathSlice[numDims]; + + byte[] minSplitPackedValue = new byte[packedBytesLength]; + System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedBytesLength); + + byte[] maxSplitPackedValue = new byte[packedBytesLength]; + System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedBytesLength); + + // When we are on this dim, below, we clear the ordBitSet: + int dimToClear; + if (numDims - 1 == splitDim) { + dimToClear = numDims - 2; + } else { + dimToClear = numDims - 1; + } + + for(int dim=0;dim values, int[] docs, int docsOffset) throws IOException { + byte[] lastPackedValue = new byte[packedBytesLength]; + int lastDoc = -1; + for (int i=0;i 0) { + int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, dimOffset, packedValue, packedValueOffset + dimOffset); + if (cmp > 0) { + throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); + } + if (cmp == 0 && doc < lastDoc) { + throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord); + } + } + System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, packedBytesLength); + return true; + } + + PointWriter getPointWriter(long count, String desc) throws IOException { + if (count <= maxPointsSortInHeap) { + int size = Math.toIntExact(count); + return new HeapPointWriter(size, size, packedBytesLength, longOrds, singleValuePerDoc); + } else { + return new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, desc, count, singleValuePerDoc); + } + } + + private void write(IndexOutput out, String s) throws IOException { + SimpleTextUtil.write(out, s, scratch); + } + + private void writeInt(IndexOutput out, int x) throws IOException { + SimpleTextUtil.write(out, Integer.toString(x), scratch); + } + + private void writeLong(IndexOutput out, long x) throws IOException { + SimpleTextUtil.write(out, Long.toString(x), scratch); + } + + private void write(IndexOutput out, BytesRef b) throws IOException { + SimpleTextUtil.write(out, b); + } + + private void newline(IndexOutput out) throws IOException { + SimpleTextUtil.writeNewline(out); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java index f7ff16ecbc2..453bd2384b2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java @@ -36,7 +36,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import 
org.apache.lucene.util.IOUtils; import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.bkd.BKDReader; import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP; import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM; @@ -58,7 +57,7 @@ class SimpleTextPointsReader extends PointsReader { private final IndexInput dataIn; final SegmentReadState readState; - final Map readers = new HashMap<>(); + final Map readers = new HashMap<>(); final BytesRefBuilder scratch = new BytesRefBuilder(); public SimpleTextPointsReader(SegmentReadState readState) throws IOException { @@ -98,7 +97,7 @@ class SimpleTextPointsReader extends PointsReader { this.readState = readState; } - private BKDReader initReader(long fp) throws IOException { + private SimpleTextBKDReader initReader(long fp) throws IOException { // NOTE: matches what writeIndex does in SimpleTextPointsWriter dataIn.seek(fp); readLine(dataIn); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java index c06c128d154..9d2db890fa0 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java @@ -20,7 +20,6 @@ package org.apache.lucene.codecs.simpletext; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import java.util.function.IntFunction; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PointsWriter; @@ -33,29 +32,28 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.bkd.BKDWriter; class SimpleTextPointsWriter extends PointsWriter { - final static BytesRef NUM_DIMS = new BytesRef("num dims "); - final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim "); - final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points "); - final static BytesRef INDEX_COUNT = new BytesRef("index count "); - final static BytesRef BLOCK_COUNT = new BytesRef("block count "); - final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc "); - final static BytesRef BLOCK_FP = new BytesRef(" block fp "); - final static BytesRef BLOCK_VALUE = new BytesRef(" block value "); - final static BytesRef SPLIT_COUNT = new BytesRef("split count "); - final static BytesRef SPLIT_DIM = new BytesRef(" split dim "); - final static BytesRef SPLIT_VALUE = new BytesRef(" split value "); - final static BytesRef FIELD_COUNT = new BytesRef("field count "); - final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name "); - final static BytesRef FIELD_FP = new BytesRef(" field fp "); - final static BytesRef MIN_VALUE = new BytesRef("min value "); - final static BytesRef MAX_VALUE = new BytesRef("max value "); - final static BytesRef POINT_COUNT = new BytesRef("point count "); - final static BytesRef DOC_COUNT = new BytesRef("doc count "); - final static BytesRef END = new BytesRef("END"); + public final static BytesRef NUM_DIMS = new BytesRef("num dims "); + public final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim "); + public final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points "); + public final static BytesRef INDEX_COUNT = new BytesRef("index count "); + public final static BytesRef 
BLOCK_COUNT = new BytesRef("block count "); + public final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc "); + public final static BytesRef BLOCK_FP = new BytesRef(" block fp "); + public final static BytesRef BLOCK_VALUE = new BytesRef(" block value "); + public final static BytesRef SPLIT_COUNT = new BytesRef("split count "); + public final static BytesRef SPLIT_DIM = new BytesRef(" split dim "); + public final static BytesRef SPLIT_VALUE = new BytesRef(" split value "); + public final static BytesRef FIELD_COUNT = new BytesRef("field count "); + public final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name "); + public final static BytesRef FIELD_FP = new BytesRef(" field fp "); + public final static BytesRef MIN_VALUE = new BytesRef("min value "); + public final static BytesRef MAX_VALUE = new BytesRef("max value "); + public final static BytesRef POINT_COUNT = new BytesRef("point count "); + public final static BytesRef DOC_COUNT = new BytesRef("doc count "); + public final static BytesRef END = new BytesRef("END"); private IndexOutput dataOut; final BytesRefBuilder scratch = new BytesRefBuilder(); @@ -75,105 +73,15 @@ class SimpleTextPointsWriter extends PointsWriter { boolean singleValuePerDoc = values.size() == values.getDocCount(); // We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk: - try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(), - writeState.directory, - writeState.segmentInfo.name, - fieldInfo.getPointDimensionCount(), - fieldInfo.getPointNumBytes(), - BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, - BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, - values.size(), - singleValuePerDoc) { - - @Override - protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { - write(out, NUM_DIMS); - writeInt(out, numDims); - newline(out); - - write(out, BYTES_PER_DIM); - writeInt(out, bytesPerDim); - newline(out); - - write(out, MAX_LEAF_POINTS); - writeInt(out, maxPointsInLeafNode); - newline(out); - - write(out, INDEX_COUNT); - writeInt(out, leafBlockFPs.length); - newline(out); - - write(out, MIN_VALUE); - BytesRef br = new BytesRef(minPackedValue, 0, minPackedValue.length); - write(out, br.toString()); - newline(out); - - write(out, MAX_VALUE); - br = new BytesRef(maxPackedValue, 0, maxPackedValue.length); - write(out, br.toString()); - newline(out); - - write(out, POINT_COUNT); - writeLong(out, pointCount); - newline(out); - - write(out, DOC_COUNT); - writeInt(out, docsSeen.cardinality()); - newline(out); - - for(int i=0;i packedValues) throws IOException { - for (int i = 0; i < count; ++i) { - BytesRef packedValue = packedValues.apply(i); - // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths - write(out, BLOCK_VALUE); - write(out, packedValue.toString()); - newline(out); - } - } - }) { + try (SimpleTextBKDWriter writer = new SimpleTextBKDWriter(writeState.segmentInfo.maxDoc(), + writeState.directory, + writeState.segmentInfo.name, + fieldInfo.getPointDimensionCount(), + fieldInfo.getPointNumBytes(), + SimpleTextBKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, + SimpleTextBKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, + values.size(), + singleValuePerDoc)) { values.intersect(new IntersectVisitor() { @Override @@ -198,26 +106,6 @@ class SimpleTextPointsWriter extends PointsWriter { } } - private void write(IndexOutput out, String s) throws IOException { - SimpleTextUtil.write(out, s, scratch); - } - - private void writeInt(IndexOutput out, int x) throws 
IOException { - SimpleTextUtil.write(out, Integer.toString(x), scratch); - } - - private void writeLong(IndexOutput out, long x) throws IOException { - SimpleTextUtil.write(out, Long.toString(x), scratch); - } - - private void write(IndexOutput out, BytesRef b) throws IOException { - SimpleTextUtil.write(out, b); - } - - private void newline(IndexOutput out) throws IOException { - SimpleTextUtil.writeNewline(out); - } - @Override public void finish() throws IOException { SimpleTextUtil.write(dataOut, END); @@ -250,4 +138,24 @@ class SimpleTextPointsWriter extends PointsWriter { } } } + + private void write(IndexOutput out, String s) throws IOException { + SimpleTextUtil.write(out, s, scratch); + } + + private void writeInt(IndexOutput out, int x) throws IOException { + SimpleTextUtil.write(out, Integer.toString(x), scratch); + } + + private void writeLong(IndexOutput out, long x) throws IOException { + SimpleTextUtil.write(out, Long.toString(x), scratch); + } + + private void write(IndexOutput out, BytesRef b) throws IOException { + SimpleTextUtil.write(out, b); + } + + private void newline(IndexOutput out) throws IOException { + SimpleTextUtil.writeNewline(out); + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java index e558d0d4fa8..1d2285c73b6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java @@ -28,7 +28,8 @@ import org.apache.lucene.index.SegmentWriteState; /** * Lucene 6.0 point format, which encodes dimensional values in a block KD-tree structure - * for fast shape intersection filtering. See this paper for details. + * for fast 1D range and N dimesional shape intersection filtering. + * See this paper for details. * *
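For orientation, a minimal, self-contained sketch of the kind of indexing and search this format ultimately serves is shown below; it goes through the standard IntPoint helpers rather than this codec class directly, and the class name PointsFormatDemo, the field name "price", and the sample values are placeholders.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class PointsFormatDemo {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      for (int value = 0; value < 100; value++) {
        Document doc = new Document();
        // Each 1D int value is encoded by the points format into the block KD-tree:
        doc.add(new IntPoint("price", value));
        writer.addDocument(doc);
      }
      writer.commit();
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // The range query is answered by walking the tree and visiting matching leaf blocks:
        long hits = searcher.count(IntPoint.newRangeQuery("price", 10, 20));
        System.out.println("hits=" + hits);  // both bounds are inclusive, so 11 is expected
      }
    }
  }
}

The data structure that makes such queries cheap is described next.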

This data structure is written as a series of blocks on disk, with an in-memory perfectly balanced * binary tree of split values referencing those blocks at the leaves. @@ -50,10 +51,13 @@ import org.apache.lucene.index.SegmentWriteState; *

  • maxPointsInLeafNode (vInt) *
  • bytesPerDim (vInt) *
  • count (vInt) - *
  • byte[bytesPerDim]^count (packed byte[] all split values) - *
  • delta-blockFP (vLong)^count (delta-coded file pointers to the on-disk leaf blocks) + *
  • packed index (byte[]) * * + *

    The packed index uses hierarchical delta and prefix coding to compactly encode the file pointer for + * all leaf blocks, once the tree is traversed, as well as the split dimension and split value for each + * inner node of the tree. + * *
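To make the delta part of that description concrete, the stand-alone sketch below delta-codes a handful of made-up, already-sorted leaf file pointers and then restores them with a running sum; the real packed index is more elaborate, since it is hierarchical and also prefix-codes the split values.

public class DeltaCodingSketch {
  public static void main(String[] args) {
    long[] leafBlockFPs = {1024, 1536, 2048, 4096};  // hypothetical sorted leaf file pointers

    // Encode: keep only the difference to the previous pointer; small deltas need few vLong bytes.
    long[] deltas = new long[leafBlockFPs.length];
    long previous = 0;
    for (int i = 0; i < leafBlockFPs.length; i++) {
      deltas[i] = leafBlockFPs[i] - previous;
      previous = leafBlockFPs[i];
    }

    // Decode: a running sum restores the absolute file pointers.
    long fp = 0;
    for (int i = 0; i < deltas.length; i++) {
      fp += deltas[i];
      if (fp != leafBlockFPs[i]) {
        throw new AssertionError("decode mismatch at " + i);
      }
    }
    System.out.println(java.util.Arrays.toString(deltas));  // prints [1024, 512, 512, 2048]
  }
}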

    After all fields blocks + index data are written, {@link CodecUtil#writeFooter} writes the checksum. * *
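The footer convention referenced here is shared by all codec files; a minimal sketch of the write-then-verify pattern follows, where the in-memory directory, the file name demo.dat, and the payload are chosen only for illustration.

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;

public class FooterSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory()) {
      try (IndexOutput out = dir.createOutput("demo.dat", IOContext.DEFAULT)) {
        out.writeVInt(42);           // some payload
        CodecUtil.writeFooter(out);  // appends footer magic, checksum algorithm id and checksum
      }
      try (IndexInput in = dir.openInput("demo.dat", IOContext.DEFAULT)) {
        CodecUtil.checksumEntireFile(in);  // throws CorruptIndexException if the bytes were damaged
      }
    }
  }
}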

    The .dii file records the file pointer in the .dim file where each field's diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java index 8968a6d624c..a914001d9d2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java @@ -16,7 +16,7 @@ */ /** - * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene62} - * for an overview of the index format. + * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70} + * for an overview of the current index format. */ package org.apache.lucene.codecs.lucene60; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java index 2fe2dc74b4a..fb556732d08 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java @@ -17,8 +17,8 @@ /** * Components from the Lucene 6.2 index format - * See {@link org.apache.lucene.codecs.lucene62} for an overview - * of the index format. + * See {@link org.apache.lucene.codecs.lucene70} for an overview + * of the current index format. */ package org.apache.lucene.codecs.lucene62; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java index 9b432f7c4f4..cab2859766e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java @@ -185,6 +185,12 @@ * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}. * An optional file indicating which documents are live. *

  • + *
  • + * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}. + * Optional pair of files, recording dimensionally indexed fields, to enable fast + * numeric range filtering, indexing of large numeric values like BigInteger and BigDecimal (1D), + * and geographic shape intersection (2D, 3D). + *
  • * *

    Details on each of these are provided in their linked pages.
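As a small illustration of the point values entry added above, the sketch below (class name, field name and coordinates are made up) indexes a single two-dimensional point and then reports, per leaf, which fields carry point data; the codec writes the corresponding .dim/.dii files behind the scenes.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class ListPointFieldsSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory()) {
      try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
        Document doc = new Document();
        doc.add(new DoublePoint("location", 52.52, 13.40));  // a 2D point
        writer.addDocument(doc);
      }
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        for (LeafReaderContext ctx : reader.leaves()) {
          for (FieldInfo fi : ctx.reader().getFieldInfos()) {
            if (fi.getPointDimensionCount() > 0) {
              System.out.println(fi.name + ": dims=" + fi.getPointDimensionCount()
                  + ", bytesPerDim=" + fi.getPointNumBytes());
            }
          }
        }
      }
    }
  }
}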

    * @@ -300,7 +306,12 @@ * * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} * .liv - * Info about what files are live + * Info about what documents are live + * + * + * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values} + * .dii, .dim + * Holds indexed points, if any * * * @@ -374,6 +385,8 @@ * that is suitable for faceting/sorting/analytics. *
  • In version 5.4, DocValues have been improved to store more information on disk: * addresses for binary fields and ord indexes for multi-valued fields. + *
  • In version 6.0, Points were added for multi-dimensional range/distance search. + *
  • In version 6.2, a new segment info format was introduced that reads/writes the index sort, to support index sorting. *
  • In version 7.0, DocValues have been improved to better support sparse doc values * thanks to an iterator API. *
  • diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 7bc08f3c4a8..fd8011d4d07 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1801,161 +1801,32 @@ public final class CheckIndex implements Closeable { } for (FieldInfo fieldInfo : fieldInfos) { if (fieldInfo.getPointDimensionCount() > 0) { - FixedBitSet docsSeen = new FixedBitSet(reader.maxDoc()); - status.totalValueFields++; - int dimCount = fieldInfo.getPointDimensionCount(); - int bytesPerDim = fieldInfo.getPointNumBytes(); - int packedBytesCount = dimCount * bytesPerDim; - byte[] lastMinPackedValue = new byte[packedBytesCount]; - byte[] lastMaxPackedValue = new byte[packedBytesCount]; - BytesRef scratch = new BytesRef(); - scratch.length = bytesPerDim; - byte[] lastPackedValue = new byte[packedBytesCount]; - - long[] pointCountSeen = new long[1]; - PointValues values = pointsReader.getValues(fieldInfo.name); if (values == null) { continue; } - byte[] globalMinPackedValue = values.getMinPackedValue(); + + status.totalValueFields++; + long size = values.size(); int docCount = values.getDocCount(); - if (docCount > size) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points and inconsistent docCount=" + docCount); + VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldInfo.name, reader.maxDoc(), values); + values.intersect(visitor); + + if (visitor.getPointCountSeen() != size) { + throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + visitor.getPointCountSeen()); } - if (docCount > reader.maxDoc()) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but that's greater than maxDoc=" + reader.maxDoc()); + if (visitor.getDocCountSeen() != docCount) { + throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + visitor.getDocCountSeen()); } - if (globalMinPackedValue == null) { - if (size != 0) { - throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size); - } - } else if (globalMinPackedValue.length != packedBytesCount) { - throw new RuntimeException("getMinPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount); - } - byte[] globalMaxPackedValue = values.getMaxPackedValue(); - if (globalMaxPackedValue == null) { - if (size != 0) { - throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size); - } - } else if (globalMaxPackedValue.length != packedBytesCount) { - throw new RuntimeException("getMaxPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount); - } - - values.intersect(new PointValues.IntersectVisitor() { - - private int lastDocID = -1; - - @Override - public void visit(int docID) { - throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID); - } - - @Override - public void visit(int docID, byte[] packedValue) { - checkPackedValue("packed value", packedValue, docID); - pointCountSeen[0]++; - docsSeen.set(docID); - 
- for(int dim=0;dim 0) { - throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + - " is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - - // Make sure this cell is not outside of the global min/max: - if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) { - throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + - " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - - if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) { - throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + - " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - - if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) { - throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + - " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) { - throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + - " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - } - - // We always pretend the query shape is so complex that it crosses every cell, so - // that packedValue is passed for every document - return PointValues.Relation.CELL_CROSSES_QUERY; - } - - private void checkPackedValue(String desc, byte[] packedValue, int docID) { - if (packedValue == null) { - throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldInfo.name + "\""); - } - - if (packedValue.length != packedBytesCount) { - throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldInfo.name + "\""); - } - } - }); - - if (pointCountSeen[0] != size) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + pointCountSeen[0]); - } - - if (docsSeen.cardinality() != docCount) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + docsSeen.cardinality()); - } + status.totalValuePoints += visitor.getPointCountSeen(); } } } + msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d points] [took %.3f sec]", status.totalValueFields, status.totalValuePoints, nsToSec(System.nanoTime()-startNS))); } catch (Throwable e) { @@ -1972,6 +1843,167 @@ public final class CheckIndex implements Closeable { return status; } + /** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries. 
+ * + * @lucene.internal */ + public static class VerifyPointsVisitor implements PointValues.IntersectVisitor { + private long pointCountSeen; + private int lastDocID = -1; + private final int maxDoc; + private final FixedBitSet docsSeen; + private final byte[] lastMinPackedValue; + private final byte[] lastMaxPackedValue; + private final byte[] lastPackedValue; + private final byte[] globalMinPackedValue; + private final byte[] globalMaxPackedValue; + private final int packedBytesCount; + private final int numDims; + private final int bytesPerDim; + private final String fieldName; + + /** Sole constructor */ + public VerifyPointsVisitor(String fieldName, int maxDoc, PointValues values) throws IOException { + this.maxDoc = maxDoc; + this.fieldName = fieldName; + numDims = values.getNumDimensions(); + bytesPerDim = values.getBytesPerDimension(); + packedBytesCount = numDims * bytesPerDim; + globalMinPackedValue = values.getMinPackedValue(); + globalMaxPackedValue = values.getMaxPackedValue(); + docsSeen = new FixedBitSet(maxDoc); + lastMinPackedValue = new byte[packedBytesCount]; + lastMaxPackedValue = new byte[packedBytesCount]; + lastPackedValue = new byte[packedBytesCount]; + + if (values.getDocCount() > values.size()) { + throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have size=" + values.size() + " points and inconsistent docCount=" + values.getDocCount()); + } + + if (values.getDocCount() > maxDoc) { + throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have docCount=" + values.getDocCount() + " but that's greater than maxDoc=" + maxDoc); + } + + if (globalMinPackedValue == null) { + if (values.size() != 0) { + throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size()); + } + } else if (globalMinPackedValue.length != packedBytesCount) { + throw new RuntimeException("getMinPackedValue for field \"" + fieldName + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount); + } + if (globalMaxPackedValue == null) { + if (values.size() != 0) { + throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size()); + } + } else if (globalMaxPackedValue.length != packedBytesCount) { + throw new RuntimeException("getMaxPackedValue for field \"" + fieldName + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount); + } + } + + /** Returns total number of points in this BKD tree */ + public long getPointCountSeen() { + return pointCountSeen; + } + + /** Returns total number of unique docIDs in this BKD tree */ + public long getDocCountSeen() { + return docsSeen.cardinality(); + } + + @Override + public void visit(int docID) { + throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID); + } + + @Override + public void visit(int docID, byte[] packedValue) { + checkPackedValue("packed value", packedValue, docID); + pointCountSeen++; + docsSeen.set(docID); + + for(int dim=0;dim 0) { + throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + + " is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + + // Make sure this cell is not outside of the global min/max: + if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) { + throw 
new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + + " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + + if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) { + throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + + " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + + if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) { + throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + + " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) { + throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + + " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + } + + // We always pretend the query shape is so complex that it crosses every cell, so + // that packedValue is passed for every document + return PointValues.Relation.CELL_CROSSES_QUERY; + } + + private void checkPackedValue(String desc, byte[] packedValue, int docID) { + if (packedValue == null) { + throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldName + "\""); + } + + if (packedValue.length != packedBytesCount) { + throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldName + "\""); + } + } + } + + /** * Test stored fields. 
* @lucene.experimental diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index 6bf7dfc1a86..6cccf4cf1d1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -17,14 +17,15 @@ package org.apache.lucene.util.bkd; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.PointValues; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.MathUtil; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; @@ -32,14 +33,12 @@ import org.apache.lucene.util.StringHelper; * * @lucene.experimental */ -public class BKDReader extends PointValues implements Accountable { +public final class BKDReader extends PointValues implements Accountable { // Packed array of byte[] holding all split values in the full binary tree: - final private byte[] splitPackedValues; - final long[] leafBlockFPs; - final private int leafNodeOffset; + final int leafNodeOffset; final int numDims; final int bytesPerDim; - final int bytesPerIndexEntry; + final int numLeaves; final IndexInput in; final int maxPointsInLeafNode; final byte[] minPackedValue; @@ -49,6 +48,14 @@ public class BKDReader extends PointValues implements Accountable { final int version; protected final int packedBytesLength; + // Used for 6.4.0+ index format: + final byte[] packedIndex; + + // Used for Legacy (pre-6.4.0) index format, to hold a compact form of the index: + final private byte[] splitPackedValues; + final int bytesPerIndexEntry; + final long[] leafBlockFPs; + /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */ public BKDReader(IndexInput in) throws IOException { version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT); @@ -59,7 +66,7 @@ public class BKDReader extends PointValues implements Accountable { packedBytesLength = numDims * bytesPerDim; // Read index: - int numLeaves = in.readVInt(); + numLeaves = in.readVInt(); assert numLeaves > 0; leafNodeOffset = numLeaves; @@ -78,203 +85,378 @@ public class BKDReader extends PointValues implements Accountable { pointCount = in.readVLong(); docCount = in.readVInt(); - splitPackedValues = new byte[bytesPerIndexEntry*numLeaves]; - - // TODO: don't write split packed values[0]! - in.readBytes(splitPackedValues, 0, splitPackedValues.length); - - // Read the file pointers to the start of each leaf block: - long[] leafBlockFPs = new long[numLeaves]; - long lastFP = 0; - for(int i=0;i 1) { - //System.out.println("BKDR: numLeaves=" + numLeaves); - int levelCount = 2; - while (true) { - //System.out.println(" cycle levelCount=" + levelCount); - if (numLeaves >= levelCount && numLeaves <= 2*levelCount) { - int lastLevel = 2*(numLeaves - levelCount); - assert lastLevel >= 0; - /* - System.out.println("BKDR: lastLevel=" + lastLevel + " vs " + levelCount); - System.out.println("FPs before:"); - for(int i=0;i= maxDoc) { - throw new RuntimeException("docID=" + docID + " is out of bounds of 0.." 
+ maxDoc); - } - for(int dim=0;dim 0) { - throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is less than this leaf block's minimum=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim)); - } - if (StringHelper.compare(bytesPerDim, cellMaxPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) < 0) { - throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is greater than this leaf block's maximum=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim)); - } - } - - if (numDims == 1) { - // With only 1D, all values should always be in sorted order - if (lastPackedValue == null) { - lastPackedValue = Arrays.copyOf(packedValue, packedValue.length); - } else if (StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) { - throw new RuntimeException("value=" + new BytesRef(packedValue) + " for docID=" + docID + " dim=0" + " sorts before last value=" + new BytesRef(lastPackedValue)); - } else { - System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim); - } - } - } - - @Override - public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - throw new UnsupportedOperationException(); - } - } - - /** Only used for debugging, to make sure all values in each leaf block fall within the range expected by the index */ - // TODO: maybe we can get this into CheckIndex? - public void verify(int maxDoc) throws IOException { - //System.out.println("BKDR.verify this=" + this); - // Visits every doc in every leaf block and confirms that - // their values agree with the index: - byte[] rootMinPacked = new byte[packedBytesLength]; - byte[] rootMaxPacked = new byte[packedBytesLength]; - Arrays.fill(rootMaxPacked, (byte) 0xff); - verify(getIntersectState(new VerifyVisitor(numDims, bytesPerDim, maxDoc)), 1, rootMinPacked, rootMaxPacked); - } - - private void verify(IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException { - - if (nodeID >= leafNodeOffset) { - int leafID = nodeID - leafNodeOffset; - - // In the unbalanced case it's possible the left most node only has one child: - if (leafID < leafBlockFPs.length) { - //System.out.println("CHECK nodeID=" + nodeID + " leaf=" + (nodeID-leafNodeOffset) + " offset=" + leafNodeOffset + " fp=" + leafBlockFPs[leafID]); - //System.out.println("BKDR.verify leafID=" + leafID + " nodeID=" + nodeID + " fp=" + leafBlockFPs[leafID] + " min=" + new BytesRef(cellMinPacked) + " max=" + new BytesRef(cellMaxPacked)); - - // Leaf node: check that all values are in fact in bounds: - VerifyVisitor visitor = (VerifyVisitor) state.visitor; - visitor.cellMinPacked = cellMinPacked; - visitor.cellMaxPacked = cellMaxPacked; - - int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); - visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor); - } else { - //System.out.println("BKDR.verify skip leafID=" + leafID); - } + if (version >= BKDWriter.VERSION_PACKED_INDEX) { + int numBytes = in.readVInt(); + packedIndex = new byte[numBytes]; + in.readBytes(packedIndex, 0, numBytes); + leafBlockFPs = null; + splitPackedValues = null; } else { - // Non-leaf node: + // legacy un-packed index - int address = nodeID * bytesPerIndexEntry; - int splitDim; - if (numDims == 1) { - splitDim = 0; - if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) { - // skip over 
wastefully encoded 0 splitDim: - assert splitPackedValues[address] == 0; - address++; + splitPackedValues = new byte[bytesPerIndexEntry*numLeaves]; + + in.readBytes(splitPackedValues, 0, splitPackedValues.length); + + // Read the file pointers to the start of each leaf block: + long[] leafBlockFPs = new long[numLeaves]; + long lastFP = 0; + for(int i=0;i 1) { + int levelCount = 2; + while (true) { + if (numLeaves >= levelCount && numLeaves <= 2*levelCount) { + int lastLevel = 2*(numLeaves - levelCount); + assert lastLevel >= 0; + if (lastLevel != 0) { + // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading + // at read-time, so that we can still delta code them on disk at write: + long[] newLeafBlockFPs = new long[numLeaves]; + System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel); + System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel); + leafBlockFPs = newLeafBlockFPs; + } + break; + } + + levelCount *= 2; } - } else { - splitDim = splitPackedValues[address++] & 0xff; } - assert splitDim < numDims; - - byte[] splitPackedValue = new byte[packedBytesLength]; - - // Recurse on left sub-tree: - System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength); - System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); - verify(state, - 2*nodeID, - cellMinPacked, splitPackedValue); - - // Recurse on right sub-tree: - System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength); - System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); - verify(state, - 2*nodeID+1, - splitPackedValue, cellMaxPacked); + this.leafBlockFPs = leafBlockFPs; + packedIndex = null; } + + this.in = in; + } + + long getMinLeafBlockFP() { + if (packedIndex != null) { + return new ByteArrayDataInput(packedIndex).readVLong(); + } else { + long minFP = Long.MAX_VALUE; + for(long fp : leafBlockFPs) { + minFP = Math.min(minFP, fp); + } + return minFP; + } + } + + /** Used to walk the in-heap index + * + * @lucene.internal */ + public abstract class IndexTree implements Cloneable { + protected int nodeID; + // level is 1-based so that we can do level-1 w/o checking each time: + protected int level; + protected int splitDim; + protected final byte[][] splitPackedValueStack; + + protected IndexTree() { + int treeDepth = getTreeDepth(); + splitPackedValueStack = new byte[treeDepth+1][]; + nodeID = 1; + level = 1; + splitPackedValueStack[level] = new byte[packedBytesLength]; + } + + public void pushLeft() { + nodeID *= 2; + level++; + if (splitPackedValueStack[level] == null) { + splitPackedValueStack[level] = new byte[packedBytesLength]; + } + } + + /** Clone, but you are not allowed to pop up past the point where the clone happened. 
*/ + public abstract IndexTree clone(); + + public void pushRight() { + nodeID = nodeID * 2 + 1; + level++; + if (splitPackedValueStack[level] == null) { + splitPackedValueStack[level] = new byte[packedBytesLength]; + } + } + + public void pop() { + nodeID /= 2; + level--; + splitDim = -1; + //System.out.println(" pop nodeID=" + nodeID); + } + + public boolean isLeafNode() { + return nodeID >= leafNodeOffset; + } + + public boolean nodeExists() { + return nodeID - leafNodeOffset < leafNodeOffset; + } + + public int getNodeID() { + return nodeID; + } + + public byte[] getSplitPackedValue() { + assert isLeafNode() == false; + assert splitPackedValueStack[level] != null: "level=" + level; + return splitPackedValueStack[level]; + } + + /** Only valid after pushLeft or pushRight, not pop! */ + public int getSplitDim() { + assert isLeafNode() == false; + return splitDim; + } + + /** Only valid after pushLeft or pushRight, not pop! */ + public abstract BytesRef getSplitDimValue(); + + /** Only valid after pushLeft or pushRight, not pop! */ + public abstract long getLeafBlockFP(); + } + + /** Reads the original simple yet heap-heavy index format */ + private final class LegacyIndexTree extends IndexTree { + + private long leafBlockFP; + private final byte[] splitDimValue = new byte[bytesPerDim]; + private final BytesRef scratch = new BytesRef(); + + public LegacyIndexTree() { + setNodeData(); + scratch.bytes = splitDimValue; + scratch.length = bytesPerDim; + } + + @Override + public LegacyIndexTree clone() { + LegacyIndexTree index = new LegacyIndexTree(); + index.nodeID = nodeID; + index.level = level; + index.splitDim = splitDim; + index.leafBlockFP = leafBlockFP; + index.splitPackedValueStack[index.level] = splitPackedValueStack[index.level].clone(); + + return index; + } + + @Override + public void pushLeft() { + super.pushLeft(); + setNodeData(); + } + + @Override + public void pushRight() { + super.pushRight(); + setNodeData(); + } + + private void setNodeData() { + if (isLeafNode()) { + leafBlockFP = leafBlockFPs[nodeID - leafNodeOffset]; + splitDim = -1; + } else { + leafBlockFP = -1; + int address = nodeID * bytesPerIndexEntry; + if (numDims == 1) { + splitDim = 0; + if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) { + // skip over wastefully encoded 0 splitDim: + assert splitPackedValues[address] == 0; + address++; + } + } else { + splitDim = splitPackedValues[address++] & 0xff; + } + System.arraycopy(splitPackedValues, address, splitDimValue, 0, bytesPerDim); + } + } + + @Override + public long getLeafBlockFP() { + assert isLeafNode(); + return leafBlockFP; + } + + @Override + public BytesRef getSplitDimValue() { + assert isLeafNode() == false; + return scratch; + } + + @Override + public void pop() { + super.pop(); + leafBlockFP = -1; + } + } + + /** Reads the new packed byte[] index format which can be up to ~63% smaller than the legacy index format on 20M NYC taxis tests. This + * format takes advantage of the limited access pattern to the BKD tree at search time, i.e. starting at the root node and recursing + * downwards one child at a time. 
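// ----------------------------------------------------------------------------
// Editor's sketch, not part of the patch: the "limited access pattern" described
// above, seen from the outside. The cursor starts at the root and moves one child
// at a time via pushLeft()/pushRight()/pop(); here it is used only to count leaf
// blocks. The IndexTree is assumed to come from getIntersectState(visitor).index,
// as in the intersect/addAll code further below.
static long countLeafBlocks(BKDReader.IndexTree index) {
  if (index.isLeafNode()) {
    // in the unbalanced case a leaf slot may not actually exist:
    return index.nodeExists() ? 1 : 0;
  }
  index.pushLeft();
  long leaves = countLeafBlocks(index);
  index.pop();
  index.pushRight();
  leaves += countLeafBlocks(index);
  index.pop();
  return leaves;
}
// ----------------------------------------------------------------------------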
*/ + private final class PackedIndexTree extends IndexTree { + // used to read the packed byte[] + private final ByteArrayDataInput in; + // holds the minimum (left most) leaf block file pointer for each level we've recursed to: + private final long[] leafBlockFPStack; + // holds the address, in the packed byte[] index, of the left-node of each level: + private final int[] leftNodePositions; + // holds the address, in the packed byte[] index, of the right-node of each level: + private final int[] rightNodePositions; + // holds the splitDim for each level: + private final int[] splitDims; + // true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed + // 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we + // split on this dimension, we next pushed to the left sub-tree: + private final boolean[] negativeDeltas; + // holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses: + private final byte[][] splitValuesStack; + // scratch value to return from getPackedValue: + private final BytesRef scratch; + + public PackedIndexTree() { + int treeDepth = getTreeDepth(); + leafBlockFPStack = new long[treeDepth+1]; + leftNodePositions = new int[treeDepth+1]; + rightNodePositions = new int[treeDepth+1]; + splitValuesStack = new byte[treeDepth+1][]; + splitDims = new int[treeDepth+1]; + negativeDeltas = new boolean[numDims*(treeDepth+1)]; + + in = new ByteArrayDataInput(packedIndex); + splitValuesStack[0] = new byte[packedBytesLength]; + readNodeData(false); + scratch = new BytesRef(); + scratch.length = bytesPerDim; + } + + @Override + public PackedIndexTree clone() { + PackedIndexTree index = new PackedIndexTree(); + index.nodeID = nodeID; + index.level = level; + index.splitDim = splitDim; + System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims); + index.leafBlockFPStack[level] = leafBlockFPStack[level]; + index.leftNodePositions[level] = leftNodePositions[level]; + index.rightNodePositions[level] = rightNodePositions[level]; + index.splitValuesStack[index.level] = splitValuesStack[index.level].clone(); + System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims); + index.splitDims[level] = splitDims[level]; + return index; + } + + @Override + public void pushLeft() { + int nodePosition = leftNodePositions[level]; + super.pushLeft(); + System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims); + assert splitDim != -1; + negativeDeltas[level*numDims+splitDim] = true; + in.setPosition(nodePosition); + readNodeData(true); + } + + @Override + public void pushRight() { + int nodePosition = rightNodePositions[level]; + super.pushRight(); + System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims); + assert splitDim != -1; + negativeDeltas[level*numDims+splitDim] = false; + in.setPosition(nodePosition); + readNodeData(false); + } + + @Override + public void pop() { + super.pop(); + splitDim = splitDims[level]; + } + + @Override + public long getLeafBlockFP() { + assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf"; + return leafBlockFPStack[level]; + } + + @Override + public BytesRef getSplitDimValue() { + assert isLeafNode() == false; + scratch.bytes = splitValuesStack[level]; + scratch.offset = splitDim * bytesPerDim; + return scratch; + } + + private void 
readNodeData(boolean isLeft) { + + leafBlockFPStack[level] = leafBlockFPStack[level-1]; + + // read leaf block FP delta + if (isLeft == false) { + leafBlockFPStack[level] += in.readVLong(); + } + + if (isLeafNode()) { + splitDim = -1; + } else { + + // read split dim, prefix, firstDiffByteDelta encoded as int: + int code = in.readVInt(); + splitDim = code % numDims; + splitDims[level] = splitDim; + code /= numDims; + int prefix = code % (1+bytesPerDim); + int suffix = bytesPerDim - prefix; + + if (splitValuesStack[level] == null) { + splitValuesStack[level] = new byte[packedBytesLength]; + } + System.arraycopy(splitValuesStack[level-1], 0, splitValuesStack[level], 0, packedBytesLength); + if (suffix > 0) { + int firstDiffByteDelta = code / (1+bytesPerDim); + if (negativeDeltas[level*numDims + splitDim]) { + firstDiffByteDelta = -firstDiffByteDelta; + } + int oldByte = splitValuesStack[level][splitDim*bytesPerDim+prefix] & 0xFF; + splitValuesStack[level][splitDim*bytesPerDim+prefix] = (byte) (oldByte + firstDiffByteDelta); + in.readBytes(splitValuesStack[level], splitDim*bytesPerDim+prefix+1, suffix-1); + } else { + // our split value is == last split value in this dim, which can happen when there are many duplicate values + } + + int leftNumBytes; + if (nodeID * 2 < leafNodeOffset) { + leftNumBytes = in.readVInt(); + } else { + leftNumBytes = 0; + } + + leftNodePositions[level] = in.getPosition(); + rightNodePositions[level] = leftNodePositions[level] + leftNumBytes; + } + } + } + + private int getTreeDepth() { + // First +1 because all the non-leave nodes makes another power + // of 2; e.g. to have a fully balanced tree with 4 leaves you + // need a depth=3 tree: + + // Second +1 because MathUtil.log computes floor of the logarithm; e.g. + // with 5 leaves you need a depth=4 tree: + return MathUtil.log(numLeaves, 2) + 2; } /** Used to track all state for a single call to {@link #intersect}. */ @@ -285,57 +467,73 @@ public class BKDReader extends PointValues implements Accountable { final int[] commonPrefixLengths; final IntersectVisitor visitor; + public final IndexTree index; public IntersectState(IndexInput in, int numDims, int packedBytesLength, int maxPointsInLeafNode, - IntersectVisitor visitor) { + IntersectVisitor visitor, + IndexTree indexVisitor) { this.in = in; this.visitor = visitor; this.commonPrefixLengths = new int[numDims]; this.scratchDocIDs = new int[maxPointsInLeafNode]; this.scratchPackedValue = new byte[packedBytesLength]; + this.index = indexVisitor; } } public void intersect(IntersectVisitor visitor) throws IOException { - intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue); + intersect(getIntersectState(visitor), minPackedValue, maxPackedValue); } /** Fast path: this is called when the query box fully encompasses all cells under this node. */ - private void addAll(IntersectState state, int nodeID) throws IOException { + private void addAll(IntersectState state) throws IOException { //System.out.println("R: addAll nodeID=" + nodeID); - if (nodeID >= leafNodeOffset) { + if (state.index.isLeafNode()) { //System.out.println("ADDALL"); - visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor); + if (state.index.nodeExists()) { + visitDocIDs(state.in, state.index.getLeafBlockFP(), state.visitor); + } // TODO: we can assert that the first value here in fact matches what the index claimed? 
} else { - addAll(state, 2*nodeID); - addAll(state, 2*nodeID+1); + state.index.pushLeft(); + addAll(state); + state.index.pop(); + + state.index.pushRight(); + addAll(state); + state.index.pop(); } } /** Create a new {@link IntersectState} */ public IntersectState getIntersectState(IntersectVisitor visitor) { + IndexTree index; + if (packedIndex != null) { + index = new PackedIndexTree(); + } else { + index = new LegacyIndexTree(); + } return new IntersectState(in.clone(), numDims, packedBytesLength, maxPointsInLeafNode, - visitor); + visitor, + index); } /** Visits all docIDs and packed values in a single leaf block */ - public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException { - int leafID = nodeID - leafNodeOffset; + public void visitLeafBlockValues(IndexTree index, IntersectState state) throws IOException { // Leaf node; scan and filter all points in this block: - int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); + int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs); // Again, this time reading values and checking with the visitor visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor); } - protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { + private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { // Leaf node in.seek(blockFP); @@ -350,7 +548,7 @@ public class BKDReader extends PointValues implements Accountable { } } - protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { + int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { in.seek(blockFP); // How many points are stored in this leaf cell: @@ -365,7 +563,7 @@ public class BKDReader extends PointValues implements Accountable { return count; } - protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { visitor.grow(count); readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in); @@ -434,13 +632,10 @@ public class BKDReader extends PointValues implements Accountable { } } - private void intersect(IntersectState state, - int nodeID, - byte[] cellMinPacked, byte[] cellMaxPacked) - throws IOException { + private void intersect(IntersectState state, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException { /* - System.out.println("\nR: intersect nodeID=" + nodeID); + System.out.println("\nR: intersect nodeID=" + state.index.getNodeID()); for(int dim=0;dim= 0 && dim < numDims; @@ -1019,46 +1034,238 @@ public class BKDWriter implements Closeable { return indexFP; } - /** Subclass can change how it writes the index. */ - protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { + /** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure. */ + private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { + + int numLeaves = leafBlockFPs.length; + + // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens + // if it was created by OneDimensionBKDWriter). 
In this case the leaf nodes may straddle the two bottom + // levels of the binary tree: + if (numDims == 1 && numLeaves > 1) { + int levelCount = 2; + while (true) { + if (numLeaves >= levelCount && numLeaves <= 2*levelCount) { + int lastLevel = 2*(numLeaves - levelCount); + assert lastLevel >= 0; + if (lastLevel != 0) { + // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading + // at read-time, so that we can still delta code them on disk at write: + long[] newLeafBlockFPs = new long[numLeaves]; + System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel); + System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel); + leafBlockFPs = newLeafBlockFPs; + } + break; + } + + levelCount *= 2; + } + } + + /** Reused while packing the index */ + RAMOutputStream writeBuffer = new RAMOutputStream(); + + // This is the "file" we append the byte[] to: + List blocks = new ArrayList<>(); + byte[] lastSplitValues = new byte[bytesPerDim * numDims]; + //System.out.println("\npack index"); + int totalSize = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, 0l, blocks, 1, lastSplitValues, new boolean[numDims], false); + + // Compact the byte[] blocks into single byte index: + byte[] index = new byte[totalSize]; + int upto = 0; + for(byte[] block : blocks) { + System.arraycopy(block, 0, index, upto, block.length); + upto += block.length; + } + assert upto == totalSize; + + return index; + } + + /** Appends the current contents of writeBuffer as another block on the growing in-memory file */ + private int appendBlock(RAMOutputStream writeBuffer, List blocks) throws IOException { + int pos = Math.toIntExact(writeBuffer.getFilePointer()); + byte[] bytes = new byte[pos]; + writeBuffer.writeTo(bytes, 0); + writeBuffer.reset(); + blocks.add(bytes); + return pos; + } + + /** + * lastSplitValues is per-dimension split value previously seen; we use this to prefix-code the split byte[] on each inner node + */ + private int recursePackIndex(RAMOutputStream writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, List blocks, + int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws IOException { + if (nodeID >= leafBlockFPs.length) { + int leafID = nodeID - leafBlockFPs.length; + //System.out.println("recursePack leaf nodeID=" + nodeID); + + // In the unbalanced case it's possible the left most node only has one child: + if (leafID < leafBlockFPs.length) { + long delta = leafBlockFPs[leafID] - minBlockFP; + if (isLeft) { + assert delta == 0; + return 0; + } else { + assert nodeID == 1 || delta > 0: "nodeID=" + nodeID; + writeBuffer.writeVLong(delta); + return appendBlock(writeBuffer, blocks); + } + } else { + return 0; + } + } else { + long leftBlockFP; + if (isLeft == false) { + leftBlockFP = getLeftMostLeafBlockFP(leafBlockFPs, nodeID); + long delta = leftBlockFP - minBlockFP; + assert nodeID == 1 || delta > 0; + writeBuffer.writeVLong(delta); + } else { + // The left tree's left most leaf block FP is always the minimal FP: + leftBlockFP = minBlockFP; + } + + int address = nodeID * (1+bytesPerDim); + int splitDim = splitPackedValues[address++] & 0xff; + + //System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim)); + + // find common prefix with last split value in this dim: + int prefix = 0; + for(;prefix 0; - 
out.writeVInt(leafBlockFPs.length); + assert numLeaves > 0; + out.writeVInt(numLeaves); out.writeBytes(minPackedValue, 0, packedBytesLength); out.writeBytes(maxPackedValue, 0, packedBytesLength); out.writeVLong(pointCount); out.writeVInt(docsSeen.cardinality()); - - // NOTE: splitPackedValues[0] is unused, because nodeID is 1-based: - if (numDims == 1) { - // write the index, skipping the byte used to store the split dim since it is always 0 - for (int i = 1; i < splitPackedValues.length; i += 1 + bytesPerDim) { - out.writeBytes(splitPackedValues, i, bytesPerDim); - } - } else { - out.writeBytes(splitPackedValues, 0, splitPackedValues.length); - } - - long lastFP = 0; - for (int i=0;i 0: "maxPointsInLeafNode=" + maxPointsInLeafNode; out.writeVInt(count); DocIdsWriter.writeDocIds(docIDs, start, count, out); } - protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction packedValues) throws IOException { + private void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction packedValues) throws IOException { int prefixLenSum = Arrays.stream(commonPrefixLengths).sum(); if (prefixLenSum == packedBytesLength) { // all values in this block are equal @@ -1109,7 +1316,7 @@ public class BKDWriter implements Closeable { return end - start; } - protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { + private void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { for(int dim=0;dim(terms[idx], - outputs.newPair((long) idx, value))); + outputs.newPair((long) idx, value))); } new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java index 3b9f302f5eb..587c63fb7a3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java +++ b/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java @@ -26,7 +26,10 @@ import org.apache.lucene.geo.Rectangle; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.SloppyMath; +import org.apache.lucene.util.bkd.BKDReader.IndexTree; +import org.apache.lucene.util.bkd.BKDReader.IntersectState; import org.apache.lucene.util.bkd.BKDReader; import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude; @@ -41,16 +44,16 @@ class NearestNeighbor { static class Cell implements Comparable { final int readerIndex; - final int nodeID; final byte[] minPacked; final byte[] maxPacked; + final IndexTree index; /** The closest possible distance of all points in this cell */ final double distanceMeters; - public Cell(int readerIndex, int nodeID, byte[] minPacked, byte[] maxPacked, double distanceMeters) { + public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceMeters) { + this.index = index; this.readerIndex = readerIndex; - this.nodeID = nodeID; this.minPacked = minPacked.clone(); this.maxPacked = maxPacked.clone(); this.distanceMeters = distanceMeters; @@ -66,7 +69,7 @@ class NearestNeighbor { double minLon = decodeLongitude(minPacked, Integer.BYTES); double maxLat = decodeLatitude(maxPacked, 0); double maxLon = 
decodeLongitude(maxPacked, Integer.BYTES); - return "Cell(readerIndex=" + readerIndex + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")"; + return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")"; } } @@ -219,13 +222,21 @@ class NearestNeighbor { List states = new ArrayList<>(); // Add root cell for each reader into the queue: + int bytesPerDim = -1; + for(int i=0;i Date: Mon, 5 Dec 2016 06:45:16 -0500 Subject: [PATCH 15/53] LUCENE-7563: remove redundant array copy in PackedIndexTree.clone --- lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index 6cccf4cf1d1..44744c181a3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -347,7 +347,6 @@ public final class BKDReader extends PointValues implements Accountable { index.nodeID = nodeID; index.level = level; index.splitDim = splitDim; - System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims); index.leafBlockFPStack[level] = leafBlockFPStack[level]; index.leftNodePositions[level] = leftNodePositions[level]; index.rightNodePositions[level] = rightNodePositions[level]; From 2e948fea300f883b7dfb586e303d5720d09b3210 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Mon, 5 Dec 2016 16:11:57 -0500 Subject: [PATCH 16/53] LUCENE-7575: Add UnifiedHighlighter field matcher predicate (AKA requireFieldMatch=false) --- lucene/CHANGES.txt | 4 + .../uhighlight/MemoryIndexOffsetStrategy.java | 10 +- .../uhighlight/MultiTermHighlighting.java | 37 +-- .../search/uhighlight/PhraseHelper.java | 158 +++++++--- .../search/uhighlight/UnifiedHighlighter.java | 64 ++-- .../uhighlight/TestUnifiedHighlighter.java | 275 ++++++++++++++++++ .../TestUnifiedHighlighterExtensibility.java | 3 +- 7 files changed, 467 insertions(+), 84 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 79e44e112c8..c6c39ac45fb 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -121,6 +121,10 @@ Improvements control how text is analyzed and converted into a query (Matt Weber via Mike McCandless) +* LUCENE-7575: UnifiedHighlighter can now highlight fields with queries that don't + necessarily refer to that field (AKA requireFieldMatch==false). Disabled by default. + See UH get/setFieldMatcher. 
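  A minimal usage sketch for this entry (editor's illustration, not from the patch;
  the searcher, analyzer, query, topDocs and the "body" field are assumed, and
  checked exceptions are omitted):

    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
    // accept query terms from any field while highlighting "body", i.e. requireFieldMatch == false:
    highlighter.setFieldMatcher(field -> true);
    String[] fragments = highlighter.highlight("body", query, topDocs);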
(Jim Ferenczi via David Smiley) + Optimizations * LUCENE-7568: Optimize merging when index sorting is used but the diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index 4028912fcf0..0001a801f8c 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -23,6 +23,7 @@ import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.function.Function; +import java.util.function.Predicate; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.FilteringTokenFilter; @@ -49,7 +50,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { private final LeafReader leafReader; private final CharacterRunAutomaton preMemIndexFilterAutomaton; - public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper, + public MemoryIndexOffsetStrategy(String field, Predicate fieldMatcher, BytesRef[] extractedTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer, Function> multiTermQueryRewrite) { super(field, extractedTerms, phraseHelper, automata, analyzer); @@ -57,13 +58,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable // preFilter for MemoryIndex - preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite); + preMemIndexFilterAutomaton = buildCombinedAutomaton(fieldMatcher, terms, this.automata, phraseHelper, multiTermQueryRewrite); } /** * Build one {@link CharacterRunAutomaton} matching any term the query might match. 
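// ----------------------------------------------------------------------------
// Editor's aside, not part of the patch: a tiny check of what such a per-query
// automaton accepts, using the same prefix construction that the PrefixQuery
// branch of MultiTermHighlighting.extractAutomata uses.
static boolean demoPrefixAutomaton() {
  CharacterRunAutomaton prefixRun = new CharacterRunAutomaton(
      Operations.concatenate(Automata.makeString("app"), Automata.makeAnyString()));
  return prefixRun.run("apple")            // anything starting with "app" matches
      && prefixRun.run("app")              // the bare prefix matches too
      && prefixRun.run("banana") == false; // other terms do not
}
// ----------------------------------------------------------------------------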
*/ - private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms, + private static CharacterRunAutomaton buildCombinedAutomaton(Predicate fieldMatcher, + BytesRef[] terms, CharacterRunAutomaton[] automata, PhraseHelper strictPhrases, Function> multiTermQueryRewrite) { @@ -74,7 +76,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { Collections.addAll(allAutomata, automata); for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) { Collections.addAll(allAutomata, - MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan + MultiTermHighlighting.extractAutomata(spanQuery, fieldMatcher, true, multiTermQueryRewrite));//true==lookInSpan } if (allAutomata.size() == 1) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java index fd6a26a778f..267d6039d83 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java @@ -22,6 +22,7 @@ import java.util.Collection; import java.util.Comparator; import java.util.List; import java.util.function.Function; +import java.util.function.Predicate; import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; @@ -56,50 +57,52 @@ class MultiTermHighlighting { } /** - * Extracts all MultiTermQueries for {@code field}, and returns equivalent - * automata that will match terms. + * Extracts MultiTermQueries that match the provided field predicate. + * Returns equivalent automata that will match terms. */ - public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan, + public static CharacterRunAutomaton[] extractAutomata(Query query, + Predicate fieldMatcher, + boolean lookInSpan, Function> preRewriteFunc) { List list = new ArrayList<>(); Collection customSubQueries = preRewriteFunc.apply(query); if (customSubQueries != null) { for (Query sub : customSubQueries) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); + list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc))); } } else if (query instanceof BooleanQuery) { for (BooleanClause clause : (BooleanQuery) query) { if (!clause.isProhibited()) { - list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan, preRewriteFunc))); + list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), fieldMatcher, lookInSpan, preRewriteFunc))); } } } else if (query instanceof ConstantScoreQuery) { - list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan, + list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), fieldMatcher, lookInSpan, preRewriteFunc))); } else if (query instanceof DisjunctionMaxQuery) { for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); + list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanOrQuery) { for (Query sub : ((SpanOrQuery) query).getClauses()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); + list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, 
preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanNearQuery) { for (Query sub : ((SpanNearQuery) query).getClauses()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); + list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanNotQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan, + list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), fieldMatcher, lookInSpan, preRewriteFunc))); } else if (lookInSpan && query instanceof SpanPositionCheckQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan, + list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), fieldMatcher, lookInSpan, preRewriteFunc))); } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) { - list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field, - lookInSpan, preRewriteFunc))); + list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), + fieldMatcher, lookInSpan, preRewriteFunc))); } else if (query instanceof AutomatonQuery) { final AutomatonQuery aq = (AutomatonQuery) query; - if (aq.getField().equals(field)) { + if (fieldMatcher.test(aq.getField())) { list.add(new CharacterRunAutomaton(aq.getAutomaton()) { @Override public String toString() { @@ -110,7 +113,7 @@ class MultiTermHighlighting { } else if (query instanceof PrefixQuery) { final PrefixQuery pq = (PrefixQuery) query; Term prefix = pq.getPrefix(); - if (prefix.field().equals(field)) { + if (fieldMatcher.test(prefix.field())) { list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()), Automata.makeAnyString())) { @Override @@ -121,7 +124,7 @@ class MultiTermHighlighting { } } else if (query instanceof FuzzyQuery) { final FuzzyQuery fq = (FuzzyQuery) query; - if (fq.getField().equals(field)) { + if (fieldMatcher.test(fq.getField())) { String utf16 = fq.getTerm().text(); int termText[] = new int[utf16.codePointCount(0, utf16.length())]; for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) { @@ -142,7 +145,7 @@ class MultiTermHighlighting { } } else if (query instanceof TermRangeQuery) { final TermRangeQuery tq = (TermRangeQuery) query; - if (tq.getField().equals(field)) { + if (fieldMatcher.test(tq.getField())) { final CharsRef lowerBound; if (tq.getLowerTerm() == null) { lowerBound = null; diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java index cde17baf87a..d7e8671c4c2 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java @@ -16,17 +16,50 @@ */ package org.apache.lucene.search.uhighlight; -import org.apache.lucene.index.*; -import org.apache.lucene.search.*; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Set; +import 
java.util.TreeSet; +import java.util.function.Function; +import java.util.function.Predicate; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.highlight.WeightedSpanTerm; import org.apache.lucene.search.highlight.WeightedSpanTermExtractor; -import org.apache.lucene.search.spans.*; +import org.apache.lucene.search.spans.SpanCollector; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanWeight; +import org.apache.lucene.search.spans.Spans; import org.apache.lucene.util.BytesRef; -import java.io.IOException; -import java.util.*; -import java.util.function.Function; - /** * Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlight phrases correctly). * This is a stateful class holding information about the query, but it can (and is) re-used across highlighting @@ -40,7 +73,7 @@ import java.util.function.Function; public class PhraseHelper { public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_", - spanQuery -> null, query -> null, true); + (s) -> false, spanQuery -> null, query -> null, true); //TODO it seems this ought to be a general thing on Spans? private static final Comparator SPANS_COMPARATOR = (o1, o2) -> { @@ -59,10 +92,11 @@ public class PhraseHelper { } }; - private final String fieldName; // if non-null, only look at queries/terms for this field + private final String fieldName; private final Set positionInsensitiveTerms; // (TermQuery terms) private final Set spanQueries; private final boolean willRewrite; + private final Predicate fieldMatcher; /** * Constructor. @@ -73,14 +107,15 @@ public class PhraseHelper { * to be set before the {@link WeightedSpanTermExtractor}'s extraction is invoked. * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is * usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones. + * {@code fieldMatcher} The field name predicate to use for extracting the query part that must be highlighted. */ - public PhraseHelper(Query query, String field, Function rewriteQueryPred, + public PhraseHelper(Query query, String field, Predicate fieldMatcher, Function rewriteQueryPred, Function> preExtractRewriteFunction, boolean ignoreQueriesNeedingRewrite) { - this.fieldName = field; // if null then don't require field match + this.fieldName = field; + this.fieldMatcher = fieldMatcher; // filter terms to those we want - positionInsensitiveTerms = field != null ? 
new FieldFilteringTermHashSet(field) : new HashSet<>(); - // requireFieldMatch optional + positionInsensitiveTerms = new FieldFilteringTermSet(); spanQueries = new HashSet<>(); // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls @@ -131,11 +166,11 @@ public class PhraseHelper { @Override protected void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery, float boost) throws IOException { - if (field != null) { - // if this span query isn't for this field, skip it. - Set fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1 - collectSpanQueryFields(spanQuery, fieldNameSet); - if (!fieldNameSet.contains(field)) { + // if this span query isn't for this field, skip it. + Set fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1 + collectSpanQueryFields(spanQuery, fieldNameSet); + for (String spanField : fieldNameSet) { + if (!fieldMatcher.test(spanField)) { return; } } @@ -190,10 +225,11 @@ public class PhraseHelper { if (spanQueries.isEmpty()) { return Collections.emptyMap(); } + final LeafReader filteredReader = new SingleFieldFilterLeafReader(leafReader, fieldName); // for each SpanQuery, collect the member spans into a map. Map result = new HashMap<>(); for (SpanQuery spanQuery : spanQueries) { - getTermToSpans(spanQuery, leafReader.getContext(), doc, result); + getTermToSpans(spanQuery, filteredReader.getContext(), doc, result); } return result; } @@ -203,15 +239,14 @@ public class PhraseHelper { int doc, Map result) throws IOException { // note: in WSTE there was some field specific looping that seemed pointless so that isn't here. - final IndexSearcher searcher = new IndexSearcher(readerContext); + final IndexSearcher searcher = new IndexSearcher(readerContext.reader()); searcher.setQueryCache(null); if (willRewrite) { spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done } // Get the underlying query terms - - TreeSet termSet = new TreeSet<>(); // sorted so we can loop over results in order shortly... + TreeSet termSet = new FieldFilteringTermSet(); // sorted so we can loop over results in order shortly... searcher.createWeight(spanQuery, false, 1.0f).extractTerms(termSet);//needsScores==false // Get Spans by running the query against the reader @@ -240,9 +275,6 @@ public class PhraseHelper { for (final Term queryTerm : termSet) { // note: we expect that at least one query term will pass these filters. This is because the collected // spanQuery list were already filtered by these conditions. - if (fieldName != null && fieldName.equals(queryTerm.field()) == false) { - continue; - } if (positionInsensitiveTerms.contains(queryTerm)) { continue; } @@ -375,19 +407,17 @@ public class PhraseHelper { } /** - * Simple HashSet that filters out Terms not matching a desired field on {@code add()}. + * Simple TreeSet that filters out Terms not matching the provided predicate on {@code add()}. 
*/ - private static class FieldFilteringTermHashSet extends HashSet { - private final String field; - - FieldFilteringTermHashSet(String field) { - this.field = field; - } - + private class FieldFilteringTermSet extends TreeSet { @Override public boolean add(Term term) { - if (term.field().equals(field)) { - return super.add(term); + if (fieldMatcher.test(term.field())) { + if (term.field().equals(fieldName)) { + return super.add(term); + } else { + return super.add(new Term(fieldName, term.bytes())); + } } else { return false; } @@ -499,6 +529,64 @@ public class PhraseHelper { } } + /** + * This reader will just delegate every call to a single field in the wrapped + * LeafReader. This way we ensure that all queries going through this reader target the same field. + */ + static final class SingleFieldFilterLeafReader extends FilterLeafReader { + final String fieldName; + SingleFieldFilterLeafReader(LeafReader in, String fieldName) { + super(in); + this.fieldName = fieldName; + } + + @Override + public FieldInfos getFieldInfos() { + throw new UnsupportedOperationException(); + } + + @Override + public Fields fields() throws IOException { + return new FilterFields(super.fields()) { + @Override + public Terms terms(String field) throws IOException { + return super.terms(fieldName); + } + + @Override + public Iterator iterator() { + return Collections.singletonList(fieldName).iterator(); + } + + @Override + public int size() { + return 1; + } + }; + } + + @Override + public NumericDocValues getNumericDocValues(String field) throws IOException { + return super.getNumericDocValues(fieldName); + } + + @Override + public BinaryDocValues getBinaryDocValues(String field) throws IOException { + return super.getBinaryDocValues(fieldName); + } + + @Override + public SortedDocValues getSortedDocValues(String field) throws IOException { + return super.getSortedDocValues(fieldName); + } + + @Override + public NumericDocValues getNormValues(String field) throws IOException { + return super.getNormValues(fieldName); + } + } + + /** * A Spans based on a list of cached spans for one doc. It is pre-positioned to this doc. 
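// ----------------------------------------------------------------------------
// Editor's aside, not part of the patch: the effect of the SingleFieldFilterLeafReader
// defined above, assuming calling code in the same package and a LeafReader that has
// a "title" field. Whatever field name is asked for, the lookup is redirected to the
// single configured field:
static Terms alwaysTitleTerms(LeafReader reader, String anyField) throws IOException {
  LeafReader filtered = new PhraseHelper.SingleFieldFilterLeafReader(reader, "title");
  return filtered.fields().terms(anyField); // returns the "title" terms regardless of anyField
}
// ----------------------------------------------------------------------------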
*/ diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index ac5f0f69999..bbcfd5b0203 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; @@ -31,6 +32,7 @@ import java.util.Objects; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; +import java.util.function.Predicate; import java.util.function.Supplier; import org.apache.lucene.analysis.Analyzer; @@ -58,7 +60,6 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.InPlaceMergeSorter; -import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** @@ -119,13 +120,13 @@ public class UnifiedHighlighter { private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy - // private boolean defaultRequireFieldMatch = true; TODO - private int maxLength = DEFAULT_MAX_LENGTH; // BreakIterator is stateful so we use a Supplier factory method private Supplier defaultBreakIterator = () -> BreakIterator.getSentenceInstance(Locale.ROOT); + private Predicate defaultFieldMatcher; + private PassageScorer defaultScorer = new PassageScorer(); private PassageFormatter defaultFormatter = new DefaultPassageFormatter(); @@ -140,8 +141,8 @@ public class UnifiedHighlighter { /** * Calls {@link Weight#extractTerms(Set)} on an empty index for the query. */ - protected static SortedSet extractTerms(Query query) throws IOException { - SortedSet queryTerms = new TreeSet<>(); + protected static Set extractTerms(Query query) throws IOException { + Set queryTerms = new HashSet<>(); EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms); return queryTerms; } @@ -197,6 +198,10 @@ public class UnifiedHighlighter { this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold; } + public void setFieldMatcher(Predicate predicate) { + this.defaultFieldMatcher = predicate; + } + /** * Returns whether {@link MultiTermQuery} derivatives will be highlighted. By default it's enabled. MTQ * highlighting can be expensive, particularly when using offsets in postings. @@ -220,6 +225,18 @@ public class UnifiedHighlighter { return defaultPassageRelevancyOverSpeed; } + /** + * Returns the predicate to use for extracting the query part that must be highlighted. + * By default only queries that target the current field are kept. (AKA requireFieldMatch) + */ + protected Predicate getFieldMatcher(String field) { + if (defaultFieldMatcher != null) { + return defaultFieldMatcher; + } else { + // requireFieldMatch = true + return (qf) -> field.equals(qf); + } + } /** * The maximum content size to process. Content will be truncated to this size before highlighting. 
Typically @@ -548,7 +565,7 @@ public class UnifiedHighlighter { copyAndSortFieldsWithMaxPassages(fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params // Init field highlighters (where most of the highlight logic lives, and on a per field basis) - SortedSet queryTerms = extractTerms(query); + Set queryTerms = extractTerms(query); FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length]; int numTermVectors = 0; int numPostings = 0; @@ -718,13 +735,13 @@ public class UnifiedHighlighter { getClass().getSimpleName() + " without an IndexSearcher."); } Objects.requireNonNull(content, "content is required"); - SortedSet queryTerms = extractTerms(query); + Set queryTerms = extractTerms(query); return getFieldHighlighter(field, query, queryTerms, maxPassages) .highlightFieldForDoc(null, -1, content); } - protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet allTerms, int maxPassages) { - BytesRef[] terms = filterExtractedTerms(field, allTerms); + protected FieldHighlighter getFieldHighlighter(String field, Query query, Set allTerms, int maxPassages) { + BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms); Set highlightFlags = getFlags(field); PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags); @@ -738,19 +755,15 @@ public class UnifiedHighlighter { getFormatter(field)); } - protected static BytesRef[] filterExtractedTerms(String field, SortedSet queryTerms) { - // TODO consider requireFieldMatch - Term floor = new Term(field, ""); - Term ceiling = new Term(field, UnicodeUtil.BIG_TERM); - SortedSet fieldTerms = queryTerms.subSet(floor, ceiling); - - // Strip off the redundant field: - BytesRef[] terms = new BytesRef[fieldTerms.size()]; - int termUpto = 0; - for (Term term : fieldTerms) { - terms[termUpto++] = term.bytes(); + protected static BytesRef[] filterExtractedTerms(Predicate fieldMatcher, Set queryTerms) { + // Strip off the redundant field and sort the remaining terms + SortedSet filteredTerms = new TreeSet<>(); + for (Term term : queryTerms) { + if (fieldMatcher.test(term.field())) { + filteredTerms.add(term.bytes()); + } } - return terms; + return filteredTerms.toArray(new BytesRef[filteredTerms.size()]); } protected Set getFlags(String field) { @@ -771,14 +784,13 @@ public class UnifiedHighlighter { boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES); boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY); return highlightPhrasesStrictly ? - new PhraseHelper(query, field, this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : - PhraseHelper.NONE; + new PhraseHelper(query, field, getFieldMatcher(field), + this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : PhraseHelper.NONE; } protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set highlightFlags) { return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY) - ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES), - this::preMultiTermQueryRewrite) + ? 
MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), !highlightFlags.contains(HighlightFlag.PHRASES), this::preMultiTermQueryRewrite) : ZERO_LEN_AUTOMATA_ARRAY; } @@ -826,7 +838,7 @@ public class UnifiedHighlighter { //skip using a memory index since it's pure term filtering return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer()); } else { - return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(), + return new MemoryIndexOffsetStrategy(field, getFieldMatcher(field), terms, phraseHelper, automata, getIndexAnalyzer(), this::preMultiTermQueryRewrite); } case NONE_NEEDED: diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 0fd7d3d0a23..ddf8a926ba0 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -25,6 +25,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.function.Predicate; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.MockAnalyzer; @@ -32,14 +33,17 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; @@ -959,4 +963,275 @@ public class TestUnifiedHighlighter extends LuceneTestCase { ir.close(); } + private IndexReader indexSomeFields() throws IOException { + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); + FieldType ft = new FieldType(); + ft.setIndexOptions(IndexOptions.NONE); + ft.setTokenized(false); + ft.setStored(true); + ft.freeze(); + + Field title = new Field("title", "", fieldType); + Field text = new Field("text", "", fieldType); + Field category = new Field("category", "", fieldType); + + Document doc = new Document(); + doc.add(title); + doc.add(text); + doc.add(category); + title.setStringValue("This is the title field."); + text.setStringValue("This is the text field. 
You can put some text if you want."); + category.setStringValue("This is the category field."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + return ir; + } + + public void testFieldMatcherTermQuery() throws Exception { + IndexReader ir = indexSomeFields(); + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Predicate getFieldMatcher(String field) { + // requireFieldMatch=false + return (qf) -> true; + } + }; + UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer); + BooleanQuery.Builder queryBuilder = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("text", "some")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "field")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "this")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("title", "is")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("title", "this")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("category", "this")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("category", "some")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("category", "category")), BooleanClause.Occur.SHOULD); + Query query = queryBuilder.build(); + + // title + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + + highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq)); + snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + + // text + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + + highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq)); + snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. 
", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + + // category + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + + + highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq)); + snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + ir.close(); + } + + public void testFieldMatcherMultiTermQuery() throws Exception { + IndexReader ir = indexSomeFields(); + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Predicate getFieldMatcher(String field) { + // requireFieldMatch=false + return (qf) -> true; + } + }; + UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer); + BooleanQuery.Builder queryBuilder = + new BooleanQuery.Builder() + .add(new FuzzyQuery(new Term("text", "sime"), 1), BooleanClause.Occur.SHOULD) + .add(new PrefixQuery(new Term("text", "fie")), BooleanClause.Occur.SHOULD) + .add(new PrefixQuery(new Term("text", "thi")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("title", "is")), BooleanClause.Occur.SHOULD) + .add(new PrefixQuery(new Term("title", "thi")), BooleanClause.Occur.SHOULD) + .add(new PrefixQuery(new Term("category", "thi")), BooleanClause.Occur.SHOULD) + .add(new FuzzyQuery(new Term("category", "sime"), 1), BooleanClause.Occur.SHOULD) + .add(new PrefixQuery(new Term("category", "categ")), BooleanClause.Occur.SHOULD); + Query query = queryBuilder.build(); + + // title + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + + highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq)); + snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + + // text + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. 
You can put some text if you want.", snippets[0]); + + highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq)); + snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. ", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + + // category + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + + + highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq)); + snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + ir.close(); + } + + public void testFieldMatcherPhraseQuery() throws Exception { + IndexReader ir = indexSomeFields(); + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Predicate getFieldMatcher(String field) { + // requireFieldMatch=false + return (qf) -> true; + } + }; + UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer); + BooleanQuery.Builder queryBuilder = + new BooleanQuery.Builder() + .add(new PhraseQuery("title", "this", "is", "the", "title"), BooleanClause.Occur.SHOULD) + .add(new PhraseQuery(2, "category", "this", "is", "the", "field"), BooleanClause.Occur.SHOULD) + .add(new PhraseQuery("text", "this", "is"), BooleanClause.Occur.SHOULD) + .add(new PhraseQuery("category", "this", "is"), BooleanClause.Occur.SHOULD) + .add(new PhraseQuery(1, "text", "you", "can", "put", "text"), BooleanClause.Occur.SHOULD); + Query query = queryBuilder.build(); + + // title + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + + highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq)); + snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + + // text + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. 
You can put some text if you want.", snippets[0]); + + highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq)); + snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + + // category + { + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + + snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + + + highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq)); + snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the category field.", snippets[0]); + highlighterFieldMatch.setFieldMatcher(null); + } + ir.close(); + } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java index d15094000c3..10757a5b1e7 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java @@ -23,7 +23,6 @@ import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.SortedSet; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -144,7 +143,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { } @Override - protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet allTerms, int maxPassages) { + protected FieldHighlighter getFieldHighlighter(String field, Query query, Set allTerms, int maxPassages) { return super.getFieldHighlighter(field, query, allTerms, maxPassages); } From bf3a3137be8a70ceed884e87c3ada276e82b187b Mon Sep 17 00:00:00 2001 From: Steve Rowe Date: Tue, 6 Dec 2016 13:11:36 -0500 Subject: [PATCH 17/53] SOLR-9832: Schema modifications are not immediately visible on the coordinating node --- solr/CHANGES.txt | 2 + .../java/org/apache/solr/core/SolrCore.java | 8 -- .../schema/ManagedIndexSchemaFactory.java | 12 +++ .../org/apache/solr/schema/SchemaManager.java | 2 +- .../ManagedSchemaRoundRobinCloudTest.java | 98 +++++++++++++++++++ 5 files changed, 113 insertions(+), 9 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/schema/ManagedSchemaRoundRobinCloudTest.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e76616942be..bac24e570d6 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -220,6 +220,8 @@ Bug Fixes * SOLR-9616: Solr throws exception when expand=true on empty index (Timo Hund via Ishan Chattopadhyaya) +* SOLR-9832: Schema modifications are not immediately visible on the coordinating node. 
(Steve Rowe) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java index e5bc53da9bd..a459bf2e9b1 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrCore.java +++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java @@ -119,7 +119,6 @@ import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.IndexSchemaFactory; import org.apache.solr.schema.ManagedIndexSchema; -import org.apache.solr.schema.SchemaManager; import org.apache.solr.schema.SimilarityFactory; import org.apache.solr.search.QParserPlugin; import org.apache.solr.search.SolrFieldCacheMBean; @@ -2720,13 +2719,6 @@ public final class SolrCore implements SolrInfoMBean, Closeable { if (checkStale(zkClient, overlayPath, solrConfigversion) || checkStale(zkClient, solrConfigPath, overlayVersion) || checkStale(zkClient, managedSchmaResourcePath, managedSchemaVersion)) { - - try (SolrCore solrCore = cc.solrCores.getCoreFromAnyList(coreName, true)) { - solrCore.setLatestSchema(SchemaManager.getFreshManagedSchema(solrCore)); - } catch (Exception e) { - log.warn("", SolrZkClient.checkInterrupted(e)); - } - log.info("core reload {}", coreName); try { cc.reload(coreName); diff --git a/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchemaFactory.java b/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchemaFactory.java index 66d947ea09b..d4a10bda5f0 100644 --- a/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchemaFactory.java +++ b/solr/core/src/java/org/apache/solr/schema/ManagedIndexSchemaFactory.java @@ -377,6 +377,18 @@ public class ManagedIndexSchemaFactory extends IndexSchemaFactory implements Sol this.zkIndexSchemaReader = new ZkIndexSchemaReader(this, core); ZkSolrResourceLoader zkLoader = (ZkSolrResourceLoader)loader; zkLoader.setZkIndexSchemaReader(this.zkIndexSchemaReader); + try { + zkIndexSchemaReader.refreshSchemaFromZk(-1); // update immediately if newer is available + core.setLatestSchema(getSchema()); + } catch (KeeperException e) { + String msg = "Error attempting to access " + zkLoader.getConfigSetZkPath() + "/" + managedSchemaResourceName; + log.error(msg, e); + throw new SolrException(ErrorCode.SERVER_ERROR, msg, e); + } catch (InterruptedException e) { + // Restore the interrupted status + Thread.currentThread().interrupt(); + log.warn("", e); + } } else { this.zkIndexSchemaReader = null; } diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaManager.java b/solr/core/src/java/org/apache/solr/schema/SchemaManager.java index 4b0ea546fe4..33406318158 100644 --- a/solr/core/src/java/org/apache/solr/schema/SchemaManager.java +++ b/solr/core/src/java/org/apache/solr/schema/SchemaManager.java @@ -133,8 +133,8 @@ public class SchemaManager { try { int latestVersion = ZkController.persistConfigResourceToZooKeeper(zkLoader, managedIndexSchema.getSchemaZkVersion(), managedIndexSchema.getResourceName(), sw.toString().getBytes(StandardCharsets.UTF_8), true); + req.getCore().getCoreDescriptor().getCoreContainer().reload(req.getCore().getName()); waitForOtherReplicasToUpdate(timeOut, latestVersion); - core.setLatestSchema(managedIndexSchema); return Collections.emptyList(); } catch (ZkController.ResourceModifiedInZkException e) { log.info("Schema was modified by another node. 
Retrying.."); diff --git a/solr/core/src/test/org/apache/solr/schema/ManagedSchemaRoundRobinCloudTest.java b/solr/core/src/test/org/apache/solr/schema/ManagedSchemaRoundRobinCloudTest.java new file mode 100644 index 00000000000..883ebfdb5e6 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/schema/ManagedSchemaRoundRobinCloudTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.schema; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.schema.SchemaRequest; +import org.apache.solr.client.solrj.response.schema.SchemaResponse; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.cloud.DocCollection; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class ManagedSchemaRoundRobinCloudTest extends SolrCloudTestCase { + private static final String COLLECTION = "managed_coll"; + private static final String CONFIG = "cloud-managed"; + private static final String FIELD_PREFIX = "NumberedField_"; + private static final int NUM_SHARDS = 2; + private static final int NUM_FIELDS_TO_ADD = 10; + + @BeforeClass + public static void setupCluster() throws Exception { + System.setProperty("managed.schema.mutable", "true"); + configureCluster(NUM_SHARDS).addConfig(CONFIG, configset(CONFIG)).configure(); + CollectionAdminRequest.createCollection(COLLECTION, CONFIG, NUM_SHARDS, 1) + .setMaxShardsPerNode(1) + .process(cluster.getSolrClient()); + cluster.getSolrClient().waitForState(COLLECTION, DEFAULT_TIMEOUT, TimeUnit.SECONDS, + (n, c) -> DocCollection.isFullyActive(n, c, NUM_SHARDS, 1)); + } + + @AfterClass + public static void clearSysProps() throws Exception { + System.clearProperty("managed.schema.mutable"); + } + + @Test + public void testAddFieldsRoundRobin() throws Exception { + List clients = new ArrayList<>(NUM_SHARDS); + try { + for (int shardNum = 0 ; shardNum < NUM_SHARDS ; ++shardNum) { + clients.add(getHttpSolrClient(cluster.getJettySolrRunners().get(shardNum).getBaseUrl().toString())); + } + int shardNum = 0; + for (int fieldNum = 0 ; fieldNum < NUM_FIELDS_TO_ADD ; ++fieldNum) { + addField(clients.get(shardNum), keyValueArrayToMap("name", FIELD_PREFIX + fieldNum, "type", "string")); + if (++shardNum == NUM_SHARDS) { + shardNum = 0; + } + } + } finally { + for (int shardNum = 0 ; shardNum < NUM_SHARDS ; ++shardNum) { + clients.get(shardNum).close(); + } + } + } + + private void addField(SolrClient client, Map field) 
throws Exception { + SchemaResponse.UpdateResponse addFieldResponse = new SchemaRequest.AddField(field).process(client, COLLECTION); + assertNotNull(addFieldResponse); + assertEquals(0, addFieldResponse.getStatus()); + assertNull(addFieldResponse.getResponse().get("errors")); + String fieldName = field.get("name").toString(); + SchemaResponse.FieldResponse fieldResponse = new SchemaRequest.Field(fieldName).process(client, COLLECTION); + assertNotNull(fieldResponse); + assertEquals(0, fieldResponse.getStatus()); + } + + private Map keyValueArrayToMap(String... alternatingKeysAndValues) { + Map map = new HashMap<>(); + for (int i = 0 ; i < alternatingKeysAndValues.length ; i += 2) + map.put(alternatingKeysAndValues[i], alternatingKeysAndValues[i + 1]); + return map; + } +} From c164f7e35e45d0bfa844cd450ffb4865c27fc4d5 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 6 Dec 2016 10:34:22 -0800 Subject: [PATCH 18/53] SOLR-9827: Make ConcurrentUpdateSolrClient create RemoteSolrExceptions in case of remote errors instead of SolrException --- solr/CHANGES.txt | 4 ++++ .../solrj/impl/ConcurrentUpdateSolrClient.java | 16 +++++++++++++--- .../solr/client/solrj/SolrExampleTests.java | 7 ++++++- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index bac24e570d6..4f7377c5f94 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -259,6 +259,10 @@ Other Changes * SOLR-9819: Upgrade commons-fileupload to 1.3.2, fixing a potential vulnerability CVE-2016-3092 (Anshum Gupta) +* SOLR-9827: ConcurrentUpdateSolrClient creates a RemoteSolrException if the remote host responded with a non-ok + response (instead of a SolrException) and includes the remote error message as part of the exception message + (Tomás Fernández Löbbe) + ================== 6.3.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. 
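For illustration only (not part of the patch): a minimal, hypothetical SolrJ sketch of how a caller could observe the SOLR-9827 change described in the CHANGES.txt entry above. It assumes the long-standing ConcurrentUpdateSolrClient(String, int, int) constructor and the public handleError(Throwable) hook; the URL, collection name and queue settings are made up. Errors reported by the background runner threads should now arrive as HttpSolrClient.RemoteSolrException instances whose message carries the remote error text.

    import java.io.IOException;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.common.SolrInputDocument;

    public class RemoteErrorDemo {
      public static void main(String[] args) throws IOException, SolrServerException {
        // Hypothetical endpoint and queue/thread sizes; override handleError to inspect
        // failures reported asynchronously by the client's background runner threads.
        ConcurrentUpdateSolrClient client =
            new ConcurrentUpdateSolrClient("http://localhost:8983/solr/techproducts", 10, 2) {
              @Override
              public void handleError(Throwable ex) {
                if (ex instanceof HttpSolrClient.RemoteSolrException) {
                  // The message now includes "Remote error message: ..." from the server response.
                  System.err.println("Remote indexing error: " + ex.getMessage());
                } else {
                  super.handleError(ex);
                }
              }
            };
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "demo-doc");
        client.add(doc);              // errors surface via handleError, not as a thrown exception
        client.blockUntilFinished();
        client.close();
      }
    }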
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java index b96cc2365c2..5c3f289c4f5 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java @@ -46,7 +46,6 @@ import org.apache.solr.client.solrj.request.RequestWriter; import org.apache.solr.client.solrj.request.UpdateRequest; import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; @@ -330,7 +329,8 @@ public class ConcurrentUpdateSolrClient extends SolrClient { msg.append("\n\n\n\n"); msg.append("request: ").append(method.getURI()); - SolrException solrExc = new SolrException(ErrorCode.getErrorCode(statusCode), msg.toString()); + SolrException solrExc; + NamedList metadata = null; // parse out the metadata from the SolrException try { String encoding = "UTF-8"; // default @@ -343,11 +343,21 @@ public class ConcurrentUpdateSolrClient extends SolrClient { NamedList resp = client.parser.processResponse(rspBody, encoding); NamedList error = (NamedList) resp.get("error"); if (error != null) { - solrExc.setMetadata((NamedList) error.get("metadata")); + metadata = (NamedList) error.get("metadata"); + String remoteMsg = (String) error.get("msg"); + if (remoteMsg != null) { + msg.append("\nRemote error message: "); + msg.append(remoteMsg); + } } } catch (Exception exc) { // don't want to fail to report error if parsing the response fails log.warn("Failed to parse error response from " + client.getBaseURL() + " due to: " + exc); + } finally { + solrExc = new HttpSolrClient.RemoteSolrException(client.getBaseURL(), statusCode, msg.toString(), null); + if (metadata != null) { + solrExc.setMetadata(metadata); + } } handleError(solrExc); diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/SolrExampleTests.java b/solr/solrj/src/test/org/apache/solr/client/solrj/SolrExampleTests.java index f403f3f7b9e..d25280dfe26 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/SolrExampleTests.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/SolrExampleTests.java @@ -36,6 +36,7 @@ import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; import org.apache.solr.client.solrj.embedded.SolrExampleStreamingTest.ErrorTrackingConcurrentUpdateSolrClient; import org.apache.solr.client.solrj.impl.BinaryResponseParser; import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException; import org.apache.solr.client.solrj.impl.NoOpResponseParser; import org.apache.solr.client.solrj.impl.XMLResponseParser; import org.apache.solr.client.solrj.request.AbstractUpdateRequest; @@ -463,7 +464,11 @@ abstract public class SolrExampleTests extends SolrExampleTestsBase concurrentClient.lastError = null; concurrentClient.add(doc); concurrentClient.blockUntilFinished(); - assertNotNull("Should throw exception!", concurrentClient.lastError); + assertNotNull("Should throw exception!", concurrentClient.lastError); + assertEquals("Unexpected exception type", + RemoteSolrException.class, concurrentClient.lastError.getClass()); + assertTrue("Unexpected exception message: " + 
concurrentClient.lastError.getMessage(), + concurrentClient.lastError.getMessage().contains("Remote error message: Document contains multiple values for uniqueKey")); } else { log.info("Ignoring update test for client:" + client.getClass().getName()); } From 8b98b158ff9cc2a71216e12c894ca14352d31f0e Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Tue, 6 Dec 2016 14:47:03 -0700 Subject: [PATCH 19/53] SOLR-5043: New solr.dns.prevent.reverse.lookup system property that can be used to prevent long core (re)load delays on systems with missconfigured hostname/DNS --- solr/CHANGES.txt | 3 ++ .../solr/handler/admin/SystemInfoHandler.java | 51 ++++++++++++++++--- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4f7377c5f94..14dd2fae779 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -152,6 +152,9 @@ New Features * SOLR-9728: Ability to specify Key Store type in solr.in.sh file for SSL (Michael Suzuki, Kevin Risden) +* SOLR-5043: New solr.dns.prevent.reverse.lookup system property that can be used to prevent long core + (re)load delays on systems with missconfigured hostname/DNS (hossman) + Optimizations ---------------------- * SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have diff --git a/solr/core/src/java/org/apache/solr/handler/admin/SystemInfoHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/SystemInfoHandler.java index 35ef906d353..a873c09c984 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/SystemInfoHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/SystemInfoHandler.java @@ -31,7 +31,6 @@ import java.lang.management.PlatformManagedObject; import java.lang.management.RuntimeMXBean; import java.lang.reflect.InvocationTargetException; import java.net.InetAddress; -import java.net.UnknownHostException; import java.nio.charset.Charset; import java.text.DecimalFormat; import java.text.DecimalFormatSymbols; @@ -50,6 +49,8 @@ import org.apache.solr.handler.RequestHandlerBase; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.IndexSchema; +import org.apache.solr.util.RTimer; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,8 +65,22 @@ import static org.apache.solr.common.params.CommonParams.NAME; public class SystemInfoHandler extends RequestHandlerBase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - + /** + *

+   * <p>
+   * Undocumented expert level system property to prevent doing a reverse lookup of our hostname.
+   * This property will be logged as a suggested workaround if any problems are noticed when doing reverse
+   * lookup.
+   * </p>
+   *
+   * <p>
+   * TODO: should we refactor this (and the associated logic) into a helper method for any other places
+   * where DNS is used?
+   * </p>
    + * @see #initHostname + */ + private static final String PREVENT_REVERSE_DNS_OF_LOCALHOST_SYSPROP = "solr.dns.prevent.reverse.lookup"; + // on some platforms, resolving canonical hostname can cause the thread // to block for several seconds if nameservices aren't available // so resolve this once per handler instance @@ -75,22 +90,42 @@ public class SystemInfoHandler extends RequestHandlerBase private CoreContainer cc; public SystemInfoHandler() { - super(); - init(); + this(null); } public SystemInfoHandler(CoreContainer cc) { super(); this.cc = cc; - init(); + initHostname(); } - private void init() { + private void initHostname() { + if (null != System.getProperty(PREVENT_REVERSE_DNS_OF_LOCALHOST_SYSPROP, null)) { + log.info("Resolving canonical hostname for local host prevented due to '{}' sysprop", + PREVENT_REVERSE_DNS_OF_LOCALHOST_SYSPROP); + hostname = null; + return; + } + + RTimer timer = new RTimer(); try { InetAddress addr = InetAddress.getLocalHost(); hostname = addr.getCanonicalHostName(); - } catch (UnknownHostException e) { - //default to null + } catch (Exception e) { + log.warn("Unable to resolve canonical hostname for local host, possible DNS misconfiguration. " + + "Set the '"+PREVENT_REVERSE_DNS_OF_LOCALHOST_SYSPROP+"' sysprop to true on startup to " + + "prevent future lookups if DNS can not be fixed.", e); + hostname = null; + return; + } + timer.stop(); + + if (15000D < timer.getTime()) { + String readableTime = String.format(Locale.ROOT, "%.3f", (timer.getTime() / 1000)); + log.warn("Resolving canonical hostname for local host took {} seconds, possible DNS misconfiguration. " + + "Set the '{}' sysprop to true on startup to prevent future lookups if DNS can not be fixed.", + readableTime, PREVENT_REVERSE_DNS_OF_LOCALHOST_SYSPROP); + } } From 3f6164c76e2fc581abe4408066e08cf9fc817260 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Wed, 7 Dec 2016 18:42:07 +0530 Subject: [PATCH 20/53] added an extra testcase --- .../TestPlainTextEntityProcessor.java | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java index 82b757e6db6..a286d841c91 100644 --- a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java +++ b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java @@ -16,12 +16,23 @@ */ package org.apache.solr.handler.dataimport; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.io.StringReader; +import java.nio.charset.StandardCharsets; +import java.sql.Blob; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.Collections; import java.util.Properties; +import org.apache.solr.common.util.Utils; import org.junit.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Test for PlainTextEntityProcessor * @@ -42,6 +53,103 @@ public class TestPlainTextEntityProcessor extends AbstractDataImportHandlerTestC assertEquals(DS.s, sw.docs.get(0).getFieldValue("x")); } + static class BlobImpl implements Blob{ + private final byte[] bytes; + + BlobImpl(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public long length() throws SQLException { + return 0; + } + + @Override + public byte[] getBytes(long pos, int 
length) throws SQLException { + return bytes; + } + + @Override + public InputStream getBinaryStream() throws SQLException { + return new ByteArrayInputStream(bytes); + } + + @Override + public long position(byte[] pattern, long start) throws SQLException { + return 0; + } + + @Override + public long position(Blob pattern, long start) throws SQLException { + return 0; + } + + @Override + public int setBytes(long pos, byte[] bytes) throws SQLException { + return 0; + } + + @Override + public int setBytes(long pos, byte[] bytes, int offset, int len) throws SQLException { + return 0; + } + + @Override + public OutputStream setBinaryStream(long pos) throws SQLException { + return null; + } + + @Override + public void truncate(long len) throws SQLException { + + } + + @Override + public void free() throws SQLException { + + } + + @Override + public InputStream getBinaryStream(long pos, long length) throws SQLException { + return new ByteArrayInputStream(bytes); + } + } + + @Test + public void testSimple2() throws IOException { + DataImporter di = new DataImporter(); + MockDataSource.setIterator("select id, name, blob_field from lw_table4", Collections.singletonList(Utils.makeMap("blob_field",new BlobImpl(DS.s.getBytes(UTF_8)) ) ).iterator()); + + String dc = + + " " + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + ""; + System.out.println(dc); + di.loadAndInit(dc); + redirectTempProperties(di); + + TestDocBuilder.SolrWriterImpl sw = new TestDocBuilder.SolrWriterImpl(); + RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null); + di.runCmd(rp, sw); + assertEquals(DS.s, sw.docs.get(0).getFieldValue("plainText")); + } + + public static class DS extends DataSource { static String s = "hello world"; From ca5e736db1df0cdf35f1b039350bfd5a9cdfa102 Mon Sep 17 00:00:00 2001 From: yonik Date: Wed, 7 Dec 2016 11:08:33 -0500 Subject: [PATCH 21/53] SOLR-9822: speed up single-valued string fieldcache counting in dv facet processor --- solr/CHANGES.txt | 5 + .../facet/FacetFieldProcessorByArrayDV.java | 49 +++++- .../apache/solr/search/facet/FieldUtil.java | 147 ++++++++++++++++++ .../apache/solr/uninverting/FieldCache.java | 2 +- .../solr/uninverting/FieldCacheImpl.java | 142 +++++++++-------- 5 files changed, 270 insertions(+), 75 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 14dd2fae779..8dee8379901 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -62,6 +62,11 @@ Optimizations * SOLR-9579: Make Solr's SchemaField implement Lucene's IndexableFieldType, removing the creation of a Lucene FieldType every time a field is indexed. (John Call, yonik) +* SOLR-9822: JSON Facet API: Recover performance lost due to the DocValues transition to + an iterator API (LUCENE-7407). This only fixes calculating counts for single-valued + string fields from the FieldCache, resulting in up to 56% better throughput for those cases. 
+ (yonik) + ================== 6.4.0 ================== diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java index 88adf6751d6..1481f187df5 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java @@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.solr.common.SolrException; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.Filter; +import org.apache.solr.uninverting.FieldCacheImpl; /** * Grabs values from {@link DocValues}. @@ -184,15 +185,33 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { int segMax = singleDv.getValueCount() + 1; final int[] counts = getCountArr( segMax ); + /** alternate trial implementations + // ord + // FieldUtil.visitOrds(singleDv, disi, (doc,ord)->{counts[ord+1]++;} ); + + FieldUtil.OrdValues ordValues = FieldUtil.getOrdValues(singleDv, disi); + while (ordValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + counts[ ordValues.getOrd() + 1]++; + } + **/ + + + // calculate segment-local counts int doc; - while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - if (singleDv.advanceExact(doc)) { - counts[ singleDv.ordValue() + 1 ]++; - } else { - counts[ 0 ]++; + if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) { + FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv; + while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + counts[fc.getOrd(doc) + 1]++; + } + } else { + while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (singleDv.advanceExact(doc)) { + counts[singleDv.ordValue() + 1]++; + } } } + // convert segment-local counts to global counts for (int i=1; i 0) { @@ -250,12 +269,26 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { private void collectCounts(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException { int doc; - while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - if (singleDv.advanceExact(doc)) { - int segOrd = singleDv.ordValue(); + if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) { + + FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter)singleDv; + while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + int segOrd = fc.getOrd(doc); + if (segOrd < 0) continue; int ord = (int)toGlobal.get(segOrd); countAcc.incrementCount(ord, 1); } + + } else { + + while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (singleDv.advanceExact(doc)) { + int segOrd = singleDv.ordValue(); + int ord = (int) toGlobal.get(segOrd); + countAcc.incrementCount(ord, 1); + } + } + } } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FieldUtil.java b/solr/core/src/java/org/apache/solr/search/facet/FieldUtil.java index 84255b9f9df..389b6d74045 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FieldUtil.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FieldUtil.java @@ -21,10 +21,13 @@ import java.io.IOException; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BytesRef; import 
org.apache.solr.schema.SchemaField; import org.apache.solr.search.QParser; import org.apache.solr.search.QueryContext; import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.uninverting.FieldCacheImpl; /** @lucene.internal * Porting helper... may be removed if it offers no value in the future. @@ -52,4 +55,148 @@ public class FieldUtil { return si == null ? DocValues.emptySortedSet() : si; } + + /** The following ord visitors and wrappers are a work in progress and experimental + * @lucene.experimental */ + @FunctionalInterface + public interface OrdFunc { + void handleOrd(int docid, int ord); // TODO: throw exception? + } + + public static boolean isFieldCache(SortedDocValues singleDv) { + return singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter; + } + + public static void visitOrds(SortedDocValues singleDv, DocIdSetIterator disi, OrdFunc ordFunc) throws IOException { + int doc; + if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) { + FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv; + while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ordFunc.handleOrd(doc, fc.getOrd(doc)); + } + } else { + while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (singleDv.advanceExact(doc)) { + ordFunc.handleOrd(doc, singleDv.ordValue()); + } else { + // TODO: optionally pass in missingOrd? + } + } + } + } + + public static OrdValues getOrdValues(SortedDocValues singleDv, DocIdSetIterator disi) { + if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) { + FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv; + return new FCOrdValues(fc, disi); + } + return new DVOrdValues(singleDv, disi); + } + + + public static abstract class OrdValues extends SortedDocValues { + int doc; + int ord; + + public int getOrd() { + return ord; + } + + @Override + public int docID() { + return doc; + } + + @Override + public abstract int nextDoc() throws IOException; + + @Override + public int advance(int target) throws IOException { + return 0; // TODO + } + + @Override + public long cost() { + return 0; + } + + @Override + public int getValueCount() { + throw new UnsupportedOperationException(); + } + } + + + public static class FCOrdValues extends OrdValues { + FieldCacheImpl.SortedDocValuesImpl.Iter vals; + DocIdSetIterator disi; + + public FCOrdValues(FieldCacheImpl.SortedDocValuesImpl.Iter iter, DocIdSetIterator disi) { + this.vals = iter; + this.disi = disi; + } + + @Override + public int nextDoc() throws IOException { + doc = disi.nextDoc(); + if (doc == NO_MORE_DOCS) return NO_MORE_DOCS; + ord = vals.getOrd(doc); // todo: loop until a hit? 
+ return doc; + } + + @Override + public boolean advanceExact(int target) throws IOException { + return false; + } + + @Override + public int ordValue() { + return 0; + } + + @Override + public BytesRef lookupOrd(int ord) throws IOException { + return null; + } + } + + public static class DVOrdValues extends OrdValues { + SortedDocValues vals; + DocIdSetIterator disi; + int valDoc; + + public DVOrdValues(SortedDocValues vals, DocIdSetIterator disi) { + this.vals = vals; + this.disi = disi; + } + + @Override + public int nextDoc() throws IOException { + for (;;) { + // todo - use skipping when appropriate + doc = disi.nextDoc(); + if (doc == NO_MORE_DOCS) return NO_MORE_DOCS; + boolean match = vals.advanceExact(doc); + if (match) { + ord = vals.ordValue(); + return doc; + } + } + } + + @Override + public boolean advanceExact(int target) throws IOException { + return false; + } + + @Override + public int ordValue() { + return 0; + } + + @Override + public BytesRef lookupOrd(int ord) throws IOException { + return null; + } + } } diff --git a/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java b/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java index ea8f6ea1d3c..32f56152c8c 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java +++ b/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java @@ -45,7 +45,7 @@ import org.apache.lucene.util.RamUsageEstimator; * * @lucene.internal */ -interface FieldCache { +public interface FieldCache { /** * Placeholder indicating creation of this cache is currently in-progress. diff --git a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java index 2224010a85f..0ca687f3952 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java +++ b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java @@ -58,9 +58,9 @@ import org.apache.lucene.util.packed.PackedLongValues; * Expert: The default cache implementation, storing all values in memory. * A WeakHashMap is used for storage. * - * @since lucene 1.4 + * @lucene.internal */ -class FieldCacheImpl implements FieldCache { +public class FieldCacheImpl implements FieldCache { private Map,Cache> caches; FieldCacheImpl() { @@ -786,79 +786,89 @@ class FieldCacheImpl implements FieldCache { this.termOrdToBytesOffset = termOrdToBytesOffset; this.numOrd = numOrd; } - + public SortedDocValues iterator() { - final BytesRef term = new BytesRef(); - return new SortedDocValues() { - private int docID = -1; + return new Iter(); + } - @Override - public int docID() { - return docID; - } + public class Iter extends SortedDocValues { + private int docID = -1; + private final BytesRef term = new BytesRef(); - @Override - public int nextDoc() { - while (true) { - docID++; - if (docID >= docToTermOrd.size()) { - docID = NO_MORE_DOCS; - return docID; - } - if (docToTermOrd.get(docID) != 0) { - return docID; - } - } - } + /** @lucene.internal Specific to this implementation and subject to change. For internal optimization only. 
*/ + public int getOrd(int docID) { + // Subtract 1, matching the 1+ord we did when + // storing, so that missing values, which are 0 in the + // packed ints, are returned as -1 ord: + return (int) docToTermOrd.get(docID)-1; + } - @Override - public int advance(int target) { - if (target < docToTermOrd.size()) { - docID = target; - if (docToTermOrd.get(docID) != 0) { - return docID; - } else{ - return nextDoc(); - } - } else { + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + while (true) { + docID++; + if (docID >= docToTermOrd.size()) { docID = NO_MORE_DOCS; return docID; } - } - - @Override - public boolean advanceExact(int target) throws IOException { - docID = target; - return docToTermOrd.get(docID) != 0; - } - - @Override - public long cost() { - return 0; - } - - @Override - public int ordValue() { - // Subtract 1, matching the 1+ord we did when - // storing, so that missing values, which are 0 in the - // packed ints, are returned as -1 ord: - return (int) docToTermOrd.get(docID)-1; - } - - @Override - public int getValueCount() { - return numOrd; - } - - @Override - public BytesRef lookupOrd(int ord) { - if (ord < 0) { - throw new IllegalArgumentException("ord must be >=0 (got ord=" + ord + ")"); + if (docToTermOrd.get(docID) != 0) { + return docID; } - bytes.fill(term, termOrdToBytesOffset.get(ord)); - return term; } - }; + } + + @Override + public int advance(int target) { + if (target < docToTermOrd.size()) { + docID = target; + if (docToTermOrd.get(docID) != 0) { + return docID; + } else{ + return nextDoc(); + } + } else { + docID = NO_MORE_DOCS; + return docID; + } + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + return docToTermOrd.get(docID) != 0; + } + + @Override + public long cost() { + return 0; + } + + @Override + public int ordValue() { + // Subtract 1, matching the 1+ord we did when + // storing, so that missing values, which are 0 in the + // packed ints, are returned as -1 ord: + return (int) docToTermOrd.get(docID)-1; + } + + @Override + public int getValueCount() { + return numOrd; + } + + @Override + public BytesRef lookupOrd(int ord) { + if (ord < 0) { + throw new IllegalArgumentException("ord must be >=0 (got ord=" + ord + ")"); + } + bytes.fill(term, termOrdToBytesOffset.get(ord)); + return term; + } } @Override From d2ed42b847b1114fe3d0befc788fba55255d4ee2 Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Wed, 7 Dec 2016 18:39:04 +0000 Subject: [PATCH 22/53] removed two unused imports in TestPlainTextEntityProcessor.java --- .../solr/handler/dataimport/TestPlainTextEntityProcessor.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java index a286d841c91..23854382dde 100644 --- a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java +++ b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestPlainTextEntityProcessor.java @@ -21,10 +21,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.StringReader; -import java.nio.charset.StandardCharsets; import java.sql.Blob; import java.sql.SQLException; -import java.util.Arrays; import java.util.Collections; import java.util.Properties; From 
bfc3690d5203cee20550450bac3771e5c2b85cbf Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Wed, 7 Dec 2016 20:43:49 +0000 Subject: [PATCH 23/53] SOLR-8542: couple of tweaks (Michael Nilsson, Diego Ceccarelli, Christine Poerschke) * removed code triplication in ManagedModelStore * LTRScoringQuery.java tweaks * FeatureLogger.makeFeatureVector(...) can now safely be called repeatedly (though that doesn't happen at present) * make Feature.FeatureWeight.extractTerms a no-op; (OriginalScore|SolrFeature)Weight now implement extractTerms * LTRThreadModule javadocs and README.md tweaks * add TestFieldValueFeature.testBooleanValue test; replace "T"/"F" magic string use in FieldValueFeature * add TestOriginalScoreScorer test; add OriginalScoreScorer.freq() method * in TestMultipleAdditiveTreesModel revive dead explain test --- solr/contrib/ltr/README.md | 6 +-- .../org/apache/solr/ltr/FeatureLogger.java | 10 ++-- .../org/apache/solr/ltr/LTRScoringQuery.java | 22 ++++----- .../org/apache/solr/ltr/LTRThreadModule.java | 29 +++++++++++ .../org/apache/solr/ltr/feature/Feature.java | 3 +- .../solr/ltr/feature/FieldValueFeature.java | 18 ++++--- .../ltr/feature/OriginalScoreFeature.java | 12 ++++- .../apache/solr/ltr/feature/SolrFeature.java | 17 +++++-- .../ltr/store/rest/ManagedFeatureStore.java | 1 - .../ltr/store/rest/ManagedModelStore.java | 32 +++++-------- .../solr/collection1/conf/schema.xml | 2 + .../ltr/feature/TestFieldValueFeature.java | 48 ++++++++++++++++--- .../ltr/feature/TestOriginalScoreScorer.java | 47 ++++++++++++++++++ .../model/TestMultipleAdditiveTreesModel.java | 44 ++++++++--------- .../org/apache/solr/schema/BoolField.java | 4 +- 15 files changed, 212 insertions(+), 83 deletions(-) create mode 100644 solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestOriginalScoreScorer.java diff --git a/solr/contrib/ltr/README.md b/solr/contrib/ltr/README.md index 88e2f67b941..2033ffc3e7b 100644 --- a/solr/contrib/ltr/README.md +++ b/solr/contrib/ltr/README.md @@ -390,17 +390,17 @@ About half the time for ranking is spent in the creation of weights for each fea 10 - 5 + 5 10 - 5 + 5 ``` -The threadModule.totalPoolThreads option limits the total number of threads to be used across all query instances at any given time. threadModule.numThreadsPerRequest limits the number of threads used to process a single query. In the above example, 10 threads will be used to services all queries and a maximum of 5 threads to service a single query. If the solr instances is expected to receive no more than one query at a time, it is best to set both these numbers to the same value. If multiple queries need to serviced simultaneously, the numbers can be adjusted based on the expected response times. If the value of threadModule.numThreadsPerRequest is higher, the reponse time for a single query will be improved upto a point. If multiple queries are serviced simultaneously, the threadModule.totalPoolThreads imposes a contention between the queries if (threadModule.numThreadsPerRequest*total parallel queries > threadModule.totalPoolThreads). +The threadModule.totalPoolThreads option limits the total number of threads to be used across all query instances at any given time. threadModule.numThreadsPerRequest limits the number of threads used to process a single query. In the above example, 10 threads will be used to services all queries and a maximum of 5 threads to service a single query. 
If the solr instance is expected to receive no more than one query at a time, it is best to set both these numbers to the same value. If multiple queries need to be serviced simultaneously, the numbers can be adjusted based on the expected response times. If the value of threadModule.numThreadsPerRequest is higher, the response time for a single query will be improved upto a point. If multiple queries are serviced simultaneously, the threadModule.totalPoolThreads imposes a contention between the queries if (threadModule.numThreadsPerRequest*total parallel queries > threadModule.totalPoolThreads). diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/FeatureLogger.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/FeatureLogger.java index a5afd05952c..9c10c2c6917 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/FeatureLogger.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/FeatureLogger.java @@ -151,7 +151,6 @@ public abstract class FeatureLogger { } public static class CSVFeatureLogger extends FeatureLogger { - StringBuilder sb = new StringBuilder(500); char keyValueSep = ':'; char featureSep = ';'; @@ -171,6 +170,10 @@ public abstract class FeatureLogger { @Override public String makeFeatureVector(LTRScoringQuery.FeatureInfo[] featuresInfo) { + // Allocate the buffer to a size based on the number of features instead of the + // default 16. You need space for the name, value, and two separators per feature, + // but not all the features are expected to fire, so this is just a naive estimate. + StringBuilder sb = new StringBuilder(featuresInfo.length * 3); boolean isDense = featureFormat.equals(FeatureFormat.DENSE); for (LTRScoringQuery.FeatureInfo featInfo:featuresInfo) { if (featInfo.isUsed() || isDense){ @@ -181,9 +184,8 @@ public abstract class FeatureLogger { } } - final String features = (sb.length() > 0 ? sb.substring(0, - sb.length() - 1) : ""); - sb.setLength(0); + final String features = (sb.length() > 0 ? 
+ sb.substring(0, sb.length() - 1) : ""); return features; } diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRScoringQuery.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRScoringQuery.java index 991c1edf58f..d60ebf55bb0 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRScoringQuery.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRScoringQuery.java @@ -205,10 +205,10 @@ public class LTRScoringQuery extends Query { List featureWeights = new ArrayList<>(features.size()); if (querySemaphore == null) { - createWeights(searcher, needsScores, boost, featureWeights, features); + createWeights(searcher, needsScores, featureWeights, features); } else{ - createWeightsParallel(searcher, needsScores, boost, featureWeights, features); + createWeightsParallel(searcher, needsScores, featureWeights, features); } int i=0, j = 0; if (this.extractAllFeatures) { @@ -228,7 +228,7 @@ public class LTRScoringQuery extends Query { return new ModelWeight(modelFeaturesWeights, extractedFeatureWeights, allFeatures.size()); } - private void createWeights(IndexSearcher searcher, boolean needsScores, float boost, + private void createWeights(IndexSearcher searcher, boolean needsScores, List featureWeights, Collection features) throws IOException { final SolrQueryRequest req = getRequest(); // since the feature store is a linkedhashmap order is preserved @@ -271,7 +271,7 @@ public class LTRScoringQuery extends Query { } } // end of call CreateWeightCallable - private void createWeightsParallel(IndexSearcher searcher, boolean needsScores, float boost, + private void createWeightsParallel(IndexSearcher searcher, boolean needsScores, List featureWeights, Collection features) throws RuntimeException { final SolrQueryRequest req = getRequest(); @@ -401,8 +401,9 @@ public class LTRScoringQuery extends Query { /** * Goes through all the stored feature values, and calculates the normalized * values for all the features that will be used for scoring. + * Then calculate and return the model's score. 
*/ - private void makeNormalizedFeatures() { + private float makeNormalizedFeaturesAndScore() { int pos = 0; for (final Feature.FeatureWeight feature : modelFeatureWeights) { final int featureId = feature.getIndex(); @@ -415,6 +416,7 @@ public class LTRScoringQuery extends Query { pos++; } ltrScoringModel.normalizeFeaturesInPlace(modelFeatureValuesNormalized); + return ltrScoringModel.score(modelFeatureValuesNormalized); } @Override @@ -491,8 +493,8 @@ public class LTRScoringQuery extends Query { for (final Feature.FeatureWeight.FeatureScorer subSocer : featureScorers) { subSocer.setDocInfo(docInfo); } - if (featureScorers.size() <= 1) { // TODO: Allow the use of dense - // features in other cases + if (featureScorers.size() <= 1) { + // future enhancement: allow the use of dense features in other cases featureTraversalScorer = new DenseModelScorer(weight, featureScorers); } else { featureTraversalScorer = new SparseModelScorer(weight, featureScorers); @@ -570,8 +572,7 @@ public class LTRScoringQuery extends Query { featuresInfo[featureId].setUsed(true); } } - makeNormalizedFeatures(); - return ltrScoringModel.score(modelFeatureValuesNormalized); + return makeNormalizedFeaturesAndScore(); } @Override @@ -663,8 +664,7 @@ public class LTRScoringQuery extends Query { } } } - makeNormalizedFeatures(); - return ltrScoringModel.score(modelFeatureValuesNormalized); + return makeNormalizedFeaturesAndScore(); } @Override diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRThreadModule.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRThreadModule.java index 8e2563f1e08..b8d0bda3a46 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRThreadModule.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRThreadModule.java @@ -29,6 +29,35 @@ import org.apache.solr.util.DefaultSolrThreadFactory; import org.apache.solr.util.SolrPluginUtils; import org.apache.solr.util.plugin.NamedListInitializedPlugin; +/** + * The LTRThreadModule is optionally used by the {@link org.apache.solr.ltr.search.LTRQParserPlugin} and + * {@link org.apache.solr.ltr.response.transform.LTRFeatureLoggerTransformerFactory LTRFeatureLoggerTransformerFactory} + * classes to parallelize the creation of {@link org.apache.solr.ltr.feature.Feature.FeatureWeight Feature.FeatureWeight} + * objects. + *

    + * Example configuration: + *

    +  <queryParser name="ltr" class="org.apache.solr.ltr.search.LTRQParserPlugin">
    +     <int name="threadModule.totalPoolThreads">10</int>
    +     <int name="threadModule.numThreadsPerRequest">5</int>
    +  </queryParser>
    +
    +  <transformer name="features" class="org.apache.solr.ltr.response.transform.LTRFeatureLoggerTransformerFactory">
    +     <int name="threadModule.totalPoolThreads">10</int>
    +     <int name="threadModule.numThreadsPerRequest">5</int>
    +  </transformer>
    +
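    + * With the configuration above, at most ten threads serve all incoming queries and at most five
    + * threads serve any single query: two queries can be serviced fully in parallel (5 * 2 = 10), while
    + * a third concurrent query has to contend for threads (5 * 3 > 10).
    +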
    + * If an individual solr instance is expected to receive no more than one query at a time, it is best + * to set totalPoolThreads and numThreadsPerRequest to the same value. + * + * If multiple queries need to be serviced simultaneously then totalPoolThreads and + * numThreadsPerRequest can be adjusted based on the expected response times. + * + * If the value of numThreadsPerRequest is higher, the response time for a single query + * will be improved up to a point. If multiple queries are serviced simultaneously, the value of + * totalPoolThreads imposes a contention between the queries if + * (totalPoolThreads < numThreadsPerRequest * total parallel queries). + */ final public class LTRThreadModule implements NamedListInitializedPlugin { public static LTRThreadModule getInstance(NamedList args) { diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/Feature.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/Feature.java index 228b964e6b9..48e89423ca1 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/Feature.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/Feature.java @@ -258,8 +258,7 @@ public abstract class Feature extends Query { @Override public void extractTerms(Set terms) { - // needs to be implemented by query subclasses - throw new UnsupportedOperationException(); + // no-op } /** diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java index 279adbc3ca3..5fcf144d89c 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java @@ -29,6 +29,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.BoolField; /** * This feature returns the value of a field in the current document @@ -119,13 +120,16 @@ public class FieldValueFeature extends Feature { return number.floatValue(); } else { final String string = indexableField.stringValue(); - // boolean values in the index are encoded with the - // chars T/F - if (string.equals("T")) { - return 1; - } - if (string.equals("F")) { - return 0; + if (string.length() == 1) { + // boolean values in the index are encoded with the + // a single char contained in TRUE_TOKEN or FALSE_TOKEN + // (see BoolField) + if (string.charAt(0) == BoolField.TRUE_TOKEN[0]) { + return 1; + } + if (string.charAt(0) == BoolField.FALSE_TOKEN[0]) { + return 0; + } } } } catch (final IOException e) { diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/OriginalScoreFeature.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/OriginalScoreFeature.java index 125615cbb4f..549880be06e 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/OriginalScoreFeature.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/OriginalScoreFeature.java @@ -19,8 +19,10 @@ package org.apache.solr.ltr.feature; import java.io.IOException; import java.util.LinkedHashMap; import java.util.Map; +import java.util.Set; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -76,7 +78,10 @@ public class OriginalScoreFeature extends Feature { 
return "OriginalScoreFeature [query:" + originalQuery.toString() + "]"; } - + @Override + public void extractTerms(Set terms) { + w.extractTerms(terms); + } @Override public FeatureScorer scorer(LeafReaderContext context) throws IOException { @@ -102,6 +107,11 @@ public class OriginalScoreFeature extends Feature { return (docInfo.hasOriginalDocScore() ? docInfo.getOriginalDocScore() : originalScorer.score()); } + @Override + public int freq() throws IOException { + return originalScorer.freq(); + } + @Override public int docID() { return originalScorer.docID(); diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/SolrFeature.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/SolrFeature.java index cb7c1a0c81a..13eb96fee2a 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/SolrFeature.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/SolrFeature.java @@ -21,8 +21,10 @@ import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; @@ -123,9 +125,9 @@ public class SolrFeature extends Feature { * Weight for a SolrFeature **/ public class SolrFeatureWeight extends FeatureWeight { - Weight solrQueryWeight; - Query query; - List queryAndFilters; + final private Weight solrQueryWeight; + final private Query query; + final private List queryAndFilters; public SolrFeatureWeight(IndexSearcher searcher, SolrQueryRequest request, Query originalQuery, Map efi) throws IOException { @@ -174,6 +176,8 @@ public class SolrFeature extends Feature { if (query != null) { queryAndFilters.add(query); solrQueryWeight = searcher.createNormalizedWeight(query, true); + } else { + solrQueryWeight = null; } } catch (final SyntaxError e) { throw new FeatureException("Failed to parse feature query.", e); @@ -201,6 +205,13 @@ public class SolrFeature extends Feature { } } + @Override + public void extractTerms(Set terms) { + if (solrQueryWeight != null) { + solrQueryWeight.extractTerms(terms); + } + } + @Override public FeatureScorer scorer(LeafReaderContext context) throws IOException { Scorer solrScorer = null; diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedFeatureStore.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedFeatureStore.java index beb217c5c37..2c7bce58156 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedFeatureStore.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedFeatureStore.java @@ -57,7 +57,6 @@ public class ManagedFeatureStore extends ManagedResource implements ManagedResou /** the feature store rest endpoint **/ public static final String REST_END_POINT = "/schema/feature-store"; - // TODO: reduce from public to package visibility (once tests no longer need public access) /** name of the attribute containing the feature class **/ static final String CLASS_KEY = "class"; diff --git a/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedModelStore.java b/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedModelStore.java index 97aaa4004ad..9c19b0a7c26 100644 --- a/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedModelStore.java +++ b/solr/contrib/ltr/src/java/org/apache/solr/ltr/store/rest/ManagedModelStore.java @@ -61,7 
+61,6 @@ public class ManagedModelStore extends ManagedResource implements ManagedResourc /** the model store rest endpoint **/ public static final String REST_END_POINT = "/schema/model-store"; - // TODO: reduce from public to package visibility (once tests no longer need public access) /** * Managed model store: the name of the attribute containing all the models of @@ -124,16 +123,20 @@ public class ManagedModelStore extends ManagedResource implements ManagedResourc if ((managedData != null) && (managedData instanceof List)) { final List> up = (List>) managedData; for (final Map u : up) { - try { - final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, u, managedFeatureStore); - addModel(algo); - } catch (final ModelException e) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); - } + addModelFromMap(u); } } } + private void addModelFromMap(Map modelMap) { + try { + final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, modelMap, managedFeatureStore); + addModel(algo); + } catch (final ModelException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); + } + } + public synchronized void addModel(LTRScoringModel ltrScoringModel) throws ModelException { try { log.info("adding model {}", ltrScoringModel.getName()); @@ -146,26 +149,17 @@ public class ManagedModelStore extends ManagedResource implements ManagedResourc @SuppressWarnings("unchecked") @Override protected Object applyUpdatesToManagedData(Object updates) { + if (updates instanceof List) { final List> up = (List>) updates; for (final Map u : up) { - try { - final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, u, managedFeatureStore); - addModel(algo); - } catch (final ModelException e) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); - } + addModelFromMap(u); } } if (updates instanceof Map) { final Map map = (Map) updates; - try { - final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, map, managedFeatureStore); - addModel(algo); - } catch (final ModelException e) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); - } + addModelFromMap(map); } return modelsAsManagedResources(store.getModels()); diff --git a/solr/contrib/ltr/src/test-files/solr/collection1/conf/schema.xml b/solr/contrib/ltr/src/test-files/solr/collection1/conf/schema.xml index 15cf140cc09..0b958c0aca3 100644 --- a/solr/contrib/ltr/src/test-files/solr/collection1/conf/schema.xml +++ b/solr/contrib/ltr/src/test-files/solr/collection1/conf/schema.xml @@ -24,6 +24,8 @@ + + diff --git a/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFieldValueFeature.java b/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFieldValueFeature.java index af150c060e4..95742733cc7 100644 --- a/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFieldValueFeature.java +++ b/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFieldValueFeature.java @@ -32,21 +32,21 @@ public class TestFieldValueFeature extends TestRerankBase { setuptest("solrconfig-ltr.xml", "schema.xml"); assertU(adoc("id", "1", "title", "w1", "description", "w1", "popularity", - "1")); + "1","isTrendy","true")); assertU(adoc("id", "2", "title", "w2 2asd asdd didid", "description", "w2 2asd asdd didid", "popularity", "2")); assertU(adoc("id", "3", "title", "w3", "description", "w3", "popularity", - "3")); + "3","isTrendy","true")); assertU(adoc("id", "4", "title", "w4", "description", "w4", "popularity", - "4")); + "4","isTrendy","false")); 
assertU(adoc("id", "5", "title", "w5", "description", "w5", "popularity", - "5")); + "5","isTrendy","true")); assertU(adoc("id", "6", "title", "w1 w2", "description", "w1 w2", - "popularity", "6")); + "popularity", "6","isTrendy","false")); assertU(adoc("id", "7", "title", "w1 w2 w3 w4 w5", "description", - "w1 w2 w3 w4 w5 w8", "popularity", "7")); + "w1 w2 w3 w4 w5 w8", "popularity", "7","isTrendy","true")); assertU(adoc("id", "8", "title", "w1 w1 w1 w2 w2 w8", "description", - "w1 w1 w1 w2 w2", "popularity", "8")); + "w1 w1 w1 w2 w2", "popularity", "8","isTrendy","false")); // a document without the popularity field assertU(adoc("id", "42", "title", "NO popularity", "description", "NO popularity")); @@ -169,5 +169,39 @@ public class TestFieldValueFeature extends TestRerankBase { } + @Test + public void testBooleanValue() throws Exception { + final String fstore = "test_boolean_store"; + loadFeature("trendy", FieldValueFeature.class.getCanonicalName(), fstore, + "{\"field\":\"isTrendy\"}"); + + loadModel("trendy-model", LinearModel.class.getCanonicalName(), + new String[] {"trendy"}, fstore, "{\"weights\":{\"trendy\":1.0}}"); + + SolrQuery query = new SolrQuery(); + query.setQuery("id:4"); + query.add("rq", "{!ltr model=trendy-model reRankDocs=4}"); + query.add("fl", "[fv]"); + assertJQ("/query" + query.toQueryString(), + "/response/docs/[0]/=={'[fv]':'trendy:0.0'}"); + + + query = new SolrQuery(); + query.setQuery("id:5"); + query.add("rq", "{!ltr model=trendy-model reRankDocs=4}"); + query.add("fl", "[fv]"); + assertJQ("/query" + query.toQueryString(), + "/response/docs/[0]/=={'[fv]':'trendy:1.0'}"); + + // check default value is false + query = new SolrQuery(); + query.setQuery("id:2"); + query.add("rq", "{!ltr model=trendy-model reRankDocs=4}"); + query.add("fl", "[fv]"); + assertJQ("/query" + query.toQueryString(), + "/response/docs/[0]/=={'[fv]':'trendy:0.0'}"); + + } + } diff --git a/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestOriginalScoreScorer.java b/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestOriginalScoreScorer.java new file mode 100644 index 00000000000..e85ebedf084 --- /dev/null +++ b/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestOriginalScoreScorer.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.ltr.feature; + +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; + +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +public class TestOriginalScoreScorer extends LuceneTestCase { + + @Test + public void testOverridesAbstractScorerMethods() { + final Class ossClass = OriginalScoreFeature.OriginalScoreWeight.OriginalScoreScorer.class; + for (final Method scorerClassMethod : Scorer.class.getDeclaredMethods()) { + final int modifiers = scorerClassMethod.getModifiers(); + if (!Modifier.isAbstract(modifiers)) continue; + + try { + final Method ossClassMethod = ossClass.getDeclaredMethod( + scorerClassMethod.getName(), + scorerClassMethod.getParameterTypes()); + assertEquals("getReturnType() difference", + scorerClassMethod.getReturnType(), + ossClassMethod.getReturnType()); + } catch (NoSuchMethodException e) { + fail(ossClass + " needs to override '" + scorerClassMethod + "'"); + } + } + } +} diff --git a/solr/contrib/ltr/src/test/org/apache/solr/ltr/model/TestMultipleAdditiveTreesModel.java b/solr/contrib/ltr/src/test/org/apache/solr/ltr/model/TestMultipleAdditiveTreesModel.java index 3748331a43e..560437078cb 100644 --- a/solr/contrib/ltr/src/test/org/apache/solr/ltr/model/TestMultipleAdditiveTreesModel.java +++ b/solr/contrib/ltr/src/test/org/apache/solr/ltr/model/TestMultipleAdditiveTreesModel.java @@ -16,7 +16,7 @@ */ package org.apache.solr.ltr.model; -//import static org.junit.internal.matchers.StringContains.containsString; +import static org.junit.internal.matchers.StringContains.containsString; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.ltr.TestRerankBase; @@ -93,30 +93,28 @@ public class TestMultipleAdditiveTreesModel extends TestRerankBase { // test out the explain feature, make sure it returns something query.setParam("debugQuery", "on"); - String qryResult = JQ("/query" + query.toQueryString()); + String qryResult = JQ("/query" + query.toQueryString()); qryResult = qryResult.replaceAll("\n", " "); - // FIXME containsString doesn't exist. 
- // assertThat(qryResult, containsString("\"debug\":{")); - // qryResult = qryResult.substring(qryResult.indexOf("debug")); - // - // assertThat(qryResult, containsString("\"explain\":{")); - // qryResult = qryResult.substring(qryResult.indexOf("explain")); - // - // assertThat(qryResult, containsString("multipleadditivetreesmodel")); - // assertThat(qryResult, - // containsString(MultipleAdditiveTreesModel.class.getCanonicalName())); - // - // assertThat(qryResult, containsString("-100.0 = tree 0")); - // assertThat(qryResult, containsString("50.0 = tree 0")); - // assertThat(qryResult, containsString("-20.0 = tree 1")); - // assertThat(qryResult, containsString("'matchedTitle':1.0 > 0.5")); - // assertThat(qryResult, containsString("'matchedTitle':0.0 <= 0.5")); - // - // assertThat(qryResult, containsString(" Go Right ")); - // assertThat(qryResult, containsString(" Go Left ")); - // assertThat(qryResult, - // containsString("'this_feature_doesnt_exist' does not exist in FV")); + + assertThat(qryResult, containsString("\"debug\":{")); + qryResult = qryResult.substring(qryResult.indexOf("debug")); + + assertThat(qryResult, containsString("\"explain\":{")); + qryResult = qryResult.substring(qryResult.indexOf("explain")); + + assertThat(qryResult, containsString("multipleadditivetreesmodel")); + assertThat(qryResult, containsString(MultipleAdditiveTreesModel.class.getCanonicalName())); + + assertThat(qryResult, containsString("-100.0 = tree 0")); + assertThat(qryResult, containsString("50.0 = tree 0")); + assertThat(qryResult, containsString("-20.0 = tree 1")); + assertThat(qryResult, containsString("'matchedTitle':1.0 > 0.5")); + assertThat(qryResult, containsString("'matchedTitle':0.0 <= 0.5")); + + assertThat(qryResult, containsString(" Go Right ")); + assertThat(qryResult, containsString(" Go Left ")); + assertThat(qryResult, containsString("'this_feature_doesnt_exist' does not exist in FV")); } @Test diff --git a/solr/core/src/java/org/apache/solr/schema/BoolField.java b/solr/core/src/java/org/apache/solr/schema/BoolField.java index 210ea0ba103..1645ee6cbf6 100644 --- a/solr/core/src/java/org/apache/solr/schema/BoolField.java +++ b/solr/core/src/java/org/apache/solr/schema/BoolField.java @@ -71,8 +71,8 @@ public class BoolField extends PrimitiveFieldType { } // avoid instantiating every time... 
- protected final static char[] TRUE_TOKEN = {'T'}; - protected final static char[] FALSE_TOKEN = {'F'}; + public final static char[] TRUE_TOKEN = {'T'}; + public final static char[] FALSE_TOKEN = {'F'}; //////////////////////////////////////////////////////////////////////// // TODO: look into creating my own queryParser that can more efficiently From cacabc9a4edf299f1dd2e5d08cc66845bc52fe98 Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Wed, 7 Dec 2016 21:16:42 +0000 Subject: [PATCH 24/53] fix java warning in SolrQueryTest --- .../src/test/org/apache/solr/client/solrj/SolrQueryTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java index d27847f6803..1c86c93db53 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java @@ -441,7 +441,7 @@ public class SolrQueryTest extends LuceneTestCase { solrQuery.addMoreLikeThisField("mlt3"); solrQuery.addMoreLikeThisField("mlt4"); assertEquals(4, solrQuery.getMoreLikeThisFields().length); - solrQuery.setMoreLikeThisFields(null); + solrQuery.setMoreLikeThisFields((String[])null); assertTrue(null == solrQuery.getMoreLikeThisFields()); assertFalse(solrQuery.getMoreLikeThis()); From b97d9d7478f99660c1cfc91ef4461b7405254dea Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Wed, 7 Dec 2016 18:59:23 -0500 Subject: [PATCH 25/53] LUCENE-7583: buffer small leaf-block writes in BKDWriter --- lucene/CHANGES.txt | 4 + .../CompressingStoredFieldsWriter.java | 19 +++-- .../CompressingTermVectorsWriter.java | 11 +-- .../GrowableByteArrayDataOutput.java | 32 +++++-- .../org/apache/lucene/util/bkd/BKDWriter.java | 85 ++++++++++--------- .../apache/lucene/util/bkd/DocIdsWriter.java | 4 +- .../TestGrowableByteArrayDataOutput.java | 14 +-- 7 files changed, 101 insertions(+), 68 deletions(-) rename lucene/core/src/java/org/apache/lucene/{codecs/compressing => util}/GrowableByteArrayDataOutput.java (83%) rename lucene/core/src/test/org/apache/lucene/{codecs/compressing => store}/TestGrowableByteArrayDataOutput.java (89%) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c6c39ac45fb..26a9dec0014 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -134,6 +134,10 @@ Optimizations a compressed format, using substantially less RAM in some cases (Adrien Grand, Mike McCandless) +* LUCENE-7583: BKD writing now buffers each leaf block in heap before + writing to disk, giving a small speedup in points-heavy use cases. 
+ (Mike McCandless) + Other * LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index 1956ab70683..cda855defcb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -33,6 +33,7 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.GrowableByteArrayDataOutput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -157,7 +158,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { } this.numStoredFields[numBufferedDocs] = numStoredFieldsInDoc; numStoredFieldsInDoc = 0; - endOffsets[numBufferedDocs] = bufferedDocs.length; + endOffsets[numBufferedDocs] = bufferedDocs.getPosition(); ++numBufferedDocs; if (triggerFlush()) { flush(); @@ -210,7 +211,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { } private boolean triggerFlush() { - return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes + return bufferedDocs.getPosition() >= chunkSize || // chunks of at least chunkSize bytes numBufferedDocs >= maxDocsPerChunk; } @@ -223,23 +224,23 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { lengths[i] = endOffsets[i] - endOffsets[i - 1]; assert lengths[i] >= 0; } - final boolean sliced = bufferedDocs.length >= 2 * chunkSize; + final boolean sliced = bufferedDocs.getPosition() >= 2 * chunkSize; writeHeader(docBase, numBufferedDocs, numStoredFields, lengths, sliced); // compress stored fields to fieldsStream if (sliced) { // big chunk, slice it - for (int compressed = 0; compressed < bufferedDocs.length; compressed += chunkSize) { - compressor.compress(bufferedDocs.bytes, compressed, Math.min(chunkSize, bufferedDocs.length - compressed), fieldsStream); + for (int compressed = 0; compressed < bufferedDocs.getPosition(); compressed += chunkSize) { + compressor.compress(bufferedDocs.getBytes(), compressed, Math.min(chunkSize, bufferedDocs.getPosition() - compressed), fieldsStream); } } else { - compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream); + compressor.compress(bufferedDocs.getBytes(), 0, bufferedDocs.getPosition(), fieldsStream); } // reset docBase += numBufferedDocs; numBufferedDocs = 0; - bufferedDocs.length = 0; + bufferedDocs.reset(); numChunks++; } @@ -459,7 +460,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { flush(); numDirtyChunks++; // incomplete: we had to force this flush } else { - assert bufferedDocs.length == 0; + assert bufferedDocs.getPosition() == 0; } if (docBase != numDocs) { throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs); @@ -468,7 +469,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { fieldsStream.writeVLong(numChunks); fieldsStream.writeVLong(numDirtyChunks); CodecUtil.writeFooter(fieldsStream); - assert bufferedDocs.length == 0; + assert bufferedDocs.getPosition() == 0; } // bulk merge is scary: its caused 
corruption bugs in the past. diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java index 46a289a97b5..9bd2483389e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java @@ -37,6 +37,7 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.GrowableByteArrayDataOutput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -269,8 +270,8 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter { @Override public void finishDocument() throws IOException { // append the payload bytes of the doc after its terms - termSuffixes.writeBytes(payloadBytes.bytes, payloadBytes.length); - payloadBytes.length = 0; + termSuffixes.writeBytes(payloadBytes.getBytes(), payloadBytes.getPosition()); + payloadBytes.reset(); ++numDocs; if (triggerFlush()) { flush(); @@ -316,7 +317,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter { } private boolean triggerFlush() { - return termSuffixes.length >= chunkSize + return termSuffixes.getPosition() >= chunkSize || pendingDocs.size() >= MAX_DOCUMENTS_PER_CHUNK; } @@ -355,14 +356,14 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter { flushPayloadLengths(); // compress terms and payloads and write them to the output - compressor.compress(termSuffixes.bytes, 0, termSuffixes.length, vectorsStream); + compressor.compress(termSuffixes.getBytes(), 0, termSuffixes.getPosition(), vectorsStream); } // reset pendingDocs.clear(); curDoc = null; curField = null; - termSuffixes.length = 0; + termSuffixes.reset(); numChunks++; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java similarity index 83% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java rename to lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java index ec551d14d1f..5f00d4a6ab0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java @@ -14,8 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.store; import java.io.IOException; @@ -25,6 +25,7 @@ import org.apache.lucene.util.UnicodeUtil; /** * A {@link DataOutput} that can be used to build a byte[]. 
+ * * @lucene.internal */ public final class GrowableByteArrayDataOutput extends DataOutput { @@ -33,12 +34,13 @@ public final class GrowableByteArrayDataOutput extends DataOutput { static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536; /** The bytes */ - public byte[] bytes; + private byte[] bytes; + /** The length */ - public int length; + private int length; // scratch for utf8 encoding of small strings - byte[] scratchBytes = new byte[16]; + private byte[] scratchBytes; /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */ public GrowableByteArrayDataOutput(int cp) { @@ -57,7 +59,9 @@ public final class GrowableByteArrayDataOutput extends DataOutput { @Override public void writeBytes(byte[] b, int off, int len) { final int newLength = length + len; - bytes = ArrayUtil.grow(bytes, newLength); + if (newLength > bytes.length) { + bytes = ArrayUtil.grow(bytes, newLength); + } System.arraycopy(b, off, bytes, length, len); length = newLength; } @@ -68,7 +72,11 @@ public final class GrowableByteArrayDataOutput extends DataOutput { if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) { // string is small enough that we don't need to save memory by falling back to double-pass approach // this is just an optimized writeString() that re-uses scratchBytes. - scratchBytes = ArrayUtil.grow(scratchBytes, maxLen); + if (scratchBytes == null) { + scratchBytes = new byte[ArrayUtil.oversize(maxLen, Character.BYTES)]; + } else { + scratchBytes = ArrayUtil.grow(scratchBytes, maxLen); + } int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes); writeVInt(len); writeBytes(scratchBytes, len); @@ -80,4 +88,16 @@ public final class GrowableByteArrayDataOutput extends DataOutput { length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length); } } + + public byte[] getBytes() { + return bytes; + } + + public int getPosition() { + return length; + } + + public void reset() { + length = 0; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index c82a0c8bf25..96575780b44 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -30,7 +30,9 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.GrowableByteArrayDataOutput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; @@ -478,8 +480,8 @@ public class BKDWriter implements Closeable { } build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out, - minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, - new int[maxPointsInLeafNode]); + minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, + new int[maxPointsInLeafNode]); long indexFP = out.getFilePointer(); writeIndex(out, leafBlockFPs, splitPackedValues); @@ -556,6 +558,9 @@ public class BKDWriter implements Closeable { return oneDimWriter.finish(); } + // reused when writing leaf blocks + private final GrowableByteArrayDataOutput scratchOut = new GrowableByteArrayDataOutput(32*1024); + private class OneDimensionBKDWriter { final IndexOutput out; @@ -563,8 +568,8 @@ 
public class BKDWriter implements Closeable { final List leafBlockStartValues = new ArrayList<>(); final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength]; final int[] leafDocs = new int[maxPointsInLeafNode]; - long valueCount; - int leafCount; + private long valueCount; + private int leafCount; OneDimensionBKDWriter(IndexOutput out) { if (numDims != 1) { @@ -589,7 +594,7 @@ public class BKDWriter implements Closeable { // for asserts final byte[] lastPackedValue; - int lastDocID; + private int lastDocID; void add(byte[] packedValue, int docID) throws IOException { assert valueInOrder(valueCount + leafCount, @@ -606,8 +611,7 @@ public class BKDWriter implements Closeable { if (leafCount == maxPointsInLeafNode) { // We write a block once we hit exactly the max count ... this is different from - // when we flush a new segment, where we write between max/2 and max per leaf block, - // so merged segments will behave differently from newly flushed segments: + // when we write N > 1 dimensional points where we write between max/2 and max per leaf block writeLeafBlock(); leafCount = 0; } @@ -644,7 +648,6 @@ public class BKDWriter implements Closeable { } private void writeLeafBlock() throws IOException { - //System.out.println("writeLeafBlock pos=" + out.getFilePointer()); assert leafCount != 0; if (valueCount == 0) { System.arraycopy(leafValues, 0, minPackedValue, 0, packedBytesLength); @@ -660,42 +663,39 @@ public class BKDWriter implements Closeable { leafBlockFPs.add(out.getFilePointer()); checkMaxLeafNodeCount(leafBlockFPs.size()); - Arrays.fill(commonPrefixLengths, bytesPerDim); // Find per-dim common prefix: - for(int dim=0;dim packedValues = new IntFunction() { - final BytesRef scratch = new BytesRef(); - - { - scratch.length = packedBytesLength; - scratch.bytes = leafValues; - } - @Override public BytesRef apply(int i) { - scratch.offset = packedBytesLength * i; - return scratch; + scratchBytesRef1.offset = packedBytesLength * i; + return scratchBytesRef1; } }; assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength), Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength), packedValues, leafDocs, 0); - writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues); + writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues); + out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition()); + scratchOut.reset(); } - } // TODO: there must be a simpler way? 
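
// A minimal sketch of the buffer-then-flush pattern used above: small per-value writes go to a
// reusable in-heap GrowableByteArrayDataOutput, and the finished leaf block is copied to the on-disk
// output with a single writeBytes call. The class and member names here (LeafBlockBufferingSketch,
// scratch, writeBlock) are illustrative only; getBytes(), getPosition() and reset() are the accessors
// added by this change.
import java.io.IOException;

import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IndexOutput;

class LeafBlockBufferingSketch {
  // reused across leaf blocks so the backing byte[] is allocated once and only grown as needed
  private final GrowableByteArrayDataOutput scratch = new GrowableByteArrayDataOutput(32 * 1024);

  void writeBlock(IndexOutput out, int[] docIDs, int count) throws IOException {
    for (int i = 0; i < count; i++) {
      scratch.writeVInt(docIDs[i]);   // many small, cheap writes into the heap buffer
    }
    // one large sequential write to the index output, then reuse the buffer for the next block
    out.writeBytes(scratch.getBytes(), 0, scratch.getPosition());
    scratch.reset();
  }
}
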
@@ -1259,13 +1259,13 @@ public class BKDWriter implements Closeable { out.writeBytes(packedIndex, 0, packedIndex.length); } - private void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException { + private void writeLeafBlockDocs(DataOutput out, int[] docIDs, int start, int count) throws IOException { assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode; out.writeVInt(count); DocIdsWriter.writeDocIds(docIDs, start, count, out); } - private void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction packedValues) throws IOException { + private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction packedValues) throws IOException { int prefixLenSum = Arrays.stream(commonPrefixLengths).sum(); if (prefixLenSum == packedBytesLength) { // all values in this block are equal @@ -1290,7 +1290,7 @@ public class BKDWriter implements Closeable { } } - private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction packedValues) throws IOException { + private void writeLeafBlockPackedValuesRange(DataOutput out, int[] commonPrefixLengths, int start, int end, IntFunction packedValues) throws IOException { for (int i = start; i < end; ++i) { BytesRef ref = packedValues.apply(i); assert ref.length == packedBytesLength; @@ -1316,7 +1316,7 @@ public class BKDWriter implements Closeable { return end - start; } - private void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { + private void writeCommonPrefixes(DataOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { for(int dim=0;dim Date: Thu, 8 Dec 2016 12:03:55 -0500 Subject: [PATCH 26/53] SOLR-9834: A variety of spots in the code can create a collection zk node after the collection has been removed. --- solr/CHANGES.txt | 3 + .../solr/cloud/CloudConfigSetService.java | 24 ++- .../apache/solr/cloud/CloudDescriptor.java | 2 +- .../solr/cloud/CreateCollectionCmd.java | 146 +++++++++++++++++- .../apache/solr/cloud/ElectionContext.java | 20 +-- .../org/apache/solr/cloud/LeaderElector.java | 9 +- .../org/apache/solr/cloud/ZkController.java | 128 +-------------- .../solr/cloud/ZkSolrResourceLoader.java | 2 +- .../apache/solr/core/ConfigSetService.java | 7 +- .../handler/admin/CollectionsHandler.java | 2 + .../apache/solr/cloud/LeaderElectionTest.java | 2 + ...rseerCollectionConfigSetProcessorTest.java | 33 ++-- .../apache/solr/cloud/ZkSolrClientTest.java | 54 +++++++ .../solr/common/cloud/SolrZkClient.java | 31 +++- .../solr/common/cloud/ZkCmdExecutor.java | 15 +- 15 files changed, 308 insertions(+), 170 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 8dee8379901..abd99978022 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -230,6 +230,9 @@ Bug Fixes * SOLR-9832: Schema modifications are not immediately visible on the coordinating node. (Steve Rowe) +* SOLR-9834: A variety of spots in the code can create a collection zk node after the collection has been + removed. 
(Mark Miller) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java b/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java index bf11e921850..6e0583f2ca1 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java +++ b/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java @@ -16,12 +16,20 @@ */ package org.apache.solr.cloud; +import java.lang.invoke.MethodHandles; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.core.ConfigSetService; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.core.SolrResourceLoader; +import org.apache.zookeeper.KeeperException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class CloudConfigSetService extends ConfigSetService { - + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final ZkController zkController; public CloudConfigSetService(SolrResourceLoader loader, ZkController zkController) { @@ -31,8 +39,18 @@ public class CloudConfigSetService extends ConfigSetService { @Override public SolrResourceLoader createCoreResourceLoader(CoreDescriptor cd) { - // TODO: Shouldn't the collection node be created by the Collections API? - zkController.createCollectionZkNode(cd.getCloudDescriptor()); + try { + // for back compat with cores that can create collections without the collections API + if (!zkController.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + cd.getCollectionName(), true)) { + CreateCollectionCmd.createCollectionZkNode(zkController.getZkClient(), cd.getCollectionName(), cd.getCloudDescriptor().getParams()); + } + } catch (KeeperException e) { + SolrException.log(log, null, e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + SolrException.log(log, null, e); + } + String configName = zkController.getZkStateReader().readConfigName(cd.getCollectionName()); return new ZkSolrResourceLoader(cd.getInstanceDir(), configName, parentLoader.getClassLoader(), cd.getSubstitutableProperties(), zkController); diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java index 4dd1527b0c1..fdc7b02dae5 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java +++ b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java @@ -33,7 +33,7 @@ public class CloudDescriptor { private String roles = null; private Integer numShards; private String nodeName = null; - private Map collectionParams = new HashMap<>(); + private Map collectionParams = new HashMap<>(); private volatile boolean isLeader = false; diff --git a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java index a067b4ae65f..a1bb70e36ab 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java @@ -25,19 +25,23 @@ import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.concurrent.TimeUnit; import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd; import org.apache.solr.cloud.overseer.ClusterStateMutator; import org.apache.solr.cloud.rule.ReplicaAssigner; import org.apache.solr.common.SolrException; +import 
org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.DocRouter; import org.apache.solr.common.cloud.ImplicitDocRouter; import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkConfigManager; import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.cloud.ZooKeeperException; import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; @@ -46,7 +50,9 @@ import org.apache.solr.common.util.Utils; import org.apache.solr.handler.component.ShardHandler; import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.util.TimeOut; +import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.KeeperException.NoNodeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,9 +70,11 @@ import static org.apache.solr.common.util.StrUtils.formatString; public class CreateCollectionCmd implements Cmd { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private final OverseerCollectionMessageHandler ocmh; + private SolrZkClient zkClient; public CreateCollectionCmd(OverseerCollectionMessageHandler ocmh) { this.ocmh = ocmh; + this.zkClient = ocmh.zkStateReader.getZkClient(); } @Override @@ -84,7 +92,6 @@ public class CreateCollectionCmd implements Cmd { ocmh.validateConfigOrThrowSolrException(configName); - try { // look at the replication factor and see if it matches reality // if it does not, find best nodes to create more cores @@ -157,10 +164,20 @@ public class CreateCollectionCmd implements Cmd { } ZkStateReader zkStateReader = ocmh.zkStateReader; - boolean isLegacyCloud = Overseer.isLegacy(zkStateReader); + boolean isLegacyCloud = Overseer.isLegacy(zkStateReader); ocmh.createConfNode(configName, collectionName, isLegacyCloud); + Map collectionParams = new HashMap<>(); + Map collectionProps = message.getProperties(); + for (String propName : collectionProps.keySet()) { + if (propName.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) { + collectionParams.put(propName.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), (String) collectionProps.get(propName)); + } + } + + createCollectionZkNode(zkClient, collectionName, collectionParams); + Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message)); // wait for a while until we don't see the collection @@ -288,4 +305,129 @@ public class CreateCollectionCmd implements Cmd { } return configName; } + + public static void createCollectionZkNode(SolrZkClient zkClient, String collection, Map params) { + log.debug("Check for collection zkNode:" + collection); + String collectionPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection; + + try { + if (!zkClient.exists(collectionPath, true)) { + log.debug("Creating collection in ZooKeeper:" + collection); + + try { + Map collectionProps = new HashMap<>(); + + // TODO: if collection.configName isn't set, and there isn't already a conf in zk, just use that? 
+ String defaultConfigName = System.getProperty(ZkController.COLLECTION_PARAM_PREFIX + ZkController.CONFIGNAME_PROP, collection); + + if (params.size() > 0) { + collectionProps.putAll(params); + // if the config name wasn't passed in, use the default + if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP)) { + // users can create the collection node and conf link ahead of time, or this may return another option + getConfName(zkClient, collection, collectionPath, collectionProps); + } + + } else if (System.getProperty("bootstrap_confdir") != null) { + // if we are bootstrapping a collection, default the config for + // a new collection to the collection we are bootstrapping + log.info("Setting config for collection:" + collection + " to " + defaultConfigName); + + Properties sysProps = System.getProperties(); + for (String sprop : System.getProperties().stringPropertyNames()) { + if (sprop.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) { + collectionProps.put(sprop.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), sysProps.getProperty(sprop)); + } + } + + // if the config name wasn't passed in, use the default + if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP)) + collectionProps.put(ZkController.CONFIGNAME_PROP, defaultConfigName); + + } else if (Boolean.getBoolean("bootstrap_conf")) { + // the conf name should should be the collection name of this core + collectionProps.put(ZkController.CONFIGNAME_PROP, collection); + } else { + getConfName(zkClient, collection, collectionPath, collectionProps); + } + + collectionProps.remove(ZkStateReader.NUM_SHARDS_PROP); // we don't put numShards in the collections properties + + ZkNodeProps zkProps = new ZkNodeProps(collectionProps); + zkClient.makePath(collectionPath, Utils.toJSON(zkProps), CreateMode.PERSISTENT, null, true); + + } catch (KeeperException e) { + // it's okay if the node already exists + if (e.code() != KeeperException.Code.NODEEXISTS) { + throw e; + } + } + } else { + log.debug("Collection zkNode exists"); + } + + } catch (KeeperException e) { + // it's okay if another beats us creating the node + if (e.code() == KeeperException.Code.NODEEXISTS) { + return; + } + throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e); + } catch (InterruptedException e) { + Thread.interrupted(); + throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e); + } + + } + + private static void getConfName(SolrZkClient zkClient, String collection, String collectionPath, Map collectionProps) throws KeeperException, + InterruptedException { + // check for configName + log.debug("Looking for collection configName"); + if (collectionProps.containsKey("configName")) { + log.info("configName was passed as a param {}", collectionProps.get("configName")); + return; + } + + List configNames = null; + int retry = 1; + int retryLimt = 6; + for (; retry < retryLimt; retry++) { + if (zkClient.exists(collectionPath, true)) { + ZkNodeProps cProps = ZkNodeProps.load(zkClient.getData(collectionPath, null, null, true)); + if (cProps.containsKey(ZkController.CONFIGNAME_PROP)) { + break; + } + } + + // if there is only one conf, use that + try { + configNames = zkClient.getChildren(ZkConfigManager.CONFIGS_ZKNODE, null, + true); + } catch (NoNodeException e) { + // just keep trying + } + if (configNames != null && configNames.size() == 1) { + // no config set named, but there is only 1 - use it + log.info("Only one config set found in zk - using it:" + 
configNames.get(0)); + collectionProps.put(ZkController.CONFIGNAME_PROP, configNames.get(0)); + break; + } + + if (configNames != null && configNames.contains(collection)) { + log.info( + "Could not find explicit collection configName, but found config name matching collection name - using that set."); + collectionProps.put(ZkController.CONFIGNAME_PROP, collection); + break; + } + + log.info("Could not find collection configName - pausing for 3 seconds and trying again - try: " + retry); + Thread.sleep(3000); + } + if (retry == retryLimt) { + log.error("Could not find configName for collection " + collection); + throw new ZooKeeperException( + SolrException.ErrorCode.SERVER_ERROR, + "Could not find configName for collection " + collection + " found:" + configNames); + } + } } diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java index 183f1774ee7..b3cd58566d9 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java +++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java @@ -125,17 +125,6 @@ class ShardLeaderElectionContextBase extends ElectionContext { this.zkClient = zkStateReader.getZkClient(); this.shardId = shardId; this.collection = collection; - - try { - new ZkCmdExecutor(zkStateReader.getZkClient().getZkClientTimeout()) - .ensureExists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, - zkClient); - } catch (KeeperException e) { - throw new SolrException(ErrorCode.SERVER_ERROR, e); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new SolrException(ErrorCode.SERVER_ERROR, e); - } } @Override @@ -175,9 +164,16 @@ class ShardLeaderElectionContextBase extends ElectionContext { void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStartMs) throws KeeperException, InterruptedException, IOException { // register as leader - if an ephemeral is already there, wait to see if it goes away + + if (!zkClient.exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, true)) { + log.info("Will not register as leader because collection appears to be gone."); + return; + } + String parent = new Path(leaderPath).getParent().toString(); ZkCmdExecutor zcmd = new ZkCmdExecutor(30000); - zcmd.ensureExists(parent, zkClient); + // only if /collections/{collection} exists already do we succeed in creating this path + zcmd.ensureExists(parent, (byte[])null, CreateMode.PERSISTENT, zkClient, 2); try { RetryUtil.retryOnThrowable(NodeExistsException.class, 60000, 5000, () -> { diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java index 71fdcfd8088..aa8943d68b1 100644 --- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java +++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java @@ -360,8 +360,13 @@ public class LeaderElector { public void setup(final ElectionContext context) throws InterruptedException, KeeperException { String electZKPath = context.electionPath + LeaderElector.ELECTION_NODE; - - zkCmdExecutor.ensureExists(electZKPath, zkClient); + if (context instanceof OverseerElectionContext) { + zkCmdExecutor.ensureExists(electZKPath, zkClient); + } else { + // we use 2 param so that replica won't create /collection/{collection} if it doesn't exist + zkCmdExecutor.ensureExists(electZKPath, (byte[])null, CreateMode.PERSISTENT, zkClient, 2); + } + this.context = context; } diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java 
b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index c0a8d555000..eba7067a90a 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -34,7 +34,6 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; @@ -1273,130 +1272,6 @@ public class ZkController { zkClient.printLayoutToStdOut(); } - public void createCollectionZkNode(CloudDescriptor cd) { - String collection = cd.getCollectionName(); - - log.debug("Check for collection zkNode:" + collection); - String collectionPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection; - - try { - if (!zkClient.exists(collectionPath, true)) { - log.debug("Creating collection in ZooKeeper:" + collection); - - try { - Map collectionProps = new HashMap<>(); - - // TODO: if collection.configName isn't set, and there isn't already a conf in zk, just use that? - String defaultConfigName = System.getProperty(COLLECTION_PARAM_PREFIX + CONFIGNAME_PROP, collection); - - // params passed in - currently only done via core admin (create core commmand). - if (cd.getParams().size() > 0) { - collectionProps.putAll(cd.getParams()); - // if the config name wasn't passed in, use the default - if (!collectionProps.containsKey(CONFIGNAME_PROP)) { - // TODO: getting the configName from the collectionPath should fail since we already know it doesn't exist? - getConfName(collection, collectionPath, collectionProps); - } - - } else if (System.getProperty("bootstrap_confdir") != null) { - // if we are bootstrapping a collection, default the config for - // a new collection to the collection we are bootstrapping - log.info("Setting config for collection:" + collection + " to " + defaultConfigName); - - Properties sysProps = System.getProperties(); - for (String sprop : System.getProperties().stringPropertyNames()) { - if (sprop.startsWith(COLLECTION_PARAM_PREFIX)) { - collectionProps.put(sprop.substring(COLLECTION_PARAM_PREFIX.length()), sysProps.getProperty(sprop)); - } - } - - // if the config name wasn't passed in, use the default - if (!collectionProps.containsKey(CONFIGNAME_PROP)) - collectionProps.put(CONFIGNAME_PROP, defaultConfigName); - - } else if (Boolean.getBoolean("bootstrap_conf")) { - // the conf name should should be the collection name of this core - collectionProps.put(CONFIGNAME_PROP, cd.getCollectionName()); - } else { - getConfName(collection, collectionPath, collectionProps); - } - - collectionProps.remove(ZkStateReader.NUM_SHARDS_PROP); // we don't put numShards in the collections properties - - ZkNodeProps zkProps = new ZkNodeProps(collectionProps); - zkClient.makePath(collectionPath, Utils.toJSON(zkProps), CreateMode.PERSISTENT, null, true); - - } catch (KeeperException e) { - // it's okay if the node already exists - if (e.code() != KeeperException.Code.NODEEXISTS) { - throw e; - } - } - } else { - log.debug("Collection zkNode exists"); - } - - } catch (KeeperException e) { - // it's okay if another beats us creating the node - if (e.code() == KeeperException.Code.NODEEXISTS) { - return; - } - throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e); - } catch (InterruptedException e) { - Thread.interrupted(); - throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e); - } - - } - - - private void 
getConfName(String collection, String collectionPath, - Map collectionProps) throws KeeperException, - InterruptedException { - // check for configName - log.debug("Looking for collection configName"); - List configNames = null; - int retry = 1; - int retryLimt = 6; - for (; retry < retryLimt; retry++) { - if (zkClient.exists(collectionPath, true)) { - ZkNodeProps cProps = ZkNodeProps.load(zkClient.getData(collectionPath, null, null, true)); - if (cProps.containsKey(CONFIGNAME_PROP)) { - break; - } - } - - // if there is only one conf, use that - try { - configNames = zkClient.getChildren(ZkConfigManager.CONFIGS_ZKNODE, null, - true); - } catch (NoNodeException e) { - // just keep trying - } - if (configNames != null && configNames.size() == 1) { - // no config set named, but there is only 1 - use it - log.info("Only one config set found in zk - using it:" + configNames.get(0)); - collectionProps.put(CONFIGNAME_PROP, configNames.get(0)); - break; - } - - if (configNames != null && configNames.contains(collection)) { - log.info("Could not find explicit collection configName, but found config name matching collection name - using that set."); - collectionProps.put(CONFIGNAME_PROP, collection); - break; - } - - log.info("Could not find collection configName - pausing for 3 seconds and trying again - try: " + retry); - Thread.sleep(3000); - } - if (retry == retryLimt) { - log.error("Could not find configName for collection " + collection); - throw new ZooKeeperException( - SolrException.ErrorCode.SERVER_ERROR, - "Could not find configName for collection " + collection + " found:" + configNames); - } - } - public ZkStateReader getZkStateReader() { return zkStateReader; } @@ -2175,7 +2050,8 @@ public class ZkController { } else { String parentZNodePath = getLeaderInitiatedRecoveryZnodePath(collection, shardId); try { - zkClient.makePath(parentZNodePath, retryOnConnLoss); + // make sure we don't create /collections/{collection} if they do not exist with 2 param + zkClient.makePath(parentZNodePath, (byte[]) null, CreateMode.PERSISTENT, (Watcher) null, true, retryOnConnLoss, 2); } catch (KeeperException.NodeExistsException nee) { // if it exists, that's great! 
} diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java b/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java index 209ca68c566..b4137b31d6c 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java @@ -109,7 +109,7 @@ public class ZkSolrResourceLoader extends SolrResourceLoader { } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new IOException("Error opening " + file, e); - } catch (KeeperException e) { + } catch (Exception e) { throw new IOException("Error opening " + file, e); } } diff --git a/solr/core/src/java/org/apache/solr/core/ConfigSetService.java b/solr/core/src/java/org/apache/solr/core/ConfigSetService.java index 3f47f467a14..e4a135e4c65 100644 --- a/solr/core/src/java/org/apache/solr/core/ConfigSetService.java +++ b/solr/core/src/java/org/apache/solr/core/ConfigSetService.java @@ -78,11 +78,10 @@ public abstract class ConfigSetService { IndexSchema schema = createIndexSchema(dcore, solrConfig); NamedList properties = createConfigSetProperties(dcore, coreLoader); return new ConfigSet(configName(dcore), solrConfig, schema, properties); - } - catch (Exception e) { + } catch (Exception e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, - "Could not load conf for core " + dcore.getName() + - ": " + e.getMessage(), e); + "Could not load conf for core " + dcore.getName() + + ": " + e.getMessage(), e); } } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index 01095a1143b..1915176070d 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -346,9 +346,11 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission try { String path = ZkStateReader.CONFIGS_ZKNODE + "/" + SYSTEM_COLL + "/schema.xml"; byte[] data = IOUtils.toByteArray(Thread.currentThread().getContextClassLoader().getResourceAsStream("SystemCollectionSchema.xml")); + assert data != null && data.length > 0; cmdExecutor.ensureExists(path, data, CreateMode.PERSISTENT, zk); path = ZkStateReader.CONFIGS_ZKNODE + "/" + SYSTEM_COLL + "/solrconfig.xml"; data = IOUtils.toByteArray(Thread.currentThread().getContextClassLoader().getResourceAsStream("SystemCollectionSolrConfig.xml")); + assert data != null && data.length > 0; cmdExecutor.ensureExists(path, data, CreateMode.PERSISTENT, zk); } catch (IOException e) { throw new SolrException(ErrorCode.SERVER_ERROR, e); diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java index 8e1be10959a..2582872259a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java @@ -80,6 +80,8 @@ public class LeaderElectionTest extends SolrTestCaseJ4 { zkClient = new SolrZkClient(server.getZkAddress(), TIMEOUT); zkStateReader = new ZkStateReader(zkClient); seqToThread = Collections.synchronizedMap(new HashMap()); + zkClient.makePath("/collections/collection1", true); + zkClient.makePath("/collections/collection2", true); } class TestLeaderElectionContext extends ShardLeaderElectionContextBase { diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java 
b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java index 239afa1f60d..6a7906db907 100644 --- a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java @@ -44,6 +44,7 @@ import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.handler.component.ShardResponse; import org.apache.solr.util.TimeOut; import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.Watcher; import org.easymock.Capture; import org.easymock.EasyMock; import org.junit.After; @@ -114,7 +115,6 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 { zkStateReaderMock = createMock(ZkStateReader.class); clusterStateMock = createMock(ClusterState.class); solrZkClientMock = createMock(SolrZkClient.class); - } @AfterClass @@ -143,9 +143,7 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 { reset(zkStateReaderMock); reset(clusterStateMock); reset(solrZkClientMock); - underTest = new OverseerCollectionConfigSetProcessorToBeTested(zkStateReaderMock, - "1234", shardHandlerFactoryMock, ADMIN_PATH, workQueueMock, runningMapMock, - completedMapMock, failureMapMock); + zkMap.clear(); collectionsSet.clear(); } @@ -157,12 +155,12 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 { } protected Set commonMocks(int liveNodesCount) throws Exception { - shardHandlerFactoryMock.getShardHandler(); expectLastCall().andAnswer(() -> { log.info("SHARDHANDLER"); return shardHandlerMock; }).anyTimes(); + workQueueMock.peekTopN(EasyMock.anyInt(), anyObject(Predicate.class), EasyMock.anyLong()); expectLastCall().andAnswer(() -> { Object result; @@ -203,12 +201,12 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 { workQueueMock.poll(); expectLastCall().andAnswer(() -> queue.poll()).anyTimes(); - - zkStateReaderMock.getClusterState(); - expectLastCall().andAnswer(() -> clusterStateMock).anyTimes(); zkStateReaderMock.getZkClient(); expectLastCall().andAnswer(() -> solrZkClientMock).anyTimes(); + + zkStateReaderMock.getClusterState(); + expectLastCall().andAnswer(() -> clusterStateMock).anyTimes(); zkStateReaderMock.updateClusterState(); @@ -262,6 +260,18 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 { String key = (String) getCurrentArguments()[0]; return key; }).anyTimes(); + + solrZkClientMock.makePath(anyObject(String.class), anyObject(byte[].class), anyObject(CreateMode.class), anyObject(Watcher.class), anyBoolean()); + expectLastCall().andAnswer(() -> { + String key = (String) getCurrentArguments()[0]; + return key; + }).anyTimes(); + + solrZkClientMock.makePath(anyObject(String.class), anyObject(byte[].class), anyObject(CreateMode.class), anyObject(Watcher.class), anyBoolean(), anyBoolean(), anyInt()); + expectLastCall().andAnswer(() -> { + String key = (String) getCurrentArguments()[0]; + return key; + }).anyTimes(); solrZkClientMock.exists(anyObject(String.class),anyBoolean()); expectLastCall().andAnswer(() -> { @@ -518,12 +528,17 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 { replicationFactor); } - replay(workQueueMock); replay(solrZkClientMock); replay(zkStateReaderMock); + replay(workQueueMock); replay(clusterStateMock); replay(shardHandlerFactoryMock); replay(shardHandlerMock); + + + underTest = new OverseerCollectionConfigSetProcessorToBeTested(zkStateReaderMock, + 
"1234", shardHandlerFactoryMock, ADMIN_PATH, workQueueMock, runningMapMock, + completedMapMock, failureMapMock); log.info("clusterstate " + clusterStateMock.hashCode()); diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java index 39ef1b8394d..faa2ba74a05 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java @@ -26,6 +26,7 @@ import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkCmdExecutor; import org.apache.solr.common.cloud.ZkOperation; import org.apache.solr.util.AbstractSolrTestCase; +import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; @@ -304,6 +305,59 @@ public class ZkSolrClientTest extends AbstractSolrTestCase { } } + + public void testSkipPathPartsOnMakePath() throws Exception { + try (ZkConnection conn = new ZkConnection()) { + final SolrZkClient zkClient = conn.getClient(); + + zkClient.makePath("/test", true); + + // should work + zkClient.makePath("/test/path/here", (byte[]) null, CreateMode.PERSISTENT, (Watcher) null, true, true, 1); + + zkClient.clean("/"); + + // should not work + try { + zkClient.makePath("/test/path/here", (byte[]) null, CreateMode.PERSISTENT, (Watcher) null, true, true, 1); + fail("We should not be able to create this path"); + } catch (Exception e) { + + } + + zkClient.clean("/"); + + ZkCmdExecutor zkCmdExecutor = new ZkCmdExecutor(30000); + try { + zkCmdExecutor.ensureExists("/collection/collection/leader", (byte[]) null, CreateMode.PERSISTENT, zkClient, 2); + fail("We should not be able to create this path"); + } catch (Exception e) { + + } + + zkClient.makePath("/collection", true); + + try { + zkCmdExecutor.ensureExists("/collections/collection/leader", (byte[]) null, CreateMode.PERSISTENT, zkClient, 2); + fail("We should not be able to create this path"); + } catch (Exception e) { + + } + zkClient.makePath("/collection/collection", true); + + byte[] bytes = new byte[10]; + zkCmdExecutor.ensureExists("/collection/collection", bytes, CreateMode.PERSISTENT, zkClient, 2); + + byte[] returnedBytes = zkClient.getData("/collection/collection", null, null, true); + + assertNull("We skipped 2 path parts, so data won't be written", returnedBytes); + + zkClient.makePath("/collection/collection/leader", true); + + zkCmdExecutor.ensureExists("/collection/collection/leader", (byte[]) null, CreateMode.PERSISTENT, zkClient, 2); + + } + } @Override public void tearDown() throws Exception { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java index 422d9e5d7a9..3f8deea5ace 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java @@ -411,13 +411,13 @@ public class SolrZkClient implements Closeable { public void makePath(String path, boolean failOnExists, boolean retryOnConnLoss) throws KeeperException, InterruptedException { - makePath(path, null, CreateMode.PERSISTENT, null, failOnExists, retryOnConnLoss); + makePath(path, null, CreateMode.PERSISTENT, null, failOnExists, retryOnConnLoss, 0); } public void makePath(String path, File file, boolean failOnExists, boolean retryOnConnLoss) throws IOException, KeeperException, InterruptedException { makePath(path, 
FileUtils.readFileToByteArray(file), - CreateMode.PERSISTENT, null, failOnExists, retryOnConnLoss); + CreateMode.PERSISTENT, null, failOnExists, retryOnConnLoss, 0); } public void makePath(String path, File file, boolean retryOnConnLoss) throws IOException, @@ -463,21 +463,35 @@ public class SolrZkClient implements Closeable { */ public void makePath(String path, byte[] data, CreateMode createMode, Watcher watcher, boolean retryOnConnLoss) throws KeeperException, InterruptedException { - makePath(path, data, createMode, watcher, true, retryOnConnLoss); + makePath(path, data, createMode, watcher, true, retryOnConnLoss, 0); } - - + /** * Creates the path in ZooKeeper, creating each node as necessary. * * e.g. If path=/solr/group/node and none of the nodes, solr, * group, node exist, each will be created. * + * @param data to set on the last zkNode + */ + public void makePath(String path, byte[] data, CreateMode createMode, + Watcher watcher, boolean failOnExists, boolean retryOnConnLoss) throws KeeperException, InterruptedException { + makePath(path, data, createMode, watcher, failOnExists, retryOnConnLoss, 0); + } + + /** + * Creates the path in ZooKeeper, creating each node as necessary. + * + * e.g. If path=/solr/group/node and none of the nodes, solr, + * group, node exist, each will be created. + * + * skipPathParts will force the call to fail if the first skipPathParts do not exist already. + * * Note: retryOnConnLoss is only respected for the final node - nodes * before that are always retried on connection loss. */ public void makePath(String path, byte[] data, CreateMode createMode, - Watcher watcher, boolean failOnExists, boolean retryOnConnLoss) throws KeeperException, InterruptedException { + Watcher watcher, boolean failOnExists, boolean retryOnConnLoss, int skipPathParts) throws KeeperException, InterruptedException { log.debug("makePath: {}", path); boolean retry = true; @@ -487,9 +501,12 @@ public class SolrZkClient implements Closeable { String[] paths = path.split("/"); StringBuilder sbPath = new StringBuilder(); for (int i = 0; i < paths.length; i++) { - byte[] bytes = null; String pathPiece = paths[i]; sbPath.append("/" + pathPiece); + if (i < skipPathParts) { + continue; + } + byte[] bytes = null; final String currentPath = sbPath.toString(); Object exists = exists(currentPath, watcher, retryOnConnLoss); if (exists == null || ((i == paths.length -1) && failOnExists)) { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java index 0f50f0a81ce..c27f7671bc8 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java @@ -75,17 +75,26 @@ public class ZkCmdExecutor { } public void ensureExists(String path, final SolrZkClient zkClient) throws KeeperException, InterruptedException { - ensureExists(path, null, CreateMode.PERSISTENT, zkClient); + ensureExists(path, null, CreateMode.PERSISTENT, zkClient, 0); + } + + + public void ensureExists(String path, final byte[] data, final SolrZkClient zkClient) throws KeeperException, InterruptedException { + ensureExists(path, data, CreateMode.PERSISTENT, zkClient, 0); + } + + public void ensureExists(String path, final byte[] data, CreateMode createMode, final SolrZkClient zkClient) throws KeeperException, InterruptedException { + ensureExists(path, data, createMode, zkClient, 0); } public void ensureExists(final String path, final byte[] data, - 
CreateMode createMode, final SolrZkClient zkClient) throws KeeperException, InterruptedException { + CreateMode createMode, final SolrZkClient zkClient, int skipPathParts) throws KeeperException, InterruptedException { if (zkClient.exists(path, true)) { return; } try { - zkClient.makePath(path, data, createMode, true); + zkClient.makePath(path, data, createMode, null, true, true, skipPathParts); } catch (NodeExistsException e) { // it's okay if another beats us creating the node } From 93c11462bbe2c442f20a6d090911c5a1a4546564 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Thu, 8 Dec 2016 18:17:25 -0500 Subject: [PATCH 27/53] fix stale comment --- .../apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java index 9d2db890fa0..c3217f30156 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java @@ -72,7 +72,7 @@ class SimpleTextPointsWriter extends PointsWriter { PointValues values = reader.getValues(fieldInfo.name); boolean singleValuePerDoc = values.size() == values.getDocCount(); - // We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk: + // We use our own fork of the BKDWriter to customize how it writes the index and blocks to disk: try (SimpleTextBKDWriter writer = new SimpleTextBKDWriter(writeState.segmentInfo.maxDoc(), writeState.directory, writeState.segmentInfo.name, From 1d2e440a8fe3df8d3207a7428841f79f63381e4f Mon Sep 17 00:00:00 2001 From: yonik Date: Thu, 8 Dec 2016 18:29:07 -0500 Subject: [PATCH 28/53] SOLR-9837: fix redundant calculation of docsWithField for numeric fields in field cache --- solr/CHANGES.txt | 4 ++ .../solr/uninverting/FieldCacheImpl.java | 52 ++++++++++--------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index abd99978022..78f7f5580f9 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -50,6 +50,10 @@ Bug Fixes * SOLR-9262: Connection and read timeouts are being ignored by UpdateShardHandler after SOLR-4509. (Mark Miller, shalin) +* SOLR-9837: Fix 55% performance regression of FieldCache uninvert time of + numeric fields. 
(yonik) + + Optimizations ---------------------- diff --git a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java index 0ca687f3952..90be40070ba 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java +++ b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java @@ -365,8 +365,6 @@ public class FieldCacheImpl implements FieldCache { } } - /** @deprecated remove this when legacy numerics are removed */ - @Deprecated protected abstract TermsEnum termsEnum(Terms terms) throws IOException; protected abstract void visitTerm(BytesRef term); protected abstract void visitDoc(int docID); @@ -632,20 +630,21 @@ public class FieldCacheImpl implements FieldCache { } } - Bits docsWithField = getDocsWithField(reader, field, parser); - return ((LongsFromArray) caches.get(Long.TYPE).get(reader, new CacheKey(field, parser))).iterator(docsWithField); + return ((LongsFromArray) caches.get(Long.TYPE).get(reader, new CacheKey(field, parser))).iterator(); } } - static class LongsFromArray implements Accountable { + public static class LongsFromArray implements Accountable { private final PackedInts.Reader values; private final long minValue; + private final Bits docsWithField; private final String field; - public LongsFromArray(String field, PackedInts.Reader values, long minValue) { + public LongsFromArray(String field, PackedInts.Reader values, long minValue, Bits docsWithField) { // TODO: accept null docsWithField? this.field = field; this.values = values; this.minValue = minValue; + this.docsWithField = docsWithField; } @Override @@ -653,7 +652,7 @@ public class FieldCacheImpl implements FieldCache { return values.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJECT_REF + Long.BYTES; } - public NumericDocValues iterator(final Bits docsWithField) { + public NumericDocValues iterator() { return new NumericDocValues() { int docID = -1; @@ -767,10 +766,11 @@ public class FieldCacheImpl implements FieldCache { u.uninvert(reader, key.field); wrapper.setDocsWithField(reader, key.field, u.docsWithField, parser); GrowableWriterAndMinValue values = valuesRef.get(); + Bits docsWithField = u.docsWithField == null ? 
new Bits.MatchNoBits(reader.maxDoc()) : u.docsWithField; if (values == null) { - return new LongsFromArray(key.field, new PackedInts.NullReader(reader.maxDoc()), 0L); + return new LongsFromArray(key.field, new PackedInts.NullReader(reader.maxDoc()), 0L, docsWithField); } - return new LongsFromArray(key.field, values.writer.getMutable(), values.minValue); + return new LongsFromArray(key.field, values.writer.getMutable(), values.minValue, docsWithField); } } @@ -993,16 +993,18 @@ public class FieldCacheImpl implements FieldCache { } } - private static class BinaryDocValuesImpl implements Accountable { + public static class BinaryDocValuesImpl implements Accountable { private final PagedBytes.Reader bytes; private final PackedInts.Reader docToOffset; + private final Bits docsWithField; - public BinaryDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader docToOffset) { + public BinaryDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader docToOffset, Bits docsWithField) { this.bytes = bytes; this.docToOffset = docToOffset; + this.docsWithField = docsWithField; } - public BinaryDocValues iterator(Bits docsWithField) { + public BinaryDocValues iterator() { return new BinaryDocValues() { final BytesRef term = new BytesRef(); @@ -1109,7 +1111,7 @@ public class FieldCacheImpl implements FieldCache { } BinaryDocValuesImpl impl = (BinaryDocValuesImpl) caches.get(BinaryDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio)); - return impl.iterator(getDocsWithField(reader, field, null)); + return impl.iterator(); } static final class BinaryDocValuesCache extends Cache { @@ -1188,19 +1190,21 @@ public class FieldCacheImpl implements FieldCache { } final PackedInts.Reader offsetReader = docToOffset.getMutable(); - wrapper.setDocsWithField(reader, key.field, new Bits() { - @Override - public boolean get(int index) { - return offsetReader.get(index) != 0; - } + Bits docsWithField = new Bits() { + @Override + public boolean get(int index) { + return offsetReader.get(index) != 0; + } - @Override - public int length() { - return maxDoc; - } - }, null); + @Override + public int length() { + return maxDoc; + } + }; + + wrapper.setDocsWithField(reader, key.field, docsWithField, null); // maybe an int-only impl? - return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader); + return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField); } } From c185617582b4bf3ce2899c9ae67e9eeaf2c21741 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Thu, 8 Dec 2016 18:34:51 -0500 Subject: [PATCH 29/53] LUCENE-7583: move this class to the right package --- .../lucene/{util => store}/GrowableByteArrayDataOutput.java | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lucene/core/src/java/org/apache/lucene/{util => store}/GrowableByteArrayDataOutput.java (100%) diff --git a/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/store/GrowableByteArrayDataOutput.java similarity index 100% rename from lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java rename to lucene/core/src/java/org/apache/lucene/store/GrowableByteArrayDataOutput.java From 22d04a7c1149c1af42dc2890a416fc45e4d0aa5e Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Fri, 9 Dec 2016 18:36:37 +0100 Subject: [PATCH 30/53] LUCENE-6989: Fix Exception handling in MMapDirectory's unmap hack support code to work with Java 9's new InaccessibleObjectException that does not extend ReflectiveAccessException in Java 9. 
--- lucene/CHANGES.txt | 5 +++++ .../core/src/java/org/apache/lucene/store/MMapDirectory.java | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 26a9dec0014..b9deb7e1768 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -85,6 +85,11 @@ Bug Fixes the incoming automaton is a special case and throw a clearer exception than NullPointerException (Tom Mortimer via Mike McCandless) +* LUCENE-6989: Fix Exception handling in MMapDirectory's unmap hack + support code to work with Java 9's new InaccessibleObjectException + that does not extend ReflectiveAccessException in Java 9. + (Uwe Schindler) + Improvements * LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, diff --git a/lucene/core/src/java/org/apache/lucene/store/MMapDirectory.java b/lucene/core/src/java/org/apache/lucene/store/MMapDirectory.java index c0e35197f0e..be08a1663a6 100644 --- a/lucene/core/src/java/org/apache/lucene/store/MMapDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/MMapDirectory.java @@ -385,13 +385,13 @@ public class MMapDirectory extends FSDirectory { } } }; - } catch (ReflectiveOperationException e) { - return "Unmapping is not supported on this platform, because internal Java APIs are not compatible to this Lucene version: " + e; } catch (SecurityException e) { return "Unmapping is not supported, because not all required permissions are given to the Lucene JAR file: " + e + " [Please grant at least the following permissions: RuntimePermission(\"accessClassInPackage.sun.misc\"), " + "RuntimePermission(\"accessClassInPackage.jdk.internal.ref\"), and " + "ReflectPermission(\"suppressAccessChecks\")]"; + } catch (ReflectiveOperationException | RuntimeException e) { + return "Unmapping is not supported on this platform, because internal Java APIs are not compatible to this Lucene version: " + e; } } From 4efbde4e76277f364952866c071bb953ca2be070 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 9 Dec 2016 18:05:13 -0500 Subject: [PATCH 31/53] LUCENE-7581: don't allow updating a doc values field if it's used in the index sort --- lucene/CHANGES.txt | 4 +++ .../org/apache/lucene/index/IndexWriter.java | 6 ++++ .../lucene/index/IndexWriterConfig.java | 3 ++ .../lucene/index/LiveIndexWriterConfig.java | 13 ++++++++ .../apache/lucene/index/TestIndexSorting.java | 30 +++++++++++++++++-- 5 files changed, 53 insertions(+), 3 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b9deb7e1768..da6e3d29508 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -90,6 +90,10 @@ Bug Fixes that does not extend ReflectiveAccessException in Java 9. (Uwe Schindler) +* LUCENE-7581: Lucene now prevents updating a doc values field that is used + in the index sort, since this would lead to corruption. 
(Jim + Ferenczi via Mike McCandless) + Improvements * LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 98687855231..3ee87b18304 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -1619,6 +1619,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { if (!globalFieldNumberMap.contains(field, DocValuesType.NUMERIC)) { throw new IllegalArgumentException("can only update existing numeric-docvalues fields!"); } + if (config.getIndexSortFields().contains(field)) { + throw new IllegalArgumentException("cannot update docvalues field involved in the index sort, field=" + field + ", sort=" + config.getIndexSort()); + } try { long seqNo = docWriter.updateDocValues(new NumericDocValuesUpdate(term, field, value)); if (seqNo < 0) { @@ -1713,6 +1716,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { if (!globalFieldNumberMap.contains(f.name(), dvType)) { throw new IllegalArgumentException("can only update existing docvalues fields! field=" + f.name() + ", type=" + dvType); } + if (config.getIndexSortFields().contains(f.name())) { + throw new IllegalArgumentException("cannot update docvalues field involved in the index sort, field=" + f.name() + ", sort=" + config.getIndexSort()); + } switch (dvType) { case NUMERIC: dvUpdates[i] = new NumericDocValuesUpdate(term, f.name(), (Long) f.numericValue()); diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index 4f642eed52a..ce4f0a8e5c3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -18,7 +18,9 @@ package org.apache.lucene.index; import java.io.PrintStream; +import java.util.Arrays; import java.util.EnumSet; +import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -474,6 +476,7 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig { } } this.indexSort = sort; + this.indexSortFields = Arrays.stream(sort.getSort()).map((s) -> s.getField()).collect(Collectors.toSet()); return this; } diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java index cec70c099aa..d9e1bc7bebb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java @@ -17,6 +17,9 @@ package org.apache.lucene.index; +import java.util.Collections; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain; @@ -98,6 +101,9 @@ public class LiveIndexWriterConfig { /** The sort order to use to write merged segments. 
*/ protected Sort indexSort = null; + /** The field names involved in the index sort */ + protected Set indexSortFields = Collections.emptySet(); + // used by IndexWriterConfig LiveIndexWriterConfig(Analyzer analyzer) { this.analyzer = analyzer; @@ -457,6 +463,13 @@ public class LiveIndexWriterConfig { return indexSort; } + /** + * Returns the field names involved in the index sort + */ + public Set getIndexSortFields() { + return indexSortFields; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 5ebf8f481d1..08a85ef3e24 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -1700,6 +1700,29 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + + // docvalues fields involved in the index sort cannot be updated + public void testBadDVUpdate() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new StringField("id", new BytesRef("0"), Store.NO)); + doc.add(new NumericDocValuesField("foo", random().nextInt())); + w.addDocument(doc); + w.commit(); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, + () -> w.updateDocValues(new Term("id", "0"), new NumericDocValuesField("foo", -1))); + assertEquals(exc.getMessage(), "cannot update docvalues field involved in the index sort, field=foo, sort="); + exc = expectThrows(IllegalArgumentException.class, + () -> w.updateNumericDocValue(new Term("id", "0"), "foo", -1)); + assertEquals(exc.getMessage(), "cannot update docvalues field involved in the index sort, field=foo, sort="); + w.close(); + dir.close(); + } + static class DVUpdateRunnable implements Runnable { private final int numDocs; @@ -1727,7 +1750,7 @@ public class TestIndexSorting extends LuceneTestCase { final long value = random.nextInt(20); synchronized (values) { - w.updateDocValues(new Term("id", Integer.toString(id)), new NumericDocValuesField("foo", value)); + w.updateDocValues(new Term("id", Integer.toString(id)), new NumericDocValuesField("bar", value)); values.put(id, value); } @@ -1762,7 +1785,8 @@ public class TestIndexSorting extends LuceneTestCase { for (int i = 0; i < numDocs; ++i) { Document doc = new Document(); doc.add(new StringField("id", Integer.toString(i), Store.NO)); - doc.add(new NumericDocValuesField("foo", -1)); + doc.add(new NumericDocValuesField("foo", random().nextInt())); + doc.add(new NumericDocValuesField("bar", -1)); w.addDocument(doc); values.put(i, -1L); } @@ -1786,7 +1810,7 @@ public class TestIndexSorting extends LuceneTestCase { for (int i = 0; i < numDocs; ++i) { final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1); assertEquals(1, topDocs.totalHits); - NumericDocValues dvs = MultiDocValues.getNumericValues(reader, "foo"); + NumericDocValues dvs = MultiDocValues.getNumericValues(reader, "bar"); int hitDoc = topDocs.scoreDocs[0].doc; assertEquals(hitDoc, dvs.advance(hitDoc)); assertEquals(values.get(i).longValue(), dvs.longValue()); From 7cffae3a16f7d0c94a79a273a702c0013af7f5ac Mon Sep 17 00:00:00 2001 From: Mike 
McCandless Date: Fri, 9 Dec 2016 18:35:13 -0500 Subject: [PATCH 32/53] don't create unnecessary lambda --- .../src/java/org/apache/lucene/index/IndexWriterConfig.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index ce4f0a8e5c3..1e1e795d50b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -476,7 +476,7 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig { } } this.indexSort = sort; - this.indexSortFields = Arrays.stream(sort.getSort()).map((s) -> s.getField()).collect(Collectors.toSet()); + this.indexSortFields = Arrays.stream(sort.getSort()).map(SortField::getField).collect(Collectors.toSet()); return this; } From 2b073a2f296289617bea8256d7efec06049df739 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 9 Dec 2016 18:41:30 -0500 Subject: [PATCH 33/53] LUCENE-7570: don't run merges while holding the commitLock to prevent deadlock when merges are stalled and a tragic merge exception strikes --- lucene/CHANGES.txt | 4 ++ .../org/apache/lucene/index/IndexWriter.java | 28 +++++--- .../index/TestTragicIndexWriterDeadlock.java | 69 ++++++++++++++++++- 3 files changed, 92 insertions(+), 9 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index da6e3d29508..15b89f09f2f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -94,6 +94,10 @@ Bug Fixes in the index sort, since this would lead to corruption. (Jim Ferenczi via Mike McCandless) +* LUCENE-7570: IndexWriter may deadlock if a commit is running while + there are too many merges running and one of the merges hits a + tragic exception (Joey Echeverria via Mike McCandless) + Improvements * LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 3ee87b18304..47895050225 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -2952,11 +2952,16 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { @Override public final long prepareCommit() throws IOException { ensureOpen(); - pendingSeqNo = prepareCommitInternal(config.getMergePolicy()); + boolean[] doMaybeMerge = new boolean[1]; + pendingSeqNo = prepareCommitInternal(doMaybeMerge); + // we must do this outside of the commitLock else we can deadlock: + if (doMaybeMerge[0]) { + maybeMerge(config.getMergePolicy(), MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS); + } return pendingSeqNo; } - private long prepareCommitInternal(MergePolicy mergePolicy) throws IOException { + private long prepareCommitInternal(boolean[] doMaybeMerge) throws IOException { startCommitTime = System.nanoTime(); synchronized(commitLock) { ensureOpen(false); @@ -3063,7 +3068,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { boolean success = false; try { if (anySegmentsFlushed) { - maybeMerge(mergePolicy, MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS); + doMaybeMerge[0] = true; } startCommit(toCommit); success = true; @@ -3184,6 +3189,10 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { infoStream.message("IW", "commit: start"); } + boolean[] doMaybeMerge = new boolean[1]; + + 
long seqNo; + synchronized(commitLock) { ensureOpen(false); @@ -3191,13 +3200,11 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { infoStream.message("IW", "commit: enter lock"); } - long seqNo; - if (pendingCommit == null) { if (infoStream.isEnabled("IW")) { infoStream.message("IW", "commit: now prepare"); } - seqNo = prepareCommitInternal(mergePolicy); + seqNo = prepareCommitInternal(doMaybeMerge); } else { if (infoStream.isEnabled("IW")) { infoStream.message("IW", "commit: already prepared"); @@ -3206,9 +3213,14 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { } finishCommit(); - - return seqNo; } + + // we must do this outside of the commitLock else we can deadlock: + if (doMaybeMerge[0]) { + maybeMerge(mergePolicy, MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS); + } + + return seqNo; } private final void finishCommit() throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTragicIndexWriterDeadlock.java b/lucene/core/src/test/org/apache/lucene/index/TestTragicIndexWriterDeadlock.java index 3cce69831b5..80f9392d0a7 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTragicIndexWriterDeadlock.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTragicIndexWriterDeadlock.java @@ -14,13 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.lucene.index; - +import java.io.IOException; import java.util.concurrent.CountDownLatch; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.lucene.document.Document; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; @@ -92,4 +94,69 @@ public class TestTragicIndexWriterDeadlock extends LuceneTestCase { w.close(); dir.close(); } + + // LUCENE-7570 + public void testDeadlockStalledMerges() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + + // so we merge every 2 segments: + LogMergePolicy mp = new LogDocMergePolicy(); + mp.setMergeFactor(2); + iwc.setMergePolicy(mp); + CountDownLatch done = new CountDownLatch(1); + ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler() { + @Override + protected void doMerge(IndexWriter writer, MergePolicy.OneMerge merge) throws IOException { + // let merge takes forever, until commit thread is stalled + try { + done.await(); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new RuntimeException(ie); + } + super.doMerge(writer, merge); + } + + @Override + protected synchronized void doStall() { + done.countDown(); + super.doStall(); + } + + @Override + protected void handleMergeException(Directory dir, Throwable exc) { + } + }; + + // so we stall once the 2nd merge wants to run: + cms.setMaxMergesAndThreads(1, 1); + iwc.setMergeScheduler(cms); + + // so we write a segment every 2 indexed docs: + iwc.setMaxBufferedDocs(2); + + final IndexWriter w = new IndexWriter(dir, iwc) { + @Override + void mergeSuccess(MergePolicy.OneMerge merge) { + // tragedy strikes! 
+ throw new OutOfMemoryError(); + } + }; + + w.addDocument(new Document()); + w.addDocument(new Document()); + // w writes first segment + w.addDocument(new Document()); + w.addDocument(new Document()); + // w writes second segment, and kicks off merge, that takes forever (done.await) + w.addDocument(new Document()); + w.addDocument(new Document()); + // w writes third segment + w.addDocument(new Document()); + w.commit(); + // w writes fourth segment, and commit flushes and kicks off merge that stalls + w.close(); + dir.close(); + } } From d75abe1a3022b5d596b7fca4c7e8623782010a88 Mon Sep 17 00:00:00 2001 From: Erick Erickson Date: Sat, 10 Dec 2016 14:03:15 -0800 Subject: [PATCH 34/53] SOLR-9843: Fix up DocValuesNotIndexedTest failures. Debugging code --- .../solr/cloud/DocValuesNotIndexedTest.java | 64 +++++++++++-------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/DocValuesNotIndexedTest.java b/solr/core/src/test/org/apache/solr/cloud/DocValuesNotIndexedTest.java index b8507b1a575..f5257f82865 100644 --- a/solr/core/src/test/org/apache/solr/cloud/DocValuesNotIndexedTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/DocValuesNotIndexedTest.java @@ -18,15 +18,15 @@ package org.apache.solr.cloud; import java.io.IOException; -import java.text.SimpleDateFormat; +import java.lang.invoke.MethodHandles; +import java.time.Instant; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; import com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule; import org.apache.solr.client.solrj.SolrQuery; @@ -50,11 +50,16 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.RuleChain; import org.junit.rules.TestRule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import static org.apache.lucene.util.LuceneTestCase.random; import static org.apache.solr.client.solrj.request.schema.SchemaRequest.*; public class DocValuesNotIndexedTest extends SolrCloudTestCase { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + @Rule public TestRule solrTestRules = RuleChain.outerRule(new SystemPropertiesRestoreRule()); @@ -79,7 +84,7 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { .process(cluster.getSolrClient()); fieldsToTestSingle = - Collections.unmodifiableList(Stream.of( + Collections.unmodifiableList(Arrays.asList( new FieldProps("intField", "int", 1), new FieldProps("longField", "long", 1), new FieldProps("doubleField", "double", 1), @@ -87,10 +92,10 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { new FieldProps("dateField", "date", 1), new FieldProps("stringField", "string", 1), new FieldProps("boolField", "boolean", 1) - ).collect(Collectors.toList())); + )); fieldsToTestMulti = - Collections.unmodifiableList(Stream.of( + Collections.unmodifiableList(Arrays.asList( new FieldProps("intFieldMulti", "int", 5), new FieldProps("longFieldMulti", "long", 5), new FieldProps("doubleFieldMulti", "double", 5), @@ -98,11 +103,11 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { new FieldProps("dateFieldMulti", "date", 5), new FieldProps("stringFieldMulti", "string", 5), new FieldProps("boolFieldMulti", "boolean", 2) - ).collect(Collectors.toList())); + )); // Fields to test for grouping and sorting with 
sortMinssingFirst/Last. fieldsToTestGroupSortFirst = - Collections.unmodifiableList(Stream.of( + Collections.unmodifiableList(Arrays.asList( new FieldProps("intGSF", "int"), new FieldProps("longGSF", "long"), new FieldProps("doubleGSF", "double"), @@ -110,10 +115,10 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { new FieldProps("dateGSF", "date"), new FieldProps("stringGSF", "string"), new FieldProps("boolGSF", "boolean") - ).collect(Collectors.toList())); + )); fieldsToTestGroupSortLast = - Collections.unmodifiableList(Stream.of( + Collections.unmodifiableList(Arrays.asList( new FieldProps("intGSL", "int"), new FieldProps("longGSL", "long"), new FieldProps("doubleGSL", "double"), @@ -121,7 +126,7 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { new FieldProps("dateGSL", "date"), new FieldProps("stringGSL", "string"), new FieldProps("boolGSL", "boolean") - ).collect(Collectors.toList())); + )); List updateList = new ArrayList<>(fieldsToTestSingle.size() + fieldsToTestMulti.size() + fieldsToTestGroupSortFirst.size() + fieldsToTestGroupSortLast.size() + @@ -235,7 +240,7 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { new UpdateRequest() .add(docs) .commit(client, COLLECTION); - + checkSortOrder(client, fieldsToTestGroupSortFirst, "asc", new String[]{"4", "2", "1", "3"}, new String[]{"4", "1", "2", "3"}); checkSortOrder(client, fieldsToTestGroupSortFirst, "desc", new String[]{"3", "1", "2", "4"}, new String[]{"2", "3", "1", "4"}); @@ -251,6 +256,10 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { solrQuery.addSort("id", SolrQuery.ORDER.asc); final QueryResponse rsp = client.query(COLLECTION, solrQuery); SolrDocumentList res = rsp.getResults(); + //TODO remove after SOLR-9843 + if (order.length != res.getNumFound()) { + log.error("(3) About to fail, response is: " + rsp.toString()); + } assertEquals("Should have exactly " + order.length + " documents returned", order.length, res.getNumFound()); String expected; for (int idx = 0; idx < res.size(); ++idx) { @@ -264,7 +273,7 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { @Test public void testGroupingDocAbsent() throws IOException, SolrServerException { - List docs = new ArrayList<>(3); + List docs = new ArrayList<>(4); docs.add(makeGSDoc(2, fieldsToTestGroupSortFirst, null)); docs.add(makeGSDoc(1, fieldsToTestGroupSortFirst, null)); docs.add(makeGSDoc(3, fieldsToTestGroupSortFirst, null)); @@ -296,7 +305,11 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { if (prop.getName().startsWith("bool")) expected = 3; //true, false and null List fieldCommandGroups = fieldCommand.getValues(); - assertEquals("Did not find the expected number of groups!", expected, fieldCommandGroups.size()); + //TODO: remove me since this is excessive in the normal case, this is in for SOLR-9843 + if (expected != fieldCommandGroups.size()) { + log.error("(1) About to fail assert, response is: " + rsp.toString()); + } + assertEquals("Did not find the expected number of groups for field " + prop.getName(), expected, fieldCommandGroups.size()); } } @@ -316,7 +329,7 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { docs.add(doc); if ((idx % 5) == 0) { doc = new SolrInputDocument(); - doc.addField("id", idx + 100); + doc.addField("id", idx + 10_000); docs.add(doc); } } @@ -368,6 +381,8 @@ public class DocValuesNotIndexedTest extends SolrCloudTestCase { break; default: + //TODO remove me after SOLR-9843 + log.error("(2) About to fail, response is: 
" + rsp.toString()); fail("Unexpected number of elements in the group for " + prop.getName() + ": " + grp.getResult().size()); } } @@ -450,8 +465,6 @@ class FieldProps { private Object base; private int counter = 0; - static SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", Locale.ROOT); - FieldProps(String name, String type, int expectedCount) { this.name = name; this.type = type; @@ -460,22 +473,23 @@ class FieldProps { } void resetBase() { if (name.startsWith("int")) { - base = Math.abs(DocValuesNotIndexedTest.random().nextInt()); + base = Math.abs(random().nextInt()); } else if (name.startsWith("long")) { - base = Math.abs(DocValuesNotIndexedTest.random().nextLong()); + base = Math.abs(random().nextLong()); } else if (name.startsWith("float")) { - base = Math.abs(DocValuesNotIndexedTest.random().nextFloat()); + base = Math.abs(random().nextFloat()); } else if (name.startsWith("double")) { - base = Math.abs(DocValuesNotIndexedTest.random().nextDouble()); + base = Math.abs(random().nextDouble()); } else if (name.startsWith("date")) { - base = Math.abs(DocValuesNotIndexedTest.random().nextLong()); + base = Math.abs(random().nextLong()); } else if (name.startsWith("bool")) { base = true; // Must start with a known value since bools only have a two values.... } else if (name.startsWith("string")) { - base = "base_string_" + DocValuesNotIndexedTest.random().nextInt(1_000_000) + "_"; + base = "base_string_" + random().nextInt(1_000_000) + "_"; } else { throw new RuntimeException("Should have found a prefix for the field before now!"); } + counter = 0; } FieldProps(String name, String type) { @@ -496,7 +510,7 @@ class FieldProps { public String getValue(boolean incrementCounter) { if (incrementCounter) { - counter += DocValuesNotIndexedTest.random().nextInt(10) + 100; + counter += random().nextInt(10) + 10_000; } if (name.startsWith("int")) { return Integer.toString((int) base + counter); @@ -511,7 +525,7 @@ class FieldProps { return Double.toString((double) base + counter); } if (name.startsWith("date")) { - return format.format(985_847_645 + (long) base + counter); + return Instant.ofEpochMilli(985_847_645 + (long) base + counter).toString(); } if (name.startsWith("bool")) { String ret = Boolean.toString((boolean) base); From 25c7855bbae4eaa8700e72d094442811f0e8e1d9 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Sun, 11 Dec 2016 13:08:33 +0200 Subject: [PATCH 35/53] Add .pydevproject to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8091ecdba71..625cfa97c80 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ parent.iml **/pom.xml /nbproject /nb-build +.pydevproject /solr/package From 87d8b5450a6d75fdd4b724b24a3722054b6d00f8 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Mon, 12 Dec 2016 10:00:21 +0100 Subject: [PATCH 36/53] LUCENE-7591 - approximate to no. 
of terms when DVs are not available --- .../apache/lucene/classification/utils/DatasetSplitter.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java index 8bb0b1dcdc2..7ab674eafdd 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java @@ -94,7 +94,8 @@ public class DatasetSplitter { } } if (classValues == null) { - throw new IllegalStateException("field \"" + classFieldName + "\" must have sorted (set) doc values"); + // approximate with no. of terms + noOfClasses += leave.reader().terms(classFieldName).size(); } noOfClasses += valueCount; } From 39ba13046bc48beaa139923d5f9fbf7d6fc192b2 Mon Sep 17 00:00:00 2001 From: Varun Thacker Date: Mon, 12 Dec 2016 12:38:14 -0800 Subject: [PATCH 37/53] SOLR-9844: Improve FieldCache usage api response formatting and show total size information --- .../apache/lucene/index/SegmentCoreReaders.java | 8 ++++++++ solr/CHANGES.txt | 3 +++ .../apache/solr/search/SolrFieldCacheMBean.java | 2 ++ .../org/apache/solr/uninverting/FieldCache.java | 14 +++++--------- .../solr/uninverting/FieldCacheSanityChecker.java | 3 ++- .../apache/solr/uninverting/UninvertingReader.java | 10 ++++++++++ .../solr/search/TestSolrFieldCacheMBean.java | 1 + 7 files changed, 31 insertions(+), 10 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java index e99c1ada266..21ac4a16e98 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java @@ -56,6 +56,7 @@ final class SegmentCoreReaders { final TermVectorsReader termVectorsReaderOrig; final PointsReader pointsReader; final Directory cfsReader; + final String segment; /** * fieldinfos for this core: means gen=-1. * this is the exact fieldinfos these codec components saw at write. @@ -98,6 +99,8 @@ final class SegmentCoreReaders { cfsDir = dir; } + segment = si.info.name; + coreFieldInfos = codec.fieldInfosFormat().read(cfsDir, si.info, "", context); final SegmentReadState segmentReadState = new SegmentReadState(cfsDir, si.info, coreFieldInfos, context); @@ -192,4 +195,9 @@ final class SegmentCoreReaders { void removeCoreClosedListener(CoreClosedListener listener) { coreClosedListeners.remove(listener); } + + @Override + public String toString() { + return "SegmentCoreReader(" + segment + ")"; + } } diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 78f7f5580f9..37cccaea5db 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -164,6 +164,9 @@ New Features * SOLR-5043: New solr.dns.prevent.reverse.lookup system property that can be used to prevent long core (re)load delays on systems with missconfigured hostname/DNS (hossman) +* SOLR-9844: FieldCache information fetched via the mbeans handler or seen via the UI now displays the total size used. + The individual cache entries in the response are now formatted better as well. 
(Varun Thacker) + Optimizations ---------------------- * SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have diff --git a/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java b/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java index 62bc4fa1ff2..70781e967f5 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java +++ b/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java @@ -64,11 +64,13 @@ public class SolrFieldCacheMBean implements JmxAugmentedSolrInfoMBean { if (listEntries) { String[] entries = UninvertingReader.getUninvertedStats(); stats.add("entries_count", entries.length); + stats.add("total_size", UninvertingReader.getTotalSize()); for (int i = 0; i < entries.length; i++) { stats.add("entry#" + i, entries[i]); } } else { stats.add("entries_count", UninvertingReader.getUninvertedStatsSize()); + stats.add("total_size", UninvertingReader.getTotalSize()); } return stats; } diff --git a/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java b/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java index 32f56152c8c..544800e3d3b 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java +++ b/solr/core/src/java/org/apache/solr/uninverting/FieldCache.java @@ -384,7 +384,7 @@ public interface FieldCache { return custom; } - public Object getValue() { + public Accountable getValue() { return value; } @@ -399,15 +399,11 @@ public interface FieldCache { @Override public String toString() { - StringBuilder b = new StringBuilder(250); - b.append("'").append(getReaderKey()).append("'=>"); - b.append("'").append(getFieldName()).append("',"); - b.append(getCacheType()).append(",").append(getCustom()); - b.append("=>").append(getValue().getClass().getName()).append("#"); - b.append(System.identityHashCode(getValue())); - + StringBuilder b = new StringBuilder(100); + b.append("segment='").append(getReaderKey().toString()).append("', "); + b.append("field='").append(getFieldName()).append("', "); String s = getEstimatedSize(); - b.append(" (size =~ ").append(s).append(')'); + b.append("size =~ ").append(s); return b.toString(); } diff --git a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java index ec398f2174a..3d874ce4bfb 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java +++ b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java @@ -27,6 +27,7 @@ import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.MapOfSets; import org.apache.solr.uninverting.FieldCache.CacheEntry; @@ -103,7 +104,7 @@ final class FieldCacheSanityChecker { // iterate over all the cacheEntries to get the mappings we'll need for (int i = 0; i < cacheEntries.length; i++) { final CacheEntry item = cacheEntries[i]; - final Object val = item.getValue(); + final Accountable val = item.getValue(); // It's OK to have dup entries, where one is eg // float[] and the other is the Bits (from diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java index 78256664dee..87fb7a6bd42 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java +++ 
b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -37,6 +37,7 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.solr.uninverting.FieldCache.CacheEntry; /** @@ -386,4 +387,13 @@ public class UninvertingReader extends FilterLeafReader { public static int getUninvertedStatsSize() { return FieldCache.DEFAULT.getCacheEntries().length; } + + public static String getTotalSize() { + CacheEntry[] entries = FieldCache.DEFAULT.getCacheEntries(); + long totalBytesUsed = 0; + for (int i = 0; i < entries.length; i++) { + totalBytesUsed += entries[i].getValue().ramBytesUsed(); + } + return RamUsageEstimator.humanReadableUnits(totalBytesUsed); + } } diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java b/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java index 5343f7345fb..a705e1ec94a 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java @@ -76,6 +76,7 @@ public class TestSolrFieldCacheMBean extends SolrTestCaseJ4 { SolrFieldCacheMBean mbean = new SolrFieldCacheMBean(); NamedList stats = checkJmx ? mbean.getStatisticsForJmx() : mbean.getStatistics(); assert(new Integer(stats.get("entries_count").toString()) > 0); + assertNotNull(stats.get("total_size")); assertNull(stats.get("entry#0")); } } From fecbbe081fd4a777f01517fdd8631e69797def38 Mon Sep 17 00:00:00 2001 From: Varun Thacker Date: Mon, 12 Dec 2016 15:28:22 -0800 Subject: [PATCH 38/53] SOLR-9707: Don't forward DeleteByQuery requests to down replicas --- solr/CHANGES.txt | 2 ++ .../solr/update/processor/DistributedUpdateProcessor.java | 8 +++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 37cccaea5db..a8a3f971023 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -240,6 +240,8 @@ Bug Fixes * SOLR-9834: A variety of spots in the code can create a collection zk node after the collection has been removed. (Mark Miller) +* SOLR-9707: Don't forward DeleteByQuery requests to down replicas. 
(Jessica Cheng Mallet via Varun Thacker) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java index b8bdd16cfb4..c62a90af260 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java @@ -658,8 +658,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor { String shardId = cloudDesc.getShardId(); try { - Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry( - collection, shardId); + Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId); isLeader = leaderReplica.getName().equals( req.getCore().getCoreDescriptor().getCloudDescriptor() .getCoreNodeName()); @@ -668,7 +667,7 @@ forwardToLeader = false; List<ZkCoreNodeProps> replicaProps = zkController.getZkStateReader() - .getReplicaProps(collection, shardId, leaderReplica.getName()); + .getReplicaProps(collection, shardId, leaderReplica.getName(), null, Replica.State.DOWN); if (replicaProps != null) { nodes = new ArrayList<>(replicaProps.size()); for (ZkCoreNodeProps props : replicaProps) { @@ -677,8 +676,7 @@ } } catch (InterruptedException e) { Thread.currentThread().interrupt(); - throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", - e); + throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e); } return nodes; From 8c79ab2649437c8c7ca275f6481c058c67626660 Mon Sep 17 00:00:00 2001 From: Erick Date: Mon, 12 Dec 2016 18:43:30 -0800 Subject: [PATCH 39/53] SOLR-9823: CoreContainer incorrectly setting MDCLoggingContext for core --- solr/CHANGES.txt | 2 ++ solr/core/src/java/org/apache/solr/core/CoreContainer.java | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index a8a3f971023..41af0ff23d5 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -242,6 +242,8 @@ Bug Fixes * SOLR-9707: Don't forward DeleteByQuery requests to down replicas.
(Jessica Cheng Mallet via Varun Thacker) +* SOLR-9823: CoreContainer incorrectly setting MDCLoggingContext for core (Jessica Cheng Mallet via Erick Erickson) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index ad4560e5bbb..7c38b81f40d 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -857,7 +857,7 @@ public class CoreContainer { SolrCore core = null; try { - MDCLoggingContext.setCore(core); + MDCLoggingContext.setCoreDescriptor(dcore); SolrIdentifierValidator.validateCoreName(dcore.getName()); if (zkSys.getZkController() != null) { zkSys.getZkController().preRegister(dcore); From 9aa5b734c38ed0b9327577bd2b1413d448230eab Mon Sep 17 00:00:00 2001 From: Nicholas Knize Date: Tue, 13 Dec 2016 15:07:06 -0600 Subject: [PATCH 40/53] fix RangeFieldQuery.scorer to return null if no docs in a segment indexed the field --- .../src/java/org/apache/lucene/document/RangeFieldQuery.java | 1 + .../org/apache/lucene/search/BaseRangeFieldQueryTestCase.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/RangeFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/document/RangeFieldQuery.java index 7ebdec491ee..52491912cea 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/document/RangeFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/document/RangeFieldQuery.java @@ -165,6 +165,7 @@ abstract class RangeFieldQuery extends Query { FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field); if (fieldInfo == null) { // no docs in this segment indexed this field + return null; } checkFieldInfo(fieldInfo); boolean allDocsMatch = true; diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/BaseRangeFieldQueryTestCase.java b/lucene/sandbox/src/test/org/apache/lucene/search/BaseRangeFieldQueryTestCase.java index ff61ff65809..ceafd5360da 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/BaseRangeFieldQueryTestCase.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/BaseRangeFieldQueryTestCase.java @@ -93,7 +93,7 @@ public abstract class BaseRangeFieldQueryTestCase extends LuceneTestCase { ranges[id] = new Range[] {nextRange(dimensions)}; } if (x == 17) { - // dome docs don't have a box: + // some docs don't have a box: ranges[id][0].isMissing = true; if (VERBOSE) { System.out.println(" id=" + id + " is missing"); From ad7152ad4739a47aa2b45405ba1682b3dda18923 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Sun, 11 Dec 2016 12:49:50 +0200 Subject: [PATCH 41/53] LUCENE-7590: add DocValuesStatsCollector --- .../apache/lucene/search/DocValuesStats.java | 165 +++++++++++++++++ .../search/DocValuesStatsCollector.java | 64 +++++++ .../search/TestDocValuesStatsCollector.java | 166 ++++++++++++++++++ 3 files changed, 395 insertions(+) create mode 100644 lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java create mode 100644 lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java create mode 100644 lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java new file mode 100644 index 00000000000..fad9f97f0e2 --- /dev/null +++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java @@ -0,0 
+1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; + +/** Holds statistics for a DocValues field. */ +public abstract class DocValuesStats { + + private int missing = 0; + private int count = 0; + + protected final String field; + + protected T min; + protected T max; + + protected DocValuesStats(String field, T initialMin, T initialMax) { + this.field = field; + this.min = initialMin; + this.max = initialMax; + } + + /** + * Called after #{@link DocValuesStats#accumulate(int)} was processed and verified that the document has a value for + * the field. Implementations should update the statistics based on the value of the current document. + * + * @param count + * the updated number of documents with value for this field. + */ + protected abstract void doAccumulate(int count) throws IOException; + + /** + * Initializes this object with the given reader context. Returns whether stats can be computed for this segment (i.e. + * it does have the requested DocValues field). + */ + protected abstract boolean init(LeafReaderContext contxt) throws IOException; + + /** Returns whether the given document has a value for the requested DocValues field. */ + protected abstract boolean hasValue(int doc) throws IOException; + + final void accumulate(int doc) throws IOException { + if (hasValue(doc)) { + ++count; + doAccumulate(count); + } else { + ++missing; + } + } + + final void addMissing() { + ++missing; + } + + /** The field for which these stats were computed. */ + public final String field() { + return field; + } + + /** The number of documents which have a value of the field. */ + public final int count() { + return count; + } + + /** The number of documents which do not have a value of the field. */ + public final int missing() { + return missing; + } + + /** The minimum value of the field. Undefined when {@link #count} is zero. */ + public final T min() { + return min; + } + + /** The maximum value of the field. Undefined when {@link #count} is zero. */ + public final T max() { + return max; + } + + /** Holds statistics for a numeric DocValues field. 
*/ + public static abstract class NumericDocValuesStats extends DocValuesStats { + + protected double mean = 0.0; + + protected NumericDocValues ndv; + + protected NumericDocValuesStats(String field, T initialMin, T initialMax) { + super(field, initialMin, initialMax); + } + + @Override + protected final boolean init(LeafReaderContext contxt) throws IOException { + ndv = contxt.reader().getNumericDocValues(field); + return ndv != null; + } + + @Override + protected boolean hasValue(int doc) throws IOException { + return ndv.advanceExact(doc); + } + + /** The mean of all values of the field. Undefined when {@link #count} is zero. */ + public final double mean() { + return mean; + } + } + + /** Holds DocValues statistics for a numeric field storing {@code long} values. */ + public static final class LongDocValuesStats extends NumericDocValuesStats { + + public LongDocValuesStats(String description) { + super(description, Long.MAX_VALUE, Long.MIN_VALUE); + } + + @Override + protected void doAccumulate(int count) throws IOException { + long val = ndv.longValue(); + if (val > max) { + max = val; + } + if (val < min) { + min = val; + } + mean += (val - mean) / count; + } + } + + /** Holds DocValues statistics for a numeric field storing {@code double} values. */ + public static final class DoubleDocValuesStats extends NumericDocValuesStats { + + public DoubleDocValuesStats(String description) { + super(description, Double.MAX_VALUE, Double.MIN_VALUE); + } + + @Override + protected void doAccumulate(int count) throws IOException { + double val = Double.longBitsToDouble(ndv.longValue()); + if (Double.compare(val, max) > 0) { + max = val; + } + if (Double.compare(val, min) < 0) { + min = val; + } + mean += (val - mean) / count; + } + } + +} \ No newline at end of file diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java new file mode 100644 index 00000000000..2b1fa4fb852 --- /dev/null +++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReaderContext; + +/** A {@link Collector} which computes statistics for a DocValues field. */ +public class DocValuesStatsCollector implements Collector { + + private final DocValuesStats stats; + + /** Creates a collector to compute statistics for a DocValues field using the given {@code stats}. 
*/ + public DocValuesStatsCollector(DocValuesStats stats) { + this.stats = stats; + } + + @Override + public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { + boolean shouldProcess = stats.init(context); + if (!shouldProcess) { + // Stats cannot be computed for this segment, therefore consider all matching documents as a 'miss'. + return new LeafCollector() { + @Override public void setScorer(Scorer scorer) throws IOException {} + + @Override + public void collect(int doc) throws IOException { + // All matching documents in this reader are missing a value + stats.addMissing(); + } + }; + } + + return new LeafCollector() { + @Override public void setScorer(Scorer scorer) throws IOException {} + + @Override + public void collect(int doc) throws IOException { + stats.accumulate(doc); + } + }; + } + + @Override + public boolean needsScores() { + return false; + } + +} diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java new file mode 100644 index 00000000000..65f82e62d42 --- /dev/null +++ b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Arrays; +import java.util.stream.DoubleStream; +import java.util.stream.LongStream; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoubleDocValuesField; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats; +import org.apache.lucene.search.DocValuesStats.LongDocValuesStats; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +/** Unit tests for {@link DocValuesStatsCollector}. 
*/ +public class TestDocValuesStatsCollector extends LuceneTestCase { + + public void testNoDocsWithField() throws IOException { + try (Directory dir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + int numDocs = TestUtil.nextInt(random(), 1, 100); + for (int i = 0; i < numDocs; i++) { + indexWriter.addDocument(new Document()); + } + + try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { + IndexSearcher searcher = new IndexSearcher(reader); + LongDocValuesStats stats = new LongDocValuesStats("foo"); + searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); + + assertEquals(0, stats.count()); + assertEquals(numDocs, stats.missing()); + } + } + } + + public void testRandomDocsWithLongValues() throws IOException { + try (Directory dir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + String field = "numeric"; + int numDocs = TestUtil.nextInt(random(), 1, 100); + long[] docValues = new long[numDocs]; + int nextVal = 1; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { // not all documents have a value + doc.add(new NumericDocValuesField(field, nextVal)); + doc.add(new StringField("id", "doc" + i, Store.NO)); + docValues[i] = nextVal; + ++nextVal; + } + indexWriter.addDocument(doc); + } + + // 20% of cases delete some docs + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = 0; + } + } + } + + try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { + IndexSearcher searcher = new IndexSearcher(reader); + LongDocValuesStats stats = new LongDocValuesStats(field); + searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); + + int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); + assertEquals(expCount, stats.count()); + assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing()); + if (stats.count() > 0) { + assertEquals(getPositiveValues(docValues).max().getAsLong(), stats.max().longValue()); + assertEquals(getPositiveValues(docValues).min().getAsLong(), stats.min().longValue()); + assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001); + } + } + } + } + + public void testRandomDocsWithDoubleValues() throws IOException { + try (Directory dir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + String field = "numeric"; + int numDocs = TestUtil.nextInt(random(), 1, 100); + double[] docValues = new double[numDocs]; + double nextVal = 1.0; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { // not all documents have a value + doc.add(new DoubleDocValuesField(field, nextVal)); + doc.add(new StringField("id", "doc" + i, Store.NO)); + docValues[i] = nextVal; + ++nextVal; + } + indexWriter.addDocument(doc); + } + + // 20% of cases delete some docs + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = 0; + } + } + } + + try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { + IndexSearcher searcher = new IndexSearcher(reader); + DoubleDocValuesStats stats = new DoubleDocValuesStats(field); + searcher.search(new MatchAllDocsQuery(), new 
DocValuesStatsCollector(stats)); + + int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); + assertEquals(expCount, stats.count()); + assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing()); + if (stats.count() > 0) { + assertEquals(getPositiveValues(docValues).max().getAsDouble(), stats.max().doubleValue(), 0.00001); + assertEquals(getPositiveValues(docValues).min().getAsDouble(), stats.min().doubleValue(), 0.00001); + assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001); + } + } + } + } + + private static LongStream getPositiveValues(long[] docValues) { + return Arrays.stream(docValues).filter(v -> v > 0); + } + + private static DoubleStream getPositiveValues(double[] docValues) { + return Arrays.stream(docValues).filter(v -> v > 0); + } + + private static LongStream getZeroValues(long[] docValues) { + return Arrays.stream(docValues).filter(v -> v == 0); + } + + private static DoubleStream getZeroValues(double[] docValues) { + return Arrays.stream(docValues).filter(v -> v == 0); + } + +} From 770f1eb8ad6af5cce55d1bdf52f1288216c9691f Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Wed, 14 Dec 2016 13:07:19 +0200 Subject: [PATCH 42/53] Fix LeafReader.getNumericDocValues javadoc --- lucene/core/src/java/org/apache/lucene/index/LeafReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java index acdd0d87bd7..73394f23670 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java @@ -242,7 +242,7 @@ public abstract class LeafReader extends IndexReader { /** Returns {@link NumericDocValues} for this field, or * null if no numeric doc values were indexed for * this field. The returned instance should only be - * used by a single thread. This will never return null. */ + * used by a single thread. */ public abstract NumericDocValues getNumericDocValues(String field) throws IOException; /** Returns {@link BinaryDocValues} for this field, or From 85582dabe4372085e1af5d01ebbfcfd0303b9f12 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Wed, 14 Dec 2016 13:28:02 +0200 Subject: [PATCH 43/53] LUCENE-7590: fix typo in method parameter --- .../src/java/org/apache/lucene/search/DocValuesStats.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java index fad9f97f0e2..38158cf47d9 100644 --- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java +++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java @@ -51,7 +51,7 @@ public abstract class DocValuesStats { * Initializes this object with the given reader context. Returns whether stats can be computed for this segment (i.e. * it does have the requested DocValues field). */ - protected abstract boolean init(LeafReaderContext contxt) throws IOException; + protected abstract boolean init(LeafReaderContext context) throws IOException; /** Returns whether the given document has a value for the requested DocValues field. 
*/ protected abstract boolean hasValue(int doc) throws IOException; @@ -106,8 +106,8 @@ public abstract class DocValuesStats { } @Override - protected final boolean init(LeafReaderContext contxt) throws IOException { - ndv = contxt.reader().getNumericDocValues(field); + protected final boolean init(LeafReaderContext context) throws IOException { + ndv = context.reader().getNumericDocValues(field); return ndv != null; } From 22d9af41a435feaa3307880b7c7ed4f5860faa21 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Wed, 14 Dec 2016 13:49:42 +0200 Subject: [PATCH 44/53] Rename constructor parameter name --- .../src/java/org/apache/lucene/search/DocValuesStats.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java index 38158cf47d9..998bef4fe21 100644 --- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java +++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java @@ -125,8 +125,8 @@ public abstract class DocValuesStats { /** Holds DocValues statistics for a numeric field storing {@code long} values. */ public static final class LongDocValuesStats extends NumericDocValuesStats { - public LongDocValuesStats(String description) { - super(description, Long.MAX_VALUE, Long.MIN_VALUE); + public LongDocValuesStats(String field) { + super(field, Long.MAX_VALUE, Long.MIN_VALUE); } @Override @@ -145,8 +145,8 @@ public abstract class DocValuesStats { /** Holds DocValues statistics for a numeric field storing {@code double} values. */ public static final class DoubleDocValuesStats extends NumericDocValuesStats { - public DoubleDocValuesStats(String description) { - super(description, Double.MAX_VALUE, Double.MIN_VALUE); + public DoubleDocValuesStats(String field) { + super(field, Double.MAX_VALUE, Double.MIN_VALUE); } @Override From e82399d0677651ad4be1d8d2bdc4777b5d90b0fa Mon Sep 17 00:00:00 2001 From: markrmiller Date: Mon, 12 Dec 2016 11:10:58 -0500 Subject: [PATCH 45/53] SOLR-1953: It may be possible for temporary files to accumulate until the Solr process is shut down. --- solr/CHANGES.txt | 3 + .../solr/servlet/SolrDispatchFilter.java | 15 ++ .../solr/servlet/SolrRequestParsers.java | 31 ++-- .../solr/util/SolrFileCleaningTracker.java | 147 ++++++++++++++++++ 4 files changed, 182 insertions(+), 14 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/util/SolrFileCleaningTracker.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 41af0ff23d5..946a04e6936 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -244,6 +244,9 @@ Bug Fixes * SOLR-9823: CoreContainer incorrectly setting MDCLoggingContext for core (Jessica Cheng Mallet via Erick Erickson) +* SOLR-1953: It may be possible for temporary files to accumulate until the Solr process is shut down. 
+ (Karl Wright, Mark Miller) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java index 5a4cfb627fe..e8c4657f378 100644 --- a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java +++ b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java @@ -45,6 +45,7 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.io.FileCleaningTracker; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.commons.io.output.CloseShieldOutputStream; import org.apache.commons.lang.StringUtils; @@ -62,6 +63,7 @@ import org.apache.solr.core.SolrXmlConfig; import org.apache.solr.request.SolrRequestInfo; import org.apache.solr.security.AuthenticationPlugin; import org.apache.solr.security.PKIAuthenticationPlugin; +import org.apache.solr.util.SolrFileCleaningTracker; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -123,6 +125,8 @@ public class SolrDispatchFilter extends BaseSolrFilter { { log.trace("SolrDispatchFilter.init(): {}", this.getClass().getClassLoader()); + SolrRequestParsers.fileCleaningTracker = new SolrFileCleaningTracker(); + StartupLoggingUtils.checkLogDir(); logWelcomeBanner(); String muteConsole = System.getProperty(SOLR_LOG_MUTECONSOLE); @@ -240,6 +244,17 @@ public class SolrDispatchFilter extends BaseSolrFilter { @Override public void destroy() { + try { + FileCleaningTracker fileCleaningTracker = SolrRequestParsers.fileCleaningTracker; + if (fileCleaningTracker != null) { + fileCleaningTracker.exitWhenFinished(); + } + } catch (Exception e) { + log.warn("Exception closing FileCleaningTracker", e); + } finally { + SolrRequestParsers.fileCleaningTracker = null; + } + if (cores != null) { try { cores.shutdown(); diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java b/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java index 9d7e7d9aced..968320e28ee 100644 --- a/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java +++ b/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java @@ -42,6 +42,7 @@ import java.util.Map; import org.apache.commons.fileupload.FileItem; import org.apache.commons.fileupload.disk.DiskFileItemFactory; import org.apache.commons.fileupload.servlet.ServletFileUpload; +import org.apache.commons.io.FileCleaningTracker; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.lucene.util.IOUtils; import org.apache.solr.common.SolrException; @@ -58,6 +59,7 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrQueryRequestBase; import org.apache.solr.util.RTimerTree; +import org.apache.solr.util.SolrFileCleaningTracker; import static org.apache.solr.common.params.CommonParams.PATH; @@ -88,6 +90,8 @@ public class SolrRequestParsers /** Default instance for e.g. admin requests. Limits to 2 MB uploads and does not allow remote streams. */ public static final SolrRequestParsers DEFAULT = new SolrRequestParsers(); + public static volatile SolrFileCleaningTracker fileCleaningTracker; + /** * Pass in an xml configuration. A null configuration will enable * everything with maximum values. 
@@ -532,31 +536,30 @@ public class SolrRequestParsers /** * Extract Multipart streams */ - static class MultipartRequestParser implements SolrRequestParser - { + static class MultipartRequestParser implements SolrRequestParser { private final int uploadLimitKB; + private DiskFileItemFactory factory = new DiskFileItemFactory(); - public MultipartRequestParser( int limit ) - { + public MultipartRequestParser(int limit) { uploadLimitKB = limit; + + // Set factory constraints + FileCleaningTracker fct = fileCleaningTracker; + if (fct != null) { + factory.setFileCleaningTracker(fileCleaningTracker); + } + // TODO - configure factory.setSizeThreshold(yourMaxMemorySize); + // TODO - configure factory.setRepository(yourTempDirectory); } @Override - public SolrParams parseParamsAndFillStreams( - final HttpServletRequest req, ArrayList streams ) throws Exception - { + public SolrParams parseParamsAndFillStreams( + final HttpServletRequest req, ArrayList streams) throws Exception { if( !ServletFileUpload.isMultipartContent(req) ) { throw new SolrException( ErrorCode.BAD_REQUEST, "Not multipart content! "+req.getContentType() ); } MultiMapSolrParams params = parseQueryString( req.getQueryString() ); - - // Create a factory for disk-based file items - DiskFileItemFactory factory = new DiskFileItemFactory(); - - // Set factory constraints - // TODO - configure factory.setSizeThreshold(yourMaxMemorySize); - // TODO - configure factory.setRepository(yourTempDirectory); // Create a new file upload handler ServletFileUpload upload = new ServletFileUpload(factory); diff --git a/solr/core/src/java/org/apache/solr/util/SolrFileCleaningTracker.java b/solr/core/src/java/org/apache/solr/util/SolrFileCleaningTracker.java new file mode 100644 index 00000000000..9c66f0feadb --- /dev/null +++ b/solr/core/src/java/org/apache/solr/util/SolrFileCleaningTracker.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.util; + +import java.io.File; +import java.lang.ref.PhantomReference; +import java.lang.ref.ReferenceQueue; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; + +import org.apache.commons.io.FileCleaningTracker; +import org.apache.commons.io.FileDeleteStrategy; + +public class SolrFileCleaningTracker extends FileCleaningTracker { + + ReferenceQueue q = new ReferenceQueue<>(); + + final Collection trackers = Collections.synchronizedSet(new HashSet()); + + final List deleteFailures = Collections.synchronizedList(new ArrayList()); + + volatile boolean exitWhenFinished = false; + + Thread reaper; + + public void track(final File file, final Object marker) { + track(file, marker, null); + } + + public void track(final File file, final Object marker, final FileDeleteStrategy deleteStrategy) { + if (file == null) { + throw new NullPointerException("The file must not be null"); + } + addTracker(file.getPath(), marker, deleteStrategy); + } + + public void track(final String path, final Object marker) { + track(path, marker, null); + } + + public void track(final String path, final Object marker, final FileDeleteStrategy deleteStrategy) { + if (path == null) { + throw new NullPointerException("The path must not be null"); + } + addTracker(path, marker, deleteStrategy); + } + + private synchronized void addTracker(final String path, final Object marker, + final FileDeleteStrategy deleteStrategy) { + if (exitWhenFinished) { + throw new IllegalStateException("No new trackers can be added once exitWhenFinished() is called"); + } + if (reaper == null) { + reaper = new Reaper(); + reaper.start(); + } + trackers.add(new Tracker(path, deleteStrategy, marker, q)); + } + + public int getTrackCount() { + return trackers.size(); + } + + public List getDeleteFailures() { + return deleteFailures; + } + + public synchronized void exitWhenFinished() { + // synchronized block protects reaper + exitWhenFinished = true; + if (reaper != null) { + synchronized (reaper) { + reaper.interrupt(); + try { + reaper.join(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + } + } + + private final class Reaper extends Thread { + Reaper() { + super("MultiPart Upload Tmp File Reaper"); + setDaemon(true); + } + + @Override + public void run() { + while (exitWhenFinished == false || trackers.size() > 0) { + try { + // Wait for a tracker to remove. + final Tracker tracker = (Tracker) q.remove(); // cannot return null + trackers.remove(tracker); + if (!tracker.delete()) { + deleteFailures.add(tracker.getPath()); + } + tracker.clear(); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + } + } + + private static final class Tracker extends PhantomReference { + + private final String path; + + private final FileDeleteStrategy deleteStrategy; + + Tracker(final String path, final FileDeleteStrategy deleteStrategy, final Object marker, + final ReferenceQueue queue) { + super(marker, queue); + this.path = path; + this.deleteStrategy = deleteStrategy == null ? 
FileDeleteStrategy.NORMAL : deleteStrategy; + } + + public String getPath() { + return path; + } + + public boolean delete() { + return deleteStrategy.deleteQuietly(new File(path)); + } + } + +} \ No newline at end of file From 7dec783b287ab554cc781622b4d6127e553fd2ae Mon Sep 17 00:00:00 2001 From: markrmiller Date: Sun, 11 Dec 2016 22:02:48 -0500 Subject: [PATCH 46/53] SOLR-9846: OverseerAutoReplicaFailoverThread can take too long to stop and leak out of unit tests. --- solr/CHANGES.txt | 2 ++ .../solr/cloud/OverseerAutoReplicaFailoverThread.java | 9 +++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 946a04e6936..5f0357b3608 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -288,6 +288,8 @@ Other Changes response (instead of a SolrException) and includes the remote error message as part of the exception message (Tomás Fernández Löbbe) +* SOLR-9846: OverseerAutoReplicaFailoverThread can take too long to stop and leak out of unit tests. (Mark Miller) + ================== 6.3.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerAutoReplicaFailoverThread.java b/solr/core/src/java/org/apache/solr/cloud/OverseerAutoReplicaFailoverThread.java index 83679a549c7..10b4bf3fb0e 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerAutoReplicaFailoverThread.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerAutoReplicaFailoverThread.java @@ -89,6 +89,8 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable { private final int workLoopDelay; private final int waitAfterExpiration; + + private volatile Thread thread; public OverseerAutoReplicaFailoverThread(CloudConfig config, ZkStateReader zkStateReader, UpdateShardHandler updateShardHandler) { @@ -118,7 +120,7 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable { @Override public void run() { - + this.thread = Thread.currentThread(); while (!this.isClosed) { // work loop log.debug("do " + this.getClass().getSimpleName() + " work loop"); @@ -136,7 +138,6 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable { try { Thread.sleep(workLoopDelay); } catch (InterruptedException e) { - Thread.currentThread().interrupt(); return; } } @@ -480,6 +481,10 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable { @Override public void close() { isClosed = true; + Thread lThread = thread; + if (lThread != null) { + lThread.interrupt(); + } } public boolean isClosed() { From 512374384a8984c56c91f47dcac4aaf0490eda54 Mon Sep 17 00:00:00 2001 From: Varun Thacker Date: Tue, 13 Dec 2016 15:52:17 -0800 Subject: [PATCH 47/53] SOLR-9844: Display fc total size only when field entries asked for --- .../solr/search/SolrFieldCacheMBean.java | 6 ++--- .../solr/uninverting/UninvertingReader.java | 25 +++++++++++++------ .../solr/search/TestSolrFieldCacheMBean.java | 3 ++- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java b/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java index 70781e967f5..642b7087846 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java +++ b/solr/core/src/java/org/apache/solr/search/SolrFieldCacheMBean.java @@ -62,15 +62,15 @@ public class SolrFieldCacheMBean implements JmxAugmentedSolrInfoMBean { private NamedList getStats(boolean 
listEntries) { NamedList stats = new SimpleOrderedMap(); if (listEntries) { - String[] entries = UninvertingReader.getUninvertedStats(); + UninvertingReader.FieldCacheStats fieldCacheStats = UninvertingReader.getUninvertedStats(); + String[] entries = fieldCacheStats.info; stats.add("entries_count", entries.length); - stats.add("total_size", UninvertingReader.getTotalSize()); + stats.add("total_size", fieldCacheStats.totalSize); for (int i = 0; i < entries.length; i++) { stats.add("entry#" + i, entries[i]); } } else { stats.add("entries_count", UninvertingReader.getUninvertedStatsSize()); - stats.add("total_size", UninvertingReader.getTotalSize()); } return stats; } diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java index 87fb7a6bd42..5276ca9da30 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java +++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -375,25 +375,34 @@ public class UninvertingReader extends FilterLeafReader { * Return information about the backing cache * @lucene.internal */ - public static String[] getUninvertedStats() { + public static FieldCacheStats getUninvertedStats() { CacheEntry[] entries = FieldCache.DEFAULT.getCacheEntries(); + long totalBytesUsed = 0; String[] info = new String[entries.length]; for (int i = 0; i < entries.length; i++) { info[i] = entries[i].toString(); + totalBytesUsed += entries[i].getValue().ramBytesUsed(); } - return info; + String totalSize = RamUsageEstimator.humanReadableUnits(totalBytesUsed); + return new FieldCacheStats(totalSize, info); } public static int getUninvertedStatsSize() { return FieldCache.DEFAULT.getCacheEntries().length; } - public static String getTotalSize() { - CacheEntry[] entries = FieldCache.DEFAULT.getCacheEntries(); - long totalBytesUsed = 0; - for (int i = 0; i < entries.length; i++) { - totalBytesUsed += entries[i].getValue().ramBytesUsed(); + /** + * Return information about the backing cache + * @lucene.internal + */ + public static class FieldCacheStats { + public String totalSize; + public String[] info; + + public FieldCacheStats(String totalSize, String[] info) { + this.totalSize = totalSize; + this.info = info; } - return RamUsageEstimator.humanReadableUnits(totalBytesUsed); + } } diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java b/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java index a705e1ec94a..35bdec643bc 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrFieldCacheMBean.java @@ -69,6 +69,7 @@ public class TestSolrFieldCacheMBean extends SolrTestCaseJ4 { SolrFieldCacheMBean mbean = new SolrFieldCacheMBean(); NamedList stats = checkJmx ? mbean.getStatisticsForJmx() : mbean.getStatistics(); assert(new Integer(stats.get("entries_count").toString()) > 0); + assertNotNull(stats.get("total_size")); assertNotNull(stats.get("entry#0")); } @@ -76,7 +77,7 @@ public class TestSolrFieldCacheMBean extends SolrTestCaseJ4 { SolrFieldCacheMBean mbean = new SolrFieldCacheMBean(); NamedList stats = checkJmx ? 
mbean.getStatisticsForJmx() : mbean.getStatistics(); assert(new Integer(stats.get("entries_count").toString()) > 0); - assertNotNull(stats.get("total_size")); + assertNull(stats.get("total_size")); assertNull(stats.get("entry#0")); } } From 6525bb56f027655e5a01f028fa373305c0d01caa Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Wed, 14 Dec 2016 13:18:56 -0700 Subject: [PATCH 48/53] SOLR-8959: Refactored TestSegmentSorting out of TestMiniSolrCloudCluster --- solr/CHANGES.txt | 3 + .../cloud/SegmentTerminateEarlyTestState.java | 12 +- .../solr/cloud/TestMiniSolrCloudCluster.java | 50 ------- .../apache/solr/cloud/TestSegmentSorting.java | 133 ++++++++++++++++++ 4 files changed, 145 insertions(+), 53 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/cloud/TestSegmentSorting.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 5f0357b3608..73b0e9b8539 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -290,6 +290,9 @@ Other Changes * SOLR-9846: OverseerAutoReplicaFailoverThread can take too long to stop and leak out of unit tests. (Mark Miller) +* SOLR-8959: Refactored TestSegmentSorting out of TestMiniSolrCloudCluster (hossman) + + ================== 6.3.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/core/src/test/org/apache/solr/cloud/SegmentTerminateEarlyTestState.java b/solr/core/src/test/org/apache/solr/cloud/SegmentTerminateEarlyTestState.java index 199423b5b53..b3df9e78c69 100644 --- a/solr/core/src/test/org/apache/solr/cloud/SegmentTerminateEarlyTestState.java +++ b/solr/core/src/test/org/apache/solr/cloud/SegmentTerminateEarlyTestState.java @@ -22,6 +22,7 @@ import java.time.ZonedDateTime; import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.Random; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.impl.CloudSolrClient; @@ -47,7 +48,12 @@ class SegmentTerminateEarlyTestState { Integer maxTimestampMM = null; int numDocs = 0; + final Random rand; + public SegmentTerminateEarlyTestState(Random rand) { + this.rand = rand; + } + void addDocuments(CloudSolrClient cloudSolrClient, int numCommits, int numDocsPerCommit, boolean optimize) throws Exception { for (int cc = 1; cc <= numCommits; ++cc) { @@ -56,7 +62,7 @@ class SegmentTerminateEarlyTestState { final Integer docKey = new Integer(numDocs); SolrInputDocument doc = new SolrInputDocument(); doc.setField(keyField, ""+docKey); - final int MM = TestMiniSolrCloudCluster.random().nextInt(60); // minutes + final int MM = rand.nextInt(60); // minutes if (minTimestampMM == null || MM <= minTimestampMM.intValue()) { if (minTimestampMM != null && MM < minTimestampMM.intValue()) { minTimestampDocKeys.clear(); @@ -116,7 +122,7 @@ class SegmentTerminateEarlyTestState { query.setFields(keyField, oddField, timestampField); final int rowsWanted = 1; query.setRows(rowsWanted); - final Boolean shardsInfoWanted = (TestMiniSolrCloudCluster.random().nextBoolean() ? null : new Boolean(TestMiniSolrCloudCluster.random().nextBoolean())); + final Boolean shardsInfoWanted = (rand.nextBoolean() ? 
null : new Boolean(rand.nextBoolean())); if (shardsInfoWanted != null) { query.set(ShardParams.SHARDS_INFO, shardsInfoWanted.booleanValue()); } @@ -163,7 +169,7 @@ class SegmentTerminateEarlyTestState { query.setSort(timestampField, SolrQuery.ORDER.desc); query.setFields(keyField, oddField, timestampField); query.setRows(1); - final Boolean shardsInfoWanted = (TestMiniSolrCloudCluster.random().nextBoolean() ? null : new Boolean(TestMiniSolrCloudCluster.random().nextBoolean())); + final Boolean shardsInfoWanted = (rand.nextBoolean() ? null : new Boolean(rand.nextBoolean())); if (shardsInfoWanted != null) { query.set(ShardParams.SHARDS_INFO, shardsInfoWanted.booleanValue()); } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestMiniSolrCloudCluster.java b/solr/core/src/test/org/apache/solr/cloud/TestMiniSolrCloudCluster.java index 97ecb67b0ab..de18875d69a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestMiniSolrCloudCluster.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestMiniSolrCloudCluster.java @@ -16,7 +16,6 @@ */ package org.apache.solr.cloud; -import java.io.File; import java.lang.invoke.MethodHandles; import java.net.URL; import java.util.ArrayList; @@ -384,53 +383,4 @@ public class TestMiniSolrCloudCluster extends LuceneTestCase { } } - @Test - public void testSegmentTerminateEarly() throws Exception { - - final String collectionName = "testSegmentTerminateEarlyCollection"; - - final SegmentTerminateEarlyTestState tstes = new SegmentTerminateEarlyTestState(); - - File solrXml = new File(SolrTestCaseJ4.TEST_HOME(), "solr.xml"); - Builder jettyConfig = JettyConfig.builder(); - jettyConfig.waitForLoadingCoresToFinish(null); - final MiniSolrCloudCluster miniCluster = createMiniSolrCloudCluster(); - final CloudSolrClient cloudSolrClient = miniCluster.getSolrClient(); - cloudSolrClient.setDefaultCollection(collectionName); - - try { - // create collection - { - final String asyncId = (random().nextBoolean() ? 
null : "asyncId("+collectionName+".create)="+random().nextInt()); - final Map collectionProperties = new HashMap<>(); - collectionProperties.put(CoreDescriptor.CORE_CONFIG, "solrconfig-sortingmergepolicyfactory.xml"); - createCollection(miniCluster, collectionName, null, asyncId, Boolean.TRUE, collectionProperties); - } - - ZkStateReader zkStateReader = cloudSolrClient.getZkStateReader(); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(collectionName, zkStateReader, true, true, 330); - - // add some documents, then optimize to get merged-sorted segments - tstes.addDocuments(cloudSolrClient, 10, 10, true); - - // CommonParams.SEGMENT_TERMINATE_EARLY parameter intentionally absent - tstes.queryTimestampDescending(cloudSolrClient); - - // add a few more documents, but don't optimize to have some not-merge-sorted segments - tstes.addDocuments(cloudSolrClient, 2, 10, false); - - // CommonParams.SEGMENT_TERMINATE_EARLY parameter now present - tstes.queryTimestampDescendingSegmentTerminateEarlyYes(cloudSolrClient); - tstes.queryTimestampDescendingSegmentTerminateEarlyNo(cloudSolrClient); - - // CommonParams.SEGMENT_TERMINATE_EARLY parameter present but it won't be used - tstes.queryTimestampDescendingSegmentTerminateEarlyYesGrouped(cloudSolrClient); - tstes.queryTimestampAscendingSegmentTerminateEarlyYes(cloudSolrClient); // uses a sort order that is _not_ compatible with the merge sort order - - } - finally { - miniCluster.shutdown(); - } - } - } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestSegmentSorting.java b/solr/core/src/test/org/apache/solr/cloud/TestSegmentSorting.java new file mode 100644 index 00000000000..016b63e7aa3 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestSegmentSorting.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.cloud; + +import java.lang.invoke.MethodHandles; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.TieredMergePolicy; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.core.CoreDescriptor; +import org.apache.solr.index.TieredMergePolicyFactory; + +import org.junit.After; +import org.junit.BeforeClass; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestSegmentSorting extends SolrCloudTestCase { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final int NUM_SERVERS = 5; + private static final int NUM_SHARDS = 2; + private static final int REPLICATION_FACTOR = 2; + + @BeforeClass + public static void setupCluster() throws Exception { + configureCluster(NUM_SERVERS).configure(); + } + + @After + public void ensureClusterEmpty() throws Exception { + cluster.deleteAllCollections(); + cluster.getSolrClient().setDefaultCollection(null); + } + + private void createCollection(MiniSolrCloudCluster miniCluster, String collectionName, String createNodeSet, String asyncId, + Boolean indexToPersist, Map collectionProperties) throws Exception { + String configName = "solrCloudCollectionConfig"; + miniCluster.uploadConfigSet(SolrTestCaseJ4.TEST_PATH().resolve("collection1").resolve("conf"), configName); + + final boolean persistIndex = (indexToPersist != null ? indexToPersist.booleanValue() : random().nextBoolean()); + if (collectionProperties == null) { + collectionProperties = new HashMap<>(); + } + collectionProperties.putIfAbsent(CoreDescriptor.CORE_CONFIG, "solrconfig-tlog.xml"); + collectionProperties.putIfAbsent("solr.tests.maxBufferedDocs", "100000"); + collectionProperties.putIfAbsent("solr.tests.ramBufferSizeMB", "100"); + // use non-test classes so RandomizedRunner isn't necessary + if (random().nextBoolean()) { + collectionProperties.putIfAbsent(SolrTestCaseJ4.SYSTEM_PROPERTY_SOLR_TESTS_MERGEPOLICY, TieredMergePolicy.class.getName()); + collectionProperties.putIfAbsent(SolrTestCaseJ4.SYSTEM_PROPERTY_SOLR_TESTS_USEMERGEPOLICY, "true"); + collectionProperties.putIfAbsent(SolrTestCaseJ4.SYSTEM_PROPERTY_SOLR_TESTS_USEMERGEPOLICYFACTORY, "false"); + } else { + collectionProperties.putIfAbsent(SolrTestCaseJ4.SYSTEM_PROPERTY_SOLR_TESTS_MERGEPOLICYFACTORY, TieredMergePolicyFactory.class.getName()); + collectionProperties.putIfAbsent(SolrTestCaseJ4.SYSTEM_PROPERTY_SOLR_TESTS_USEMERGEPOLICYFACTORY, "true"); + collectionProperties.putIfAbsent(SolrTestCaseJ4.SYSTEM_PROPERTY_SOLR_TESTS_USEMERGEPOLICY, "false"); + } + collectionProperties.putIfAbsent("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler"); + collectionProperties.putIfAbsent("solr.directoryFactory", (persistIndex ? 
"solr.StandardDirectoryFactory" : "solr.RAMDirectoryFactory")); + + if (asyncId == null) { + CollectionAdminRequest.createCollection(collectionName, configName, NUM_SHARDS, REPLICATION_FACTOR) + .setCreateNodeSet(createNodeSet) + .setProperties(collectionProperties) + .process(miniCluster.getSolrClient()); + } + else { + CollectionAdminRequest.createCollection(collectionName, configName, NUM_SHARDS, REPLICATION_FACTOR) + .setCreateNodeSet(createNodeSet) + .setProperties(collectionProperties) + .processAndWait(miniCluster.getSolrClient(), 30); + } + } + + + public void testSegmentTerminateEarly() throws Exception { + + final String collectionName = "testSegmentTerminateEarlyCollection"; + + final SegmentTerminateEarlyTestState tstes = new SegmentTerminateEarlyTestState(random()); + + final CloudSolrClient cloudSolrClient = cluster.getSolrClient(); + cloudSolrClient.setDefaultCollection(collectionName); + + // create collection + { + final String asyncId = (random().nextBoolean() ? null : "asyncId("+collectionName+".create)="+random().nextInt()); + final Map collectionProperties = new HashMap<>(); + collectionProperties.put(CoreDescriptor.CORE_CONFIG, "solrconfig-sortingmergepolicyfactory.xml"); + createCollection(cluster, collectionName, null, asyncId, Boolean.TRUE, collectionProperties); + } + + ZkStateReader zkStateReader = cloudSolrClient.getZkStateReader(); + AbstractDistribZkTestBase.waitForRecoveriesToFinish(collectionName, zkStateReader, true, true, 330); + + // add some documents, then optimize to get merged-sorted segments + tstes.addDocuments(cloudSolrClient, 10, 10, true); + + // CommonParams.SEGMENT_TERMINATE_EARLY parameter intentionally absent + tstes.queryTimestampDescending(cloudSolrClient); + + // add a few more documents, but don't optimize to have some not-merge-sorted segments + tstes.addDocuments(cloudSolrClient, 2, 10, false); + + // CommonParams.SEGMENT_TERMINATE_EARLY parameter now present + tstes.queryTimestampDescendingSegmentTerminateEarlyYes(cloudSolrClient); + tstes.queryTimestampDescendingSegmentTerminateEarlyNo(cloudSolrClient); + + // CommonParams.SEGMENT_TERMINATE_EARLY parameter present but it won't be used + tstes.queryTimestampDescendingSegmentTerminateEarlyYesGrouped(cloudSolrClient); + tstes.queryTimestampAscendingSegmentTerminateEarlyYes(cloudSolrClient); // uses a sort order that is _not_ compatible with the merge sort order + + } +} From e4f31fab2f98b7af6d2ec12a2eb3456521b446df Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Wed, 14 Dec 2016 18:00:51 -0500 Subject: [PATCH 49/53] LUCENE-7592: if segments file is truncated, throw CorruptIndexException --- lucene/CHANGES.txt | 4 ++++ .../src/java/org/apache/lucene/index/SegmentInfos.java | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 15b89f09f2f..f38c0d5afda 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -138,6 +138,10 @@ Improvements necessarily refer to that field (AKA requireFieldMatch==false). Disabled by default. See UH get/setFieldMatcher. 
(Jim Ferenczi via David Smiley) +* LUCENE-7592: If the segments file is truncated, we now throw + CorruptIndexException instead of the more confusing EOFException + (Mike Drob via Mike McCandless) + Optimizations * LUCENE-7568: Optimize merging when index sorting is used but the diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java index 8f627cd743d..3e8b1f871bd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java @@ -17,6 +17,7 @@ package org.apache.lucene.index; +import java.io.EOFException; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; @@ -277,7 +278,11 @@ public final class SegmentInfos implements Cloneable, Iterable Date: Thu, 15 Dec 2016 12:52:37 +0200 Subject: [PATCH 50/53] LUCENE-7590: add sum, variance and stdev stats to NumericDVStats --- lucene/CHANGES.txt | 3 + .../apache/lucene/search/DocValuesStats.java | 39 +++++++++++- .../search/TestDocValuesStatsCollector.java | 62 ++++++++++++++++--- 3 files changed, 95 insertions(+), 9 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f38c0d5afda..0e327d28124 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -67,6 +67,9 @@ New features * LUCENE-7466: Added AxiomaticSimilarity. (Peilin Yang via Tommaso Teofili) +* LUCENE-7590: Added DocValuesStatsCollector to compute statistics on DocValues + fields. (Shai Erera) + Bug Fixes * LUCENE-7547: JapaneseTokenizerFactory was failing to close the diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java index 998bef4fe21..c8b775200d2 100644 --- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java +++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java @@ -98,6 +98,7 @@ public abstract class DocValuesStats { public static abstract class NumericDocValuesStats extends DocValuesStats { protected double mean = 0.0; + protected double variance = 0.0; protected NumericDocValues ndv; @@ -116,15 +117,32 @@ public abstract class DocValuesStats { return ndv.advanceExact(doc); } - /** The mean of all values of the field. Undefined when {@link #count} is zero. */ + /** The mean of all values of the field. */ public final double mean() { return mean; } + + /** Returns the variance of all values of the field. */ + public final double variance() { + int count = count(); + return count > 0 ? variance / count : 0; + } + + /** Returns the stdev of all values of the field. */ + public final double stdev() { + return Math.sqrt(variance()); + } + + /** Returns the sum of values of the field. Note that if the values are large, the {@code sum} might overflow. */ + public abstract T sum(); } /** Holds DocValues statistics for a numeric field storing {@code long} values. */ public static final class LongDocValuesStats extends NumericDocValuesStats { + // To avoid boxing 'long' to 'Long' while the sum is computed, declare it as private variable. 
+ private long sum = 0; + public LongDocValuesStats(String field) { super(field, Long.MAX_VALUE, Long.MIN_VALUE); } @@ -138,13 +156,24 @@ public abstract class DocValuesStats { if (val < min) { min = val; } + sum += val; + double oldMean = mean; mean += (val - mean) / count; + variance += (val - mean) * (val - oldMean); + } + + @Override + public Long sum() { + return sum; } } /** Holds DocValues statistics for a numeric field storing {@code double} values. */ public static final class DoubleDocValuesStats extends NumericDocValuesStats { + // To avoid boxing 'double' to 'Double' while the sum is computed, declare it as private variable. + private double sum = 0; + public DoubleDocValuesStats(String field) { super(field, Double.MAX_VALUE, Double.MIN_VALUE); } @@ -158,7 +187,15 @@ public abstract class DocValuesStats { if (Double.compare(val, min) < 0) { min = val; } + sum += val; + double oldMean = mean; mean += (val - mean) / count; + variance += (val - mean) * (val - oldMean); + } + + @Override + public Double sum() { + return sum; } } diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java index 65f82e62d42..8f8b09e6bac 100644 --- a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java +++ b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java @@ -18,6 +18,8 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Arrays; +import java.util.DoubleSummaryStatistics; +import java.util.LongSummaryStatistics; import java.util.stream.DoubleStream; import java.util.stream.LongStream; @@ -57,7 +59,33 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { } } - public void testRandomDocsWithLongValues() throws IOException { + public void testOneDoc() throws IOException { + try (Directory dir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + String field = "numeric"; + Document doc = new Document(); + doc.add(new NumericDocValuesField(field, 1)); + doc.add(new StringField("id", "doc1", Store.NO)); + indexWriter.addDocument(doc); + + try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { + IndexSearcher searcher = new IndexSearcher(reader); + LongDocValuesStats stats = new LongDocValuesStats(field); + searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); + + assertEquals(1, stats.count()); + assertEquals(0, stats.missing()); + assertEquals(1, stats.max().longValue()); + assertEquals(1, stats.min().longValue()); + assertEquals(1, stats.sum().longValue()); + assertEquals(1, stats.mean(), 0.0001); + assertEquals(0, stats.variance(), 0.0001); + assertEquals(0, stats.stdev(), 0.0001); + } + } + } + + public void testDocsWithLongValues() throws IOException { try (Directory dir = newDirectory(); IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { String field = "numeric"; @@ -94,15 +122,20 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { assertEquals(expCount, stats.count()); assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing()); if (stats.count() > 0) { - assertEquals(getPositiveValues(docValues).max().getAsLong(), stats.max().longValue()); - assertEquals(getPositiveValues(docValues).min().getAsLong(), stats.min().longValue()); - assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001); + LongSummaryStatistics 
sumStats = getPositiveValues(docValues).summaryStatistics(); + assertEquals(sumStats.getMax(), stats.max().longValue()); + assertEquals(sumStats.getMin(), stats.min().longValue()); + assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); + assertEquals(sumStats.getSum(), stats.sum().longValue()); + double variance = computeVariance(docValues, stats.mean, stats.count()); + assertEquals(variance, stats.variance(), 0.00001); + assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); } } } } - public void testRandomDocsWithDoubleValues() throws IOException { + public void testDocsWithDoubleValues() throws IOException { try (Directory dir = newDirectory(); IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { String field = "numeric"; @@ -139,9 +172,14 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { assertEquals(expCount, stats.count()); assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing()); if (stats.count() > 0) { - assertEquals(getPositiveValues(docValues).max().getAsDouble(), stats.max().doubleValue(), 0.00001); - assertEquals(getPositiveValues(docValues).min().getAsDouble(), stats.min().doubleValue(), 0.00001); - assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001); + DoubleSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics(); + assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001); + assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001); + assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); + assertEquals(sumStats.getSum(), stats.sum(), 0.00001); + double variance = computeVariance(docValues, stats.mean, stats.count()); + assertEquals(variance, stats.variance(), 0.00001); + assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); } } } @@ -163,4 +201,12 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { return Arrays.stream(docValues).filter(v -> v == 0); } + private static double computeVariance(long[] values, double mean, int count) { + return getPositiveValues(values).mapToDouble(v -> (v - mean) * (v-mean)).sum() / count; + } + + private static double computeVariance(double[] values, double mean, int count) { + return getPositiveValues(values).map(v -> (v - mean) * (v-mean)).sum() / count; + } + } From 268d4ace3695ad3738402d623400fa4775b113ef Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Thu, 15 Dec 2016 09:23:48 -0500 Subject: [PATCH 51/53] remove bad assertion --- .../src/java/org/apache/lucene/search/QueryUtils.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java index a3eaa80ed07..ae4c89023d4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java @@ -93,10 +93,6 @@ public class QueryUtils { public static void checkUnequal(Query q1, Query q2) { assertFalse(q1 + " equal to " + q2, q1.equals(q2)); assertFalse(q2 + " equal to " + q1, q2.equals(q1)); - - // possible this test can fail on a hash collision... if that - // happens, please change test to use a different example. 
- assertTrue(q1.hashCode() != q2.hashCode()); } /** deep check that explanations of a query 'score' correctly */ From ea1569e2914f9ba914b582a0801d6cb83a29529b Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 15 Dec 2016 16:30:15 +0100 Subject: [PATCH 52/53] LUCENE-7572: Cache the hash code of doc values queries. --- lucene/CHANGES.txt | 2 + .../apache/lucene/index/PrefixCodedTerms.java | 4 +- .../lucene/search/DocValuesNumbersQuery.java | 26 +-- .../lucene/search/DocValuesTermsQuery.java | 49 ++++-- .../org/apache/lucene/search/LongHashSet.java | 156 ++++++++++++++++++ .../lucene/search/LongHashSetTests.java | 100 +++++++++++ .../search/TestDocValuesTermsQuery.java | 1 + 7 files changed, 310 insertions(+), 28 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/search/LongHashSet.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/search/LongHashSetTests.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0e327d28124..bacc2703ae3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -158,6 +158,8 @@ Optimizations writing to disk, giving a small speedup in points-heavy use cases. (Mike McCandless) +* LUCENE-7572: Doc values queries now cache their hash code. (Adrien Grand) + Other * LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file diff --git a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java index 3dca3dba927..df1653bcd4d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java +++ b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java @@ -28,7 +28,9 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; /** - * Prefix codes term instances (prefixes are shared) + * Prefix codes term instances (prefixes are shared). This is expected to be + * faster to build than a FST and might also be more compact if there are no + * common suffixes. * @lucene.internal */ public class PrefixCodedTerms implements Accountable { diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesNumbersQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesNumbersQuery.java index 0fd22449ee4..772570372f4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesNumbersQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesNumbersQuery.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Arrays; +import java.util.Collection; import java.util.HashSet; import java.util.Objects; import java.util.Set; @@ -45,11 +46,16 @@ import org.apache.lucene.index.SortedNumericDocValues; public class DocValuesNumbersQuery extends Query { private final String field; - private final Set numbers; + private final LongHashSet numbers; - public DocValuesNumbersQuery(String field, Set numbers) { + public DocValuesNumbersQuery(String field, long[] numbers) { this.field = Objects.requireNonNull(field); - this.numbers = Objects.requireNonNull(numbers, "Set of numbers must not be null"); + this.numbers = new LongHashSet(numbers); + } + + public DocValuesNumbersQuery(String field, Collection numbers) { + this.field = Objects.requireNonNull(field); + this.numbers = new LongHashSet(numbers.stream().mapToLong(Long::longValue).toArray()); } public DocValuesNumbersQuery(String field, Long... 
numbers) { @@ -82,15 +88,11 @@ public class DocValuesNumbersQuery extends Query { @Override public String toString(String defaultField) { - StringBuilder sb = new StringBuilder(); - sb.append(field).append(": ["); - for (Long number : numbers) { - sb.append(number).append(", "); - } - if (numbers.size() > 0) { - sb.setLength(sb.length() - 2); - } - return sb.append(']').toString(); + return new StringBuilder() + .append(field) + .append(": ") + .append(numbers.toString()) + .toString(); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesTermsQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesTermsQuery.java index 6d852a872ae..6e30baed9cd 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesTermsQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/DocValuesTermsQuery.java @@ -25,7 +25,10 @@ import java.util.Objects; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PrefixCodedTerms; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.PrefixCodedTerms.TermIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; @@ -91,13 +94,24 @@ import org.apache.lucene.util.LongBitSet; public class DocValuesTermsQuery extends Query { private final String field; - private final BytesRef[] terms; + private final PrefixCodedTerms termData; + private final int termDataHashCode; // cached hashcode of termData public DocValuesTermsQuery(String field, Collection terms) { this.field = Objects.requireNonNull(field); Objects.requireNonNull(terms, "Collection of terms must not be null"); - this.terms = terms.toArray(new BytesRef[terms.size()]); - ArrayUtil.timSort(this.terms); + BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]); + ArrayUtil.timSort(sortedTerms); + PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); + BytesRef previous = null; + for (BytesRef term : sortedTerms) { + if (term.equals(previous) == false) { + builder.add(field, term); + } + previous = term; + } + termData = builder.finish(); + termDataHashCode = termData.hashCode(); } public DocValuesTermsQuery(String field, BytesRef... 
terms) { @@ -124,26 +138,30 @@ public class DocValuesTermsQuery extends Query { } private boolean equalsTo(DocValuesTermsQuery other) { - return field.equals(other.field) && - Arrays.equals(terms, other.terms); + // termData might be heavy to compare so check the hash code first + return termDataHashCode == other.termDataHashCode && + termData.equals(other.termData); } @Override public int hashCode() { - return 31 * classHash() + Objects.hash(field, Arrays.asList(terms)); + return 31 * classHash() + termDataHashCode; } @Override public String toString(String defaultField) { - StringBuilder sb = new StringBuilder(); - sb.append(field).append(": ["); - for (BytesRef term : terms) { - sb.append(term).append(", "); + StringBuilder builder = new StringBuilder(); + boolean first = true; + TermIterator iterator = termData.iterator(); + for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { + if (!first) { + builder.append(' '); + } + first = false; + builder.append(new Term(iterator.field(), term).toString()); } - if (terms.length > 0) { - sb.setLength(sb.length() - 2); - } - return sb.append(']').toString(); + + return builder.toString(); } @Override @@ -155,7 +173,8 @@ public class DocValuesTermsQuery extends Query { final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field); final LongBitSet bits = new LongBitSet(values.getValueCount()); boolean matchesAtLeastOneTerm = false; - for (BytesRef term : terms) { + TermIterator iterator = termData.iterator(); + for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { final long ord = values.lookupTerm(term); if (ord >= 0) { matchesAtLeastOneTerm = true; diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/LongHashSet.java b/lucene/sandbox/src/java/org/apache/lucene/search/LongHashSet.java new file mode 100644 index 00000000000..3a6af5fbe70 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/LongHashSet.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.util.AbstractSet; +import java.util.Arrays; +import java.util.Iterator; +import java.util.NoSuchElementException; + +import org.apache.lucene.util.packed.PackedInts; + +final class LongHashSet extends AbstractSet { + + private static final long MISSING = Long.MIN_VALUE; + + final long[] table; + final int mask; + final boolean hasMissingValue; + final int size; + final int hashCode; + + LongHashSet(long... 
values) { + int tableSize = Math.toIntExact(values.length * 3L / 2); + tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2 + assert tableSize >= values.length * 3L / 2; + table = new long[tableSize]; + Arrays.fill(table, MISSING); + mask = tableSize - 1; + boolean hasMissingValue = false; + int size = 0; + int hashCode = 0; + for (long value : values) { + if (value == MISSING || add(value)) { + if (value == MISSING) { + hasMissingValue = true; + } + ++size; + hashCode += Long.hashCode(value); + } + } + this.hasMissingValue = hasMissingValue; + this.size = size; + this.hashCode = hashCode; + } + + private boolean add(long l) { + assert l != MISSING; + final int slot = Long.hashCode(l) & mask; + for (int i = slot; ; i = (i + 1) & mask) { + if (table[i] == MISSING) { + table[i] = l; + return true; + } else if (table[i] == l) { + // already added + return false; + } + } + } + + boolean contains(long l) { + if (l == MISSING) { + return hasMissingValue; + } + final int slot = Long.hashCode(l) & mask; + for (int i = slot; ; i = (i + 1) & mask) { + if (table[i] == MISSING) { + return false; + } else if (table[i] == l) { + return true; + } + } + } + + @Override + public int size() { + return size; + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public boolean equals(Object obj) { + if (obj != null && obj.getClass() == LongHashSet.class) { + LongHashSet that = (LongHashSet) obj; + if (hashCode != that.hashCode + || size != that.size + || hasMissingValue != that.hasMissingValue) { + return false; + } + for (long v : table) { + if (v != MISSING && that.contains(v) == false) { + return false; + } + } + return true; + } + return super.equals(obj); + } + + @Override + public boolean contains(Object o) { + return o instanceof Long && contains(((Long) o).longValue()); + } + + @Override + public Iterator iterator() { + return new Iterator() { + + private boolean hasNext = hasMissingValue; + private int i = -1; + private long value = MISSING; + + @Override + public boolean hasNext() { + if (hasNext) { + return true; + } + while (++i < table.length) { + value = table[i]; + if (value != MISSING) { + return hasNext = true; + } + } + return false; + } + + @Override + public Long next() { + if (hasNext() == false) { + throw new NoSuchElementException(); + } + hasNext = false; + return value; + } + + }; + } + +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/LongHashSetTests.java b/lucene/sandbox/src/test/org/apache/lucene/search/LongHashSetTests.java new file mode 100644 index 00000000000..25d94a6dbbc --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/search/LongHashSetTests.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +import org.apache.lucene.util.LuceneTestCase; + +public class LongHashSetTests extends LuceneTestCase { + + private void assertEquals(Set set1, LongHashSet set2) { + LuceneTestCase.assertEquals(set1, set2); + LuceneTestCase.assertEquals(set2, set1); + LuceneTestCase.assertEquals(set2, set2); + assertEquals(set1.hashCode(), set2.hashCode()); + + if (set1.isEmpty() == false) { + Set set3 = new HashSet<>(set1); + long removed = set3.iterator().next(); + while (true) { + long next = random().nextLong(); + if (next != removed && set3.add(next)) { + break; + } + } + assertNotEquals(set3, set2); + } + } + + private void assertNotEquals(Set set1, LongHashSet set2) { + assertFalse(set1.equals(set2)); + assertFalse(set2.equals(set1)); + LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).toArray()); + assertFalse(set2.equals(set3)); + } + + public void testEmpty() { + Set set1 = new HashSet<>(); + LongHashSet set2 = new LongHashSet(); + assertEquals(set1, set2); + } + + public void testOneValue() { + Set set1 = new HashSet<>(Arrays.asList(42L)); + LongHashSet set2 = new LongHashSet(42); + assertEquals(set1, set2); + + set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE)); + set2 = new LongHashSet(Long.MIN_VALUE); + assertEquals(set1, set2); + } + + public void testTwoValues() { + Set set1 = new HashSet<>(Arrays.asList(42L, Long.MAX_VALUE)); + LongHashSet set2 = new LongHashSet(42, Long.MAX_VALUE); + assertEquals(set1, set2); + + set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE, 42L)); + set2 = new LongHashSet(Long.MIN_VALUE, 42L); + assertEquals(set1, set2); + } + + public void testRandom() { + final int iters = atLeast(10); + for (int iter = 0; iter < iters; ++iter) { + long[] values = new long[random().nextInt(1 << random().nextInt(16))]; + for (int i = 0; i < values.length; ++i) { + if (i == 0 || random().nextInt(10) < 9) { + values[i] = random().nextLong(); + } else { + values[i] = values[random().nextInt(i)]; + } + } + if (values.length > 0 && random().nextBoolean()) { + values[values.length/2] = Long.MIN_VALUE; + } + Set set1 = LongStream.of(values).mapToObj(Long::valueOf).collect(Collectors.toCollection(HashSet::new)); + LongHashSet set2 = new LongHashSet(values); + assertEquals(set1, set2); + } + } +} \ No newline at end of file diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestDocValuesTermsQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestDocValuesTermsQuery.java index 6e994927947..187f172b9f9 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/TestDocValuesTermsQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestDocValuesTermsQuery.java @@ -38,6 +38,7 @@ public class TestDocValuesTermsQuery extends LuceneTestCase { public void testEquals() { assertEquals(new DocValuesTermsQuery("foo", "bar"), new DocValuesTermsQuery("foo", "bar")); + assertEquals(new DocValuesTermsQuery("foo", "bar"), new DocValuesTermsQuery("foo", "bar", "bar")); assertEquals(new DocValuesTermsQuery("foo", "bar", "baz"), new DocValuesTermsQuery("foo", "baz", "bar")); assertFalse(new DocValuesTermsQuery("foo", "bar").equals(new DocValuesTermsQuery("foo2", "bar"))); assertFalse(new DocValuesTermsQuery("foo", "bar").equals(new DocValuesTermsQuery("foo", "baz"))); From 3b182aa2fb3e4062f6ec5be819f3aa70aa2e523d Mon Sep 17 00:00:00 2001 From: Adrien Grand 
Date: Thu, 15 Dec 2016 16:33:36 +0100 Subject: [PATCH 53/53] LUCENE-7589: Prevent outliers from raising the bpv for everyone. --- lucene/CHANGES.txt | 4 + .../lucene70/Lucene70DocValuesConsumer.java | 163 ++++++++++--- .../lucene70/Lucene70DocValuesFormat.java | 5 +- .../lucene70/Lucene70DocValuesProducer.java | 220 ++++++++++++++---- .../org/apache/lucene/util/LongValues.java | 9 + .../lucene/util/packed/DirectWriter.java | 8 +- .../lucene70/TestLucene70DocValuesFormat.java | 152 ++++++++++++ 7 files changed, 479 insertions(+), 82 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index bacc2703ae3..7e614693fd5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -48,6 +48,10 @@ Optimizations * LUCENE-7519: Add optimized APIs to compute browse-only top level facets (Mike McCandless) +* LUCENE-7589: Numeric doc values now have the ability to encode blocks of + values using different numbers of bits per value if this proves to save + storage. (Adrien Grand) + Other * LUCENE-7328: Remove LegacyNumericEncoding from GeoPointField. (Nick Knize) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java index e1b66e13eb0..2dd68e9e82b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java @@ -18,6 +18,8 @@ package org.apache.lucene.codecs.lucene70; import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.NUMERIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE; import java.io.Closeable; // javadocs import java.io.IOException; @@ -42,6 +44,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.store.GrowableByteArrayDataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; @@ -112,12 +115,46 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close }); } + private static class MinMaxTracker { + long min, max, numValues, spaceInBits; + + MinMaxTracker() { + reset(); + spaceInBits = 0; + } + + private void reset() { + min = Long.MAX_VALUE; + max = Long.MIN_VALUE; + numValues = 0; + } + + /** Accumulate a new value. */ + void update(long v) { + min = Math.min(min, v); + max = Math.max(max, v); + ++numValues; + } + + /** Update the required space. */ + void finish() { + if (max > min) { + spaceInBits += DirectWriter.unsignedBitsRequired(max - min) * numValues; + } + } + + /** Update space usage and get ready for accumulating values for the next block. 
*/ + void nextBlock() { + finish(); + reset(); + } + } + private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); int numDocsWithValue = 0; - long numValues = 0; - long min = Long.MAX_VALUE; - long max = Long.MIN_VALUE; + MinMaxTracker minMax = new MinMaxTracker(); + MinMaxTracker blockMinMax = new MinMaxTracker(); long gcd = 0; Set uniqueValues = new HashSet<>(); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { @@ -130,26 +167,35 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close // wrong results. Since these extreme values are unlikely, we just discard // GCD computation for them gcd = 1; - } else if (numValues != 0) { // minValue needs to be set first - gcd = MathUtil.gcd(gcd, v - min); + } else if (minMax.numValues != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minMax.min); } } - min = Math.min(min, v); - max = Math.max(max, v); + minMax.update(v); + blockMinMax.update(v); + if (blockMinMax.numValues == NUMERIC_BLOCK_SIZE) { + blockMinMax.nextBlock(); + } if (uniqueValues != null && uniqueValues.add(v) && uniqueValues.size() > 256) { uniqueValues = null; } - - numValues++; } numDocsWithValue++; } + minMax.finish(); + blockMinMax.finish(); + + final long numValues = minMax.numValues; + long min = minMax.min; + final long max = minMax.max; + assert blockMinMax.spaceInBits <= minMax.spaceInBits; + if (numDocsWithValue == 0) { meta.writeLong(-2); meta.writeLong(0L); @@ -166,6 +212,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close meta.writeLong(numValues); final int numBitsPerValue; + boolean doBlocks = false; Map encode = null; if (min >= max) { numBitsPerValue = 0; @@ -189,12 +236,19 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close gcd = 1; } else { uniqueValues = null; - numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd); - if (gcd == 1 && min > 0 - && DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) { - min = 0; + // we do blocks if that appears to save 10+% storage + doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits / minMax.spaceInBits <= 0.9; + if (doBlocks) { + numBitsPerValue = 0xFF; + meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT); + } else { + numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd); + if (gcd == 1 && min > 0 + && DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) { + min = 0; + } + meta.writeInt(-1); } - meta.writeInt(-1); } } @@ -203,26 +257,79 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close meta.writeLong(gcd); long startOffset = data.getFilePointer(); meta.writeLong(startOffset); - if (numBitsPerValue != 0) { - values = valuesProducer.getSortedNumeric(field); - DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue); - for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { - for (int i = 0, count = values.docValueCount(); i < count; ++i) { - long v = values.nextValue(); - if (encode == null) { - writer.add((v - min) / gcd); - } else { - writer.add(encode.get(v)); - } - } - } - writer.finish(); + if (doBlocks) { + writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd); + } else if (numBitsPerValue != 0) { + 
writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), numValues, numBitsPerValue, min, gcd, encode); } meta.writeLong(data.getFilePointer() - startOffset); return new long[] {numDocsWithValue, numValues}; } + private void writeValuesSingleBlock(SortedNumericDocValues values, long numValues, int numBitsPerValue, + long min, long gcd, Map encode) throws IOException { + DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (int i = 0, count = values.docValueCount(); i < count; ++i) { + long v = values.nextValue(); + if (encode == null) { + writer.add((v - min) / gcd); + } else { + writer.add(encode.get(v)); + } + } + } + writer.finish(); + } + + private void writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException { + final long[] buffer = new long[NUMERIC_BLOCK_SIZE]; + final GrowableByteArrayDataOutput encodeBuffer = new GrowableByteArrayDataOutput(NUMERIC_BLOCK_SIZE); + int upTo = 0; + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (int i = 0, count = values.docValueCount(); i < count; ++i) { + buffer[upTo++] = values.nextValue(); + if (upTo == NUMERIC_BLOCK_SIZE) { + writeBlock(buffer, NUMERIC_BLOCK_SIZE, gcd, encodeBuffer); + upTo = 0; + } + } + } + if (upTo > 0) { + writeBlock(buffer, upTo, gcd, encodeBuffer); + } + } + + private void writeBlock(long[] values, int length, long gcd, GrowableByteArrayDataOutput buffer) throws IOException { + assert length > 0; + long min = values[0]; + long max = values[0]; + for (int i = 1; i < length; ++i) { + final long v = values[i]; + assert Math.floorMod(values[i] - min, gcd) == 0; + min = Math.min(min, v); + max = Math.max(max, v); + } + if (min == max) { + data.writeByte((byte) 0); + data.writeLong(min); + } else { + final int bitsPerValue = DirectWriter.unsignedBitsRequired(max - min); + buffer.reset(); + assert buffer.getPosition() == 0; + final DirectWriter w = DirectWriter.getInstance(buffer, length, bitsPerValue); + for (int i = 0; i < length; ++i) { + w.add((values[i] - min) / gcd); + } + w.finish(); + data.writeByte((byte) bitsPerValue); + data.writeLong(min); + data.writeInt(buffer.getPosition()); + data.writeBytes(buffer.getBytes(), buffer.getPosition()); + } + } + @Override public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { meta.writeInt(field.number); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java index ee477d666ee..2ce2124ff34 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java @@ -146,10 +146,11 @@ public final class Lucene70DocValuesFormat extends DocValuesFormat { static final byte SORTED_SET = 3; static final byte SORTED_NUMERIC = 4; - // addressing uses 16k blocks - static final int MONOTONIC_BLOCK_SIZE = 16384; static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16; + static final int NUMERIC_BLOCK_SHIFT = 14; + static final int NUMERIC_BLOCK_SIZE = 1 << NUMERIC_BLOCK_SHIFT; + static final int TERMS_DICT_BLOCK_SHIFT = 4; static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT; static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1; diff --git 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java index 3f3e73f2e38..386655e8c77 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java @@ -144,7 +144,7 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close entry.docsWithFieldLength = meta.readLong(); entry.numValues = meta.readLong(); int tableSize = meta.readInt(); - if (tableSize < -1 || tableSize > 256) { + if (tableSize > 256) { throw new CorruptIndexException("invalid table size: " + tableSize, meta); } if (tableSize >= 0) { @@ -154,6 +154,11 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close entry.table[i] = meta.readLong(); } } + if (tableSize < -1) { + entry.blockShift = -2 - tableSize; + } else { + entry.blockShift = -1; + } entry.bitsPerValue = meta.readByte(); entry.minValue = meta.readLong(); entry.gcd = meta.readLong(); @@ -260,6 +265,7 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close private static class NumericEntry { long[] table; + int blockShift; byte bitsPerValue; long docsWithFieldOffset; long docsWithFieldLength; @@ -429,24 +435,62 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close }; } else { final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength); - final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); - if (entry.table != null) { - final long[] table = entry.table; + if (entry.blockShift >= 0) { + // dense but split into blocks of different bits per value + final int shift = entry.blockShift; + final long mul = entry.gcd; + final int mask = (1 << shift) - 1; return new DenseNumericDocValues(maxDoc) { + int block = -1; + long delta; + long offset; + long blockEndOffset; + LongValues values; + @Override public long longValue() throws IOException { - return table[(int) values.get(doc)]; + final int block = doc >>> shift; + if (this.block != block) { + int bitsPerValue; + do { + offset = blockEndOffset; + bitsPerValue = slice.readByte(offset++); + delta = slice.readLong(offset); + offset += Long.BYTES; + if (bitsPerValue == 0) { + blockEndOffset = offset; + } else { + final int length = slice.readInt(offset); + offset += Integer.BYTES; + blockEndOffset = offset + length; + } + this.block ++; + } while (this.block != block); + values = bitsPerValue == 0 ? 
LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset); + } + return mul * values.get(doc & mask) + delta; } }; } else { - final long mul = entry.gcd; - final long delta = entry.minValue; - return new DenseNumericDocValues(maxDoc) { - @Override - public long longValue() throws IOException { - return mul * values.get(doc) + delta; - } - }; + final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); + if (entry.table != null) { + final long[] table = entry.table; + return new DenseNumericDocValues(maxDoc) { + @Override + public long longValue() throws IOException { + return table[(int) values.get(doc)]; + } + }; + } else { + final long mul = entry.gcd; + final long delta = entry.minValue; + return new DenseNumericDocValues(maxDoc) { + @Override + public long longValue() throws IOException { + return mul * values.get(doc) + delta; + } + }; + } } } } else { @@ -461,24 +505,63 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close }; } else { final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength); - final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); - if (entry.table != null) { - final long[] table = entry.table; + if (entry.blockShift >= 0) { + // sparse and split into blocks of different bits per value + final int shift = entry.blockShift; + final long mul = entry.gcd; + final int mask = (1 << shift) - 1; return new SparseNumericDocValues(disi) { + int block = -1; + long delta; + long offset; + long blockEndOffset; + LongValues values; + @Override public long longValue() throws IOException { - return table[(int) values.get(disi.index())]; + final int index = disi.index(); + final int block = index >>> shift; + if (this.block != block) { + int bitsPerValue; + do { + offset = blockEndOffset; + bitsPerValue = slice.readByte(offset++); + delta = slice.readLong(offset); + offset += Long.BYTES; + if (bitsPerValue == 0) { + blockEndOffset = offset; + } else { + final int length = slice.readInt(offset); + offset += Integer.BYTES; + blockEndOffset = offset + length; + } + this.block ++; + } while (this.block != block); + values = bitsPerValue == 0 ? 
LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset); + } + return mul * values.get(index & mask) + delta; } }; } else { - final long mul = entry.gcd; - final long delta = entry.minValue; - return new SparseNumericDocValues(disi) { - @Override - public long longValue() throws IOException { - return mul * values.get(disi.index()) + delta; - } - }; + final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); + if (entry.table != null) { + final long[] table = entry.table; + return new SparseNumericDocValues(disi) { + @Override + public long longValue() throws IOException { + return table[(int) values.get(disi.index())]; + } + }; + } else { + final long mul = entry.gcd; + final long delta = entry.minValue; + return new SparseNumericDocValues(disi) { + @Override + public long longValue() throws IOException { + return mul * values.get(disi.index()) + delta; + } + }; + } } } } @@ -494,34 +577,75 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close }; } else { final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength); - final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); - if (entry.table != null) { - final long[] table = entry.table; + if (entry.blockShift >= 0) { + final int shift = entry.blockShift; + final long mul = entry.gcd; + final long mask = (1L << shift) - 1; return new LongValues() { - @Override + long block = -1; + long delta; + long offset; + long blockEndOffset; + LongValues values; + public long get(long index) { - return table[(int) values.get(index)]; - } - }; - } else if (entry.gcd != 1) { - final long gcd = entry.gcd; - final long minValue = entry.minValue; - return new LongValues() { - @Override - public long get(long index) { - return values.get(index) * gcd + minValue; - } - }; - } else if (entry.minValue != 0) { - final long minValue = entry.minValue; - return new LongValues() { - @Override - public long get(long index) { - return values.get(index) + minValue; + final long block = index >>> shift; + if (this.block != block) { + assert block > this.block : "Reading backwards is illegal: " + this.block + " < " + block; + int bitsPerValue; + do { + offset = blockEndOffset; + try { + bitsPerValue = slice.readByte(offset++); + delta = slice.readLong(offset); + offset += Long.BYTES; + if (bitsPerValue == 0) { + blockEndOffset = offset; + } else { + final int length = slice.readInt(offset); + offset += Integer.BYTES; + blockEndOffset = offset + length; + } + } catch (IOException e) { + throw new RuntimeException(e); + } + this.block ++; + } while (this.block != block); + values = bitsPerValue == 0 ? 
LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset); + } + return mul * values.get(index & mask) + delta; } }; } else { - return values; + final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); + if (entry.table != null) { + final long[] table = entry.table; + return new LongValues() { + @Override + public long get(long index) { + return table[(int) values.get(index)]; + } + }; + } else if (entry.gcd != 1) { + final long gcd = entry.gcd; + final long minValue = entry.minValue; + return new LongValues() { + @Override + public long get(long index) { + return values.get(index) * gcd + minValue; + } + }; + } else if (entry.minValue != 0) { + final long minValue = entry.minValue; + return new LongValues() { + @Override + public long get(long index) { + return values.get(index) + minValue; + } + }; + } else { + return values; + } } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/LongValues.java b/lucene/core/src/java/org/apache/lucene/util/LongValues.java index 23f4d32fc9a..04fbf81dc1c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/LongValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/LongValues.java @@ -30,6 +30,15 @@ public abstract class LongValues { }; + public static final LongValues ZEROES = new LongValues() { + + @Override + public long get(long index) { + return 0; + } + + }; + /** Get value at index. */ public abstract long get(long index); diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java b/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java index 9a7f18eb26f..5a38445d20d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java @@ -21,7 +21,7 @@ import java.io.EOFException; import java.io.IOException; import java.util.Arrays; -import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.DataOutput; /** * Class for writing packed integers to be directly read from Directory. @@ -44,7 +44,7 @@ import org.apache.lucene.store.IndexOutput; public final class DirectWriter { final int bitsPerValue; final long numValues; - final IndexOutput output; + final DataOutput output; long count; boolean finished; @@ -56,7 +56,7 @@ public final class DirectWriter { final BulkOperation encoder; final int iterations; - DirectWriter(IndexOutput output, long numValues, int bitsPerValue) { + DirectWriter(DataOutput output, long numValues, int bitsPerValue) { this.output = output; this.numValues = numValues; this.bitsPerValue = bitsPerValue; @@ -103,7 +103,7 @@ public final class DirectWriter { } /** Returns an instance suitable for encoding {@code numValues} using {@code bitsPerValue} */ - public static DirectWriter getInstance(IndexOutput output, long numValues, int bitsPerValue) { + public static DirectWriter getInstance(DataOutput output, long numValues, int bitsPerValue) { if (Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) < 0) { throw new IllegalArgumentException("Unsupported bitsPerValue " + bitsPerValue + ". 
Did you use bitsRequired?"); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java index 8661298b51d..6cca55e3a48 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java @@ -25,6 +25,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; +import java.util.function.LongSupplier; import java.util.function.Supplier; import org.apache.lucene.analysis.MockAnalyzer; @@ -61,6 +62,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMFile; @@ -534,4 +536,154 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT dir.close(); } } + + @Slow + public void testSortedNumericBlocksOfVariousBitsPerValue() throws Exception { + doTestSortedNumericBlocksOfVariousBitsPerValue(() -> TestUtil.nextInt(random(), 1, 3)); + } + + @Slow + public void testSparseSortedNumericBlocksOfVariousBitsPerValue() throws Exception { + doTestSortedNumericBlocksOfVariousBitsPerValue(() -> TestUtil.nextInt(random(), 0, 2)); + } + + @Slow + public void testNumericBlocksOfVariousBitsPerValue() throws Exception { + doTestSparseNumericBlocksOfVariousBitsPerValue(1); + } + + @Slow + public void testSparseNumericBlocksOfVariousBitsPerValue() throws Exception { + doTestSparseNumericBlocksOfVariousBitsPerValue(random().nextDouble()); + } + + private static LongSupplier blocksOfVariousBPV() { + final long mul = TestUtil.nextInt(random(), 1, 100); + final long min = random().nextInt(); + return new LongSupplier() { + int i = Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE; + int maxDelta; + @Override + public long getAsLong() { + if (i == Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE) { + maxDelta = 1 << random().nextInt(5); + i = 0; + } + i++; + return min + mul * random().nextInt(maxDelta); + } + }; + } + + private void doTestSortedNumericBlocksOfVariousBitsPerValue(LongSupplier counts) throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + conf.setMaxBufferedDocs(atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE)); + conf.setRAMBufferSizeMB(-1); + conf.setMergePolicy(newLogMergePolicy(random().nextBoolean())); + IndexWriter writer = new IndexWriter(dir, conf); + + final int numDocs = atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE*3); + final LongSupplier values = blocksOfVariousBPV(); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + + int valueCount = (int) counts.getAsLong(); + long valueArray[] = new long[valueCount]; + for (int j = 0; j < valueCount; j++) { + long value = values.getAsLong(); + valueArray[j] = value; + doc.add(new SortedNumericDocValuesField("dv", value)); + } + Arrays.sort(valueArray); + for (int j = 0; j < valueCount; j++) { + doc.add(new StoredField("stored", Long.toString(valueArray[j]))); + } + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + writer.forceMerge(1); + + writer.close(); + + // compare + DirectoryReader ir = 
DirectoryReader.open(dir); + TestUtil.checkReader(ir); + for (LeafReaderContext context : ir.leaves()) { + LeafReader r = context.reader(); + SortedNumericDocValues docValues = DocValues.getSortedNumeric(r, "dv"); + for (int i = 0; i < r.maxDoc(); i++) { + if (i > docValues.docID()) { + docValues.nextDoc(); + } + String expected[] = r.document(i).getValues("stored"); + if (i < docValues.docID()) { + assertEquals(0, expected.length); + } else { + String actual[] = new String[docValues.docValueCount()]; + for (int j = 0; j < actual.length; j++) { + actual[j] = Long.toString(docValues.nextValue()); + } + assertArrayEquals(expected, actual); + } + } + } + ir.close(); + dir.close(); + } + + private void doTestSparseNumericBlocksOfVariousBitsPerValue(double density) throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + conf.setMaxBufferedDocs(atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE)); + conf.setRAMBufferSizeMB(-1); + conf.setMergePolicy(newLogMergePolicy(random().nextBoolean())); + IndexWriter writer = new IndexWriter(dir, conf); + Document doc = new Document(); + Field storedField = newStringField("stored", "", Field.Store.YES); + Field dvField = new NumericDocValuesField("dv", 0); + doc.add(storedField); + doc.add(dvField); + + final int numDocs = atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE*3); + final LongSupplier longs = blocksOfVariousBPV(); + for (int i = 0; i < numDocs; i++) { + if (random().nextDouble() > density) { + writer.addDocument(new Document()); + continue; + } + long value = longs.getAsLong(); + storedField.setStringValue(Long.toString(value)); + dvField.setLongValue(value); + writer.addDocument(doc); + } + + writer.forceMerge(1); + + writer.close(); + + // compare + DirectoryReader ir = DirectoryReader.open(dir); + TestUtil.checkReader(ir); + for (LeafReaderContext context : ir.leaves()) { + LeafReader r = context.reader(); + NumericDocValues docValues = DocValues.getNumeric(r, "dv"); + docValues.nextDoc(); + for (int i = 0; i < r.maxDoc(); i++) { + String storedValue = r.document(i).get("stored"); + if (storedValue == null) { + assertTrue(docValues.docID() > i); + } else { + assertEquals(i, docValues.docID()); + assertEquals(Long.parseLong(storedValue), docValues.longValue()); + docValues.nextDoc(); + } + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID()); + } + ir.close(); + dir.close(); + } }
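The LUCENE-7589 change above decides between one global bits-per-value and per-block bits-per-value by comparing the space each layout would need, and it switches to 16K-value blocks only when that saves at least 10%. The standalone sketch below (not part of the patch) illustrates that trade-off in plain Java: BlockBpvSketch and its bitsRequired helper are illustrative stand-ins for DirectWriter.unsignedBitsRequired and the MinMaxTracker bookkeeping, and the sketch only compares sizes rather than writing packed data.

import java.util.Random;

public class BlockBpvSketch {

  static final int BLOCK_SIZE = 1 << 14; // NUMERIC_BLOCK_SIZE in the patch

  // bits needed to store an unsigned delta; stand-in for DirectWriter.unsignedBitsRequired
  static int bitsRequired(long maxDelta) {
    return maxDelta == 0 ? 0 : 64 - Long.numberOfLeadingZeros(maxDelta);
  }

  public static void main(String[] args) {
    Random r = new Random(42);
    long[] values = new long[10 * BLOCK_SIZE];
    for (int i = 0; i < values.length; i++) {
      values[i] = r.nextInt(16);          // small values: 4 bits per value would do
    }
    values[values.length / 2] = 1L << 40; // one outlier

    // one block over the whole segment: the outlier raises the bpv for every value
    long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
    for (long v : values) { min = Math.min(min, v); max = Math.max(max, v); }
    long singleBlockBits = (long) bitsRequired(max - min) * values.length;

    // per-block: only the block containing the outlier pays for the wide encoding
    long perBlockBits = 0;
    for (int start = 0; start < values.length; start += BLOCK_SIZE) {
      long bMin = Long.MAX_VALUE, bMax = Long.MIN_VALUE;
      for (int i = start; i < start + BLOCK_SIZE; i++) {
        bMin = Math.min(bMin, values[i]);
        bMax = Math.max(bMax, values[i]);
      }
      perBlockBits += (long) bitsRequired(bMax - bMin) * BLOCK_SIZE;
    }

    boolean doBlocks = perBlockBits <= 0.9 * singleBlockBits; // the patch's 10% rule
    System.out.println("single bpv bits = " + singleBlockBits
        + ", per-block bits = " + perBlockBits + ", doBlocks = " + doBlocks);
  }
}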
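The LUCENE-7590 patch earlier in this series accumulates variance in a single pass with the recurrence mean += (val - mean) / count; variance += (val - mean) * (val - oldMean), i.e. Welford's online algorithm. The sketch below is illustrative only (OnlineVarianceSketch is not a Lucene class); it runs the same update next to a plain two-pass computation so the equivalence is easy to check.

public class OnlineVarianceSketch {
  public static void main(String[] args) {
    long[] values = {3, 7, 7, 19, 24, 24, 42};

    // one pass, mirroring the update added to NumericDocValuesStats
    long count = 0;
    double mean = 0.0, variance = 0.0;
    for (long v : values) {
      count++;
      double oldMean = mean;
      mean += (v - mean) / count;
      variance += (v - mean) * (v - oldMean);
    }
    double onePassVariance = variance / count; // population variance, as in variance()

    // two passes, for comparison
    double sum = 0;
    for (long v : values) sum += v;
    double twoPassMean = sum / values.length;
    double sq = 0;
    for (long v : values) sq += (v - twoPassMean) * (v - twoPassMean);
    double twoPassVariance = sq / values.length;

    System.out.println("one-pass variance = " + onePassVariance
        + ", two-pass variance = " + twoPassVariance);
  }
}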
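LUCENE-7572 above caches the hash code of the PrefixCodedTerms held by DocValuesTermsQuery and compares the cached hash codes before the (potentially large) term data in equals. The minimal sketch below shows the same pattern on a made-up TermsKey class; it assumes only that such keys are hashed and compared frequently, as queries are when they act as cache keys.

import java.util.Arrays;

final class TermsKey {
  private final String field;
  private final String[] sortedTerms;
  private final int cachedHashCode; // computed once, reused by every hashCode() call

  TermsKey(String field, String... terms) {
    this.field = field;
    this.sortedTerms = terms.clone();
    Arrays.sort(this.sortedTerms);
    this.cachedHashCode = 31 * field.hashCode() + Arrays.hashCode(this.sortedTerms);
  }

  @Override
  public int hashCode() {
    return cachedHashCode;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof TermsKey)) {
      return false;
    }
    TermsKey other = (TermsKey) obj;
    // cheap hash comparison first, full comparison only when the hashes match
    return cachedHashCode == other.cachedHashCode
        && field.equals(other.field)
        && Arrays.equals(sortedTerms, other.sortedTerms);
  }
}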