diff --git a/dev-tools/scripts/checkJavadocLinks.py b/dev-tools/scripts/checkJavadocLinks.py index 20993145741..9a4dbf714ad 100644 --- a/dev-tools/scripts/checkJavadocLinks.py +++ b/dev-tools/scripts/checkJavadocLinks.py @@ -197,6 +197,9 @@ def checkAll(dirName): elif link.find('lucene.apache.org/java/docs/discussion.html') != -1: # OK pass + elif link.find('lucene.apache.org/core/discussion.html') != -1: + # OK + pass elif link.find('lucene.apache.org/solr/mirrors-solr-latest-redir.html') != -1: # OK pass diff --git a/dev-tools/scripts/smokeTestRelease.py b/dev-tools/scripts/smokeTestRelease.py index b83de5891c3..9b6aa9a7825 100644 --- a/dev-tools/scripts/smokeTestRelease.py +++ b/dev-tools/scripts/smokeTestRelease.py @@ -308,7 +308,7 @@ def checkSigs(project, urlString, version, tmpDir, isSigned): artifact = text artifactURL = subURL if project == 'solr': - expected = 'apache-solr-%s' % version + expected = 'solr-%s' % version else: expected = 'lucene-%s' % version if not artifact.startswith(expected): @@ -334,9 +334,9 @@ def checkSigs(project, urlString, version, tmpDir, isSigned): 'lucene-%s.tgz' % version, 'lucene-%s.zip' % version] else: - expected = ['apache-solr-%s-src.tgz' % version, - 'apache-solr-%s.tgz' % version, - 'apache-solr-%s.zip' % version] + expected = ['solr-%s-src.tgz' % version, + 'solr-%s.tgz' % version, + 'solr-%s.zip' % version] actual = [x[0] for x in artifacts] if expected != actual: @@ -556,10 +556,7 @@ def unpackAndVerify(project, tmpDir, artifact, version): # make sure it unpacks to proper subdir l = os.listdir(destDir) - if project == 'solr': - expected = 'apache-%s-%s' % (project, version) - else: - expected = '%s-%s' % (project, version) + expected = '%s-%s' % (project, version) if l != [expected]: raise RuntimeError('unpack produced entries %s; expected only %s' % (l, expected)) @@ -956,7 +953,6 @@ def getDistributionsForMavenChecks(tmpDir, version, baseURL): distributionFiles = defaultdict() for project in ('lucene', 'solr'): distribution = '%s-%s.tgz' % (project, version) - if project == 'solr': distribution = 'apache-' + distribution if not os.path.exists('%s/%s' % (tmpDir, distribution)): distURL = '%s/%s/%s' % (baseURL, project, distribution) print(' download %s...' 
% distribution, end=' ') @@ -1010,8 +1006,6 @@ def checkIdenticalMavenArtifacts(distributionFiles, nonMavenizedDeps, artifacts, distFilenames = dict() for file in distributionFiles[project]: baseName = os.path.basename(file) - if project == 'solr': # Remove 'apache-' prefix to allow comparison to Maven artifacts - baseName = baseName.replace('apache-', '') distFilenames[baseName] = file for artifact in artifacts[project]: if reJarWar.search(artifact): @@ -1348,9 +1342,9 @@ def smokeTest(baseURL, version, tmpDir, isSigned): print() print('Test Solr...') checkSigs('solr', solrPath, version, tmpDir, isSigned) - for artifact in ('apache-solr-%s.tgz' % version, 'apache-solr-%s.zip' % version): + for artifact in ('solr-%s.tgz' % version, 'solr-%s.zip' % version): unpackAndVerify('solr', tmpDir, artifact, version) - unpackAndVerify('solr', tmpDir, 'apache-solr-%s-src.tgz' % version, version) + unpackAndVerify('solr', tmpDir, 'solr-%s-src.tgz' % version, version) print() print('Test Maven artifacts for Lucene and Solr...') diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9ac8f0c3e65..366b2482da4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -19,6 +19,16 @@ Changes in backwards compatibility policy (Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless, Robert Muir) +* LUCENE-4677, LUCENE-4682: unpacked FSTs now use vInt to encode the node target, + to reduce their size (Mike McCandless) + +* LUCENE-4678: FST now uses a paged byte[] structure instead of a + single byte[] internally, to avoid large memory spikes during + building (James Dyer, Mike McCandless) + +* LUCENE-3298: FST can now be larger than 2.1 GB / 2.1 B nodes. + (James Dyer, Mike McCandless) + ======================= Lucene 4.1.0 ======================= Changes in backwards compatibility policy @@ -45,7 +55,7 @@ Changes in backwards compatibility policy Instead of calling refresh(), you should write similar code to how you reopen a regular DirectoryReader. - TaxonomyReader.openIfChanged (previously refresh()) no longer throws - IncosistentTaxonomyException, and supports recreate. InconsistentTaxoEx + InconsistentTaxonomyException, and supports recreate. InconsistentTaxoEx was removed. - ChildrenArrays was pulled out of TaxonomyReader into a top-level class. - TaxonomyReader was made an abstract class (instead of an interface), with @@ -94,7 +104,7 @@ Changes in backwards compatibility policy Also, the entire IndexingParams chain is now immutable. If you need to override a setting, you should extend the relevant class. Additionally, FacetSearchParams is now immutable, and requires all FacetRequests - to speified at initialization time. (Shai Erera) + to specified at initialization time. (Shai Erera) * LUCENE-4647: CategoryDocumentBuilder and EnhancementsDocumentBuilder are replaced by FacetFields and AssociationsFacetFields respectively. CategoryEnhancement and @@ -115,6 +125,10 @@ Changes in backwards compatibility policy result, few other classes such as Aggregator and CategoryListIterator were changed to handle bulk category ordinals. (Shai Erera) +* LUCENE-4683: CategoryListIterator and Aggregator are now per-segment. As such + their implementations no longer take a top-level IndexReader in the constructor + but rather implement a setNextReader. (Shai Erera) + New Features * LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of @@ -152,11 +166,6 @@ New Features * LUCENE-4515: MemoryIndex now supports adding the same field multiple times. 
(Simon Willnauer) -* LUCENE-4540: Added an experimental Norm.setPackedLong, which allows - the use of VAR_INTS-encoded norms. This can be useful for cases where - you only need a few bits per-document, or where you might want exact - document length, and so on. (Robert Muir) - * LUCENE-4489: Added consumeAllTokens option to LimitTokenCountFilter (hossman, Robert Muir) @@ -267,7 +276,7 @@ Bug Fixes allow 1+maxMergeCount merges threads to be created, instead of just maxMergeCount (Radim Kolar, Mike McCandless) -* LUCENE-4567: Fixed NullPointerException in analzying, fuzzy, and +* LUCENE-4567: Fixed NullPointerException in analyzing, fuzzy, and WFST suggesters when no suggestions were added (selckin via Mike McCandless) @@ -527,7 +536,7 @@ API Changes StoredFieldVisitor API. (Mike McCandless) * LUCENE-4343: Made Tokenizer.setReader final. This is a setter that should - not be overriden by subclasses: per-stream initialization should happen + not be overridden by subclasses: per-stream initialization should happen in reset(). (Robert Muir) * LUCENE-4377: Remove IndexInput.copyBytes(IndexOutput, long). @@ -753,7 +762,7 @@ API Changes * LUCENE-4273: When pulling a DocsEnum, you can pass an int flags instead of the previous boolean needsFlags; consistent with the changes - for DocsAndPositionsEnum in LUCENE-4230. Currently othe only flag + for DocsAndPositionsEnum in LUCENE-4230. Currently the only flag is DocsEnum.FLAG_FREQS. (Robert Muir, Mike McCandless) * LUCENE-3616: TextField(String, Reader, Store) was reduced to TextField(String, Reader), @@ -825,7 +834,7 @@ Bug Fixes instance are already checked out and queued up but not yet flushed. (Simon Willnauer) -* LUCENE-4282: Automaton FuzzyQuery didnt always deliver all results. +* LUCENE-4282: Automaton FuzzyQuery didn't always deliver all results. (Johannes Christen, Uwe Schindler, Robert Muir) * LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter. @@ -1055,7 +1064,7 @@ Changes in backwards compatibility policy Query/Weight/Scorer. If you extended Similarity directly before, you should extend TFIDFSimilarity instead. Similarity is now a lower-level API to implement other scoring algorithms. See MIGRATE.txt for more details. - (David Nemeskey, Simon Willnauer, Mike Mccandless, Robert Muir) + (David Nemeskey, Simon Willnauer, Mike McCandless, Robert Muir) * LUCENE-3330: The expert visitor API in Scorer has been simplified and extended to support arbitrary relationships. To navigate to a scorer's @@ -1163,12 +1172,12 @@ Changes in Runtime Behavior omitNorms(true) for field "a" for 1000 documents, but then add a document with omitNorms(false) for field "a", all documents for field "a" will have no norms. Previously, Lucene would fill the first 1000 documents with - "fake norms" from Similarity.getDefault(). (Robert Muir, Mike Mccandless) + "fake norms" from Similarity.getDefault(). (Robert Muir, Mike McCandless) * LUCENE-2846: When some documents contain field "a", and others do not, the documents that don't have the field get a norm byte value of 0. Previously, Lucene would populate "fake norms" with Similarity.getDefault() for these - documents. (Robert Muir, Mike Mccandless) + documents. (Robert Muir, Mike McCandless) * LUCENE-2720: IndexWriter throws IndexFormatTooOldException on open, rather than later when e.g. a merge starts. @@ -1201,13 +1210,13 @@ Changes in Runtime Behavior update or delete on IndexWriter. By default DWPTs are flushed either on maxBufferedDocs per DWPT or the global active used memory. 
Once the active memory exceeds ramBufferSizeMB only the largest DWPT is selected for - flushing and the memory used by this DWPT is substracted from the active + flushing and the memory used by this DWPT is subtracted from the active memory and added to a flushing memory pool, which can lead to temporarily higher memory usage due to ongoing indexing. - IndexWriter now can utilize ramBufferSize > 2048 MB. Each DWPT can address up to 2048 MB memory such that the ramBufferSize is now bounded by the max - number of DWPT avaliable in the used DocumentsWriterPerThreadPool. + number of DWPT available in the used DocumentsWriterPerThreadPool. IndexWriters net memory consumption can grow far beyond the 2048 MB limit if the application can use all available DWPTs. To prevent a DWPT from exhausting its address space IndexWriter will forcefully flush a DWPT if its @@ -1215,7 +1224,7 @@ Changes in Runtime Behavior via IndexWriterConfig and defaults to 1945 MB. Since IndexWriter flushes DWPT concurrently not all memory is released immediately. Applications should still use a ramBufferSize significantly - lower than the JVMs avaliable heap memory since under high load multiple + lower than the JVMs available heap memory since under high load multiple flushing DWPT can consume substantial transient memory when IO performance is slow relative to indexing rate. @@ -1223,7 +1232,7 @@ Changes in Runtime Behavior 'currently' RAM resident documents to disk. Yet, flushes that occur while a a full flush is running are queued and will happen after all DWPT involved in the full flush are done flushing. Applications using multiple threads - during indexing and trigger a full flush (eg call commmit() or open a new + during indexing and trigger a full flush (eg call commit() or open a new NRT reader) can use significantly more transient memory. - IndexWriter#addDocument and IndexWriter.updateDocument can block indexing @@ -1266,7 +1275,7 @@ Changes in Runtime Behavior * LUCENE-3455: QueryParserBase.newFieldQuery() will throw a ParseException if any of the calls to the Analyzer throw an IOException. QueryParseBase.analyzeRangePart() - will throw a RuntimException if an IOException is thrown by the Analyzer. + will throw a RuntimeException if an IOException is thrown by the Analyzer. * LUCENE-4127: IndexWriter will now throw IllegalArgumentException if the first token of an indexed field has 0 positionIncrement @@ -1356,7 +1365,7 @@ API Changes customized on a per-field basis. (Robert Muir) * LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to - enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode repsectively. + enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively. * LUCENE-3483: Move Function grouping collectors from Solr to grouping module. (Martijn van Groningen) @@ -1514,7 +1523,7 @@ New features * LUCENE-2742: Add native per-field postings format support. Codec lets you now register a postings format for each field and which is in turn recorded - into the index. Postings formtas are maintained on a per-segment basis and be + into the index. Postings formats are maintained on a per-segment basis and be resolved without knowing the actual postings format used for writing the segment. (Simon Willnauer) @@ -1722,7 +1731,7 @@ New features - o.a.l.analysis.miscellaneous.CapitalizationFilter: A TokenFilter that applies capitalization rules to tokens. 
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a - CharFilter, Tokenizer, and Tokenfilter for transforming text with regexes. + CharFilter, Tokenizer, and TokenFilter for transforming text with regexes. - o.a.l.analysis.synonym.SynonymFilter: A synonym filter that supports multi-word synonyms. - o.a.l.analysis.phonetic: Package for phonetic search, containing various @@ -1894,7 +1903,7 @@ Bug fixes DocsAndPositionsEnum while merging (Marc Sturlese, Erick Erickson, Robert Muir, Simon Willnauer, Mike McCandless) -* LUCENE-3589: BytesRef copy(short) didnt set length. +* LUCENE-3589: BytesRef copy(short) didn't set length. (Peter Chang via Robert Muir) * LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was @@ -1997,6 +2006,51 @@ Build XSL. (Greg Bowyer, Uwe Schindler) +======================= Lucene 3.6.2 ======================= + +Bug Fixes + +* LUCENE-4234: Exception when FacetsCollector is used with ScoreFacetRequest, + and the number of matching documents is too large. (Gilad Barkai via Shai Erera) + +* LUCENE-2686, LUCENE-3505, LUCENE-4401: Fix BooleanQuery scorers to + return correct freq(). + (Koji Sekiguchi, Mike McCandless, Liu Chao, Robert Muir) + +* LUCENE-2501: Fixed rare thread-safety issue that could cause + ArrayIndexOutOfBoundsException inside ByteBlockPool (Robert Muir, + Mike McCandless) + +* LUCENE-4297: BooleanScorer2 would multiply the coord() factor + twice for conjunctions: for most users this is no problem, but + if you had a customized Similarity that returned something other + than 1 when overlap == maxOverlap (always the case for conjunctions), + then the score would be incorrect. (Pascal Chollet, Robert Muir) + +* LUCENE-4300: BooleanQuery's rewrite was not always safe: if you + had a custom Similarity where coord(1,1) != 1F, then the rewritten + query would be scored differently. (Robert Muir) + +* LUCENE-4398: If you index many different field names in your + documents then due to a bug in how it measures its RAM + usage, IndexWriter would flush each segment too early eventually + reaching the point where it flushes after every doc. (Tim Smith via + Mike McCandless) + +* LUCENE-4411: when sampling is enabled for a FacetRequest, its depth + parameter is reset to the default (1), even if set otherwise. + (Gilad Barkai via Shai Erera) + +* LUCENE-4635: Fixed ArrayIndexOutOfBoundsException when in-memory + terms index requires more than 2.1 GB RAM (indices with billions of + terms). (Tom Burton-West via Mike McCandless) + +Documentation + +* LUCENE-4302: Fix facet userguide to have HTML loose doctype like + all other javadocs. (Karl Nicholas via Uwe Schindler) + + ======================= Lucene 3.6.1 ======================= More information about this release, including any errata related to the release notes, upgrade instructions, or other changes may be found online at: @@ -2043,7 +2097,7 @@ Tests random graph tokens. (Mike McCandless) * LUCENE-3968: factor out LookaheadTokenFilter from - MockGraphTokenFilter (Mike Mccandless) + MockGraphTokenFilter (Mike McCandless) ======================= Lucene 3.6.0 ======================= @@ -2323,7 +2377,7 @@ Bug fixes * LUCENE-3876: Fix bug where positions for a document exceeding Integer.MAX_VALUE/2 would produce a corrupt index. - (Simon Willnauer, Mike Mccandless, Robert Muir) + (Simon Willnauer, Mike McCandless, Robert Muir) * LUCENE-3880: UAX29URLEmailTokenizer now recognizes emails when the mailto: scheme is prepended. 
(Kai Gülzau, Steve Rowe) diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java index 5499566032a..e7c20da60e8 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.ja.dict; import java.io.IOException; -import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FST; /** * Thin wrapper around an FST with root-arc caching for Japanese. @@ -48,7 +48,7 @@ public final class TokenInfoFST { rootCache = cacheRootArcs(); } - @SuppressWarnings("unchecked") + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] cacheRootArcs() throws IOException { FST.Arc rootCache[] = new FST.Arc[1+(cacheCeiling-0x3040)]; FST.Arc firstArc = new FST.Arc(); diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat index 2b1cedb59e2..538cd4c7bf2 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java index 2df460dd023..bec0c8700f2 100644 --- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java @@ -132,7 +132,7 @@ public class TokenInfoDictionaryBuilder { System.out.println(" encode..."); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true); - Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, true); + Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15); IntsRef scratch = new IntsRef(); long ord = -1; // first ord will be 0 String lastValue = null; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java index f35fb6c416b..318421b6389 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java @@ -113,7 +113,7 @@ public final class MemoryPostingsFormat extends PostingsFormat { this.field = field; this.doPackFST = doPackFST; this.acceptableOverheadRatio = acceptableOverheadRatio; - builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio, true); + builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio, true, 15); } private class PostingsWriter extends PostingsConsumer { diff --git a/lucene/core/src/java/org/apache/lucene/analysis/package.html 
b/lucene/core/src/java/org/apache/lucene/analysis/package.html index e88eb3adbc2..9f835eb535b 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/package.html +++ b/lucene/core/src/java/org/apache/lucene/analysis/package.html @@ -230,7 +230,7 @@ and proximity searches (though sentence identification is not provided by Lucene create, or a combination of existing and newly created components. Before pursuing this approach, you may find it worthwhile to explore the analyzers-common library and/or ask on the - java-user@lucene.apache.org mailing list first to see if what you need already exists. If you are still committed to creating your own Analyzer, have a look at the source code of any one of the many samples diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java index 3360602ddad..5c8718e6937 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java @@ -276,13 +276,13 @@ public class BlockTreeTermsReader extends FieldsProducer { */ public static class Stats { /** How many nodes in the index FST. */ - public int indexNodeCount; + public long indexNodeCount; /** How many arcs in the index FST. */ - public int indexArcCount; + public long indexArcCount; /** Byte size of the index. */ - public int indexNumBytes; + public long indexNumBytes; /** Total number of terms in the field. */ public long totalTermCount; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java index e80c9028be6..0074894625c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java @@ -23,7 +23,6 @@ import java.util.Comparator; import java.util.List; import org.apache.lucene.index.FieldInfo.IndexOptions; -import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; @@ -41,6 +40,7 @@ import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.NoOutputs; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.PackedInts; /* TODO: @@ -187,7 +187,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { public final static int DEFAULT_MAX_BLOCK_SIZE = 48; //public final static boolean DEBUG = false; - private final static boolean SAVE_DOT_FILES = false; + //private final static boolean SAVE_DOT_FILES = false; static final int OUTPUT_FLAGS_NUM_BITS = 2; static final int OUTPUT_FLAGS_MASK = 0x3; @@ -419,7 +419,8 @@ public class BlockTreeTermsWriter extends FieldsConsumer { final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final Builder indexBuilder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, - outputs, null, false, true); + outputs, null, false, + PackedInts.COMPACT, true, 15); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} @@ -962,7 +963,9 @@ public class BlockTreeTermsWriter extends FieldsConsumer { 0, 0, true, true, Integer.MAX_VALUE, noOutputs, - new FindBlocks(), false, true); + new FindBlocks(), false, + PackedInts.COMPACT, + true, 15); postingsWriter.setField(fieldInfo); } diff --git 
a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index fbb26c3da4f..1da8087a382 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -3475,6 +3476,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit { diagnostics.put("os.version", Constants.OS_VERSION); diagnostics.put("java.version", Constants.JAVA_VERSION); diagnostics.put("java.vendor", Constants.JAVA_VENDOR); + diagnostics.put("timestamp", Long.toString(new Date().getTime())); if (details != null) { diagnostics.putAll(details); } diff --git a/lucene/core/src/java/org/apache/lucene/index/Norm.java b/lucene/core/src/java/org/apache/lucene/index/Norm.java index fe9714eecba..7fb182f5731 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Norm.java +++ b/lucene/core/src/java/org/apache/lucene/index/Norm.java @@ -115,15 +115,6 @@ public final class Norm { setType(Type.FIXED_INTS_64); this.field.setLongValue(norm); } - - /** - * Sets a packed long norm value. - * @lucene.experimental - */ - public void setPackedLong(long norm) { - setType(Type.VAR_INTS); - this.field.setLongValue(norm); - } /** * Sets a byte norm value diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java index 4405db61b57..3b56addbe86 100644 --- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java @@ -38,7 +38,7 @@ import org.apache.lucene.search.DocIdSetIterator; public final class FixedBitSet extends DocIdSet implements Bits { private final long[] bits; - private int numBits; + private final int numBits; /** returns the number of 64 bit words it would take to hold numBits */ public static int bits2words(int numBits) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java index 19ee5412943..791d2773320 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java @@ -36,9 +36,13 @@ import org.apache.lucene.util.packed.PackedInts; *

<p>NOTE: The algorithm is described at
 * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
 *
- * The parameterized type T is the output type. See the
+ * <p>The parameterized type T is the output type. See the
 * subclasses of {@link Outputs}.
 *
+ * <p>
FSTs larger than 2.1GB are now possible (as of Lucene + * 4.2). FSTs containing more than 2.1B nodes are also now + * possible, however they cannot be packed. + * * @lucene.experimental */ @@ -84,22 +88,11 @@ public class Builder { /** * Instantiates an FST/FSA builder without any pruning. A shortcut * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, - * boolean, int, Outputs, FreezeTail, boolean, boolean)} with - * pruning options turned off. + * boolean, int, Outputs, FreezeTail, boolean, float, + * boolean, int)} with pruning options turned off. */ public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT, true); - } - - /** - * Instantiates an FST/FSA builder with {@link PackedInts#DEFAULT} - * acceptableOverheadRatio. - */ - public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, - boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail, boolean willPackFST, boolean allowArrayArcs) { - this(inputType, minSuffixCount1, minSuffixCount2, doShareSuffix, doShareNonSingletonNodes, - shareMaxTailLength, outputs, freezeTail, willPackFST, PackedInts.DEFAULT, allowArrayArcs); + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT, true, 15); } /** @@ -147,10 +140,16 @@ public class Builder { * @param allowArrayArcs Pass false to disable the array arc optimization * while building the FST; this will make the resulting * FST smaller but slower to traverse. + * + * @param bytesPageBits How many bits wide to make each + * byte[] block in the BytesStore; if you know the FST + * will be large then make this larger. For example 15 + * bits = 32768 byte pages. */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail, boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) { + FreezeTail freezeTail, boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs, + int bytesPageBits) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.freezeTail = freezeTail; @@ -158,9 +157,9 @@ public class Builder { this.shareMaxTailLength = shareMaxTailLength; this.doPackFST = doPackFST; this.acceptableOverheadRatio = acceptableOverheadRatio; - fst = new FST(inputType, outputs, doPackFST, acceptableOverheadRatio, allowArrayArcs); + fst = new FST(inputType, outputs, doPackFST, acceptableOverheadRatio, allowArrayArcs, bytesPageBits); if (doShareSuffix) { - dedupHash = new NodeHash(fst); + dedupHash = new NodeHash(fst, fst.bytes.getReverseReader(false)); } else { dedupHash = null; } @@ -174,7 +173,7 @@ public class Builder { } } - public int getTotStateCount() { + public long getTotStateCount() { return fst.nodeCount; } @@ -182,12 +181,12 @@ public class Builder { return frontier[0].inputCount; } - public int getMappedStateCount() { + public long getMappedStateCount() { return dedupHash == null ? 
0 : fst.nodeCount; } private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throws IOException { - final int node; + final long node; if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) { if (nodeIn.numArcs == 0) { node = fst.addNode(nodeIn); @@ -475,7 +474,7 @@ public class Builder { fst.finish(compileNode(root, lastInput.length).node); if (doPackFST) { - return fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio); + return fst.pack(3, Math.max(10, (int) (fst.getNodeCount()/4)), acceptableOverheadRatio); } else { return fst; } @@ -513,8 +512,12 @@ public class Builder { boolean isCompiled(); } + public long fstSizeInBytes() { + return fst.sizeInBytes(); + } + static final class CompiledNode implements Node { - int node; + long node; @Override public boolean isCompiled() { return true; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java new file mode 100644 index 00000000000..504e8b7a11f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -0,0 +1,468 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +// TODO: merge with PagedBytes, except PagedBytes doesn't +// let you read while writing which FST needs + +class BytesStore extends DataOutput { + + private final List blocks = new ArrayList(); + + private final int blockSize; + private final int blockBits; + private final int blockMask; + + private byte[] current; + private int nextWrite; + + public BytesStore(int blockBits) { + this.blockBits = blockBits; + blockSize = 1 << blockBits; + blockMask = blockSize-1; + nextWrite = blockSize; + } + + /** Pulls bytes from the provided IndexInput. */ + public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException { + int blockSize = 2; + int blockBits = 1; + while(blockSize < numBytes && blockSize < maxBlockSize) { + blockSize *= 2; + blockBits++; + } + this.blockBits = blockBits; + this.blockSize = blockSize; + this.blockMask = blockSize-1; + int left = numBytes; + while(left > 0) { + final int chunk = Math.min(blockSize, left); + byte[] block = new byte[chunk]; + in.readBytes(block, 0, block.length); + blocks.add(block); + left -= chunk; + } + + // So .getPosition still works + nextWrite = blocks.get(blocks.size()-1).length; + } + + /** Absolute write byte; you must ensure dest is < max + * position written so far. 
*/ + public void writeByte(int dest, byte b) { + int blockIndex = dest >> blockBits; + byte[] block = blocks.get(blockIndex); + block[dest & blockMask] = b; + } + + @Override + public void writeByte(byte b) { + if (nextWrite == blockSize) { + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + current[nextWrite++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int len) { + while (len > 0) { + int chunk = blockSize - nextWrite; + if (len <= chunk) { + System.arraycopy(b, offset, current, nextWrite, len); + nextWrite += len; + break; + } else { + if (chunk > 0) { + System.arraycopy(b, offset, current, nextWrite, chunk); + offset += chunk; + len -= chunk; + } + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + } + } + + int getBlockBits() { + return blockBits; + } + + /** Absolute writeBytes without changing the current + * position. Note: this cannot "grow" the bytes, so you + * must only call it on already written parts. */ + void writeBytes(long dest, byte[] b, int offset, int len) { + //System.out.println(" BS.writeBytes dest=" + dest + " offset=" + offset + " len=" + len); + assert dest + len <= getPosition(): "dest=" + dest + " pos=" + getPosition() + " len=" + len; + + // Note: weird: must go "backwards" because copyBytes + // calls us with overlapping src/dest. If we + // go forwards then we overwrite bytes before we can + // copy them: + + /* + int blockIndex = dest >> blockBits; + int upto = dest & blockMask; + byte[] block = blocks.get(blockIndex); + while (len > 0) { + int chunk = blockSize - upto; + System.out.println(" cycle chunk=" + chunk + " len=" + len); + if (len <= chunk) { + System.arraycopy(b, offset, block, upto, len); + break; + } else { + System.arraycopy(b, offset, block, upto, chunk); + offset += chunk; + len -= chunk; + blockIndex++; + block = blocks.get(blockIndex); + upto = 0; + } + } + */ + + final long end = dest + len; + int blockIndex = (int) (end >> blockBits); + int downTo = (int) (end & blockMask); + if (downTo == 0) { + blockIndex--; + downTo = blockSize; + } + byte[] block = blocks.get(blockIndex); + + while (len > 0) { + //System.out.println(" cycle downTo=" + downTo + " len=" + len); + if (len <= downTo) { + //System.out.println(" final: offset=" + offset + " len=" + len + " dest=" + (downTo-len)); + System.arraycopy(b, offset, block, downTo-len, len); + break; + } else { + len -= downTo; + //System.out.println(" partial: offset=" + (offset + len) + " len=" + downTo + " dest=0"); + System.arraycopy(b, offset + len, block, 0, downTo); + blockIndex--; + block = blocks.get(blockIndex); + downTo = blockSize; + } + } + } + + /** Absolute copy bytes self to self, without changing the + * position. Note: this cannot "grow" the bytes, so must + * only call it on already written parts. */ + public void copyBytes(long src, long dest, int len) { + //System.out.println("BS.copyBytes src=" + src + " dest=" + dest + " len=" + len); + assert src < dest; + + // Note: weird: must go "backwards" because copyBytes + // calls us with overlapping src/dest. 
If we
+  // go forwards then we overwrite bytes before we can
+  // copy them:
+
+    /*
+    int blockIndex = src >> blockBits;
+    int upto = src & blockMask;
+    byte[] block = blocks.get(blockIndex);
+    while (len > 0) {
+      int chunk = blockSize - upto;
+      System.out.println("  cycle: chunk=" + chunk + " len=" + len);
+      if (len <= chunk) {
+        writeBytes(dest, block, upto, len);
+        break;
+      } else {
+        writeBytes(dest, block, upto, chunk);
+        blockIndex++;
+        block = blocks.get(blockIndex);
+        upto = 0;
+        len -= chunk;
+        dest += chunk;
+      }
+    }
+    */
+
+    long end = src + len;
+
+    int blockIndex = (int) (end >> blockBits);
+    int downTo = (int) (end & blockMask);
+    if (downTo == 0) {
+      blockIndex--;
+      downTo = blockSize;
+    }
+    byte[] block = blocks.get(blockIndex);
+
+    while (len > 0) {
+      //System.out.println("  cycle downTo=" + downTo);
+      if (len <= downTo) {
+        //System.out.println("    finish");
+        writeBytes(dest, block, downTo-len, len);
+        break;
+      } else {
+        //System.out.println("    partial");
+        len -= downTo;
+        writeBytes(dest + len, block, 0, downTo);
+        blockIndex--;
+        block = blocks.get(blockIndex);
+        downTo = blockSize;
+      }
+    }
+  }
+
+  /** Writes an int at the absolute position without
+   *  changing the current pointer. */
+  public void writeInt(long pos, int value) {
+    int blockIndex = (int) (pos >> blockBits);
+    int upto = (int) (pos & blockMask);
+    byte[] block = blocks.get(blockIndex);
+    int shift = 24;
+    for(int i=0;i<4;i++) {
+      block[upto++] = (byte) (value >> shift);
+      shift -= 8;
+      if (upto == blockSize) {
+        upto = 0;
+        blockIndex++;
+        block = blocks.get(blockIndex);
+      }
+    }
+  }
+
+  /** Reverse from srcPos, inclusive, to destPos, inclusive. */
+  public void reverse(long srcPos, long destPos) {
+    assert srcPos < destPos;
+    assert destPos < getPosition();
+    //System.out.println("reverse src=" + srcPos + " dest=" + destPos);
+
+    int srcBlockIndex = (int) (srcPos >> blockBits);
+    int src = (int) (srcPos & blockMask);
+    byte[] srcBlock = blocks.get(srcBlockIndex);
+
+    int destBlockIndex = (int) (destPos >> blockBits);
+    int dest = (int) (destPos & blockMask);
+    byte[] destBlock = blocks.get(destBlockIndex);
+    //System.out.println("  srcBlock=" + srcBlockIndex + " destBlock=" + destBlockIndex);
+
+    int limit = (int) (destPos - srcPos + 1)/2;
+    for(int i=0;i<limit;i++) {
+      byte b = srcBlock[src];
+      srcBlock[src] = destBlock[dest];
+      destBlock[dest] = b;
+      src++;
+      if (src == blockSize) {
+        srcBlockIndex++;
+        srcBlock = blocks.get(srcBlockIndex);
+        src = 0;
+      }
+
+      dest--;
+      if (dest == -1) {
+        destBlockIndex--;
+        destBlock = blocks.get(destBlockIndex);
+        dest = blockSize-1;
+      }
+    }
+  }
+
+  public void skipBytes(int len) {
+    while (len > 0) {
+      int chunk = blockSize - nextWrite;
+      if (len <= chunk) {
+        nextWrite += len;
+        break;
+      } else {
+        len -= chunk;
+        current = new byte[blockSize];
+        blocks.add(current);
+        nextWrite = 0;
+      }
+    }
+  }
+
+  public long getPosition() {
+    return ((long) blocks.size()-1)*blockSize + nextWrite;
+  }
+
+  /** Pos must be less than the max position written so far!
+   *  Ie, you cannot "grow" the file with this! */
+  public void truncate(long newLen) {
+    assert newLen <= getPosition();
+    assert newLen >= 0;
+    int blockIndex = (int) (newLen >> blockBits);
+    nextWrite = (int) (newLen & blockMask);
+    if (nextWrite == 0) {
+      blockIndex--;
+      nextWrite = blockSize;
+    }
+    blocks.subList(blockIndex+1, blocks.size()).clear();
+    if (newLen == 0) {
+      current = null;
+    } else {
+      current = blocks.get(blockIndex);
+    }
+    assert newLen == getPosition();
+  }
+
+  public void finish() {
+    if (current != null) {
+      byte[] lastBuffer = new byte[nextWrite];
+      System.arraycopy(current, 0, lastBuffer, 0, nextWrite);
+      blocks.set(blocks.size()-1, lastBuffer);
+      current = null;
+    }
+  }
+
+  /** Writes all of our bytes to the target {@link DataOutput}. */
+  public void writeTo(DataOutput out) throws IOException {
+    for(byte[] block : blocks) {
+      out.writeBytes(block, 0, block.length);
+    }
+  }
+
+  public FST.BytesReader getForwardReader() {
+    if (blocks.size() == 1) {
+      return new ForwardBytesReader(blocks.get(0));
+    }
+    return new FST.BytesReader() {
+      private byte[] current;
+      private int nextBuffer;
+      private int nextRead = blockSize;
+
+      @Override
+      public byte readByte() {
+        if (nextRead == blockSize) {
+          current = blocks.get(nextBuffer++);
+          nextRead = 0;
+        }
+        return current[nextRead++];
+      }
+
+      @Override
+      public void skipBytes(int count) {
+        setPosition(getPosition() + count);
+      }
+
+      @Override
+      public void readBytes(byte[] b, int offset, int len) {
+        while(len > 0) {
+          int chunkLeft = blockSize - nextRead;
+          if (len <= chunkLeft) {
+            System.arraycopy(current, nextRead, b, offset, len);
+            nextRead += len;
+            break;
+          } else {
+            if (chunkLeft > 0) {
+              System.arraycopy(current, nextRead, b, offset, chunkLeft);
+              offset += chunkLeft;
+              len -= chunkLeft;
+            }
+            current = blocks.get(nextBuffer++);
+            nextRead = 0;
+          }
+        }
+      }
+
+      @Override
+      public long getPosition() {
+        return ((long) nextBuffer-1)*blockSize + nextRead;
+      }
+
+      @Override
+      public void setPosition(long pos) {
+        int bufferIndex = (int) (pos >> blockBits);
+        nextBuffer = bufferIndex+1;
+        current = blocks.get(bufferIndex);
+        nextRead = (int) (pos & blockMask);
+        assert getPosition() == pos;
+      }
+
+      @Override
+      public boolean reversed() {
+        return false;
+      }
+    };
+  }
+
+  public FST.BytesReader getReverseReader() {
+    return getReverseReader(true);
+  }
+
+  FST.BytesReader getReverseReader(boolean allowSingle) {
+    if (allowSingle && blocks.size() == 1) {
+      return new ReverseBytesReader(blocks.get(0));
+    }
+    return new FST.BytesReader() {
+      private byte[] current = blocks.size() == 0 ? null : blocks.get(0);
+      private int nextBuffer = -1;
+      private int nextRead = 0;
+
+      @Override
+      public byte readByte() {
+        if (nextRead == -1) {
+          current = blocks.get(nextBuffer--);
+          nextRead = blockSize-1;
+        }
+        return current[nextRead--];
+      }
+
+      @Override
+      public void skipBytes(int count) {
+        setPosition(getPosition() - count);
+      }
+
+      @Override
+      public void readBytes(byte[] b, int offset, int len) {
+        for(int i=0;i<len;i++) {
+          b[offset+i] = readByte();
+        }
+      }
+
+      @Override
+      public long getPosition() {
+        return ((long) nextBuffer+1)*blockSize + nextRead;
+      }
+
+      @Override
+      public void setPosition(long pos) {
+        int bufferIndex = (int) (pos >> blockBits);
+        nextBuffer = bufferIndex-1;
+        current = blocks.get(bufferIndex);
+        nextRead = (int) (pos & blockMask);
+        assert getPosition() == pos: "pos=" + pos + " getPos()=" + getPosition();
+      }
+
+      @Override
+      public boolean reversed() {
+        return true;
+      }
+    };
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 10d326e58e8..7e8f7b2087f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -27,8 +27,14 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.HashMap;
 import java.util.Map;
+/*
+import java.io.Writer;
+import java.io.OutputStreamWriter;
+import java.io.FileOutputStream;
+*/
 import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.InputStreamDataInput;
@@ -51,9 +57,6 @@ import org.apache.lucene.util.packed.PackedInts;
 // job, ie, once we are at a 'suffix only', just store the
 // completion labels as a string not as a series of arcs.
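// [Editorial sketch, not part of the patch: the paging arithmetic the new
// BytesStore above is built on. With blockBits = 15, each page holds
// 1 << 15 = 32768 bytes, and an absolute long position splits into a page
// index and an intra-page offset; this is what lifts the old ~2.1 GB
// single-byte[] limit. All names below are illustrative.]
class PagedPositionSketch {
  static final int BLOCK_BITS = 15;              // same default the Builder call sites pass
  static final int BLOCK_SIZE = 1 << BLOCK_BITS; // 32768-byte pages
  static final int BLOCK_MASK = BLOCK_SIZE - 1;

  /** Reads the byte at an absolute position spread across fixed-size pages. */
  static byte byteAt(java.util.List<byte[]> blocks, long pos) {
    byte[] page = blocks.get((int) (pos >> BLOCK_BITS)); // high bits pick the page
    return page[(int) (pos & BLOCK_MASK)];               // low bits are the offset
  }
}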
-// TODO: maybe make an explicit thread state that holds -// reusable stuff eg BytesReader, a scratch arc - // NOTE: while the FST is able to represent a non-final // dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! @@ -65,8 +68,6 @@ import org.apache.lucene.util.packed.PackedInts; * *

<p>See the {@link org.apache.lucene.util.fst package
 *    documentation} for some simple examples.
- * <p>
NOTE: the FST cannot be larger than ~2.1 GB - * because it uses int to address the byte[]. * * @lucene.experimental */ @@ -93,6 +94,8 @@ public final class FST { // position: private final static int BIT_TARGET_DELTA = 1 << 6; + // We use this as a marker (because this one flag is + // illegal by itself ...): private final static byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT; /** @@ -125,24 +128,27 @@ public final class FST { /** Added optional packed format. */ private final static int VERSION_PACKED = 3; - private final static int VERSION_CURRENT = VERSION_PACKED; + /** Changed from int to vInt for encoding arc targets. + * Also changed maxBytesPerArc from int to vInt in the array case. */ + private final static int VERSION_VINT_TARGET = 4; + + private final static int VERSION_CURRENT = VERSION_VINT_TARGET; // Never serialized; just used to represent the virtual // final node w/ no arcs: - private final static int FINAL_END_NODE = -1; + private final static long FINAL_END_NODE = -1; // Never serialized; just used to represent the virtual // non-final node w/ no arcs: - private final static int NON_FINAL_END_NODE = 0; + private final static long NON_FINAL_END_NODE = 0; // if non-null, this FST accepts the empty string and // produces this output T emptyOutput; - // Not private to avoid synthetic access$NNN methods: - byte[] bytes; + final BytesStore bytes; - private int startNode = -1; + private long startNode = -1; public final Outputs outputs; @@ -150,13 +156,13 @@ public final class FST { // instead of storing the address of the target node for // a given arc, we mark a single bit noting that the next // node in the byte[] is the target node): - private int lastFrozenNode; + private long lastFrozenNode; private final T NO_OUTPUT; - public int nodeCount; - public int arcCount; - public int arcWithOutputCount; + public long nodeCount; + public long arcCount; + public long arcWithOutputCount; private final boolean packed; private PackedInts.Reader nodeRefToAddress; @@ -175,19 +181,19 @@ public final class FST { // From node (ord or address); currently only used when // building an FST w/ willPackFST=true: - int node; + long node; /** To node (ord or address) */ - public int target; + public long target; byte flags; public T nextFinalOutput; // address (into the byte[]), or ord/address if label == END_LABEL - int nextArc; + long nextArc; // This is non-zero if current arcs are fixed array: - int posArcsStart; + long posArcsStart; int bytesPerArc; int arcIdx; int numArcs; @@ -254,8 +260,6 @@ public final class FST { return (flags & bit) != 0; } - private final BytesWriter writer; - private GrowableWriter nodeAddress; // TODO: we could be smarter here, and prune periodically @@ -263,23 +267,28 @@ public final class FST { // clear early on: private GrowableWriter inCounts; + private final int version; + // make a new empty FST, for building; Builder invokes // this ctor - FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) { + FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs, int bytesPageBits) { this.inputType = inputType; this.outputs = outputs; this.allowArrayArcs = allowArrayArcs; - bytes = new byte[128]; + version = VERSION_CURRENT; + // 32 KB blocks: + bytes = new BytesStore(bytesPageBits); + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + bytes.writeByte((byte) 0); NO_OUTPUT = 
outputs.getNoOutput(); if (willPackFST) { - nodeAddress = new GrowableWriter(PackedInts.bitsRequired(bytes.length - 1), 8, acceptableOverheadRatio); + nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio); inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio); } else { nodeAddress = null; inCounts = null; } - - writer = new DefaultBytesWriter(); emptyOutput = null; packed = false; @@ -289,23 +298,29 @@ public final class FST { /** Load a previously saved FST. */ public FST(DataInput in, Outputs outputs) throws IOException { this.outputs = outputs; - writer = null; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): - CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); + version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET); packed = in.readByte() == 1; if (in.readByte() == 1) { // accepts empty string + // 1 KB blocks: + BytesStore emptyBytes = new BytesStore(10); int numBytes = in.readVInt(); - bytes = new byte[numBytes]; - in.readBytes(bytes, 0, numBytes); - + emptyBytes.copyBytes(in, numBytes); + // De-serialize empty-string output: BytesReader reader; if (packed) { - reader = new ForwardBytesReader(bytes, 0); + reader = emptyBytes.getForwardReader(); } else { - reader = new ReverseBytesReader(bytes, bytes.length-1); + reader = emptyBytes.getReverseReader(); + // NoOutputs uses 0 bytes when writing its output, + // so we have to check here else BytesStore gets + // angry: + if (numBytes > 0) { + reader.setPosition(numBytes-1); + } } emptyOutput = outputs.readFinalOutput(reader); } else { @@ -331,12 +346,13 @@ public final class FST { nodeRefToAddress = null; } startNode = in.readVInt(); - nodeCount = in.readVInt(); - arcCount = in.readVInt(); - arcWithOutputCount = in.readVInt(); + nodeCount = in.readVLong(); + arcCount = in.readVLong(); + arcWithOutputCount = in.readVLong(); - bytes = new byte[in.readVInt()]; - in.readBytes(bytes, 0, bytes.length); + int numBytes = in.readVInt(); + bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE); + NO_OUTPUT = outputs.getNoOutput(); cacheRootArcs(); @@ -345,6 +361,15 @@ public final class FST { // building; we need to break out mutable FST from // immutable allowArrayArcs = false; + + /* + if (bytes.length == 665) { + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + Util.toDot(this, w, false, false); + w.close(); + System.out.println("Wrote FST to out.dot"); + } + */ } public INPUT_TYPE getInputType() { @@ -352,8 +377,8 @@ public final class FST { } /** Returns bytes used to represent the FST */ - public int sizeInBytes() { - int size = bytes.length; + public long sizeInBytes() { + long size = bytes.getPosition(); if (packed) { size += nodeRefToAddress.ramBytesUsed(); } else if (nodeAddress != null) { @@ -363,25 +388,23 @@ public final class FST { return size; } - void finish(int startNode) throws IOException { - if (startNode == FINAL_END_NODE && emptyOutput != null) { - startNode = 0; - } + void finish(long startNode) throws IOException { if (this.startNode != -1) { throw new IllegalStateException("already finished"); } - byte[] finalBytes = new byte[writer.getPosition()]; - System.arraycopy(bytes, 0, finalBytes, 0, writer.getPosition()); - bytes = finalBytes; + if (startNode == FINAL_END_NODE && emptyOutput != null) { + startNode = 0; + } this.startNode = startNode; + bytes.finish(); cacheRootArcs(); } - private int getNodeAddress(int node) { + private long 
getNodeAddress(long node) { if (nodeAddress != null) { // Deref - return (int) nodeAddress.get(node); + return nodeAddress.get((int) node); } else { // Straight return node; @@ -481,12 +504,13 @@ public final class FST { if (packed) { ((PackedInts.Mutable) nodeRefToAddress).save(out); } - out.writeVInt(startNode); - out.writeVInt(nodeCount); - out.writeVInt(arcCount); - out.writeVInt(arcWithOutputCount); - out.writeVInt(bytes.length); - out.writeBytes(bytes, 0, bytes.length); + out.writeVLong(startNode); + out.writeVLong(nodeCount); + out.writeVLong(arcCount); + out.writeVLong(arcWithOutputCount); + long numBytes = bytes.getPosition(); + out.writeVLong(numBytes); + bytes.writeTo(out); } /** @@ -526,17 +550,16 @@ public final class FST { } } - private void writeLabel(int v) throws IOException { + private void writeLabel(DataOutput out, int v) throws IOException { assert v >= 0: "v=" + v; if (inputType == INPUT_TYPE.BYTE1) { assert v <= 255: "v=" + v; - writer.writeByte((byte) v); + out.writeByte((byte) v); } else if (inputType == INPUT_TYPE.BYTE2) { assert v <= 65535: "v=" + v; - writer.writeShort((short) v); + out.writeShort((short) v); } else { - //writeInt(v); - writer.writeVInt(v); + out.writeVInt(v); } } @@ -562,8 +585,9 @@ public final class FST { // serializes new node by appending its bytes to the end // of the current byte[] - int addNode(Builder.UnCompiledNode nodeIn) throws IOException { - //System.out.println("FST.addNode pos=" + writer.posWrite + " numArcs=" + nodeIn.numArcs); + long addNode(Builder.UnCompiledNode nodeIn) throws IOException { + + //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { return FINAL_END_NODE; @@ -572,38 +596,28 @@ public final class FST { } } - int startAddress = writer.getPosition(); + final long startAddress = bytes.getPosition(); //System.out.println(" startAddr=" + startAddress); final boolean doFixedArray = shouldExpand(nodeIn); - final int fixedArrayStart; if (doFixedArray) { + //System.out.println(" fixedArray"); if (bytesPerArc.length < nodeIn.numArcs) { bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } - // write a "false" first arc: - writer.writeByte(ARCS_AS_FIXED_ARRAY); - writer.writeVInt(nodeIn.numArcs); - // placeholder -- we'll come back and write the number - // of bytes per arc (int) here: - // TODO: we could make this a vInt instead - writer.writeInt(0); - fixedArrayStart = writer.getPosition(); - //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); - } else { - fixedArrayStart = 0; } arcCount += nodeIn.numArcs; final int lastArc = nodeIn.numArcs-1; - int lastArcStart = writer.getPosition(); + long lastArcStart = bytes.getPosition(); int maxBytesPerArc = 0; for(int arcIdx=0;arcIdx arc = nodeIn.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; + //System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node); if (arcIdx == lastArc) { flags += BIT_LAST_ARC; @@ -630,111 +644,135 @@ public final class FST { if (!targetHasArcs) { flags += BIT_STOP_NODE; } else if (inCounts != null) { - inCounts.set(target.node, inCounts.get(target.node) + 1); + inCounts.set((int) target.node, inCounts.get((int) target.node) + 1); } if (arc.output != NO_OUTPUT) { flags += BIT_ARC_HAS_OUTPUT; } - writer.writeByte((byte) flags); - writeLabel(arc.label); + bytes.writeByte((byte) flags); + writeLabel(bytes, arc.label); - // System.out.println(" write arc: 
label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + writer.posWrite + " output=" + outputs.outputToString(arc.output)); + // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, writer); + outputs.write(arc.output, bytes); //System.out.println(" write output"); arcWithOutputCount++; } if (arc.nextFinalOutput != NO_OUTPUT) { //System.out.println(" write final output"); - outputs.writeFinalOutput(arc.nextFinalOutput, writer); + outputs.writeFinalOutput(arc.nextFinalOutput, bytes); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); - writer.writeInt(target.node); + bytes.writeVLong(target.node); } // just write the arcs "like normal" on first pass, // but record how many bytes each one took, and max // byte size: if (doFixedArray) { - bytesPerArc[arcIdx] = writer.getPosition() - lastArcStart; - lastArcStart = writer.getPosition(); + bytesPerArc[arcIdx] = (int) (bytes.getPosition() - lastArcStart); + lastArcStart = bytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); //System.out.println(" bytes=" + bytesPerArc[arcIdx]); } } - - // TODO: if arc'd arrays will be "too wasteful" by some - // measure, eg if arcs have vastly different sized - // outputs, then we should selectively disable array for - // such cases + + // TODO: try to avoid wasteful cases: disable doFixedArray in that case + /* + * + * LUCENE-4682: what is a fair heuristic here? + * It could involve some of these: + * 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount? + * 2. how much binSearch saves over scan: nodeIn.numArcs + * 3. waste: numBytes vs numBytesExpanded + * + * the one below just looks at #3 + if (doFixedArray) { + // rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor???? 
+ int numBytes = lastArcStart - startAddress; + int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs; + if (numBytesExpanded > numBytes*1.25) { + doFixedArray = false; + } + } + */ if (doFixedArray) { - //System.out.println(" doFixedArray"); + final int MAX_HEADER_SIZE = 11; // header(byte) + numArcs(vint) + numBytes(vint) assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size - final int sizeNeeded = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; - assert ((long) fixedArrayStart) + ((long) nodeIn.numArcs) * maxBytesPerArc < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, sizeNeeded); - // TODO: we could make this a vInt instead - bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); - bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); - bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); - bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; + //System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs); + // create the header + // TODO: clean this up: or just rewind+reuse and deal with it + byte header[] = new byte[MAX_HEADER_SIZE]; + ByteArrayDataOutput bad = new ByteArrayDataOutput(header); + // write a "false" first arc: + bad.writeByte(ARCS_AS_FIXED_ARRAY); + bad.writeVInt(nodeIn.numArcs); + bad.writeVInt(maxBytesPerArc); + int headerLen = bad.getPosition(); + + final long fixedArrayStart = startAddress + headerLen; // expand the arcs in place, backwards - int srcPos = writer.getPosition(); - int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; - writer.setPosition(destPos); - for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { - //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); - destPos -= maxBytesPerArc; - srcPos -= bytesPerArc[arcIdx]; - if (srcPos != destPos) { - assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; - System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]); + long srcPos = bytes.getPosition(); + long destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; + assert destPos >= srcPos; + if (destPos > srcPos) { + bytes.skipBytes((int) (destPos - srcPos)); + for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { + destPos -= maxBytesPerArc; + srcPos -= bytesPerArc[arcIdx]; + //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); + if (srcPos != destPos) { + //System.out.println(" copy len=" + bytesPerArc[arcIdx]); + assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; + bytes.copyBytes(srcPos, destPos, bytesPerArc[arcIdx]); + } } } + + // now write the header + bytes.writeBytes(startAddress, header, 0, headerLen); } - // reverse bytes in-place; we do this so that the - // "BIT_TARGET_NEXT" opto can work, ie, it reads the - // node just before the current one - final int endAddress = writer.getPosition() - 1; + final long thisNodeAddress = bytes.getPosition()-1; - int left = startAddress; - int right = endAddress; - while (left < right) { - final byte b = bytes[left]; - bytes[left++] = bytes[right]; - bytes[right--] = b; + bytes.reverse(startAddress, thisNodeAddress); + + // PackedInts uses int as the index, so we 
cannot handle + // > 2.1B nodes when packing: + if (nodeAddress != null && nodeCount == Integer.MAX_VALUE) { + throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes"); } - //System.out.println(" endAddress=" + endAddress); nodeCount++; - final int node; + final long node; if (nodeAddress != null) { + // Nodes are addressed by 1+ord: - if (nodeCount == nodeAddress.size()) { + if ((int) nodeCount == nodeAddress.size()) { nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue())); } - nodeAddress.set(nodeCount, endAddress); + nodeAddress.set((int) nodeCount, thisNodeAddress); // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); node = nodeCount; } else { - node = endAddress; + node = thisNodeAddress; } lastFrozenNode = node; + //System.out.println(" ret node=" + node + " address=" + thisNodeAddress + " nodeAddress=" + nodeAddress); return node; } @@ -763,7 +801,7 @@ public final class FST { * * @return Returns the second argument * (arc). */ - public Arc readLastTargetArc(Arc follow, Arc arc, FST.BytesReader in) throws IOException { + public Arc readLastTargetArc(Arc follow, Arc arc, BytesReader in) throws IOException { //System.out.println("readLast"); if (!targetHasArcs(follow)) { //System.out.println(" end node"); @@ -774,19 +812,19 @@ public final class FST { arc.flags = BIT_LAST_ARC; return arc; } else { - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); arc.node = follow.target; final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { // array: jump straight to end arc.numArcs = in.readVInt(); - if (packed) { + if (packed || version >= VERSION_VINT_TARGET) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); } //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); - arc.posArcsStart = in.pos; + arc.posArcsStart = in.getPosition(); arc.arcIdx = arc.numArcs - 2; } else { arc.flags = b; @@ -804,18 +842,16 @@ public final class FST { } if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { + } else if (packed) { + in.readVLong(); } else { - if (packed) { - in.readVInt(); - } else { - in.skip(4); - } + readUnpackedNodeTarget(in); } arc.flags = in.readByte(); } - // Undo the byte flags we read: - in.skip(-1); - arc.nextArc = in.pos; + // Undo the byte flags we read: + in.skipBytes(-1); + arc.nextArc = in.getPosition(); } readNextRealArc(arc, in); assert arc.isLast(); @@ -823,6 +859,16 @@ public final class FST { } } + private long readUnpackedNodeTarget(BytesReader in) throws IOException { + long target; + if (version < VERSION_VINT_TARGET) { + target = in.readInt(); + } else { + target = in.readVLong(); + } + return target; + } + /** * Follow the follow arc and read the first arc of its target; * this changes the provided arc (2nd arg) in-place and returns @@ -853,10 +899,9 @@ public final class FST { } } - public Arc readFirstRealTargetArc(int node, Arc arc, final BytesReader in) throws IOException { - assert in.bytes == bytes; - final int address = getNodeAddress(node); - in.pos = address; + public Arc readFirstRealTargetArc(long node, Arc arc, final BytesReader in) throws IOException { + final long address = getNodeAddress(node); + in.setPosition(address); //System.out.println(" readFirstRealTargetArc address=" //+ address); //System.out.println(" flags=" + arc.flags); @@ 
-866,13 +911,13 @@ public final class FST { //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); - if (packed) { + if (packed || version >= VERSION_VINT_TARGET) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); } arc.arcIdx = -1; - arc.nextArc = arc.posArcsStart = in.pos; + arc.nextArc = arc.posArcsStart = in.getPosition(); //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { //arc.flags = b; @@ -889,11 +934,11 @@ public final class FST { * @return Returns true if arc points to a state in an * expanded array format. */ - boolean isExpandedTarget(Arc follow, FST.BytesReader in) throws IOException { + boolean isExpandedTarget(Arc follow, BytesReader in) throws IOException { if (!targetHasArcs(follow)) { return false; } else { - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); return in.readByte() == ARCS_AS_FIXED_ARRAY; } } @@ -917,30 +962,36 @@ public final class FST { assert !arc.isLast(); if (arc.label == END_LABEL) { - //System.out.println(" nextArc fake " + arc.nextArc); - int pos = in.pos = getNodeAddress(arc.nextArc); + //System.out.println(" nextArc fake " + + //arc.nextArc); + + long pos = getNodeAddress(arc.nextArc); + in.setPosition(pos); + final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { - //System.out.println(" nextArc fake array"); + //System.out.println(" nextArc fixed array"); in.readVInt(); - if (packed) { + + // Skip bytesPerArc: + if (packed || version >= VERSION_VINT_TARGET) { in.readVInt(); } else { in.readInt(); } } else { - in.pos = pos; + in.setPosition(pos); } } else { if (arc.bytesPerArc != 0) { //System.out.println(" nextArc real array"); // arcs are at fixed entries - in.pos = arc.posArcsStart; - in.skip((1+arc.arcIdx)*arc.bytesPerArc); + in.setPosition(arc.posArcsStart); + in.skipBytes((1+arc.arcIdx)*arc.bytesPerArc); } else { // arcs are packed //System.out.println(" nextArc real packed"); - in.pos = arc.nextArc; + in.setPosition(arc.nextArc); } } // skip flags @@ -951,7 +1002,6 @@ public final class FST { /** Never returns null, but you should never call this if * arc.isLast() is true. */ public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { - assert in.bytes == bytes; // TODO: can't assert this because we call from readFirstArc // assert !flag(arc.flags, BIT_LAST_ARC); @@ -961,10 +1011,11 @@ public final class FST { // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; - in.skip(arc.posArcsStart, arc.arcIdx*arc.bytesPerArc); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.arcIdx*arc.bytesPerArc); } else { // arcs are packed - in.pos = arc.nextArc; + in.setPosition(arc.nextArc); } arc.flags = in.readByte(); arc.label = readLabel(in); @@ -987,9 +1038,9 @@ public final class FST { } else { arc.target = NON_FINAL_END_NODE; } - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); } else if (arc.flag(BIT_TARGET_NEXT)) { - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); // TODO: would be nice to make this lazy -- maybe // caller doesn't need the target and is scanning arcs... 
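
[Editor's aside] The hunks above and below replace the fixed 4-byte unpacked node target with a vInt/vLong (readUnpackedNodeTarget keys off VERSION_VINT_TARGET). The win is that a vLong spends one byte per 7 payload bits, so the common small targets shrink from 4 bytes to 1 or 2, while addresses past 2.1 GB simply take 5 or more bytes instead of overflowing an int. A minimal sketch of the size arithmetic, assuming nothing beyond java.lang (VLongSize is my name, not the patch's):

final class VLongSize {
  // byte count of Lucene-style vLong coding: 7 payload bits per byte,
  // high bit of each byte used as a continuation flag
  static int byteCount(long v) {
    int bytes = 1;
    while ((v & ~0x7FL) != 0) { // any bits above the low 7 left?
      bytes++;
      v >>>= 7;
    }
    return bytes;
  }
  public static void main(String[] args) {
    System.out.println(byteCount(100));                     // 1 (a fixed int always took 4)
    System.out.println(byteCount(100000));                  // 3
    System.out.println(byteCount(3L * 1024 * 1024 * 1024)); // 5: > 2.1 GB now fits
  }
}
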
if (nodeAddress == null) { @@ -998,35 +1049,36 @@ public final class FST { // must scan seekToNextNode(in); } else { - in.skip(arc.posArcsStart, arc.bytesPerArc * arc.numArcs); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc * arc.numArcs); } } - arc.target = in.pos; + arc.target = in.getPosition(); } else { arc.target = arc.node - 1; assert arc.target > 0; } } else { if (packed) { - final int pos = in.pos; - final int code = in.readVInt(); + final long pos = in.getPosition(); + final long code = in.readVLong(); if (arc.flag(BIT_TARGET_DELTA)) { // Address is delta-coded from current address: arc.target = pos + code; //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); } else if (code < nodeRefToAddress.size()) { // Deref - arc.target = (int) nodeRefToAddress.get(code); + arc.target = nodeRefToAddress.get((int) code); //System.out.println(" deref code=" + code + " target=" + arc.target); } else { // Absolute arc.target = code; - //System.out.println(" abs code=" + code + " derefLen=" + nodeRefToAddress.length); + //System.out.println(" abs code=" + code); } } else { - arc.target = in.readInt(); + arc.target = readUnpackedNodeTarget(in); } - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); } return arc; } @@ -1035,7 +1087,6 @@ public final class FST { * This returns null if the arc was not found, else the incoming arc. */ public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in) throws IOException { assert cachedRootArcs != null; - assert in.bytes == bytes; if (labelToMatch == END_LABEL) { if (follow.isFinal()) { @@ -1070,7 +1121,7 @@ public final class FST { return null; } - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); arc.node = follow.target; @@ -1079,18 +1130,19 @@ public final class FST { if (in.readByte() == ARCS_AS_FIXED_ARRAY) { // Arcs are full array; do binary search: arc.numArcs = in.readVInt(); - if (packed) { + if (packed || version >= VERSION_VINT_TARGET) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); } - arc.posArcsStart = in.pos; + arc.posArcsStart = in.getPosition(); int low = 0; int high = arc.numArcs-1; while (low <= high) { //System.out.println(" cycle"); int mid = (low + high) >>> 1; - in.skip(arc.posArcsStart, arc.bytesPerArc*mid + 1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid + 1); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; if (cmp < 0) { @@ -1145,9 +1197,9 @@ public final class FST { if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { if (packed) { - in.readVInt(); + in.readVLong(); } else { - in.readInt(); + readUnpackedNodeTarget(in); } } @@ -1157,16 +1209,16 @@ public final class FST { } } - public int getNodeCount() { + public long getNodeCount() { // 1+ in order to count the -1 implicit final node return 1+nodeCount; } - public int getArcCount() { + public long getArcCount() { return arcCount; } - public int getArcWithOutputCount() { + public long getArcWithOutputCount() { return arcWithOutputCount; } @@ -1191,56 +1243,6 @@ public final class FST { node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP); } - static abstract class BytesWriter extends DataOutput { - public abstract void setPosition(int posWrite); - public abstract int getPosition(); - } - - // Non-static: writes to FST's byte[] - class DefaultBytesWriter extends BytesWriter { - int posWrite; - - public DefaultBytesWriter() { - // pad: ensure no node gets address 0 which is reserved to 
mean - // the stop state w/ no arcs - posWrite = 1; - } - - @Override - public void writeByte(byte b) { - assert posWrite <= bytes.length; - if (bytes.length == posWrite) { - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes); - } - assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length; - bytes[posWrite++] = b; - } - - @Override - public int getPosition() { - return posWrite; - } - - @Override - public void setPosition(int posWrite) { - this.posWrite = posWrite; - if (bytes.length < posWrite) { - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, posWrite); - } - } - - @Override - public void writeBytes(byte[] b, int offset, int length) { - final int size = posWrite + length; - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, size); - System.arraycopy(b, offset, bytes, posWrite, length); - posWrite += length; - } - } - /** Returns a {@link BytesReader} for this FST, positioned at * position 0. */ public BytesReader getBytesReader() { @@ -1249,87 +1251,34 @@ public final class FST { /** Returns a {@link BytesReader} for this FST, positioned at * the provided position. */ - public BytesReader getBytesReader(int pos) { + public BytesReader getBytesReader(long pos) { // TODO: maybe re-use via ThreadLocal? + BytesReader in; if (packed) { - return new ForwardBytesReader(bytes, pos); + in = bytes.getForwardReader(); } else { - return new ReverseBytesReader(bytes, pos); + in = bytes.getReverseReader(); } + if (pos != 0) { + in.setPosition(pos); + } + return in; } - /** Reads the bytes from this FST. Use {@link - * #getBytesReader(int)} to obtain an instance for this - * FST; re-use across calls (but only within a single - * thread) for better performance. */ + /** Reads bytes stored in an FST. */ public static abstract class BytesReader extends DataInput { - protected int pos; - protected final byte[] bytes; - protected BytesReader(byte[] bytes, int pos) { - this.bytes = bytes; - this.pos = pos; - } - abstract void skip(int byteCount); - abstract void skip(int base, int byteCount); - } + /** Get current read position. */ + public abstract long getPosition(); - final static class ReverseBytesReader extends BytesReader { + /** Set current read position. */ + public abstract void setPosition(long pos); - public ReverseBytesReader(byte[] bytes, int pos) { - super(bytes, pos); - } + /** Returns true if this reader uses reversed bytes + * under-the-hood. 
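
[Editor's aside] DefaultBytesWriter, deleted above, appended into one growing byte[]: each ArrayUtil.grow() allocated a larger array and copied, so peak heap briefly held both copies, and the asserts capped the whole FST at Integer.MAX_VALUE bytes. The BytesStore that replaces it (LUCENE-4678) is a paged structure. Below is a toy model of the idea only, relying on nothing about the real class beyond what this diff shows (ToyPagedBytes is not the real BytesStore):

final class ToyPagedBytes {
  private final java.util.ArrayList<byte[]> blocks = new java.util.ArrayList<byte[]>();
  private final int blockBits, blockSize, blockMask;
  private byte[] current;
  private int upto;

  ToyPagedBytes(int blockBits) { // e.g. 15, as passed in this patch: 32 KB pages
    this.blockBits = blockBits;
    this.blockSize = 1 << blockBits;
    this.blockMask = blockSize - 1;
  }

  void writeByte(byte b) {
    if (current == null || upto == blockSize) {
      current = new byte[blockSize]; // one fresh page; old pages never move or copy
      blocks.add(current);
      upto = 0;
    }
    current[upto++] = b;
  }

  long getPosition() { // long-addressed, so positions beyond 2.1 GB are representable
    return current == null ? 0 : ((long) (blocks.size() - 1)) * blockSize + upto;
  }

  byte readByte(long pos) {
    return blocks.get((int) (pos >> blockBits))[(int) (pos & blockMask)];
  }
}
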
*/ + public abstract boolean reversed(); - @Override - public byte readByte() { - return bytes[pos--]; - } - - @Override - public void readBytes(byte[] b, int offset, int len) { - for(int i=0;i { @@ -1451,14 +1400,13 @@ public final class FST { */ // Creates a packed FST - private FST(INPUT_TYPE inputType, PackedInts.Reader nodeRefToAddress, Outputs outputs) { + private FST(INPUT_TYPE inputType, Outputs outputs, int bytesPageBits) { + version = VERSION_CURRENT; packed = true; this.inputType = inputType; - bytes = new byte[128]; - this.nodeRefToAddress = nodeRefToAddress; + bytes = new BytesStore(bytesPageBits); this.outputs = outputs; NO_OUTPUT = outputs.getNoOutput(); - writer = new DefaultBytesWriter(); // NOTE: bogus because this is only used during // building; we need to break out mutable FST from @@ -1480,6 +1428,9 @@ public final class FST { */ FST pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { + // NOTE: maxDerefNodes is intentionally int: we cannot + // support > 2.1B deref nodes + // TODO: other things to try // - renumber the nodes to get more next / better locality? // - allow multiple input labels on an arc, so @@ -1529,17 +1480,13 @@ public final class FST { //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); } - final FST fst = new FST(inputType, null, outputs); - - final BytesWriter writer = fst.writer; - // +1 because node ords start at 1 (0 is reserved as stop node): final GrowableWriter newNodeAddress = new GrowableWriter( - PackedInts.bitsRequired(bytes.length), 1 + nodeCount, acceptableOverheadRatio); + PackedInts.bitsRequired(this.bytes.getPosition()), (int) (1 + nodeCount), acceptableOverheadRatio); // Fill initial coarse guess: for(int node=1;node<=nodeCount;node++) { - newNodeAddress.set(node, 1 + bytes.length - nodeAddress.get(node)); + newNodeAddress.set(node, 1 + this.bytes.getPosition() - nodeAddress.get(node)); } int absCount; @@ -1547,6 +1494,8 @@ public final class FST { int topCount; int nextCount; + FST fst; + // Iterate until we converge: while(true) { @@ -1556,7 +1505,10 @@ public final class FST { // for assert: boolean negDelta = false; - writer.setPosition(0); + fst = new FST(inputType, outputs, bytes.getBlockBits()); + + final BytesStore writer = fst.bytes; + // Skip 0 byte since 0 is reserved target: writer.writeByte((byte) 0); @@ -1568,19 +1520,20 @@ public final class FST { int changedCount = 0; - int addressError = 0; + long addressError = 0; //int totWasted = 0; // Since we re-reverse the bytes, we now write the // nodes backwards, so that BIT_TARGET_NEXT is // unchanged: - for(int node=nodeCount;node>=1;node--) { + for(int node=(int)nodeCount;node>=1;node--) { fst.nodeCount++; - final int address = writer.getPosition(); + final long address = writer.getPosition(); + //System.out.println(" node: " + node + " address=" + address); if (address != newNodeAddress.get(node)) { - addressError = address - (int) newNodeAddress.get(node); + addressError = address - newNodeAddress.get(node); //System.out.println(" change: " + (address - newNodeAddress[node])); changed = true; newNodeAddress.set(node, address); @@ -1600,6 +1553,7 @@ public final class FST { writeNode: while(true) { // retry writing this node + //System.out.println(" cycle: retry"); readFirstRealTargetArc(node, arc, r); final boolean useArcArray = arc.bytesPerArc != 0; @@ -1617,9 +1571,9 @@ public final class FST { int maxBytesPerArc = 0; //int wasted = 0; while(true) { // iterate over all arcs for 
this node + //System.out.println(" cycle next arc"); - //System.out.println(" arc label=" + arc.label + " target=" + arc.target + " pos=" + writer.posWrite); - final int arcStartPos = writer.getPosition(); + final long arcStartPos = writer.getPosition(); nodeArcCount++; byte flags = 0; @@ -1654,19 +1608,18 @@ flags += BIT_ARC_HAS_OUTPUT; } - final Integer ptr; - final int absPtr; + final long absPtr; final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0; if (doWriteTarget) { - ptr = topNodeMap.get(arc.target); + final Integer ptr = topNodeMap.get(arc.target); if (ptr != null) { absPtr = ptr; } else { - absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; + absPtr = topNodeMap.size() + newNodeAddress.get((int) arc.target) + addressError; } - int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.getPosition() - 2; + long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition() - 2; if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; @@ -1677,12 +1630,13 @@ flags |= BIT_TARGET_DELTA; } } else { - ptr = null; absPtr = 0; } + assert flags != ARCS_AS_FIXED_ARRAY; writer.writeByte(flags); - fst.writeLabel(arc.label); + + fst.writeLabel(writer, arc.label); if (arc.output != NO_OUTPUT) { outputs.write(arc.output, writer); @@ -1696,7 +1650,7 @@ if (doWriteTarget) { - int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.getPosition(); + long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition(); if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); @@ -1705,7 +1659,7 @@ if (flag(flags, BIT_TARGET_DELTA)) { //System.out.println(" delta"); - writer.writeVInt(delta); + writer.writeVLong(delta); if (!retry) { deltaCount++; } @@ -1717,7 +1671,7 @@ System.out.println(" abs"); } */ - writer.writeVInt(absPtr); + writer.writeVLong(absPtr); if (!retry) { if (absPtr >= topNodeMap.size()) { absCount++; @@ -1729,7 +1683,7 @@ } if (useArcArray) { - final int arcBytes = writer.getPosition() - arcStartPos; + final int arcBytes = (int) (writer.getPosition() - arcStartPos); //System.out.println(" " + arcBytes + " bytes"); maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); // NOTE: this may in fact go "backwards", if @@ -1739,7 +1693,7 @@ // will retry (below) so it's OK to overwrite // bytes: //wasted += bytesPerArc - arcBytes; - writer.setPosition(arcStartPos + bytesPerArc); + writer.skipBytes((int) (arcStartPos + bytesPerArc - writer.getPosition())); } if (arc.isLast()) { @@ -1764,11 +1718,12 @@ // Retry: bytesPerArc = maxBytesPerArc; - writer.setPosition(address); + writer.truncate(address); nodeArcCount = 0; retry = true; anyNegDelta = false; } + negDelta |= anyNegDelta; fst.arcCount += nodeArcCount; @@ -1788,8 +1743,8 @@ } long maxAddress = 0; - for (int key : topNodeMap.keySet()) { - maxAddress = Math.max(maxAddress, newNodeAddress.get(key)); + for (long key : topNodeMap.keySet()) { + maxAddress = Math.max(maxAddress, newNodeAddress.get((int) key)); } PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(), @@ -1799,8 +1754,7 @@ } fst.nodeRefToAddress = nodeRefToAddressIn; - - fst.startNode = (int) newNodeAddress.get(startNode); + fst.startNode = 
newNodeAddress.get((int) startNode); //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); if (emptyOutput != null) { @@ -1810,11 +1764,8 @@ public final class FST { assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount=" + nodeCount; assert fst.arcCount == arcCount; assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount=" + fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount; - - final byte[] finalBytes = new byte[writer.getPosition()]; - //System.out.println("resize " + fst.bytes.length + " down to " + writer.posWrite); - System.arraycopy(fst.bytes, 0, finalBytes, 0, writer.getPosition()); - fst.bytes = finalBytes; + + fst.bytes.finish(); fst.cacheRootArcs(); //final int size = fst.sizeInBytes(); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java index d4806fbe0ec..13be2f1c5ba 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java @@ -17,11 +17,11 @@ package org.apache.lucene.util.fst; * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; -import java.io.IOException; - /** Can next() and advance() through the terms in an FST * * @lucene.experimental @@ -153,8 +153,8 @@ abstract class FSTEnum { boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid+1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); @@ -292,8 +292,8 @@ abstract class FSTEnum { boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid+1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java new file mode 100644 index 00000000000..1a9417f7a9c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java @@ -0,0 +1,62 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// TODO: can we use just ByteArrayDataInput...? need to +// add a .skipBytes to DataInput.. hmm and .setPosition + +/** Reads from a single byte[]. */ +final class ForwardBytesReader extends FST.BytesReader { + private final byte[] bytes; + private int pos; + + public ForwardBytesReader(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public byte readByte() { + return bytes[pos++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + System.arraycopy(bytes, pos, b, offset, len); + pos += len; + } + + @Override + public void skipBytes(int count) { + pos += count; + } + + @Override + public long getPosition() { + return pos; + } + + @Override + public void setPosition(long pos) { + this.pos = (int) pos; + } + + @Override + public boolean reversed() { + return false; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index 096f2adc927..7e09a421f79 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -19,22 +19,27 @@ package org.apache.lucene.util.fst; import java.io.IOException; +import org.apache.lucene.util.packed.GrowableWriter; +import org.apache.lucene.util.packed.PackedInts; + // Used to dedup states (lookup already-frozen states) final class NodeHash { - private int[] table; + private GrowableWriter table; private int count; private int mask; private final FST fst; private final FST.Arc scratchArc = new FST.Arc(); + private final FST.BytesReader in; - public NodeHash(FST fst) { - table = new int[16]; + public NodeHash(FST fst, FST.BytesReader in) { + table = new GrowableWriter(8, 16, PackedInts.COMPACT); mask = 15; this.fst = fst; + this.in = in; } - private boolean nodesEqual(Builder.UnCompiledNode node, int address, FST.BytesReader in) throws IOException { + private boolean nodesEqual(Builder.UnCompiledNode node, long address) throws IOException { fst.readFirstRealTargetArc(address, scratchArc, in); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; @@ -73,7 +78,8 @@ final class NodeHash { final Builder.Arc arc = node.arcs[arcIdx]; //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); h = PRIME * h + arc.label; - h = PRIME * h + ((Builder.CompiledNode) arc.target).node; + long n = ((Builder.CompiledNode) arc.target).node; + h = PRIME * h + (int) (n^(n>>32)); h = PRIME * h + arc.output.hashCode(); h = PRIME * h + arc.nextFinalOutput.hashCode(); if (arc.isFinal) { @@ -85,16 +91,15 @@ final class NodeHash { } // hash code for a frozen node - private int hash(int node) throws IOException { + private int hash(long node) throws IOException { final int PRIME = 31; - final FST.BytesReader in = fst.getBytesReader(0); //System.out.println("hash frozen node=" + node); int h = 0; fst.readFirstRealTargetArc(node, scratchArc, in); while(true) { - //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); + //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " 
pos=" + in.getPosition()); h = PRIME * h + scratchArc.label; - h = PRIME * h + scratchArc.target; + h = PRIME * h + (int) (scratchArc.target^(scratchArc.target>>32)); h = PRIME * h + scratchArc.output.hashCode(); h = PRIME * h + scratchArc.nextFinalOutput.hashCode(); if (scratchArc.isFinal()) { @@ -109,26 +114,25 @@ final class NodeHash { return h & Integer.MAX_VALUE; } - public int add(Builder.UnCompiledNode nodeIn) throws IOException { - // System.out.println("hash: add count=" + count + " vs " + table.length); - final FST.BytesReader in = fst.getBytesReader(0); + public long add(Builder.UnCompiledNode nodeIn) throws IOException { + // System.out.println("hash: add count=" + count + " vs " + table.size()); final int h = hash(nodeIn); int pos = h & mask; int c = 0; while(true) { - final int v = table[pos]; + final long v = table.get(pos); if (v == 0) { // freeze & add - final int node = fst.addNode(nodeIn); + final long node = fst.addNode(nodeIn); //System.out.println(" now freeze node=" + node); assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; - table[pos] = node; - if (table.length < 2*count) { + table.set(pos, node); + if (table.size() < 2*count) { rehash(); } return node; - } else if (nodesEqual(nodeIn, v, in)) { + } else if (nodesEqual(nodeIn, v)) { // same node is already here return v; } @@ -139,12 +143,12 @@ final class NodeHash { } // called only by rehash - private void addNew(int address) throws IOException { + private void addNew(long address) throws IOException { int pos = hash(address) & mask; int c = 0; while(true) { - if (table[pos] == 0) { - table[pos] = address; + if (table.get(pos) == 0) { + table.set(pos, address); break; } @@ -154,16 +158,16 @@ final class NodeHash { } private void rehash() throws IOException { - final int[] oldTable = table; + final GrowableWriter oldTable = table; - if (oldTable.length >= Integer.MAX_VALUE/2) { + if (oldTable.size() >= Integer.MAX_VALUE/2) { throw new IllegalStateException("FST too large (> 2.1 GB)"); } - table = new int[2*table.length]; - mask = table.length-1; - for(int idx=0;idx arc = fst.getFirstArc(new FST.Arc()); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final BytesReader fstReader = fst.getBytesReader(0); // Accumulate output as we go T output = fst.outputs.getNoOutput(); @@ -64,7 +64,7 @@ public final class Util { public static T get(FST fst, BytesRef input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE1; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final BytesReader fstReader = fst.getBytesReader(0); // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -101,7 +101,7 @@ public final class Util { * fit this. 
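
[Editor's aside] In the NodeHash hunks above, node ordinals widen from int to long, so the hash now folds both 32-bit halves together with (int) (n ^ (n >> 32)) instead of using the value directly. That is the same fold java.lang.Long.hashCode() performs (it shifts unsigned; for non-negative ordinals the two are bit-identical), and it keeps the high bits of targets past 2.1 B in play rather than discarding them the way a bare (int) cast would. Illustration only, with a made-up value:

final class NodeHashFold {
  public static void main(String[] args) {
    long n = 0x123456789AL;               // a node ordinal needing more than 32 bits
    int truncated = (int) n;              // 0x3456789A: the high 0x12 is simply dropped
    int folded = (int) (n ^ (n >>> 32));  // 0x34567888: high bits still shape the hash
    System.out.println(folded == Long.valueOf(n).hashCode()); // true
  }
}
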
*/ public static IntsRef getByOutput(FST fst, long targetOutput) throws IOException { - final FST.BytesReader in = fst.getBytesReader(0); + final BytesReader in = fst.getBytesReader(0); // TODO: would be nice not to alloc this on every lookup FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -147,8 +147,8 @@ public final class Util { boolean exact = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid); final byte flags = in.readByte(); fst.readLabel(in); final long minArcOutput; @@ -273,7 +273,7 @@ public final class Util { public static class TopNSearcher { private final FST fst; - private final FST.BytesReader bytesReader; + private final BytesReader bytesReader; private final int topN; private final int maxQueueDepth; @@ -374,7 +374,7 @@ public final class Util { //System.out.println("search topN=" + topN); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final BytesReader fstReader = fst.getBytesReader(0); final T NO_OUTPUT = fst.outputs.getNoOutput(); // TODO: we could enable FST to sorting arcs by weight @@ -544,7 +544,9 @@ public final class Util { * * *
- * Note: larger FSTs (a few thousand nodes) won't even render, don't bother. + * Note: larger FSTs (a few thousand nodes) won't even + * render, don't bother. If the FST is > 2.1 GB in size + * then this method will throw strange exceptions. * * @param sameRank * If true, the resulting dot file will try @@ -578,7 +580,7 @@ public final class Util { // A bitset of already seen states (target offset). final BitSet seen = new BitSet(); - seen.set(startArc.target); + seen.set((int) startArc.target); // Shape for states. final String stateShape = "circle"; @@ -595,7 +597,7 @@ public final class Util { emitDotState(out, "initial", "point", "white", ""); final T NO_OUTPUT = fst.outputs.getNoOutput(); - final FST.BytesReader r = fst.getBytesReader(0); + final BytesReader r = fst.getBytesReader(0); // final FST.Arc scratchArc = new FST.Arc(); @@ -617,7 +619,7 @@ public final class Util { finalOutput = null; } - emitDotState(out, Integer.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput)); + emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput)); } out.write(" initial -> " + startArc.target + "\n"); @@ -638,7 +640,8 @@ public final class Util { if (FST.targetHasArcs(arc)) { // scan all target arcs //System.out.println(" readFirstTarget..."); - final int node = arc.target; + + final long node = arc.target; fst.readFirstRealTargetArc(arc.target, arc, r); @@ -648,7 +651,7 @@ public final class Util { //System.out.println(" cycle arc=" + arc); // Emit the unseen state and add it to the queue for the next level. - if (arc.target >= 0 && !seen.get(arc.target)) { + if (arc.target >= 0 && !seen.get((int) arc.target)) { /* boolean isFinal = false; @@ -675,12 +678,12 @@ public final class Util { finalOutput = ""; } - emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput); + emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput); // To see the node address, use this instead: //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target)); - seen.set(arc.target); + seen.set((int) arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); - sameLevelStates.add(arc.target); + sameLevelStates.add((int) arc.target); } String outs; @@ -893,8 +896,8 @@ public final class Util { // " targetLabel=" + targetLabel); while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc * mid + 1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc * mid + 1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - label; // System.out.println(" cycle low=" + low + " high=" + high + " mid=" + diff --git a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index dad5707167e..c0e96d2d484 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -99,7 +99,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { createIndex("index.nocfs", false, false); } */ - + /* // These are only needed for the special upgrade test to verify // that also single-segment indexes are correctly upgraded by IndexUpgrader. 
@@ -115,8 +115,40 @@ public class TestBackwardsCompatibility extends LuceneTestCase { } */ + + /* + public void testCreateMoreTermsIndex() throws Exception { + // we use a real directory name that is not cleaned up, + // because this method is only used to create backwards + // indexes: + File indexDir = new File("moreterms"); + _TestUtil.rmDir(indexDir); + Directory dir = newFSDirectory(indexDir); + + LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(); + mp.setUseCompoundFile(false); + mp.setNoCFSRatio(1.0); + mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY); + // TODO: remove randomness + IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) + .setMergePolicy(mp); + conf.setCodec(Codec.forName("Lucene40")); + IndexWriter writer = new IndexWriter(dir, conf); + LineFileDocs docs = new LineFileDocs(null, true); + for(int i=0;i<50;i++) { + writer.addDocument(docs.nextDoc()); + } + writer.close(); + dir.close(); + + // Gives you time to copy the index out!: (there is also + // a test option to not remove temp dir...): + Thread.sleep(100000); + } + */ + final static String[] oldNames = {"40.cfs", - "40.nocfs", + "40.nocfs", }; final String[] unsupportedNames = {"19.cfs", @@ -144,7 +176,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { }; final static String[] oldSingleSegmentNames = {"40.optimized.cfs", - "40.optimized.nocfs", + "40.optimized.nocfs", }; static Map oldIndexDirs; @@ -916,4 +948,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase { dir.close(); } } + + public static final String moreTermsIndex = "moreterms.40.zip"; + + public void testMoreTerms() throws Exception { + File oldIndexDir = _TestUtil.getTempDir("moreterms"); + _TestUtil.unzip(getDataFile(moreTermsIndex), oldIndexDir); + Directory dir = newFSDirectory(oldIndexDir); + // TODO: more tests + _TestUtil.checkIndex(dir); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java index cf083c5a4cd..8d4e16690e5 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java @@ -22,7 +22,6 @@ import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DocValues.Source; import org.apache.lucene.index.DocValues.Type; @@ -31,12 +30,14 @@ import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.ExactSimScorer; +import org.apache.lucene.search.similarities.Similarity.SimWeight; +import org.apache.lucene.search.similarities.Similarity.SloppySimScorer; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LineFileDocs; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util._TestUtil; /** * @@ -86,39 +87,6 @@ public class TestCustomNorms extends LuceneTestCase { dir.close(); docs.close(); } - - public void testPackedNorms() throws IOException { - Directory dir = newDirectory(); - 
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); - config.setSimilarity(new PackedNormSimilarity()); - RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config); - int num = _TestUtil.nextInt(random(), 1, 1000); - for (int i = 0; i < num; i++) { - Document doc = new Document(); - doc.add(new StringField("len", Integer.toString(i), Field.Store.YES)); - StringBuilder sb = new StringBuilder(); - for (int j = 0; j < i; j++) { - sb.append(" token"); - } - doc.add(new TextField("content", sb.toString(), Field.Store.NO)); - writer.addDocument(doc); - } - - DirectoryReader ir = writer.getReader(); - writer.close(); - for (AtomicReaderContext context : ir.leaves()) { - AtomicReader reader = context.reader(); - DocValues norms = reader.normValues("content"); - assertNotNull(norms); - Source source = norms.getSource(); - assertEquals(Type.VAR_INTS, source.getType()); - for (int i = 0; i < reader.maxDoc(); i++) { - assertEquals(source.getInt(i), Long.parseLong(reader.document(i).get("len"))); - } - } - ir.close(); - dir.close(); - } public void testExceptionOnRandomType() throws IOException { Directory dir = newDirectory(); @@ -334,28 +302,5 @@ public class TestCustomNorms extends LuceneTestCase { throw new UnsupportedOperationException(); } } - - class PackedNormSimilarity extends Similarity { - - @Override - public void computeNorm(FieldInvertState state, Norm norm) { - norm.setPackedLong(state.getLength()); - } - - @Override - public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) { - throw new UnsupportedOperationException(); - } - - @Override - public ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException { - throw new UnsupportedOperationException(); - } - } } diff --git a/lucene/core/src/test/org/apache/lucene/index/moreterms.40.zip b/lucene/core/src/test/org/apache/lucene/index/moreterms.40.zip new file mode 100644 index 00000000000..53ad7ce31e9 Binary files /dev/null and b/lucene/core/src/test/org/apache/lucene/index/moreterms.40.zip differ diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java new file mode 100644 index 00000000000..cddc9a98869 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java @@ -0,0 +1,261 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Arrays; +import java.util.Random; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TimeUnits; +import org.apache.lucene.util.packed.PackedInts; +import org.junit.Ignore; +import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; + +@Ignore("Requires tons of heap to run (10G works)") +@TimeoutSuite(millis = 100 * TimeUnits.HOUR) +public class Test2BFST extends LuceneTestCase { + + private static long LIMIT = 3L*1024*1024*1024; + + public void test() throws Exception { + int[] ints = new int[7]; + IntsRef input = new IntsRef(ints, 0, ints.length); + long seed = random().nextLong(); + + for(int doPackIter=0;doPackIter<2;doPackIter++) { + boolean doPack = doPackIter == 1; + + // Build FST w/ NoOutputs and stop when nodeCount > 3B + if (!doPack) { + System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); + Outputs outputs = NoOutputs.getSingleton(); + Object NO_OUTPUT = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, false, false, Integer.MAX_VALUE, outputs, + null, doPack, PackedInts.COMPACT, true, 15); + + int count = 0; + Random r = new Random(seed); + int[] ints2 = new int[200]; + IntsRef input2 = new IntsRef(ints2, 0, ints2.length); + while(true) { + //System.out.println("add: " + input + " -> " + output); + for(int i=10;i LIMIT) { + break; + } + nextInput(r, ints2); + } + + FST fst = b.finish(); + + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + + Arrays.fill(ints2, 0); + r = new Random(seed); + + for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + + Arrays.fill(ints2, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + for(int j=10;j outputs = ByteSequenceOutputs.getSingleton(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, + null, doPack, PackedInts.COMPACT, true, 15); + + byte[] outputBytes = new byte[20]; + BytesRef output = new BytesRef(outputBytes); + Arrays.fill(ints, 0); + int count = 0; + Random r = new Random(seed); + while(true) { + r.nextBytes(outputBytes); + //System.out.println("add: " + input + " -> " + output); + b.add(input, BytesRef.deepCopyOf(output)); + count++; + if (count % 1000000 == 0) { + System.out.println(count + "...: " + b.fstSizeInBytes() + " bytes"); + } + if (b.fstSizeInBytes() > LIMIT) { + break; + } + nextInput(r, ints); + } + + FST fst = b.finish(); + + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + + r = new Random(seed); + Arrays.fill(ints, 0); + + for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + r.nextBytes(outputBytes); + assertEquals(output, pair.output); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); + } + + // Build FST w/ PositiveIntOutputs and stop when FST + // size = 3GB + { + System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long"); + Outputs outputs = PositiveIntOutputs.getSingleton(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, 
true, Integer.MAX_VALUE, outputs, + null, doPack, PackedInts.COMPACT, true, 15); + + long output = 1; + + Arrays.fill(ints, 0); + int count = 0; + Random r = new Random(seed); + while(true) { + //System.out.println("add: " + input + " -> " + output); + b.add(input, output); + output += 1+r.nextInt(10); + count++; + if (count % 1000000 == 0) { + System.out.println(count + "...: " + b.fstSizeInBytes() + " bytes"); + } + if (b.fstSizeInBytes() > LIMIT) { + break; + } + nextInput(r, ints); + } + + FST fst = b.finish(); + + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + + Arrays.fill(ints, 0); + + output = 1; + r = new Random(seed); + for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + output = 1; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + assertEquals(output, pair.output.longValue()); + output += 1 + r.nextInt(10); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); + } + } + } + + private void nextInput(Random r, int[] ints) { + int downTo = 6; + while(downTo >= 0) { + // Must add random amounts (and not just 1) because + // otherwise FST outsmarts us and remains tiny: + ints[downTo] += 1+r.nextInt(10); + if (ints[downTo] < 256) { + break; + } else { + ints[downTo] = 0; + downTo--; + } + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java new file mode 100644 index 00000000000..7b598ed69a8 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java @@ -0,0 +1,360 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Arrays; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +public class TestBytesStore extends LuceneTestCase { + + public void testRandom() throws Exception { + + final int iters = atLeast(10); + for(int iter=0;iter 1) { + int numOps = _TestUtil.nextInt(random(), 100, 200); + for(int op=0;op builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite, true); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite, PackedInts.DEFAULT, true, 15); boolean storeOrd = random().nextBoolean(); if (VERBOSE) { @@ -453,7 +453,7 @@ public class TestFSTs extends LuceneTestCase { this.outputs = outputs; this.doPack = doPack; - builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack, !noArcArrays); + builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack, PackedInts.DEFAULT, !noArcArrays, 15); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -484,8 +484,13 @@ public class TestFSTs extends LuceneTestCase { } } + long tMid = System.currentTimeMillis(); + System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms"); + assert builder.getTermCount() == ord; FST fst = builder.finish(); + long tEnd = System.currentTimeMillis(); + System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack"); if (fst == null) { System.out.println("FST was fully pruned!"); System.exit(0); @@ -513,6 +518,12 @@ public class TestFSTs extends LuceneTestCase { return; } + /* + IndexInput in = dir.openInput("fst.bin", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + */ + System.out.println("\nNow verify..."); while(true) { @@ -576,7 +587,7 @@ public class TestFSTs extends LuceneTestCase { } } - // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-2.0.8.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { int prune = 0; int limit = Integer.MAX_VALUE; @@ -1022,7 +1033,7 @@ public class TestFSTs extends LuceneTestCase { throws IOException { if (FST.targetHasArcs(arc)) { int childCount = 0; - FST.BytesReader fstReader = fst.getBytesReader(0); + BytesReader fstReader = fst.getBytesReader(0); for (arc = fst.readFirstTargetArc(arc, arc, fstReader);; arc = fst.readNextArc(arc, fstReader), childCount++) { @@ -1062,7 +1073,7 @@ public class TestFSTs extends LuceneTestCase { public void testFinalOutputOnEndState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random().nextBoolean(), true); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, 
Integer.MAX_VALUE, outputs, null, random().nextBoolean(), PackedInts.DEFAULT, true, 15); builder.add(Util.toUTF32("stat", new IntsRef()), 17L); builder.add(Util.toUTF32("station", new IntsRef()), 10L); final FST fst = builder.finish(); @@ -1077,7 +1088,7 @@ public class TestFSTs extends LuceneTestCase { public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final boolean willRewrite = random().nextBoolean(); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite, true); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite, PackedInts.DEFAULT, true, 15); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); final FST fst = builder.finish(); @@ -1100,7 +1111,7 @@ public class TestFSTs extends LuceneTestCase { final Long nothing = outputs.getNoOutput(); final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, true); + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, true, 15); final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); diff --git a/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java b/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java index 6d85687fb56..7bb22f6ccbd 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java @@ -46,7 +46,7 @@ public class SearchFiles { /** Simple command-line based search demo. */ public static void main(String[] args) throws Exception { String usage = - "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/java/4_0/demo.html for details."; + "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details."; if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) { System.out.println(usage); System.exit(0); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java index 6ced1c70ce8..ff2663302ef 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java @@ -3,7 +3,7 @@ package org.apache.lucene.facet.associations; import java.io.IOException; import org.apache.lucene.facet.search.PayloadIterator; -import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; @@ -46,12 +46,21 @@ public abstract class AssociationsPayloadIterator * It is assumed that all association values can be deserialized with the * given {@link CategoryAssociation}. 
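
[Editor's aside] Every Builder construction touched above gains two trailing arguments: a PackedInts acceptable-overhead ratio used by the packed node-address structures, and bytesPageBits sizing the new paged BytesStore (15 gives 2^15 = 32 KB pages). The long form, read off these test call sites; the per-argument comments are my gloss, not names from Builder's source, and the snippet assumes the usual org.apache.lucene.util.fst and org.apache.lucene.util.packed imports:

Outputs<Long> outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> b = new Builder<Long>(
    FST.INPUT_TYPE.BYTE1,
    0, 0,               // suffix-pruning thresholds (0, 0 = keep everything)
    true, true,         // share suffixes / share non-singleton nodes
    Integer.MAX_VALUE,  // max shared tail length
    outputs,
    null,               // no freeze-tail hook
    false,              // doPackFST
    PackedInts.COMPACT, // acceptable overhead ratio for packed ints
    true,               // allow fixed-array arcs
    15);                // bytesPageBits for the BytesStore
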
*/ - public AssociationsPayloadIterator(IndexReader reader, String field, T association) throws IOException { - pi = new PayloadIterator(reader, new Term(field, association.getCategoryListID())); - hasAssociations = pi.init(); + public AssociationsPayloadIterator(String field, T association) throws IOException { + pi = new PayloadIterator(new Term(field, association.getCategoryListID())); this.association = association; } + /** + * Sets the {@link AtomicReaderContext} for which {@link #setNextDoc(int)} + * calls will be made. Returns true iff this reader has associations for any + * of the documents belonging to the association given to the constructor. + */ + public final boolean setNextReader(AtomicReaderContext context) throws IOException { + hasAssociations = pi.setNextReader(context); + return hasAssociations; + } + /** * Skip to the requested document. Returns true iff the document has category * association values and they were read successfully. Associations are diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java index 0708910523d..4ddb077a546 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java @@ -2,7 +2,6 @@ package org.apache.lucene.facet.associations; import java.io.IOException; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.collections.IntToFloatMap; /* @@ -31,9 +30,8 @@ public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterato private final IntToFloatMap ordinalAssociations = new IntToFloatMap(); - public FloatAssociationsPayloadIterator(IndexReader reader, String field, CategoryFloatAssociation association) - throws IOException { - super(reader, field, association); + public FloatAssociationsPayloadIterator(String field, CategoryFloatAssociation association) throws IOException { + super(field, association); } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java index e3bed4f51e5..c400c6d8768 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java @@ -2,7 +2,6 @@ package org.apache.lucene.facet.associations; import java.io.IOException; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.collections.IntToIntMap; /* @@ -31,9 +30,8 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator< private final IntToIntMap ordinalAssociations = new IntToIntMap(); - public IntAssociationsPayloadIterator(IndexReader reader, String field, CategoryIntAssociation association) - throws IOException { - super(reader, field, association); + public IntAssociationsPayloadIterator(String field, CategoryIntAssociation association) throws IOException { + super(field, association); } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java index 576b9be077a..847da005d73 100644 --- 
a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java @@ -3,13 +3,10 @@ package org.apache.lucene.facet.index.params; import java.io.IOException; import java.io.Serializable; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; - import org.apache.lucene.facet.search.CategoryListIterator; import org.apache.lucene.facet.search.PayloadCategoryListIteraor; -import org.apache.lucene.facet.search.TotalFacetCounts; import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.index.Term; import org.apache.lucene.util.encoding.DGapIntEncoder; import org.apache.lucene.util.encoding.IntDecoder; import org.apache.lucene.util.encoding.IntEncoder; @@ -98,11 +95,6 @@ public class CategoryListParams implements Serializable { return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); } - /** - * Equality is defined by the 'term' that defines this category list. - * Sub-classes should override this method if a more complex calculation - * is needed to ensure equality. - */ @Override public boolean equals(Object o) { if (o == this) { @@ -121,29 +113,16 @@ public class CategoryListParams implements Serializable { return this.term.equals(other.term); } - /** - * Hashcode is similar to {@link #equals(Object)}, in that it uses - * the term that defines this category list to derive the hashcode. - * Subclasses need to ensure that equality/hashcode is correctly defined, - * or there could be side-effects in the {@link TotalFacetCounts} caching - * mechanism (as the filename for a Total Facet Counts array cache - * is dependent on the hashCode, so it should consistently return the same - * hash for identity). - */ @Override public int hashCode() { return this.hashCode; } - /** - * Create the category list iterator for the specified partition. - */ - public CategoryListIterator createCategoryListIterator(IndexReader reader, - int partition) throws IOException { + /** Create the {@link CategoryListIterator} for the specified partition. 
*/ + public CategoryListIterator createCategoryListIterator(int partition) throws IOException { String categoryListTermStr = PartitionsUtils.partitionName(this, partition); Term payloadTerm = new Term(term.field(), categoryListTermStr); - return new PayloadCategoryListIteraor(reader, payloadTerm, - createEncoder().createMatchingDecoder()); + return new PayloadCategoryListIteraor(payloadTerm, createEncoder().createMatchingDecoder()); } } \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java index a7bf378f2d2..6cbd6770a01 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java @@ -50,7 +50,7 @@ public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator { * Create an {@link AdaptiveFacetsAccumulator} * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader) */ - public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, + public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, TaxonomyReader taxonomyReader) { super(searchParams, indexReader, taxonomyReader); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java index 5132e930264..91ca03655ae 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java @@ -2,6 +2,7 @@ package org.apache.lucene.facet.search; import java.io.IOException; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.util.IntsRef; /* @@ -23,6 +24,8 @@ import org.apache.lucene.util.IntsRef; /** * An interface for obtaining the category ordinals of documents. + * {@link #getOrdinals(int, IntsRef)} calls are done with document IDs that are + * local to the reader given to {@link #setNextReader(AtomicReaderContext)}. *

* NOTE: this class operates as a key to a map, and therefore you should * implement {@code equals()} and {@code hashCode()} for proper behavior. @@ -32,19 +35,20 @@ import org.apache.lucene.util.IntsRef; public interface CategoryListIterator { /** - * Initializes the iterator. This method must be called before any calls to - * {@link #getOrdinals(int, IntsRef)}, and its return value indicates whether there are - * any relevant documents for this iterator. + * Sets the {@link AtomicReaderContext} for which + * {@link #getOrdinals(int, IntsRef)} calls will be made. Returns true iff any + * of the documents in this reader have category ordinals. This method must be + * called before any calls to {@link #getOrdinals(int, IntsRef)}. */ - public boolean init() throws IOException; - + public boolean setNextReader(AtomicReaderContext context) throws IOException; + /** * Stores the category ordinals of the given document ID in the given * {@link IntsRef}, starting at position 0 up to {@link IntsRef#length}. Grows * the {@link IntsRef} if it is not large enough. * *

- * NOTE: if the requested document does not category ordinals + * NOTE: if the requested document does not have category ordinals * associated with it, {@link IntsRef#length} is set to zero. */ public void getOrdinals(int docID, IntsRef ints) throws IOException; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java index 3deba112f3e..b8639c3cc0d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java @@ -2,7 +2,7 @@ package org.apache.lucene.facet.search; import java.io.IOException; -import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; @@ -34,17 +34,15 @@ import org.apache.lucene.util.encoding.IntDecoder; public class PayloadCategoryListIteraor implements CategoryListIterator { private final IntDecoder decoder; - private final IndexReader indexReader; private final Term term; private final PayloadIterator pi; private final int hashCode; - public PayloadCategoryListIteraor(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException { - pi = new PayloadIterator(indexReader, term); + public PayloadCategoryListIteraor(Term term, IntDecoder decoder) throws IOException { + pi = new PayloadIterator(term); this.decoder = decoder; - hashCode = indexReader.hashCode() ^ term.hashCode(); + hashCode = term.hashCode(); this.term = term; - this.indexReader = indexReader; } @Override @@ -58,7 +56,7 @@ public class PayloadCategoryListIteraor implements CategoryListIterator { } // Hash codes are the same, check equals() to avoid cases of hash-collisions. 
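Taken together, setNextReader() and getOrdinals() replace the old one-shot init() contract with a per-segment loop: callers walk reader.leaves(), hand each AtomicReaderContext to the iterator, and then request ordinals with segment-local document IDs. The following is a minimal sketch of that loop, not part of this patch; the helper name countOrdinals, the partition number 0, and the caller-owned counts array are illustrative assumptions:

  import java.io.IOException;
  import org.apache.lucene.facet.index.params.CategoryListParams;
  import org.apache.lucene.facet.search.CategoryListIterator;
  import org.apache.lucene.index.AtomicReaderContext;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.util.IntsRef;

  // Hypothetical helper: count how often each category ordinal occurs,
  // visiting the index one segment at a time.
  static void countOrdinals(IndexReader reader, CategoryListParams clp, int[] counts) throws IOException {
    IntsRef ordinals = new IntsRef(32);
    CategoryListIterator cli = clp.createCategoryListIterator(0); // partition 0
    for (AtomicReaderContext context : reader.leaves()) {
      if (!cli.setNextReader(context)) {
        continue; // no documents in this segment have category ordinals
      }
      int maxDoc = context.reader().maxDoc();
      for (int doc = 0; doc < maxDoc; doc++) { // doc IDs are segment-local
        cli.getOrdinals(doc, ordinals);
        for (int i = 0; i < ordinals.length; i++) { // length == 0 if the doc has none
          counts[ordinals.ints[i]]++;
        }
      }
    }
  }

The reworked tests further below (CategoryListIteratorTest) follow the same pattern, re-basing into top-level data via context.docBase only where a global view is needed.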
- return indexReader.equals(that.indexReader) && term.equals(that.term); + return term.equals(that.term); } @Override @@ -67,8 +65,8 @@ } @Override - public boolean init() throws IOException { - return pi.init(); + public boolean setNextReader(AtomicReaderContext context) throws IOException { + return pi.setNextReader(context); } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java index 7cc7527280d..7d956ac608e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java @@ -1,12 +1,10 @@ package org.apache.lucene.facet.search; import java.io.IOException; -import java.util.Iterator; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -42,99 +40,75 @@ import org.apache.lucene.util.BytesRef; */ public class PayloadIterator { - protected BytesRef data; private TermsEnum reuseTE; - private DocsAndPositionsEnum currentDPE; + private DocsAndPositionsEnum dpe; private boolean hasMore; - private int curDocID, curDocBase; + private int curDocID; - private final Iterator<AtomicReaderContext> leaves; private final Term term; - public PayloadIterator(IndexReader indexReader, Term term) throws IOException { - leaves = indexReader.leaves().iterator(); + public PayloadIterator(Term term) throws IOException { this.term = term; } - private void nextSegment() throws IOException { + /** + * Sets the {@link AtomicReaderContext} for which {@link #getPayload(int)} + * calls will be made. Returns true iff this reader has payload for any of the + * documents belonging to the {@link Term} given to the constructor. + */ + public boolean setNextReader(AtomicReaderContext context) throws IOException { hasMore = false; - while (leaves.hasNext()) { - AtomicReaderContext ctx = leaves.next(); - curDocBase = ctx.docBase; - Fields fields = ctx.reader().fields(); - if (fields != null) { - Terms terms = fields.terms(term.field()); - if (terms != null) { - reuseTE = terms.iterator(reuseTE); - if (reuseTE.seekExact(term.bytes(), true)) { - // this class is usually used to iterate on whatever a Query matched - // if it didn't match deleted documents, we won't receive them. if it - // did, we should iterate on them too, therefore we pass liveDocs=null - currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS); - if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - hasMore = true; - break; - } + Fields fields = context.reader().fields(); + if (fields != null) { + Terms terms = fields.terms(term.field()); + if (terms != null) { + reuseTE = terms.iterator(reuseTE); + if (reuseTE.seekExact(term.bytes(), true)) { + // this class is usually used to iterate on whatever a Query matched + // if it didn't match deleted documents, we won't receive them.
if it + // did, we should iterate on them too, therefore we pass liveDocs=null + dpe = reuseTE.docsAndPositions(null, dpe, DocsAndPositionsEnum.FLAG_PAYLOADS); + if (dpe != null && (curDocID = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + hasMore = true; } } } } + return hasMore; } - /** - * Initialize the iterator. Should be done before the first call to - * {@link #getPayload(int)}. Returns {@code false} if no category list is - * found, or the category list has no documents. - */ - public boolean init() throws IOException { - nextSegment(); - return hasMore; - } - /** * Returns the {@link BytesRef payload} of the given document, or {@code null} * if the document does not exist, there are no more documents in the posting - * list, or the document exists but has not payload. You should call - * {@link #init()} before the first call to this method. + * list, or the document exists but has no payload. The given document IDs + * are treated as local to the reader given to + * {@link #setNextReader(AtomicReaderContext)}. */ public BytesRef getPayload(int docID) throws IOException { if (!hasMore) { return null; } - // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc - int localDocID = docID - curDocBase; - - if (curDocID > localDocID) { + if (curDocID > docID) { // document does not exist return null; } - if (curDocID < localDocID) { - // look for the document either in that segment, or others - while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) { - nextSegment(); // also updates curDocID - localDocID = docID - curDocBase; - // nextSegment advances to nextDoc, so check if we still need to advance - if (curDocID >= localDocID) { - break; + if (curDocID < docID) { + curDocID = dpe.advance(docID); + if (curDocID != docID) { // requested document does not have a payload + if (curDocID == DocIdSetIterator.NO_MORE_DOCS) { // no more docs in this reader + hasMore = false; } - } - - // we break from the above loop when: // 1. we iterated over all segments (hasMore=false) // 2.
current segment advanced to a doc, either requested or higher - if (!hasMore || curDocID != localDocID) { return null; } } // we're on the document - assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase); - int pos = currentDPE.nextPosition(); - assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase); - return currentDPE.getPayload(); + assert dpe.freq() == 1 : "expecting freq=1 (got " + dpe.freq() + ") term=" + term + " doc=" + curDocID; + int pos = dpe.nextPosition(); + assert pos != -1 : "no positions for term=" + term + " doc=" + curDocID; + return dpe.getPayload(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/ScoredDocIdCollector.java b/lucene/facet/src/java/org/apache/lucene/facet/search/ScoredDocIdCollector.java index 853651f2a1a..7b666be4008 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/ScoredDocIdCollector.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/ScoredDocIdCollector.java @@ -62,7 +62,7 @@ public abstract class ScoredDocIdCollector extends Collector { } @Override - public ScoredDocIDsIterator scoredDocIdsIterator() { + protected ScoredDocIDsIterator scoredDocIdsIterator() { return new ScoredDocIDsIterator() { private DocIdSetIterator docIdsIter = docIds.iterator(); @@ -129,7 +129,7 @@ public abstract class ScoredDocIdCollector extends Collector { } @Override - public ScoredDocIDsIterator scoredDocIdsIterator() { + protected ScoredDocIDsIterator scoredDocIdsIterator() { return new ScoredDocIDsIterator() { private DocIdSetIterator docIdsIter = docIds.iterator(); @@ -189,8 +189,7 @@ public abstract class ScoredDocIdCollector extends Collector { * do not require scoring, it is better to set it to false. */ public static ScoredDocIdCollector create(int maxDoc, boolean enableScoring) { - return enableScoring ? new ScoringDocIdCollector(maxDoc) - : new NonScoringDocIdCollector(maxDoc); + return enableScoring ? new ScoringDocIdCollector(maxDoc) : new NonScoringDocIdCollector(maxDoc); } private ScoredDocIdCollector(int maxDoc) { @@ -198,13 +197,14 @@ public abstract class ScoredDocIdCollector extends Collector { docIds = new FixedBitSet(maxDoc); } + protected abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException; + /** Returns the default score used when scoring is disabled. */ public abstract float getDefaultScore(); /** Set the default score. Only applicable if scoring is disabled. 
*/ public abstract void setDefaultScore(float defaultScore); - public abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException; public ScoredDocIDs getScoredDocIDs() { return new ScoredDocIDs() { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java index 32466269d60..7373e4dba34 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java @@ -4,22 +4,23 @@ import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map.Entry; import java.util.logging.Level; import java.util.logging.Logger; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.util.IntsRef; - import org.apache.lucene.facet.search.aggregator.Aggregator; -import org.apache.lucene.facet.search.params.FacetSearchParams; import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; import org.apache.lucene.facet.search.results.FacetResult; import org.apache.lucene.facet.search.results.IntermediateFacetResult; import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.facet.util.PartitionsUtils; import org.apache.lucene.facet.util.ScoredDocIdsUtils; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -179,11 +180,11 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { List res = new ArrayList(); for (FacetRequest fr : searchParams.getFacetRequests()) { FacetResultsHandler frHndlr = fr.createFacetResultsHandler(taxonomyReader); - IntermediateFacetResult tmpResult = fr2tmpRes.get(fr); + IntermediateFacetResult tmpResult = fr2tmpRes.get(fr); if (tmpResult == null) { continue; // do not add a null to the list. } - FacetResult facetRes = frHndlr.renderFacetResult(tmpResult); + FacetResult facetRes = frHndlr.renderFacetResult(tmpResult); // final labeling if allowed (because labeling is a costly operation) if (isAllowLabeling()) { frHndlr.labelResult(facetRes); @@ -213,18 +214,15 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { /** Check if it is worth to use complements */ protected boolean shouldComplement(ScoredDocIDs docids) { - return - mayComplement() && - (docids.size() > indexReader.numDocs() * getComplementThreshold()) ; + return mayComplement() && (docids.size() > indexReader.numDocs() * getComplementThreshold()) ; } /** * Iterate over the documents for this partition and fill the facet arrays with the correct * count/complement count/value. - * @throws IOException If there is a low-level I/O error. 
*/ - private final void fillArraysForPartition(ScoredDocIDs docids, - FacetArrays facetArrays, int partition) throws IOException { + private final void fillArraysForPartition(ScoredDocIDs docids, FacetArrays facetArrays, int partition) + throws IOException { if (isUsingComplements) { initArraysByTotalCounts(facetArrays, partition, docids.size()); @@ -236,27 +234,41 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps for (Entry<CategoryListIterator, Aggregator> entry : categoryLists.entrySet()) { - CategoryListIterator categoryList = entry.getKey(); - if (!categoryList.init()) { - continue; - } - - Aggregator categorator = entry.getValue(); - ScoredDocIDsIterator iterator = docids.iterator(); + final ScoredDocIDsIterator iterator = docids.iterator(); + final CategoryListIterator categoryListIter = entry.getKey(); + final Aggregator aggregator = entry.getValue(); + Iterator<AtomicReaderContext> contexts = indexReader.leaves().iterator(); + AtomicReaderContext current = null; + int maxDoc = -1; while (iterator.next()) { int docID = iterator.getDocID(); - categoryList.getOrdinals(docID, ordinals); - if (ordinals.length == 0) { - continue; + while (docID >= maxDoc) { // find the segment which contains this document + if (!contexts.hasNext()) { + throw new RuntimeException("ScoredDocIDs contains documents outside this reader's segments!?"); + } + current = contexts.next(); + maxDoc = current.docBase + current.reader().maxDoc(); + if (docID < maxDoc) { // segment has docs, check if it has categories + boolean validSegment = categoryListIter.setNextReader(current); + validSegment &= aggregator.setNextReader(current); + if (!validSegment) { // if categoryList or aggregator says it's an invalid segment, skip all docs + while (docID < maxDoc && iterator.next()) { + docID = iterator.getDocID(); + } + } + } } - categorator.aggregate(docID, iterator.getScore(), ordinals); + docID -= current.docBase; + categoryListIter.getOrdinals(docID, ordinals); + if (ordinals.length == 0) { + continue; // document does not have category ordinals + } + aggregator.aggregate(docID, iterator.getScore(), ordinals); } } } - /** - * Init arrays for partition by total counts, optionally applying a factor - */ + /** Init arrays for partition by total counts, optionally applying a factor */ private final void initArraysByTotalCounts(FacetArrays facetArrays, int partition, int nAccumulatedDocs) { int[] intArray = facetArrays.getIntArray(); totalFacetCounts.fillTotalCountsForPartition(intArray, partition); @@ -302,10 +314,9 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { for (FacetRequest facetRequest : searchParams.getFacetRequests()) { Aggregator categoryAggregator = facetRequest.createAggregator( - isUsingComplements, facetArrays, indexReader, taxonomyReader); + isUsingComplements, facetArrays, taxonomyReader); - CategoryListIterator cli = - facetRequest.createCategoryListIterator(indexReader, taxonomyReader, searchParams, partition); + CategoryListIterator cli = facetRequest.createCategoryListIterator(taxonomyReader, searchParams, partition); // get the aggregator Aggregator old = categoryLists.put(cli, categoryAggregator); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java index b5bb5c4cf0d..84c3b2fc05a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java +++
b/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java @@ -170,7 +170,7 @@ public class TotalFacetCounts { Aggregator aggregator = new CountingAggregator(counts[partition]); HashMap map = new HashMap(); for (CategoryListParams clp: facetIndexingParams.getAllCategoryListParams()) { - final CategoryListIterator cli = clIteraor(clCache, clp, indexReader, partition); + final CategoryListIterator cli = clIteraor(clCache, clp, partition); map.put(cli, aggregator); } return map; @@ -181,14 +181,14 @@ public class TotalFacetCounts { return new TotalFacetCounts(taxonomy, facetIndexingParams, counts, CreationType.Computed); } - static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp, - IndexReader indexReader, int partition) throws IOException { + static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp, int partition) + throws IOException { if (clCache != null) { CategoryListData cld = clCache.get(clp); if (cld != null) { return cld.iterator(partition); } } - return clp.createCategoryListIterator(indexReader, partition); + return clp.createCategoryListIterator(partition); } } \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java index a3743317cc8..2eac9155706 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java @@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator; import java.io.IOException; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.util.IntsRef; /* @@ -22,21 +23,22 @@ import org.apache.lucene.util.IntsRef; */ /** - * An Aggregator is the analogue of Lucene's Collector (see - * {@link org.apache.lucene.search.Collector}), for processing the categories - * belonging to a certain document. The Aggregator is responsible for doing - * whatever it wishes with the categories it is fed, e.g., counting the number - * of times that each category appears, or performing some computation on their - * association values. - *

- * Much of the function of an Aggregator implementation is not described by this - * interface. This includes the constructor and getter methods to retrieve the - * results of the aggregation. + * Aggregates the categories of documents given to + * {@link #aggregate(int, float, IntsRef)}. Note that the document IDs are local + * to the reader given to {@link #setNextReader(AtomicReaderContext)}. * * @lucene.experimental */ public interface Aggregator { + /** + * Sets the {@link AtomicReaderContext} for which + * {@link #aggregate(int, float, IntsRef)} calls will be made. If this method + * returns false, {@link #aggregate(int, float, IntsRef)} should not be called + * for this reader. + */ + public boolean setNextReader(AtomicReaderContext context) throws IOException; + /** * Aggregate the ordinals of the given document ID (and its score). The given * ordinals offset is always zero. diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java index 8cd71595dc8..53c03caf427 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java @@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator; import java.io.IOException; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.util.IntsRef; /* @@ -57,4 +58,9 @@ public class CountingAggregator implements Aggregator { return counterArray == null ? 0 : counterArray.hashCode(); } + @Override + public boolean setNextReader(AtomicReaderContext context) throws IOException { + return true; + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java index 6c3dc492703..4b0a67083db 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java @@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator; import java.io.IOException; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.util.IntsRef; /* @@ -58,4 +59,9 @@ public class ScoringAggregator implements Aggregator { return hashCode; } + @Override + public boolean setNextReader(AtomicReaderContext context) throws IOException { + return true; + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java index 22ebfecdba9..5aa38af2d33 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java @@ -6,7 +6,7 @@ import org.apache.lucene.facet.associations.CategoryFloatAssociation; import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; -import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.util.IntsRef; import 
org.apache.lucene.util.collections.IntToFloatMap; @@ -39,13 +39,13 @@ public class AssociationFloatSumAggregator implements Aggregator { protected final float[] sumArray; protected final FloatAssociationsPayloadIterator associations; - public AssociationFloatSumAggregator(IndexReader reader, float[] sumArray) throws IOException { - this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray); + public AssociationFloatSumAggregator(float[] sumArray) throws IOException { + this(CategoryListParams.DEFAULT_TERM.field(), sumArray); } - public AssociationFloatSumAggregator(String field, IndexReader reader, float[] sumArray) throws IOException { + public AssociationFloatSumAggregator(String field, float[] sumArray) throws IOException { this.field = field; - associations = new FloatAssociationsPayloadIterator(reader, field, new CategoryFloatAssociation()); + associations = new FloatAssociationsPayloadIterator(field, new CategoryFloatAssociation()); this.sumArray = sumArray; } @@ -76,4 +76,9 @@ public class AssociationFloatSumAggregator implements Aggregator { return field.hashCode(); } + @Override + public boolean setNextReader(AtomicReaderContext context) throws IOException { + return associations.setNextReader(context); + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java index 2f12080a35b..52baf368aa7 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java @@ -6,7 +6,7 @@ import org.apache.lucene.facet.associations.CategoryIntAssociation; import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; -import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.collections.IntToIntMap; @@ -39,13 +39,13 @@ public class AssociationIntSumAggregator implements Aggregator { protected final int[] sumArray; protected final IntAssociationsPayloadIterator associations; - public AssociationIntSumAggregator(IndexReader reader, int[] sumArray) throws IOException { - this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray); + public AssociationIntSumAggregator(int[] sumArray) throws IOException { + this(CategoryListParams.DEFAULT_TERM.field(), sumArray); } - public AssociationIntSumAggregator(String field, IndexReader reader, int[] sumArray) throws IOException { + public AssociationIntSumAggregator(String field, int[] sumArray) throws IOException { this.field = field; - associations = new IntAssociationsPayloadIterator(reader, field, new CategoryIntAssociation()); + associations = new IntAssociationsPayloadIterator(field, new CategoryIntAssociation()); this.sumArray = sumArray; } @@ -76,4 +76,9 @@ public class AssociationIntSumAggregator implements Aggregator { return field.hashCode(); } + @Override + public boolean setNextReader(AtomicReaderContext context) throws IOException { + return associations.setNextReader(context); + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java 
b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java index 4dc5c694a8c..db4769c3d43 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java @@ -6,6 +6,7 @@ import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.search.CategoryListIterator; import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.IntsRef; @@ -56,25 +57,30 @@ public class CategoryListData { } /** Compute category list data for caching for faster iteration. */ - CategoryListData(IndexReader reader, TaxonomyReader taxo, - FacetIndexingParams iparams, CategoryListParams clp) throws IOException { + CategoryListData(IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams, CategoryListParams clp) + throws IOException { - final int maxDoc = reader.maxDoc(); - int[][][]dpf = new int[maxDoc][][]; + int[][][]dpf = new int[reader.maxDoc()][][]; int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize()); IntsRef ordinals = new IntsRef(32); for (int part = 0; part < numPartitions; part++) { - CategoryListIterator cli = clp.createCategoryListIterator(reader, part); - if (cli.init()) { - for (int doc = 0; doc < maxDoc; doc++) { - cli.getOrdinals(doc, ordinals); - if (ordinals.length > 0) { - if (dpf[doc] == null) { - dpf[doc] = new int[numPartitions][]; - } - dpf[doc][part] = new int[ordinals.length]; - for (int i = 0; i < ordinals.length; i++) { - dpf[doc][part][i] = ordinals.ints[i]; + for (AtomicReaderContext context : reader.leaves()) { + CategoryListIterator cli = clp.createCategoryListIterator(part); + if (cli.setNextReader(context)) { + final int maxDoc = context.reader().maxDoc(); + for (int i = 0; i < maxDoc; i++) { + cli.getOrdinals(i, ordinals); + if (ordinals.length > 0) { + int doc = i + context.docBase; + if (dpf[doc] == null) { + dpf[doc] = new int[numPartitions][]; + } + if (dpf[doc][part] == null) { + dpf[doc][part] = new int[ordinals.length]; + } + for (int j = 0; j < ordinals.length; j++) { + dpf[doc][part][j] = ordinals.ints[j]; + } } } } @@ -93,6 +99,7 @@ public class CategoryListData { /** Internal: category list iterator over uncompressed category info in RAM */ private static class RAMCategoryListIterator implements CategoryListIterator { + private int docBase; private final int part; private final int[][][] dpc; @@ -102,13 +109,15 @@ public class CategoryListData { } @Override - public boolean init() throws IOException { + public boolean setNextReader(AtomicReaderContext context) throws IOException { + docBase = context.docBase; return dpc != null && dpc.length > part; } - + @Override public void getOrdinals(int docID, IntsRef ints) throws IOException { ints.length = 0; + docID += docBase; if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) { if (ints.ints.length < dpc[docID][part].length) { ints.grow(dpc[docID][part].length); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java index 71c7df7f0c4..2ee4636ae5a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java +++ 
b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java @@ -1,7 +1,5 @@ package org.apache.lucene.facet.search.params; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.search.FacetArrays; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.aggregator.ComplementCountingAggregator; @@ -47,8 +45,7 @@ public class CountFacetRequest extends FacetRequest { } @Override - public Aggregator createAggregator(boolean useComplements, - FacetArrays arrays, IndexReader reader, TaxonomyReader taxonomy) { + public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) { // we rely on that, if needed, result is cleared by arrays! int[] a = arrays.getIntArray(); if (useComplements) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java index 388d0612817..851f3a3f3a2 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java @@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params; import java.io.IOException; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.CategoryListIterator; import org.apache.lucene.facet.search.FacetArrays; @@ -11,8 +9,8 @@ import org.apache.lucene.facet.search.FacetResultsHandler; import org.apache.lucene.facet.search.TopKFacetResultsHandler; import org.apache.lucene.facet.search.TopKInEachNodeHandler; import org.apache.lucene.facet.search.aggregator.Aggregator; -import org.apache.lucene.facet.search.cache.CategoryListData; import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.search.cache.CategoryListData; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyReader; @@ -314,33 +312,29 @@ public abstract class FacetRequest implements Cloneable { * computation. * @param arrays * provider for facet arrays in use for current computation. - * @param indexReader - * index reader in effect. * @param taxonomy * reader of taxonomy in effect. * @throws IOException If there is a low-level I/O error. */ - public abstract Aggregator createAggregator(boolean useComplements, - FacetArrays arrays, IndexReader indexReader, - TaxonomyReader taxonomy) throws IOException; + public abstract Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) + throws IOException; /** - * Create the category list iterator for the specified partition. - * If a non null cache is provided which contains the required data, - * use it for the iteration. + * Create the category list iterator for the specified partition. If a non + * null cache is provided which contains the required data, use it for the + * iteration. 
*/ - public CategoryListIterator createCategoryListIterator(IndexReader reader, - TaxonomyReader taxo, FacetSearchParams sParams, int partition) + public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams, int partition) throws IOException { CategoryListCache clCache = sParams.getCategoryListCache(); CategoryListParams clParams = sParams.getFacetIndexingParams().getCategoryListParams(categoryPath); - if (clCache!=null) { + if (clCache != null) { CategoryListData clData = clCache.get(clParams); - if (clData!=null) { + if (clData != null) { return clData.iterator(partition); } } - return clParams.createCategoryListIterator(reader, partition); + return clParams.createCategoryListIterator(partition); } /** diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java index b3ce102320b..8da3038a0be 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java @@ -1,7 +1,5 @@ package org.apache.lucene.facet.search.params; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.search.FacetArrays; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.aggregator.ScoringAggregator; @@ -38,9 +36,7 @@ public class ScoreFacetRequest extends FacetRequest { } @Override - public Aggregator createAggregator(boolean useComplements, - FacetArrays arrays, IndexReader reader, - TaxonomyReader taxonomy) { + public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) { assert !useComplements : "complements are not supported by this FacetRequest"; return new ScoringAggregator(arrays.getFloatArray()); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java index 35e7058c3e5..5e5a258d726 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java @@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params.associations; import java.io.IOException; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.search.FacetArrays; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.aggregator.associations.AssociationFloatSumAggregator; @@ -45,10 +43,10 @@ public class AssociationFloatSumFacetRequest extends FacetRequest { } @Override - public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, IndexReader reader, - TaxonomyReader taxonomy) throws IOException { + public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) + throws IOException { assert !useComplements : "complements are not supported by this FacetRequest"; - return new AssociationFloatSumAggregator(reader, arrays.getFloatArray()); + return new AssociationFloatSumAggregator(arrays.getFloatArray()); } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java 
b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java index 96d5485de24..0d291119d6a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java @@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params.associations; import java.io.IOException; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.search.FacetArrays; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.aggregator.associations.AssociationIntSumAggregator; @@ -45,10 +43,10 @@ public class AssociationIntSumFacetRequest extends FacetRequest { } @Override - public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, IndexReader reader, - TaxonomyReader taxonomy) throws IOException { + public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) + throws IOException { assert !useComplements : "complements are not supported by this FacetRequest"; - return new AssociationIntSumAggregator(reader, arrays.getIntArray()); + return new AssociationIntSumAggregator(arrays.getIntArray()); } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java b/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java index e6f7f7d3b5a..294e602b762 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java @@ -60,6 +60,7 @@ public abstract class Sampler { /** * Construct with certain {@link SamplingParams} + * * @param params sampling params in effect * @throws IllegalArgumentException if the provided SamplingParams are not valid */ @@ -110,16 +111,15 @@ public abstract class Sampler { * @param sampleSetSize required size of sample set * @return sample of the input set in the required size */ - protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, - int sampleSetSize) throws IOException; + protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize) + throws IOException; /** * Get a fixer of sample facet accumulation results. Default implementation * returns a TakmiSampleFixer which is adequate only for * counting. For any other accumulator, provide a different fixer. 
*/ - public SampleFixer getSampleFixer( - IndexReader indexReader, TaxonomyReader taxonomyReader, + public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader, FacetSearchParams searchParams) { return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams); } @@ -161,10 +161,10 @@ public abstract class Sampler { OverSampledFacetRequest sampledFreq = null; try { - sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest(); + sampledFreq = (OverSampledFacetRequest) facetResult.getFacetRequest(); } catch (ClassCastException e) { throw new IllegalArgumentException( - "It is only valid to call this method with result obtained for a" + + "It is only valid to call this method with result obtained for a " + "facet request created through sampler.overSamlpingSearchParams()", e); } @@ -215,19 +215,15 @@ public abstract class Sampler { } @Override - public CategoryListIterator createCategoryListIterator(IndexReader reader, - TaxonomyReader taxo, FacetSearchParams sParams, int partition) - throws IOException { - return orig.createCategoryListIterator(reader, taxo, sParams, partition); + public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams, + int partition) throws IOException { + return orig.createCategoryListIterator(taxo, sParams, partition); } - @Override - public Aggregator createAggregator(boolean useComplements, - FacetArrays arrays, IndexReader indexReader, - TaxonomyReader taxonomy) throws IOException { - return orig.createAggregator(useComplements, arrays, indexReader, - taxonomy); + public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) + throws IOException { + return orig.createAggregator(useComplements, arrays, taxonomy); } @Override @@ -245,4 +241,5 @@ public abstract class Sampler { return orig.supportsComplements(); } } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java b/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java index 54d2193f73f..1a1e6fa89d8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java @@ -91,8 +91,7 @@ class TakmiSampleFixer implements SampleFixer { * full set of matching documents. * @throws IOException If there is a low-level I/O error. */ - private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) - throws IOException { + private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException { // TODO (Facet): change from void to return the new, smaller docSet, and use // that for the children, as this will make their intersection ops faster. // can do this only when the new set is "sufficiently" smaller. 
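The recount performed in the hunk below reduces to a merge-style intersection between the drill-down term's postings and the sampled doc-id set. As a sketch only, assuming both enumerations yield ascending top-level doc IDs (as MultiFields.getTermDocsEnum provides); TakmiSampleFixer's actual countIntersection may differ in details:

  import java.io.IOException;
  import org.apache.lucene.facet.search.ScoredDocIDsIterator;
  import org.apache.lucene.index.DocsEnum;
  import org.apache.lucene.search.DocIdSetIterator;

  // Hypothetical sketch: count doc IDs present in both enumerations.
  static int intersectionCount(DocsEnum docs, ScoredDocIDsIterator iter) throws IOException {
    if (docs == null || !iter.next()) {
      return 0; // term matches no documents, or the doc-id set is empty
    }
    int count = 0;
    int doc = docs.nextDoc();
    while (doc != DocIdSetIterator.NO_MORE_DOCS) {
      int other = iter.getDocID();
      if (other < doc) {
        if (!iter.next()) break;   // advance the doc-id set side
      } else if (other > doc) {
        doc = docs.advance(other); // leap-frog the postings side
      } else {
        count++;                   // present in both
        doc = docs.nextDoc();
        if (!iter.next()) break;
      }
    }
    return count;
  }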
@@ -109,8 +108,7 @@ class TakmiSampleFixer implements SampleFixer { Bits liveDocs = MultiFields.getLiveDocs(indexReader); int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, liveDocs, drillDownTerm.field(), drillDownTerm.bytes(), - 0), - docIds.iterator()); + 0), docIds.iterator()); fresNode.setValue(updatedCount); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java index 402ba3dcf69..7d59a76dfed 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java @@ -5,6 +5,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.util.IntsRef; /* @@ -42,9 +43,10 @@ public class MultiCategoryListIterator implements CategoryListIterator { } @Override - public boolean init() throws IOException { + public boolean setNextReader(AtomicReaderContext context) throws IOException { + validIterators.clear(); for (CategoryListIterator cli : iterators) { - if (cli.init()) { + if (cli.setNextReader(context)) { validIterators.add(cli); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java b/lucene/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java index 248ec0da75b..923045ff2d9 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java @@ -3,17 +3,18 @@ package org.apache.lucene.facet.util; import java.io.IOException; import java.util.Arrays; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.OpenBitSetDISI; -import org.apache.lucene.facet.search.ScoredDocIDs; -import org.apache.lucene.facet.search.ScoredDocIDsIterator; - /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -49,48 +50,57 @@ public class ScoredDocIdsUtils { * @param reader holding the number of documents & information about deletions. */ public final static ScoredDocIDs getComplementSet(final ScoredDocIDs docids, final IndexReader reader) - throws IOException { + throws IOException { final int maxDoc = reader.maxDoc(); DocIdSet docIdSet = docids.getDocIDs(); - final OpenBitSet complement; - if (docIdSet instanceof OpenBitSet) { + final FixedBitSet complement; + if (docIdSet instanceof FixedBitSet) { // That is the most common case, if ScoredDocIdsCollector was used. 
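The OpenBitSet-to-FixedBitSet migration immediately below keeps the complement recipe itself unchanged: materialize the matching docs as set bits, flip the whole [0, maxDoc) range, then clear deleted documents segment by segment. A compressed sketch of the first two steps, with a hypothetical helper name (the hunk that follows is the authoritative version):

  import java.io.IOException;
  import org.apache.lucene.search.DocIdSet;
  import org.apache.lucene.search.DocIdSetIterator;
  import org.apache.lucene.util.FixedBitSet;

  // Hypothetical helper: the complement of a DocIdSet over [0, maxDoc).
  static FixedBitSet complementOf(DocIdSet docIdSet, int maxDoc) throws IOException {
    FixedBitSet complement = new FixedBitSet(maxDoc);
    DocIdSetIterator iter = docIdSet.iterator();
    if (iter != null) { // a DocIdSet may expose no iterator when empty
      int doc;
      while ((doc = iter.nextDoc()) < maxDoc) { // NO_MORE_DOCS == Integer.MAX_VALUE
        complement.set(doc);
      }
    }
    complement.flip(0, maxDoc); // without the flip this would be the set itself, not its complement
    return complement;
  }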
- complement = ((OpenBitSet) docIdSet).clone(); + complement = ((FixedBitSet) docIdSet).clone(); } else { - complement = new OpenBitSetDISI(docIdSet.iterator(), maxDoc); + complement = new FixedBitSet(maxDoc); + DocIdSetIterator iter = docIdSet.iterator(); + int doc; + while ((doc = iter.nextDoc()) < maxDoc) { + complement.set(doc); + } } complement.flip(0, maxDoc); - - // Remove all Deletions from the complement set clearDeleted(reader, complement); return createScoredDocIds(complement, maxDoc); } - - /** - * Clear all deleted documents from a given open-bit-set according to a given reader - */ - private static void clearDeleted(final IndexReader reader, - final OpenBitSet set) throws IOException { - + + /** Clear all deleted documents from a given open-bit-set according to a given reader */ + private static void clearDeleted(final IndexReader reader, final FixedBitSet set) throws IOException { + // If there are no deleted docs if (!reader.hasDeletions()) { return; // return immediately } - Bits bits = MultiFields.getLiveDocs(reader); DocIdSetIterator it = set.iterator(); - int doc = DocIdSetIterator.NO_MORE_DOCS; - while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - if (!bits.get(doc)) { - set.fastClear(doc); + int doc = it.nextDoc(); + for (AtomicReaderContext context : reader.leaves()) { + AtomicReader r = context.reader(); + final int maxDoc = r.maxDoc() + context.docBase; + if (doc >= maxDoc) { // skip this segment + continue; } + if (!r.hasDeletions()) { // skip all docs that belong to this reader as it has no deletions + while ((doc = it.nextDoc()) < maxDoc) {} + continue; + } + Bits liveDocs = r.getLiveDocs(); + do { + if (!liveDocs.get(doc - context.docBase)) { + set.clear(doc); + } + } while ((doc = it.nextDoc()) < maxDoc); } } - + /** * Create a subset of an existing ScoredDocIDs object. * @@ -274,8 +284,7 @@ public class ScoredDocIdsUtils { if (target <= next) { target = next + 1; } - return next = target >= maxDoc ? NO_MORE_DOCS - : target; + return next = target >= maxDoc ?
NO_MORE_DOCS : target; } @Override @@ -420,4 +429,5 @@ public class ScoredDocIdsUtils { } } } + } \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java index 9d815cffa01..709bb3c16f5 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java @@ -317,8 +317,7 @@ public abstract class FacetTestBase extends LuceneTestCase { } /** Validate results equality */ - protected static void assertSameResults(List expected, - List actual) { + protected static void assertSameResults(List expected, List actual) { String expectedResults = resStringValueOnly(expected); String actualResults = resStringValueOnly(actual); if (!expectedResults.equals(actualResults)) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/AdaptiveAccumulatorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/AdaptiveAccumulatorTest.java index 7003fbbf6b6..4c2f1fcfb6b 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/AdaptiveAccumulatorTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/AdaptiveAccumulatorTest.java @@ -29,12 +29,11 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader; public class AdaptiveAccumulatorTest extends BaseSampleTestTopK { @Override - protected FacetsAccumulator getSamplingAccumulator(Sampler sampler, - TaxonomyReader taxoReader, IndexReader indexReader, - FacetSearchParams searchParams) { - AdaptiveFacetsAccumulator res = new AdaptiveFacetsAccumulator(searchParams, - indexReader, taxoReader); + protected FacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader, + IndexReader indexReader, FacetSearchParams searchParams) { + AdaptiveFacetsAccumulator res = new AdaptiveFacetsAccumulator(searchParams, indexReader, taxoReader); res.setSampler(sampler); return res; } + } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java index 7db8056a71c..c326bbb30c6 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java @@ -14,6 +14,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; @@ -106,30 +107,31 @@ public class CategoryListIteratorTest extends LuceneTestCase { IndexReader reader = writer.getReader(); writer.close(); - IntsRef ordinals = new IntsRef(); - CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder()); - cli.init(); int totalCategories = 0; - for (int i = 0; i < data.length; i++) { - Set values = new HashSet(); - for (int j = 0; j < data[i].length; j++) { - values.add(data[i].ints[j]); + IntsRef ordinals = new IntsRef(); + CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder()); + for (AtomicReaderContext context : reader.leaves()) { + cli.setNextReader(context); + int maxDoc = context.reader().maxDoc(); + int dataIdx 
= context.docBase; + for (int doc = 0; doc < maxDoc; doc++, dataIdx++) { + Set values = new HashSet(); + for (int j = 0; j < data[dataIdx].length; j++) { + values.add(data[dataIdx].ints[j]); + } + cli.getOrdinals(doc, ordinals); + assertTrue("no ordinals for document " + doc, ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); + } + totalCategories += ordinals.length; } - cli.getOrdinals(i, ordinals); - assertTrue("no ordinals for document " + i, ordinals.length > 0); - for (int j = 0; j < ordinals.length; j++) { - assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); - } - totalCategories += ordinals.length; } - assertEquals("Missing categories!",10,totalCategories); + assertEquals("Missing categories!", 10, totalCategories); reader.close(); dir.close(); } - /** - * Test that a document with no payloads does not confuse the payload decoder. - */ @Test public void testPayloadIteratorWithInvalidDoc() throws Exception { Directory dir = newDirectory(); @@ -160,24 +162,28 @@ public class CategoryListIteratorTest extends LuceneTestCase { IndexReader reader = writer.getReader(); writer.close(); - IntsRef ordinals = new IntsRef(); - CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder()); - assertTrue("Failed to initialize payload iterator", cli.init()); int totalCategories = 0; - for (int i = 0; i < data.length; i++) { - Set values = new HashSet(); - for (int j = 0; j < data[i].length; j++) { - values.add(data[i].ints[j]); - } - cli.getOrdinals(i, ordinals); - if (i == 0) { - assertTrue("document 0 must have a payload", ordinals.length > 0); - for (int j = 0; j < ordinals.length; j++) { - assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); + IntsRef ordinals = new IntsRef(); + CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder()); + for (AtomicReaderContext context : reader.leaves()) { + cli.setNextReader(context); + int maxDoc = context.reader().maxDoc(); + int dataIdx = context.docBase; + for (int doc = 0; doc < maxDoc; doc++, dataIdx++) { + Set values = new HashSet(); + for (int j = 0; j < data[dataIdx].length; j++) { + values.add(data[dataIdx].ints[j]); + } + cli.getOrdinals(doc, ordinals); + if (dataIdx == 0) { + assertTrue("document 0 must have a payload", ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); + } + totalCategories += ordinals.length; + } else { + assertTrue("only document 0 should have a payload", ordinals.length == 0); } - totalCategories += ordinals.length; - } else { - assertTrue("only document 0 should have a payload", ordinals.length == 0); } } assertEquals("Wrong number of total categories!", 2, totalCategories); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java index aaafc9abd1e..0d7d3173aa1 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java @@ -22,6 +22,7 @@ import org.apache.lucene.facet.search.params.FacetRequest; import org.apache.lucene.facet.search.params.FacetSearchParams; 
import org.apache.lucene.facet.search.results.FacetResult; import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.index.AtomicReaderContext; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -132,8 +133,8 @@ public class TestCategoryListCache extends FacetTestBase { } } @Override - public boolean init() throws IOException { - return it.init(); + public boolean setNextReader(AtomicReaderContext context) throws IOException { + return it.setNextReader(context); } }; } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestStandardFacetsAccumulator.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestStandardFacetsAccumulator.java new file mode 100644 index 00000000000..db88c7359f3 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestStandardFacetsAccumulator.java @@ -0,0 +1,128 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.StringField; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.facet.util.AssertingCategoryListIterator; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
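The new TestStandardFacetsAccumulator that follows exercises segments that lack either content or categories. It pins the segment boundaries with NoMergePolicy so that each pair of committed documents stays its own segment. A condensed sketch of that setup, written inside a LuceneTestCase subclass since it relies on the test framework's helpers:

```java
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

public class SegmentPinningSketch extends LuceneTestCase {
  // Each commit becomes its own segment, so a six-commit index keeps six
  // segments: some with categories but no content, some the reverse.
  IndexWriter segmentPinnedWriter(Directory dir) throws Exception {
    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    iwc.setMergePolicy(NoMergePolicy.COMPOUND_FILES); // prevent merges
    return new IndexWriter(dir, iwc);
  }
}
```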
+ */ + +public class TestStandardFacetsAccumulator extends LuceneTestCase { + + private void indexTwoDocs(IndexWriter indexWriter, FacetFields facetFields, boolean withContent) throws Exception { + for (int i = 0; i < 2; i++) { + Document doc = new Document(); + if (withContent) { + doc.add(new StringField("f", "a", Store.NO)); + } + if (facetFields != null) { + facetFields.addFields(doc, Collections.singletonList(new CategoryPath("A", Integer.toString(i)))); + } + indexWriter.addDocument(doc); + } + + indexWriter.commit(); + } + + @Test + public void testSegmentsWithoutCategoriesOrResults() throws Exception { + // tests the accumulator when there are segments with no results + Directory indexDir = newDirectory(); + Directory taxoDir = newDirectory(); + + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(NoMergePolicy.COMPOUND_FILES); // prevent merges + IndexWriter indexWriter = new IndexWriter(indexDir, iwc); + FacetIndexingParams fip = new FacetIndexingParams(new CategoryListParams() { + @Override + public CategoryListIterator createCategoryListIterator(int partition) throws IOException { + return new AssertingCategoryListIterator(super.createCategoryListIterator(partition)); + } + }); + TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + FacetFields facetFields = new FacetFields(taxoWriter, fip); + indexTwoDocs(indexWriter, facetFields, false); // 1st segment, no content, with categories + indexTwoDocs(indexWriter, null, true); // 2nd segment, with content, no categories + indexTwoDocs(indexWriter, facetFields, true); // 3rd segment ok + indexTwoDocs(indexWriter, null, false); // 4th segment, no content, or categories + indexTwoDocs(indexWriter, null, true); // 5th segment, with content, no categories + indexTwoDocs(indexWriter, facetFields, true); // 6th segment, with content, with categories + IOUtils.close(indexWriter, taxoWriter); + + DirectoryReader indexReader = DirectoryReader.open(indexDir); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + IndexSearcher indexSearcher = new IndexSearcher(indexReader); + + // search for "f:a", only segments 1 and 3 should match results + Query q = new TermQuery(new Term("f", "a")); + ArrayList requests = new ArrayList(1); + CountFacetRequest countNoComplements = new CountFacetRequest(new CategoryPath("A"), 10) { + @Override + public boolean supportsComplements() { + return false; // disable complements + } + }; + requests.add(countNoComplements); + FacetSearchParams fsp = new FacetSearchParams(requests, fip); + FacetsCollector fc = new FacetsCollector(fsp , indexReader, taxoReader); + indexSearcher.search(q, fc); + List results = fc.getFacetResults(); + assertEquals("received too many facet results", 1, results.size()); + FacetResultNode frn = results.get(0).getFacetResultNode(); + assertEquals("wrong weight for \"A\"", 4, (int) frn.getValue()); + assertEquals("wrong number of children", 2, frn.getNumSubResults()); + for (FacetResultNode node : frn.getSubResults()) { + assertEquals("wrong weight for child " + node.getLabel(), 2, (int) node.getValue()); + } + IOUtils.close(indexReader, taxoReader); + + IOUtils.close(indexDir, taxoDir); + } + +} diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java index 404ac5f54e8..526505190a4 100644 --- 
a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java @@ -17,6 +17,7 @@ import org.apache.lucene.facet.taxonomy.TaxonomyWriter; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; import org.apache.lucene.facet.util.MultiCategoryListIterator; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; @@ -100,21 +101,24 @@ public class MultiCategoryListIteratorTest extends LuceneTestCase { clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams); iterators[i] = clCache.get(clp).iterator(0); // no partitions } else { - iterators[i] = new PayloadCategoryListIteraor(indexReader, clp.getTerm(), decoder); + iterators[i] = new PayloadCategoryListIteraor(clp.getTerm(), decoder); } } MultiCategoryListIterator cli = new MultiCategoryListIterator(iterators); - assertTrue("failed to init multi-iterator", cli.init()); - IntsRef ordinals = new IntsRef(); - int maxDoc = indexReader.maxDoc(); - for (int i = 0; i < maxDoc; i++) { - cli.getOrdinals(i, ordinals); - assertTrue("document " + i + " does not have categories", ordinals.length > 0); - for (int j = 0; j < ordinals.length; j++) { - CategoryPath cp = taxoReader.getPath(ordinals.ints[j]); - assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp); - if (cp.length == 2) { - assertEquals("invalid category for document " + i, i, Integer.parseInt(cp.components[1])); + for (AtomicReaderContext context : indexReader.leaves()) { + assertTrue("failed to init multi-iterator", cli.setNextReader(context)); + IntsRef ordinals = new IntsRef(); + final int maxDoc = context.reader().maxDoc(); + for (int i = 0; i < maxDoc; i++) { + cli.getOrdinals(i, ordinals); + assertTrue("document " + i + " does not have categories", ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + CategoryPath cp = taxoReader.getPath(ordinals.ints[j]); + assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp); + if (cp.length == 2) { + int globalDoc = i + context.docBase; + assertEquals("invalid category for document " + globalDoc, globalDoc, Integer.parseInt(cp.components[1])); + } } } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java b/lucene/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java index 008e8ab75cb..29398c58bb0 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java @@ -59,9 +59,8 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK { return res; } - protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler, - TaxonomyReader taxoReader, IndexReader indexReader, - FacetSearchParams searchParams); + protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader, + IndexReader indexReader, FacetSearchParams searchParams); /** * Try out faceted search with sampling enabled and complements either disabled or enforced @@ -89,7 +88,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK { // try several times in case of failure, because the test has a chance to fail // 
if the top K facets are not sufficiently common with the sample set - for (int nTrial=0; nTrial=RETRIES-1) { + if (nTrial >= RETRIES - 1) { throw e; // no more retries allowed, must fail } } @@ -119,14 +118,11 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK { assertSameResults(expected, sampledResults); } - private FacetsCollector samplingCollector( - final boolean complement, - final Sampler sampler, + private FacetsCollector samplingCollector(final boolean complement, final Sampler sampler, FacetSearchParams samplingSearchParams) { FacetsCollector samplingFC = new FacetsCollector(samplingSearchParams, indexReader, taxoReader) { @Override - protected FacetsAccumulator initFacetsAccumulator( - FacetSearchParams facetSearchParams, IndexReader indexReader, + protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams, IndexReader indexReader, TaxonomyReader taxonomyReader) { FacetsAccumulator acc = getSamplingAccumulator(sampler, taxonomyReader, indexReader, facetSearchParams); acc.setComplementThreshold(complement ? FacetsAccumulator.FORCE_COMPLEMENT : FacetsAccumulator.DISABLE_COMPLEMENT); @@ -144,12 +140,13 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK { samplingParams.setMinSampleSize((int) (100 * retryFactor)); samplingParams.setMaxSampleSize((int) (10000 * retryFactor)); samplingParams.setOversampleFactor(5.0 * retryFactor); + samplingParams.setSamplingThreshold(11000); //force sampling - samplingParams.setSamplingThreshold(11000); //force sampling Sampler sampler = useRandomSampler ? new RandomSampler(samplingParams, new Random(random().nextLong())) : new RepeatableSampler(samplingParams); assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs)); return sampler; } + } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/util/AssertingCategoryListIterator.java b/lucene/facet/src/test/org/apache/lucene/facet/util/AssertingCategoryListIterator.java new file mode 100644 index 00000000000..27139f3123f --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/util/AssertingCategoryListIterator.java @@ -0,0 +1,65 @@ +package org.apache.lucene.facet.util; + +import java.io.IOException; + +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.util.IntsRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link CategoryListIterator} which asserts that + * {@link #getOrdinals(int, IntsRef)} is not called before + * {@link #setNextReader(AtomicReaderContext)} and that if + * {@link #setNextReader(AtomicReaderContext)} returns false, + * {@link #getOrdinals(int, IntsRef)} isn't called. 
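The javadoc above states the calling contract the wrapper enforces. For reference, this is how the earlier TestStandardFacetsAccumulator wires the asserting wrapper in, by overriding the factory method on CategoryListParams (a sketch extracted from that test):

```java
import java.io.IOException;

import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.util.AssertingCategoryListIterator;

// Every iterator the facet code creates gets wrapped, so any getOrdinals()
// call made before setNextReader(), or after it returned false, trips an
// assertion instead of silently reading the wrong segment.
FacetIndexingParams fip = new FacetIndexingParams(new CategoryListParams() {
  @Override
  public CategoryListIterator createCategoryListIterator(int partition) throws IOException {
    return new AssertingCategoryListIterator(super.createCategoryListIterator(partition));
  }
});
```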
+ */ +public class AssertingCategoryListIterator implements CategoryListIterator { + + private final CategoryListIterator delegate; + private boolean setNextReaderCalled = false; + private boolean validSegment = false; + private int maxDoc; + + public AssertingCategoryListIterator(CategoryListIterator delegate) { + this.delegate = delegate; + } + + @Override + public boolean setNextReader(AtomicReaderContext context) throws IOException { + setNextReaderCalled = true; + maxDoc = context.reader().maxDoc(); + return validSegment = delegate.setNextReader(context); + } + + @Override + public void getOrdinals(int docID, IntsRef ints) throws IOException { + if (!setNextReaderCalled) { + throw new RuntimeException("should not call getOrdinals without setNextReader first"); + } + if (!validSegment) { + throw new RuntimeException("should not call getOrdinals if setNextReader returned false"); + } + if (docID >= maxDoc) { + throw new RuntimeException("docID is larger than current maxDoc; forgot to call setNextReader?"); + } + delegate.getOrdinals(docID, ints); + } + +} diff --git a/lucene/facet/src/test/org/apache/lucene/facet/util/TestScoredDocIDsUtils.java b/lucene/facet/src/test/org/apache/lucene/facet/util/TestScoredDocIDsUtils.java index cebb2333682..3ae661521d9 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/util/TestScoredDocIDsUtils.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/util/TestScoredDocIDsUtils.java @@ -9,6 +9,9 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; +import org.apache.lucene.facet.search.ScoredDocIdCollector; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; @@ -21,14 +24,9 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.OpenBitSet; -import org.apache.lucene.util.OpenBitSetDISI; -import org.junit.Test; - +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.facet.search.ScoredDocIDs; -import org.apache.lucene.facet.search.ScoredDocIDsIterator; -import org.apache.lucene.facet.search.ScoredDocIdCollector; +import org.junit.Test; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -52,21 +50,21 @@ public class TestScoredDocIDsUtils extends LuceneTestCase { @Test public void testComplementIterator() throws Exception { final int n = atLeast(10000); - final OpenBitSet bits = new OpenBitSet(n); - for (int i = 0; i < 5 * n; i++) { - bits.flip(random().nextInt(n)); + final FixedBitSet bits = new FixedBitSet(n); + Random random = random(); + for (int i = 0; i < n; i++) { + int idx = random.nextInt(n); + bits.flip(idx, idx + 1); } - OpenBitSet verify = new OpenBitSet(n); - verify.or(bits); + FixedBitSet verify = new FixedBitSet(bits); ScoredDocIDs scoredDocIDs = ScoredDocIdsUtils.createScoredDocIds(bits, n); Directory dir = newDirectory(); - IndexReader reader = createReaderWithNDocs(random(), n, dir); + IndexReader reader = createReaderWithNDocs(random, n, dir); try { - assertEquals(n - verify.cardinality(), ScoredDocIdsUtils.getComplementSet(scoredDocIDs, - reader).size()); + assertEquals(n - 
verify.cardinality(), ScoredDocIdsUtils.getComplementSet(scoredDocIDs, reader).size()); } finally { reader.close(); dir.close(); @@ -147,7 +145,7 @@ public class TestScoredDocIDsUtils extends LuceneTestCase { searcher.search(q, collector); ScoredDocIDs scoredDocIds = collector.getScoredDocIDs(); - OpenBitSet resultSet = new OpenBitSetDISI(scoredDocIds.getDocIDs().iterator(), reader.maxDoc()); + FixedBitSet resultSet = (FixedBitSet) scoredDocIds.getDocIDs(); // Getting the complement set of the query result ScoredDocIDs complementSet = ScoredDocIdsUtils.getComplementSet(scoredDocIds, reader); @@ -164,12 +162,11 @@ public class TestScoredDocIDsUtils extends LuceneTestCase { assertFalse( "Complement-Set must not contain deleted documents (doc="+docNum+")", live != null && !live.get(docNum)); - assertNull( - "Complement-Set must not contain docs from the original set (doc="+ docNum+")", + assertNull("Complement-Set must not contain docs from the original set (doc="+ docNum+")", reader.document(docNum).getField("del")); assertFalse( "Complement-Set must not contain docs from the original set (doc="+docNum+")", - resultSet.fastGet(docNum)); + resultSet.get(docNum)); } } finally { reader.close(); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java index 518c14234f9..e2309879996 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java @@ -24,6 +24,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.*; +import org.apache.lucene.util.packed.PackedInts; /** * Finite state automata based implementation of "autocomplete" functionality. 
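Both call sites below adapt to the expanded Builder constructor, which per the LUCENE-4677/4678 entries now takes a PackedInts acceptable-overhead ratio and a page-size exponent for the paged byte[] store. A hedged sketch of the new invocation; the parameter comments reflect a reading of the 4.x Builder javadoc rather than anything stated in this patch:

```java
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.packed.PackedInts;

PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(
    FST.INPUT_TYPE.BYTE1,
    0, 0,               // minSuffixCount1, minSuffixCount2
    true, true,         // share suffixes / non-singleton nodes
    Integer.MAX_VALUE,  // shareMaxTailLength
    outputs,
    null,               // freezeTail
    false,              // doPackFST ("willRewrite" in FSTTester)
    PackedInts.DEFAULT, // acceptable overhead for packed node arrays
    true,               // allowArrayArcs
    15);                // bytes page bits: 2^15 = 32 KB blocks (LUCENE-4678)
```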
@@ -237,7 +238,8 @@ public class FSTCompletionBuilder { final Object empty = outputs.getNoOutput(); final Builder builder = new Builder( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, - shareMaxTailLength, outputs, null, false, true); + shareMaxTailLength, outputs, null, false, + PackedInts.DEFAULT, true, 15); BytesRef scratch = new BytesRef(); BytesRef entry; diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java index 2ff3ee4491c..4d3924d667d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java @@ -40,6 +40,7 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.packed.PackedInts; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -288,7 +289,16 @@ public class FSTTester { outputs, null, willRewrite, - true); + PackedInts.DEFAULT, + true, + 15); + if (LuceneTestCase.VERBOSE) { + if (willRewrite) { + System.out.println("TEST: packed FST"); + } else { + System.out.println("TEST: non-packed FST"); + } + } for(InputOutput pair : pairs) { if (pair.output instanceof List) { diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 5c33e2a634d..1578a75a5ec 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -41,8 +41,6 @@ Detailed Change List Other Changes ---------------------- -* SOLR-3735: Relocate the example mime-to-extension mapping, and - upgrade Velocity Engine to 1.7 (ehatcher) ================== 4.1.0 ================== @@ -50,14 +48,14 @@ Versions of Major Components --------------------- Apache Tika 1.2 Carrot2 3.6.2 -Velocity 1.6.4 and Velocity Tools 2.0 +Velocity 1.7 and Velocity Tools 2.0 Apache UIMA 2.3.1 Apache ZooKeeper 3.4.5 Upgrading from Solr 4.0.0 ---------------------- -Custom java parsing plugins need to migrade from throwing the internal +Custom java parsing plugins need to migrate from throwing the internal ParseException to throwing SyntaxError. BaseDistributedSearchTestCase now randomizes the servlet context it uses when @@ -150,7 +148,7 @@ New Features CoreAdmin API the same way as the data directory. (Mark Miller) * SOLR-4028: When using ZK chroot, it would be nice if Solr would create the - initial path when it doesn't exist. (Tomas Fernandez Lobbe via Mark Miller) + initial path when it doesn't exist. (Tomás Fernández Löbbe via Mark Miller) * SOLR-3948: Calculate/display deleted documents in admin interface. (Shawn Heisey via Mark Miller) @@ -209,6 +207,9 @@ New Features * SOLR-2201: DIH's "formatDate" function now supports a timezone as an optional fourth parameter (James Dyer, Mark Waddle) +* SOLR-4302: New parameter 'indexInfo' (defaults to true) in CoreAdmin STATUS + command can be used to omit index specific information (Shahar Davidson via shalin) + Optimizations ---------------------- @@ -226,12 +227,12 @@ Optimizations dynamicField's (steffkes) * SOLR-3941: The "commitOnLeader" part of distributed recovery can use - openSearcher=false. (Tomas Fernandez Lobbe via Mark Miller) + openSearcher=false. (Tomás Fernández Löbbe via Mark Miller) * SOLR-4063: Allow CoreContainer to load multiple SolrCores in parallel rather than just serially. 
(Mark Miller) -* SOLR-4199: When doing zk retries due to connectionloss, rather than just +* SOLR-4199: When doing zk retries due to connection loss, rather than just retrying for 2 minutes, retry in proportion to the session timeout. (Mark Miller) @@ -250,6 +251,10 @@ Optimizations * SOLR-3982: Admin UI: Various Dataimport Improvements (steffkes) +* SOLR-4296: Admin UI: Improve Dataimport Auto-Refresh (steffkes) + +* SOLR-3458: Allow multiple Items to stay open on Plugins-Page (steffkes) + Bug Fixes ---------------------- @@ -362,7 +367,7 @@ Bug Fixes * SOLR-4081: QueryParsing.toString, used during debugQuery=true, did not correctly handle ExtendedQueries such as WrappedQuery - (used when cache=false), spatial queries, and frange queires. + (used when cache=false), spatial queries, and frange queries. (Eirik Lygre, yonik) * SOLR-3959: Ensure the internal comma separator of poly fields is escaped @@ -403,7 +408,7 @@ Bug Fixes * SOLR-4162: ZkCli usage examples are not correct because the zkhost parameter is not present and it is mandatory for all commands. - (Tomas Fernandez Lobbe via Mark Miller) + (Tomás Fernández Löbbe via Mark Miller) * SOLR-4071: Validate that name is pass to Collections API create, and behave the same way as on startup when collection.configName is not explicitly passed. @@ -495,7 +500,7 @@ Bug Fixes * SOLR-4279: Wrong exception message if _version_ field is multivalued (shalin) * SOLR-4170: The 'backup' ReplicationHandler command can sometimes use a stale - index directory rather than the current one. (Mark Miller, Marcin Rzewuck) + index directory rather than the current one. (Mark Miller, Marcin Rzewucki) * SOLR-3876: Solr Admin UI is completely dysfunctional on IE 9 (steffkes) @@ -503,6 +508,17 @@ Bug Fixes import works fine with SolrCloud clusters (Deniz Durmus, James Dyer, Erick Erickson, shalin) +* SOLR-4291: Harden the Overseer work queue thread loop. (Mark Miller) + +* SOLR-3820: Solr Admin Query form is missing some edismax request parameters + (steffkes) + +* SOLR-4217: post.jar no longer ignores -Dparams when -Durl is used. + (Alexandre Rafalovitch, ehatcher) + +* SOLR-4303: On replication, if the generation of the master is lower than the + slave we need to force a full copy of the index. (Mark Miller, Gregg Donovan) + Other Changes ---------------------- @@ -580,6 +596,16 @@ Other Changes * SOLR-4208: ExtendedDismaxQParserPlugin has been refactored to make subclassing easier. (Tomás Fernández Löbbe, hossman) +* SOLR-3735: Relocate the example mime-to-extension mapping, and + upgrade Velocity Engine to 1.7 (ehatcher) + +* SOLR-4287: Removed "apache-" prefix from Solr distribution and artifact + filenames. (Ryan Ernst, Robert Muir, Steve Rowe) + +* SOLR-4016: Deduplication does not work with atomic/partial updates so + disallow atomic update requests which change signature generating fields. + (Joel Nothman, yonik, shalin) + ================== 4.0.0 ================== Versions of Major Components @@ -862,7 +888,7 @@ Bug Fixes * SOLR-3527: SolrCmdDistributor drops some of the important commit attributes (maxOptimizeSegments, softCommit, expungeDeletes) when sending a commit to - replicas. (Andy Laird, Tomas Fernandez Lobbe, Mark Miller) + replicas. (Andy Laird, Tomás Fernández Löbbe, Mark Miller) * SOLR-3844: SolrCore reload can fail because it tries to remove the index write lock while already holding it. 
(Mark Miller) @@ -1273,7 +1299,7 @@ New Features * SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now supports "percentages" which get evaluated relative the current size of the cache when warming happens. - (Tomas Fernandez Lobbe and hossman) + (Tomás Fernández Löbbe and hossman) * SOLR-1932: New relevancy function queries: termfreq, tf, docfreq, idf norm, maxdoc, numdocs. (yonik) @@ -1644,12 +1670,12 @@ Bug Fixes down to it via acceptDocs since LUCENE-1536. (Mike Hugo, yonik) * SOLR-3214: If you use multiple fl entries rather than a comma separated list, all but the first - entry can be ignored if you are using distributed search. (Tomas Fernandez Lobbe via Mark Miller) + entry can be ignored if you are using distributed search. (Tomás Fernández Löbbe via Mark Miller) * SOLR-3352: eDismax: pf2 should kick in for a query with 2 terms (janhoy) * SOLR-3361: ReplicationHandler "maxNumberOfBackups" doesn't work if backups are triggered on commit - (James Dyer, Tomas Fernandez Lobbe) + (James Dyer, Tomás Fernández Löbbe) * SOLR-2605: fixed tracking of the 'defaultCoreName' in CoreContainer so that CoreAdminHandler could return consistent information regardless of wether @@ -1864,7 +1890,17 @@ Documentation * SOLR-2232: Improved README info on solr.solr.home in examples (Eric Pugh and hossman) - + +================== 3.6.2 ================== + +Bug Fixes +---------------------- +* SOLR-3790: ConcurrentModificationException could be thrown when using hl.fl=*. + (yonik, koji) + +* SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token. + (Tom Burton-West, Robert Muir) + ================== 3.6.1 ================== More information about this release, including any errata related to the release notes, upgrade instructions, or other changes may be found online at: @@ -1877,7 +1913,7 @@ Bug Fixes (Uwe Schindler, Mike McCandless, Robert Muir) * SOLR-3361: ReplicationHandler "maxNumberOfBackups" doesn't work if backups are triggered on commit - (James Dyer, Tomas Fernandez Lobbe) + (James Dyer, Tomás Fernández Löbbe) * SOLR-3375: Fix charset problems with HttpSolrServer (Roger Håkansson, yonik, siren) diff --git a/solr/README.txt b/solr/README.txt index 6d23cea0166..7b5ec4790c8 100644 --- a/solr/README.txt +++ b/solr/README.txt @@ -45,11 +45,11 @@ example/ Please see example/README.txt for information about running this example. -dist/apache-solr-XX.war +dist/solr-XX.war The Apache Solr Application. Deploy this WAR file to any servlet container to run Apache Solr. -dist/apache-solr--XX.jar +dist/solr--XX.jar The Apache Solr libraries. To compile Apache Solr Plugins, one or more of these will be required. The core library is required at a minimum. (see http://wiki.apache.org/solr/SolrPlugins diff --git a/solr/common-build.xml b/solr/common-build.xml index 77925bec746..f56e96bb1b7 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -25,7 +25,7 @@ - + @@ -114,7 +114,7 @@ - + @@ -214,13 +214,13 @@ - + - + diff --git a/solr/contrib/uima/README.txt b/solr/contrib/uima/README.txt index 9a862b7e7d1..d32063bc08d 100644 --- a/solr/contrib/uima/README.txt +++ b/solr/contrib/uima/README.txt @@ -19,7 +19,7 @@ To start using Solr UIMA Metadata Extraction Library you should go through the f - + 2. 
modify your schema.xml adding the fields you want to be hold metadata specifying proper values for type, indexed, stored and multiValued options: diff --git a/solr/contrib/uima/src/test-files/uima/solr/collection1/conf/solrconfig.xml b/solr/contrib/uima/src/test-files/uima/solr/collection1/conf/solrconfig.xml index 1a7109d7519..ebd7903b9d3 100644 --- a/solr/contrib/uima/src/test-files/uima/solr/collection1/conf/solrconfig.xml +++ b/solr/contrib/uima/src/test-files/uima/solr/collection1/conf/solrconfig.xml @@ -44,8 +44,8 @@ in that directory which completely match the regex (anchored on both ends) will be included. --> - - + + - - + + + + true + non_indexed_signature_sS + false + v_t,t_field + org.apache.solr.update.processor.TextProfileSignature + + + + diff --git a/solr/core/src/test/org/apache/solr/core/AlternateDirectoryTest.java b/solr/core/src/test/org/apache/solr/core/AlternateDirectoryTest.java index 707d42de4cd..4ec7df041bd 100755 --- a/solr/core/src/test/org/apache/solr/core/AlternateDirectoryTest.java +++ b/solr/core/src/test/org/apache/solr/core/AlternateDirectoryTest.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.store.Directory; import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.core.DirectoryFactory.DirContext; import org.junit.BeforeClass; import org.junit.Test; @@ -46,7 +47,7 @@ public class AlternateDirectoryTest extends SolrTestCaseJ4 { public static volatile Directory dir; @Override - public Directory create(String path) throws IOException { + public Directory create(String path, DirContext dirContext) throws IOException { openCalled = true; return dir = newFSDirectory(new File(path)); diff --git a/solr/core/src/test/org/apache/solr/core/RAMDirectoryFactoryTest.java b/solr/core/src/test/org/apache/solr/core/RAMDirectoryFactoryTest.java index ed0e41d7848..eb7c14d8eb5 100644 --- a/solr/core/src/test/org/apache/solr/core/RAMDirectoryFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/core/RAMDirectoryFactoryTest.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.solr.core.DirectoryFactory.DirContext; /** * Test-case for RAMDirectoryFactory @@ -37,13 +38,13 @@ public class RAMDirectoryFactoryTest extends LuceneTestCase { final Directory directory = new RAMDirectory(); RAMDirectoryFactory factory = new RAMDirectoryFactory() { @Override - protected Directory create(String path) { + protected Directory create(String path, DirContext dirContext) { return directory; } }; String path = "/fake/path"; - Directory dir1 = factory.get(path, null); - Directory dir2 = factory.get(path, null); + Directory dir1 = factory.get(path, DirContext.DEFAULT, null); + Directory dir2 = factory.get(path, DirContext.DEFAULT, null); assertEquals("RAMDirectoryFactory should not create new instance of RefCntRamDirectory " + "every time open() is called for the same path", dir1, dir2); @@ -53,7 +54,7 @@ public class RAMDirectoryFactoryTest extends LuceneTestCase { private void dotestOpenSucceedForEmptyDir() throws IOException { RAMDirectoryFactory factory = new RAMDirectoryFactory(); - Directory dir = factory.get("/fake/path", null); + Directory dir = factory.get("/fake/path", DirContext.DEFAULT, null); assertNotNull("RAMDirectoryFactory should create RefCntRamDirectory even if the path doen't lead " + "to index directory on the file system", dir); 
factory.release(dir); diff --git a/solr/core/src/test/org/apache/solr/update/processor/SignatureUpdateProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/SignatureUpdateProcessorFactoryTest.java index a9b510b7965..7003ce098d2 100755 --- a/solr/core/src/test/org/apache/solr/update/processor/SignatureUpdateProcessorFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/SignatureUpdateProcessorFactoryTest.java @@ -64,7 +64,7 @@ public class SignatureUpdateProcessorFactoryTest extends SolrTestCaseJ4 { chain = "dedupe"; // set the default that most tests expect } - void checkNumDocs(int n) { + static void checkNumDocs(int n) { SolrQueryRequest req = req(); try { assertEquals(n, req.getSearcher().getIndexReader().numDocs()); @@ -353,7 +353,11 @@ public class SignatureUpdateProcessorFactoryTest extends SolrTestCaseJ4 { } } - private void addDoc(String doc) throws Exception { + private void addDoc(String doc) throws Exception { + addDoc(doc, chain); + } + + static void addDoc(String doc, String chain) throws Exception { Map params = new HashMap(); MultiMapSolrParams mmparams = new MultiMapSolrParams(params); params.put(UpdateParams.UPDATE_CHAIN, new String[] { chain }); diff --git a/solr/core/src/test/org/apache/solr/update/processor/TestPartialUpdateDeduplication.java b/solr/core/src/test/org/apache/solr/update/processor/TestPartialUpdateDeduplication.java new file mode 100644 index 00000000000..c4cf5115c18 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/TestPartialUpdateDeduplication.java @@ -0,0 +1,74 @@ +package org.apache.solr.update.processor; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
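The two directory-factory tests above adjust to the new DirContext argument threaded through DirectoryFactory.create() and get(). A minimal sketch of the updated calls, passing DirContext.DEFAULT and a null third argument exactly as the tests do:

```java
import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.solr.core.DirectoryFactory;
import org.apache.solr.core.DirectoryFactory.DirContext;

// get() gained a DirContext parameter. Factories refcount the directories
// they hand out, so each get() is paired with release(), not close().
static void useDirectory(DirectoryFactory factory, String path) throws IOException {
  Directory dir = factory.get(path, DirContext.DEFAULT, null);
  try {
    // ... read or write via dir ...
  } finally {
    factory.release(dir);
  }
}
```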
+ */ + +import com.google.common.collect.Maps; +import org.apache.noggit.ObjectBuilder; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.core.SolrCore; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.List; +import java.util.Map; + +import static org.apache.solr.update.processor.SignatureUpdateProcessorFactoryTest.addDoc; + +public class TestPartialUpdateDeduplication extends SolrTestCaseJ4 { + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tlog.xml", "schema15.xml"); + } + + @Test + public void testPartialUpdates() throws Exception { + SignatureUpdateProcessorFactoryTest.checkNumDocs(0); + String chain = "dedupe"; + // partial update + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("id", "2a"); + Map map = Maps.newHashMap(); + map.put("set", "Hello Dude man!"); + doc.addField("v_t", map); + UpdateRequest req = new UpdateRequest(); + req.add(doc); + boolean exception_ok = false; + try { + addDoc(req.getXML(), chain); + } catch (Exception e) { + exception_ok = true; + } + assertTrue("Should have gotten an exception with partial update on signature generating field", + exception_ok); + + SignatureUpdateProcessorFactoryTest.checkNumDocs(0); + addDoc(adoc("id", "2a", "v_t", "Hello Dude man!", "name", "ali babi'"), chain); + doc = new SolrInputDocument(); + doc.addField("id", "2a"); + map = Maps.newHashMap(); + map.put("set", "name changed"); + doc.addField("name", map); + req = new UpdateRequest(); + req.add(doc); + addDoc(req.getXML(), chain); + addDoc(commit(), chain); + SignatureUpdateProcessorFactoryTest.checkNumDocs(1); + } +} diff --git a/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java b/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java index 863c1690747..0cb7d74de99 100644 --- a/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java +++ b/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java @@ -56,6 +56,7 @@ public class SimplePostToolTest extends SolrTestCaseJ4 { t_web = SimplePostTool.parseArgsAndInit(args); System.setProperty("params", "param1=foo¶m2=bar"); + System.setProperty("url", "http://localhost:5150/solr/update"); t_test = SimplePostTool.parseArgsAndInit(args); pf = new MockPageFetcher(); @@ -76,7 +77,7 @@ public class SimplePostToolTest extends SolrTestCaseJ4 { assertEquals(1, t_web.recursive); assertEquals(10, t_web.delay); - assertNotNull(t_test.solrUrl); + assertEquals("http://localhost:5150/solr/update?param1=foo¶m2=bar",t_test.solrUrl.toExternalForm()); } @Test diff --git a/solr/example/example-DIH/solr/db/conf/solrconfig.xml b/solr/example/example-DIH/solr/db/conf/solrconfig.xml index 4a569144856..d1463b95992 100644 --- a/solr/example/example-DIH/solr/db/conf/solrconfig.xml +++ b/solr/example/example-DIH/solr/db/conf/solrconfig.xml @@ -28,7 +28,7 @@ - + diff --git a/solr/example/example-DIH/solr/mail/conf/solrconfig.xml b/solr/example/example-DIH/solr/mail/conf/solrconfig.xml index 23194a39845..36d23c0466a 100644 --- a/solr/example/example-DIH/solr/mail/conf/solrconfig.xml +++ b/solr/example/example-DIH/solr/mail/conf/solrconfig.xml @@ -34,7 +34,7 @@ - + diff --git a/solr/example/example-DIH/solr/rss/conf/solrconfig.xml b/solr/example/example-DIH/solr/rss/conf/solrconfig.xml index 8a83e19c97b..01e1967feb1 100644 --- a/solr/example/example-DIH/solr/rss/conf/solrconfig.xml +++ 
b/solr/example/example-DIH/solr/rss/conf/solrconfig.xml @@ -28,7 +28,7 @@ - + diff --git a/solr/example/example-DIH/solr/solr/conf/solrconfig.xml b/solr/example/example-DIH/solr/solr/conf/solrconfig.xml index be0d4ede42b..b90d13f60c5 100644 --- a/solr/example/example-DIH/solr/solr/conf/solrconfig.xml +++ b/solr/example/example-DIH/solr/solr/conf/solrconfig.xml @@ -28,7 +28,7 @@ - + diff --git a/solr/example/example-DIH/solr/tika/conf/solrconfig.xml b/solr/example/example-DIH/solr/tika/conf/solrconfig.xml index 91a97104a90..2ad4c09f5c5 100644 --- a/solr/example/example-DIH/solr/tika/conf/solrconfig.xml +++ b/solr/example/example-DIH/solr/tika/conf/solrconfig.xml @@ -34,7 +34,7 @@ - + diff --git a/solr/example/solr/collection1/conf/solrconfig.xml b/solr/example/solr/collection1/conf/solrconfig.xml index cee2e1609c3..41c8304d348 100755 --- a/solr/example/solr/collection1/conf/solrconfig.xml +++ b/solr/example/solr/collection1/conf/solrconfig.xml @@ -70,16 +70,16 @@ with their external dependencies. --> - + - + - + - + @@ -64,7 +64,7 @@ - diff --git a/solr/webapp/web/css/styles/dataimport.css b/solr/webapp/web/css/styles/dataimport.css index 712d137226f..21732b61809 100644 --- a/solr/webapp/web/css/styles/dataimport.css +++ b/solr/webapp/web/css/styles/dataimport.css @@ -139,14 +139,14 @@ display: none !important; } -#content #dataimport #current_state .time, +#content #dataimport #current_state .last_update, #content #dataimport #current_state .info { display: block; padding-left: 21px; } -#content #dataimport #current_state .time +#content #dataimport #current_state .last_update { color: #c0c0c0; font-size: 11px; diff --git a/solr/webapp/web/css/styles/query.css b/solr/webapp/web/css/styles/query.css index ceeeacd9743..0547e2ca101 100644 --- a/solr/webapp/web/css/styles/query.css +++ b/solr/webapp/web/css/styles/query.css @@ -26,16 +26,22 @@ width: 100%; } +#content #query #form input, +#content #query #form textarea +{ + width: 98%; +} + #content #query #form #start { float: left; - margin-right: 2%; - width: 49%; + width: 45%; } #content #query #form #rows { - width: 49%; + float: right; + width: 45%; } #content #query #form .checkbox input diff --git a/solr/webapp/web/js/scripts/dataimport.js b/solr/webapp/web/js/scripts/dataimport.js index 15a65dd8fdf..e70780a2ed0 100644 --- a/solr/webapp/web/js/scripts/dataimport.js +++ b/solr/webapp/web/js/scripts/dataimport.js @@ -431,7 +431,6 @@ sammy.get success : function( response, text_status, xhr ) { var state_element = $( '#current_state', content_element ); - var time_element = $( '.time', state_element ); var status = response.status; var rollback_time = response.statusMessages.Rolledback || null; @@ -448,30 +447,64 @@ sammy.get function dataimport_compute_details( response, details_element, elapsed_seconds ) { - var config = { + details_element + .show(); + + // -- + + var document_config = { 'Requests' : 'Total Requests made to DataSource', 'Fetched' : 'Total Rows Fetched', 'Skipped' : 'Total Documents Skipped', 'Processed' : 'Total Documents Processed' }; - var details = []; - for( var key in config ) + var document_details = []; + for( var key in document_config ) { - var value = parseInt( response.statusMessages[config[key]], 10 ); + var value = parseInt( response.statusMessages[document_config[key]], 10 ); - var detail = '' + key.esc() + ': ' + format_number( value ).esc(); + var detail = '' + key.esc() + ': ' + format_number( value ).esc(); if( elapsed_seconds && 'skipped' !== key.toLowerCase() ) { detail += ' (' + format_number( 
Math.round( value / elapsed_seconds ) ).esc() + '/s)' } - details.push( detail ); + document_details.push( detail ); }; - details_element - .html( details.join( ', ' ) ) - .show(); + $( '.docs', details_element ) + .html( document_details.join( ', ' ) ); + + // -- + + var dates_config = { + 'Started' : 'Full Dump Started', + 'Aborted' : 'Aborted', + 'Rolledback' : 'Rolledback' + }; + + var dates_details = []; + for( var key in dates_config ) + { + var value = response.statusMessages[dates_config[key]]; + + if( value ) + { + var detail = '' + key.esc() + ': ' + + '' + value.esc() + ''; + dates_details.push( detail ); + } + }; + + var dates_element = $( '.dates', details_element ); + + dates_element + .html( dates_details.join( ', ' ) ); + + $( '.time', dates_element ) + .removeData( 'timeago' ) + .timeago(); }; var get_time_taken = function get_default_time_taken() @@ -524,22 +557,14 @@ sammy.get ); }; - var set_time = function set_time( time_text ) - { - time_element - .text( time_text ) - .removeData( 'timeago' ) - .timeago() - .show(); - } - state_element .removeAttr( 'class' ); - time_element - .empty() - .hide(); - + var current_time = new Date(); + $( '.last_update abbr', state_element ) + .text( current_time.toTimeString().split( ' ' ).shift() ) + .attr( 'title', current_time.toUTCString() ); + $( '.info', state_element ) .removeClass( 'loader' ); @@ -563,26 +588,12 @@ sammy.get : 'Indexing ...'; show_full_info( info_text, elapsed_seconds ); - - if( !app.timeout && autorefresh_status ) - { - app.timeout = window.setTimeout - ( - function() - { - dataimport_fetch_status( true ) - }, - dataimport_timeout - ); - } } else if( rollback_time ) { state_element .addClass( 'failure' ); - set_time( rollback_time ); - show_full_info(); } else if( abort_time ) @@ -590,8 +601,6 @@ sammy.get state_element .addClass( 'aborted' ); - set_time( abort_time ); - show_full_info( 'Aborting current Import ...' ); } else if( 'idle' === status && 0 !== messages_count ) @@ -599,12 +608,6 @@ sammy.get state_element .addClass( 'success' ); - var started_at = response.statusMessages['Full Dump Started']; - if( started_at ) - { - set_time( started_at ); - } - show_full_info(); } else @@ -625,6 +628,18 @@ sammy.get $( '#raw_output_container', content_element ).html( code ); hljs.highlightBlock( code.get(0) ); + + if( !app.timeout && autorefresh_status ) + { + app.timeout = window.setTimeout + ( + function() + { + dataimport_fetch_status( true ) + }, + dataimport_timeout + ); + } }, error : function( xhr, text_status, error_thrown ) { diff --git a/solr/webapp/web/js/scripts/plugins.js b/solr/webapp/web/js/scripts/plugins.js index 739c65e6921..e8b94f62d5b 100644 --- a/solr/webapp/web/js/scripts/plugins.js +++ b/solr/webapp/web/js/scripts/plugins.js @@ -206,7 +206,7 @@ var render_plugin_data = function( plugin_data, plugin_sort, types ) } content += '
  • ' + "\n"; - content += ''; + content += ''; content += '' + bean.esc() + ''; content += '' + "\n"; content += '
      ' + "\n"; @@ -279,8 +279,48 @@ var render_plugin_data = function( plugin_data, plugin_sort, types ) frame_element .html( content ); - $( 'a[href="' + decodeURIComponent( active_context.path ) + '"]', frame_element ) - .parent().addClass( 'expanded' ); + + var path = active_context.path.split( '?entry=' ); + var entries = ( path[1] || '' ).split( ',' ); + + var entry_count = entries.length; + for( var i = 0; i < entry_count; i++ ) + { + $( 'a[data-bean="' + entries[i] + '"]', frame_element ) + .parent().addClass( 'expanded' ); + } + + $( 'a', frame_element ) + .off( 'click' ) + .on + ( + 'click', + function( event ) + { + var self = $( this ); + var bean = self.data( 'bean' ); + + var split = '?entry='; + var path = active_context.path.split( split ); + var entry = ( path[1] || '' ); + + var regex = new RegExp( bean.replace( /\//g, '\\/' ) + '(,|$)' ); + var match = regex.test( entry ); + + var url = path[0] + split; + + url += match + ? entry.replace( regex, '' ) + : entry + ',' + bean; + + url = url.replace( /=,/, '=' ); + url = url.replace( /,$/, '' ); + url = url.replace( /\?entry=$/, '' ); + + active_context.redirect( url ); + return false; + } + ); // Try to make links for anything with http (but leave the rest alone) $( '.detail dd' ).each(function(index) { diff --git a/solr/webapp/web/js/scripts/query.js b/solr/webapp/web/js/scripts/query.js index 78b9f6f4202..aefb4ba76ca 100644 --- a/solr/webapp/web/js/scripts/query.js +++ b/solr/webapp/web/js/scripts/query.js @@ -113,27 +113,38 @@ sammy.get 'submit', function( event ) { - var form_map = {}; var form_values = []; - var all_form_values = query_form.formToArray(); - - for( var i = 0; i < all_form_values.length; i++ ) + + var add_to_form_values = function add_to_form_values( fields ) { - if( !all_form_values[i].value || 0 === all_form_values[i].value.length ) - { - continue; - } + for( var i in fields ) + { + if( !fields[i].value || 0 === fields[i].value.length ) + { + continue; + } + + form_values.push( fields[i] ); + } + }; + + var fieldsets = $( '> fieldset', query_form ); + + var fields = fieldsets.first().formToArray(); + add_to_form_values( fields ); - var name_parts = all_form_values[i].name.split( '.' ); - if( 1 < name_parts.length && !form_map[name_parts[0]] ) - { - console.debug( 'skip "' + all_form_values[i].name + '", parent missing' ); - continue; - } - - form_map[all_form_values[i].name] = all_form_values[i].value; - form_values.push( all_form_values[i] ); - } + fieldsets.not( '.common' ) + .each + ( + function( i, set ) + { + if( $( 'legend input', set ).is( ':checked' ) ) + { + var fields = $( set ).formToArray(); + add_to_form_values( fields ); + } + } + ); var handler_path = $( '#qt', query_form ).val(); if( '/' !== handler_path[0] ) @@ -144,7 +155,13 @@ sammy.get var query_url = window.location.protocol + '//' + window.location.host + core_basepath + handler_path + '?' + $.param( form_values ); - + + var custom_parameters = $( '#custom_parameters', query_form ).val(); + if( custom_parameters && 0 !== custom_parameters.length ) + { + query_url += '&' + custom_parameters.replace( /^&/, '' ); + } + url_element .attr( 'href', query_url ) .text( query_url ) diff --git a/solr/webapp/web/tpl/dataimport.html b/solr/webapp/web/tpl/dataimport.html index 6fd9af12547..17f3e1da0ee 100644 --- a/solr/webapp/web/tpl/dataimport.html +++ b/solr/webapp/web/tpl/dataimport.html @@ -22,11 +22,14 @@ limitations under the License.
[The body of this dataimport.html hunk lost its markup during extraction; what survives is the new status line, "Last Update: Unknown", plus the added containers that dataimport.js now fills with the docs and dates details.]
      diff --git a/solr/webapp/web/tpl/query.html b/solr/webapp/web/tpl/query.html index c12be43fe40..1bcce1f5c56 100644 --- a/solr/webapp/web/tpl/query.html +++ b/solr/webapp/web/tpl/query.html @@ -48,8 +48,10 @@ limitations under the License. start, rows - - +
[Markup stripped during extraction. Judging from the query.css and query.js hunks above, the remaining rows of the @@ -48,8 +48,10 @@ hunk rework the start/rows inputs, and the @@ -160,6 +167,34 @@ hunk adds the custom_parameters field and the collapsible per-feature fieldsets.]
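Stepping back to the plugins.js change above (SOLR-3458): the page now keeps the set of expanded plugin entries as a comma-separated list in the ?entry= URL parameter and flips the clicked bean's membership. The same logic transliterated to Java purely for illustration; the shipped code is the JavaScript shown earlier:

```java
import java.util.regex.Pattern;

public final class EntryToggle {
  // Mirror of the plugins.js click handler: add the bean to the ?entry=
  // list if absent, remove it if present, then tidy stray separators.
  static String toggle(String path, String bean) {
    final String split = "?entry=";
    int at = path.indexOf(split);
    String base  = at < 0 ? path : path.substring(0, at);
    String entry = at < 0 ? ""   : path.substring(at + split.length());

    Pattern regex = Pattern.compile(Pattern.quote(bean) + "(,|$)");
    String url = base + split;
    url += regex.matcher(entry).find()
        ? regex.matcher(entry).replaceFirst("") // expanded: collapse it
        : entry + "," + bean;                   // collapsed: expand it

    // clean-up, mirroring the three trailing replaces in the JS
    url = url.replaceFirst("=,", "=");
    url = url.replaceFirst(",$", "");
    url = url.replaceFirst("\\Q?entry=\\E$", "");
    return url;
  }
}
```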