From b679816a7001ba986618f103a9a9c67ececc7ff8 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Tue, 6 Apr 2010 19:19:27 +0000 Subject: [PATCH] LUCENE-2370: Reintegrate flex_1458 branch into trunk (revision 931101) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@931278 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 78 +- lucene/LICENSE.txt | 56 + lucene/NOTICE.txt | 9 + .../org/apache/lucene/index/SegmentInfo.java | 7 + .../apache/lucene/index/SegmentMerger.java | 19 +- .../apache/lucene/index/SegmentReader.java | 12 + .../apache/lucene/index/codecs/Codec.java} | 14 +- .../lucene/index/codecs/CodecProvider.java | 25 + .../org/apache/lucene/store/DataInput.java | 234 ++ .../org/apache/lucene/store/DataOutput.java | 194 + .../org/apache/lucene/store/IndexInput.java | 206 +- .../org/apache/lucene/store/IndexOutput.java | 169 +- .../java/org/apache/lucene/util/BytesRef.java | 27 + .../org/apache/lucene/util/UnicodeUtil.java | 6 +- .../analysis/TestNumericTokenStream.java | 73 - .../TestTermAttributeImpl.java | 4 +- .../test/org/apache/lucene/index/TestDoc.java | 11 +- .../apache/lucene/index/TestIndexReader.java | 32 +- .../apache/lucene/index/TestIndexWriter.java | 45 +- .../lucene/index/TestIndexWriterDelete.java | 16 +- .../lucene/index/TestIndexWriterReader.java | 6 +- .../lucene/index/TestLazyProxSkipping.java | 2 +- .../lucene/index/TestMultiLevelSkipList.java | 14 +- .../org/apache/lucene/index/TestPayloads.java | 7 +- .../lucene/index/TestSegmentMerger.java | 8 +- .../lucene/index/TestSegmentReader.java | 1 + .../lucene/index/TestSegmentTermDocs.java | 29 +- .../lucene/index/TestSegmentTermEnum.java | 17 - .../lucene/index/TestStressIndexing2.java | 8 +- .../org/apache/lucene/search/CheckHits.java | 2 +- .../search/TestCachingWrapperFilter.java | 2 +- .../search/TestNumericRangeQuery32.java | 68 +- .../search/TestNumericRangeQuery64.java | 4 + .../org/apache/lucene/search/TestSort.java | 74 +- .../apache/lucene/search/TestTermScorer.java | 18 +- .../apache/lucene/search/TestWildcard.java | 3 +- .../lucene/util/TestAttributeSource.java | 12 +- .../apache/lucene/util/TestNumericUtils.java | 4 + lucene/build.xml | 47 +- lucene/common-build.xml | 5 + lucene/contrib/benchmark/sortBench.py | 553 +++ .../byTask/feeds/EnwikiDocMaker.java | 38 + .../benchmark/byTask/feeds/LineDocMaker.java | 50 + .../benchmark/byTask/TestPerfTasksLogic.java | 29 +- .../highlight/WeightedSpanTermExtractor.java | 13 +- .../lucene/index/FieldNormModifier.java | 73 +- .../lucene/index/MultiPassIndexSplitter.java | 38 + .../lucene/index/TermVectorAccessor.java | 78 +- .../org/apache/lucene/misc/HighFreqTerms.java | 43 +- .../lucene/misc/LengthNormModifier.java | 154 + .../lucene/index/TestFieldNormModifier.java | 14 +- .../apache/lucene/search/DuplicateFilter.java | 164 +- .../lucene/search/FuzzyLikeThisQuery.java | 76 +- .../apache/lucene/search/TestRemoteSort.java | 34 +- .../spatial/tier/CartesianShapeFilter.java | 50 +- .../lucene/spatial/tier/TestCartesian.java | 27 +- .../lucene/spatial/tier/TestDistance.java | 6 +- .../lucene/search/spell/LuceneDictionary.java | 50 +- .../surround/query/SrndPrefixQuery.java | 46 +- .../surround/query/SrndTermQuery.java | 21 +- .../surround/query/SrndTruncQuery.java | 56 +- .../lucene/analysis/NumericTokenStream.java | 152 +- .../org/apache/lucene/analysis/Token.java | 65 +- .../tokenattributes/CharTermAttribute.java | 71 + .../CharTermAttributeImpl.java | 255 ++ .../tokenattributes/TermAttribute.java | 2 + 
.../tokenattributes/TermAttributeImpl.java | 208 +- .../TermToBytesRefAttribute.java | 47 + .../lucene/document/CompressionTools.java | 6 +- .../lucene/index/AbstractAllTermDocs.java | 1 + .../org/apache/lucene/index/AllDocsEnum.java | 78 + .../org/apache/lucene/index/AllTermDocs.java | 2 + .../apache/lucene/index/ByteBlockPool.java | 20 +- .../apache/lucene/index/ByteSliceReader.java | 21 +- .../apache/lucene/index/ByteSliceWriter.java | 18 +- .../apache/lucene/index/CharBlockPool.java | 60 - .../org/apache/lucene/index/CheckIndex.java | 200 +- .../lucene/index/CompoundFileReader.java | 3 +- .../apache/lucene/index/DirectoryReader.java | 183 +- .../lucene/index/DocFieldProcessor.java | 2 +- .../index/DocFieldProcessorPerThread.java | 9 +- .../lucene/index/DocInverterPerField.java | 3 +- .../lucene/index/DocInverterPerThread.java | 8 +- .../lucene/index/DocsAndPositionsEnum.java | 44 + .../org/apache/lucene/index/DocsEnum.java | 93 + .../apache/lucene/index/DocumentsWriter.java | 234 +- .../org/apache/lucene/index/FieldInfo.java | 17 +- .../org/apache/lucene/index/FieldInfos.java | 5 +- .../java/org/apache/lucene/index/Fields.java | 36 + .../org/apache/lucene/index/FieldsEnum.java | 74 + .../lucene/index/FilterIndexReader.java | 15 +- .../index/FormatPostingsDocsWriter.java | 129 - .../index/FormatPostingsFieldsWriter.java | 75 - .../index/FormatPostingsPositionsWriter.java | 89 - .../index/FormatPostingsTermsConsumer.java | 47 - .../index/FormatPostingsTermsWriter.java | 73 - .../lucene/index/FreqProxFieldMergeState.java | 23 +- .../lucene/index/FreqProxTermsWriter.java | 151 +- .../index/FreqProxTermsWriterPerField.java | 37 +- .../apache/lucene/index/IndexFileDeleter.java | 36 +- .../lucene/index/IndexFileNameFilter.java | 44 +- .../apache/lucene/index/IndexFileNames.java | 61 +- .../org/apache/lucene/index/IndexReader.java | 271 +- .../org/apache/lucene/index/IndexWriter.java | 70 +- .../lucene/index/IndexWriterConfig.java | 19 + .../org/apache/lucene/index/LegacyFields.java | 41 + .../apache/lucene/index/LegacyFieldsEnum.java | 337 ++ ...eInfo.java => LegacySegmentMergeInfo.java} | 5 +- ...ueue.java => LegacySegmentMergeQueue.java} | 12 +- .../org/apache/lucene/index/LegacyTerms.java | 52 + .../index/MultiDocsAndPositionsEnum.java | 135 + .../apache/lucene/index/MultiDocsEnum.java | 113 + .../org/apache/lucene/index/MultiFields.java | 229 + .../apache/lucene/index/MultiFieldsEnum.java | 142 + .../org/apache/lucene/index/MultiReader.java | 62 +- .../org/apache/lucene/index/MultiTerms.java | 84 + .../apache/lucene/index/MultiTermsEnum.java | 397 ++ .../lucene/index/MultipleTermPositions.java | 4 +- .../lucene/index/ParallelPostingsArray.java | 48 +- .../apache/lucene/index/ParallelReader.java | 79 +- .../lucene/index/ReadOnlyDirectoryReader.java | 13 +- .../org/apache/lucene/index/SegmentInfo.java | 104 +- .../org/apache/lucene/index/SegmentInfos.java | 31 +- .../apache/lucene/index/SegmentMerger.java | 305 +- .../apache/lucene/index/SegmentReadState.java | 43 + .../apache/lucene/index/SegmentReader.java | 593 ++- .../lucene/index/SegmentWriteState.java | 63 +- .../lucene/index/StoredFieldsWriter.java | 4 +- .../java/org/apache/lucene/index/Term.java | 9 +- .../org/apache/lucene/index/TermDocs.java | 4 +- .../org/apache/lucene/index/TermEnum.java | 4 +- .../apache/lucene/index/TermInfosWriter.java | 228 - .../apache/lucene/index/TermPositions.java | 3 +- .../index/TermVectorsTermsWriterPerField.java | 84 +- .../TermVectorsTermsWriterPerThread.java | 6 +- 
.../lucene/index/TermVectorsWriter.java | 10 +- .../java/org/apache/lucene/index/Terms.java | 101 + .../org/apache/lucene/index/TermsEnum.java | 181 + .../index/TermsHashConsumerPerField.java | 4 +- .../lucene/index/TermsHashPerField.java | 290 +- .../lucene/index/TermsHashPerThread.java | 54 +- .../org/apache/lucene/index/codecs/Codec.java | 59 + .../lucene/index/codecs/CodecProvider.java | 108 + .../lucene/index/codecs/FieldsConsumer.java | 51 + .../FieldsProducer.java} | 14 +- .../MappingMultiDocsAndPositionsEnum.java | 121 + .../index/codecs/MappingMultiDocsEnum.java | 99 + .../lucene/index/codecs/MergeState.java | 42 + .../codecs/MultiLevelSkipListReader.java | 281 ++ .../codecs/MultiLevelSkipListWriter.java | 153 + .../lucene/index/codecs/PostingsConsumer.java | 97 + .../lucene/index/codecs/TermsConsumer.java | 99 + .../intblock/FixedIntBlockIndexInput.java | 190 + .../intblock/FixedIntBlockIndexOutput.java | 118 + .../index/codecs/intblock/IntBlockCodec.java | 140 + .../intblock/SimpleIntBlockFactory.java | 41 + .../intblock/SimpleIntBlockIndexInput.java | 67 + .../intblock/SimpleIntBlockIndexOutput.java | 56 + .../index/codecs/preflex/PreFlexCodec.java | 80 + .../index/codecs/preflex/PreFlexFields.java | 488 +++ .../{ => codecs/preflex}/SegmentTermDocs.java | 57 +- .../{ => codecs/preflex}/SegmentTermEnum.java | 32 +- .../preflex}/SegmentTermPositions.java | 24 +- .../{ => codecs/preflex}/TermBuffer.java | 26 +- .../index/{ => codecs/preflex}/TermInfo.java | 8 +- .../{ => codecs/preflex}/TermInfosReader.java | 48 +- .../index/codecs/pulsing/PulsingCodec.java | 155 + .../pulsing/PulsingPostingsReaderImpl.java | 381 ++ .../pulsing/PulsingPostingsWriterImpl.java | 311 ++ .../index/codecs/sep/IntIndexInput.java | 75 + .../index/codecs/sep/IntIndexOutput.java | 58 + .../index/codecs/sep/IntStreamFactory.java | 33 + .../lucene/index/codecs/sep/SepCodec.java | 150 + .../codecs/sep/SepPostingsReaderImpl.java | 679 +++ .../codecs/sep/SepPostingsWriterImpl.java | 287 ++ .../index/codecs/sep/SepSkipListReader.java | 205 + .../index/codecs/sep/SepSkipListWriter.java | 197 + .../index/codecs/sep/SingleIntFactory.java | 33 + .../index/codecs/sep/SingleIntIndexInput.java | 114 + .../codecs/sep/SingleIntIndexOutput.java | 84 + .../standard}/DefaultSkipListReader.java | 19 +- .../standard}/DefaultSkipListWriter.java | 26 +- .../codecs/standard/DeltaBytesReader.java | 48 + .../codecs/standard/DeltaBytesWriter.java | 67 + .../index/codecs/standard/PagedBytes.java | 129 + .../SimpleStandardTermsIndexReader.java | 442 ++ .../SimpleStandardTermsIndexWriter.java | 186 + .../index/codecs/standard/StandardCodec.java | 149 + .../standard/StandardPostingsReader.java | 56 + .../standard/StandardPostingsReaderImpl.java | 594 +++ .../standard/StandardPostingsWriter.java | 43 + .../standard/StandardPostingsWriterImpl.java | 234 ++ .../standard/StandardTermsDictReader.java | 480 +++ .../standard/StandardTermsDictWriter.java | 176 + .../standard/StandardTermsIndexReader.java | 76 + .../standard/StandardTermsIndexWriter.java | 38 + .../index/codecs/standard/TermState.java | 54 + .../apache/lucene/search/AutomatonQuery.java | 151 + .../lucene/search/AutomatonTermsEnum.java | 377 ++ .../lucene/search/ConstantScoreQuery.java | 4 +- .../lucene/search/ExactPhraseScorer.java | 14 +- .../org/apache/lucene/search/FieldCache.java | 95 +- .../apache/lucene/search/FieldCacheImpl.java | 370 +- .../lucene/search/FieldCacheRangeFilter.java | 161 +- .../lucene/search/FilteredTermEnum.java | 15 +- 
.../lucene/search/FilteredTermsEnum.java | 233 ++ .../org/apache/lucene/search/FuzzyQuery.java | 12 +- .../apache/lucene/search/FuzzyTermEnum.java | 6 +- .../apache/lucene/search/FuzzyTermsEnum.java | 555 +++ .../lucene/search/MatchAllDocsQuery.java | 21 +- .../lucene/search/MultiPhraseQuery.java | 200 +- .../apache/lucene/search/MultiTermQuery.java | 233 +- .../search/MultiTermQueryWrapperFilter.java | 111 +- .../lucene/search/NumericRangeFilter.java | 3 - .../lucene/search/NumericRangeQuery.java | 140 +- .../apache/lucene/search/PhrasePositions.java | 27 +- .../org/apache/lucene/search/PhraseQuery.java | 34 +- .../apache/lucene/search/PhraseScorer.java | 10 +- .../org/apache/lucene/search/PrefixQuery.java | 20 +- .../apache/lucene/search/PrefixTermEnum.java | 2 + .../apache/lucene/search/PrefixTermsEnum.java | 50 + .../org/apache/lucene/search/RegexpQuery.java | 106 + .../org/apache/lucene/search/Similarity.java | 1 + .../apache/lucene/search/SingleTermEnum.java | 1 + .../apache/lucene/search/SingleTermsEnum.java | 53 + .../lucene/search/SloppyPhraseScorer.java | 6 +- .../org/apache/lucene/search/TermQuery.java | 27 +- .../apache/lucene/search/TermRangeFilter.java | 3 - .../apache/lucene/search/TermRangeQuery.java | 38 +- .../lucene/search/TermRangeTermEnum.java | 2 + .../lucene/search/TermRangeTermsEnum.java | 132 + .../org/apache/lucene/search/TermScorer.java | 69 +- .../apache/lucene/search/WildcardQuery.java | 122 +- .../lucene/search/WildcardTermEnum.java | 6 +- .../search/function/MultiValueSource.java | 136 + .../search/function/ValueSourceQuery.java | 31 +- .../search/payloads/PayloadTermQuery.java | 34 +- .../lucene/search/spans/SpanTermQuery.java | 26 +- .../org/apache/lucene/search/spans/Spans.java | 1 - .../apache/lucene/search/spans/TermSpans.java | 99 +- .../org/apache/lucene/store/DataInput.java | 239 ++ .../org/apache/lucene/store/DataOutput.java | 194 + .../org/apache/lucene/store/Directory.java | 19 +- .../org/apache/lucene/store/IndexInput.java | 213 +- .../org/apache/lucene/store/IndexOutput.java | 169 +- .../org/apache/lucene/util/ArrayUtil.java | 23 + .../org/apache/lucene/util/BitVector.java | 7 +- .../Bits.java} | 15 +- .../org/apache/lucene/util/BitsSlice.java | 46 + .../java/org/apache/lucene/util/BytesRef.java | 250 ++ .../org/apache/lucene/util/CodecUtil.java | 72 + .../java/org/apache/lucene/util/IntsRef.java | 96 + .../org/apache/lucene/util/MultiBits.java | 87 + .../org/apache/lucene/util/NumericUtils.java | 299 +- .../org/apache/lucene/util/OpenBitSet.java | 7 +- .../apache/lucene/util/RamUsageEstimator.java | 15 +- .../org/apache/lucene/util/ReaderUtil.java | 84 +- .../org/apache/lucene/util/UnicodeUtil.java | 237 +- .../lucene/util/automaton/Automaton.java | 744 ++++ .../util/automaton/AutomatonProvider.java | 50 + .../lucene/util/automaton/BasicAutomata.java | 242 ++ .../util/automaton/BasicOperations.java | 625 +++ .../automaton/Lev1ParametricDescription.java | 117 + .../automaton/Lev2ParametricDescription.java | 217 + .../util/automaton/LevenshteinAutomata.java | 258 ++ .../automaton/MinimizationOperations.java | 275 ++ .../apache/lucene/util/automaton/RegExp.java | 1000 +++++ .../lucene/util/automaton/RunAutomaton.java | 210 + .../util/automaton/SpecialOperations.java | 179 + .../apache/lucene/util/automaton/State.java | 209 + .../lucene/util/automaton/StatePair.java | 101 + .../lucene/util/automaton/Transition.java | 174 + .../util/automaton/TransitionComparator.java | 75 + .../util/automaton/createLevAutomata.py | 492 +++ 
.../apache/lucene/util/automaton/package.html | 47 + .../apache/lucene/util/packed/Direct16.java | 86 + .../apache/lucene/util/packed/Direct32.java | 82 + .../apache/lucene/util/packed/Direct64.java | 79 + .../apache/lucene/util/packed/Direct8.java | 86 + .../apache/lucene/util/packed/Packed32.java | 221 + .../apache/lucene/util/packed/Packed64.java | 211 + .../apache/lucene/util/packed/PackedInts.java | 296 ++ .../util/packed/PackedReaderIterator.java | 84 + .../lucene/util/packed/PackedWriter.java | 113 + .../apache/lucene/util/packed/package.html | 16 + .../src/test/org/apache/lucene/TestDemo.java | 11 +- .../org/apache/lucene/TestExternalCodecs.java | 887 ++++ .../lucene/TestSearchForDuplicates.java | 3 + .../analysis/TestNumericTokenStream.java | 39 +- .../TestCharTermAttributeImpl.java | 177 + .../TestSimpleAttributeImpls.java | 1 + .../TestTermAttributeImpl.java | 4 +- .../apache/lucene/document/TestDateTools.java | 2 +- .../org/apache/lucene/index/FlexTestUtil.java | 656 +++ .../index/TestAddIndexesNoOptimize.java | 8 +- .../index/TestBackwardsCompatibility.java | 135 +- .../org/apache/lucene/index/TestCodecs.java | 614 +++ .../test/org/apache/lucene/index/TestDoc.java | 11 +- .../org/apache/lucene/index/TestFlex.java | 84 + .../lucene/index/TestFlexExternalReader.java | 82 + .../apache/lucene/index/TestIndexReader.java | 62 +- .../lucene/index/TestIndexReaderReopen.java | 2 + .../apache/lucene/index/TestIndexWriter.java | 218 +- .../lucene/index/TestIndexWriterConfig.java | 3 + .../lucene/index/TestIndexWriterDelete.java | 35 +- .../lucene/index/TestIndexWriterReader.java | 15 - .../lucene/index/TestLazyProxSkipping.java | 4 +- .../lucene/index/TestMultiLevelSkipList.java | 20 +- .../org/apache/lucene/index/TestNorms.java | 1 + .../org/apache/lucene/index/TestOmitTf.java | 32 +- .../org/apache/lucene/index/TestPayloads.java | 29 +- .../lucene/index/TestSegmentMerger.java | 8 +- .../lucene/index/TestSegmentReader.java | 3 + .../lucene/index/TestSegmentTermDocs.java | 26 +- .../lucene/index/TestSegmentTermEnum.java | 18 +- .../lucene/index/TestStressIndexing.java | 4 +- .../lucene/index/TestStressIndexing2.java | 186 +- .../lucene/index/TestTermEnumSurrogate.java | 53 + .../codecs/intblock/TestIntBlockCodec.java | 42 + .../org/apache/lucene/index/index.30.cfs.zip | Bin 0 -> 4786 bytes .../apache/lucene/index/index.30.nocfs.zip | Bin 0 -> 8953 bytes .../org/apache/lucene/search/CheckHits.java | 2 +- .../lucene/search/JustCompileSearch.java | 11 +- .../org/apache/lucene/search/QueryUtils.java | 1 - .../lucene/search/TestAutomatonQuery.java | 210 + .../search/TestAutomatonQueryUnicode.java | 178 + .../search/TestCachingWrapperFilter.java | 2 +- .../lucene/search/TestFilteredSearch.java | 17 +- .../apache/lucene/search/TestFuzzyQuery.java | 13 +- .../apache/lucene/search/TestFuzzyQuery2.java | 142 + .../lucene/search/TestMultiPhraseQuery.java | 30 +- .../search/TestMultiTermQueryBWComp.java | 239 ++ .../search/TestNumericRangeQuery32.java | 53 +- .../search/TestNumericRangeQuery64.java | 20 +- .../lucene/search/TestPositionIncrement.java | 38 +- .../apache/lucene/search/TestPrefixQuery.java | 10 + .../apache/lucene/search/TestRegexpQuery.java | 128 + .../lucene/search/TestRegexpRandom.java | 144 + .../lucene/search/TestRegexpRandom2.java | 221 + .../org/apache/lucene/search/TestSort.java | 31 +- .../lucene/search/TestTermRangeQuery.java | 24 + .../apache/lucene/search/TestTermScorer.java | 17 +- .../apache/lucene/search/TestWildcard.java | 86 +- .../lucene/search/TestWildcardRandom.java 
| 137 + .../lucene/search/function/TestOrdValues.java | 4 +- .../search/function/TestValueSource.java | 64 + .../apache/lucene/search/fuzzyTestData.txt | 3721 +++++++++++++++++ .../apache/lucene/store/MockRAMDirectory.java | 6 +- .../lucene/util/MultiCodecTestCase.java | 46 + .../lucene/util/TestAttributeSource.java | 52 +- .../apache/lucene/util/TestNumericUtils.java | 40 +- .../apache/lucene/util/TestUnicodeUtil.java | 84 + .../util/automaton/TestBasicOperations.java | 71 + .../automaton/TestLevenshteinAutomata.java | 261 ++ .../lucene/util/packed/TestPackedInts.java | 225 + .../solr/handler/AnalysisRequestHandler.java | 32 +- .../handler/AnalysisRequestHandlerBase.java | 46 +- .../response/PHPSerializedResponseWriter.java | 5 +- .../org/apache/solr/schema/TrieDateField.java | 9 +- .../org/apache/solr/schema/TrieField.java | 22 +- .../org/apache/solr/search/TestDocSet.java | 2 +- 361 files changed, 35338 insertions(+), 4869 deletions(-) rename lucene/{src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java => backwards/src/java/org/apache/lucene/index/codecs/Codec.java} (64%) create mode 100644 lucene/backwards/src/java/org/apache/lucene/index/codecs/CodecProvider.java create mode 100644 lucene/backwards/src/java/org/apache/lucene/store/DataInput.java create mode 100644 lucene/backwards/src/java/org/apache/lucene/store/DataOutput.java create mode 100644 lucene/backwards/src/java/org/apache/lucene/util/BytesRef.java delete mode 100644 lucene/backwards/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java create mode 100644 lucene/contrib/benchmark/sortBench.py create mode 100644 lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java create mode 100644 lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/misc/LengthNormModifier.java create mode 100644 lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java create mode 100644 lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java create mode 100644 lucene/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java create mode 100644 lucene/src/java/org/apache/lucene/index/AllDocsEnum.java delete mode 100644 lucene/src/java/org/apache/lucene/index/CharBlockPool.java create mode 100644 lucene/src/java/org/apache/lucene/index/DocsAndPositionsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/DocsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/Fields.java create mode 100644 lucene/src/java/org/apache/lucene/index/FieldsEnum.java delete mode 100644 lucene/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java delete mode 100644 lucene/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java delete mode 100644 lucene/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java delete mode 100644 lucene/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java delete mode 100644 lucene/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/LegacyFields.java create mode 100644 lucene/src/java/org/apache/lucene/index/LegacyFieldsEnum.java rename lucene/src/java/org/apache/lucene/index/{SegmentMergeInfo.java => LegacySegmentMergeInfo.java} (94%) rename lucene/src/java/org/apache/lucene/index/{SegmentMergeQueue.java => LegacySegmentMergeQueue.java} (76%) create mode 100644 
lucene/src/java/org/apache/lucene/index/LegacyTerms.java create mode 100644 lucene/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/MultiDocsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/MultiFields.java create mode 100644 lucene/src/java/org/apache/lucene/index/MultiFieldsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/MultiTerms.java create mode 100644 lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/SegmentReadState.java delete mode 100644 lucene/src/java/org/apache/lucene/index/TermInfosWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/Terms.java create mode 100644 lucene/src/java/org/apache/lucene/index/TermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/Codec.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java rename lucene/src/java/org/apache/lucene/index/{FormatPostingsFieldsConsumer.java => codecs/FieldsProducer.java} (77%) create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsAndPositionsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/MergeState.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockFactory.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexOutput.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java rename lucene/src/java/org/apache/lucene/index/{ => codecs/preflex}/SegmentTermDocs.java (81%) rename lucene/src/java/org/apache/lucene/index/{ => codecs/preflex}/SegmentTermEnum.java (86%) rename lucene/src/java/org/apache/lucene/index/{ => codecs/preflex}/SegmentTermPositions.java (90%) rename lucene/src/java/org/apache/lucene/index/{ => codecs/preflex}/TermBuffer.java (83%) rename lucene/src/java/org/apache/lucene/index/{ => codecs/preflex}/TermInfo.java (89%) rename lucene/src/java/org/apache/lucene/index/{ => codecs/preflex}/TermInfosReader.java (89%) create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java create mode 100644 
lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/IntStreamFactory.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntFactory.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java rename lucene/src/java/org/apache/lucene/index/{ => codecs/standard}/DefaultSkipListReader.java (87%) rename lucene/src/java/org/apache/lucene/index/{ => codecs/standard}/DefaultSkipListWriter.java (89%) create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriterImpl.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java create mode 100644 lucene/src/java/org/apache/lucene/search/AutomatonQuery.java create mode 100644 lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/search/RegexpQuery.java create mode 100644 lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/search/TermRangeTermsEnum.java create mode 100644 lucene/src/java/org/apache/lucene/search/function/MultiValueSource.java create mode 100644 lucene/src/java/org/apache/lucene/store/DataInput.java create 
mode 100644 lucene/src/java/org/apache/lucene/store/DataOutput.java rename lucene/src/java/org/apache/lucene/{index/FormatPostingsDocsConsumer.java => util/Bits.java} (66%) create mode 100644 lucene/src/java/org/apache/lucene/util/BitsSlice.java create mode 100644 lucene/src/java/org/apache/lucene/util/BytesRef.java create mode 100644 lucene/src/java/org/apache/lucene/util/CodecUtil.java create mode 100644 lucene/src/java/org/apache/lucene/util/IntsRef.java create mode 100644 lucene/src/java/org/apache/lucene/util/MultiBits.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/Automaton.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/RegExp.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/State.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/StatePair.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/Transition.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/createLevAutomata.py create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/package.html create mode 100644 lucene/src/java/org/apache/lucene/util/packed/Direct16.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/Direct32.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/Direct64.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/Direct8.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/Packed32.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/Packed64.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/PackedInts.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/PackedReaderIterator.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/PackedWriter.java create mode 100644 lucene/src/java/org/apache/lucene/util/packed/package.html create mode 100644 lucene/src/test/org/apache/lucene/TestExternalCodecs.java create mode 100644 lucene/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java create mode 100644 lucene/src/test/org/apache/lucene/index/FlexTestUtil.java create mode 100644 lucene/src/test/org/apache/lucene/index/TestCodecs.java create mode 100644 lucene/src/test/org/apache/lucene/index/TestFlex.java create mode 100644 lucene/src/test/org/apache/lucene/index/TestFlexExternalReader.java create mode 100644 lucene/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java create mode 100644 lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java create mode 100644 
lucene/src/test/org/apache/lucene/index/index.30.cfs.zip create mode 100644 lucene/src/test/org/apache/lucene/index/index.30.nocfs.zip create mode 100644 lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestMultiTermQueryBWComp.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java create mode 100644 lucene/src/test/org/apache/lucene/search/function/TestValueSource.java create mode 100644 lucene/src/test/org/apache/lucene/search/fuzzyTestData.txt create mode 100644 lucene/src/test/org/apache/lucene/util/MultiCodecTestCase.java create mode 100644 lucene/src/test/org/apache/lucene/util/TestUnicodeUtil.java create mode 100644 lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java create mode 100644 lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java create mode 100644 lucene/src/test/org/apache/lucene/util/packed/TestPackedInts.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6b5e9a34bb0..00207ffa914 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -1,5 +1,79 @@ Lucene Change Log +======================= Flexible Indexing Branch ======================= + +Changes in backwards compatibility policy + +* LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing: + + - MultiReader ctor now throws IOException + + - Directory.copy/Directory.copyTo now copies all files (not just + index files), since what is and isn't and index file is now + dependent on the codecs used. (Mike McCandless) + + - UnicodeUtil now uses BytesRef for UTF-8 output, and some method + signatures have changed to CharSequence. These are internal APIs + and subject to change suddenly. (Robert Muir, Mike McCandless) + + - Positional queries (PhraseQuery, *SpanQuery) will now throw an + exception if use them on a field that omits positions during + indexing (previously they silently returned no results). + + - FieldCache.(Byte,Short,Int,Long,Float,Double}Parser's API has + changed -- each parse method now takes a BytesRef instead of a + String. If you have an existing Parser, a simple way to fix it is + invoke BytesRef.utf8ToString, and pass that String to your + existing parser. This will work, but performance would be better + if you could fix your parser to instead operate directly on the + byte[] in the BytesRef. + + - The internal (experimental) API of NumericUtils changed completely + from String to BytesRef. Client code should never use this class, + so the change would normally not affect you. If you used some of + the methods to inspect terms or create TermQueries out of + prefix encoded terms, change to use BytesRef. Please note: + Do not use TermQueries to search for single numeric terms. + The recommended way is to create a corresponding NumericRangeQuery + with upper and lower bound equal and included. TermQueries do not + score correct, so the constant score mode of NRQ is the only + correct way to handle single value queries. + + - NumericTokenStream now works directly on byte[] terms. 
If you + plug a TokenFilter on top of this stream, you will likely get + an IllegalArgumentException, because the NTS does not support + TermAttribute/CharTermAttribute. If you want to further filter + or attach Payloads to NTS, use the new NumericTermAttribute. + +Bug Fixes + +* LUCENE-2222: FixedIntBlockIndexInput incorrectly read one block of + 0s before the actual data. (Renaud Delbru via Mike McCandless) + +* LUCENE-2344: PostingsConsumer.merge was failing to call finishDoc, + which caused corruption for sep codec. Also fixed several tests to + test all 4 core codecs. (Renaud Delbru via Mike McCandless) + +New features + +* LUCENE-1606, LUCENE-2089: Adds AutomatonQuery, a MultiTermQuery that + matches terms against a finite-state machine. Implement WildcardQuery + and FuzzyQuery with finite-state methods. Adds RegexpQuery. + (Robert Muir, Mike McCandless, Uwe Schindler, Mark Miller) + +* LUCENE-1990: Adds internal packed ints implementation, to be used + for more efficient storage of int arrays when the values are + bounded, for example for storing the terms dict index Toke Toke + Eskildsen via Mike McCandless) + +* LUCENE-2321: Cutover to a more RAM efficient packed-ints based + representation for the in-memory terms dict index. (Mike + McCandless) + +* LUCENE-2126: Add new classes for data (de)serialization: DataInput + and DataOutput. IndexInput and IndexOutput extend these new classes. + (Michael Busch) + ======================= Trunk (not yet released) ======================= Changes in backwards compatibility policy @@ -297,8 +371,8 @@ Optimizations Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation - into core, and moved the ICU-based collation support into contrib/icu. - (Robert Muir) + into core, and moved the ICU-based collation support into contrib/icu. + (Robert Muir) * LUCENE-2326: Removed SVN checkouts for backwards tests. The backwards branch is now included in the svn repository using "svn copy" after release. diff --git a/lucene/LICENSE.txt b/lucene/LICENSE.txt index dfd5693ce9c..f1086dcb2cf 100644 --- a/lucene/LICENSE.txt +++ b/lucene/LICENSE.txt @@ -237,4 +237,60 @@ http://www.python.org. Full license is here: http://www.python.org/download/releases/2.4.2/license/ +Some code in src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +The levenshtein automata tables in src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. +Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. diff --git a/lucene/NOTICE.txt b/lucene/NOTICE.txt index e4ce6a8c6fd..e11fc9c3229 100644 --- a/lucene/NOTICE.txt +++ b/lucene/NOTICE.txt @@ -46,3 +46,12 @@ provided by Xiaoping Gao and copyright 2009 by www.imdict.net. ICU4J, (under contrib/icu) is licensed under an MIT styles license (contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Brics Automaton (under src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. 
This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ diff --git a/lucene/backwards/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/backwards/src/java/org/apache/lucene/index/SegmentInfo.java index bd748cdc856..439b3a4f43b 100644 --- a/lucene/backwards/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/backwards/src/java/org/apache/lucene/index/SegmentInfo.java @@ -21,6 +21,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BitVector; +import org.apache.lucene.index.codecs.Codec; import java.io.IOException; import java.util.List; import java.util.Map; @@ -129,6 +130,12 @@ public final class SegmentInfo { assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount; } + // stub + public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, + int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx, + Codec codec) { + } + /** * Copy everything from src SegmentInfo into our instance. */ diff --git a/lucene/backwards/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/backwards/src/java/org/apache/lucene/index/SegmentMerger.java index 1badf78e1d5..7dd4272588e 100644 --- a/lucene/backwards/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/backwards/src/java/org/apache/lucene/index/SegmentMerger.java @@ -29,6 +29,8 @@ import org.apache.lucene.index.MergePolicy.MergeAbortedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, @@ -98,7 +100,12 @@ final class SegmentMerger { } termIndexInterval = writer.getTermIndexInterval(); } - + + // stub + SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, CodecProvider codecs) { + checkAbort = null; + } + boolean hasProx() { return fieldInfos.hasProx(); } @@ -171,6 +178,11 @@ final class SegmentMerger { } } + // stub + final List createCompoundFile(String fileName, SegmentInfo info) { + return null; + } + final List createCompoundFile(String fileName) throws IOException { CompoundFileWriter cfsWriter = @@ -553,6 +565,11 @@ final class SegmentMerger { } } + // stub + Codec getCodec() { + return null; + } + private SegmentMergeQueue queue = null; private final void mergeTerms() throws CorruptIndexException, IOException { diff --git a/lucene/backwards/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/backwards/src/java/org/apache/lucene/index/SegmentReader.java index 5aec0156868..c33aaa3856d 100644 --- a/lucene/backwards/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/backwards/src/java/org/apache/lucene/index/SegmentReader.java @@ -37,6 +37,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BitVector; import org.apache.lucene.util.CloseableThreadLocal; +import org.apache.lucene.index.codecs.CodecProvider; /** @version $Id */ /** @@ -594,6 +595,17 @@ public class SegmentReader extends IndexReader implements Cloneable { return 
instance; } + // stub + public static SegmentReader get(boolean readOnly, + Directory dir, + SegmentInfo si, + int readBufferSize, + boolean doOpenStores, + int termInfosIndexDivisor, + CodecProvider codecs) { + return null; + } + void openDocStores() throws IOException { core.openDocStores(si); } diff --git a/lucene/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java b/lucene/backwards/src/java/org/apache/lucene/index/codecs/Codec.java similarity index 64% rename from lucene/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java rename to lucene/backwards/src/java/org/apache/lucene/index/codecs/Codec.java index 13a2a115b02..d1d64f8a0c3 100644 --- a/lucene/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java +++ b/lucene/backwards/src/java/org/apache/lucene/index/codecs/Codec.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,15 +17,7 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.io.IOException; +// stub +public class Codec { - -abstract class FormatPostingsPositionsConsumer { - - /** Add a new position & payload. If payloadLength > 0 - * you must read those bytes from the IndexInput. */ - abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; - - /** Called when we are done adding positions & payloads */ - abstract void finish() throws IOException; } diff --git a/lucene/backwards/src/java/org/apache/lucene/index/codecs/CodecProvider.java b/lucene/backwards/src/java/org/apache/lucene/index/codecs/CodecProvider.java new file mode 100644 index 00000000000..354cb8fede6 --- /dev/null +++ b/lucene/backwards/src/java/org/apache/lucene/index/codecs/CodecProvider.java @@ -0,0 +1,25 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// stub +public class CodecProvider { + public static CodecProvider getDefault() { + return null; + } +} \ No newline at end of file diff --git a/lucene/backwards/src/java/org/apache/lucene/store/DataInput.java b/lucene/backwards/src/java/org/apache/lucene/store/DataInput.java new file mode 100644 index 00000000000..a74c3ea0221 --- /dev/null +++ b/lucene/backwards/src/java/org/apache/lucene/store/DataInput.java @@ -0,0 +1,234 @@ +package org.apache.lucene.store; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * Abstract base class for performing read operations of Lucene's low-level + * data types. + */ +public abstract class DataInput implements Cloneable { + private byte[] bytes; // used by readString() + private char[] chars; // used by readModifiedUTF8String() + private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format + + /** Reads and returns a single byte. + * @see DataOutput#writeByte(byte) + */ + public abstract byte readByte() throws IOException; + + /** Reads a specified number of bytes into an array at the specified offset. + * @param b the array to read bytes into + * @param offset the offset in the array to start storing bytes + * @param len the number of bytes to read + * @see DataOutput#writeBytes(byte[],int) + */ + public abstract void readBytes(byte[] b, int offset, int len) + throws IOException; + + /** Reads a specified number of bytes into an array at the + * specified offset with control over whether the read + * should be buffered (callers who have their own buffer + * should pass in "false" for useBuffer). Currently only + * {@link BufferedIndexInput} respects this parameter. + * @param b the array to read bytes into + * @param offset the offset in the array to start storing bytes + * @param len the number of bytes to read + * @param useBuffer set to false if the caller will handle + * buffering. + * @see DataOutput#writeBytes(byte[],int) + */ + public void readBytes(byte[] b, int offset, int len, boolean useBuffer) + throws IOException + { + // Default to ignoring useBuffer entirely + readBytes(b, offset, len); + } + + /** Reads two bytes and returns a short. + * @see DataOutput#writeByte(byte) + */ + public short readShort() throws IOException { + return (short) (((readByte() & 0xFF) << 8) | (readByte() & 0xFF)); + } + + /** Reads four bytes and returns an int. + * @see DataOutput#writeInt(int) + */ + public int readInt() throws IOException { + return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) + | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); + } + + /** Reads an int stored in variable-length format. Reads between one and + * five bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataOutput#writeVInt(int) + */ + public int readVInt() throws IOException { + byte b = readByte(); + int i = b & 0x7F; + for (int shift = 7; (b & 0x80) != 0; shift += 7) { + b = readByte(); + i |= (b & 0x7F) << shift; + } + return i; + } + + /** Reads eight bytes and returns a long. + * @see DataOutput#writeLong(long) + */ + public long readLong() throws IOException { + return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); + } + + /** Reads a long stored in variable-length format. Reads between one and + * nine bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. 
*/ + public long readVLong() throws IOException { + byte b = readByte(); + long i = b & 0x7F; + for (int shift = 7; (b & 0x80) != 0; shift += 7) { + b = readByte(); + i |= (b & 0x7FL) << shift; + } + return i; + } + + /** Call this if readString should read characters stored + * in the old modified UTF8 format (length in java chars + * and java's modified UTF8 encoding). This is used for + * indices written pre-2.4 See LUCENE-510 for details. */ + public void setModifiedUTF8StringsMode() { + preUTF8Strings = true; + } + + /** Reads a string. + * @see DataOutput#writeString(String) + */ + public String readString() throws IOException { + if (preUTF8Strings) + return readModifiedUTF8String(); + int length = readVInt(); + if (bytes == null || length > bytes.length) + bytes = new byte[(int) (length*1.25)]; + readBytes(bytes, 0, length); + return new String(bytes, 0, length, "UTF-8"); + } + + private String readModifiedUTF8String() throws IOException { + int length = readVInt(); + if (chars == null || length > chars.length) + chars = new char[length]; + readChars(chars, 0, length); + return new String(chars, 0, length); + } + + /** Reads Lucene's old "modified UTF-8" encoded + * characters into an array. + * @param buffer the array to read characters into + * @param start the offset in the array to start storing characters + * @param length the number of characters to read + * @see DataOutput#writeChars(String,int,int) + * @deprecated -- please use readString or readBytes + * instead, and construct the string + * from those utf8 bytes + */ + @Deprecated + public void readChars(char[] buffer, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + byte b = readByte(); + if ((b & 0x80) == 0) + buffer[i] = (char)(b & 0x7F); + else if ((b & 0xE0) != 0xE0) { + buffer[i] = (char)(((b & 0x1F) << 6) + | (readByte() & 0x3F)); + } else { + buffer[i] = (char)(((b & 0x0F) << 12) + | ((readByte() & 0x3F) << 6) + | (readByte() & 0x3F)); + } + } + } + + /** + * Expert + * + * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still + * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything + * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine + * how many more bytes to read + * @param length The number of chars to read + * @deprecated this method operates on old "modified utf8" encoded + * strings + */ + @Deprecated + public void skipChars(int length) throws IOException{ + for (int i = 0; i < length; i++) { + byte b = readByte(); + if ((b & 0x80) == 0){ + //do nothing, we only need one byte + } else if ((b & 0xE0) != 0xE0) { + readByte();//read an additional byte + } else { + //read two additional bytes. + readByte(); + readByte(); + } + } + } + + /** Returns a clone of this stream. + * + *
Clones of a stream access the same data, and are positioned at the same + * point as the stream they were cloned from. + * + *
Expert: Subclasses must ensure that clones may be positioned at + * different points in the input from each other and from the stream they + * were cloned from. + */ + @Override + public Object clone() { + DataInput clone = null; + try { + clone = (DataInput)super.clone(); + } catch (CloneNotSupportedException e) {} + + clone.bytes = null; + clone.chars = null; + + return clone; + } + + public Map readStringStringMap() throws IOException { + final Map map = new HashMap(); + final int count = readInt(); + for(int i=0;i> 24)); + writeByte((byte)(i >> 16)); + writeByte((byte)(i >> 8)); + writeByte((byte) i); + } + + /** Writes an int in a variable-length format. Writes between one and + * five bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataInput#readVInt() + */ + public void writeVInt(int i) throws IOException { + while ((i & ~0x7F) != 0) { + writeByte((byte)((i & 0x7f) | 0x80)); + i >>>= 7; + } + writeByte((byte)i); + } + + /** Writes a long as eight bytes. + * @see DataInput#readLong() + */ + public void writeLong(long i) throws IOException { + writeInt((int) (i >> 32)); + writeInt((int) i); + } + + /** Writes an long in a variable-length format. Writes between one and five + * bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataInput#readVLong() + */ + public void writeVLong(long i) throws IOException { + while ((i & ~0x7F) != 0) { + writeByte((byte)((i & 0x7f) | 0x80)); + i >>>= 7; + } + writeByte((byte)i); + } + + /** Writes a string. + * @see DataInput#readString() + */ + public void writeString(String s) throws IOException { + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); + writeVInt(utf8Result.length); + writeBytes(utf8Result.bytes, 0, utf8Result.length); + } + + /** Writes a sub sequence of characters from s as the old + * format (modified UTF-8 encoded bytes). + * @param s the source of the characters + * @param start the first character in the sequence + * @param length the number of characters in the sequence + * @deprecated -- please pre-convert to utf8 bytes + * instead or use {@link #writeString} + */ + @Deprecated + public void writeChars(String s, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + final int code = s.charAt(i); + if (code >= 0x01 && code <= 0x7F) + writeByte((byte)code); + else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else { + writeByte((byte)(0xE0 | (code >>> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); + } + } + } + + /** Writes a sub sequence of characters from char[] as + * the old format (modified UTF-8 encoded bytes). 
+ * @param s the source of the characters + * @param start the first character in the sequence + * @param length the number of characters in the sequence + * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} + */ + @Deprecated + public void writeChars(char[] s, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + final int code = s[i]; + if (code >= 0x01 && code <= 0x7F) + writeByte((byte)code); + else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else { + writeByte((byte)(0xE0 | (code >>> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); + } + } + } + + private static int COPY_BUFFER_SIZE = 16384; + private byte[] copyBuffer; + + /** Copy numBytes bytes from input to ourself. */ + public void copyBytes(DataInput input, long numBytes) throws IOException { + assert numBytes >= 0: "numBytes=" + numBytes; + long left = numBytes; + if (copyBuffer == null) + copyBuffer = new byte[COPY_BUFFER_SIZE]; + while(left > 0) { + final int toCopy; + if (left > COPY_BUFFER_SIZE) + toCopy = COPY_BUFFER_SIZE; + else + toCopy = (int) left; + input.readBytes(copyBuffer, 0, toCopy); + writeBytes(copyBuffer, 0, toCopy); + left -= toCopy; + } + } + + public void writeStringStringMap(Map map) throws IOException { + if (map == null) { + writeInt(0); + } else { + writeInt(map.size()); + for(final Map.Entry entry: map.entrySet()) { + writeString(entry.getKey()); + writeString(entry.getValue()); + } + } + } +} diff --git a/lucene/backwards/src/java/org/apache/lucene/store/IndexInput.java b/lucene/backwards/src/java/org/apache/lucene/store/IndexInput.java index a45e9837005..1268c93191d 100644 --- a/lucene/backwards/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/backwards/src/java/org/apache/lucene/store/IndexInput.java @@ -17,180 +17,14 @@ package org.apache.lucene.store; * limitations under the License. */ -import java.io.IOException; import java.io.Closeable; -import java.util.Map; -import java.util.HashMap; +import java.io.IOException; /** Abstract base class for input from a file in a {@link Directory}. A * random-access input stream. Used for all Lucene index input operations. * @see Directory */ -public abstract class IndexInput implements Cloneable,Closeable { - private byte[] bytes; // used by readString() - private char[] chars; // used by readModifiedUTF8String() - private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format - - /** Reads and returns a single byte. - * @see IndexOutput#writeByte(byte) - */ - public abstract byte readByte() throws IOException; - - /** Reads a specified number of bytes into an array at the specified offset. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @see IndexOutput#writeBytes(byte[],int) - */ - public abstract void readBytes(byte[] b, int offset, int len) - throws IOException; - - /** Reads a specified number of bytes into an array at the - * specified offset with control over whether the read - * should be buffered (callers who have their own buffer - * should pass in "false" for useBuffer). Currently only - * {@link BufferedIndexInput} respects this parameter. 
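The methods being removed from the backwards-compatibility IndexInput here are the ones that move into the shared DataInput base class; further down in this hunk IndexInput is re-declared as extending DataInput, so a concrete source only has to supply the byte-level readByte()/readBytes() primitives and inherits the higher-level readers. A standalone sketch of that pattern, with hypothetical class names rather than the actual Lucene API:

import java.io.IOException;
import java.nio.charset.StandardCharsets;

// Subclasses supply only the two byte-level primitives; the higher-level
// readers are derived from them, mirroring the DataInput/IndexInput split.
abstract class SimpleDataInput {
  public abstract byte readByte() throws IOException;
  public abstract void readBytes(byte[] b, int offset, int len) throws IOException;

  public int readVInt() throws IOException {
    byte b = readByte();
    int i = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = readByte();
      i |= (b & 0x7F) << shift;
    }
    return i;
  }

  // New-style string: vInt byte length followed by that many UTF-8 bytes.
  public String readString() throws IOException {
    int length = readVInt();
    byte[] bytes = new byte[length];
    readBytes(bytes, 0, length);
    return new String(bytes, 0, length, StandardCharsets.UTF_8);
  }
}

// Minimal concrete source reading from an in-memory byte array.
class InMemoryInput extends SimpleDataInput {
  private final byte[] data;
  private int pos;

  InMemoryInput(byte[] data) { this.data = data; }

  @Override
  public byte readByte() { return data[pos++]; }

  @Override
  public void readBytes(byte[] b, int offset, int len) {
    System.arraycopy(data, pos, b, offset, len);
    pos += len;
  }
}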
- * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @param useBuffer set to false if the caller will handle - * buffering. - * @see IndexOutput#writeBytes(byte[],int) - */ - public void readBytes(byte[] b, int offset, int len, boolean useBuffer) - throws IOException - { - // Default to ignoring useBuffer entirely - readBytes(b, offset, len); - } - - /** Reads four bytes and returns an int. - * @see IndexOutput#writeInt(int) - */ - public int readInt() throws IOException { - return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) - | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); - } - - /** Reads an int stored in variable-length format. Reads between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexOutput#writeVInt(int) - */ - public int readVInt() throws IOException { - byte b = readByte(); - int i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7F) << shift; - } - return i; - } - - /** Reads eight bytes and returns a long. - * @see IndexOutput#writeLong(long) - */ - public long readLong() throws IOException { - return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); - } - - /** Reads a long stored in variable-length format. Reads between one and - * nine bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. */ - public long readVLong() throws IOException { - byte b = readByte(); - long i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7FL) << shift; - } - return i; - } - - /** Call this if readString should read characters stored - * in the old modified UTF8 format (length in java chars - * and java's modified UTF8 encoding). This is used for - * indices written pre-2.4 See LUCENE-510 for details. */ - public void setModifiedUTF8StringsMode() { - preUTF8Strings = true; - } - - /** Reads a string. - * @see IndexOutput#writeString(String) - */ - public String readString() throws IOException { - if (preUTF8Strings) - return readModifiedUTF8String(); - int length = readVInt(); - if (bytes == null || length > bytes.length) - bytes = new byte[(int) (length*1.25)]; - readBytes(bytes, 0, length); - return new String(bytes, 0, length, "UTF-8"); - } - - private String readModifiedUTF8String() throws IOException { - int length = readVInt(); - if (chars == null || length > chars.length) - chars = new char[length]; - readChars(chars, 0, length); - return new String(chars, 0, length); - } - - /** Reads Lucene's old "modified UTF-8" encoded - * characters into an array. 
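The deprecated readChars/skipChars above handle the pre-2.4 "modified UTF-8" string format, which encodes each UTF-16 code unit independently: U+0000 becomes two bytes, and a character outside the Basic Multilingual Plane is written as two 3-byte sequences (one per surrogate) instead of one 4-byte standard UTF-8 sequence. A small illustrative encoder (plain Java mirroring the branching of the deprecated writeChars, not part of this code base):

import java.io.ByteArrayOutputStream;

public class ModifiedUtf8Demo {

  // Same per-char branching as the deprecated writeChars: 1 byte for
  // U+0001..U+007F, 2 bytes for U+0080..U+07FF and for U+0000, 3 bytes otherwise.
  static byte[] encodeChar(char c) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int code = c;
    if (code >= 0x01 && code <= 0x7F) {
      out.write(code);
    } else if ((code >= 0x80 && code <= 0x7FF) || code == 0) {
      out.write(0xC0 | (code >> 6));
      out.write(0x80 | (code & 0x3F));
    } else {
      out.write(0xE0 | (code >>> 12));
      out.write(0x80 | ((code >> 6) & 0x3F));
      out.write(0x80 | (code & 0x3F));
    }
    return out.toByteArray();
  }

  public static void main(String[] args) {
    System.out.println(encodeChar('\u0000').length); // 2 -- standard UTF-8 would use 1
    System.out.println(encodeChar('A').length);      // 1
    System.out.println(encodeChar('\u20AC').length); // 3 -- the Euro sign
    // A supplementary character is two UTF-16 code units, so it would be
    // written as 3 + 3 = 6 bytes here, versus 4 bytes in standard UTF-8.
  }
}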
- * @param buffer the array to read characters into - * @param start the offset in the array to start storing characters - * @param length the number of characters to read - * @see IndexOutput#writeChars(String,int,int) - * @deprecated -- please use readString or readBytes - * instead, and construct the string - * from those utf8 bytes - */ - public void readChars(char[] buffer, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - byte b = readByte(); - if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); - else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else - buffer[i] = (char)(((b & 0x0F) << 12) - | ((readByte() & 0x3F) << 6) - | (readByte() & 0x3F)); - } - } - - /** - * Expert - * - * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still - * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything - * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine - * how many more bytes to read - * @param length The number of chars to read - * @deprecated this method operates on old "modified utf8" encoded - * strings - */ - public void skipChars(int length) throws IOException{ - for (int i = 0; i < length; i++) { - byte b = readByte(); - if ((b & 0x80) == 0){ - //do nothing, we only need one byte - } - else if ((b & 0xE0) != 0xE0) { - readByte();//read an additional byte - } else{ - //read two additional bytes. - readByte(); - readByte(); - } - } - } - - +public abstract class IndexInput extends DataInput implements Cloneable,Closeable { /** Closes the stream to further operations. */ public abstract void close() throws IOException; @@ -207,38 +41,4 @@ public abstract class IndexInput implements Cloneable,Closeable { /** The number of bytes in the file. */ public abstract long length(); - - /** Returns a clone of this stream. - * - *

<p>Clones of a stream access the same data, and are positioned at the same - * point as the stream they were cloned from. - * - *

Expert: Subclasses must ensure that clones may be positioned at - * different points in the input from each other and from the stream they - * were cloned from. - */ - @Override - public Object clone() { - IndexInput clone = null; - try { - clone = (IndexInput)super.clone(); - } catch (CloneNotSupportedException e) {} - - clone.bytes = null; - clone.chars = null; - - return clone; - } - - public Map readStringStringMap() throws IOException { - final Map map = new HashMap(); - final int count = readInt(); - for(int i=0;i> 24)); - writeByte((byte)(i >> 16)); - writeByte((byte)(i >> 8)); - writeByte((byte) i); - } - - /** Writes an int in a variable-length format. Writes between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVInt() - */ - public void writeVInt(int i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a long as eight bytes. - * @see IndexInput#readLong() - */ - public void writeLong(long i) throws IOException { - writeInt((int) (i >> 32)); - writeInt((int) i); - } - - /** Writes an long in a variable-length format. Writes between one and five - * bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVLong() - */ - public void writeVLong(long i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a string. - * @see IndexInput#readString() - */ - public void writeString(String s) throws IOException { - UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); - writeVInt(utf8Result.length); - writeBytes(utf8Result.result, 0, utf8Result.length); - } - - /** Writes a sub sequence of characters from s as the old - * format (modified UTF-8 encoded bytes). - * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes - * instead or use {@link #writeString} - */ - public void writeChars(String s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = (int)s.charAt(i); - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - /** Writes a sub sequence of characters from char[] as - * the old format (modified UTF-8 encoded bytes). 
- * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} - */ - public void writeChars(char[] s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = (int)s[i]; - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - private static int COPY_BUFFER_SIZE = 16384; - private byte[] copyBuffer; - - /** Copy numBytes bytes from input to ourself. */ - public void copyBytes(IndexInput input, long numBytes) throws IOException { - assert numBytes >= 0: "numBytes=" + numBytes; - long left = numBytes; - if (copyBuffer == null) - copyBuffer = new byte[COPY_BUFFER_SIZE]; - while(left > 0) { - final int toCopy; - if (left > COPY_BUFFER_SIZE) - toCopy = COPY_BUFFER_SIZE; - else - toCopy = (int) left; - input.readBytes(copyBuffer, 0, toCopy); - writeBytes(copyBuffer, 0, toCopy); - left -= toCopy; - } - } +public abstract class IndexOutput extends DataOutput implements Closeable { /** Forces any buffered output to be written. */ public abstract void flush() throws IOException; @@ -208,17 +57,5 @@ public abstract class IndexOutput implements Closeable { * undefined. Otherwise the file is truncated. * @param length file length */ - public void setLength(long length) throws IOException {}; - - public void writeStringStringMap(Map map) throws IOException { - if (map == null) { - writeInt(0); - } else { - writeInt(map.size()); - for(final Map.Entry entry: map.entrySet()) { - writeString(entry.getKey()); - writeString(entry.getValue()); - } - } - } + public void setLength(long length) throws IOException {} } diff --git a/lucene/backwards/src/java/org/apache/lucene/util/BytesRef.java b/lucene/backwards/src/java/org/apache/lucene/util/BytesRef.java new file mode 100644 index 00000000000..1f3c6ff7a60 --- /dev/null +++ b/lucene/backwards/src/java/org/apache/lucene/util/BytesRef.java @@ -0,0 +1,27 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// stub for tests only +public class BytesRef { + public BytesRef(int capacity) {} + public BytesRef() {} + public byte[] bytes; + public int offset; + public int length; +}; diff --git a/lucene/backwards/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/backwards/src/java/org/apache/lucene/util/UnicodeUtil.java index 6f219e6eaf0..6d94bfe0ad2 100644 --- a/lucene/backwards/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/backwards/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -106,6 +106,10 @@ final public class UnicodeUtil { } } + // stubs for tests only + public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result) {} + public static void UTF16toUTF8(CharSequence s, int offset, int length, BytesRef result) {} + /** Encode characters from a char[] source, starting at * offset and stopping when the character 0xffff is seen. * Returns the number of bytes written to bytesOut. */ @@ -223,7 +227,7 @@ final public class UnicodeUtil { /** Encode characters from this String, starting at offset * for length characters. Returns the number of bytes * written to bytesOut. */ - public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) { + public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, UTF8Result result) { final int end = offset + length; byte[] out = result.result; diff --git a/lucene/backwards/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java b/lucene/backwards/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java deleted file mode 100644 index 9a48a07449a..00000000000 --- a/lucene/backwards/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java +++ /dev/null @@ -1,73 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
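The BytesRef stub above (a byte[] plus offset and length) and the new UTF16toUTF8 overloads let the flex APIs pass term bytes around as slices of a shared buffer instead of allocating a String per term; later hunks in this patch (for example the TestSort parsers) read the first term byte as term.bytes[term.offset] for exactly that reason. A rough standalone illustration of that slice convention, using hypothetical names rather than the real classes:

import java.nio.charset.StandardCharsets;

// Three fields mirroring the BytesRef stub: a shared buffer plus a window into it.
class ByteSlice {
  byte[] bytes;
  int offset;
  int length;
}

public class SliceDemo {
  public static void main(String[] args) {
    byte[] block = "alphabetagamma".getBytes(StandardCharsets.UTF_8);

    // A "term" is only a window into the shared block; no bytes are copied.
    ByteSlice term = new ByteSlice();
    term.bytes = block;
    term.offset = 5;   // start of "beta"
    term.length = 4;

    // Same access pattern the patched FieldCache parsers use: term.bytes[term.offset].
    System.out.println((char) term.bytes[term.offset]);                                            // b
    System.out.println(new String(term.bytes, term.offset, term.length, StandardCharsets.UTF_8));  // beta
  }
}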
- */ - -import org.apache.lucene.util.NumericUtils; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; - -public class TestNumericTokenStream extends BaseTokenStreamTestCase { - - static final long lvalue = 4573245871874382L; - static final int ivalue = 123456; - - public void testLongStream() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - // use getAttribute to test if attributes really exist, if not an IAE will be throwed - final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); - final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class); - for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertTrue("New token is available", stream.incrementToken()); - assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), termAtt.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); - } - assertFalse("No more tokens available", stream.incrementToken()); - } - - public void testIntStream() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); - // use getAttribute to test if attributes really exist, if not an IAE will be throwed - final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); - final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class); - for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertTrue("New token is available", stream.incrementToken()); - assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), termAtt.term()); - assertEquals("Type correct", (shift == 0) ? 
NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); - } - assertFalse("No more tokens available", stream.incrementToken()); - } - - public void testNotInitialized() throws Exception { - final NumericTokenStream stream=new NumericTokenStream(); - - try { - stream.reset(); - fail("reset() should not succeed."); - } catch (IllegalStateException e) { - // pass - } - - try { - stream.incrementToken(); - fail("incrementToken() should not succeed."); - } catch (IllegalStateException e) { - // pass - } - } - -} diff --git a/lucene/backwards/src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java b/lucene/backwards/src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java index 678a5ca6c91..7072dc7c90c 100644 --- a/lucene/backwards/src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java +++ b/lucene/backwards/src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java @@ -107,10 +107,10 @@ public class TestTermAttributeImpl extends LuceneTestCase { char[] b = {'a', 'l', 'o', 'h', 'a'}; TermAttributeImpl t = new TermAttributeImpl(); t.setTermBuffer(b, 0, 5); - assertEquals("term=aloha", t.toString()); + assertEquals("aloha", t.toString()); t.setTermBuffer("hi there"); - assertEquals("term=hi there", t.toString()); + assertEquals("hi there", t.toString()); } public void testMixedStringArray() throws Exception { diff --git a/lucene/backwards/src/test/org/apache/lucene/index/TestDoc.java b/lucene/backwards/src/test/org/apache/lucene/index/TestDoc.java index 6b94ff9fd7f..045a5226069 100644 --- a/lucene/backwards/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/backwards/src/test/org/apache/lucene/index/TestDoc.java @@ -35,6 +35,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.index.codecs.CodecProvider; /** JUnit adaptation of an older test case DocTest. 
*/ @@ -180,20 +181,24 @@ public class TestDoc extends LuceneTestCase { SegmentReader r1 = SegmentReader.get(true, si1, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); SegmentReader r2 = SegmentReader.get(true, si2, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); - SegmentMerger merger = new SegmentMerger(si1.dir, merged); + SegmentMerger merger = new SegmentMerger(si1.dir, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL, merged, null, CodecProvider.getDefault()); merger.add(r1); merger.add(r2); merger.merge(); merger.closeReaders(); + final SegmentInfo info = new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, + useCompoundFile, true, -1, null, false, merger.hasProx(), + merger.getCodec()); + if (useCompoundFile) { - List filesToDelete = merger.createCompoundFile(merged + ".cfs"); + List filesToDelete = merger.createCompoundFile(merged + ".cfs", info); for (Iterator iter = filesToDelete.iterator(); iter.hasNext();) si1.dir.deleteFile((String) iter.next()); } - return new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, useCompoundFile, true); + return info; } diff --git a/lucene/backwards/src/test/org/apache/lucene/index/TestIndexReader.java b/lucene/backwards/src/test/org/apache/lucene/index/TestIndexReader.java index 3c6160ec255..85dedd0151c 100644 --- a/lucene/backwards/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/lucene/backwards/src/test/org/apache/lucene/index/TestIndexReader.java @@ -986,29 +986,7 @@ public class TestIndexReader extends LuceneTestCase // new IndexFileDeleter, have it delete // unreferenced files, then verify that in fact // no files were deleted: - String[] startFiles = dir.listAll(); - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); - String[] endFiles = dir.listAll(); - - Arrays.sort(startFiles); - Arrays.sort(endFiles); - - //for(int i=0;i= 0); } diff --git a/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermDocs.java b/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermDocs.java index cce81c6ed8c..ae17e40e67b 100644 --- a/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermDocs.java +++ b/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermDocs.java @@ -56,14 +56,13 @@ public class TestSegmentTermDocs extends LuceneTestCase { SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); assertEquals(indexDivisor, reader.getTermInfosIndexDivisor()); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); - if (segTermDocs.next() == true) - { - int docId = segTermDocs.doc(); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); + if (termDocs.next() == true) { + int docId = termDocs.doc(); assertTrue(docId == 0); - int freq = segTermDocs.freq(); + int freq = termDocs.freq(); assertTrue(freq == 3); } reader.close(); @@ -78,20 +77,20 @@ public class TestSegmentTermDocs extends LuceneTestCase { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term("textField2", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + 
assertTrue(termDocs != null); + termDocs.seek(new Term("textField2", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term("junk", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("junk", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } } diff --git a/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermEnum.java b/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermEnum.java index 89ad1e8fd87..4b684cb5d9c 100644 --- a/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermEnum.java +++ b/lucene/backwards/src/test/org/apache/lucene/index/TestSegmentTermEnum.java @@ -61,23 +61,6 @@ public class TestSegmentTermEnum extends LuceneTestCase verifyDocFreq(); } - public void testPrevTermAtEnd() throws IOException - { - Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); - addDoc(writer, "aaa bbb"); - writer.close(); - SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); - SegmentTermEnum termEnum = (SegmentTermEnum) reader.terms(); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.term().text()); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.prev().text()); - assertEquals("bbb", termEnum.term().text()); - assertFalse(termEnum.next()); - assertEquals("bbb", termEnum.prev().text()); - } - private void verifyDocFreq() throws IOException { diff --git a/lucene/backwards/src/test/org/apache/lucene/index/TestStressIndexing2.java b/lucene/backwards/src/test/org/apache/lucene/index/TestStressIndexing2.java index 063f0ccca5f..0f02bcdaeb5 100644 --- a/lucene/backwards/src/test/org/apache/lucene/index/TestStressIndexing2.java +++ b/lucene/backwards/src/test/org/apache/lucene/index/TestStressIndexing2.java @@ -352,7 +352,7 @@ public class TestStressIndexing2 extends LuceneTestCase { if (!termEnum1.next()) break; } - // iterate until we get some docs + // iterate until we get some docs int len2; for(;;) { len2=0; @@ -369,12 +369,12 @@ public class TestStressIndexing2 extends LuceneTestCase { if (!termEnum2.next()) break; } - if (!hasDeletes) - assertEquals(termEnum1.docFreq(), termEnum2.docFreq()); - assertEquals(len1, len2); if (len1==0) break; // no more terms + if (!hasDeletes) + assertEquals(termEnum1.docFreq(), termEnum2.docFreq()); + assertEquals(term1, term2); // sort info2 to get it into ascending docid diff --git a/lucene/backwards/src/test/org/apache/lucene/search/CheckHits.java b/lucene/backwards/src/test/org/apache/lucene/search/CheckHits.java index 6f42684233a..1638289b70d 100644 --- a/lucene/backwards/src/test/org/apache/lucene/search/CheckHits.java +++ b/lucene/backwards/src/test/org/apache/lucene/search/CheckHits.java @@ -33,7 +33,7 @@ public class CheckHits { * different order of operations from the actual scoring method ... 
* this allows for a small amount of variation */ - public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.00005f; + public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.0002f; /** * Tests that all documents up to maxDoc which are *not* in the diff --git a/lucene/backwards/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java b/lucene/backwards/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java index c4b8d369d05..7d6234de76a 100644 --- a/lucene/backwards/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java +++ b/lucene/backwards/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java @@ -65,7 +65,7 @@ public class TestCachingWrapperFilter extends LuceneTestCase { if (originalSet.isCacheable()) { assertEquals("Cached DocIdSet must be of same class like uncached, if cacheable", originalSet.getClass(), cachedSet.getClass()); } else { - assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI); + assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI || cachedSet == DocIdSet.EMPTY_DOCIDSET); } } diff --git a/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java b/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java index d95d03ffb03..8587bb80043 100644 --- a/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java +++ b/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java @@ -230,6 +230,8 @@ public class TestNumericRangeQuery32 extends LuceneTestCase { testRightOpenRange(2); } + /* TESTs disabled, because incompatible API change in 3.1/flex: + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="field"+precisionStep; @@ -298,6 +300,8 @@ public class TestNumericRangeQuery32 extends LuceneTestCase { testRandomTrieAndClassicRangeQuery(Integer.MAX_VALUE); } + */ + private void testRangeSplit(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="ascfield"+precisionStep; @@ -443,37 +447,39 @@ public class TestNumericRangeQuery32 extends LuceneTestCase { assertFalse(q2.equals(q1)); } - private void testEnum(int lower, int upper) throws Exception { - NumericRangeQuery q = NumericRangeQuery.newIntRange("field4", 4, lower, upper, true, true); - FilteredTermEnum termEnum = q.getEnum(searcher.getIndexReader()); - try { - int count = 0; - do { - final Term t = termEnum.term(); - if (t != null) { - final int val = NumericUtils.prefixCodedToInt(t.text()); - assertTrue("value not in bounds", val >= lower && val <= upper); - count++; - } else break; - } while (termEnum.next()); - assertFalse(termEnum.next()); - System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + "] contained " + count + " terms."); - } finally { - termEnum.close(); - } - } +// Removed for now - NumericRangeQuery does not currently implement getEnum - public void testEnum() throws Exception { - int count=3000; - int lower=(distance*3/2)+startOffset, upper=lower + count*distance + (distance/3); - // test enum with values - testEnum(lower, upper); - // test empty enum - testEnum(upper, lower); - // test empty enum outside of bounds - lower = distance*noDocs+startOffset; - upper = 2 * lower; - testEnum(lower, upper); - } +// private void testEnum(int lower, int upper) throws Exception { +// NumericRangeQuery q = 
NumericRangeQuery.newIntRange("field4", 4, lower, upper, true, true); +// FilteredTermEnum termEnum = q.getEnum(searcher.getIndexReader()); +// try { +// int count = 0; +// do { +// final Term t = termEnum.term(); +// if (t != null) { +// final int val = NumericUtils.prefixCodedToInt(t.text()); +// assertTrue("value not in bounds", val >= lower && val <= upper); +// count++; +// } else break; +// } while (termEnum.next()); +// assertFalse(termEnum.next()); +// System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + "] contained " + count + " terms."); +// } finally { +// termEnum.close(); +// } +// } +// +// public void testEnum() throws Exception { +// int count=3000; +// int lower=(distance*3/2)+startOffset, upper=lower + count*distance + (distance/3); +// // test enum with values +// testEnum(lower, upper); +// // test empty enum +// testEnum(upper, lower); +// // test empty enum outside of bounds +// lower = distance*noDocs+startOffset; +// upper = 2 * lower; +// testEnum(lower, upper); +// } } diff --git a/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java b/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java index bc9bce8549c..e574501a6e4 100644 --- a/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java +++ b/lucene/backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java @@ -245,6 +245,8 @@ public class TestNumericRangeQuery64 extends LuceneTestCase { testRightOpenRange(2); } + /* TESTs disabled, because incompatible API change in 3.1/flex: + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="field"+precisionStep; @@ -317,6 +319,8 @@ public class TestNumericRangeQuery64 extends LuceneTestCase { testRandomTrieAndClassicRangeQuery(Integer.MAX_VALUE); } + */ + private void testRangeSplit(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="ascfield"+precisionStep; diff --git a/lucene/backwards/src/test/org/apache/lucene/search/TestSort.java b/lucene/backwards/src/test/org/apache/lucene/search/TestSort.java index dc318926eb0..3df140ae789 100644 --- a/lucene/backwards/src/test/org/apache/lucene/search/TestSort.java +++ b/lucene/backwards/src/test/org/apache/lucene/search/TestSort.java @@ -35,6 +35,7 @@ import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.LockObtainFailedException; @@ -332,20 +333,28 @@ public class TestSort extends LuceneTestCase implements Serializable { FieldCache fc = FieldCache.DEFAULT; - sort.setSort (new SortField ("parser", new FieldCache.IntParser(){ - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + sort.setSort ( new SortField ("parser", new FieldCache.IntParser(){ + public final int parseInt(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; + } + }), SortField.FIELD_DOC); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " IntParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.FloatParser(){ - public final 
float parseFloat(final String val) { - return (float) Math.sqrt( val.charAt(0) ); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.FloatParser(){ + public final float parseFloat(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final float parseFloat(final BytesRef term) { + return (float) Math.sqrt( term.bytes[term.offset] ); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " FloatParser"); fc.purgeAllCaches(); @@ -354,34 +363,49 @@ public class TestSort extends LuceneTestCase implements Serializable { public final long parseLong(final String val) { return (val.charAt(0)-'A') * 1234567890L; } - }), SortField.FIELD_DOC ); + public final long parseLong(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 1234567890L; + } + }), SortField.FIELD_DOC); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " LongParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.DoubleParser(){ - public final double parseDouble(final String val) { - return Math.pow( val.charAt(0), (val.charAt(0)-'A') ); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.DoubleParser(){ + public final double parseDouble(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final double parseDouble(final BytesRef term) { + return Math.pow( term.bytes[term.offset], (term.bytes[term.offset]-'A') ); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " DoubleParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.ByteParser(){ - public final byte parseByte(final String val) { - return (byte) (val.charAt(0)-'A'); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.ByteParser(){ + public final byte parseByte(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final byte parseByte(final BytesRef term) { + return (byte) (term.bytes[term.offset]-'A'); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " ByteParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.ShortParser(){ - public final short parseShort(final String val) { - return (short) (val.charAt(0)-'A'); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.ShortParser(){ + public final short parseShort(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final short parseShort(final BytesRef term) { + return (short) (term.bytes[term.offset]-'A'); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " ShortParser"); fc.purgeAllCaches(); @@ -439,8 +463,12 @@ public class TestSort extends LuceneTestCase implements Serializable { @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { docValues = FieldCache.DEFAULT.getInts(reader, "parser", new FieldCache.IntParser() { - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + public final int parseInt(final String term) { + // dummy + return 0; + } + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; } }); } diff --git a/lucene/backwards/src/test/org/apache/lucene/search/TestTermScorer.java 
b/lucene/backwards/src/test/org/apache/lucene/search/TestTermScorer.java index 1dae462694f..3614460bc30 100644 --- a/lucene/backwards/src/test/org/apache/lucene/search/TestTermScorer.java +++ b/lucene/backwards/src/test/org/apache/lucene/search/TestTermScorer.java @@ -72,9 +72,9 @@ public class TestTermScorer extends LuceneTestCase Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); + //we have 2 documents with the term all in them, one document for all the other values final List docs = new ArrayList(); //must call next first @@ -138,9 +138,9 @@ public class TestTermScorer extends LuceneTestCase Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); + assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertTrue("score is not correct", ts.score() == 1.6931472f); assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); @@ -155,9 +155,9 @@ public class TestTermScorer extends LuceneTestCase Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); + assertTrue("Didn't skip", ts.advance(3) != DocIdSetIterator.NO_MORE_DOCS); //The next doc should be doc 5 assertTrue("doc should be number 5", ts.docID() == 5); diff --git a/lucene/backwards/src/test/org/apache/lucene/search/TestWildcard.java b/lucene/backwards/src/test/org/apache/lucene/search/TestWildcard.java index 36e5d1c2d34..6e2f497ba1e 100644 --- a/lucene/backwards/src/test/org/apache/lucene/search/TestWildcard.java +++ b/lucene/backwards/src/test/org/apache/lucene/search/TestWildcard.java @@ -114,6 +114,7 @@ public class TestWildcard * rewritten to a single PrefixQuery. The boost and rewriteMethod should be * preserved. */ + /* disable because rewrites changed in flex/trunk public void testPrefixTerm() throws IOException { RAMDirectory indexStore = getIndexStore("field", new String[]{"prefix", "prefixx"}); IndexSearcher searcher = new IndexSearcher(indexStore, true); @@ -145,7 +146,7 @@ public class TestWildcard expected.setRewriteMethod(wq.getRewriteMethod()); expected.setBoost(wq.getBoost()); assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - } + }*/ /** * Tests Wildcard queries with an asterisk. 
diff --git a/lucene/backwards/src/test/org/apache/lucene/util/TestAttributeSource.java b/lucene/backwards/src/test/org/apache/lucene/util/TestAttributeSource.java index d76cce57381..28c5f57097c 100644 --- a/lucene/backwards/src/test/org/apache/lucene/util/TestAttributeSource.java +++ b/lucene/backwards/src/test/org/apache/lucene/util/TestAttributeSource.java @@ -78,22 +78,22 @@ public class TestAttributeSource extends LuceneTestCase { public void testCloneAttributes() { final AttributeSource src = new AttributeSource(); - final TermAttribute termAtt = src.addAttribute(TermAttribute.class); + final FlagsAttribute flagsAtt = src.addAttribute(FlagsAttribute.class); final TypeAttribute typeAtt = src.addAttribute(TypeAttribute.class); - termAtt.setTermBuffer("TestTerm"); + flagsAtt.setFlags(1234); typeAtt.setType("TestType"); final AttributeSource clone = src.cloneAttributes(); final Iterator> it = clone.getAttributeClassesIterator(); - assertEquals("TermAttribute must be the first attribute", TermAttribute.class, it.next()); + assertEquals("FlagsAttribute must be the first attribute", FlagsAttribute.class, it.next()); assertEquals("TypeAttribute must be the second attribute", TypeAttribute.class, it.next()); assertFalse("No more attributes", it.hasNext()); - final TermAttribute termAtt2 = clone.getAttribute(TermAttribute.class); + final FlagsAttribute flagsAtt2 = clone.getAttribute(FlagsAttribute.class); final TypeAttribute typeAtt2 = clone.getAttribute(TypeAttribute.class); - assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt); + assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt); assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt); - assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt); + assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt); assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt); } diff --git a/lucene/backwards/src/test/org/apache/lucene/util/TestNumericUtils.java b/lucene/backwards/src/test/org/apache/lucene/util/TestNumericUtils.java index 6a85a1d887e..9c0ac8a919e 100644 --- a/lucene/backwards/src/test/org/apache/lucene/util/TestNumericUtils.java +++ b/lucene/backwards/src/test/org/apache/lucene/util/TestNumericUtils.java @@ -26,6 +26,8 @@ import java.util.Iterator; public class TestNumericUtils extends LuceneTestCase { + /* TESTs disabled, because incompatible API change in 3.1/flex: + public void testLongConversionAndOrdering() throws Exception { // generate a series of encoded longs, each numerical one bigger than the one before String last=null; @@ -131,6 +133,8 @@ public class TestNumericUtils extends LuceneTestCase { } } } + + */ public void testDoubles() throws Exception { double[] vals=new double[]{ diff --git a/lucene/build.xml b/lucene/build.xml index c5fed861cf8..f8a93c513e2 100644 --- a/lucene/build.xml +++ b/lucene/build.xml @@ -104,24 +104,24 @@ The source distribution does not contain sources of the previous Lucene Java ver - + - - + - - + + - + @@ -715,6 +715,41 @@ The source distribution does not contain sources of the previous Lucene Java ver + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lucene/common-build.xml b/lucene/common-build.xml index 16ae704c67e..2cf13b63d2b 100644 --- a/lucene/common-build.xml +++ b/lucene/common-build.xml @@ -119,6 +119,11 @@ + + + + + diff --git 
a/lucene/contrib/benchmark/sortBench.py b/lucene/contrib/benchmark/sortBench.py new file mode 100644 index 00000000000..420969da398 --- /dev/null +++ b/lucene/contrib/benchmark/sortBench.py @@ -0,0 +1,553 @@ +import types +import re +import time +import os +import shutil +import sys +import cPickle +import datetime + +# TODO +# - build wiki/random index as needed (balanced or not, varying # segs, docs) +# - verify step +# - run searches +# - get all docs query in here + +if sys.platform.lower().find('darwin') != -1: + osName = 'osx' +elif sys.platform.lower().find('win') != -1: + osName = 'windows' +elif sys.platform.lower().find('linux') != -1: + osName = 'linux' +else: + osName = 'unix' + +TRUNK_DIR = '/lucene/clean' +FLEX_DIR = '/lucene/flex.branch' + +DEBUG = False + +# let shell find it: +JAVA_COMMAND = 'java -Xms2048M -Xmx2048M -Xbatch -server' +#JAVA_COMMAND = 'java -Xms1024M -Xmx1024M -Xbatch -server -XX:+AggressiveOpts -XX:CompileThreshold=100 -XX:+UseFastAccessorMethods' + +INDEX_NUM_THREADS = 1 + +INDEX_NUM_DOCS = 5000000 + +LOG_DIR = 'logs' + +DO_BALANCED = False + +if osName == 'osx': + WIKI_FILE = '/x/lucene/enwiki-20090724-pages-articles.xml.bz2' + INDEX_DIR_BASE = '/lucene' +else: + WIKI_FILE = '/x/lucene/enwiki-20090724-pages-articles.xml.bz2' + INDEX_DIR_BASE = '/x/lucene' + +if DEBUG: + NUM_ROUND = 0 +else: + NUM_ROUND = 7 + +if 0: + print 'compile...' + if '-nocompile' not in sys.argv: + if os.system('ant compile > compile.log 2>&1') != 0: + raise RuntimeError('compile failed (see compile.log)') + +BASE_SEARCH_ALG = ''' +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory +work.dir = $INDEX$ +search.num.hits = $NUM_HITS$ +query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker +file.query.maker.file = queries.txt +print.hits.field = $PRINT_FIELD$ +log.queries=true +log.step=100000 + +$OPENREADER$ +{"XSearchWarm" $SEARCH$} + +# Turn off printing, after warming: +SetProp(print.hits.field,) + +$ROUNDS$ +CloseReader +RepSumByPrefRound XSearch +''' + +BASE_INDEX_ALG = ''' +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer + +$OTHER$ +deletion.policy = org.apache.lucene.benchmark.utils.NoDeletionPolicy +doc.tokenized = false +doc.body.tokenized = true +doc.stored = true +doc.body.stored = false +doc.term.vector = false +log.step.AddDoc=10000 + +directory=FSDirectory +autocommit=false +compound=false + +work.dir=$WORKDIR$ + +{ "BuildIndex" + - CreateIndex + $INDEX_LINE$ + - CommitIndex(dp0) + - CloseIndex + $DELETIONS$ +} + +RepSumByPrefRound BuildIndex +''' + +class RunAlgs: + + def __init__(self, resultsPrefix): + self.counter = 0 + self.results = [] + self.fOut = open('%s.txt' % resultsPrefix, 'wb') + + def makeIndex(self, label, dir, source, numDocs, balancedNumSegs=None, deletePcts=None): + + if source not in ('wiki', 'random'): + raise RuntimeError('source must be wiki or random') + + if dir is not None: + fullDir = '%s/contrib/benchmark' % dir + if DEBUG: + print ' chdir %s' % fullDir + os.chdir(fullDir) + + indexName = '%s.%s.nd%gM' % (source, label, numDocs/1000000.0) + if balancedNumSegs is not None: + indexName += '_balanced%d' % balancedNumSegs + fullIndexPath = '%s/%s' % (INDEX_DIR_BASE, indexName) + + if os.path.exists(fullIndexPath): + print 'Index %s already exists...' % fullIndexPath + return indexName + + print 'Now create index %s...' 
% fullIndexPath + + s = BASE_INDEX_ALG + + if source == 'wiki': + other = '''doc.index.props = true +content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource +docs.file=%s +''' % WIKI_FILE + #addDoc = 'AddDoc(1024)' + addDoc = 'AddDoc' + else: + other = '''doc.index.props = true +content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource +''' + addDoc = 'AddDoc' + if INDEX_NUM_THREADS > 1: + #other += 'doc.reuse.fields=false\n' + s = s.replace('$INDEX_LINE$', '[ { "AddDocs" %s > : %s } : %s' % \ + (addDoc, numDocs/INDEX_NUM_THREADS, INDEX_NUM_THREADS)) + else: + s = s.replace('$INDEX_LINE$', '{ "AddDocs" %s > : %s' % \ + (addDoc, numDocs)) + + s = s.replace('$WORKDIR$', fullIndexPath) + + if deletePcts is not None: + dp = '# Do deletions\n' + dp += 'OpenReader(false)\n' + for pct in deletePcts: + if pct != 0: + dp += 'DeleteByPercent(%g)\n' % pct + dp += 'CommitIndex(dp%g)\n' % pct + dp += 'CloseReader()\n' + else: + dp = '' + + s = s.replace('$DELETIONS$', dp) + + if balancedNumSegs is not None: + other += ''' merge.factor=1000 + max.buffered=%d + ram.flush.mb=2000 + ''' % (numDocs/balancedNumSegs) + else: + if source == 'random': + other += 'ram.flush.mb=1.0\n' + else: + other += 'ram.flush.mb=32.0\n' + + s = s.replace('$OTHER$', other) + + try: + self.runOne(dir, s, 'index_%s' % indexName, isIndex=True) + except: + if os.path.exists(fullIndexPath): + shutil.rmtree(fullIndexPath) + raise + return indexName + + def getLogPrefix(self, **dArgs): + l = dArgs.items() + l.sort() + s = '_'.join(['%s=%s' % tup for tup in l]) + s = s.replace(' ', '_') + s = s.replace('"', '_') + return s + + def runOne(self, dir, alg, logFileName, expectedMaxDocs=None, expectedNumDocs=None, queries=None, verify=False, isIndex=False): + + fullDir = '%s/contrib/benchmark' % dir + if DEBUG: + print ' chdir %s' % fullDir + os.chdir(fullDir) + + if queries is not None: + if type(queries) in types.StringTypes: + queries = [queries] + open('queries.txt', 'wb').write('\n'.join(queries)) + + if DEBUG: + algFile = 'tmp.alg' + else: + algFile = 'tmp.%s.alg' % os.getpid() + open(algFile, 'wb').write(alg) + + fullLogFileName = '%s/contrib/benchmark/%s/%s' % (dir, LOG_DIR, logFileName) + print ' log: %s' % fullLogFileName + if not os.path.exists(LOG_DIR): + print ' mkdir %s' % LOG_DIR + os.makedirs(LOG_DIR) + + command = '%s -classpath ../../build/classes/java:../../build/classes/demo:../../build/contrib/highlighter/classes/java:lib/commons-digester-1.7.jar:lib/commons-collections-3.1.jar:lib/commons-compress-1.0.jar:lib/commons-logging-1.0.4.jar:lib/commons-beanutils-1.7.0.jar:lib/xerces-2.9.0.jar:lib/xml-apis-2.9.0.jar:../../build/contrib/benchmark/classes/java org.apache.lucene.benchmark.byTask.Benchmark %s > "%s" 2>&1' % (JAVA_COMMAND, algFile, fullLogFileName) + + if DEBUG: + print 'command=%s' % command + + try: + t0 = time.time() + if os.system(command) != 0: + raise RuntimeError('FAILED') + t1 = time.time() + finally: + if not DEBUG: + os.remove(algFile) + + if isIndex: + s = open(fullLogFileName, 'rb').read() + if s.find('Exception in thread "') != -1 or s.find('at org.apache.lucene') != -1: + raise RuntimeError('alg hit exceptions') + return + + else: + + # Parse results: + bestQPS = None + count = 0 + nhits = None + numDocs = None + maxDocs = None + warmTime = None + r = re.compile('^ ([0-9]+): (.*)$') + topN = [] + + for line in open(fullLogFileName, 'rb').readlines(): + m = r.match(line.rstrip()) + if m is not None: + topN.append(m.group(2)) + if line.startswith('totalHits = 
'): + nhits = int(line[12:].strip()) + if line.startswith('maxDoc() = '): + maxDocs = int(line[12:].strip()) + if line.startswith('numDocs() = '): + numDocs = int(line[12:].strip()) + if line.startswith('XSearchWarm'): + v = line.strip().split() + warmTime = float(v[5]) + if line.startswith('XSearchReal'): + v = line.strip().split() + # print len(v), v + upto = 0 + i = 0 + qps = None + while i < len(v): + if v[i] == '-': + i += 1 + continue + else: + upto += 1 + i += 1 + if upto == 5: + qps = float(v[i-1].replace(',', '')) + break + + if qps is None: + raise RuntimeError('did not find qps') + + count += 1 + if bestQPS is None or qps > bestQPS: + bestQPS = qps + + if not verify: + if count != NUM_ROUND: + raise RuntimeError('did not find %s rounds (got %s)' % (NUM_ROUND, count)) + if warmTime is None: + raise RuntimeError('did not find warm time') + else: + bestQPS = 1.0 + warmTime = None + + if nhits is None: + raise RuntimeError('did not see "totalHits = XXX"') + + if maxDocs is None: + raise RuntimeError('did not see "maxDoc() = XXX"') + + if maxDocs != expectedMaxDocs: + raise RuntimeError('maxDocs() mismatch: expected %s but got %s' % (expectedMaxDocs, maxDocs)) + + if numDocs is None: + raise RuntimeError('did not see "numDocs() = XXX"') + + if numDocs != expectedNumDocs: + raise RuntimeError('numDocs() mismatch: expected %s but got %s' % (expectedNumDocs, numDocs)) + + return nhits, warmTime, bestQPS, topN + + def getAlg(self, indexPath, searchTask, numHits, deletes=None, verify=False, printField=''): + + s = BASE_SEARCH_ALG + s = s.replace('$PRINT_FIELD$', 'doctitle') + + if not verify: + s = s.replace('$ROUNDS$', + ''' + { "Rounds" + { "Run" + { "TestSearchSpeed" + { "XSearchReal" $SEARCH$ > : 3.0s + } + NewRound + } : %d + } + ''' % NUM_ROUND) + else: + s = s.replace('$ROUNDS$', '') + + if deletes is None: + s = s.replace('$OPENREADER$', 'OpenReader') + else: + s = s.replace('$OPENREADER$', 'OpenReader(true,dp%g)' % deletes) + s = s.replace('$INDEX$', indexPath) + s = s.replace('$SEARCH$', searchTask) + s = s.replace('$NUM_HITS$', str(numHits)) + + return s + + def compare(self, baseline, new, *params): + + if new[0] != baseline[0]: + raise RuntimeError('baseline found %d hits but new found %d hits' % (baseline[0], new[0])) + + qpsOld = baseline[2] + qpsNew = new[2] + pct = 100.0*(qpsNew-qpsOld)/qpsOld + print ' diff: %.1f%%' % pct + self.results.append((qpsOld, qpsNew, params)) + + self.fOut.write('|%s|%.2f|%.2f|%.1f%%|\n' % \ + ('|'.join(str(x) for x in params), + qpsOld, qpsNew, pct)) + self.fOut.flush() + + def save(self, name): + f = open('%s.pk' % name, 'wb') + cPickle.dump(self.results, f) + f.close() + +def verify(r1, r2): + if r1[0] != r2[0]: + raise RuntimeError('different total hits: %s vs %s' % (r1[0], r2[0])) + + h1 = r1[3] + h2 = r2[3] + if len(h1) != len(h2): + raise RuntimeError('different number of results') + else: + for i in range(len(h1)): + s1 = h1[i].replace('score=NaN', 'score=na').replace('score=0.0', 'score=na') + s2 = h2[i].replace('score=NaN', 'score=na').replace('score=0.0', 'score=na') + if s1 != s2: + raise RuntimeError('hit %s differs: %s vs %s' % (i, s1 ,s2)) + +def usage(): + print + print 'Usage: python -u %s -run | -report ' % sys.argv[0] + print + print ' -run runs all tests, saving results to file .pk' + print ' -report opens .pk and prints Jira table' + print ' -verify confirm old & new produce identical results' + print + sys.exit(1) + +def main(): + + if not os.path.exists(LOG_DIR): + os.makedirs(LOG_DIR) + + if '-run' in sys.argv: + i = 
sys.argv.index('-run') + mode = 'run' + if i < len(sys.argv)-1: + name = sys.argv[1+i] + else: + usage() + elif '-report' in sys.argv: + i = sys.argv.index('-report') + mode = 'report' + if i < len(sys.argv)-1: + name = sys.argv[1+i] + else: + usage() + elif '-verify' in sys.argv: + mode = 'verify' + name = None + else: + usage() + + if mode in ('run', 'verify'): + run(mode, name) + else: + report(name) + +def report(name): + + print '||Query||Deletes %||Tot hits||QPS old||QPS new||Pct change||' + + results = cPickle.load(open('%s.pk' % name)) + for qpsOld, qpsNew, params in results: + pct = 100.0*(qpsNew-qpsOld)/qpsOld + if pct < 0.0: + c = 'red' + else: + c = 'green' + + params = list(params) + + query = params[0] + if query == '*:*': + query = '' + params[0] = query + + pct = '{color:%s}%.1f%%{color}' % (c, pct) + print '|%s|%.2f|%.2f|%s|' % \ + ('|'.join(str(x) for x in params), + qpsOld, qpsNew, pct) + +def run(mode, name): + + for dir in (TRUNK_DIR, FLEX_DIR): + dir = '%s/contrib/benchmark' % dir + print '"ant compile" in %s...' % dir + os.chdir(dir) + if os.system('ant compile') != 0: + raise RuntimeError('ant compile failed') + + r = RunAlgs(name) + + if not os.path.exists(WIKI_FILE): + print + print 'ERROR: wiki source file "%s" does not exist' % WIKI_FILE + print + sys.exit(1) + + print + print 'JAVA:\n%s' % os.popen('java -version 2>&1').read() + + print + if osName != 'windows': + print 'OS:\n%s' % os.popen('uname -a 2>&1').read() + else: + print 'OS:\n%s' % sys.platform + + deletePcts = (0.0, 0.1, 1.0, 10) + + indexes = {} + for rev in ('baseline', 'flex'): + if rev == 'baseline': + dir = TRUNK_DIR + else: + dir = FLEX_DIR + source = 'wiki' + indexes[rev] = r.makeIndex(rev, dir, source, INDEX_NUM_DOCS, deletePcts=deletePcts) + + doVerify = mode == 'verify' + source = 'wiki' + numHits = 10 + + queries = ( + 'body:[tec TO tet]', + 'real*', + '1', + '2', + '+1 +2', + '+1 -2', + '1 2 3 -4', + '"world economy"') + + for query in queries: + + for deletePct in deletePcts: + + print '\nRUN: query=%s deletes=%g%% nhits=%d' % \ + (query, deletePct, numHits) + + maxDocs = INDEX_NUM_DOCS + numDocs = int(INDEX_NUM_DOCS * (1.0-deletePct/100.)) + + prefix = r.getLogPrefix(query=query, deletePct=deletePct) + indexPath = '%s/%s' % (INDEX_DIR_BASE, indexes['baseline']) + + # baseline (trunk) + s = r.getAlg(indexPath, + 'Search', + numHits, + deletes=deletePct, + verify=doVerify, + printField='doctitle') + baseline = r.runOne(TRUNK_DIR, s, 'baseline_%s' % prefix, maxDocs, numDocs, query, verify=doVerify) + + # flex + indexPath = '%s/%s' % (INDEX_DIR_BASE, indexes['flex']) + s = r.getAlg(indexPath, + 'Search', + numHits, + deletes=deletePct, + verify=doVerify, + printField='doctitle') + flex = r.runOne(FLEX_DIR, s, 'flex_%s' % prefix, maxDocs, numDocs, query, verify=doVerify) + + print ' %d hits' % flex[0] + + verify(baseline, flex) + + if mode == 'run' and not DEBUG: + r.compare(baseline, flex, + query, deletePct, baseline[0]) + r.save(name) + +def cleanScores(l): + for i in range(len(l)): + pos = l[i].find(' score=') + l[i] = l[i][:pos].strip() + +if __name__ == '__main__': + main() diff --git a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java new file mode 100644 index 00000000000..f202b0c16ce --- /dev/null +++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java @@ -0,0 +1,38 @@ +package 
org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.benchmark.byTask.utils.Config; + +/** + * A {@link DocMaker} which reads the English Wikipedia dump. Uses + * {@link EnwikiContentSource} as its content source, regardless if a different + * content source was defined in the configuration. + * @deprecated Please use {@link DocMaker} instead, with content.source=EnwikiContentSource + */ +@Deprecated +public class EnwikiDocMaker extends DocMaker { + @Override + public void setConfig(Config config) { + super.setConfig(config); + // Override whatever content source was set in the config + source = new EnwikiContentSource(); + source.setConfig(config); + System.out.println("NOTE: EnwikiDocMaker is deprecated; please use DocMaker instead (which is the default if you don't specify doc.maker) with content.source=EnwikiContentSource"); + } +} diff --git a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java new file mode 100644 index 00000000000..5f54c0f6646 --- /dev/null +++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java @@ -0,0 +1,50 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.benchmark.byTask.utils.Config; + +/** + * A DocMaker reading one line at a time as a Document from a single file. This + * saves IO cost (over DirContentSource) of recursing through a directory and + * opening a new file for every document. It also re-uses its Document and Field + * instance to improve indexing speed.
+ * The expected format of each line is (arguments are separated by <TAB>): + * title, date, body. If a line is read in a different format, a + * {@link RuntimeException} will be thrown. In general, you should use this doc + * maker with files that were created with + * {@link org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask}.
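+ * For example, a line of the following form (purely illustrative values; <TAB> stands
+ * for a real tab character) would be parsed into the title, date and body fields:
+ * Apache Lucene<TAB>6-APR-2010<TAB>Lucene is a full-text search library ...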
+ *
+ * Config properties:
+ * <ul>
+ * <li>doc.random.id.limit=N (default -1) -- create random docid in the range
+ * 0..N; this is useful with UpdateDoc to test updating random documents; if
+ * this is unspecified or -1, then docid is sequentially assigned</li>
+ * </ul>
+ *
+ * @deprecated Please use {@link DocMaker} instead, with content.source=LineDocSource + */ +@Deprecated +public class LineDocMaker extends DocMaker { + @Override + public void setConfig(Config config) { + super.setConfig(config); + source = new LineDocSource(); + source.setConfig(config); + System.out.println("NOTE: LineDocMaker is deprecated; please use DocMaker instead (which is the default if you don't specify doc.maker) with content.source=LineDocSource"); + } +} diff --git a/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 4b5c616c8da..39d382f3330 100755 --- a/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -37,11 +37,12 @@ import org.apache.lucene.benchmark.byTask.stats.TaskStats; import org.apache.lucene.collation.CollationKeyAnalyzer; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogMergePolicy; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.TermFreqVector; @@ -474,16 +475,20 @@ public class TestPerfTasksLogic extends LuceneTestCase { IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory(), true); assertEquals(NUM_DOCS, reader.numDocs()); - TermEnum terms = reader.terms(); - TermDocs termDocs = reader.termDocs(); int totalTokenCount2 = 0; - while(terms.next()) { - Term term = terms.term(); - /* not-tokenized, but indexed field */ - if (term != null && term.field() != DocMaker.ID_FIELD) { - termDocs.seek(terms.term()); - while (termDocs.next()) - totalTokenCount2 += termDocs.freq(); + + FieldsEnum fields = MultiFields.getFields(reader).iterator(); + String fieldName = null; + while((fieldName = fields.next()) != null) { + if (fieldName == DocMaker.ID_FIELD) + continue; + TermsEnum terms = fields.terms(); + DocsEnum docs = null; + while(terms.next() != null) { + docs = terms.docs(MultiFields.getDeletedDocs(reader), docs); + while(docs.nextDoc() != docs.NO_MORE_DOCS) { + totalTokenCount2 += docs.freq(); + } } } reader.close(); diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index bdd6cf980f6..0e8b51add59 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -150,11 +150,16 @@ public class WeightedSpanTermExtractor { mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); query = mtq; } - FakeReader fReader = new FakeReader(); - MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.rewrite(fReader, mtq); - if (fReader.field != null) { - IndexReader ir = getReaderForField(fReader.field); + if (mtq.getField() != null) { + IndexReader ir = 
getReaderForField(mtq.getField()); extract(query.rewrite(ir), terms); + } else { + FakeReader fReader = new FakeReader(); + MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.rewrite(fReader, mtq); + if (fReader.field != null) { + IndexReader ir = getReaderForField(fReader.field); + extract(query.rewrite(ir), terms); + } } } else if (query instanceof MultiPhraseQuery) { final MultiPhraseQuery mpq = (MultiPhraseQuery) query; diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java index 39ebc6972fc..bf4804ed1e3 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java @@ -19,11 +19,15 @@ package org.apache.lucene.index; import java.io.IOException; import java.io.File; import java.util.Date; +import java.util.List; +import java.util.ArrayList; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ReaderUtil; /** * Given a directory and a list of fields, updates the fieldNorms in place for every document. @@ -104,46 +108,46 @@ public class FieldNormModifier { */ public void reSetNorms(String field) throws IOException { String fieldName = StringHelper.intern(field); - int[] termCounts = new int[0]; IndexReader reader = null; - TermEnum termEnum = null; - TermDocs termDocs = null; try { - reader = IndexReader.open(dir, true); - termCounts = new int[reader.maxDoc()]; - try { - termEnum = reader.terms(new Term(field)); - try { - termDocs = reader.termDocs(); - do { - Term term = termEnum.term(); - if (term != null && term.field().equals(fieldName)) { - termDocs.seek(termEnum.term()); - while (termDocs.next()) { - termCounts[termDocs.doc()] += termDocs.freq(); + reader = IndexReader.open(dir, false); + + final List subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, reader); + + for(IndexReader subReader : subReaders) { + final Bits delDocs = subReader.getDeletedDocs(); + + int[] termCounts = new int[subReader.maxDoc()]; + Fields fields = subReader.fields(); + if (fields != null) { + Terms terms = fields.terms(field); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + while(termsEnum.next() != null) { + docs = termsEnum.docs(delDocs, docs); + while(true) { + int docID = docs.nextDoc(); + if (docID != docs.NO_MORE_DOCS) { + termCounts[docID] += docs.freq(); + } else { + break; + } } } - } while (termEnum.next()); - - } finally { - if (null != termDocs) termDocs.close(); + } } - } finally { - if (null != termEnum) termEnum.close(); - } - } finally { - if (null != reader) reader.close(); - } - - try { - reader = IndexReader.open(dir, false); - for (int d = 0; d < termCounts.length; d++) { - if (! 
reader.isDeleted(d)) { - if (sim == null) - reader.setNorm(d, fieldName, Similarity.encodeNorm(1.0f)); - else - reader.setNorm(d, fieldName, sim.encodeNormValue(sim.lengthNorm(fieldName, termCounts[d]))); + + for (int d = 0; d < termCounts.length; d++) { + if (delDocs == null || !delDocs.get(d)) { + if (sim == null) { + subReader.setNorm(d, fieldName, Similarity.encodeNorm(1.0f)); + } else { + subReader.setNorm(d, fieldName, sim.encodeNormValue(sim.lengthNorm(fieldName, termCounts[d]))); + } + } } } @@ -151,5 +155,4 @@ public class FieldNormModifier { if (null != reader) reader.close(); } } - } diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java index 6a54e9bbf33..3092d4287b7 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.Version; /** @@ -172,6 +173,8 @@ public class MultiPassIndexSplitter { * list of deletions. */ public static class FakeDeleteIndexReader extends FilterIndexReader { + // TODO: switch to flex api, here + OpenBitSet dels; OpenBitSet oldDels = null; @@ -202,6 +205,7 @@ public class MultiPassIndexSplitter { if (oldDels != null) { dels.or(oldDels); } + storeDelDocs(null); } @Override @@ -214,6 +218,16 @@ public class MultiPassIndexSplitter { return !dels.isEmpty(); } + @Override + public IndexReader[] getSequentialSubReaders() { + return null; + } + + @Override + public Bits getDeletedDocs() { + return dels; + } + @Override public boolean isDeleted(int n) { return dels.get(n); @@ -235,5 +249,29 @@ public class MultiPassIndexSplitter { } }; } + + @Override + public TermDocs termDocs() throws IOException { + return new FilterTermDocs(in.termDocs()) { + + @Override + public boolean next() throws IOException { + boolean res; + while ((res = super.next())) { + if (!dels.get(doc())) { + break; + } + } + return res; + } + }; + } + + @Override + public TermDocs termDocs(Term term) throws IOException { + TermDocs termDocs = termDocs(); + termDocs.seek(term); + return termDocs; + } } } diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java index 293c37a532f..2c1400c4766 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java @@ -1,10 +1,5 @@ package org.apache.lucene.index; -import org.apache.lucene.util.StringHelper; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; /* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +15,14 @@ import java.util.List; * */ +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + /** * Transparent access to the vector space model, @@ -97,40 +100,53 @@ public class TermVectorAccessor { positions.clear(); } - TermEnum termEnum = indexReader.terms(new Term(field, "")); - if (termEnum.term() != null) { - while (termEnum.term().field() == field) { - TermPositions termPositions = indexReader.termPositions(termEnum.term()); - if (termPositions.skipTo(documentNumber)) { - - frequencies.add(Integer.valueOf(termPositions.freq())); - tokens.add(termEnum.term().text()); - - + final Bits delDocs = MultiFields.getDeletedDocs(indexReader); + + Terms terms = MultiFields.getTerms(indexReader, field); + boolean anyTerms = false; + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + DocsAndPositionsEnum postings = null; + while(true) { + BytesRef text = termsEnum.next(); + if (text != null) { + anyTerms = true; if (!mapper.isIgnoringPositions()) { - int[] positions = new int[termPositions.freq()]; - for (int i = 0; i < positions.length; i++) { - positions[i] = termPositions.nextPosition(); - } - this.positions.add(positions); + docs = postings = termsEnum.docsAndPositions(delDocs, postings); } else { - positions.add(null); + docs = termsEnum.docs(delDocs, docs); } - } - termPositions.close(); - if (!termEnum.next()) { + + int docID = docs.advance(documentNumber); + if (docID == documentNumber) { + + frequencies.add(Integer.valueOf(docs.freq())); + tokens.add(text.utf8ToString()); + + if (!mapper.isIgnoringPositions()) { + int[] positions = new int[docs.freq()]; + for (int i = 0; i < positions.length; i++) { + positions[i] = postings.nextPosition(); + } + this.positions.add(positions); + } else { + positions.add(null); + } + } + } else { break; } } - mapper.setDocumentNumber(documentNumber); - mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions()); - for (int i = 0; i < tokens.size(); i++) { - mapper.map(tokens.get(i), frequencies.get(i).intValue(), (TermVectorOffsetInfo[]) null, positions.get(i)); + + if (anyTerms) { + mapper.setDocumentNumber(documentNumber); + mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions()); + for (int i = 0; i < tokens.size(); i++) { + mapper.map(tokens.get(i), frequencies.get(i).intValue(), (TermVectorOffsetInfo[]) null, positions.get(i)); + } } } - termEnum.close(); - - } diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java index 915f7b0ffd6..2b50b1c6223 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java @@ -18,7 +18,10 @@ package org.apache.lucene.misc; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.Terms; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.PriorityQueue; @@ -50,20 +53,40 @@ public class HighFreqTerms { } TermInfoQueue tiq = new TermInfoQueue(numTerms); - TermEnum terms = reader.terms(); if (field != null) { - while (terms.next()) { 
- if (terms.term().field().equals(field)) { - tiq.insertWithOverflow(new TermInfo(terms.term(), terms.docFreq())); + Terms terms = reader.fields().terms(field); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + while(true) { + BytesRef term = termsEnum.next(); + if (term != null) { + tiq.insertWithOverflow(new TermInfo(new Term(field, term.utf8ToString()), termsEnum.docFreq())); + } else { + break; + } + } + } + } else { + FieldsEnum fields = reader.fields().iterator(); + while(true) { + field = fields.next(); + if (field != null) { + TermsEnum terms = fields.terms(); + while(true) { + BytesRef term = terms.next(); + if (term != null) { + tiq.insertWithOverflow(new TermInfo(new Term(field, term.toString()), terms.docFreq())); + } else { + break; + } + } + } else { + break; } } } - else { - while (terms.next()) { - tiq.insertWithOverflow(new TermInfo(terms.term(), terms.docFreq())); - } - } + while (tiq.size() != 0) { TermInfo termInfo = tiq.pop(); System.out.println(termInfo.term + " " + termInfo.docFreq); diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/LengthNormModifier.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/LengthNormModifier.java new file mode 100644 index 00000000000..d75043d94cd --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/LengthNormModifier.java @@ -0,0 +1,154 @@ +package org.apache.lucene.misc; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.StringHelper; + +import java.io.File; +import java.io.IOException; +import java.util.Date; + +/** + * Given a directory, a Similarity, and a list of fields, updates the + * fieldNorms in place for every document using the Similarity.lengthNorm. + * + *
+ * NOTE: This only works if you do not use field/document boosts in your + * index. + *
+ * + * @version $Id$ + * @deprecated Use {@link org.apache.lucene.index.FieldNormModifier} + */ +@Deprecated +public class LengthNormModifier { + + /** + * Command Line Execution method. + * + *
+   * Usage: LengthNormModifier /path/index package.SimilarityClassName field1 field2 ...
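+   * For example (illustrative invocation; the index path and field names are placeholders):
+   * java org.apache.lucene.misc.LengthNormModifier /tmp/myindex org.apache.lucene.search.DefaultSimilarity body title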
+   * 
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 3) {
+      System.err.println("Usage: LengthNormModifier <index> <package.SimilarityClassName> <field1> [field2] ...");
+      System.exit(1);
+    }
+
+    Similarity s = null;
+    try {
+      s = Class.forName(args[1]).asSubclass(Similarity.class).newInstance();
+    } catch (Exception e) {
+      System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]);
+      e.printStackTrace(System.err);
+    }
+
+    File index = new File(args[0]);
+    Directory d = FSDirectory.open(index);
+
+    LengthNormModifier lnm = new LengthNormModifier(d, s);
+
+    for (int i = 2; i < args.length; i++) {
+      System.out.print("Updating field: " + args[i] + " " + (new Date()).toString() + " ... ");
+      lnm.reSetNorms(args[i]);
+      System.out.println(new Date().toString());
+    }
+
+    d.close();
+  }
+
+
+  private Directory dir;
+  private Similarity sim;
+
+  /**
+   * Constructor for code that wishes to use this class programmatically.
+   *
+   * @param d The Directory to modify
+   * @param s The Similarity to use in reSetNorms
+   */
+  public LengthNormModifier(Directory d, Similarity s) {
+    dir = d;
+    sim = s;
+  }
+
+  /**
+   * Resets the norms for the specified field.
+   *
+   *
+ * Opens a new IndexReader on the Directory given to this instance, + * modifies the norms using the Similarity given to this instance, + * and closes the IndexReader. + *
+ * + * @param field the field whose norms should be reset + */ + public void reSetNorms(String field) throws IOException { + String fieldName = StringHelper.intern(field); + int[] termCounts = new int[0]; + + IndexReader reader = null; + TermEnum termEnum = null; + TermDocs termDocs = null; + try { + reader = IndexReader.open(dir, false); + termCounts = new int[reader.maxDoc()]; + try { + termEnum = reader.terms(new Term(field)); + try { + termDocs = reader.termDocs(); + do { + Term term = termEnum.term(); + if (term != null && term.field().equals(fieldName)) { + termDocs.seek(termEnum.term()); + while (termDocs.next()) { + termCounts[termDocs.doc()] += termDocs.freq(); + } + } + } while (termEnum.next()); + } finally { + if (null != termDocs) termDocs.close(); + } + } finally { + if (null != termEnum) termEnum.close(); + } + } finally { + if (null != reader) reader.close(); + } + + try { + reader = IndexReader.open(dir, false); + for (int d = 0; d < termCounts.length; d++) { + if (! reader.isDeleted(d)) { + byte norm = Similarity.encodeNorm(sim.lengthNorm(fieldName, termCounts[d])); + reader.setNorm(d, fieldName, norm); + } + } + } finally { + if (null != reader) reader.close(); + } + } + +} diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java index 3d49df9d6bf..9bf90604658 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java @@ -76,13 +76,9 @@ public class TestFieldNormModifier extends LuceneTestCase { writer.close(); } - public void testMissingField() { + public void testMissingField() throws Exception { FieldNormModifier fnm = new FieldNormModifier(store, s); - try { - fnm.reSetNorms("nobodyherebutuschickens"); - } catch (Exception e) { - assertNull("caught something", e); - } + fnm.reSetNorms("nobodyherebutuschickens"); } public void testFieldWithNoNorm() throws Exception { @@ -97,11 +93,7 @@ public class TestFieldNormModifier extends LuceneTestCase { r.close(); FieldNormModifier fnm = new FieldNormModifier(store, s); - try { - fnm.reSetNorms("nonorm"); - } catch (Exception e) { - assertNull("caught something", e); - } + fnm.reSetNorms("nonorm"); // nothing should have changed r = IndexReader.open(store, false); diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java index ef806f2b46d..3a249344986 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java @@ -18,10 +18,13 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Bits; public class DuplicateFilter extends Filter { @@ -79,88 +82,87 @@ public class DuplicateFilter extends Filter } } - private OpenBitSet correctBits(IndexReader reader) throws IOException - { - - OpenBitSet bits=new 
OpenBitSet(reader.maxDoc()); //assume all are INvalid - Term startTerm=new Term(fieldName); - TermEnum te = reader.terms(startTerm); - if(te!=null) - { - Term currTerm=te.term(); - while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned - { - int lastDoc=-1; - //set non duplicates - TermDocs td = reader.termDocs(currTerm); - if(td.next()) - { - if(keepMode==KM_USE_FIRST_OCCURRENCE) - { - bits.set(td.doc()); - } - else - { - do - { - lastDoc=td.doc(); - }while(td.next()); - bits.set(lastDoc); - } - } - if(!te.next()) - { - break; - } - currTerm=te.term(); - } - } - return bits; - } + private OpenBitSet correctBits(IndexReader reader) throws IOException { + OpenBitSet bits = new OpenBitSet(reader.maxDoc()); //assume all are INvalid + final Bits delDocs = MultiFields.getDeletedDocs(reader); + Terms terms = reader.fields().terms(fieldName); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + while(true) { + BytesRef currTerm = termsEnum.next(); + if (currTerm == null) { + break; + } else { + docs = termsEnum.docs(delDocs, docs); + int doc = docs.nextDoc(); + if (doc != docs.NO_MORE_DOCS) { + if (keepMode == KM_USE_FIRST_OCCURRENCE) { + bits.set(doc); + } else { + int lastDoc = doc; + while (true) { + lastDoc = doc; + doc = docs.nextDoc(); + if (doc == docs.NO_MORE_DOCS) { + break; + } + } + bits.set(lastDoc); + } + } + } + } + } + return bits; + } private OpenBitSet fastBits(IndexReader reader) throws IOException - { + { OpenBitSet bits=new OpenBitSet(reader.maxDoc()); - bits.set(0,reader.maxDoc()); //assume all are valid - Term startTerm=new Term(fieldName); - TermEnum te = reader.terms(startTerm); - if(te!=null) - { - Term currTerm=te.term(); - - while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned - { - if(te.docFreq()>1) - { - int lastDoc=-1; - //unset potential duplicates - TermDocs td = reader.termDocs(currTerm); - td.next(); - if(keepMode==KM_USE_FIRST_OCCURRENCE) - { - td.next(); - } - do - { - lastDoc=td.doc(); - bits.clear(lastDoc); - }while(td.next()); - if(keepMode==KM_USE_LAST_OCCURRENCE) - { - //restore the last bit - bits.set(lastDoc); - } - } - if(!te.next()) - { - break; - } - currTerm=te.term(); - } - } - return bits; - } + bits.set(0,reader.maxDoc()); //assume all are valid + final Bits delDocs = MultiFields.getDeletedDocs(reader); + Terms terms = reader.fields().terms(fieldName); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + while(true) { + BytesRef currTerm = termsEnum.next(); + if (currTerm == null) { + break; + } else { + if (termsEnum.docFreq() > 1) { + // unset potential duplicates + docs = termsEnum.docs(delDocs, docs); + int doc = docs.nextDoc(); + if (doc != docs.NO_MORE_DOCS) { + if (keepMode == KM_USE_FIRST_OCCURRENCE) { + doc = docs.nextDoc(); + } + } + + int lastDoc = -1; + while (true) { + lastDoc = doc; + bits.clear(lastDoc); + doc = docs.nextDoc(); + if (doc == docs.NO_MORE_DOCS) { + break; + } + } + + if (keepMode==KM_USE_LAST_OCCURRENCE) { + // restore the last bit + bits.set(lastDoc); + } + } + } + } + } + + return bits; + } public String getFieldName() { diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java index 1cb16da02e8..84c76bd7dd0 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java +++ 
b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java @@ -29,7 +29,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; /** @@ -172,8 +172,8 @@ public class FuzzyLikeThisQuery extends Query * Adds user input for "fuzzification" * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed * @param fieldName - * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum) - * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum) + * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermsEnum) + * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum) */ public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) { @@ -195,48 +195,44 @@ public class FuzzyLikeThisQuery extends Query String term = termAtt.term(); if(!processedTerms.contains(term)) { - processedTerms.add(term); - ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term - float minScore=0; - Term startTerm=internSavingTemplateTerm.createTerm(term); - FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength); - TermEnum origEnum = reader.terms(startTerm); - int df=0; - if(startTerm.equals(origEnum.term())) - { - df=origEnum.docFreq(); //store the df so all variants use same idf - } - int numVariants=0; - int totalVariantDocFreqs=0; - do - { - Term possibleMatch=fe.term(); - if(possibleMatch!=null) - { - numVariants++; - totalVariantDocFreqs+=fe.docFreq(); - float score=fe.difference(); - if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ - ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm); - variantsQ.insertWithOverflow(st); - minScore = variantsQ.top().score; // maintain minScore - } + processedTerms.add(term); + ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term + float minScore=0; + Term startTerm=internSavingTemplateTerm.createTerm(term); + FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, startTerm, f.minSimilarity, f.prefixLength); + //store the df so all variants use same idf + int df = reader.docFreq(startTerm); + int numVariants=0; + int totalVariantDocFreqs=0; + BytesRef possibleMatch; + MultiTermQuery.BoostAttribute boostAtt = + fe.attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + while ((possibleMatch = fe.next()) != null) { + if (possibleMatch!=null) { + numVariants++; + totalVariantDocFreqs+=fe.docFreq(); + float score=boostAtt.getBoost(); + if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ + ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), possibleMatch.utf8ToString()),score,startTerm); + variantsQ.insertWithOverflow(st); + minScore = variantsQ.top().score; // maintain minScore + } + } } - } - while(fe.next()); - if(numVariants>0) - { - int avgDf=totalVariantDocFreqs/numVariants; - if(df==0)//no direct match we can use as df for all variants + + if(numVariants>0) + { + int avgDf=totalVariantDocFreqs/numVariants; + if(df==0)//no direct match we can use as df for all variants { df=avgDf; //use avg df of all 
variants } - // take the top variants (scored by edit distance) and reset the score - // to include an IDF factor then add to the global queue for ranking - // overall top query terms - int size = variantsQ.size(); - for(int i = 0; i < size; i++) + // take the top variants (scored by edit distance) and reset the score + // to include an IDF factor then add to the global queue for ranking + // overall top query terms + int size = variantsQ.size(); + for(int i = 0; i < size; i++) { ScoreTerm st = variantsQ.pop(); st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs); diff --git a/lucene/contrib/remote/src/test/org/apache/lucene/search/TestRemoteSort.java b/lucene/contrib/remote/src/test/org/apache/lucene/search/TestRemoteSort.java index 3cfd8b7e4e8..d61e1380d4f 100644 --- a/lucene/contrib/remote/src/test/org/apache/lucene/search/TestRemoteSort.java +++ b/lucene/contrib/remote/src/test/org/apache/lucene/search/TestRemoteSort.java @@ -38,6 +38,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogMergePolicy; import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -219,8 +220,8 @@ public class TestRemoteSort extends LuceneTestCase implements Serializable { @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { docValues = FieldCache.DEFAULT.getInts(reader, "parser", new FieldCache.IntParser() { - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + public final int parseInt(BytesRef termRef) { + return (termRef.utf8ToString().charAt(0)-'A') * 123456; } }); } @@ -245,6 +246,29 @@ public class TestRemoteSort extends LuceneTestCase implements Serializable { runMultiSorts(multi, true); // this runs on the full index } + // test custom search when remote + /* rewrite with new API + public void testRemoteCustomSort() throws Exception { + Searchable searcher = getRemote(); + MultiSearcher multi = new MultiSearcher (new Searchable[] { searcher }); + sort.setSort (new SortField ("custom", SampleComparable.getComparatorSource())); + assertMatches (multi, queryX, sort, "CAIEG"); + sort.setSort (new SortField ("custom", SampleComparable.getComparatorSource(), true)); + assertMatches (multi, queryY, sort, "HJDBF"); + + assertSaneFieldCaches(getName() + " ComparatorSource"); + FieldCache.DEFAULT.purgeAllCaches(); + + SortComparator custom = SampleComparable.getComparator(); + sort.setSort (new SortField ("custom", custom)); + assertMatches (multi, queryX, sort, "CAIEG"); + sort.setSort (new SortField ("custom", custom, true)); + assertMatches (multi, queryY, sort, "HJDBF"); + + assertSaneFieldCaches(getName() + " Comparator"); + FieldCache.DEFAULT.purgeAllCaches(); + }*/ + // test that the relevancy scores are the same even if // hits are sorted public void testNormalizedScores() throws Exception { @@ -294,7 +318,7 @@ public class TestRemoteSort extends LuceneTestCase implements Serializable { assertSameValues (scoresY, getScores (remote.search (queryY, null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresA, getScores (remote.search (queryA, null, 1000, sort).scoreDocs, remote)); - sort.setSort (new SortField("float", SortField.FLOAT), new SortField("string", SortField.STRING)); + sort.setSort (new SortField("float", SortField.FLOAT)); assertSameValues (scoresX, getScores (remote.search (queryX, 
null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresY, getScores (remote.search (queryY, null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresA, getScores (remote.search (queryA, null, 1000, sort).scoreDocs, remote)); @@ -314,6 +338,10 @@ public class TestRemoteSort extends LuceneTestCase implements Serializable { expected = isFull ? "IDHFGJABEC" : "IDHFGJAEBC"; assertMatches(multi, queryA, sort, expected); + sort.setSort(new SortField ("int", SortField.INT)); + expected = isFull ? "IDHFGJABEC" : "IDHFGJAEBC"; + assertMatches(multi, queryA, sort, expected); + sort.setSort(new SortField ("float", SortField.FLOAT), SortField.FIELD_DOC); assertMatches(multi, queryA, sort, "GDHJCIEFAB"); diff --git a/lucene/contrib/spatial/src/java/org/apache/lucene/spatial/tier/CartesianShapeFilter.java b/lucene/contrib/spatial/src/java/org/apache/lucene/spatial/tier/CartesianShapeFilter.java index 9c3840d24d0..20e951c503f 100644 --- a/lucene/contrib/spatial/src/java/org/apache/lucene/spatial/tier/CartesianShapeFilter.java +++ b/lucene/contrib/spatial/src/java/org/apache/lucene/spatial/tier/CartesianShapeFilter.java @@ -19,12 +19,15 @@ package org.apache.lucene.spatial.tier; import java.io.IOException; import java.util.List; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.Filter; import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.OpenBitSet; /** @@ -44,22 +47,41 @@ public class CartesianShapeFilter extends Filter { @Override public DocIdSet getDocIdSet(final IndexReader reader) throws IOException { - final OpenBitSet bits = new OpenBitSet(reader.maxDoc()); - final TermDocs termDocs = reader.termDocs(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); final List area = shape.getArea(); - int sz = area.size(); + final int sz = area.size(); - final Term term = new Term(fieldName); // iterate through each boxid - for (int i =0; i< sz; i++) { - double boxId = area.get(i).doubleValue(); - termDocs.seek(term.createTerm(NumericUtils.doubleToPrefixCoded(boxId))); - // iterate through all documents - // which have this boxId - while (termDocs.next()) { - bits.fastSet(termDocs.doc()); + final BytesRef bytesRef = new BytesRef(NumericUtils.BUF_SIZE_LONG); + if (sz == 1) { + double boxId = area.get(0).doubleValue(); + NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(boxId), 0, bytesRef); + return new DocIdSet() { + @Override + public DocIdSetIterator iterator() throws IOException { + return MultiFields.getTermDocsEnum(reader, delDocs, fieldName, bytesRef); + } + + @Override + public boolean isCacheable() { + return false; + } + }; + } else { + final OpenBitSet bits = new OpenBitSet(reader.maxDoc()); + for (int i =0; i< sz; i++) { + double boxId = area.get(i).doubleValue(); + NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(boxId), 0, bytesRef); + final DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, delDocs, fieldName, bytesRef); + if (docsEnum == null) continue; + // iterate through all documents + // which have this boxId + int doc; + while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + bits.fastSet(doc); + } } + return bits; } - return bits; } } diff 
--git a/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestCartesian.java b/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestCartesian.java index bef0e599d60..7f7e859c473 100644 --- a/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestCartesian.java +++ b/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestCartesian.java @@ -24,6 +24,7 @@ import java.util.Map; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -49,7 +50,6 @@ import org.apache.lucene.spatial.tier.projections.SinusoidalProjector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.NumericUtils; public class TestCartesian extends LuceneTestCase { @@ -96,8 +96,8 @@ public class TestCartesian extends LuceneTestCase { doc.add(new Field("name", name,Field.Store.YES, Field.Index.ANALYZED)); // convert the lat / long to lucene fields - doc.add(new Field(latField, NumericUtils.doubleToPrefixCoded(lat),Field.Store.YES, Field.Index.NOT_ANALYZED)); - doc.add(new Field(lngField, NumericUtils.doubleToPrefixCoded(lng),Field.Store.YES, Field.Index.NOT_ANALYZED)); + doc.add(new NumericField(latField, Integer.MAX_VALUE, Field.Store.YES, true).setDoubleValue(lat)); + doc.add(new NumericField(lngField, Integer.MAX_VALUE, Field.Store.YES, true).setDoubleValue(lng)); // add a default meta field to make searching all documents easy doc.add(new Field("metafile", "doc",Field.Store.YES, Field.Index.ANALYZED)); @@ -105,10 +105,9 @@ public class TestCartesian extends LuceneTestCase { int ctpsize = ctps.size(); for (int i =0; i < ctpsize; i++){ CartesianTierPlotter ctp = ctps.get(i); - doc.add(new Field(ctp.getTierFieldName(), - NumericUtils.doubleToPrefixCoded(ctp.getTierBoxId(lat,lng)), + doc.add(new NumericField(ctp.getTierFieldName(), Integer.MAX_VALUE, Field.Store.YES, - Field.Index.NOT_ANALYZED_NO_NORMS)); + true).setDoubleValue(ctp.getTierBoxId(lat,lng))); doc.add(new Field(geoHashPrefix, GeoHashUtils.encode(lat,lng), Field.Store.YES, @@ -275,8 +274,8 @@ public class TestCartesian extends LuceneTestCase { Document d = searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); @@ -369,8 +368,8 @@ public class TestCartesian extends LuceneTestCase { for(int i =0 ; i < results; i++){ Document d = searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); @@ -464,8 +463,8 @@ public class TestCartesian extends LuceneTestCase { Document d = 
searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); @@ -558,8 +557,8 @@ public class TestCartesian extends LuceneTestCase { Document d = searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); diff --git a/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestDistance.java b/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestDistance.java index 286f9421039..18559cee815 100644 --- a/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestDistance.java +++ b/lucene/contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestDistance.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; @@ -28,7 +29,6 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.QueryWrapperFilter; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.NumericUtils; import org.apache.lucene.store.RAMDirectory; public class TestDistance extends LuceneTestCase { @@ -63,8 +63,8 @@ public class TestDistance extends LuceneTestCase { doc.add(new Field("name", name,Field.Store.YES, Field.Index.ANALYZED)); // convert the lat / long to lucene fields - doc.add(new Field(latField, NumericUtils.doubleToPrefixCoded(lat),Field.Store.YES, Field.Index.NOT_ANALYZED)); - doc.add(new Field(lngField, NumericUtils.doubleToPrefixCoded(lng),Field.Store.YES, Field.Index.NOT_ANALYZED)); + doc.add(new NumericField(latField, Integer.MAX_VALUE, Field.Store.YES, true).setDoubleValue(lat)); + doc.add(new NumericField(lngField, Integer.MAX_VALUE,Field.Store.YES, true).setDoubleValue(lng)); // add a default meta field to make searching all documents easy doc.add(new Field("metafile", "doc",Field.Store.YES, Field.Index.ANALYZED)); diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java index 74ca37ff1fe..3ab41c2813c 100755 --- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java @@ -21,8 +21,10 @@ import org.apache.lucene.index.IndexReader; import java.util.Iterator; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.Terms; 
+import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.StringHelper; import java.io.*; @@ -52,55 +54,39 @@ public class LuceneDictionary implements Dictionary { final class LuceneIterator implements Iterator { - private TermEnum termEnum; - private Term actualTerm; - private boolean hasNextCalled; + private TermsEnum termsEnum; + private BytesRef pendingTerm; LuceneIterator() { try { - termEnum = reader.terms(new Term(field)); + final Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + termsEnum = terms.iterator(); + pendingTerm = termsEnum.next(); + } } catch (IOException e) { throw new RuntimeException(e); } } public String next() { - if (!hasNextCalled) { - hasNext(); + if (pendingTerm == null) { + return null; } - hasNextCalled = false; + + String result = pendingTerm.utf8ToString(); try { - termEnum.next(); + pendingTerm = termsEnum.next(); } catch (IOException e) { throw new RuntimeException(e); } - return (actualTerm != null) ? actualTerm.text() : null; + return result; } public boolean hasNext() { - if (hasNextCalled) { - return actualTerm != null; - } - hasNextCalled = true; - - actualTerm = termEnum.term(); - - // if there are no words return false - if (actualTerm == null) { - return false; - } - - String currentField = actualTerm.field(); - - // if the next word doesn't have the same field return false - if (currentField != field) { - actualTerm = null; - return false; - } - - return true; + return pendingTerm != null; } public void remove() { diff --git a/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java b/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java index c39b2f008f5..68e25c68c81 100644 --- a/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java +++ b/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java @@ -17,16 +17,21 @@ package org.apache.lucene.queryParser.surround.query; */ import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; import java.io.IOException; public class SrndPrefixQuery extends SimpleTerm { + private final BytesRef prefixRef; public SrndPrefixQuery(String prefix, boolean quoted, char truncator) { super(quoted); this.prefix = prefix; + prefixRef = new BytesRef(prefix); this.truncator = truncator; } @@ -53,20 +58,35 @@ public class SrndPrefixQuery extends SimpleTerm { MatchingTermVisitor mtv) throws IOException { /* inspired by PrefixQuery.rewrite(): */ - TermEnum enumerator = reader.terms(getLucenePrefixTerm(fieldName)); - try { - do { - Term term = enumerator.term(); - if ((term != null) - && term.text().startsWith(getPrefix()) - && term.field().equals(fieldName)) { - mtv.visitMatchingTerm(term); + Terms terms = MultiFields.getTerms(reader, fieldName); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + + boolean skip = false; + TermsEnum.SeekStatus status = termsEnum.seek(new BytesRef(getPrefix())); + if (status == TermsEnum.SeekStatus.FOUND) { + mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName)); + } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { + if (termsEnum.term().startsWith(prefixRef)) { + mtv.visitMatchingTerm(new Term(fieldName, 
termsEnum.term().utf8ToString())); } else { - break; + skip = true; } - } while (enumerator.next()); - } finally { - enumerator.close(); + } else { + // EOF + skip = true; + } + + if (!skip) { + while(true) { + BytesRef text = termsEnum.next(); + if (text != null && text.startsWith(prefixRef)) { + mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString())); + } else { + break; + } + } + } } } } diff --git a/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java b/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java index 79de254bd91..732a9e64304 100644 --- a/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java +++ b/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java @@ -20,7 +20,10 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.BytesRef; public class SrndTermQuery extends SimpleTerm { @@ -46,16 +49,14 @@ public class SrndTermQuery extends SimpleTerm { MatchingTermVisitor mtv) throws IOException { /* check term presence in index here for symmetry with other SimpleTerm's */ - TermEnum enumerator = reader.terms(getLuceneTerm(fieldName)); - try { - Term it= enumerator.term(); /* same or following index term */ - if ((it != null) - && it.text().equals(getTermText()) - && it.field().equals(fieldName)) { - mtv.visitMatchingTerm(it); + Terms terms = MultiFields.getTerms(reader, fieldName); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + + TermsEnum.SeekStatus status = termsEnum.seek(new BytesRef(getTermText())); + if (status == TermsEnum.SeekStatus.FOUND) { + mtv.visitMatchingTerm(getLuceneTerm(fieldName)); } - } finally { - enumerator.close(); } } } diff --git a/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java b/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java index 4f281012c46..c2a13cb8da0 100644 --- a/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java +++ b/lucene/contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java @@ -17,8 +17,11 @@ package org.apache.lucene.queryParser.surround.query; */ import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; import java.io.IOException; @@ -40,6 +43,7 @@ public class SrndTruncQuery extends SimpleTerm { private final char mask; private String prefix; + private BytesRef prefixRef; private Pattern pattern; @@ -68,6 +72,7 @@ public class SrndTruncQuery extends SimpleTerm { i++; } prefix = truncated.substring(0, i); + prefixRef = new BytesRef(prefix); StringBuilder re = new StringBuilder(); while (i < truncated.length()) { @@ -84,26 +89,37 @@ public class SrndTruncQuery extends SimpleTerm { MatchingTermVisitor mtv) throws IOException { int prefixLength = prefix.length(); - TermEnum enumerator = reader.terms(new Term(fieldName, prefix)); - Matcher matcher = pattern.matcher(""); - try { - do { - Term 
term = enumerator.term(); - if (term != null) { - String text = term.text(); - if ((! text.startsWith(prefix)) || (! term.field().equals(fieldName))) { - break; - } else { - matcher.reset( text.substring(prefixLength)); - if (matcher.matches()) { - mtv.visitMatchingTerm(term); - } - } + Terms terms = MultiFields.getTerms(reader, fieldName); + if (terms != null) { + Matcher matcher = pattern.matcher(""); + try { + TermsEnum termsEnum = terms.iterator(); + + TermsEnum.SeekStatus status = termsEnum.seek(prefixRef); + BytesRef text; + if (status == TermsEnum.SeekStatus.FOUND) { + text = prefixRef; + } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { + text = termsEnum.term(); + } else { + text = null; } - } while (enumerator.next()); - } finally { - enumerator.close(); - matcher.reset(); + + while(text != null) { + if (text != null && text.startsWith(prefixRef)) { + String textString = text.utf8ToString(); + matcher.reset(textString.substring(prefixLength)); + if (matcher.matches()) { + mtv.visitMatchingTerm(new Term(fieldName, textString)); + } + } else { + break; + } + text = termsEnum.next(); + } + } finally { + matcher.reset(); + } } } } diff --git a/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java b/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java index 1d5830ff638..b7e1c679268 100644 --- a/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java +++ b/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java @@ -17,12 +17,17 @@ package org.apache.lucene.analysis; * limitations under the License. */ +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.search.NumericRangeFilter; // for javadocs +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -91,6 +96,88 @@ public final class NumericTokenStream extends TokenStream { /** The lower precision tokens gets this token type assigned. 
*/ public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric"; + + /** Expert: Use this attribute to get the details of the currently generated token + * @lucene.experimental + * @since 3.1 + */ + public interface NumericTermAttribute extends Attribute { + /** Returns current shift value, undefined before first token */ + int getShift(); + /** Returns {@link NumericTokenStream}'s raw value as {@code long} */ + long getRawValue(); + /** Returns value size in bits (32 for {@code float}, {@code int}; 64 for {@code double}, {@code long}) */ + int getValueSize(); + } + + private static final class NumericAttributeFactory extends AttributeFactory { + private final AttributeFactory delegate; + private NumericTokenStream ts = null; + + NumericAttributeFactory(AttributeFactory delegate) { + this.delegate = delegate; + } + + @Override + public AttributeImpl createAttributeInstance(Class attClass) { + if (attClass == NumericTermAttribute.class) + return new NumericTermAttributeImpl(ts); + if (attClass.isAssignableFrom(CharTermAttribute.class) || attClass.isAssignableFrom(TermAttribute.class)) + throw new IllegalArgumentException("NumericTokenStream does not support CharTermAttribute/TermAttribute."); + return delegate.createAttributeInstance(attClass); + } + } + + private static final class NumericTermAttributeImpl extends AttributeImpl implements NumericTermAttribute,TermToBytesRefAttribute { + private final NumericTokenStream ts; + + public NumericTermAttributeImpl(NumericTokenStream ts) { + this.ts = ts; + } + + public int toBytesRef(BytesRef bytes) { + try { + assert ts.valSize == 64 || ts.valSize == 32; + return (ts.valSize == 64) ? + NumericUtils.longToPrefixCoded(ts.value, ts.shift, bytes) : + NumericUtils.intToPrefixCoded((int) ts.value, ts.shift, bytes); + } catch (IllegalArgumentException iae) { + // return empty token before first + bytes.length = 0; + return 0; + } + } + + public int getShift() { return ts.shift; } + public long getRawValue() { return ts.value; } + public int getValueSize() { return ts.valSize; } + + @Override + public void clear() { + // this attribute has no contents to clear + } + + @Override + public boolean equals(Object other) { + return other == this; + } + + @Override + public int hashCode() { + return System.identityHashCode(this); + } + + @Override + public void copyTo(AttributeImpl target) { + // this attribute has no contents to copy + } + + @Override + public Object clone() { + // cannot throw CloneNotSupportedException (checked) + throw new UnsupportedOperationException(); + } + } /** * Creates a token stream for numeric values using the default precisionStep @@ -107,23 +194,15 @@ public final class NumericTokenStream extends TokenStream { * before using set a value using the various set???Value() methods. */ public NumericTokenStream(final int precisionStep) { - super(); - this.precisionStep = precisionStep; - if (precisionStep < 1) - throw new IllegalArgumentException("precisionStep must be >=1"); - } + super(new NumericAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY)); + // we must do this after the super call :( + ((NumericAttributeFactory) getAttributeFactory()).ts = this; + addAttribute(NumericTermAttribute.class); - /** - * Expert: Creates a token stream for numeric values with the specified - * precisionStep using the given {@link AttributeSource}. - * The stream is not yet initialized, - * before using set a value using the various set???Value() methods. 
- */ - public NumericTokenStream(AttributeSource source, final int precisionStep) { - super(source); this.precisionStep = precisionStep; if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); + shift = -precisionStep; } /** @@ -134,10 +213,15 @@ public final class NumericTokenStream extends TokenStream { * before using set a value using the various set???Value() methods. */ public NumericTokenStream(AttributeFactory factory, final int precisionStep) { - super(factory); + super(new NumericAttributeFactory(factory)); + // we must do this after the super call :( + ((NumericAttributeFactory) getAttributeFactory()).ts = this; + addAttribute(NumericTermAttribute.class); + this.precisionStep = precisionStep; if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); + shift = -precisionStep; } /** @@ -149,7 +233,7 @@ public final class NumericTokenStream extends TokenStream { public NumericTokenStream setLongValue(final long value) { this.value = value; valSize = 64; - shift = 0; + shift = -precisionStep; return this; } @@ -162,7 +246,7 @@ public final class NumericTokenStream extends TokenStream { public NumericTokenStream setIntValue(final int value) { this.value = value; valSize = 32; - shift = 0; + shift = -precisionStep; return this; } @@ -175,7 +259,7 @@ public final class NumericTokenStream extends TokenStream { public NumericTokenStream setDoubleValue(final double value) { this.value = NumericUtils.doubleToSortableLong(value); valSize = 64; - shift = 0; + shift = -precisionStep; return this; } @@ -188,7 +272,7 @@ public final class NumericTokenStream extends TokenStream { public NumericTokenStream setFloatValue(final float value) { this.value = NumericUtils.floatToSortableInt(value); valSize = 32; - shift = 0; + shift = -precisionStep; return this; } @@ -196,37 +280,24 @@ public final class NumericTokenStream extends TokenStream { public void reset() { if (valSize == 0) throw new IllegalStateException("call set???Value() before usage"); - shift = 0; + shift = -precisionStep; } @Override public boolean incrementToken() { if (valSize == 0) throw new IllegalStateException("call set???Value() before usage"); - if (shift >= valSize) + shift += precisionStep; + if (shift >= valSize) { + // reset so the attribute still works after exhausted stream + shift -= precisionStep; return false; + } clearAttributes(); - final char[] buffer; - switch (valSize) { - case 64: - buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG); - termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); - break; - - case 32: - buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_INT); - termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); - break; - - default: - // should not happen - throw new IllegalArgumentException("valSize must be 32 or 64"); - } - + // the TermToBytesRefAttribute is directly accessing shift & value. typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); posIncrAtt.setPositionIncrement((shift == 0) ? 
1 : 0); - shift += precisionStep; return true; } @@ -238,12 +309,11 @@ public final class NumericTokenStream extends TokenStream { } // members - private final TermAttribute termAtt = addAttribute(TermAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - private int shift = 0, valSize = 0; // valSize==0 means not initialized + int shift, valSize = 0; // valSize==0 means not initialized private final int precisionStep; - private long value = 0L; + long value = 0L; } diff --git a/lucene/src/java/org/apache/lucene/analysis/Token.java b/lucene/src/java/org/apache/lucene/analysis/Token.java index 32242ae9e0f..47d03d655e0 100644 --- a/lucene/src/java/org/apache/lucene/analysis/Token.java +++ b/lucene/src/java/org/apache/lucene/analysis/Token.java @@ -64,14 +64,14 @@ import org.apache.lucene.util.AttributeImpl; implementing the {@link TokenStream#incrementToken()} API. Failing that, to create a new Token you should first use one of the constructors that starts with null text. To load - the token from a char[] use {@link #setTermBuffer(char[], int, int)}. - To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}. - Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, + the token from a char[] use {@link #copyBuffer(char[], int, int)}. + To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}. + Alternatively you can get the Token's termBuffer by calling either {@link #buffer()}, if you know that your text is shorter than the capacity of the termBuffer - or {@link #resizeTermBuffer(int)}, if there is any possibility + or {@link #resizeBuffer(int)}, if there is any possibility that you may need to grow the buffer. Fill in the characters of your term into this buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, - or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to set the length of the term text. See LUCENE-969 for details.
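   For illustration only (not part of this patch): a minimal sketch of the buffer-reuse pattern described above, using the copyBuffer/setEmpty/append/resizeBuffer/setLength calls this change introduces. The Token instance and the literal text are assumptions made for the example.

    import org.apache.lucene.analysis.Token;

    class TokenReuseSketch {
      static Token fill(Token reusableToken) {
        // load the term text from a char[]:
        char[] chars = {'f', 'o', 'o'};
        reusableToken.copyBuffer(chars, 0, chars.length);
        // or load it from a String via the Appendable interface:
        reusableToken.setEmpty().append("foobar");
        // or write into the term buffer directly and record the valid length:
        char[] buffer = reusableToken.resizeBuffer(6);
        "foobar".getChars(0, 6, buffer, 0);
        reusableToken.setLength(6);
        return reusableToken;
      }
    }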

@@ -100,7 +100,7 @@ import org.apache.lucene.util.AttributeImpl;
  • Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):
    -    return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
    +    return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]);
       
  • @@ -115,6 +115,7 @@ import org.apache.lucene.util.AttributeImpl; @see org.apache.lucene.index.Payload */ +// TODO: change superclass to CharTermAttribute in 4.0! public class Token extends TermAttributeImpl implements TypeAttribute, PositionIncrementAttribute, FlagsAttribute, OffsetAttribute, PayloadAttribute { @@ -172,7 +173,7 @@ public class Token extends TermAttributeImpl * @param end end offset */ public Token(String text, int start, int end) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; } @@ -187,7 +188,7 @@ public class Token extends TermAttributeImpl * @param typ token type */ public Token(String text, int start, int end, String typ) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; type = typ; @@ -204,7 +205,7 @@ public class Token extends TermAttributeImpl * @param flags token type bits */ public Token(String text, int start, int end, int flags) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; this.flags = flags; @@ -221,7 +222,7 @@ public class Token extends TermAttributeImpl * @param end */ public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { - setTermBuffer(startTermBuffer, termBufferOffset, termBufferLength); + copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); startOffset = start; endOffset = end; } @@ -270,7 +271,7 @@ public class Token extends TermAttributeImpl corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be - equal to {@link #termLength}, as the term text may have been altered by a + equal to {@link #length}, as the term text may have been altered by a stemmer or some other filter. */ public final int startOffset() { return startOffset; @@ -351,7 +352,7 @@ public class Token extends TermAttributeImpl @Override public String toString() { final StringBuilder sb = new StringBuilder(); - sb.append('(').append(term()).append(',') + sb.append('(').append(super.toString()).append(',') .append(startOffset).append(',').append(endOffset); if (!"word".equals(type)) sb.append(",type=").append(type); @@ -387,7 +388,7 @@ public class Token extends TermAttributeImpl /** Makes a clone, but replaces the term buffer & * start/end offset in the process. This is more * efficient than doing a full clone (and then calling - * setTermBuffer) because it saves a wasted copy of the old + * {@link #copyBuffer}) because it saves a wasted copy of the old * termBuffer. 
*/ public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); @@ -442,16 +443,16 @@ public class Token extends TermAttributeImpl } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(char[], int, int)}, + * {@link #copyBuffer(char[], int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset}, * {@link #setType} * @return this Token instance */ public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { clearNoTermBuffer(); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); payload = null; positionIncrement = 1; - setTermBuffer(newTermBuffer, newTermOffset, newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -459,14 +460,14 @@ public class Token extends TermAttributeImpl } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(char[], int, int)}, + * {@link #copyBuffer(char[], int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { clearNoTermBuffer(); - setTermBuffer(newTermBuffer, newTermOffset, newTermLength); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -474,14 +475,14 @@ public class Token extends TermAttributeImpl } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String)}, + * {@link #append(CharSequence)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} * @return this Token instance */ public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) { - clearNoTermBuffer(); - setTermBuffer(newTerm); + clear(); + append(newTerm); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -489,14 +490,14 @@ public class Token extends TermAttributeImpl } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String, int, int)}, + * {@link #append(CharSequence, int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} * @return this Token instance */ public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { - clearNoTermBuffer(); - setTermBuffer(newTerm, newTermOffset, newTermLength); + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -504,14 +505,14 @@ public class Token extends TermAttributeImpl } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String)}, + * {@link #append(CharSequence)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(String newTerm, int newStartOffset, int newEndOffset) { - clearNoTermBuffer(); - setTermBuffer(newTerm); + clear(); + append(newTerm); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -519,14 +520,14 @@ public class Token extends TermAttributeImpl } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String, int, int)}, + * {@link #append(CharSequence, int, int)}, * {@link #setStartOffset}, * {@link 
#setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { - clearNoTermBuffer(); - setTermBuffer(newTerm, newTermOffset, newTermLength); + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -538,7 +539,7 @@ public class Token extends TermAttributeImpl * @param prototype */ public void reinit(Token prototype) { - setTermBuffer(prototype.termBuffer(), 0, prototype.termLength()); + copyBuffer(prototype.buffer(), 0, prototype.length()); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; @@ -553,7 +554,7 @@ public class Token extends TermAttributeImpl * @param newTerm */ public void reinit(Token prototype, String newTerm) { - setTermBuffer(newTerm); + setEmpty().append(newTerm); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; @@ -570,7 +571,7 @@ public class Token extends TermAttributeImpl * @param length */ public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) { - setTermBuffer(newTermBuffer, offset, length); + copyBuffer(newTermBuffer, offset, length); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; diff --git a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java new file mode 100644 index 00000000000..8914b78a19f --- /dev/null +++ b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java @@ -0,0 +1,71 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +/** + * The term text of a Token. + */ +public interface CharTermAttribute extends Attribute, CharSequence, Appendable { + + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void copyBuffer(char[] buffer, int offset, int length); + + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link + * #resizeBuffer(int)} to increase it. After + * altering the buffer be sure to call {@link + * #setLength} to record the number of valid + * characters that were placed into the termBuffer. 
*/ + public char[] buffer(); + + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. + * @param newSize minimum size of the new termBuffer + * @return newly created termBuffer with length >= newSize + */ + public char[] resizeBuffer(int newSize); + + /** Set number of valid characters (length of the term) in + * the termBuffer array. Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeBuffer(int)} first. + * @param length the truncated length + */ + public CharTermAttribute setLength(int length); + + /** Sets the length of the termBuffer to zero. + * Use this method before appending contents + * using the {@link Appendable} interface. + */ + public CharTermAttribute setEmpty(); + + // the following methods are redefined to get rid of IOException declaration: + public CharTermAttribute append(CharSequence csq); + public CharTermAttribute append(CharSequence csq, int start, int end); + public CharTermAttribute append(char c); + +} diff --git a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java new file mode 100644 index 00000000000..7e0d7b9e65d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java @@ -0,0 +1,255 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; +import java.nio.CharBuffer; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; + +/** + * The term text of a Token. + */ +public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttribute, TermAttribute, TermToBytesRefAttribute, Cloneable, Serializable { + private static int MIN_BUFFER_SIZE = 10; + + private char[] termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)]; + private int termLength = 0; + + @Deprecated + public String term() { + // don't delegate to toString() here! 
+ return new String(termBuffer, 0, termLength); + } + + public void copyBuffer(char[] buffer, int offset, int length) { + growTermBuffer(length); + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + @Deprecated + public void setTermBuffer(char[] buffer, int offset, int length) { + copyBuffer(buffer, offset, length); + } + + @Deprecated + public void setTermBuffer(String buffer) { + int length = buffer.length(); + growTermBuffer(length); + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + @Deprecated + public void setTermBuffer(String buffer, int offset, int length) { + assert offset <= buffer.length(); + assert offset + length <= buffer.length(); + growTermBuffer(length); + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + + public char[] buffer() { + return termBuffer; + } + + @Deprecated + public char[] termBuffer() { + return termBuffer; + } + + public char[] resizeBuffer(int newSize) { + if (termBuffer == null) { + // The buffer is always at least MIN_BUFFER_SIZE + termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + } else { + if(termBuffer.length < newSize){ + // Not big enough; create a new array with slight + // over allocation and preserve content + final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; + } + } + return termBuffer; + } + + @Deprecated + public char[] resizeTermBuffer(int newSize) { + return resizeBuffer(newSize); + } + + private void growTermBuffer(int newSize) { + if (termBuffer == null) { + // The buffer is always at least MIN_BUFFER_SIZE + termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + } else { + if(termBuffer.length < newSize){ + // Not big enough; create a new array with slight + // over allocation: + termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + } + } + } + + @Deprecated + public int termLength() { + return termLength; + } + + public CharTermAttribute setLength(int length) { + if (length > termBuffer.length) + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); + termLength = length; + return this; + } + + public CharTermAttribute setEmpty() { + termLength = 0; + return this; + } + + @Deprecated + public void setTermLength(int length) { + setLength(length); + } + + // *** TermToBytesRefAttribute interface *** + public int toBytesRef(BytesRef target) { + // TODO: Maybe require that bytes is already initialized? TermsHashPerField ensures this. 
+ if (target.bytes == null) { + target.bytes = new byte[termLength * 4]; + } + return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target); + } + + // *** CharSequence interface *** + public int length() { + return termLength; + } + + public char charAt(int index) { + if (index >= termLength) + throw new IndexOutOfBoundsException(); + return termBuffer[index]; + } + + public CharSequence subSequence(final int start, final int end) { + if (start > termLength || end > termLength) + throw new IndexOutOfBoundsException(); + return new String(termBuffer, start, end - start); + } + + // *** Appendable interface *** + public CharTermAttribute append(CharSequence csq) { + return append(csq, 0, csq.length()); + } + + public CharTermAttribute append(CharSequence csq, int start, int end) { + resizeBuffer(termLength + end - start); + if (csq instanceof String) { + ((String) csq).getChars(start, end, termBuffer, termLength); + } else if (csq instanceof StringBuilder) { + ((StringBuilder) csq).getChars(start, end, termBuffer, termLength); + } else if (csq instanceof StringBuffer) { + ((StringBuffer) csq).getChars(start, end, termBuffer, termLength); + } else if (csq instanceof CharBuffer && ((CharBuffer) csq).hasArray()) { + final CharBuffer cb = (CharBuffer) csq; + System.arraycopy(cb.array(), cb.arrayOffset() + cb.position() + start, termBuffer, termLength, end - start); + } else { + while (start < end) + termBuffer[termLength++] = csq.charAt(start++); + // no fall-through here, as termLength is updated! + return this; + } + termLength += end - start; + return this; + } + + public CharTermAttribute append(char c) { + resizeBuffer(termLength + 1)[termLength++] = c; + return this; + } + + // *** AttributeImpl *** + + @Override + public int hashCode() { + int code = termLength; + code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); + return code; + } + + @Override + public void clear() { + termLength = 0; + } + + @Override + public Object clone() { + CharTermAttributeImpl t = (CharTermAttributeImpl)super.clone(); + // Do a deep clone + if (termBuffer != null) { + t.termBuffer = termBuffer.clone(); + } + return t; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof CharTermAttributeImpl) { + final CharTermAttributeImpl o = ((CharTermAttributeImpl) other); + if (termLength != o.termLength) + return false; + for(int i=0;inew String(token.termBuffer(), 0, token.termLength()) - */ - public String term() { - initTermBuffer(); - return new String(termBuffer, 0, termLength); - } - - /** Copies the contents of buffer, starting at offset for - * length characters, into the termBuffer array. - * @param buffer the buffer to copy - * @param offset the index in the buffer of the first character to copy - * @param length the number of characters to copy - */ - public void setTermBuffer(char[] buffer, int offset, int length) { - growTermBuffer(length); - System.arraycopy(buffer, offset, termBuffer, 0, length); - termLength = length; - } - - /** Copies the contents of buffer into the termBuffer array. - * @param buffer the buffer to copy - */ - public void setTermBuffer(String buffer) { - int length = buffer.length(); - growTermBuffer(length); - buffer.getChars(0, length, termBuffer, 0); - termLength = length; - } - - /** Copies the contents of buffer, starting at offset and continuing - * for length characters, into the termBuffer array. 
- * @param buffer the buffer to copy - * @param offset the index in the buffer of the first character to copy - * @param length the number of characters to copy - */ - public void setTermBuffer(String buffer, int offset, int length) { - assert offset <= buffer.length(); - assert offset + length <= buffer.length(); - growTermBuffer(length); - buffer.getChars(offset, offset + length, termBuffer, 0); - termLength = length; - } - - /** Returns the internal termBuffer character array which - * you can then directly alter. If the array is too - * small for your token, use {@link - * #resizeTermBuffer(int)} to increase it. After - * altering the buffer be sure to call {@link - * #setTermLength} to record the number of valid - * characters that were placed into the termBuffer. */ - public char[] termBuffer() { - initTermBuffer(); - return termBuffer; - } - - /** Grows the termBuffer to at least size newSize, preserving the - * existing content. Note: If the next operation is to change - * the contents of the term buffer use - * {@link #setTermBuffer(char[], int, int)}, - * {@link #setTermBuffer(String)}, or - * {@link #setTermBuffer(String, int, int)} - * to optimally combine the resize with the setting of the termBuffer. - * @param newSize minimum size of the new termBuffer - * @return newly created termBuffer with length >= newSize - */ - public char[] resizeTermBuffer(int newSize) { - if (termBuffer == null) { - // The buffer is always at least MIN_BUFFER_SIZE - termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } else { - if(termBuffer.length < newSize){ - // Not big enough; create a new array with slight - // over allocation and preserve content - final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); - termBuffer = newCharBuffer; - } - } - return termBuffer; - } - - - /** Allocates a buffer char[] of at least newSize, without preserving the existing content. - * its always used in places that set the content - * @param newSize minimum size of the buffer - */ - private void growTermBuffer(int newSize) { - if (termBuffer == null) { - // The buffer is always at least MIN_BUFFER_SIZE - termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } else { - if(termBuffer.length < newSize){ - // Not big enough; create a new array with slight - // over allocation: - termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } - } - } - - private void initTermBuffer() { - if (termBuffer == null) { - termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)]; - termLength = 0; - } - } - - /** Return number of valid characters (length of the term) - * in the termBuffer array. */ - public int termLength() { - return termLength; - } - - /** Set number of valid characters (length of the term) in - * the termBuffer array. Use this to truncate the termBuffer - * or to synchronize with external manipulation of the termBuffer. - * Note: to grow the size of the array, - * use {@link #resizeTermBuffer(int)} first. 
- * @param length the truncated length - */ - public void setTermLength(int length) { - initTermBuffer(); - if (length > termBuffer.length) - throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); - termLength = length; - } - - @Override - public int hashCode() { - initTermBuffer(); - int code = termLength; - code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); - return code; - } - - @Override - public void clear() { - termLength = 0; - } - - @Override - public Object clone() { - TermAttributeImpl t = (TermAttributeImpl)super.clone(); - // Do a deep clone - if (termBuffer != null) { - t.termBuffer = termBuffer.clone(); - } - return t; - } - - @Override - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof TermAttributeImpl) { - initTermBuffer(); - TermAttributeImpl o = ((TermAttributeImpl) other); - o.initTermBuffer(); - - if (termLength != o.termLength) - return false; - for(int i=0;i + * int hash = 0; + * for (int i = termBytes.offset; i < termBytes.offset+termBytes.length; i++) { + * hash = 31*hash + termBytes.bytes[i]; + * } + * + * Implement this for performance reasons, if your code can calculate + * the hash on-the-fly. If this is not the case, just return + * {@code termBytes.hashCode()}. + */ + public int toBytesRef(BytesRef termBytes); +} diff --git a/lucene/src/java/org/apache/lucene/document/CompressionTools.java b/lucene/src/java/org/apache/lucene/document/CompressionTools.java index ecd768be982..5ad1264d338 100644 --- a/lucene/src/java/org/apache/lucene/document/CompressionTools.java +++ b/lucene/src/java/org/apache/lucene/document/CompressionTools.java @@ -21,6 +21,8 @@ import java.util.zip.Deflater; import java.util.zip.Inflater; import java.util.zip.DataFormatException; import java.io.ByteArrayOutputStream; + +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; /** Simple utility class providing static methods to @@ -84,9 +86,9 @@ public class CompressionTools { * compressionLevel (constants are defined in * java.util.zip.Deflater). */ public static byte[] compressString(String value, int compressionLevel) { - UnicodeUtil.UTF8Result result = new UnicodeUtil.UTF8Result(); + BytesRef result = new BytesRef(10); UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result); - return compress(result.result, 0, result.length, compressionLevel); + return compress(result.bytes, 0, result.length, compressionLevel); } /** Decompress the byte array previously returned by diff --git a/lucene/src/java/org/apache/lucene/index/AbstractAllTermDocs.java b/lucene/src/java/org/apache/lucene/index/AbstractAllTermDocs.java index 3b0a7132650..0a521c9cbfe 100644 --- a/lucene/src/java/org/apache/lucene/index/AbstractAllTermDocs.java +++ b/lucene/src/java/org/apache/lucene/index/AbstractAllTermDocs.java @@ -26,6 +26,7 @@ import java.io.IOException; * packages. This means the API is freely subject to * change, and, the class could be removed entirely, in any * Lucene release. Use directly at your own risk! 
*/ +@Deprecated public abstract class AbstractAllTermDocs implements TermDocs { protected int maxDoc; diff --git a/lucene/src/java/org/apache/lucene/index/AllDocsEnum.java b/lucene/src/java/org/apache/lucene/index/AllDocsEnum.java new file mode 100644 index 00000000000..733dfbd4cae --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/AllDocsEnum.java @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +import org.apache.lucene.util.Bits; +import java.io.IOException; + +class AllDocsEnum extends DocsEnum { + protected final Bits skipDocs; + protected final int maxDoc; + protected final IndexReader reader; + protected int doc = -1; + + protected AllDocsEnum(IndexReader reader, Bits skipDocs) { + this.skipDocs = skipDocs; + this.maxDoc = reader.maxDoc(); + this.reader = reader; + } + + @Override + public int freq() { + return 1; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc+1); + } + + @Override + public int read() throws IOException { + final int[] docs = bulkResult.docs.ints; + final int[] freqs = bulkResult.freqs.ints; + int i = 0; + while (i < docs.length && doc < maxDoc) { + if (skipDocs == null || !skipDocs.get(doc)) { + docs[i] = doc; + freqs[i] = 1; + ++i; + } + doc++; + } + return i; + } + + @Override + public int advance(int target) throws IOException { + doc = target; + while (doc < maxDoc) { + if (skipDocs == null || !skipDocs.get(doc)) { + return doc; + } + doc++; + } + doc = NO_MORE_DOCS; + return doc; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/AllTermDocs.java b/lucene/src/java/org/apache/lucene/index/AllTermDocs.java index 762f0218f38..95e4f7ea9bf 100644 --- a/lucene/src/java/org/apache/lucene/index/AllTermDocs.java +++ b/lucene/src/java/org/apache/lucene/index/AllTermDocs.java @@ -19,6 +19,8 @@ package org.apache.lucene.index; import org.apache.lucene.util.BitVector; +/** @deprecated Switch to AllDocsEnum */ +@Deprecated class AllTermDocs extends AbstractAllTermDocs { protected BitVector deletedDocs; diff --git a/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java b/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java index 8d144588900..b583fecb06b 100644 --- a/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java +++ b/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java @@ -34,11 +34,11 @@ package org.apache.lucene.index; * hit a non-zero byte. 
*/ import java.util.Arrays; +import org.apache.lucene.util.BytesRef; import java.util.List; import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; import org.apache.lucene.util.ArrayUtil; - final class ByteBlockPool { abstract static class Allocator { @@ -149,5 +149,23 @@ final class ByteBlockPool { return newUpto+3; } + + // Fill in a BytesRef from term's length & bytes encoded in + // byte block + final BytesRef setBytesRef(BytesRef term, int textStart) { + final byte[] bytes = term.bytes = buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; + int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK; + if ((bytes[pos] & 0x80) == 0) { + // length is 1 byte + term.length = bytes[pos]; + term.offset = pos+1; + } else { + // length is 2 bytes + term.length = (bytes[pos]&0x7f) + ((bytes[pos+1]&0xff)<<7); + term.offset = pos+2; + } + assert term.length >= 0; + return term; + } } diff --git a/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java b/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java index 5b8e3882a7d..a298aa0cb33 100644 --- a/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java +++ b/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java @@ -17,16 +17,17 @@ package org.apache.lucene.index; * limitations under the License. */ -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + /* IndexInput that knows how to read the byte slices written * by Posting and PostingVector. We read the bytes in * each slice until we hit the end of that slice at which * point we read the forwarding address of the next slice * and then jump to it.*/ -final class ByteSliceReader extends IndexInput { +final class ByteSliceReader extends DataInput { ByteBlockPool pool; int bufferUpto; byte[] buffer; @@ -75,7 +76,7 @@ final class ByteSliceReader extends IndexInput { return buffer[upto++]; } - public long writeTo(IndexOutput out) throws IOException { + public long writeTo(DataOutput out) throws IOException { long size = 0; while(true) { if (limit + bufferOffset == endIndex) { @@ -136,14 +137,4 @@ final class ByteSliceReader extends IndexInput { } } } - - @Override - public long getFilePointer() {throw new RuntimeException("not implemented");} - @Override - public long length() {throw new RuntimeException("not implemented");} - @Override - public void seek(long pos) {throw new RuntimeException("not implemented");} - @Override - public void close() {throw new RuntimeException("not implemented");} -} - +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java b/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java index 8103cb8ffbd..a8e4d7ffa2d 100644 --- a/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java +++ b/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.store.DataOutput; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -24,7 +26,7 @@ package org.apache.lucene.index; * posting list for many terms in RAM. */ -final class ByteSliceWriter { +final class ByteSliceWriter extends DataOutput { private byte[] slice; private int upto; @@ -38,7 +40,7 @@ final class ByteSliceWriter { /** * Set up the writer to write at address. 
- */ + */ public void init(int address) { slice = pool.buffers[address >> DocumentsWriter.BYTE_BLOCK_SHIFT]; assert slice != null; @@ -48,6 +50,7 @@ final class ByteSliceWriter { } /** Write byte into byte slice stream */ + @Override public void writeByte(byte b) { assert slice != null; if (slice[upto] != 0) { @@ -60,6 +63,7 @@ final class ByteSliceWriter { assert upto != slice.length; } + @Override public void writeBytes(final byte[] b, int offset, final int len) { final int offsetEnd = offset + len; while(offset < offsetEnd) { @@ -78,12 +82,4 @@ final class ByteSliceWriter { public int getAddress() { return upto + (offset0 & DocumentsWriter.BYTE_BLOCK_NOT_MASK); } - - public void writeVInt(int i) { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte) i); - } -} +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/CharBlockPool.java b/lucene/src/java/org/apache/lucene/index/CharBlockPool.java deleted file mode 100644 index 39d24459648..00000000000 --- a/lucene/src/java/org/apache/lucene/index/CharBlockPool.java +++ /dev/null @@ -1,60 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; -import org.apache.lucene.util.ArrayUtil; - -final class CharBlockPool { - - public char[][] buffers = new char[10][]; - int numBuffer; - - int bufferUpto = -1; // Which buffer we are upto - public int charUpto = DocumentsWriter.CHAR_BLOCK_SIZE; // Where we are in head buffer - - public char[] buffer; // Current head buffer - public int charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; // Current head offset - final private DocumentsWriter docWriter; - - public CharBlockPool(DocumentsWriter docWriter) { - this.docWriter = docWriter; - } - - public void reset() { - docWriter.recycleCharBlocks(buffers, 1+bufferUpto); - bufferUpto = -1; - charUpto = DocumentsWriter.CHAR_BLOCK_SIZE; - charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; - } - - public void nextBuffer() { - if (1+bufferUpto == buffers.length) { - char[][] newBuffers = new char[ArrayUtil.oversize(buffers.length+1, - NUM_BYTES_OBJECT_REF)][]; - System.arraycopy(buffers, 0, newBuffers, 0, buffers.length); - buffers = newBuffers; - } - buffer = buffers[1+bufferUpto] = docWriter.getCharBlock(); - bufferUpto++; - - charUpto = 0; - charOffset += DocumentsWriter.CHAR_BLOCK_SIZE; - } -} - diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index 13f95d6ea4d..5ece0f60f0e 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -22,6 +22,9 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.document.AbstractField; // for javadocs import org.apache.lucene.document.Document; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import java.text.NumberFormat; import java.io.PrintStream; @@ -122,6 +125,9 @@ public class CheckIndex { /** Name of the segment. */ public String name; + /** Name of codec used to read this segment. */ + public String codec; + /** Document count (does not take deletions into account). */ public int docCount; @@ -263,26 +269,6 @@ public class CheckIndex { infoStream.println(msg); } - private static class MySegmentTermDocs extends SegmentTermDocs { - - int delCount; - - MySegmentTermDocs(SegmentReader p) { - super(p); - } - - @Override - public void seek(Term term) throws IOException { - super.seek(term); - delCount = 0; - } - - @Override - protected void skippingDoc() throws IOException { - delCount++; - } - } - /** Returns a {@link Status} instance detailing * the state of the index. * @@ -296,6 +282,10 @@ public class CheckIndex { return checkIndex(null); } + protected Status checkIndex(List onlySegments) throws IOException { + return checkIndex(onlySegments, CodecProvider.getDefault()); + } + /** Returns a {@link Status} instance detailing * the state of the index. * @@ -308,13 +298,13 @@ public class CheckIndex { *

    WARNING: make sure * you only call this when the index is not opened by any * writer. */ - public Status checkIndex(List onlySegments) throws IOException { + protected Status checkIndex(List onlySegments, CodecProvider codecs) throws IOException { NumberFormat nf = NumberFormat.getInstance(); SegmentInfos sis = new SegmentInfos(); Status result = new Status(); result.dir = dir; try { - sis.read(dir); + sis.read(dir, codecs); } catch (Throwable t) { msg("ERROR: could not read any segments file in directory"); result.missingSegments = true; @@ -371,6 +361,8 @@ public class CheckIndex { sFormat = "FORMAT_USER_DATA [Lucene 2.9]"; else if (format == SegmentInfos.FORMAT_DIAGNOSTICS) sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]"; + else if (format == SegmentInfos.FORMAT_FLEX_POSTINGS) + sFormat = "FORMAT_FLEX_POSTINGS [Lucene 3.1]"; else if (format < SegmentInfos.CURRENT_FORMAT) { sFormat = "int=" + format + " [newer version of Lucene than this tool]"; skip = true; @@ -429,6 +421,9 @@ public class CheckIndex { SegmentReader reader = null; try { + final String codec = info.getCodec().name; + msg(" codec=" + codec); + segInfoStat.codec = codec; msg(" compound=" + info.getUseCompoundFile()); segInfoStat.compound = info.getUseCompoundFile(); msg(" hasProx=" + info.getHasProx()); @@ -452,6 +447,7 @@ public class CheckIndex { msg(" docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile()); segInfoStat.docStoreCompoundFile = info.getDocStoreIsCompoundFile(); } + final String delFileName = info.getDelFileName(); if (delFileName == null){ msg(" no deletions"); @@ -503,7 +499,7 @@ public class CheckIndex { segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader); // Test the Term Index - segInfoStat.termIndexStatus = testTermIndex(info, reader); + segInfoStat.termIndexStatus = testTermIndex(reader); // Test Stored Fields segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf); @@ -586,69 +582,129 @@ public class CheckIndex { /** * Test the term index. 
*/ - private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) { + private Status.TermIndexStatus testTermIndex(SegmentReader reader) { final Status.TermIndexStatus status = new Status.TermIndexStatus(); + final int maxDoc = reader.maxDoc(); + final Bits delDocs = reader.getDeletedDocs(); + try { + if (infoStream != null) { infoStream.print(" test: terms, freq, prox..."); } - final TermEnum termEnum = reader.terms(); - final TermPositions termPositions = reader.termPositions(); + final Fields fields = reader.fields(); + if (fields == null) { + msg("OK [no fields/terms]"); + return status; + } + + final FieldsEnum fieldsEnum = fields.iterator(); + while(true) { + final String field = fieldsEnum.next(); + if (field == null) { + break; + } + + final TermsEnum terms = fieldsEnum.terms(); - // Used only to count up # deleted docs for this term - final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader); + DocsEnum docs = null; + DocsAndPositionsEnum postings = null; - final int maxDoc = reader.maxDoc(); + boolean hasOrd = true; + final long termCountStart = status.termCount; - while (termEnum.next()) { - status.termCount++; - final Term term = termEnum.term(); - final int docFreq = termEnum.docFreq(); - termPositions.seek(term); - int lastDoc = -1; - int freq0 = 0; - status.totFreq += docFreq; - while (termPositions.next()) { - freq0++; - final int doc = termPositions.doc(); - final int freq = termPositions.freq(); - if (doc <= lastDoc) - throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); - if (doc >= maxDoc) - throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc); + while(true) { - lastDoc = doc; - if (freq <= 0) - throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); - - int lastPos = -1; - status.totPos += freq; - for(int j=0;j= maxDoc) { + throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc); + } + + lastDoc = doc; + if (freq <= 0) { + throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); + } + + int lastPos = -1; + if (postings != null) { + for(int j=0;j subReaderToSlice = new HashMap(); private Map normsCache = new HashMap(); private int maxDoc = 0; private int numDocs = -1; private boolean hasDeletions = false; +// static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, +// final int termInfosIndexDivisor) throws CorruptIndexException, IOException { +// return open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor, null); +// } + static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, - final int termInfosIndexDivisor) throws CorruptIndexException, IOException { + final int termInfosIndexDivisor, CodecProvider codecs) throws CorruptIndexException, IOException { + final CodecProvider codecs2; + if (codecs == null) { + codecs2 = CodecProvider.getDefault(); + } else { + codecs2 = codecs; + } return (IndexReader) new SegmentInfos.FindSegmentsFile(directory) { @Override protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); - infos.read(directory, segmentFileName); + infos.read(directory, segmentFileName, codecs2); if (readOnly) - return new ReadOnlyDirectoryReader(directory, infos, deletionPolicy, 
termInfosIndexDivisor); + return new ReadOnlyDirectoryReader(directory, infos, deletionPolicy, termInfosIndexDivisor, codecs2); else - return new DirectoryReader(directory, infos, deletionPolicy, false, termInfosIndexDivisor); + return new DirectoryReader(directory, infos, deletionPolicy, false, termInfosIndexDivisor, codecs2); } }.run(commit); } /** Construct reading the named set of readers. */ - DirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws IOException { +// DirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws IOException { +// this(directory, sis, deletionPolicy, readOnly, termInfosIndexDivisor, null); +// } + + /** Construct reading the named set of readers. */ + DirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { this.directory = directory; this.readOnly = readOnly; this.segmentInfos = sis; this.deletionPolicy = deletionPolicy; this.termInfosIndexDivisor = termInfosIndexDivisor; + if (codecs == null) { + this.codecs = CodecProvider.getDefault(); + } else { + this.codecs = codecs; + } + // To reduce the chance of hitting FileNotFound // (and having to retry), we open segments in // reverse because IndexWriter merges & deletes @@ -115,12 +144,16 @@ class DirectoryReader extends IndexReader implements Cloneable { } // Used by near real-time search - DirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor) throws IOException { + DirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { this.directory = writer.getDirectory(); this.readOnly = true; segmentInfos = infos; - segmentInfosStart = (SegmentInfos) infos.clone(); this.termInfosIndexDivisor = termInfosIndexDivisor; + if (codecs == null) { + this.codecs = CodecProvider.getDefault(); + } else { + this.codecs = codecs; + } // IndexWriter synchronizes externally before calling // us, which ensures infos will not change; so there's @@ -166,11 +199,17 @@ class DirectoryReader extends IndexReader implements Cloneable { /** This constructor is only used for {@link #reopen()} */ DirectoryReader(Directory directory, SegmentInfos infos, SegmentReader[] oldReaders, int[] oldStarts, - Map oldNormsCache, boolean readOnly, boolean doClone, int termInfosIndexDivisor) throws IOException { + Map oldNormsCache, boolean readOnly, boolean doClone, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { this.directory = directory; this.readOnly = readOnly; this.segmentInfos = infos; this.termInfosIndexDivisor = termInfosIndexDivisor; + if (codecs == null) { + this.codecs = CodecProvider.getDefault(); + } else { + this.codecs = codecs; + } + // we put the old SegmentReaders in a map, that allows us // to lookup a reader using its segment name @@ -296,24 +335,44 @@ class DirectoryReader extends IndexReader implements Cloneable { buffer.append(' '); } buffer.append(subReaders[i]); + buffer.append(' '); } buffer.append(')'); return buffer.toString(); } - private void initialize(SegmentReader[] subReaders) { + private void initialize(SegmentReader[] subReaders) throws IOException { this.subReaders = subReaders; starts = new int[subReaders.length + 1]; // build starts array + + final List subFields = new ArrayList(); + final List fieldSlices = 
new ArrayList(); + for (int i = 0; i < subReaders.length; i++) { starts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); // compute maxDocs - if (subReaders[i].hasDeletions()) + if (subReaders[i].hasDeletions()) { hasDeletions = true; + } + + final ReaderUtil.Slice slice = new ReaderUtil.Slice(starts[i], subReaders[i].maxDoc(), i); + subReaderToSlice.put(subReaders[i], slice); + + final Fields f = subReaders[i].fields(); + if (f != null) { + subFields.add(f); + fieldSlices.add(slice); + } } starts[subReaders.length] = maxDoc; } + @Override + public Bits getDeletedDocs() { + throw new UnsupportedOperationException("please use MultiFields.getDeletedDocs if you really need a top level Bits deletedDocs (NOTE that it's usually better to work per segment instead)"); + } + @Override public final synchronized Object clone() { try { @@ -435,7 +494,7 @@ class DirectoryReader extends IndexReader implements Cloneable { @Override protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); - infos.read(directory, segmentFileName); + infos.read(directory, segmentFileName, codecs); return doReopen(infos, false, openReadOnly); } }.run(commit); @@ -444,9 +503,9 @@ class DirectoryReader extends IndexReader implements Cloneable { private synchronized DirectoryReader doReopen(SegmentInfos infos, boolean doClone, boolean openReadOnly) throws CorruptIndexException, IOException { DirectoryReader reader; if (openReadOnly) { - reader = new ReadOnlyDirectoryReader(directory, infos, subReaders, starts, normsCache, doClone, termInfosIndexDivisor); + reader = new ReadOnlyDirectoryReader(directory, infos, subReaders, starts, normsCache, doClone, termInfosIndexDivisor, null); } else { - reader = new DirectoryReader(directory, infos, subReaders, starts, normsCache, false, doClone, termInfosIndexDivisor); + reader = new DirectoryReader(directory, infos, subReaders, starts, normsCache, false, doClone, termInfosIndexDivisor, null); } return reader; } @@ -640,7 +699,7 @@ class DirectoryReader extends IndexReader implements Cloneable { // Optimize single segment case: return subReaders[0].terms(); } else { - return new MultiTermEnum(this, subReaders, starts, null); + return new MultiTermEnum(this, subReaders, starts, null); } } @@ -664,6 +723,16 @@ class DirectoryReader extends IndexReader implements Cloneable { return total; } + @Override + public int docFreq(String field, BytesRef term) throws IOException { + ensureOpen(); + int total = 0; // sum freqs in segments + for (int i = 0; i < subReaders.length; i++) { + total += subReaders[i].docFreq(field, term); + } + return total; + } + @Override public TermDocs termDocs() throws IOException { ensureOpen(); @@ -686,6 +755,11 @@ class DirectoryReader extends IndexReader implements Cloneable { } } + @Override + public Fields fields() throws IOException { + throw new UnsupportedOperationException("please use MultiFields.getFields if you really need a top level Fields (NOTE that it's usually better to work per segment instead)"); + } + @Override public TermPositions termPositions() throws IOException { ensureOpen(); @@ -731,7 +805,7 @@ class DirectoryReader extends IndexReader implements Cloneable { // we have to check whether index has changed since this reader was opened. 
// if so, this reader is no longer valid for deletion - if (SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()) { + if (SegmentInfos.readCurrentVersion(directory, codecs) > segmentInfos.getVersion()) { stale = true; this.writeLock.release(); this.writeLock = null; @@ -751,13 +825,18 @@ class DirectoryReader extends IndexReader implements Cloneable { */ @Override protected void doCommit(Map commitUserData) throws IOException { + // poll subreaders for changes + for (int i = 0; !hasChanges && i < subReaders.length; i++) { + hasChanges |= subReaders[i].hasChanges; + } + if (hasChanges) { segmentInfos.setUserData(commitUserData); // Default deleter (for backwards compatibility) is // KeepOnlyLastCommitDeleter: IndexFileDeleter deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, - segmentInfos, null, null); + segmentInfos, null, null, codecs); // Checkpoint the state we are about to change, in // case we have to roll back: @@ -827,21 +906,31 @@ class DirectoryReader extends IndexReader implements Cloneable { } } + @Override + public long getUniqueTermCount() throws IOException { + throw new UnsupportedOperationException(""); + } + @Override public Map getCommitUserData() { ensureOpen(); return segmentInfos.getUserData(); } + /** + * Check whether this IndexReader is still using the current (i.e., most recently committed) version of the index. If + * a writer has committed any changes to the index since this reader was opened, this will return false, + * in which case you must open a new IndexReader in order + * to see the changes. Use {@link IndexWriter#commit} to + * commit changes to the index. + * + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ @Override public boolean isCurrent() throws CorruptIndexException, IOException { ensureOpen(); - if (writer == null || writer.isClosed()) { - // we loaded SegmentInfos from the directory - return SegmentInfos.readCurrentVersion(directory) == segmentInfos.getVersion(); - } else { - return writer.nrtIsCurrent(segmentInfosStart); - } + return SegmentInfos.readCurrentVersion(directory, codecs) == segmentInfos.getVersion(); } @Override @@ -893,6 +982,11 @@ class DirectoryReader extends IndexReader implements Cloneable { return subReaders; } + @Override + public int getSubReaderDocBase(IndexReader subReader) { + return subReaderToSlice.get(subReader).start; + } + /** Returns the directory this index resides in. 
*/ @Override public Directory directory() { @@ -919,12 +1013,17 @@ class DirectoryReader extends IndexReader implements Cloneable { /** @see org.apache.lucene.index.IndexReader#listCommits */ public static Collection listCommits(Directory dir) throws IOException { + return listCommits(dir, CodecProvider.getDefault()); + } + + /** @see org.apache.lucene.index.IndexReader#listCommits */ + public static Collection listCommits(Directory dir, CodecProvider codecs) throws IOException { final String[] files = dir.listAll(); Collection commits = new ArrayList(); SegmentInfos latest = new SegmentInfos(); - latest.read(dir); + latest.read(dir, codecs); final long currentGen = latest.getGeneration(); commits.add(new ReaderCommit(latest, dir)); @@ -941,7 +1040,7 @@ class DirectoryReader extends IndexReader implements Cloneable { try { // IOException allowed to throw there, in case // segments_N is corrupt - sis.read(dir, fileName); + sis.read(dir, fileName, codecs); } catch (FileNotFoundException fnfe) { // LUCENE-948: on NFS (and maybe others), if // you have writers switching back and forth @@ -1020,30 +1119,34 @@ class DirectoryReader extends IndexReader implements Cloneable { return userData; } } - + + // @deprecated This is pre-flex API + // Exposes pre-flex API by doing on-the-fly merging + // pre-flex API to each segment static class MultiTermEnum extends TermEnum { IndexReader topReader; // used for matching TermEnum to TermDocs - private SegmentMergeQueue queue; + private LegacySegmentMergeQueue queue; private Term term; private int docFreq; - final SegmentMergeInfo[] matchingSegments; // null terminated array of matching segments + final LegacySegmentMergeInfo[] matchingSegments; // null terminated array of matching segments public MultiTermEnum(IndexReader topReader, IndexReader[] readers, int[] starts, Term t) throws IOException { this.topReader = topReader; - queue = new SegmentMergeQueue(readers.length); - matchingSegments = new SegmentMergeInfo[readers.length+1]; + queue = new LegacySegmentMergeQueue(readers.length); + matchingSegments = new LegacySegmentMergeInfo[readers.length+1]; for (int i = 0; i < readers.length; i++) { IndexReader reader = readers[i]; TermEnum termEnum; if (t != null) { termEnum = reader.terms(t); - } else + } else { termEnum = reader.terms(); + } - SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader); + LegacySegmentMergeInfo smi = new LegacySegmentMergeInfo(starts[i], termEnum, reader); smi.ord = i; if (t == null ? smi.next() : termEnum.term() != null) queue.add(smi); // initialize queue @@ -1059,7 +1162,7 @@ class DirectoryReader extends IndexReader implements Cloneable { @Override public boolean next() throws IOException { for (int i=0; iNOTE: the default impl simply delegates to {@link + * #nextDoc}, but subclasses may do this more + * efficiently. 
*/ + public int read() throws IOException { + int count = 0; + final int[] docs = bulkResult.docs.ints; + final int[] freqs = bulkResult.freqs.ints; + while(count < docs.length) { + final int doc = nextDoc(); + if (doc != NO_MORE_DOCS) { + docs[count] = doc; + freqs[count] = freq(); + count++; + } else { + break; + } + } + return count; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java index 803c81f1661..0326ff3ab62 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -30,6 +30,7 @@ import java.util.Map.Entry; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; @@ -41,6 +42,7 @@ import org.apache.lucene.store.RAMFile; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Constants; import org.apache.lucene.util.ThreadInterruptedException; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; /** @@ -282,7 +284,6 @@ final class DocumentsWriter { // If we've allocated 5% over our RAM budget, we then // free down to 95% - private long freeTrigger = (long) (IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB*1024*1024*1.05); private long freeLevel = (long) (IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB*1024*1024*0.95); // Flush @ this number of docs. If ramBufferSize is @@ -353,7 +354,6 @@ final class DocumentsWriter { ramBufferSize = (long) (mb*1024*1024); waitQueuePauseBytes = (long) (ramBufferSize*0.1); waitQueueResumeBytes = (long) (ramBufferSize*0.05); - freeTrigger = (long) (1.05 * ramBufferSize); freeLevel = (long) (0.95 * ramBufferSize); } } @@ -550,7 +550,6 @@ final class DocumentsWriter { flushPending = false; for(int i=0;i 0) { - TermDocs docs = reader.termDocs(); try { + Fields fields = reader.fields(); + TermsEnum termsEnum = null; + + String currentField = null; + BytesRef termRef = new BytesRef(); + DocsEnum docs = null; + for (Entry entry: deletesFlushed.terms.entrySet()) { Term term = entry.getKey(); - // LUCENE-2086: we should be iterating a TreeMap, - // here, so terms better be in order: + // Since we visit terms sorted, we gain performance + // by re-using the same TermsEnum and seeking only + // forwards + if (term.field() != currentField) { + assert currentField == null || currentField.compareTo(term.field()) < 0; + currentField = term.field(); + Terms terms = fields.terms(currentField); + if (terms != null) { + termsEnum = terms.iterator(); + } else { + termsEnum = null; + } + } + + if (termsEnum == null) { + continue; + } assert checkDeleteTerm(term); - docs.seek(term); - int limit = entry.getValue().getNum(); - while (docs.next()) { - int docID = docs.doc(); - if (docIDStart+docID >= limit) - break; - reader.deleteDocument(docID); - any = true; + + termRef.copy(term.text()); + + if (termsEnum.seek(termRef, false) == TermsEnum.SeekStatus.FOUND) { + DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); + + if (docsEnum != null) { + docs = docsEnum; + int limit = entry.getValue().getNum(); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS || docIDStart+docID >= limit) { + break; + } + reader.deleteDocument(docID); + any = true; + } + } } } } finally { - docs.close(); + //docs.close(); } } 
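
The rewritten delete-by-term loop above is the canonical flex read path: fetch Fields once, obtain one Terms/TermsEnum per field, seek forward through terms visited in sorted order, and reuse a single DocsEnum across terms. Below is a condensed sketch (not part of the patch) of that pattern, assuming an atomic single-segment reader, since composite readers route fields() through MultiFields; the class and method names are illustrative, and the boolean passed to seek() simply mirrors the seek(termRef, false) call above.

    import java.io.IOException;
    import java.util.SortedSet;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    class SortedTermWalker {
      /** Visits the postings of the given terms (sorted by field, then text),
       *  re-using one TermsEnum per field and one DocsEnum, as applyDeletes does. */
      static int countMatches(IndexReader reader, SortedSet<Term> sortedTerms) throws IOException {
        final Fields fields = reader.fields();
        if (fields == null) {
          return 0;                                  // reader has no postings
        }
        final BytesRef ref = new BytesRef();
        String currentField = null;
        TermsEnum termsEnum = null;
        DocsEnum docs = null;
        int count = 0;
        for (Term t : sortedTerms) {
          if (!t.field().equals(currentField)) {     // field changed: fetch its Terms once
            currentField = t.field();
            final Terms terms = fields.terms(currentField);
            termsEnum = terms == null ? null : terms.iterator();
          }
          if (termsEnum == null) {
            continue;
          }
          ref.copy(t.text());                        // reuse one BytesRef for the term bytes
          if (termsEnum.seek(ref, false) == TermsEnum.SeekStatus.FOUND) {
            final DocsEnum d = termsEnum.docs(reader.getDeletedDocs(), docs);
            if (d != null) {
              docs = d;                              // keep for reuse on the next term
              while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                count++;
              }
            }
          }
        }
        return count;
      }
    }
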
- // Delete by docID for (Integer docIdInt : deletesFlushed.docIDs) { int docID = docIdInt.intValue(); @@ -1118,7 +1156,7 @@ final class DocumentsWriter { } synchronized boolean doBalanceRAM() { - return ramBufferSize != IndexWriterConfig.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger); + return ramBufferSize != IndexWriterConfig.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize); } /** Does the synchronized work to finish/flush the @@ -1201,7 +1239,6 @@ final class DocumentsWriter { return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed; } - long numBytesAlloc; long numBytesUsed; NumberFormat nf = NumberFormat.getInstance(); @@ -1243,6 +1280,8 @@ final class DocumentsWriter { final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1; final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK; + final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2; + private class ByteBlockAllocator extends ByteBlockPool.Allocator { final int blockSize; @@ -1259,19 +1298,16 @@ final class DocumentsWriter { final int size = freeByteBlocks.size(); final byte[] b; if (0 == size) { + b = new byte[blockSize]; // Always record a block allocated, even if // trackAllocations is false. This is necessary // because this block will be shared between // things that don't track allocations (term // vectors) and things that do (freq/prox // postings). - numBytesAlloc += blockSize; - b = new byte[blockSize]; + numBytesUsed += blockSize; } else b = freeByteBlocks.remove(size-1); - if (trackAllocations) - numBytesUsed += blockSize; - assert numBytesUsed <= numBytesAlloc; return b; } } @@ -1291,7 +1327,7 @@ final class DocumentsWriter { final int size = blocks.size(); for(int i=0;i freeCharBlocks = new ArrayList(); - - /* Allocate another char[] from the shared pool */ - synchronized char[] getCharBlock() { - final int size = freeCharBlocks.size(); - final char[] c; - if (0 == size) { - numBytesAlloc += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; - c = new char[CHAR_BLOCK_SIZE]; - } else - c = freeCharBlocks.remove(size-1); - // We always track allocations of char blocks, for now, - // because nothing that skips allocation tracking - // (currently only term vectors) uses its own char - // blocks. 
- numBytesUsed += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; - assert numBytesUsed <= numBytesAlloc; - return c; - } - - /* Return char[]s to the pool */ - synchronized void recycleCharBlocks(char[][] blocks, int numBlocks) { - for(int i=0;i freeTrigger) { + if (numBytesUsed+deletesRAMUsed > ramBufferSize) { if (infoStream != null) message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) + - " vs trigger=" + toMB(flushTrigger) + - " allocMB=" + toMB(numBytesAlloc) + + " vs trigger=" + toMB(ramBufferSize) + " deletesMB=" + toMB(deletesRAMUsed) + - " vs trigger=" + toMB(freeTrigger) + " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) + - " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE) + - " charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE)); + " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE)); - final long startBytesAlloc = numBytesAlloc + deletesRAMUsed; + final long startBytesUsed = numBytesUsed + deletesRAMUsed; int iter = 0; @@ -1427,46 +1410,38 @@ final class DocumentsWriter { boolean any = true; - while(numBytesAlloc+deletesRAMUsed > freeLevel) { + while(numBytesUsed+deletesRAMUsed > freeLevel) { synchronized(this) { - if (0 == perDocAllocator.freeByteBlocks.size() - && 0 == byteBlockAllocator.freeByteBlocks.size() - && 0 == freeCharBlocks.size() - && 0 == freeIntBlocks.size() - && !any) { + if (0 == perDocAllocator.freeByteBlocks.size() && + 0 == byteBlockAllocator.freeByteBlocks.size() && + 0 == freeIntBlocks.size() && !any) { // Nothing else to free -- must flush now. - bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger; + bufferIsFull = numBytesUsed+deletesRAMUsed > ramBufferSize; if (infoStream != null) { - if (numBytesUsed > flushTrigger) + if (numBytesUsed+deletesRAMUsed > ramBufferSize) message(" nothing to free; now set bufferIsFull"); else message(" nothing to free"); } - assert numBytesUsed <= numBytesAlloc; break; } - if ((0 == iter % 5) && byteBlockAllocator.freeByteBlocks.size() > 0) { + if ((0 == iter % 4) && byteBlockAllocator.freeByteBlocks.size() > 0) { byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1); - numBytesAlloc -= BYTE_BLOCK_SIZE; + numBytesUsed -= BYTE_BLOCK_SIZE; } - if ((1 == iter % 5) && freeCharBlocks.size() > 0) { - freeCharBlocks.remove(freeCharBlocks.size()-1); - numBytesAlloc -= CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; - } - - if ((2 == iter % 5) && freeIntBlocks.size() > 0) { + if ((1 == iter % 4) && freeIntBlocks.size() > 0) { freeIntBlocks.remove(freeIntBlocks.size()-1); - numBytesAlloc -= INT_BLOCK_SIZE * INT_NUM_BYTE; + numBytesUsed -= INT_BLOCK_SIZE * INT_NUM_BYTE; } - if ((3 == iter % 5) && perDocAllocator.freeByteBlocks.size() > 0) { + if ((2 == iter % 4) && perDocAllocator.freeByteBlocks.size() > 0) { // Remove upwards of 32 blocks (each block is 1K) for (int i = 0; i < 32; ++i) { perDocAllocator.freeByteBlocks.remove(perDocAllocator.freeByteBlocks.size() - 1); - numBytesAlloc -= PER_DOC_BLOCK_SIZE; + numBytesUsed -= PER_DOC_BLOCK_SIZE; if (perDocAllocator.freeByteBlocks.size() == 0) { break; } @@ -1474,7 +1449,7 @@ final class DocumentsWriter { } } - if ((4 == iter % 5) && any) + if ((3 == iter % 4) && any) // Ask consumer to free any recycled state any = consumer.freeRAM(); @@ -1482,26 +1457,7 @@ final class DocumentsWriter { } if (infoStream != null) - message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.) 
+ " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.)); - - } else { - // If we have not crossed the 100% mark, but have - // crossed the 95% mark of RAM we are actually - // using, go ahead and flush. This prevents - // over-allocating and then freeing, with every - // flush. - synchronized(this) { - - if (numBytesUsed+deletesRAMUsed > flushTrigger) { - if (infoStream != null) - message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) + - " allocMB=" + nf.format(numBytesAlloc/1024./1024.) + - " deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) + - " triggerMB=" + nf.format(flushTrigger/1024./1024.)); - - bufferIsFull = true; - } - } + message(" after free: freedMB=" + nf.format((startBytesUsed-numBytesUsed-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.)); } } diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/src/java/org/apache/lucene/index/FieldInfo.java index 90fb11af257..e15b5680fa3 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfo.java @@ -17,20 +17,21 @@ package org.apache.lucene.index; * limitations under the License. */ -final class FieldInfo { - String name; - boolean isIndexed; - int number; +/** @lucene.experimental */ +public final class FieldInfo { + public String name; + public boolean isIndexed; + public int number; // true if term vector for this field should be stored boolean storeTermVector; boolean storeOffsetWithTermVector; boolean storePositionWithTermVector; - boolean omitNorms; // omit norms associated with indexed fields - boolean omitTermFreqAndPositions; - - boolean storePayloads; // whether this field stores payloads together with term positions + public boolean omitNorms; // omit norms associated with indexed fields + public boolean omitTermFreqAndPositions; + + public boolean storePayloads; // whether this field stores payloads together with term positions FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java index 19a0df29ed8..6ea6e927a77 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java @@ -32,8 +32,9 @@ import java.util.*; * of this class are thread-safe for multiple readers, but only one thread can * be adding documents at a time, with no other reader or writer threads * accessing this object. 
+ * @lucene.experimental */ -final class FieldInfos { +public final class FieldInfos { // Used internally (ie not written to *.fnm files) for pre-2.9 files public static final int FORMAT_PRE = -1; @@ -120,7 +121,7 @@ final class FieldInfos { } /** Returns true if any fields do not omitTermFreqAndPositions */ - boolean hasProx() { + public boolean hasProx() { final int numFields = byNumber.size(); for(int i=0;i 0 && delta <= 0)) - throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); - - if ((++df % skipInterval) == 0) { - // TODO: abstraction violation - skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength); - skipListWriter.bufferSkip(df); - } - - assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; - - lastDocID = docID; - if (omitTermFreqAndPositions) - out.writeVInt(delta); - else if (1 == termDocFreq) - out.writeVInt((delta<<1) | 1); - else { - out.writeVInt(delta<<1); - out.writeVInt(termDocFreq); - } - - return posWriter; - } - - private final TermInfo termInfo = new TermInfo(); // minimize consing - final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); - - /** Called when we are done adding docs to this term */ - @Override - void finish() throws IOException { - long skipPointer = skipListWriter.writeSkip(out); - - // TODO: this is abstraction violation -- we should not - // peek up into parents terms encoding format - termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart)); - - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8); - - if (df > 0) { - parent.termsOut.add(fieldInfo.number, - utf8.result, - utf8.length, - termInfo); - } - - lastDocID = 0; - df = 0; - } - - void close() throws IOException { - out.close(); - posWriter.close(); - } -} diff --git a/lucene/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java b/lucene/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java deleted file mode 100644 index 4efa2c00cee..00000000000 --- a/lucene/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.store.Directory; - -final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer { - - final Directory dir; - final String segment; - final TermInfosWriter termsOut; - final FieldInfos fieldInfos; - final FormatPostingsTermsWriter termsWriter; - final DefaultSkipListWriter skipListWriter; - final int totalNumDocs; - - public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException { - super(); - - dir = state.directory; - segment = state.segmentName; - totalNumDocs = state.numDocs; - this.fieldInfos = fieldInfos; - termsOut = new TermInfosWriter(dir, - segment, - fieldInfos, - state.termIndexInterval); - - // TODO: this is a nasty abstraction violation (that we - // peek down to find freqOut/proxOut) -- we need a - // better abstraction here whereby these child consumers - // can provide skip data or not - skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, - termsOut.maxSkipLevels, - totalNumDocs, - null, - null); - - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION)); - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)); - - termsWriter = new FormatPostingsTermsWriter(state, this); - } - - /** Add a new field */ - @Override - FormatPostingsTermsConsumer addField(FieldInfo field) { - termsWriter.setField(field); - return termsWriter; - } - - /** Called when we are done adding everything. */ - @Override - void finish() throws IOException { - termsOut.close(); - termsWriter.close(); - } -} diff --git a/lucene/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java b/lucene/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java deleted file mode 100644 index 6a2266784bf..00000000000 --- a/lucene/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java +++ /dev/null @@ -1,89 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.store.IndexOutput; - - -import java.io.IOException; - -final class FormatPostingsPositionsWriter extends FormatPostingsPositionsConsumer { - - final FormatPostingsDocsWriter parent; - final IndexOutput out; - - boolean omitTermFreqAndPositions; - boolean storePayloads; - int lastPayloadLength = -1; - - FormatPostingsPositionsWriter(SegmentWriteState state, FormatPostingsDocsWriter parent) throws IOException { - this.parent = parent; - omitTermFreqAndPositions = parent.omitTermFreqAndPositions; - if (parent.parent.parent.fieldInfos.hasProx()) { - // At least one field does not omit TF, so create the - // prox file - final String fileName = IndexFileNames.segmentFileName(parent.parent.parent.segment, IndexFileNames.PROX_EXTENSION); - state.flushedFiles.add(fileName); - out = parent.parent.parent.dir.createOutput(fileName); - parent.skipListWriter.setProxOutput(out); - } else - // Every field omits TF so we will write no prox file - out = null; - } - - int lastPosition; - - /** Add a new position & payload */ - @Override - void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException { - assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true"; - assert out != null; - - final int delta = position - lastPosition; - lastPosition = position; - - if (storePayloads) { - if (payloadLength != lastPayloadLength) { - lastPayloadLength = payloadLength; - out.writeVInt((delta<<1)|1); - out.writeVInt(payloadLength); - } else - out.writeVInt(delta << 1); - if (payloadLength > 0) - out.writeBytes(payload, payloadLength); - } else - out.writeVInt(delta); - } - - void setField(FieldInfo fieldInfo) { - omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; - storePayloads = omitTermFreqAndPositions ? false : fieldInfo.storePayloads; - } - - /** Called when we are done adding positions & payloads */ - @Override - void finish() { - lastPosition = 0; - lastPayloadLength = -1; - } - - void close() throws IOException { - if (out != null) - out.close(); - } -} diff --git a/lucene/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java b/lucene/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java deleted file mode 100644 index dfc85ae3144..00000000000 --- a/lucene/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; - -/** - * @lucene.experimental - */ - -abstract class FormatPostingsTermsConsumer { - - /** Adds a new term in this field; term ends with U+FFFF - * char */ - abstract FormatPostingsDocsConsumer addTerm(char[] text, int start) throws IOException; - - char[] termBuffer; - FormatPostingsDocsConsumer addTerm(String text) throws IOException { - final int len = text.length(); - if (termBuffer == null || termBuffer.length < 1+len) - termBuffer = new char[ArrayUtil.oversize(1+len, RamUsageEstimator.NUM_BYTES_CHAR)]; - text.getChars(0, len, termBuffer, 0); - termBuffer[len] = 0xffff; - return addTerm(termBuffer, 0); - } - - /** Called when we are done adding terms to this field */ - abstract void finish() throws IOException; -} diff --git a/lucene/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java b/lucene/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java deleted file mode 100644 index 33e4d4f9057..00000000000 --- a/lucene/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java +++ /dev/null @@ -1,73 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer { - - final FormatPostingsFieldsWriter parent; - final FormatPostingsDocsWriter docsWriter; - final TermInfosWriter termsOut; - FieldInfo fieldInfo; - - FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException { - super(); - this.parent = parent; - termsOut = parent.termsOut; - docsWriter = new FormatPostingsDocsWriter(state, this); - } - - void setField(FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - docsWriter.setField(fieldInfo); - } - - char[] currentTerm; - int currentTermStart; - - long freqStart; - long proxStart; - - /** Adds a new term in this field */ - @Override - FormatPostingsDocsConsumer addTerm(char[] text, int start) { - currentTerm = text; - currentTermStart = start; - - // TODO: this is abstraction violation -- ideally this - // terms writer is not so "invasive", looking for file - // pointers in its child consumers. 
- freqStart = docsWriter.out.getFilePointer(); - if (docsWriter.posWriter.out != null) - proxStart = docsWriter.posWriter.out.getFilePointer(); - - parent.skipListWriter.resetSkip(); - - return docsWriter; - } - - /** Called when we are done adding terms to this field */ - @Override - void finish() { - } - - void close() throws IOException { - docsWriter.close(); - } -} diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java b/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java index c84c32f4237..533af28a3cc 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java @@ -18,6 +18,8 @@ package org.apache.lucene.index; */ import java.io.IOException; +import java.util.Comparator; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray; @@ -31,13 +33,12 @@ final class FreqProxFieldMergeState { final FreqProxTermsWriterPerField field; final int numPostings; - final CharBlockPool charPool; + private final ByteBlockPool bytePool; final int[] termIDs; final FreqProxPostingsArray postings; int currentTermID; - char[] text; - int textOffset; + final BytesRef text = new BytesRef(); private int postingUpto = -1; @@ -47,29 +48,31 @@ final class FreqProxFieldMergeState { int docID; int termFreq; - public FreqProxFieldMergeState(FreqProxTermsWriterPerField field) { + public FreqProxFieldMergeState(FreqProxTermsWriterPerField field, Comparator termComp) { this.field = field; - this.charPool = field.perThread.termsHashPerThread.charPool; this.numPostings = field.termsHashPerField.numPostings; - this.termIDs = field.termsHashPerField.sortPostings(); + this.bytePool = field.perThread.termsHashPerThread.bytePool; + this.termIDs = field.termsHashPerField.sortPostings(termComp); this.postings = (FreqProxPostingsArray) field.termsHashPerField.postingsArray; } boolean nextTerm() throws IOException { postingUpto++; - if (postingUpto == numPostings) + if (postingUpto == numPostings) { return false; + } currentTermID = termIDs[postingUpto]; docID = 0; + // Get BytesRef final int textStart = postings.textStarts[currentTermID]; - text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - textOffset = textStart & DocumentsWriter.CHAR_BLOCK_MASK; + bytePool.setBytesRef(text, textStart); field.termsHashPerField.initReader(freq, currentTermID, 0); - if (!field.fieldInfo.omitTermFreqAndPositions) + if (!field.fieldInfo.omitTermFreqAndPositions) { field.termsHashPerField.initReader(prox, currentTermID, 1); + } // Should always be true boolean result = nextDoc(); diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index 85269c971c2..7a3a062426a 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -17,14 +17,19 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import org.apache.lucene.util.UnicodeUtil; - import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Map; -import java.util.ArrayList; +import java.util.Iterator; import java.util.List; +import java.util.Map; +import java.util.Comparator; + +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.util.BytesRef; final class FreqProxTermsWriter extends TermsHashConsumer { @@ -33,27 +38,13 @@ final class FreqProxTermsWriter extends TermsHashConsumer { return new FreqProxTermsWriterPerThread(perThread); } - private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) { - while(true) { - final char c1 = text1[pos1++]; - final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else if (0xffff == c1) - return 0; - } - } - @Override void closeDocStore(SegmentWriteState state) {} + @Override void abort() {} + private int flushedDocCount; // TODO: would be nice to factor out more of this, eg the // FreqProxFieldMergeState, and code to visit all Fields @@ -66,6 +57,8 @@ final class FreqProxTermsWriter extends TermsHashConsumer { // Gather all FieldData's that have postings, across all // ThreadStates List allFields = new ArrayList(); + + flushedDocCount = state.numDocs; for (Map.Entry> entry : threadsAndFields.entrySet()) { @@ -79,21 +72,23 @@ final class FreqProxTermsWriter extends TermsHashConsumer { } } - // Sort by field name - Collections.sort(allFields); final int numAllFields = allFields.size(); - // TODO: allow Lucene user to customize this consumer: - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + // Sort by field name + Collections.sort(allFields); + + // TODO: allow Lucene user to customize this codec: + final FieldsConsumer consumer = state.codec.fieldsConsumer(state); + /* Current writer chain: - FormatPostingsFieldsConsumer - -> IMPL: FormatPostingsFieldsWriter - -> FormatPostingsTermsConsumer - -> IMPL: FormatPostingsTermsWriter - -> FormatPostingsDocConsumer - -> IMPL: FormatPostingsDocWriter - -> FormatPostingsPositionsConsumer + FieldsConsumer + -> IMPL: FormatPostingsTermsDictWriter + -> TermsConsumer + -> IMPL: FormatPostingsTermsDictWriter.TermsWriter + -> DocsConsumer + -> IMPL: FormatPostingsDocsWriter + -> PositionsConsumer -> IMPL: FormatPostingsPositionsWriter */ @@ -134,25 +129,29 @@ final class FreqProxTermsWriter extends TermsHashConsumer { FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey(); perThread.termsHashPerThread.reset(true); } - - consumer.finish(); + consumer.close(); } - private byte[] payloadBuffer; + BytesRef payload; /* Walk through all unique text tokens (Posting * instances) found in this field and serialize them * into a single RAM segment. 
*/ void appendPostings(FreqProxTermsWriterPerField[] fields, - FormatPostingsFieldsConsumer consumer) + FieldsConsumer consumer) throws CorruptIndexException, IOException { int numFields = fields.length; + final BytesRef text = new BytesRef(); + final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields]; + final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); + final Comparator termComp = termsConsumer.getComparator(); + for(int i=0;i extensions; - private HashSet extensionsInCFS; + private final HashSet extensions; - // Prevent instantiation. - private IndexFileNameFilter() { + public IndexFileNameFilter(CodecProvider codecs) { extensions = new HashSet(); for (String ext : IndexFileNames.INDEX_EXTENSIONS) { extensions.add(ext); } - extensionsInCFS = new HashSet(); - for (String ext : IndexFileNames.INDEX_EXTENSIONS_IN_COMPOUND_FILE) { - extensionsInCFS.add(ext); + if (codecs != null) { + for(String ext : codecs.getAllExtensions()) { + extensions.add(ext); + } } } @@ -66,29 +67,4 @@ public class IndexFileNameFilter implements FilenameFilter { } return false; } - - /** - * Returns true if this is a file that would be contained - * in a CFS file. This function should only be called on - * files that pass the above "accept" (ie, are already - * known to be a Lucene index file). - */ - public boolean isCFSFile(String name) { - int i = name.lastIndexOf('.'); - if (i != -1) { - String extension = name.substring(1+i); - if (extensionsInCFS.contains(extension)) { - return true; - } - if (extension.startsWith("f") && - extension.matches("f\\d+")) { - return true; - } - } - return false; - } - - public static IndexFileNameFilter getFilter() { - return singleton; - } } diff --git a/lucene/src/java/org/apache/lucene/index/IndexFileNames.java b/lucene/src/java/org/apache/lucene/index/IndexFileNames.java index 51483d484e7..b1e86b3ea66 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexFileNames.java +++ b/lucene/src/java/org/apache/lucene/index/IndexFileNames.java @@ -17,6 +17,8 @@ package org.apache.lucene.index; * limitations under the License. */ +import org.apache.lucene.index.codecs.Codec; // for javadocs + /** * This class contains useful constants representing filenames and extensions * used by lucene, as well as convenience methods for querying whether a file @@ -25,16 +27,24 @@ package org.apache.lucene.index; * generation and extension ( * {@link #fileNameFromGeneration(String, String, long) fileNameFromGeneration}, * {@link #segmentFileName(String, String) segmentFileName}). - * + * + *
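
With the postings extensions moving out of IndexFileNames and into the codecs, IndexFileNameFilter above is no longer a static singleton: it is constructed from a CodecProvider so that codec-registered extensions are accepted alongside the core ones. A small sketch (not part of the patch) of constructing and applying the filter; the directory path is illustrative.

    import java.io.File;
    import org.apache.lucene.index.IndexFileNameFilter;
    import org.apache.lucene.index.codecs.CodecProvider;

    class ListIndexFiles {
      public static void main(String[] args) {
        // Accepts core index files plus every extension registered by the installed codecs.
        final IndexFileNameFilter filter = new IndexFileNameFilter(CodecProvider.getDefault());
        final File indexDir = new File("/path/to/index");   // illustrative path
        final String[] names = indexDir.list(filter);       // standard java.io.FilenameFilter usage
        if (names != null) {
          for (String name : names) {
            System.out.println(name);
          }
        }
      }
    }
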

    NOTE: extensions used by codecs are not + * listed here. You must interact with the {@link Codec} + * directly. + * * @lucene.internal */ + public final class IndexFileNames { /** Name of the index segment file */ public static final String SEGMENTS = "segments"; + /** Extension of gen file */ + public static final String GEN_EXTENSION = "gen"; + /** Name of the generation reference file name */ - public static final String SEGMENTS_GEN = "segments.gen"; + public static final String SEGMENTS_GEN = "segments." + GEN_EXTENSION; /** Name of the index deletable file (only used in * pre-lockless indices) */ @@ -43,18 +53,6 @@ public final class IndexFileNames { /** Extension of norms file */ public static final String NORMS_EXTENSION = "nrm"; - /** Extension of freq postings file */ - public static final String FREQ_EXTENSION = "frq"; - - /** Extension of prox postings file */ - public static final String PROX_EXTENSION = "prx"; - - /** Extension of terms file */ - public static final String TERMS_EXTENSION = "tis"; - - /** Extension of terms index file */ - public static final String TERMS_INDEX_EXTENSION = "tii"; - /** Extension of stored fields index file */ public static final String FIELDS_INDEX_EXTENSION = "fdx"; @@ -88,9 +86,6 @@ public final class IndexFileNames { /** Extension of separate norms */ public static final String SEPARATE_NORMS_EXTENSION = "s"; - /** Extension of gen file */ - public static final String GEN_EXTENSION = "gen"; - /** * This array contains all filename extensions used by * Lucene's index files, with two exceptions, namely the @@ -104,10 +99,6 @@ public final class IndexFileNames { FIELD_INFOS_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION, - TERMS_INDEX_EXTENSION, - TERMS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, DELETES_EXTENSION, VECTORS_INDEX_EXTENSION, VECTORS_DOCUMENTS_EXTENSION, @@ -117,22 +108,6 @@ public final class IndexFileNames { COMPOUND_FILE_STORE_EXTENSION, }; - /** File extensions that are added to a compound file - * (same as above, minus "del", "gen", "cfs"). 
*/ - public static final String[] INDEX_EXTENSIONS_IN_COMPOUND_FILE = new String[] { - FIELD_INFOS_EXTENSION, - FIELDS_INDEX_EXTENSION, - FIELDS_EXTENSION, - TERMS_INDEX_EXTENSION, - TERMS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, - VECTORS_INDEX_EXTENSION, - VECTORS_DOCUMENTS_EXTENSION, - VECTORS_FIELDS_EXTENSION, - NORMS_EXTENSION - }; - public static final String[] STORE_INDEX_EXTENSIONS = new String[] { VECTORS_INDEX_EXTENSION, VECTORS_FIELDS_EXTENSION, @@ -143,22 +118,13 @@ public final class IndexFileNames { public static final String[] NON_STORE_INDEX_EXTENSIONS = new String[] { FIELD_INFOS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, - TERMS_EXTENSION, - TERMS_INDEX_EXTENSION, NORMS_EXTENSION }; - /** File extensions of old-style index files */ - public static final String COMPOUND_EXTENSIONS[] = new String[] { + static final String COMPOUND_EXTENSIONS_NOT_CODEC[] = new String[] { FIELD_INFOS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION, - TERMS_INDEX_EXTENSION, - TERMS_EXTENSION }; /** File extensions for term vector support */ @@ -222,6 +188,7 @@ public final class IndexFileNames { */ public static final String segmentFileName(String segmentName, String ext) { if (ext.length() > 0) { + assert !ext.startsWith("."); return new StringBuilder(segmentName.length() + 1 + ext.length()).append( segmentName).append('.').append(ext).toString(); } else { diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java index 7f3ae551a64..891231422de 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java @@ -20,7 +20,11 @@ package org.apache.lucene.index; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.store.*; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; // for javadocs import java.io.File; import java.io.FileOutputStream; @@ -213,7 +217,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory) throws CorruptIndexException, IOException { - return open(directory, null, null, true, DEFAULT_TERMS_INDEX_DIVISOR); + return open(directory, null, null, true, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Returns an IndexReader reading the index in the given @@ -227,7 +231,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, boolean readOnly) throws CorruptIndexException, IOException { - return open(directory, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(directory, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in the given @@ -241,7 +245,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, boolean readOnly) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), null, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(commit.getDirectory(), null, commit, readOnly, 
DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in @@ -259,7 +263,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { - return open(directory, deletionPolicy, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(directory, deletionPolicy, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in @@ -287,7 +291,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { - return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor); + return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor, null); } /** Expert: returns an IndexReader reading the index in @@ -307,7 +311,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), deletionPolicy, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(commit.getDirectory(), deletionPolicy, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in @@ -337,11 +341,78 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor); + return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor, null); } - private static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { - return DirectoryReader.open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor); + /** Expert: returns an IndexReader reading the index in + * the given Directory, with a custom {@link + * IndexDeletionPolicy}, and specified {@link CodecProvider}. + * You should pass readOnly=true, since it gives much + * better concurrent performance, unless you intend to do + * write operations (delete documents or change norms) + * with the reader. + * @param directory the index directory + * @param deletionPolicy a custom deletion policy (only used + * if you use this reader to perform deletes or to set + * norms); see {@link IndexWriter} for details. + * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader + * @param termInfosIndexDivisor Subsamples which indexed + * terms are loaded into RAM. This has the same effect as {@link + * IndexWriter#setTermIndexInterval} except that setting + * must be done at indexing time while this setting can be + * set per reader. 
When set to N, then one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1. Set this + * to -1 to skip loading the terms index entirely. + * @param codecs CodecProvider to use when opening index + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor, CodecProvider codecs) throws CorruptIndexException, IOException { + return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor, codecs); + } + + /** Expert: returns an IndexReader reading the index in + * the given Directory, using a specific commit and with + * a custom {@link IndexDeletionPolicy} and specified + * {@link CodecProvider}. You should pass readOnly=true, since + * it gives much better concurrent performance, unless + * you intend to do write operations (delete documents or + * change norms) with the reader. + + * @param commit the specific {@link IndexCommit} to open; + * see {@link IndexReader#listCommits} to list all commits + * in a directory + * @param deletionPolicy a custom deletion policy (only used + * if you use this reader to perform deletes or to set + * norms); see {@link IndexWriter} for details. + * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader + * @param termInfosIndexDivisor Subsamples which indexed + * terms are loaded into RAM. This has the same effect as {@link + * IndexWriter#setTermIndexInterval} except that setting + * must be done at indexing time while this setting can be + * set per reader. When set to N, then one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1. Set this + * to -1 to skip loading the terms index entirely. 
+ * @param codecs CodecProvider to use when opening index + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor, CodecProvider codecs) throws CorruptIndexException, IOException { + return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor, codecs); + } + + private static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, int termInfosIndexDivisor, + CodecProvider codecs) throws CorruptIndexException, IOException { + if (codecs == null) { + codecs = CodecProvider.getDefault(); + } + return DirectoryReader.open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor, codecs); } /** @@ -483,7 +554,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException { - return SegmentInfos.readCurrentVersion(directory); + return SegmentInfos.readCurrentVersion(directory, CodecProvider.getDefault()); } /** @@ -501,7 +572,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @see #getCommitUserData() */ public static Map getCommitUserData(Directory directory) throws CorruptIndexException, IOException { - return SegmentInfos.readCurrentUserData(directory); + return SegmentInfos.readCurrentUserData(directory, CodecProvider.getDefault()); } /** @@ -803,24 +874,63 @@ public abstract class IndexReader implements Cloneable,Closeable { * calling terms(), {@link TermEnum#next()} must be called * on the resulting enumeration before calling other methods such as * {@link TermEnum#term()}. + * @deprecated Use the new flex API ({@link #fields()}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermEnum terms() throws IOException; + /** Flex API: returns {@link Fields} for this reader. + * This method may return null if the reader has no + * postings. + * + *
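
The open(...) overloads documented above add two expert knobs: the terms-index divisor and a CodecProvider (a null provider falls back to CodecProvider.getDefault()). A minimal read-only open with explicit values, as a sketch rather than part of the patch; the directory path is illustrative.

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.codecs.CodecProvider;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    class OpenWithCodecs {
      public static void main(String[] args) throws IOException {
        final Directory dir = FSDirectory.open(new File("/path/to/index"));  // illustrative path
        // readOnly=true for better concurrency; divisor=1 loads the full terms index;
        // passing null for the CodecProvider would fall back to CodecProvider.getDefault().
        final IndexReader reader = IndexReader.open(dir, null, true, 1, CodecProvider.getDefault());
        try {
          System.out.println("maxDoc=" + reader.maxDoc());
        } finally {
          reader.close();
        }
      }
    }
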

    NOTE: if this is a multi reader ({@link + * #getSequentialSubReaders} is not null) then this + * method will throw UnsupportedOperationException. If + * you really need a {@link Fields} for such a reader, + * use {@link MultiFields#getFields}. However, for + * performance reasons, it's best to get all sub-readers + * using {@link ReaderUtil#gatherSubReaders} and iterate + * through them yourself. */ + public Fields fields() throws IOException { + return new LegacyFields(this); + } + /** Returns an enumeration of all terms starting at a given term. If * the given term does not exist, the enumeration is positioned at the * first term greater than the supplied term. The enumeration is * ordered by Term.compareTo(). Each term is greater than all that * precede it in the enumeration. + * @deprecated Use the new flex API ({@link #fields()}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermEnum terms(Term t) throws IOException; /** Returns the number of documents containing the term t. * @throws IOException if there is a low-level IO error + * @deprecated Use {@link #docFreq(String,BytesRef)} instead. */ + @Deprecated public abstract int docFreq(Term t) throws IOException; + /** Returns the number of documents containing the term + * t. This method returns 0 if the term or + * field does not exists. This method does not take into + * account deleted documents that have not yet been merged + * away. */ + public int docFreq(String field, BytesRef term) throws IOException { + final Fields fields = fields(); + if (fields == null) { + return 0; + } + final Terms terms = fields.terms(field); + if (terms == null) { + return 0; + } + return terms.docFreq(term); + } + /** Returns an enumeration of all the documents which contain * term. For each document, the document number, the frequency of * the term in that document is also provided, for use in @@ -832,8 +942,10 @@ public abstract class IndexReader implements Cloneable,Closeable { * *

    The enumeration is ordered by document number. Each document number * is greater than all that precede it in the enumeration. + * @deprecated Use the new flex API ({@link #termDocsEnum}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public TermDocs termDocs(Term term) throws IOException { ensureOpen(); TermDocs termDocs = termDocs(); @@ -841,9 +953,57 @@ public abstract class IndexReader implements Cloneable,Closeable { return termDocs; } + /** This may return null if the field does not exist.*/ + public Terms terms(String field) throws IOException { + final Fields fields = fields(); + if (fields == null) { + return null; + } + return fields.terms(field); + } + + /** Returns {@link DocsEnum} for the specified field & + * term. This may return null, if either the field or + * term does not exist. */ + public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term) throws IOException { + assert field != null; + assert term != null; + final Fields fields = fields(); + if (fields == null) { + return null; + } + final Terms terms = fields.terms(field); + if (terms != null) { + return terms.docs(skipDocs, term, null); + } else { + return null; + } + } + + /** Returns {@link DocsAndPositionsEnum} for the specified + * field & term. This may return null, if either the + * field or term does not exist, or, positions were not + * stored for this term. */ + public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term) throws IOException { + assert field != null; + assert term != null; + final Fields fields = fields(); + if (fields == null) { + return null; + } + final Terms terms = fields.terms(field); + if (terms != null) { + return terms.docsAndPositions(skipDocs, term, null); + } else { + return null; + } + } + /** Returns an unpositioned {@link TermDocs} enumerator. + * @deprecated Use the new flex API ({@link #fields()}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermDocs termDocs() throws IOException; /** Returns an enumeration of all the documents which contain @@ -861,8 +1021,11 @@ public abstract class IndexReader implements Cloneable,Closeable { *

    This positional information facilitates phrase and proximity searching. *
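
The deprecated TermDocs/TermPositions accessors above point to the flex replacements termDocsEnum and termPositionsEnum. The sketch below (not part of the patch) walks positions through a DocsAndPositionsEnum, assuming an atomic reader (composite readers throw from getDeletedDocs() and fields()); nextDoc() and freq() come from DocsEnum, which DocsAndPositionsEnum is assumed to extend, and nextPosition() is assumed from the flex contract since this hunk does not show it. The field and term values are illustrative.

    import java.io.IOException;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.util.BytesRef;

    class PositionWalker {
      static void dumpPositions(IndexReader reader, String field, String text) throws IOException {
        final BytesRef term = new BytesRef();
        term.copy(text);
        // skipDocs = deleted docs, so deleted documents are not returned
        final DocsAndPositionsEnum postings =
            reader.termPositionsEnum(reader.getDeletedDocs(), field, term);
        if (postings == null) {
          return;                                  // field or term missing, or positions not stored
        }
        int doc;
        while ((doc = postings.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
          final int freq = postings.freq();
          for (int i = 0; i < freq; i++) {
            // nextPosition() assumed from the flex DocsAndPositionsEnum API
            System.out.println("doc=" + doc + " pos=" + postings.nextPosition());
          }
        }
      }
    }
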

    The enumeration is ordered by document number. Each document number is * greater than all that precede it in the enumeration. + * @deprecated Please switch the flex API ({@link + * #termDocsEnum}) instead * @throws IOException if there is a low-level IO error */ + @Deprecated public TermPositions termPositions(Term term) throws IOException { ensureOpen(); TermPositions termPositions = termPositions(); @@ -871,14 +1034,17 @@ public abstract class IndexReader implements Cloneable,Closeable { } /** Returns an unpositioned {@link TermPositions} enumerator. + * @deprecated Please switch the flex API ({@link + * #termDocsEnum}) instead * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermPositions termPositions() throws IOException; /** Deletes the document numbered docNum. Once a document is - * deleted it will not appear in TermDocs or TermPostitions enumerations. + * deleted it will not appear in TermDocs or TermPositions enumerations. * Attempts to read its field with the {@link #document} * method will result in an error. The presence of this document may still be * reflected in the {@link #docFreq} statistic, though @@ -1009,9 +1175,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * @throws IOException if there is a low-level IO error */ public final synchronized void commit(Map commitUserData) throws IOException { - if (hasChanges) { - doCommit(commitUserData); - } + doCommit(commitUserData); hasChanges = false; } @@ -1044,6 +1208,33 @@ public abstract class IndexReader implements Cloneable,Closeable { */ public abstract Collection getFieldNames(FieldOption fldOption); + // Only used by external subclasses of IndexReader; all + // internal classes should implement Bits more + // efficiently: + private final class DeletedDocsBits implements Bits { + public boolean get(int docID) { + return isDeleted(docID); + } + public int length() { + return maxDoc(); + } + } + + private Bits deletedDocsBits; + + /** Returns the {@link Bits} representing deleted docs. A + * set bit indicates the doc ID has been deleted. This + * method should return null when there are no deleted + * docs. + * + * @lucene.experimental */ + public Bits getDeletedDocs() throws IOException { + if (deletedDocsBits == null) { + deletedDocsBits = new DeletedDocsBits(); + } + return deletedDocsBits; + } + /** * Expert: return the IndexCommit that this reader has * opened. This method is only implemented by those @@ -1169,6 +1360,12 @@ public abstract class IndexReader implements Cloneable,Closeable { return null; } + + /** Expert: returns the docID base for this subReader. */ + public int getSubReaderDocBase(IndexReader subReader) { + throw new UnsupportedOperationException(); + } + /** Expert */ public Object getFieldCacheKey() { return this; @@ -1177,17 +1374,26 @@ public abstract class IndexReader implements Cloneable,Closeable { /** Returns the number of unique terms (across all fields) * in this reader. * - * This method returns long, even though internally - * Lucene cannot handle more than 2^31 unique terms, for - * a possible future when this limitation is removed. - * * @throws UnsupportedOperationException if this count * cannot be easily determined (eg Multi*Readers). * Instead, you should call {@link * #getSequentialSubReaders} and ask each sub reader for * its unique term count. 
*/ public long getUniqueTermCount() throws IOException { - throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()"); + long numTerms = 0; + final Fields fields = fields(); + if (fields == null) { + return 0; + } + FieldsEnum it = fields.iterator(); + while(true) { + String field = it.next(); + if (field == null) { + break; + } + numTerms += fields.terms(field).getUniqueTermCount(); + } + return numTerms; } /** For IndexReader implementations that use @@ -1198,4 +1404,29 @@ public abstract class IndexReader implements Cloneable,Closeable { public int getTermInfosIndexDivisor() { throw new UnsupportedOperationException("This reader does not support this method."); } + + + private Fields fields; + + /** lucene.internal */ + void storeFields(Fields fields) { + this.fields = fields; + } + + /** lucene.internal */ + Fields retrieveFields() { + return fields; + } + + private Bits storedDelDocs; + + /** lucene.internal */ + void storeDelDocs(Bits delDocs) { + this.storedDelDocs = delDocs; + } + + /** lucene.internal */ + Bits retrieveDelDocs() { + return storedDelDocs; + } } diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/src/java/org/apache/lucene/index/IndexWriter.java index d1dc2ccb53b..1292729ad42 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/src/java/org/apache/lucene/index/IndexWriter.java @@ -28,6 +28,7 @@ import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.util.Constants; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.Version; @@ -232,12 +233,13 @@ public class IndexWriter implements Closeable { public final static int DEFAULT_TERM_INDEX_INTERVAL = IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL; /** - * Absolute hard maximum length for a term. If a term - * arrives from the analyzer longer than this length, it - * is skipped and a message is printed to infoStream, if - * set (see {@link #setInfoStream}). + * Absolute hard maximum length for a term, in bytes once + * encoded as UTF8. If a term arrives from the analyzer + * longer than this length, it is skipped and a message is + * printed to infoStream, if set (see {@link + * #setInfoStream}). */ - public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH; + public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH_UTF8; // The normal read buffer size defaults to 1024, but // increasing this during merging seems to yield @@ -334,7 +336,7 @@ public class IndexWriter implements Closeable { * *

Note that this is functionally equivalent to calling * {@link #commit} and then using {@link IndexReader#open} to - * open a new reader. But the turarnound time of this + * open a new reader. But the turnaround time of this * method should be faster since it avoids the potentially * costly {@link #commit}.

    * @@ -420,7 +422,7 @@ public class IndexWriter implements Closeable { // just like we do when loading segments_N synchronized(this) { applyDeletes(); - final IndexReader r = new ReadOnlyDirectoryReader(this, segmentInfos, termInfosIndexDivisor); + final IndexReader r = new ReadOnlyDirectoryReader(this, segmentInfos, termInfosIndexDivisor, codecs); if (infoStream != null) { message("return reader version=" + r.getVersion() + " reader=" + r); } @@ -629,7 +631,7 @@ public class IndexWriter implements Closeable { // TODO: we may want to avoid doing this while // synchronized // Returns a ref, which we xfer to readerMap: - sr = SegmentReader.get(false, info.dir, info, readBufferSize, doOpenStores, termsIndexDivisor); + sr = SegmentReader.get(false, info.dir, info, readBufferSize, doOpenStores, termsIndexDivisor, codecs); if (info.dir == directory) { // Only pool if reader is not external @@ -639,7 +641,7 @@ public class IndexWriter implements Closeable { if (doOpenStores) { sr.openDocStores(); } - if (termsIndexDivisor != -1 && !sr.termsIndexLoaded()) { + if (termsIndexDivisor != -1) { // If this reader was originally opened because we // needed to merge it, we didn't load the terms // index. But now, if the caller wants the terms @@ -1038,6 +1040,8 @@ public class IndexWriter implements Closeable { .setOpenMode(OpenMode.APPEND).setMaxFieldLength(mfl.getLimit()) .setIndexDeletionPolicy(deletionPolicy).setIndexCommit(commit)); } + + CodecProvider codecs; /** * Constructs a new IndexWriter per the settings given in conf. @@ -1081,6 +1085,8 @@ public class IndexWriter implements Closeable { mergePolicy.setIndexWriter(this); mergeScheduler = conf.getMergeScheduler(); mergedSegmentWarmer = conf.getMergedSegmentWarmer(); + codecs = conf.getCodecProvider(); + poolReaders = conf.getReaderPooling(); OpenMode mode = conf.getOpenMode(); @@ -1111,7 +1117,7 @@ public class IndexWriter implements Closeable { // segments_N file with no segments: boolean doCommit; try { - segmentInfos.read(directory); + segmentInfos.read(directory, codecs); segmentInfos.clear(); doCommit = false; } catch (IOException e) { @@ -1129,7 +1135,7 @@ public class IndexWriter implements Closeable { changeCount++; } } else { - segmentInfos.read(directory); + segmentInfos.read(directory, codecs); IndexCommit commit = conf.getIndexCommit(); if (commit != null) { @@ -1141,7 +1147,7 @@ public class IndexWriter implements Closeable { if (commit.getDirectory() != directory) throw new IllegalArgumentException("IndexCommit's directory doesn't match my directory"); SegmentInfos oldInfos = new SegmentInfos(); - oldInfos.read(directory, commit.getSegmentsFileName()); + oldInfos.read(directory, commit.getSegmentsFileName(), codecs); segmentInfos.replace(oldInfos); changeCount++; if (infoStream != null) @@ -1159,7 +1165,7 @@ public class IndexWriter implements Closeable { // KeepOnlyLastCommitDeleter: deleter = new IndexFileDeleter(directory, conf.getIndexDeletionPolicy(), - segmentInfos, infoStream, docWriter); + segmentInfos, infoStream, docWriter, this.codecs); if (deleter.startingCommitDeleted) // Deletion policy deleted the "head" commit point. 
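As a rough usage sketch (not taken from this patch): the per-field flex accessors that the IndexReader hunks above add — fields(), terms(String), termDocsEnum(Bits, String, BytesRef) and getDeletedDocs() — could be driven roughly as below on an atomic (non-composite) reader. The reader variable, the "body" field and the "lucene" term are illustrative placeholders, and the sketch assumes DocsEnum inherits NO_MORE_DOCS from DocIdSetIterator, as the Multi*Enum code later in this patch suggests.

```java
import java.io.IOException;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

// Hedged sketch of the new flex postings API: iterate the documents
// containing an illustrative term, printing the per-document frequency.
public class FlexPostingsSketch {
  public static void printPostings(IndexReader reader) throws IOException {
    // Deleted docs are now passed explicitly; per the javadoc above this
    // may be null when the reader has no deletions.
    final Bits delDocs = reader.getDeletedDocs();
    final DocsEnum docs = reader.termDocsEnum(delDocs, "body", new BytesRef("lucene"));
    if (docs == null) {
      return; // field or term does not exist
    }
    int doc;
    while ((doc = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
      System.out.println("doc=" + doc + " freq=" + docs.freq());
    }
  }
}
```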
@@ -1174,6 +1180,7 @@ public class IndexWriter implements Closeable { pushMaxBufferedDocs(); if (infoStream != null) { + message("init: create=" + create); messageState(); } @@ -3135,7 +3142,7 @@ public class IndexWriter implements Closeable { } SegmentInfos sis = new SegmentInfos(); // read infos from dir - sis.read(dirs[i]); + sis.read(dirs[i], codecs); for (int j = 0; j < sis.size(); j++) { SegmentInfo info = sis.info(j); assert !segmentInfos.contains(info): "dup info dir=" + info.dir + " name=" + info.name; @@ -3321,7 +3328,7 @@ public class IndexWriter implements Closeable { try { mergedName = newSegmentName(); - merger = new SegmentMerger(this, mergedName, null); + merger = new SegmentMerger(directory, termIndexInterval, mergedName, null, codecs); SegmentReader sReader = null; synchronized(this) { @@ -3344,7 +3351,7 @@ public class IndexWriter implements Closeable { synchronized(this) { segmentInfos.clear(); // pop old infos & add new info = new SegmentInfo(mergedName, docCount, directory, false, true, - -1, null, false, merger.hasProx()); + -1, null, false, merger.hasProx(), merger.getCodec()); setDiagnostics(info, "addIndexes(IndexReader...)"); segmentInfos.add(info); } @@ -3391,7 +3398,7 @@ public class IndexWriter implements Closeable { startTransaction(false); try { - merger.createCompoundFile(mergedName + ".cfs"); + merger.createCompoundFile(mergedName + ".cfs", info); synchronized(this) { info.setUseCompoundFile(true); } @@ -3742,7 +3749,9 @@ public class IndexWriter implements Closeable { directory, false, true, docStoreOffset, docStoreSegment, docStoreIsCompoundFile, - docWriter.hasProx()); + docWriter.hasProx(), + docWriter.getCodec()); + setDiagnostics(newSegment, "flush"); } @@ -3956,7 +3965,7 @@ public class IndexWriter implements Closeable { } } } - + merge.info.setHasProx(merger.hasProx()); segmentInfos.subList(start, start + merge.segments.size()).clear(); @@ -4032,7 +4041,7 @@ public class IndexWriter implements Closeable { mergeInit(merge); if (infoStream != null) - message("now merge\n merge=" + merge.segString(directory) + "\n merge=" + merge + "\n index=" + segString()); + message("now merge\n merge=" + merge.segString(directory) + "\n index=" + segString()); mergeMiddle(merge); mergeSuccess(merge); @@ -4258,7 +4267,8 @@ public class IndexWriter implements Closeable { docStoreOffset, docStoreSegment, docStoreIsCompoundFile, - false); + false, + null); Map details = new HashMap(); @@ -4338,7 +4348,7 @@ public class IndexWriter implements Closeable { if (infoStream != null) message("merging " + merge.segString(directory)); - merger = new SegmentMerger(this, mergedName, merge); + merger = new SegmentMerger(directory, termIndexInterval, mergedName, merge, codecs); merge.readers = new SegmentReader[numSegments]; merge.readersClone = new SegmentReader[numSegments]; @@ -4411,8 +4421,17 @@ public class IndexWriter implements Closeable { // This is where all the work happens: mergedDocCount = merge.info.docCount = merger.merge(merge.mergeDocStores); + // Record which codec was used to write the segment + merge.info.setCodec(merger.getCodec()); + assert mergedDocCount == totDocCount; + // Very important to do this before opening the reader + // because codec must know if prox was written for + // this segment: + //System.out.println("merger set hasProx=" + merger.hasProx() + " seg=" + merge.info.name); + merge.info.setHasProx(merger.hasProx()); + // TODO: in the non-realtime case, we may want to only // keep deletes (it's costly to open entire reader // when we just need 
deletes) @@ -4450,8 +4469,9 @@ public class IndexWriter implements Closeable { merge.readersClone[i].close(); } catch (Throwable t) { } - // This was a private clone and we had the only reference - assert merge.readersClone[i].getRefCount() == 0; + // This was a private clone and we had the + // only reference + assert merge.readersClone[i].getRefCount() == 0: "refCount should be 0 but is " + merge.readersClone[i].getRefCount(); } } } else { @@ -4484,7 +4504,7 @@ public class IndexWriter implements Closeable { final String compoundFileName = IndexFileNames.segmentFileName(mergedName, IndexFileNames.COMPOUND_FILE_EXTENSION); try { - merger.createCompoundFile(compoundFileName); + merger.createCompoundFile(compoundFileName, merge.info); success = true; } catch (IOException ioe) { synchronized(this) { diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java index 4e5e19ad472..80e0ab7b190 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -20,6 +20,7 @@ package org.apache.lucene.index; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.DocumentsWriter.IndexingChain; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.Similarity; import org.apache.lucene.util.Version; @@ -78,6 +79,9 @@ public final class IndexWriterConfig implements Cloneable { */ public static long WRITE_LOCK_TIMEOUT = 1000; + /** Default {@link CodecProvider}. */ + public final static CodecProvider DEFAULT_CODEC_PROVIDER = CodecProvider.getDefault(); + /** The maximum number of simultaneous threads that may be * indexing documents at once in IndexWriter; if more * than this many threads arrive they will wait for @@ -119,6 +123,7 @@ public final class IndexWriterConfig implements Cloneable { private int maxBufferedDocs; private IndexingChain indexingChain; private IndexReaderWarmer mergedSegmentWarmer; + private CodecProvider codecProvider; private MergePolicy mergePolicy; private int maxThreadStates; private boolean readerPooling; @@ -149,6 +154,7 @@ public final class IndexWriterConfig implements Cloneable { maxBufferedDocs = DEFAULT_MAX_BUFFERED_DOCS; indexingChain = DocumentsWriter.defaultIndexingChain; mergedSegmentWarmer = null; + codecProvider = DEFAULT_CODEC_PROVIDER; mergePolicy = new LogByteSizeMergePolicy(); maxThreadStates = DEFAULT_MAX_THREAD_STATES; readerPooling = DEFAULT_READER_POOLING; @@ -509,6 +515,18 @@ public final class IndexWriterConfig implements Cloneable { this.mergePolicy = mergePolicy == null ? new LogByteSizeMergePolicy() : mergePolicy; return this; } + + /** Set the CodecProvider. See {@link CodecProvider}. */ + public IndexWriterConfig setCodecProvider(CodecProvider codecProvider) { + this.codecProvider = codecProvider; + return this; + } + + /** Returns the current CodecProvider. See {@link CodecProvider}. */ + public CodecProvider getCodecProvider() { + return codecProvider; + } + /** * Returns the current MergePolicy in use by this writer.
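As a rough usage sketch (not taken from this patch): the setCodecProvider hook added to IndexWriterConfig just above could be wired up roughly as follows. The directory path, the StandardAnalyzer, and the Version constant are illustrative placeholders, and CodecProvider.getDefault() stands in for whatever custom provider an application would register.

```java
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

// Hedged sketch: pass a CodecProvider to IndexWriter through IndexWriterConfig.
public class CodecProviderSketch {
  public static IndexWriter openWriter(File path) throws IOException {
    final Directory dir = FSDirectory.open(path);
    final IndexWriterConfig conf =
        new IndexWriterConfig(Version.LUCENE_CURRENT,
                              new StandardAnalyzer(Version.LUCENE_CURRENT))
            // The default provider stands in for a custom Codec registry here.
            .setCodecProvider(CodecProvider.getDefault());
    return new IndexWriter(dir, conf);
  }
}
```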
@@ -584,6 +602,7 @@ public final class IndexWriterConfig implements Cloneable { sb.append("ramBufferSizeMB=").append(ramBufferSizeMB).append("\n"); sb.append("maxBufferedDocs=").append(maxBufferedDocs).append("\n"); sb.append("mergedSegmentWarmer=").append(mergedSegmentWarmer).append("\n"); + sb.append("codecProvider=").append(codecProvider).append("\n"); sb.append("mergePolicy=").append(mergePolicy).append("\n"); sb.append("maxThreadStates=").append(maxThreadStates).append("\n"); sb.append("readerPooling=").append(readerPooling).append("\n"); diff --git a/lucene/src/java/org/apache/lucene/index/LegacyFields.java b/lucene/src/java/org/apache/lucene/index/LegacyFields.java new file mode 100644 index 00000000000..9523a1e1004 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/LegacyFields.java @@ -0,0 +1,41 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** Implements flex API (FieldsEnum/TermsEnum) on top of + * non-flex API. Used only for IndexReader impls outside + * Lucene's core. */ +class LegacyFields extends Fields { + private final IndexReader r; + + public LegacyFields(IndexReader r) throws IOException { + this.r = r; + } + + @Override + public FieldsEnum iterator() throws IOException { + return new LegacyFieldsEnum(r); + } + + @Override + public Terms terms(String field) throws IOException { + return new LegacyTerms(r, field); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/LegacyFieldsEnum.java b/lucene/src/java/org/apache/lucene/index/LegacyFieldsEnum.java new file mode 100644 index 00000000000..d233a2c2138 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/LegacyFieldsEnum.java @@ -0,0 +1,337 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Comparator; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** Implements flex API (FieldsEnum/TermsEnum) on top of + * pre-flex API. Used only for IndexReader impls outside + * Lucene's core. + * + * @deprecated Migrate the external reader to the flex API */ +@Deprecated +class LegacyFieldsEnum extends FieldsEnum { + private final IndexReader r; + private TermEnum terms; + private String field; + private boolean init; + + public LegacyFieldsEnum(IndexReader r) throws IOException { + this.r = r; + terms = r.terms(); + init = true; + } + + @Override + public String next() throws IOException { + + if (field != null) { + terms.close(); + // jump to end of the current field: + terms = r.terms(new Term(field, "\uFFFF")); + assert terms.term() == null || !terms.term().field.equals(field); + } + if (init) { + init = false; + if (!terms.next()) { + return null; + } + } + if (terms.term() != null) { + String newField = terms.term().field; + assert field == null || !newField.equals(field); + field = newField; + return field; + } else { + return null; + } + } + + @Override + public TermsEnum terms() throws IOException { + return new LegacyTermsEnum(r, field); + } + + static class LegacyTermsEnum extends TermsEnum { + private final IndexReader r; + private final String field; + private TermEnum terms; + private BytesRef current; + private final BytesRef tr = new BytesRef(); + + LegacyTermsEnum(IndexReader r, String field) throws IOException { + this.r = r; + this.field = field; + } + + @Override + public Comparator getComparator() { + // Pre-flex indexes always sorted in UTF16 order + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + + @Override + public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { + if (terms != null) { + terms.close(); + } + terms = r.terms(new Term(field, text.utf8ToString())); + + final Term t = terms.term(); + if (t == null) { + current = null; + return SeekStatus.END; + } else if (t.field() == field) { + tr.copy(t.text()); + current = tr; + if (text.bytesEquals(tr)) { + return SeekStatus.FOUND; + } else { + return SeekStatus.NOT_FOUND; + } + } else { + return SeekStatus.END; + } + } + + @Override + public SeekStatus seek(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef next() throws IOException { + if (terms == null) { + // first next -- seek to start of field + terms = r.terms(new Term(field, "")); + final Term t = terms.term(); + if (t == null || t.field != field) { + return null; + } else { + tr.copy(terms.term().text()); + return current = tr; + } + } else if (terms.next()) { + if (terms.term().field == field) { + tr.copy(terms.term().text()); + return current = tr; + } else { + return null; + } + } else { + return null; + } + } + + @Override + public BytesRef term() { + return current; + } + + @Override + public int docFreq() { + return terms.docFreq(); + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + if (reuse != null) { + return ((LegacyDocsEnum) reuse).reset(terms.term(), skipDocs); + } else { + return (new LegacyDocsEnum(r, field)).reset(terms.term(), skipDocs); + } + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (reuse != null) { + return 
((LegacyDocsAndPositionsEnum) reuse).reset(terms.term(), skipDocs); + } else { + return (new LegacyDocsAndPositionsEnum(r, field)).reset(terms.term(), skipDocs); + } + } + + public void close() throws IOException { + terms.close(); + } + } + + // Emulates flex on top of legacy API + private static class LegacyDocsEnum extends DocsEnum { + private final IndexReader r; + private final String field; + private final TermDocs td; + + private Term term; + + private int doc = -1; + + LegacyDocsEnum(IndexReader r, String field) throws IOException { + this.r = r; + this.field = field; + td = r.termDocs(); + } + + public DocsEnum reset(Term term, Bits skipDocs) throws IOException { + this.term = term; + td.seek(term); + + if (skipDocs != MultiFields.getDeletedDocs(r)) { + // An external reader's TermDocs/Positions will + // silently skip deleted docs, so, we can't allow + // arbitrary skipDocs here: + throw new IllegalStateException("external IndexReader requires skipDocs == MultiFields.getDeletedDocs()"); + } + + return this; + } + + @Override + public int nextDoc() throws IOException { + if (td.next()) { + return doc = td.doc(); + } else { + return doc = NO_MORE_DOCS; + } + } + + @Override + public int advance(int target) throws IOException { + if (td.skipTo(target)) { + return doc = td.doc(); + } else { + return doc = NO_MORE_DOCS; + } + } + + @Override + public int freq() { + return td.freq(); + } + + @Override + public int docID() { + return doc; + } + } + + // Emulates flex on top of legacy API + private static class LegacyDocsAndPositionsEnum extends DocsAndPositionsEnum { + private final IndexReader r; + private final String field; + private final TermPositions tp; + + private Term term; + + private int doc = -1; + + LegacyDocsAndPositionsEnum(IndexReader r, String field) throws IOException { + this.r = r; + this.field = field; + tp = r.termPositions(); + } + + public DocsAndPositionsEnum reset(Term term, Bits skipDocs) throws IOException { + this.term = term; + tp.seek(term); + + if (skipDocs != MultiFields.getDeletedDocs(r)) { + // An external reader's TermDocs/Positions will + // silently skip deleted docs, so, we can't allow + // arbitrary skipDocs here: + throw new IllegalStateException("external IndexReader requires skipDocs == MultiFields.getDeletedDocs() skipDocs=" + skipDocs + " MultiFields.getDeletedDocs=" + MultiFields.getDeletedDocs(r) + " r=" + r); + } + + return this; + } + + @Override + public int nextDoc() throws IOException { + if (tp.next()) { + return doc = tp.doc(); + } else { + return doc = NO_MORE_DOCS; + } + } + + @Override + public int advance(int target) throws IOException { + if (tp.skipTo(target)) { + return doc = tp.doc(); + } else { + return doc = NO_MORE_DOCS; + } + } + + @Override + public int freq() { + return tp.freq(); + } + + @Override + public int docID() { + return doc; + } + + // NOTE: we don't override bulk-read (docs & freqs) API + // -- leave it to base class, because TermPositions + // can't do bulk read + + @Override + public int nextPosition() throws IOException { + return tp.nextPosition(); + } + + @Override + public int getPayloadLength() { + return tp.getPayloadLength(); + } + + private BytesRef payload; + + @Override + public BytesRef getPayload() throws IOException { + final int len = tp.getPayloadLength(); + if (payload == null) { + payload = new BytesRef(); + payload.bytes = new byte[len]; + } else { + if (payload.bytes.length < len) { + payload.grow(len); + } + } + + payload.bytes = tp.getPayload(payload.bytes, 0); + payload.length = len; + 
return payload; + } + + @Override + public boolean hasPayload() { + return tp.isPayloadAvailable(); + } + } +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMergeInfo.java b/lucene/src/java/org/apache/lucene/index/LegacySegmentMergeInfo.java similarity index 94% rename from lucene/src/java/org/apache/lucene/index/SegmentMergeInfo.java rename to lucene/src/java/org/apache/lucene/index/LegacySegmentMergeInfo.java index 0c9280ead07..de97259b81e 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentMergeInfo.java +++ b/lucene/src/java/org/apache/lucene/index/LegacySegmentMergeInfo.java @@ -19,7 +19,8 @@ package org.apache.lucene.index; import java.io.IOException; -final class SegmentMergeInfo { +// @deprecated This is pre-flex API +final class LegacySegmentMergeInfo { Term term; int base; int ord; // the position of the segment in a MultiReader @@ -29,7 +30,7 @@ final class SegmentMergeInfo { private TermPositions postings; // use getPositions() private int[] docMap; // use getDocMap() - SegmentMergeInfo(int b, TermEnum te, IndexReader r) + LegacySegmentMergeInfo(int b, TermEnum te, IndexReader r) throws IOException { base = b; reader = r; diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMergeQueue.java b/lucene/src/java/org/apache/lucene/index/LegacySegmentMergeQueue.java similarity index 76% rename from lucene/src/java/org/apache/lucene/index/SegmentMergeQueue.java rename to lucene/src/java/org/apache/lucene/index/LegacySegmentMergeQueue.java index 877203c1a8d..8cecb6c85df 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentMergeQueue.java +++ b/lucene/src/java/org/apache/lucene/index/LegacySegmentMergeQueue.java @@ -20,23 +20,23 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.util.PriorityQueue; -final class SegmentMergeQueue extends PriorityQueue { - SegmentMergeQueue(int size) { +final class LegacySegmentMergeQueue extends PriorityQueue { + LegacySegmentMergeQueue(int size) { initialize(size); } @Override - protected final boolean lessThan(SegmentMergeInfo stiA, SegmentMergeInfo stiB) { - int comparison = stiA.term.compareTo(stiB.term); + protected final boolean lessThan(LegacySegmentMergeInfo a, LegacySegmentMergeInfo b) { + int comparison = a.term.compareTo(b.term); if (comparison == 0) - return stiA.base < stiB.base; + return a.base < b.base; else return comparison < 0; } final void close() throws IOException { while (top() != null) - pop().close(); + ((LegacySegmentMergeInfo)pop()).close(); } } diff --git a/lucene/src/java/org/apache/lucene/index/LegacyTerms.java b/lucene/src/java/org/apache/lucene/index/LegacyTerms.java new file mode 100644 index 00000000000..ff966998d82 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/LegacyTerms.java @@ -0,0 +1,52 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.BytesRef; + +/** Implements flex API (FieldsEnum/TermsEnum) on top of + * pre-flex API. Used only for IndexReader impls outside + * Lucene's core. */ +class LegacyTerms extends Terms { + + private final IndexReader r; + private final String field; + + LegacyTerms(IndexReader r, String field) { + this.r = r; + this.field = StringHelper.intern(field); + } + + @Override + public TermsEnum iterator() throws IOException { + return new LegacyFieldsEnum.LegacyTermsEnum(r, field); + } + + @Override + public Comparator getComparator() { + // Pre-flex indexes always sorted in UTF16 order + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } +} + + + diff --git a/lucene/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java new file mode 100644 index 00000000000..ba19046350d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java @@ -0,0 +1,135 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** + * Exposes flex API, merged from flex API of sub-segments. 
+ * + * @lucene.experimental + */ + +public final class MultiDocsAndPositionsEnum extends DocsAndPositionsEnum { + private EnumWithSlice[] subs; + int numSubs; + int upto; + DocsAndPositionsEnum current; + int currentBase; + Bits skipDocs; + int doc = -1; + + MultiDocsAndPositionsEnum reset(final EnumWithSlice[] subs, final int numSubs) throws IOException { + this.numSubs = numSubs; + this.subs = subs; + upto = -1; + current = null; + return this; + } + + public int getNumSubs() { + return numSubs; + } + + public EnumWithSlice[] getSubs() { + return subs; + } + + @Override + public int freq() { + return current.freq(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) throws IOException { + while(true) { + if (current != null) { + final int doc = current.advance(target-currentBase); + if (doc == NO_MORE_DOCS) { + current = null; + } else { + return this.doc = doc + currentBase; + } + } else if (upto == numSubs-1) { + return this.doc = NO_MORE_DOCS; + } else { + upto++; + current = subs[upto].docsAndPositionsEnum; + currentBase = subs[upto].slice.start; + } + } + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (current == null) { + if (upto == numSubs-1) { + return this.doc = NO_MORE_DOCS; + } else { + upto++; + current = subs[upto].docsAndPositionsEnum; + currentBase = subs[upto].slice.start; + } + } + + final int doc = current.nextDoc(); + if (doc != NO_MORE_DOCS) { + return this.doc = currentBase + doc; + } else { + current = null; + } + } + } + + @Override + public int nextPosition() throws IOException { + return current.nextPosition(); + } + + @Override + public int getPayloadLength() { + return current.getPayloadLength(); + } + + @Override + public boolean hasPayload() { + return current.hasPayload(); + } + + @Override + public BytesRef getPayload() throws IOException { + return current.getPayload(); + } + + // TODO: implement bulk read more efficiently than super + public final static class EnumWithSlice { + public DocsAndPositionsEnum docsAndPositionsEnum; + public ReaderUtil.Slice slice; + } +} + diff --git a/lucene/src/java/org/apache/lucene/index/MultiDocsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiDocsEnum.java new file mode 100644 index 00000000000..804db92dbe1 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/MultiDocsEnum.java @@ -0,0 +1,113 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ReaderUtil; +import java.io.IOException; + +/** + * Exposes flex API, merged from flex API of sub-segments. 
+ * + * @lucene.experimental + */ + +public final class MultiDocsEnum extends DocsEnum { + private EnumWithSlice[] subs; + int numSubs; + int upto; + DocsEnum current; + int currentBase; + Bits skipDocs; + int doc = -1; + + MultiDocsEnum reset(final EnumWithSlice[] subs, final int numSubs) throws IOException { + this.numSubs = numSubs; + this.subs = subs; + upto = -1; + current = null; + return this; + } + + public int getNumSubs() { + return numSubs; + } + + public EnumWithSlice[] getSubs() { + return subs; + } + + @Override + public int freq() { + return current.freq(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) throws IOException { + while(true) { + if (current != null) { + final int doc = current.advance(target-currentBase); + if (doc == NO_MORE_DOCS) { + current = null; + } else { + return this.doc = doc + currentBase; + } + } else if (upto == numSubs-1) { + return this.doc = NO_MORE_DOCS; + } else { + upto++; + current = subs[upto].docsEnum; + currentBase = subs[upto].slice.start; + } + } + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (current == null) { + if (upto == numSubs-1) { + return this.doc = NO_MORE_DOCS; + } else { + upto++; + current = subs[upto].docsEnum; + currentBase = subs[upto].slice.start; + } + } + + final int doc = current.nextDoc(); + if (doc != NO_MORE_DOCS) { + return this.doc = currentBase + doc; + } else { + current = null; + } + } + } + + // TODO: implement bulk read more efficiently than super + public final static class EnumWithSlice { + public DocsEnum docsEnum; + public ReaderUtil.Slice slice; + } +} + diff --git a/lucene/src/java/org/apache/lucene/index/MultiFields.java b/lucene/src/java/org/apache/lucene/index/MultiFields.java new file mode 100644 index 00000000000..d0f1c6ac090 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/MultiFields.java @@ -0,0 +1,229 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Map; +import java.util.HashMap; +import java.util.List; +import java.util.ArrayList; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.ReaderUtil.Gather; // for javadocs +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.MultiBits; + +/** + * Exposes flex API, merged from flex API of sub-segments. + * This is useful when you're interacting with an {@link + * IndexReader} implementation that consists of sequential + * sub-readers (eg {@link DirectoryReader} or {@link + * MultiReader}). + * + *

    NOTE: for multi readers, you'll get better + * performance by gathering the sub readers using {@link + * ReaderUtil#gatherSubReaders} and then operate per-reader, + * instead of using this class. + * + * @lucene.experimental + */ + +public final class MultiFields extends Fields { + private final Fields[] subs; + private final ReaderUtil.Slice[] subSlices; + private final Map terms = new HashMap(); + + /** Returns a single {@link Fields} instance for this + * reader, merging fields/terms/docs/positions on the + * fly. This method will not return null. + * + *

    : this is a slow way to access postings. + * It's better to get the sub-readers (using {@link + * Gather}) and iterate through them + * yourself. */ + public static Fields getFields(IndexReader r) throws IOException { + final IndexReader[] subs = r.getSequentialSubReaders(); + if (subs == null) { + // already an atomic reader + return r.fields(); + } else if (subs.length == 0) { + // no fields + return null; + } else if (subs.length == 1) { + return getFields(subs[0]); + } else { + + Fields currentFields = r.retrieveFields(); + if (currentFields == null) { + + final List fields = new ArrayList(); + final List slices = new ArrayList(); + + new ReaderUtil.Gather(r) { + @Override + protected void add(int base, IndexReader r) throws IOException { + fields.add(r.fields()); + slices.add(new ReaderUtil.Slice(base, r.maxDoc(), fields.size()-1)); + } + }.run(); + + if (fields.size() == 0) { + return null; + } else if (fields.size() == 1) { + currentFields = fields.get(0); + } else { + currentFields = new MultiFields(fields.toArray(Fields.EMPTY_ARRAY), + slices.toArray(ReaderUtil.Slice.EMPTY_ARRAY)); + } + r.storeFields(currentFields); + } + return currentFields; + } + } + + public static Bits getDeletedDocs(IndexReader r) throws IOException { + Bits result; + if (r.hasDeletions()) { + + result = r.retrieveDelDocs(); + if (result == null) { + + final List bits = new ArrayList(); + final List starts = new ArrayList(); + + final int maxDoc = new ReaderUtil.Gather(r) { + @Override + protected void add(int base, IndexReader r) throws IOException { + // record all delDocs, even if they are null + bits.add(r.getDeletedDocs()); + starts.add(base); + } + }.run(); + starts.add(maxDoc); + + assert bits.size() > 0; + if (bits.size() == 1) { + // Only one actual sub reader -- optimize this case + result = bits.get(0); + } else { + result = new MultiBits(bits, starts); + } + r.storeDelDocs(result); + } + } else { + result = null; + } + + return result; + } + + /** This method may return null if the field does not exist.*/ + public static Terms getTerms(IndexReader r, String field) throws IOException { + final Fields fields = getFields(r); + if (fields == null) { + return null; + } else { + return fields.terms(field); + } + } + + /** Returns {@link DocsEnum} for the specified field & + * term. This may return null if the term does not + * exist. */ + public static DocsEnum getTermDocsEnum(IndexReader r, Bits skipDocs, String field, BytesRef term) throws IOException { + assert field != null; + assert term != null; + final Terms terms = getTerms(r, field); + if (terms != null) { + return terms.docs(skipDocs, term, null); + } else { + return null; + } + } + + /** Returns {@link DocsAndPositionsEnum} for the specified + * field & term. This may return null if the term does + * not exist or positions were not indexed. 
*/ + public static DocsAndPositionsEnum getTermPositionsEnum(IndexReader r, Bits skipDocs, String field, BytesRef term) throws IOException { + assert field != null; + assert term != null; + final Terms terms = getTerms(r, field); + if (terms != null) { + return terms.docsAndPositions(skipDocs, term, null); + } else { + return null; + } + } + + public MultiFields(Fields[] subs, ReaderUtil.Slice[] subSlices) { + this.subs = subs; + this.subSlices = subSlices; + } + + @Override + public FieldsEnum iterator() throws IOException { + + final List fieldsEnums = new ArrayList(); + final List fieldsSlices = new ArrayList(); + for(int i=0;i subs2 = new ArrayList(); + final List slices2 = new ArrayList(); + + // Gather all sub-readers that share this field + for(int i=0;i 0) { + while(true) { + top[numTop++] = queue.pop(); + if (queue.size() == 0 || (queue.top()).current != top[0].current) { + break; + } + } + currentField = top[0].current; + } else { + currentField = null; + } + + return currentField; + } + + @Override + public TermsEnum terms() throws IOException { + final List termsEnums = new ArrayList(); + for(int i=0;i= 0: "length=" + slice.length; + this.fields = fields; + } + } + + private final static class FieldMergeQueue extends PriorityQueue { + FieldMergeQueue(int size) { + initialize(size); + } + + @Override + protected final boolean lessThan(FieldsEnumWithSlice fieldsA, FieldsEnumWithSlice fieldsB) { + // No need to break ties by field name: TermsEnum handles that + return fieldsA.current.compareTo(fieldsB.current) < 0; + } + } +} + diff --git a/lucene/src/java/org/apache/lucene/index/MultiReader.java b/lucene/src/java/org/apache/lucene/index/MultiReader.java index f29a7b3f864..3c830b73d24 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiReader.java +++ b/lucene/src/java/org/apache/lucene/index/MultiReader.java @@ -25,17 +25,21 @@ import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; -import org.apache.lucene.index.DirectoryReader.MultiTermDocs; -import org.apache.lucene.index.DirectoryReader.MultiTermEnum; -import org.apache.lucene.index.DirectoryReader.MultiTermPositions; +import org.apache.lucene.index.DirectoryReader.MultiTermDocs; // deprecated +import org.apache.lucene.index.DirectoryReader.MultiTermEnum; // deprecated +import org.apache.lucene.index.DirectoryReader.MultiTermPositions; // deprecated import org.apache.lucene.search.Similarity; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; /** An IndexReader which reads multiple indexes, appending - * their content. */ + * their content. */ public class MultiReader extends IndexReader implements Cloneable { protected IndexReader[] subReaders; private int[] starts; // 1st docno for each segment + private final Map subReaderToSlice = new HashMap(); private boolean[] decrefOnClose; // remember which subreaders to decRef on close private Map normsCache = new HashMap(); private int maxDoc = 0; @@ -49,7 +53,7 @@ public class MultiReader extends IndexReader implements Cloneable { *

Note that all subreaders are closed if this MultiReader is closed.

    * @param subReaders set of (sub)readers */ - public MultiReader(IndexReader... subReaders) { + public MultiReader(IndexReader... subReaders) throws IOException { initialize(subReaders, true); } @@ -61,14 +65,15 @@ public class MultiReader extends IndexReader implements Cloneable { * when this MultiReader is closed * @param subReaders set of (sub)readers */ - public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) { + public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) throws IOException { initialize(subReaders, closeSubReaders); } - private void initialize(IndexReader[] subReaders, boolean closeSubReaders) { + private void initialize(IndexReader[] subReaders, boolean closeSubReaders) throws IOException { this.subReaders = subReaders.clone(); starts = new int[subReaders.length + 1]; // build starts array decrefOnClose = new boolean[subReaders.length]; + for (int i = 0; i < subReaders.length; i++) { starts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); // compute maxDocs @@ -80,12 +85,34 @@ public class MultiReader extends IndexReader implements Cloneable { decrefOnClose[i] = false; } - if (subReaders[i].hasDeletions()) + if (subReaders[i].hasDeletions()) { hasDeletions = true; + } + + final ReaderUtil.Slice slice = new ReaderUtil.Slice(starts[i], + subReaders[i].maxDoc(), + i); + subReaderToSlice.put(subReaders[i], slice); } + starts[subReaders.length] = maxDoc; } - + + @Override + public long getUniqueTermCount() throws IOException { + throw new UnsupportedOperationException(""); + } + + @Override + public int getSubReaderDocBase(IndexReader subReader) { + return subReaderToSlice.get(subReader).start; + } + + @Override + public Fields fields() throws IOException { + throw new UnsupportedOperationException("please use MultiFields.getFields if you really need a top level Fields (NOTE that it's usually better to work per segment instead)"); + } + /** * Tries to reopen the subreaders. *
    @@ -128,6 +155,11 @@ public class MultiReader extends IndexReader implements Cloneable { } } + @Override + public Bits getDeletedDocs() throws IOException { + throw new UnsupportedOperationException("please use MultiFields.getDeletedDocs if you really need a top level Bits deletedDocs (NOTE that it's usually better to work per segment instead)"); + } + /** * If clone is true then we clone each of the subreaders * @param doClone @@ -367,7 +399,17 @@ public class MultiReader extends IndexReader implements Cloneable { total += subReaders[i].docFreq(t); return total; } - + + @Override + public int docFreq(String field, BytesRef t) throws IOException { + ensureOpen(); + int total = 0; // sum freqs in segments + for (int i = 0; i < subReaders.length; i++) { + total += subReaders[i].docFreq(field, t); + } + return total; + } + @Override public TermDocs termDocs() throws IOException { ensureOpen(); diff --git a/lucene/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/src/java/org/apache/lucene/index/MultiTerms.java new file mode 100644 index 00000000000..4e265c056e6 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/MultiTerms.java @@ -0,0 +1,84 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; +import java.util.Comparator; + +/** + * Exposes flex API, merged from flex API of + * sub-segments. 
+ * + * @lucene.experimental + */ + +public final class MultiTerms extends Terms { + private final Terms[] subs; + private final ReaderUtil.Slice[] subSlices; + private final Comparator termComp; + + public MultiTerms(Terms[] subs, ReaderUtil.Slice[] subSlices) throws IOException { + this.subs = subs; + this.subSlices = subSlices; + + Comparator _termComp = null; + for(int i=0;i subTermComp = subs[i].getComparator(); + if (subTermComp != null && !subTermComp.equals(_termComp)) { + throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge"); + } + } + } + + termComp = _termComp; + } + + @Override + public TermsEnum iterator() throws IOException { + + final List termsEnums = new ArrayList(); + for(int i=0;i 0) { + return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); + } else { + return TermsEnum.EMPTY; + } + } + + @Override + public Comparator getComparator() { + return termComp; + } +} + diff --git a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java new file mode 100644 index 00000000000..59e64914f0a --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -0,0 +1,397 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BitsSlice; +import org.apache.lucene.util.MultiBits; +import org.apache.lucene.util.ReaderUtil; + +import java.io.IOException; +import java.util.Comparator; + +/** + * Exposes flex API, merged from flex API of sub-segments. + * This does a merge sort, by term text, of the sub-readers. 
+ * + * @lucene.experimental + */ +public final class MultiTermsEnum extends TermsEnum { + + private final TermMergeQueue queue; + private final TermsEnumWithSlice[] subs; // all of our subs (one per sub-reader) + private final TermsEnumWithSlice[] currentSubs; // current subs that have at least one term for this field + private final TermsEnumWithSlice[] top; + private final MultiDocsEnum.EnumWithSlice[] subDocs; + private final MultiDocsAndPositionsEnum.EnumWithSlice[] subDocsAndPositions; + + private int numTop; + private int numSubs; + private BytesRef current; + private Comparator termComp; + + public static class TermsEnumIndex { + public final static TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0]; + final int subIndex; + final TermsEnum termsEnum; + + public TermsEnumIndex(TermsEnum termsEnum, int subIndex) { + this.termsEnum = termsEnum; + this.subIndex = subIndex; + } + } + + public int getMatchCount() { + return numTop; + } + + public TermsEnumWithSlice[] getMatchArray() { + return top; + } + + public MultiTermsEnum(ReaderUtil.Slice[] slices) { + queue = new TermMergeQueue(slices.length); + top = new TermsEnumWithSlice[slices.length]; + subs = new TermsEnumWithSlice[slices.length]; + subDocs = new MultiDocsEnum.EnumWithSlice[slices.length]; + subDocsAndPositions = new MultiDocsAndPositionsEnum.EnumWithSlice[slices.length]; + for(int i=0;i getComparator() { + return termComp; + } + + /** The terms array must be newly created TermsEnum, ie + * {@link TermsEnum#next} has not yet been called. */ + public TermsEnum reset(TermsEnumIndex[] termsEnumsIndex) throws IOException { + assert termsEnumsIndex.length <= top.length; + numSubs = 0; + numTop = 0; + termComp = null; + queue.clear(); + for(int i=0;i subTermComp = termsEnumIndex.termsEnum.getComparator(); + if (subTermComp != null && !subTermComp.equals(termComp)) { + throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge"); + } + } + + final BytesRef term = termsEnumIndex.termsEnum.next(); + if (term != null) { + final TermsEnumWithSlice entry = subs[termsEnumIndex.subIndex]; + entry.reset(termsEnumIndex.termsEnum, term); + queue.add(entry); + currentSubs[numSubs++] = entry; + } else { + // field has no terms + } + } + + if (queue.size() == 0) { + return TermsEnum.EMPTY; + } else { + return this; + } + } + + @Override + public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + queue.clear(); + numTop = 0; + for(int i=0;i 0) { + // at least one sub had exact match to the requested term + current = term; + return SeekStatus.FOUND; + } else if (queue.size() > 0) { + // no sub had exact match, but at least one sub found + // a term after the requested term -- advance to that + // next term: + pullTop(); + return SeekStatus.NOT_FOUND; + } else { + return SeekStatus.END; + } + } + + @Override + public SeekStatus seek(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + private final void pullTop() { + // extract all subs from the queue that have the same + // top term + assert numTop == 0; + while(true) { + top[numTop++] = queue.pop(); + if (queue.size() == 0 || !(queue.top()).current.bytesEquals(top[0].current)) { + break; + } + } + current = top[0].current; + } + + private final void pushTop() throws IOException { + // call next() on each top, and put back into queue + for(int i=0;i 0) { + pullTop(); + } else { + current = null; + } + + 
return current; + } + + @Override + public int docFreq() { + int sum = 0; + for(int i=0;i= 0: "length=" + subSlice.length; + } + + public void reset(TermsEnum terms, BytesRef term) { + this.terms = terms; + current = term; + reuseDocs = null; + reusePostings = null; + } + } + + private final static class TermMergeQueue extends PriorityQueue { + Comparator termComp; + TermMergeQueue(int size) { + initialize(size); + } + + @Override + protected final boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) { + final int cmp = termComp.compare(termsA.current, termsB.current); + if (cmp != 0) { + return cmp < 0; + } else { + return termsA.subSlice.start < termsB.subSlice.start; + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/MultipleTermPositions.java b/lucene/src/java/org/apache/lucene/index/MultipleTermPositions.java index 2e46c3a0d52..0a819c34b21 100644 --- a/lucene/src/java/org/apache/lucene/index/MultipleTermPositions.java +++ b/lucene/src/java/org/apache/lucene/index/MultipleTermPositions.java @@ -28,8 +28,10 @@ import org.apache.lucene.util.ArrayUtil; /** * Allows you to iterate over the {@link TermPositions} for multiple {@link Term}s as * a single {@link TermPositions}. - * + * @deprecated This class is being replaced by the package + * private MultiDocsEnum on org.apache.lucene.search. */ +@Deprecated public class MultipleTermPositions implements TermPositions { private static final class TermPositionsQueue extends PriorityQueue { diff --git a/lucene/src/java/org/apache/lucene/index/ParallelPostingsArray.java b/lucene/src/java/org/apache/lucene/index/ParallelPostingsArray.java index 86f00ca6681..0887efc6ea6 100644 --- a/lucene/src/java/org/apache/lucene/index/ParallelPostingsArray.java +++ b/lucene/src/java/org/apache/lucene/index/ParallelPostingsArray.java @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.util.ArrayUtil; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -21,25 +23,49 @@ package org.apache.lucene.index; class ParallelPostingsArray { final static int BYTES_PER_POSTING = 3 * DocumentsWriter.INT_NUM_BYTE; + final int size; final int[] textStarts; final int[] intStarts; final int[] byteStarts; - - public ParallelPostingsArray(final int size) { + + ParallelPostingsArray(final int size) { + this.size = size; textStarts = new int[size]; intStarts = new int[size]; byteStarts = new int[size]; } - - ParallelPostingsArray resize(int newSize) { - ParallelPostingsArray newArray = new ParallelPostingsArray(newSize); - copy(this, newArray); + + int bytesPerPosting() { + return BYTES_PER_POSTING; + } + + ParallelPostingsArray newInstance(int size) { + return new ParallelPostingsArray(size); + } + + final ParallelPostingsArray grow() { + int newSize = ArrayUtil.oversize(size + 1, bytesPerPosting()); + ParallelPostingsArray newArray = newInstance(newSize); + copyTo(newArray, size); return newArray; } - - void copy(ParallelPostingsArray fromArray, ParallelPostingsArray toArray) { - System.arraycopy(fromArray.textStarts, 0, toArray.textStarts, 0, fromArray.textStarts.length); - System.arraycopy(fromArray.intStarts, 0, toArray.intStarts, 0, fromArray.intStarts.length); - System.arraycopy(fromArray.byteStarts, 0, toArray.byteStarts, 0, fromArray.byteStarts.length); + + final ParallelPostingsArray shrink(int targetSize, boolean doCopy) { + int shrinkSize = ArrayUtil.getShrinkSize(size, targetSize, bytesPerPosting()); + if (shrinkSize != size) { + ParallelPostingsArray newArray = newInstance(targetSize); + if (doCopy) { + copyTo(newArray, targetSize); + } + return newArray; + } else { + return this; + } + } + + void copyTo(ParallelPostingsArray toArray, int numToCopy) { + System.arraycopy(textStarts, 0, toArray.textStarts, 0, numToCopy); + System.arraycopy(intStarts, 0, toArray.intStarts, 0, numToCopy); + System.arraycopy(byteStarts, 0, toArray.byteStarts, 0, numToCopy); } } diff --git a/lucene/src/java/org/apache/lucene/index/ParallelReader.java b/lucene/src/java/org/apache/lucene/index/ParallelReader.java index 76d0a833cca..8e8905076d6 100644 --- a/lucene/src/java/org/apache/lucene/index/ParallelReader.java +++ b/lucene/src/java/org/apache/lucene/index/ParallelReader.java @@ -21,7 +21,9 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.util.Bits; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.*; @@ -56,6 +58,8 @@ public class ParallelReader extends IndexReader { private int numDocs; private boolean hasDeletions; + private ParallelFields fields = new ParallelFields(); + /** Construct a ParallelReader. *

    Note that all subreaders are closed if this ParallelReader is closed.
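     *
     * (Editorial sketch, not part of this patch: a minimal, hypothetical use of
     * ParallelReader, assuming dir1 and dir2 hold indexes whose documents were
     * added in the same order. The calls shown -- add() and the fields() method
     * introduced by this change -- appear elsewhere in this diff.)
     *
     *   ParallelReader pr = new ParallelReader();
     *   pr.add(IndexReader.open(dir1, true));   // read-only reader over the first index
     *   pr.add(IndexReader.open(dir2, true));   // read-only reader over the second index
     *   Fields flexFields = pr.fields();        // per-field flex view added by this patch
     *   pr.close();                             // also closes both subreaders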

    */ @@ -122,9 +126,11 @@ public class ParallelReader extends IndexReader { Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); readerToFields.put(reader, fields); - for (final String field : fields) { // update fieldToReader map - if (fieldToReader.get(field) == null) + for (final String field : fields) { // update fieldToReader map + if (fieldToReader.get(field) == null) { fieldToReader.put(field, reader); + } + this.fields.addField(field, reader); } if (!ignoreStoredFields) @@ -136,6 +142,67 @@ public class ParallelReader extends IndexReader { } decrefOnClose.add(Boolean.valueOf(incRefReaders)); } + + private class ParallelFieldsEnum extends FieldsEnum { + String currentField; + IndexReader currentReader; + Iterator keys; + + ParallelFieldsEnum() { + keys = fieldToReader.keySet().iterator(); + } + + @Override + public String next() throws IOException { + if (keys.hasNext()) { + currentField = (String) keys.next(); + currentReader = (IndexReader) fieldToReader.get(currentField); + } else { + currentField = null; + currentReader = null; + } + return currentField; + } + + @Override + public TermsEnum terms() throws IOException { + assert currentReader != null; + Terms terms = MultiFields.getTerms(currentReader, currentField); + if (terms != null) { + return terms.iterator(); + } else { + return TermsEnum.EMPTY; + } + } + } + + // Single instance of this, per ParallelReader instance + private class ParallelFields extends Fields { + final HashMap fields = new HashMap(); + + public void addField(String field, IndexReader r) throws IOException { + fields.put(field, MultiFields.getFields(r).terms(field)); + } + + @Override + public FieldsEnum iterator() throws IOException { + return new ParallelFieldsEnum(); + } + @Override + public Terms terms(String field) throws IOException { + return fields.get(field); + } + } + + @Override + public Bits getDeletedDocs() throws IOException { + return MultiFields.getDeletedDocs(readers.get(0)); + } + + @Override + public Fields fields() { + return fields; + } @Override public synchronized Object clone() { @@ -403,6 +470,13 @@ public class ParallelReader extends IndexReader { return reader==null ? 0 : reader.docFreq(term); } + @Override + public int docFreq(String field, BytesRef term) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader == null? 
0 : reader.docFreq(field, term); + } + @Override public TermDocs termDocs(Term term) throws IOException { ensureOpen(); @@ -501,6 +575,7 @@ public class ParallelReader extends IndexReader { return fieldSet; } + @Deprecated private class ParallelTermEnum extends TermEnum { private String field; private Iterator fieldIterator; diff --git a/lucene/src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java b/lucene/src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java index e2e38ebce1a..f6a54470297 100644 --- a/lucene/src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java +++ b/lucene/src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java @@ -18,22 +18,23 @@ package org.apache.lucene.index; */ import org.apache.lucene.store.Directory; +import org.apache.lucene.index.codecs.CodecProvider; import java.io.IOException; import java.util.Map; class ReadOnlyDirectoryReader extends DirectoryReader { - ReadOnlyDirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, int termInfosIndexDivisor) throws IOException { - super(directory, sis, deletionPolicy, true, termInfosIndexDivisor); + ReadOnlyDirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { + super(directory, sis, deletionPolicy, true, termInfosIndexDivisor, codecs); } ReadOnlyDirectoryReader(Directory directory, SegmentInfos infos, SegmentReader[] oldReaders, int[] oldStarts, Map oldNormsCache, boolean doClone, - int termInfosIndexDivisor) throws IOException { - super(directory, infos, oldReaders, oldStarts, oldNormsCache, true, doClone, termInfosIndexDivisor); + int termInfosIndexDivisor, CodecProvider codecs) throws IOException { + super(directory, infos, oldReaders, oldStarts, oldNormsCache, true, doClone, termInfosIndexDivisor, codecs); } - ReadOnlyDirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor) throws IOException { - super(writer, infos, termInfosIndexDivisor); + ReadOnlyDirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { + super(writer, infos, termInfosIndexDivisor, codecs); } @Override diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java index acf6e0b2c93..4fefac9cb07 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java @@ -21,9 +21,13 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BitVector; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.HashSet; import java.util.HashMap; import java.util.ArrayList; import java.util.Collections; @@ -87,10 +91,13 @@ public final class SegmentInfo { // (if it's an older index) private boolean hasProx; // True if this segment has any fields with omitTermFreqAndPositions==false + + private Codec codec; + private Map diagnostics; - public SegmentInfo(String name, int docCount, Directory dir) { + public SegmentInfo(String name, int docCount, Directory dir, Codec codec) { this.name = name; this.docCount = docCount; this.dir = dir; @@ -103,15 +110,13 @@ public final class SegmentInfo { docStoreIsCompoundFile = 
false; delCount = 0; hasProx = true; + this.codec = codec; } - public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) { - this(name, docCount, dir, isCompoundFile, hasSingleNormFile, -1, null, false, true); - } - - public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, - int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx) { - this(name, docCount, dir); + public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, + int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx, + Codec codec) { + this(name, docCount, dir, codec); this.isCompoundFile = (byte) (isCompoundFile ? YES : NO); this.hasSingleNormFile = hasSingleNormFile; preLockless = false; @@ -119,6 +124,7 @@ public final class SegmentInfo { this.docStoreSegment = docStoreSegment; this.docStoreIsCompoundFile = docStoreIsCompoundFile; this.hasProx = hasProx; + this.codec = codec; delCount = 0; assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount; } @@ -144,6 +150,7 @@ public final class SegmentInfo { isCompoundFile = src.isCompoundFile; hasSingleNormFile = src.hasSingleNormFile; delCount = src.delCount; + codec = src.codec; } void setDiagnostics(Map diagnostics) { @@ -162,10 +169,11 @@ public final class SegmentInfo { * @param format format of the segments info file * @param input input handle to read segment info from */ - SegmentInfo(Directory dir, int format, IndexInput input) throws IOException { + SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException { this.dir = dir; name = input.readString(); docCount = input.readInt(); + final String codecName; if (format <= SegmentInfos.FORMAT_LOCKLESS) { delGen = input.readLong(); if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) { @@ -208,6 +216,13 @@ public final class SegmentInfo { else hasProx = true; + // System.out.println(Thread.currentThread().getName() + ": si.read hasProx=" + hasProx + " seg=" + name); + + if (format <= SegmentInfos.FORMAT_FLEX_POSTINGS) + codecName = input.readString(); + else + codecName = "PreFlex"; + if (format <= SegmentInfos.FORMAT_DIAGNOSTICS) { diagnostics = input.readStringStringMap(); } else { @@ -224,8 +239,10 @@ public final class SegmentInfo { docStoreSegment = null; delCount = -1; hasProx = true; + codecName = "PreFlex"; diagnostics = Collections.emptyMap(); } + codec = codecs.lookup(codecName); } void setNumFields(int numFields) { @@ -309,7 +326,7 @@ public final class SegmentInfo { @Override public Object clone () { - SegmentInfo si = new SegmentInfo(name, docCount, dir); + SegmentInfo si = new SegmentInfo(name, docCount, dir, codec); si.isCompoundFile = isCompoundFile; si.delGen = delGen; si.delCount = delCount; @@ -323,6 +340,7 @@ public final class SegmentInfo { si.docStoreOffset = docStoreOffset; si.docStoreSegment = docStoreSegment; si.docStoreIsCompoundFile = docStoreIsCompoundFile; + si.codec = codec; return si; } @@ -373,14 +391,12 @@ public final class SegmentInfo { if (result == null) throw new IOException("cannot read directory " + dir + ": listAll() returned null"); - final IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); - String pattern; - pattern = name + ".s"; - int patternLength = pattern.length(); + final String pattern = name + ".s\\d+"; for(int i = 0; i < 
result.length; i++){ String fileName = result[i]; - if (filter.accept(null, fileName) && fileName.startsWith(pattern) && Character.isDigit(fileName.charAt(patternLength))) - return true; + if (fileName.matches(pattern)) { + return true; + } } return false; } @@ -550,6 +566,7 @@ public final class SegmentInfo { output.writeByte(isCompoundFile); output.writeInt(delCount); output.writeByte((byte) (hasProx ? 1:0)); + output.writeString(codec.name); output.writeStringStringMap(diagnostics); } @@ -562,7 +579,20 @@ public final class SegmentInfo { return hasProx; } - private void addIfExists(List files, String fileName) throws IOException { + /** Can only be called once. */ + public void setCodec(Codec codec) { + assert this.codec == null; + if (codec == null) { + throw new IllegalArgumentException("codec must be non-null"); + } + this.codec = codec; + } + + Codec getCodec() { + return codec; + } + + private void addIfExists(Set files, String fileName) throws IOException { if (dir.fileExists(fileName)) files.add(fileName); } @@ -580,15 +610,17 @@ public final class SegmentInfo { return files; } - files = new ArrayList(); + Set fileSet = new HashSet(); boolean useCompoundFile = getUseCompoundFile(); if (useCompoundFile) { - files.add(IndexFileNames.segmentFileName(name, IndexFileNames.COMPOUND_FILE_EXTENSION)); + fileSet.add(IndexFileNames.segmentFileName(name, IndexFileNames.COMPOUND_FILE_EXTENSION)); } else { - for (String ext : IndexFileNames.NON_STORE_INDEX_EXTENSIONS) - addIfExists(files, IndexFileNames.segmentFileName(name, ext)); + for(String ext : IndexFileNames.NON_STORE_INDEX_EXTENSIONS) { + addIfExists(fileSet, IndexFileNames.segmentFileName(name, ext)); + } + codec.files(dir, this, fileSet); } if (docStoreOffset != -1) { @@ -596,19 +628,19 @@ public final class SegmentInfo { // vectors) with other segments assert docStoreSegment != null; if (docStoreIsCompoundFile) { - files.add(IndexFileNames.segmentFileName(docStoreSegment, IndexFileNames.COMPOUND_FILE_STORE_EXTENSION)); + fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, IndexFileNames.COMPOUND_FILE_STORE_EXTENSION)); } else { for (String ext : IndexFileNames.STORE_INDEX_EXTENSIONS) - addIfExists(files, IndexFileNames.segmentFileName(docStoreSegment, ext)); + addIfExists(fileSet, IndexFileNames.segmentFileName(docStoreSegment, ext)); } } else if (!useCompoundFile) { for (String ext : IndexFileNames.STORE_INDEX_EXTENSIONS) - addIfExists(files, IndexFileNames.segmentFileName(name, ext)); + addIfExists(fileSet, IndexFileNames.segmentFileName(name, ext)); } String delFileName = IndexFileNames.fileNameFromGeneration(name, IndexFileNames.DELETES_EXTENSION, delGen); if (delFileName != null && (delGen >= YES || dir.fileExists(delFileName))) { - files.add(delFileName); + fileSet.add(delFileName); } // Careful logic for norms files @@ -617,14 +649,14 @@ public final class SegmentInfo { long gen = normGen[i]; if (gen >= YES) { // Definitely a separate norm file, with generation: - files.add(IndexFileNames.fileNameFromGeneration(name, IndexFileNames.SEPARATE_NORMS_EXTENSION + i, gen)); + fileSet.add(IndexFileNames.fileNameFromGeneration(name, IndexFileNames.SEPARATE_NORMS_EXTENSION + i, gen)); } else if (NO == gen) { // No separate norms but maybe plain norms // in the non compound file case: if (!hasSingleNormFile && !useCompoundFile) { String fileName = IndexFileNames.segmentFileName(name, IndexFileNames.PLAIN_NORMS_EXTENSION + i); if (dir.fileExists(fileName)) { - files.add(fileName); + fileSet.add(fileName); } } } else if 
(CHECK_DIR == gen) { @@ -636,7 +668,7 @@ public final class SegmentInfo { fileName = IndexFileNames.segmentFileName(name, IndexFileNames.PLAIN_NORMS_EXTENSION + i); } if (fileName != null && dir.fileExists(fileName)) { - files.add(fileName); + fileSet.add(fileName); } } } @@ -644,20 +676,24 @@ public final class SegmentInfo { // Pre-2.1: we have to scan the dir to find all // matching _X.sN/_X.fN files for our segment: String prefix; - if (useCompoundFile) + if (useCompoundFile) { prefix = IndexFileNames.segmentFileName(name, IndexFileNames.SEPARATE_NORMS_EXTENSION); - else + } else { prefix = IndexFileNames.segmentFileName(name, IndexFileNames.PLAIN_NORMS_EXTENSION); - int prefixLength = prefix.length(); + } + final String pattern = prefix + "\\d+"; + String[] allFiles = dir.listAll(); - final IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); for(int i=0;i prefixLength && Character.isDigit(fileName.charAt(prefixLength)) && fileName.startsWith(prefix)) { - files.add(fileName); + if (fileName.matches(pattern)) { + fileSet.add(fileName); } } } + + files = new ArrayList(fileSet); + return files; } diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java index 15759e35666..a368cecf2bb 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java @@ -23,6 +23,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.ChecksumIndexOutput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.NoSuchDirectoryException; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.util.ThreadInterruptedException; import java.io.FileNotFoundException; @@ -88,9 +89,13 @@ public final class SegmentInfos extends Vector { /** This format adds optional per-segment String * diagnostics storage, and switches userData to Map */ public static final int FORMAT_DIAGNOSTICS = -9; + + /** Each segment records whether its postings are written + * in the new flex format */ + public static final int FORMAT_FLEX_POSTINGS = -10; /* This must always point to the most recent file format. 
*/ - static final int CURRENT_FORMAT = FORMAT_DIAGNOSTICS; + static final int CURRENT_FORMAT = FORMAT_FLEX_POSTINGS; public int counter = 0; // used to name new segments /** @@ -228,7 +233,8 @@ public final class SegmentInfos extends Vector { * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public final void read(Directory directory, String segmentFileName) throws CorruptIndexException, IOException { + public final void read(Directory directory, String segmentFileName, + CodecProvider codecs) throws CorruptIndexException, IOException { boolean success = false; // Clear any previous segments: @@ -254,7 +260,7 @@ public final class SegmentInfos extends Vector { } for (int i = input.readInt(); i > 0; i--) { // read segmentInfos - add(new SegmentInfo(directory, format, input)); + add(new SegmentInfo(directory, format, input, codecs)); } if(format >= 0){ // in old format the version number may be at the end of the file @@ -301,14 +307,17 @@ public final class SegmentInfos extends Vector { * @throws IOException if there is a low-level IO error */ public final void read(Directory directory) throws CorruptIndexException, IOException { - + read(directory, CodecProvider.getDefault()); + } + + public final void read(Directory directory, final CodecProvider codecs) throws CorruptIndexException, IOException { generation = lastGeneration = -1; new FindSegmentsFile(directory) { @Override protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { - read(directory, segmentFileName); + read(directory, segmentFileName, codecs); return null; } }.run(); @@ -375,9 +384,11 @@ public final class SegmentInfos extends Vector { public Object clone() { SegmentInfos sis = (SegmentInfos) super.clone(); for(int i=0;i(userData); + sis.userData = new HashMap(userData); return sis; } @@ -399,7 +410,7 @@ public final class SegmentInfos extends Vector { * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public static long readCurrentVersion(Directory directory) + public static long readCurrentVersion(Directory directory, final CodecProvider codecs) throws CorruptIndexException, IOException { // Fully read the segments file: this ensures that it's @@ -417,10 +428,10 @@ public final class SegmentInfos extends Vector { * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public static Map readCurrentUserData(Directory directory) + public static Map readCurrentUserData(Directory directory, CodecProvider codecs) throws CorruptIndexException, IOException { SegmentInfos sis = new SegmentInfos(); - sis.read(directory); + sis.read(directory, codecs); return sis.getUserData(); } diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java index 8cbb6e52646..4369901c4e1 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java @@ -20,15 +20,23 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; - +import java.util.Set; +import java.util.HashSet; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.MergePolicy.MergeAbortedException; +import org.apache.lucene.index.codecs.CodecProvider; +import 
org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.MergeState; +import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.MultiBits; /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, @@ -66,25 +74,14 @@ final class SegmentMerger { /** Maximum number of contiguous documents to bulk-copy when merging stored fields */ private final static int MAX_RAW_MERGE_DOCS = 4192; + + private final CodecProvider codecs; + private Codec codec; + private SegmentWriteState segmentWriteState; - /** This ctor used only by test code. - * - * @param dir The Directory to merge the other segments into - * @param name The name of the new segment - */ - SegmentMerger(Directory dir, String name) { + SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, CodecProvider codecs) { directory = dir; - segment = name; - checkAbort = new CheckAbort(null, null) { - @Override - public void work(double units) throws MergeAbortedException { - // do nothing - } - }; - } - - SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) { - directory = writer.getDirectory(); + this.codecs = codecs; segment = name; if (merge != null) { checkAbort = new CheckAbort(merge, directory); @@ -96,7 +93,7 @@ final class SegmentMerger { } }; } - termIndexInterval = writer.getConfig().getTermIndexInterval(); + this.termIndexInterval = termIndexInterval; } boolean hasProx() { @@ -171,30 +168,27 @@ final class SegmentMerger { } } - final List createCompoundFile(String fileName) + final List createCompoundFile(String fileName, final SegmentInfo info) throws IOException { - CompoundFileWriter cfsWriter = - new CompoundFileWriter(directory, fileName, checkAbort); + CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort); + + Set fileSet = new HashSet(); - List files = - new ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.length + 1); - // Basic files - for (String ext : IndexFileNames.COMPOUND_EXTENSIONS) { - if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx()) - continue; - + for (String ext : IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC) { if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) && - !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) - files.add(IndexFileNames.segmentFileName(segment, ext)); + !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) + fileSet.add(IndexFileNames.segmentFileName(segment, ext)); } + codec.files(directory, info, fileSet); + // Fieldable norm files int numFIs = fieldInfos.size(); for (int i = 0; i < numFIs; i++) { FieldInfo fi = fieldInfos.fieldInfo(i); if (fi.isIndexed && !fi.omitNorms) { - files.add(IndexFileNames.segmentFileName(segment, IndexFileNames.NORMS_EXTENSION)); + fileSet.add(IndexFileNames.segmentFileName(segment, IndexFileNames.NORMS_EXTENSION)); break; } } @@ -202,19 +196,19 @@ final class SegmentMerger { // Vector files if (fieldInfos.hasVectors() && mergeDocStores) { for (String ext : IndexFileNames.VECTOR_EXTENSIONS) { - files.add(IndexFileNames.segmentFileName(segment, ext)); + fileSet.add(IndexFileNames.segmentFileName(segment, ext)); } } // Now merge all added files - for (String file : files) { + for (String file : fileSet) { cfsWriter.addFile(file); } // Perform the merge 
cfsWriter.close(); - return files; + return new ArrayList(fileSet); } private void addIndexed(IndexReader reader, FieldInfos fInfos, @@ -351,13 +345,16 @@ final class SegmentMerger { // details. throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption"); - } else + } else { // If we are skipping the doc stores, that means there // are no deletions in any of these segments, so we // just sum numDocs() of each segment to get total docCount for (final IndexReader reader : readers) { docCount += reader.numDocs(); } + } + + segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, null, docCount, 0, termIndexInterval, codecs); return docCount; } @@ -552,156 +549,116 @@ final class SegmentMerger { } } - private SegmentMergeQueue queue = null; + Codec getCodec() { + return codec; + } private final void mergeTerms() throws CorruptIndexException, IOException { - SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval); + // Let CodecProvider decide which codec will be used to write + // the new segment: + codec = codecs.getWriter(segmentWriteState); + + int docBase = 0; - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + final List fields = new ArrayList(); + final List subReaders = new ArrayList(); + final List slices = new ArrayList(); + final List bits = new ArrayList(); + final List bitsStarts = new ArrayList(); - try { - queue = new SegmentMergeQueue(readers.size()); - - mergeTermInfos(consumer); - - } finally { - consumer.finish(); - if (queue != null) queue.close(); - } - } - - boolean omitTermFreqAndPositions; - - private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException { - int base = 0; - final int readerCount = readers.size(); - for (int i = 0; i < readerCount; i++) { - IndexReader reader = readers.get(i); - TermEnum termEnum = reader.terms(); - SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader); - int[] docMap = smi.getDocMap(); - if (docMap != null) { - if (docMaps == null) { - docMaps = new int[readerCount][]; - delCounts = new int[readerCount]; - } - docMaps[i] = docMap; - delCounts[i] = smi.reader.maxDoc() - smi.reader.numDocs(); - } - - base += reader.numDocs(); - - assert reader.numDocs() == reader.maxDoc() - smi.delCount; - - if (smi.next()) - queue.add(smi); // initialize queue - else - smi.close(); - } - - SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()]; - - String currentField = null; - FormatPostingsTermsConsumer termsConsumer = null; - - while (queue.size() > 0) { - int matchSize = 0; // pop matching terms - match[matchSize++] = queue.pop(); - Term term = match[0].term; - SegmentMergeInfo top = queue.top(); - - while (top != null && term.compareTo(top.term) == 0) { - match[matchSize++] = queue.pop(); - top = queue.top(); - } - - if (currentField != term.field) { - currentField = term.field; - if (termsConsumer != null) - termsConsumer.finish(); - final FieldInfo fieldInfo = fieldInfos.fieldInfo(currentField); - termsConsumer = consumer.addField(fieldInfo); - omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; - } - - int df = appendPostings(termsConsumer, match, matchSize); // add new TermInfo - - checkAbort.work(df/3.0); - - while 
(matchSize > 0) { - SegmentMergeInfo smi = match[--matchSize]; - if (smi.next()) - queue.add(smi); // restore queue - else - smi.close(); // done with a segment - } - } - } - - private byte[] payloadBuffer; - private int[][] docMaps; - int[][] getDocMaps() { - return docMaps; - } - private int[] delCounts; - int[] getDelCounts() { - return delCounts; - } - - /** Process postings from multiple segments all positioned on the - * same term. Writes out merged entries into freqOutput and - * the proxOutput streams. - * - * @param smis array of segments - * @param n number of cells in the array actually occupied - * @return number of documents across all segments where this term was found - * @throws CorruptIndexException if the index is corrupt - * @throws IOException if there is a low-level IO error - */ - private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) - throws CorruptIndexException, IOException { - - final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text); - int df = 0; - for (int i = 0; i < n; i++) { - SegmentMergeInfo smi = smis[i]; - TermPositions postings = smi.getPositions(); - assert postings != null; - int base = smi.base; - int[] docMap = smi.getDocMap(); - postings.seek(smi.termEnum); - - while (postings.next()) { - df++; - int doc = postings.doc(); - if (docMap != null) - doc = docMap[doc]; // map around deletions - doc += base; // convert to merged space - - final int freq = postings.freq(); - final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); - - if (!omitTermFreqAndPositions) { - for (int j = 0; j < freq; j++) { - final int position = postings.nextPosition(); - final int payloadLength = postings.getPayloadLength(); - if (payloadLength > 0) { - if (payloadBuffer == null || payloadBuffer.length < payloadLength) - payloadBuffer = new byte[payloadLength]; - postings.getPayload(payloadBuffer, 0); - } - posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); + final int numReaders = readers.size(); + for(int i=0;i files() throws IOException { return new ArrayList(si.files()); } - + @Override - public TermEnum terms() { + public TermEnum terms() throws IOException { ensureOpen(); - return core.getTermsReader().terms(); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + return ((PreFlexFields) core.fields).tis.terms(); + } else { + // Emulate pre-flex API on top of flex index + return new LegacyTermEnum(null); + } } + /** @deprecated Please switch to the flex API ({@link + * #fields}) instead. */ + @Deprecated @Override public TermEnum terms(Term t) throws IOException { ensureOpen(); - return core.getTermsReader().terms(t); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + return ((PreFlexFields) core.fields).tis.terms(t); + } else { + // Emulate pre-flex API on top of flex index + return new LegacyTermEnum(t); + } } FieldInfos fieldInfos() { @@ -887,6 +870,9 @@ public class SegmentReader extends IndexReader implements Cloneable { return (deletedDocs != null && deletedDocs.get(n)); } + /** @deprecated Switch to the flex API ({@link + * IndexReader#termDocsEnum}) instead. 
*/ + @Deprecated @Override public TermDocs termDocs(Term term) throws IOException { if (term == null) { @@ -895,27 +881,73 @@ public class SegmentReader extends IndexReader implements Cloneable { return super.termDocs(term); } } + + @Override + public Fields fields() throws IOException { + return core.fields; + } + /** @deprecated Switch to the flex API {@link + * IndexReader#termDocsEnum} instead. */ + @Deprecated @Override public TermDocs termDocs() throws IOException { ensureOpen(); - return new SegmentTermDocs(this); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + final PreFlexFields pre = (PreFlexFields) core.fields; + SegmentTermDocs std = new SegmentTermDocs(pre.freqStream, pre.tis, core.fieldInfos); + std.setSkipDocs(deletedDocs); + return std; + } else { + // Emulate old API + return new LegacyTermDocs(); + } } + /** @deprecated Switch to the flex API {@link + * IndexReader#termDocsEnum} instead */ + @Deprecated @Override public TermPositions termPositions() throws IOException { ensureOpen(); - return new SegmentTermPositions(this); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + final PreFlexFields pre = (PreFlexFields) core.fields; + SegmentTermPositions stp = new SegmentTermPositions(pre.freqStream, pre.proxStream, pre.tis, core.fieldInfos); + stp.setSkipDocs(deletedDocs); + return stp; + } else { + // Emulate old API + return new LegacyTermPositions(); + } } @Override public int docFreq(Term t) throws IOException { ensureOpen(); - TermInfo ti = core.getTermsReader().get(t); - if (ti != null) - return ti.docFreq; - else + Terms terms = core.fields.terms(t.field); + if (terms != null) { + return terms.docFreq(new BytesRef(t.text)); + } else { return 0; + } + } + + @Override + public int docFreq(String field, BytesRef term) throws IOException { + ensureOpen(); + + Terms terms = core.fields.terms(field); + if (terms != null) { + return terms.docFreq(term); + } else { + return 0; + } } @Override @@ -1078,17 +1110,13 @@ public class SegmentReader extends IndexReader implements Cloneable { } } - boolean termsIndexLoaded() { - return core.termsIndexIsLoaded(); - } - // NOTE: only called from IndexWriter when a near // real-time reader is opened, or applyDeletes is run, // sharing a segment that's still being merged. This // method is not thread safe, and relies on the // synchronization in IndexWriter - void loadTermsIndex(int termsIndexDivisor) throws IOException { - core.loadTermsIndex(si, termsIndexDivisor); + void loadTermsIndex(int indexDivisor) throws IOException { + core.fields.loadTermsIndex(indexDivisor); } // for testing only @@ -1266,14 +1294,9 @@ public class SegmentReader extends IndexReader implements Cloneable { // same entry in the FieldCache. See LUCENE-1579. @Override public final Object getFieldCacheKey() { - return core.freqStream; + return core; } - - @Override - public long getUniqueTermCount() { - return core.getTermsReader().size(); - } - + /** * Lotsa tests did hacks like:
    * SegmentReader reader = (SegmentReader) IndexReader.open(dir);
    @@ -1283,7 +1306,7 @@ public class SegmentReader extends IndexReader implements Cloneable { */ @Deprecated static SegmentReader getOnlySegmentReader(Directory dir) throws IOException { - return getOnlySegmentReader(IndexReader.open(dir,false)); + return getOnlySegmentReader(IndexReader.open(dir, false)); } static SegmentReader getOnlySegmentReader(IndexReader reader) { @@ -1305,4 +1328,372 @@ public class SegmentReader extends IndexReader implements Cloneable { public int getTermInfosIndexDivisor() { return core.termsIndexDivisor; } + + // Back compat: pre-flex TermEnum API over flex API + @Deprecated + final private class LegacyTermEnum extends TermEnum { + FieldsEnum fields; + TermsEnum terms; + boolean done; + String currentField; + BytesRef currentTerm; + + public LegacyTermEnum(Term t) throws IOException { + fields = core.fields.iterator(); + currentField = fields.next(); + if (currentField == null) { + // no fields + done = true; + } else if (t != null) { + // Pre-seek to this term + + while(currentField.compareTo(t.field) < 0) { + currentField = fields.next(); + if (currentField == null) { + // Hit end of fields + done = true; + break; + } + } + + if (!done) { + // We found some field -- get its terms: + terms = fields.terms(); + + if (currentField == t.field) { + // We found exactly the requested field; now + // seek the term text: + String text = t.text(); + + // this is only for backwards compatibility. + // previously you could supply a term with unpaired surrogates, + // and it would return the next Term. + // if someone does this, tack on the lowest possible trail surrogate. + // this emulates the old behavior, and forms "valid UTF-8" unicode. + BytesRef tr = new BytesRef(UnicodeUtil.nextValidUTF16String(text)); + TermsEnum.SeekStatus status = terms.seek(tr); + + if (status == TermsEnum.SeekStatus.END) { + // Rollover to the next field + terms = null; + next(); + } else if (status == TermsEnum.SeekStatus.FOUND) { + // Found exactly the term + currentTerm = tr; + } else { + // Found another term, in this same field + currentTerm = terms.term(); + } + } else { + // We didn't find exact field (we found the + // following field); advance to first term in + // this field + next(); + } + } + } else { + terms = fields.terms(); + } + } + + @Override + public boolean next() throws IOException { + + if (done) { + return false; + } + + while(true) { + if (terms == null) { + // Advance to the next field + currentField = fields.next(); + if (currentField == null) { + done = true; + return false; + } + terms = fields.terms(); + } + currentTerm = terms.next(); + if (currentTerm != null) { + // This field still has terms + return true; + } else { + // Done producing terms from this field; advance + // to next field + terms = null; + } + } + } + + @Override + public Term term() { + if (!done && terms != null && currentTerm != null) { + return new Term(currentField, currentTerm.utf8ToString()); + } + return null; + } + + @Override + public int docFreq() { + return terms == null ? 
0 : terms.docFreq(); + } + + @Override + public void close() {} + } + + // Back compat: emulates legacy TermDocs API on top of + // flex API + private class LegacyTermDocs implements TermDocs { + + String currentField; + final Fields fields; + TermsEnum terms; + DocsEnum docsEnum; + boolean any; + + LegacyTermDocs() throws IOException { + fields = core.fields; + } + + public void close() {} + + public void seek(TermEnum termEnum) throws IOException { + seek(termEnum.term()); + } + + public boolean skipTo(int target) throws IOException { + if (!any) { + return false; + } else { + return docsEnum.advance(target) != docsEnum.NO_MORE_DOCS; + } + } + + public void seek(Term term) throws IOException { + + any = false; + + if (terms != null && !term.field.equals(currentField)) { + // new field + terms = null; + } + + if (terms == null) { + currentField = term.field; + Terms terms1 = fields.terms(currentField); + if (terms1 == null) { + // no such field + return; + } else { + terms = terms1.iterator(); + } + } + + if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) { + // Term exists + any = true; + pendingBulkResult = null; + docsEnum = terms.docs(deletedDocs, docsEnum); + } + } + + public int doc() { + if (!any) { + return 0; + } else { + return docsEnum.docID(); + } + } + + private DocsEnum.BulkReadResult pendingBulkResult; + private int bulkCount; + private int pendingBulk; + + public int read(int[] docs, int[] freqs) throws IOException { + if (any && pendingBulkResult == null) { + pendingBulkResult = docsEnum.getBulkResult(); + } + if (!any) { + return 0; + } else if (pendingBulk > 0) { + final int left = bulkCount - pendingBulk; + if (docs.length >= left) { + // read all pending + System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, left); + System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, left); + pendingBulk = 0; + return left; + } else { + // read only part of pending + System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, docs.length); + System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, docs.length); + pendingBulk += docs.length; + return docs.length; + } + } else { + // nothing pending + bulkCount = docsEnum.read(); + if (docs.length >= bulkCount) { + System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, bulkCount); + System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, bulkCount); + return bulkCount; + } else { + System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, docs.length); + System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, docs.length); + pendingBulk = docs.length; + return docs.length; + } + } + } + + public int freq() { + if (!any) { + return 0; + } else { + return docsEnum.freq(); + } + } + + public boolean next() throws IOException { + if (!any) { + return false; + } else { + return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS; + } + } + } + + // Back compat: implements legacy TermPositions API on top + // of flex API + final private class LegacyTermPositions implements TermPositions { + + String currentField; + final Fields fields; + TermsEnum terms; + DocsAndPositionsEnum postingsEnum; + DocsEnum docsEnum; + boolean any; + + LegacyTermPositions() throws IOException { + fields = core.fields; + } + + public void close() {} + + public void seek(TermEnum termEnum) throws IOException { + seek(termEnum.term()); + } + + public boolean skipTo(int target) throws IOException { + if (!any) { + return false; + } else { + return docsEnum.advance(target) != 
docsEnum.NO_MORE_DOCS; + } + } + + public void seek(Term term) throws IOException { + + any = false; + + if (terms != null && !term.field.equals(currentField)) { + // new field + terms = null; + } + + if (terms == null) { + currentField = term.field; + Terms terms1 = fields.terms(currentField); + if (terms1 == null) { + // no such field + return; + } else { + terms = terms1.iterator(); + } + } + + if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) { + // Term exists + any = true; + postingsEnum = terms.docsAndPositions(deletedDocs, postingsEnum); + if (postingsEnum == null) { + docsEnum = terms.docs(deletedDocs, postingsEnum); + } else { + docsEnum = postingsEnum; + } + } + } + + public int doc() { + if (!any) { + return 0; + } else { + return docsEnum.docID(); + } + } + + public int freq() { + if (!any) { + return 0; + } else { + return docsEnum.freq(); + } + } + + public boolean next() throws IOException { + if (!any) { + return false; + } else { + return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS; + } + } + + public int read(int[] docs, int[] freqs) throws IOException { + throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead."); + } + + public int nextPosition() throws IOException { + if (!any || postingsEnum == null) { + return 0; + } else { + return postingsEnum.nextPosition(); + } + } + + public int getPayloadLength() { + if (!any || postingsEnum == null) { + return 0; + } else { + return postingsEnum.getPayloadLength(); + } + } + + public byte[] getPayload(byte[] bytes, int offset) throws IOException { + if (!any || postingsEnum == null) { + return null; + } + final BytesRef payload = postingsEnum.getPayload(); + // old API would always used passed in bytes if it + // "fits", else allocate new: + if (bytes != null && payload.length <= bytes.length - offset) { + System.arraycopy(payload.bytes, payload.offset, bytes, offset, payload.length); + return bytes; + } else if (payload.offset == 0 && payload.length == payload.bytes.length) { + return payload.bytes; + } else { + final byte[] retBytes = new byte[payload.length]; + System.arraycopy(payload.bytes, payload.offset, retBytes, 0, payload.length); + return retBytes; + } + } + + public boolean isPayloadAvailable() { + if (!any || postingsEnum == null) { + return false; + } else { + return postingsEnum.hasPayload(); + } + } + } } diff --git a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java index 47552e6c139..5e98be5a95d 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java @@ -19,32 +19,63 @@ package org.apache.lucene.index; import java.util.HashSet; import java.util.Collection; +import java.io.PrintStream; import org.apache.lucene.store.Directory; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; -class SegmentWriteState { - DocumentsWriter docWriter; - Directory directory; - String segmentName; - String docStoreSegmentName; - int numDocs; - int termIndexInterval; - int numDocsInStore; - Collection flushedFiles; +/** + * This class is not meant for public usage; it's only + * public in order to expose access across packages. It's + * used internally when updating the index. 
+ * @lucene.experimental + */ +public class SegmentWriteState { + public final PrintStream infoStream; + public final Directory directory; + public final String segmentName; + public final FieldInfos fieldInfos; + public final String docStoreSegmentName; + public final int numDocs; + public int numDocsInStore; + public final Collection flushedFiles; - public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs, - int numDocsInStore, int termIndexInterval) { - this.docWriter = docWriter; + // Actual codec used + final Codec codec; + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + public final int termIndexInterval; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + public final int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. + */ + public final int maxSkipLevels = 10; + + public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos, + String docStoreSegmentName, int numDocs, + int numDocsInStore, int termIndexInterval, + CodecProvider codecs) { + this.infoStream = infoStream; this.directory = directory; this.segmentName = segmentName; + this.fieldInfos = fieldInfos; this.docStoreSegmentName = docStoreSegmentName; this.numDocs = numDocs; this.numDocsInStore = numDocsInStore; this.termIndexInterval = termIndexInterval; + this.codec = codecs.getWriter(this); flushedFiles = new HashSet(); } - - public String segmentFileName(String ext) { - return segmentName + "." 
+ ext; - } } diff --git a/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java b/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java index a599917aed4..583d2028c90 100644 --- a/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java @@ -90,8 +90,8 @@ final class StoredFieldsWriter { state.flushedFiles.add(fieldsName); state.flushedFiles.add(fieldsIdxName); - state.docWriter.removeOpenFile(fieldsName); - state.docWriter.removeOpenFile(fieldsIdxName); + docWriter.removeOpenFile(fieldsName); + docWriter.removeOpenFile(fieldsIdxName); if (4+((long) state.numDocsInStore)*8 != state.directory.fileLength(fieldsIdxName)) throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(fieldsIdxName) + " length in bytes of " + fieldsIdxName + " file exists?=" + state.directory.fileExists(fieldsIdxName)); diff --git a/lucene/src/java/org/apache/lucene/index/Term.java b/lucene/src/java/org/apache/lucene/index/Term.java index c56483d124e..03ec2fb8bef 100644 --- a/lucene/src/java/org/apache/lucene/index/Term.java +++ b/lucene/src/java/org/apache/lucene/index/Term.java @@ -1,7 +1,5 @@ package org.apache.lucene.index; -import org.apache.lucene.util.StringHelper; - /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,6 +17,8 @@ import org.apache.lucene.util.StringHelper; * limitations under the License. */ +import org.apache.lucene.util.StringHelper; + /** A Term represents a word from text. This is the unit of search. It is composed of two elements, the text of the word, as a string, and the name of @@ -35,7 +35,7 @@ public final class Term implements Comparable, java.io.Serializable { *

    Note that a null field or null text value results in undefined * behavior for most Lucene APIs that accept a Term parameter. */ public Term(String fld, String txt) { - field = StringHelper.intern(fld); + field = fld == null ? null : StringHelper.intern(fld); text = txt; } @@ -49,7 +49,8 @@ public final class Term implements Comparable, java.io.Serializable { this(fld, "", true); } - Term(String fld, String txt, boolean intern) { + /** @lucene.experimental */ + public Term(String fld, String txt, boolean intern) { field = intern ? StringHelper.intern(fld) : fld; // field names are interned text = txt; // unless already known to be } diff --git a/lucene/src/java/org/apache/lucene/index/TermDocs.java b/lucene/src/java/org/apache/lucene/index/TermDocs.java index 5a653ec027a..e645d39bdcb 100644 --- a/lucene/src/java/org/apache/lucene/index/TermDocs.java +++ b/lucene/src/java/org/apache/lucene/index/TermDocs.java @@ -27,8 +27,10 @@ import java.io.Closeable; ordered by document number. @see IndexReader#termDocs() - */ + @deprecated Use {@link DocsEnum} instead +*/ +@Deprecated public interface TermDocs extends Closeable { /** Sets this to the data for a term. * The enumeration is reset to the start of the data for this term. diff --git a/lucene/src/java/org/apache/lucene/index/TermEnum.java b/lucene/src/java/org/apache/lucene/index/TermEnum.java index ac99e2bf14e..ec39a7db592 100644 --- a/lucene/src/java/org/apache/lucene/index/TermEnum.java +++ b/lucene/src/java/org/apache/lucene/index/TermEnum.java @@ -23,8 +23,10 @@ import java.io.Closeable; /** Abstract class for enumerating terms.

    Term enumerations are always ordered by Term.compareTo(). Each term in - the enumeration is greater than all that precede it. */ + the enumeration is greater than all that precede it. +* @deprecated Use TermsEnum instead */ +@Deprecated public abstract class TermEnum implements Closeable { /** Increments the enumeration to the next element. True if one exists.*/ public abstract boolean next() throws IOException; diff --git a/lucene/src/java/org/apache/lucene/index/TermInfosWriter.java b/lucene/src/java/org/apache/lucene/index/TermInfosWriter.java deleted file mode 100644 index 335506e34a4..00000000000 --- a/lucene/src/java/org/apache/lucene/index/TermInfosWriter.java +++ /dev/null @@ -1,228 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.io.IOException; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.ArrayUtil; - - -/** This stores a monotonically increasing set of pairs in a - Directory. A TermInfos can be written once, in order. */ - -final class TermInfosWriter { - /** The file format version, a negative number. */ - public static final int FORMAT = -3; - - // Changed strings to true utf8 with length-in-bytes not - // length-in-chars - public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; - - // NOTE: always change this if you switch to a new format! - public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; - - private FieldInfos fieldInfos; - private IndexOutput output; - private TermInfo lastTi = new TermInfo(); - private long size; - - // TODO: the default values for these two parameters should be settable from - // IndexWriter. However, once that's done, folks will start setting them to - // ridiculous values and complaining that things don't work well, as with - // mergeFactor. So, let's wait until a number of folks find that alternate - // values work better. Note that both of these values are stored in the - // segment, so that it's safe to change these w/o rebuilding all indexes. - - /** Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful.*/ - int indexInterval = 128; - - /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accelerate {@link TermDocs#skipTo(int)}. 
Larger values result in - * smaller indexes, greater acceleration, but fewer accelerable cases, while - * smaller values result in bigger indexes, less acceleration and more - * accelerable cases. More detailed experiments would be useful here. */ - int skipInterval = 16; - - /** Expert: The maximum number of skip levels. Smaller values result in - * slightly smaller indexes, but slower skipping in big posting lists. - */ - int maxSkipLevels = 10; - - private long lastIndexPointer; - private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; - private int lastFieldNumber = -1; - - private TermInfosWriter other; - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); - - TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval) - throws IOException { - initialize(directory, segment, fis, interval, false); - other = new TermInfosWriter(directory, segment, fis, interval, true); - other.other = this; - } - - private TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval, boolean isIndex) throws IOException { - initialize(directory, segment, fis, interval, isIndex); - } - - private void initialize(Directory directory, String segment, FieldInfos fis, - int interval, boolean isi) throws IOException { - indexInterval = interval; - fieldInfos = fis; - isIndex = isi; - output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(FORMAT_CURRENT); // write format - output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels - assert initUTF16Results(); - } - - void add(Term term, TermInfo ti) throws IOException { - UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result); - add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti); - } - - // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; - - // Currently used only by assert statements - private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); - return true; - } - - // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { - - if (lastFieldNumber != fieldNumber) { - final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); - // If there is a field named "" (empty string) then we - // will get 0 on this comparison, yet, it's "OK". But - // it's not OK if two different field numbers map to - // the same name. - if (cmp != 0 || lastFieldNumber != -1) - return cmp; - } - - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); - final int len; - if (utf16Result1.length < utf16Result2.length) - len = utf16Result1.length; - else - len = utf16Result2.length; - - for(int i=0;i, TermInfo> pair to the set. - Term must be lexicographically greater than all previous Terms added. 
- TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) - throws IOException { - - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : - "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); - - assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; - assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; - - if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term - - writeTerm(fieldNumber, termBytes, termBytesLength); // write term - - output.writeVInt(ti.docFreq); // write doc freq - output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers - output.writeVLong(ti.proxPointer - lastTi.proxPointer); - - if (ti.docFreq >= skipInterval) { - output.writeVInt(ti.skipOffset); - } - - if (isIndex) { - output.writeVLong(other.output.getFilePointer() - lastIndexPointer); - lastIndexPointer = other.output.getFilePointer(); // write pointer - } - - lastFieldNumber = fieldNumber; - lastTi.set(ti); - size++; - } - - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) - throws IOException { - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute prefix in common with last term: - int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; - while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) - break; - start++; - } - - final int length = termBytesLength - start; - output.writeVInt(start); // write shared prefix length - output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes - output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; - } - - /** Called to complete TermInfos creation. */ - void close() throws IOException { - output.seek(4); // write size after format - output.writeLong(size); - output.close(); - - if (!isIndex) - other.close(); - } - -} diff --git a/lucene/src/java/org/apache/lucene/index/TermPositions.java b/lucene/src/java/org/apache/lucene/index/TermPositions.java index a22f9f08021..fea1d57abf7 100644 --- a/lucene/src/java/org/apache/lucene/index/TermPositions.java +++ b/lucene/src/java/org/apache/lucene/index/TermPositions.java @@ -26,8 +26,9 @@ import java.io.IOException; * positions of each occurrence of a term in a document. 
* * @see IndexReader#termPositions() + * @deprecated Use {@link DocsAndPositionsEnum} instead */ - +@Deprecated public interface TermPositions extends TermDocs { diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java index 81bd04318df..abec3d150be 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java @@ -22,7 +22,7 @@ import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.BytesRef; final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { @@ -106,6 +106,8 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { final int numPostings = termsHashPerField.numPostings; + final BytesRef flushTerm = perThread.flushTerm; + assert numPostings >= 0; if (!doVectors || numPostings == 0) @@ -126,7 +128,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { perThread.doc.addField(termsHashPerField.fieldInfo.number); TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray; - final int[] termIDs = termsHashPerField.sortPostings(); + // TODO: we may want to make this sort in same order + // as Codec's terms dict? + final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator()); tvf.writeVInt(numPostings); byte bits = 0x0; @@ -136,46 +140,40 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; tvf.writeByte(bits); - int encoderUpto = 0; - int lastTermBytesCount = 0; - + int lastLen = 0; + byte[] lastBytes = null; + int lastStart = 0; + final ByteSliceReader reader = perThread.vectorSliceReader; - final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers; + final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool; + for(int j=0;j> DocumentsWriter.CHAR_BLOCK_SHIFT]; - final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK; + // Get BytesRef + termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]); - // We swap between two encoders to save copying - // last Term's byte array - final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto]; - - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); - final int termBytesCount = utf8Result.length; - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute common prefix between last term and + // Compute common byte prefix between last term and // this term int prefix = 0; if (j > 0) { - final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result; - final byte[] termBytes = perThread.utf8Results[encoderUpto].result; - while(prefix < lastTermBytesCount && prefix < termBytesCount) { - if (lastTermBytes[prefix] != termBytes[prefix]) + while(prefix < lastLen && prefix < flushTerm.length) { + if (lastBytes[lastStart+prefix] != flushTerm.bytes[flushTerm.offset+prefix]) { break; + } prefix++; } } - encoderUpto = 1-encoderUpto; - lastTermBytesCount = termBytesCount; - final int suffix = termBytesCount - prefix; + lastLen = flushTerm.length; + lastBytes = flushTerm.bytes; + 
lastStart = flushTerm.offset; + + final int suffix = flushTerm.length - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); - tvf.writeBytes(utf8Result.result, prefix, suffix); + tvf.writeBytes(flushTerm.bytes, lastStart+prefix, suffix); tvf.writeVInt(freq); if (doVectorPositions) { @@ -209,9 +207,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { @Override void newTerm(final int termID) { - assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start"); - TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray; postings.freqs[termID] = 1; @@ -275,23 +271,25 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { int[] lastOffsets; // Last offset we saw int[] lastPositions; // Last position where this term occurred + ParallelPostingsArray newInstance(int size) { + return new TermVectorsPostingsArray(size); + } + @Override - ParallelPostingsArray resize(int newSize) { - TermVectorsPostingsArray newArray = new TermVectorsPostingsArray(newSize); - copy(this, newArray); - return newArray; + void copyTo(ParallelPostingsArray toArray, int numToCopy) { + assert toArray instanceof TermVectorsPostingsArray; + TermVectorsPostingsArray to = (TermVectorsPostingsArray) toArray; + + super.copyTo(toArray, numToCopy); + + System.arraycopy(freqs, 0, to.freqs, 0, size); + System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, size); + System.arraycopy(lastPositions, 0, to.lastPositions, 0, size); } - - void copy(TermVectorsPostingsArray fromArray, TermVectorsPostingsArray toArray) { - super.copy(fromArray, toArray); - System.arraycopy(fromArray.freqs, 0, toArray.freqs, 0, fromArray.freqs.length); - System.arraycopy(fromArray.lastOffsets, 0, toArray.lastOffsets, 0, fromArray.lastOffsets.length); - System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length); + + @Override + int bytesPerPosting() { + return super.bytesPerPosting() + 3 * DocumentsWriter.INT_NUM_BYTE; } } - - @Override - int bytesPerPosting() { - return ParallelPostingsArray.BYTES_PER_POSTING + 3 * DocumentsWriter.INT_NUM_BYTE; - } } diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java index e4b06a29209..bf81fd60fee 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java @@ -17,13 +17,14 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.BytesRef; final class TermVectorsTermsWriterPerThread extends TermsHashConsumerPerThread { final TermVectorsTermsWriter termsWriter; final TermsHashPerThread termsHashPerThread; final DocumentsWriter.DocState docState; + final BytesRef flushTerm = new BytesRef(); TermVectorsTermsWriter.PerDoc doc; @@ -36,9 +37,6 @@ final class TermVectorsTermsWriterPerThread extends TermsHashConsumerPerThread { // Used by perField when serializing the term vectors final ByteSliceReader vectorSliceReader = new ByteSliceReader(); - final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(), - new UnicodeUtil.UTF8Result()}; - @Override public void startDocument() { assert clearLastVectorFieldName(); diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java b/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java index 2870dd0ec12..ae041a13f3d 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java @@ -19,6 +19,7 @@ package org.apache.lucene.index; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; @@ -28,8 +29,7 @@ final class TermVectorsWriter { private IndexOutput tvx = null, tvd = null, tvf = null; private FieldInfos fieldInfos; - final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(), - new UnicodeUtil.UTF8Result()}; + final BytesRef[] utf8Results = new BytesRef[] {new BytesRef(10), new BytesRef(10)}; public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos) @@ -107,14 +107,14 @@ final class TermVectorsWriter { UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]); - int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result, + int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].bytes, utf8Results[1-utf8Upto].length, - utf8Results[utf8Upto].result, + utf8Results[utf8Upto].bytes, utf8Results[utf8Upto].length); int length = utf8Results[utf8Upto].length - start; tvf.writeVInt(start); // write shared prefix length tvf.writeVInt(length); // write delta length - tvf.writeBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes + tvf.writeBytes(utf8Results[utf8Upto].bytes, start, length); // write delta bytes utf8Upto = 1-utf8Upto; final int termFreq = freqs[j]; diff --git a/lucene/src/java/org/apache/lucene/index/Terms.java b/lucene/src/java/org/apache/lucene/index/Terms.java new file mode 100644 index 00000000000..40a54b36b38 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/Terms.java @@ -0,0 +1,101 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CloseableThreadLocal; + +/** + * Access to the terms in a specific field. See {@link Fields}. + * @lucene.experimental + */ + +public abstract class Terms { + + // Privately cache a TermsEnum per-thread for looking up + // docFreq and getting a private DocsEnum + private final CloseableThreadLocal threadEnums = new CloseableThreadLocal(); + + /** Returns an iterator that will step through all + * terms. This method will not return null.*/ + public abstract TermsEnum iterator() throws IOException; + + /** Return the BytesRef Comparator used to sort terms + * provided by the iterator. This method may return null + * if there are no terms. This method may be invoked + * many times; it's best to cache a single instance & + * reuse it. */ + public abstract Comparator getComparator() throws IOException; + + /** Returns the number of documents containing the + * specified term text. Returns 0 if the term does not + * exist. */ + public int docFreq(BytesRef text) throws IOException { + final TermsEnum termsEnum = getThreadTermsEnum(); + if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + return termsEnum.docFreq(); + } else { + return 0; + } + } + + /** Get DocsEnum for the specified term. This method may + * return null if the term does not exist. */ + public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException { + final TermsEnum termsEnum = getThreadTermsEnum(); + if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + return termsEnum.docs(skipDocs, reuse); + } else { + return null; + } + } + + /** Get DocsEnum for the specified term. This method will + * may return null if the term does not exists, or + * positions were not indexed. 
*/ + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef text, DocsAndPositionsEnum reuse) throws IOException { + final TermsEnum termsEnum = getThreadTermsEnum(); + if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + return termsEnum.docsAndPositions(skipDocs, reuse); + } else { + return null; + } + } + + public long getUniqueTermCount() throws IOException { + throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()"); + } + + protected TermsEnum getThreadTermsEnum() throws IOException { + TermsEnum termsEnum = (TermsEnum) threadEnums.get(); + if (termsEnum == null) { + termsEnum = iterator(); + threadEnums.set(termsEnum); + } + return termsEnum; + } + + // subclass must close when done: + protected void close() { + threadEnums.close(); + } + public final static Terms[] EMPTY_ARRAY = new Terms[0]; +} diff --git a/lucene/src/java/org/apache/lucene/index/TermsEnum.java b/lucene/src/java/org/apache/lucene/index/TermsEnum.java new file mode 100644 index 00000000000..3c571862467 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/TermsEnum.java @@ -0,0 +1,181 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** Iterator to seek ({@link #seek}) or step through ({@link + * #next} terms, obtain frequency information ({@link + * #docFreq}), and obtain a {@link DocsEnum} or {@link + * DocsAndPositionsEnum} for the current term ({@link + * #docs}. + * + *
    Term enumerations are always ordered by + * {@link #getComparator}. Each term in the enumeration is + * greater than all that precede it.
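// Editor's illustrative sketch, not part of this patch: walking a field's terms
// with the flex API described above. It assumes a Terms instance for one field
// (e.g. obtained via Fields.terms(field)) plus the usual org.apache.lucene.index
// and org.apache.lucene.util imports; passing null for skipDocs is taken here to
// mean "no deleted documents".
static void walkTerms(Terms terms) throws IOException {
  TermsEnum termsEnum = terms.iterator();           // must call next() or seek() before term()
  BytesRef term;
  while ((term = termsEnum.next()) != null) {       // terms arrive in getComparator() order
    int docFreq = termsEnum.docFreq();              // number of docs containing the current term
    DocsEnum docsEnum = termsEnum.docs(null, null); // null skipDocs, no DocsEnum reuse
    while (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
      int docID = docsEnum.docID();
      // ... consume docID, docsEnum.freq(), docFreq ...
    }
  }
}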
+ * + *
    On obtaining a TermsEnum, you must first call + * {@link #next} or {@link #seek}. + * + * @lucene.experimental */ +public abstract class TermsEnum { + + private AttributeSource atts = null; + + /** Returns the related attributes. */ + public AttributeSource attributes() { + if (atts == null) atts = new AttributeSource(); + return atts; + } + + /** Represents returned result from {@link #seek}. + * If status is FOUND, then the precise term was found. + * If status is NOT_FOUND, then a different term was + * found. If the status is END, the end of the iteration + * was hit. */ + public static enum SeekStatus {END, FOUND, NOT_FOUND}; + + /** Expert: just like {@link #seek(BytesRef)} but allows + * you to control whether the implementation should + * attempt to use its term cache (if it uses one). */ + public abstract SeekStatus seek(BytesRef text, boolean useCache) throws IOException; + + /** Seeks to the specified term. Returns SeekStatus to + * indicate whether exact term was found, a different + * term was found, or EOF was hit. The target term may + * be befor or after the current term. */ + public final SeekStatus seek(BytesRef text) throws IOException { + return seek(text, true); + } + + /** Seeks to the specified term by ordinal (position) as + * previously returned by {@link #ord}. The target ord + * may be befor or after the current ord. See {@link + * #seek(BytesRef)}. */ + public abstract SeekStatus seek(long ord) throws IOException; + + /** Increments the enumeration to the next element. + * Returns the resulting term, or null if the end was + * hit. The returned BytesRef may be re-used across calls + * to next. */ + public abstract BytesRef next() throws IOException; + + /** Returns current term. Do not call this before calling + * next() for the first time, after next() returns null + * or after seek returns {@link SeekStatus#END}.*/ + public abstract BytesRef term() throws IOException; + + /** Returns ordinal position for current term. This is an + * optional method (the codec may throw {@link + * UnsupportedOperationException}). Do not call this + * before calling {@link #next} for the first time or after + * {@link #next} returns null or {@link #seek} returns + * END; */ + public abstract long ord() throws IOException; + + /** Returns the number of documents containing the current + * term. Do not call this before calling next() for the + * first time, after next() returns null or seek returns + * {@link SeekStatus#END}.*/ + public abstract int docFreq(); + + /** Get {@link DocsEnum} for the current term. Do not + * call this before calling {@link #next} or {@link + * #seek} for the first time. This method will not + * return null. + * + * @param skipDocs set bits are documents that should not + * be returned + * @param reuse pass a prior DocsEnum for possible reuse */ + public abstract DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException; + + /** Get {@link DocsAndPositionsEnum} for the current term. + * Do not call this before calling {@link #next} or + * {@link #seek} for the first time. This method will + * only return null if positions were not indexed into + * the postings by this codec. */ + public abstract DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; + + /** Return the {@link BytesRef} Comparator used to sort + * terms provided by the iterator. This may return + * null if there are no terms. Callers may invoke this + * method many times, so it's best to cache a single + * instance & reuse it. 
*/ + public abstract Comparator getComparator() throws IOException; + + /** An empty TermsEnum for quickly returning an empty instance e.g. + * in {@link org.apache.lucene.search.MultiTermQuery} + *
    Please note: This enum should be unmodifiable, + * but it is currently possible to add Attributes to it. + * This should not be a problem, as the enum is always empty and + * the existence of unused Attributes does not matter. + */ + public static final TermsEnum EMPTY = new TermsEnum() { + @Override + public SeekStatus seek(BytesRef term, boolean useCache) { return SeekStatus.END; } + + @Override + public SeekStatus seek(long ord) { return SeekStatus.END; } + + @Override + public BytesRef term() { + throw new IllegalStateException("this method should never be called"); + } + + @Override + public Comparator getComparator() { + // return an unused dummy to prevent NPE + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + + @Override + public int docFreq() { + throw new IllegalStateException("this method should never be called"); + } + + @Override + public long ord() { + throw new IllegalStateException("this method should never be called"); + } + + @Override + public DocsEnum docs(Bits bits, DocsEnum reuse) { + throw new IllegalStateException("this method should never be called"); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse) { + throw new IllegalStateException("this method should never be called"); + } + + @Override + public BytesRef next() { + return null; + } + + @Override // make it synchronized here, to prevent double lazy init + public synchronized AttributeSource attributes() { + return super.attributes(); + } + }; +} diff --git a/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java b/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java index 61634bf635d..9aaeebd22c2 100644 --- a/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java +++ b/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java @@ -34,8 +34,6 @@ abstract class TermsHashConsumerPerField { abstract void newTerm(int termID) throws IOException; abstract void addTerm(int termID) throws IOException; abstract int getStreamCount(); - - abstract ParallelPostingsArray createPostingsArray(int size); - abstract int bytesPerPosting(); + abstract ParallelPostingsArray createPostingsArray(int size); } diff --git a/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java index 0f7a7ef5922..edc762bf984 100644 --- a/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java +++ b/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java @@ -19,10 +19,13 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; final class TermsHashPerField extends InvertedDocConsumerPerField { @@ -32,12 +35,12 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { final TermsHashPerThread perThread; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; - TermAttribute termAtt; - + TermToBytesRefAttribute termAtt; + // Copied from our perThread - final CharBlockPool charPool; final IntBlockPool intPool; final ByteBlockPool bytePool; + final ByteBlockPool termBytePool; final int streamCount; final int numPostingInt; @@ -52,43 +55,42 
@@ final class TermsHashPerField extends InvertedDocConsumerPerField { private int[] postingsHash; ParallelPostingsArray postingsArray; - - private final int bytesPerPosting; - + private final BytesRef utf8; + private Comparator termComp; + public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) { this.perThread = perThread; intPool = perThread.intPool; - charPool = perThread.charPool; bytePool = perThread.bytePool; + termBytePool = perThread.termBytePool; docState = perThread.docState; + postingsHash = new int[postingsHashSize]; Arrays.fill(postingsHash, -1); + bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT); + fieldState = docInverterPerField.fieldState; this.consumer = perThread.consumer.addField(this, fieldInfo); + postingsArray = consumer.createPostingsArray(postingsHashSize/2); + bytesUsed(postingsArray.size * postingsArray.bytesPerPosting()); + streamCount = consumer.getStreamCount(); numPostingInt = 2*streamCount; + utf8 = perThread.utf8; this.fieldInfo = fieldInfo; if (nextPerThread != null) nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo); else nextPerField = null; - - // +3: Posting is referenced by hash, which - // targets 25-50% fill factor; approximate this - // as 3X # pointers - bytesPerPosting = consumer.bytesPerPosting() + 3*DocumentsWriter.INT_NUM_BYTE; } - - void initPostingsArray() { - assert postingsArray == null; - postingsArray = consumer.createPostingsArray(postingsHashSize); - + // sugar: just forwards to DW + private void bytesUsed(long size) { if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * postingsHashSize); + perThread.termsHash.docWriter.bytesUsed(size); } } - + void shrinkHash(int targetSize) { assert postingsCompacted || numPostings == 0; @@ -100,13 +102,20 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { } if (newSize != postingsHash.length) { + final long previousSize = postingsHash.length; postingsHash = new int[newSize]; + bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT); Arrays.fill(postingsHash, -1); - postingsArray = null; postingsHashSize = newSize; postingsHashHalfSize = newSize/2; postingsHashMask = newSize-1; } + + if (postingsArray != null) { + final int startSize = postingsArray.size; + postingsArray = postingsArray.shrink(targetSize, false); + bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - startSize)); + } } public void reset() { @@ -129,14 +138,10 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { nextPerField.abort(); } - private void growParallelPostingsArray() { - int oldSize = postingsArray.byteStarts.length; - int newSize = (int) (oldSize * 1.5); - this.postingsArray = this.postingsArray.resize(newSize); - - if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * (newSize - oldSize)); - } + private final void growParallelPostingsArray() { + int oldSize = postingsArray.size; + this.postingsArray = this.postingsArray.grow(); + bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)); } public void initReader(ByteSliceReader reader, int termID, int stream) { @@ -166,7 +171,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { } /** Collapse the hash table & sort in-place. 
*/ - public int[] sortPostings() { + public int[] sortPostings(Comparator termComp) { + this.termComp = termComp; compactPostings(); quickSort(postingsHash, 0, numPostings-1); return postingsHash; @@ -237,50 +243,48 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { * returns -1 if p1 < p2; 1 if p1 > p2; else 0. */ int comparePostings(int term1, int term2) { - if (term1 == term2) + if (term1 == term2) { + // Our quicksort does this, eg during partition return 0; - - final int textStart1 = postingsArray.textStarts[term1]; - final int textStart2 = postingsArray.textStarts[term2]; - - final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK; - final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK; - - assert text1 != text2 || pos1 != pos2; - - while(true) { - final char c1 = text1[pos1++]; - final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else - // This method should never compare equal postings - // unless p1==p2 - assert c1 != 0xffff; } + + termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]); + termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]); + + return termComp.compare(perThread.tr1, perThread.tr2); } /** Test whether the text for current RawPostingList p equals - * current tokenText. */ - private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) { + * current tokenText in utf8. */ + private boolean postingEquals(final int termID) { final int textStart = postingsArray.textStarts[termID]; - - final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; + final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; assert text != null; - int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK; - int tokenPos = 0; - for(;tokenPos= postingsArray.textStarts.length) { + if (termID >= postingsArray.size) { growParallelPostingsArray(); } - if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesUsed(bytesPerPosting); - } assert termID >= 0; @@ -392,48 +397,15 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { // We are first in the chain so we must "intern" the // term text into textStart address - // Get the text of this term. - final char[] tokenText = termAtt.termBuffer(); - final int tokenTextLen = termAtt.termLength(); - - // Compute hashcode & replace any invalid UTF16 sequences - int downto = tokenTextLen; - int code = 0; - while (downto > 0) { - char ch = tokenText[--downto]; - - if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) { - if (0 == downto) { - // Unpaired - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } else { - final char ch2 = tokenText[downto-1]; - if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) { - // OK: high followed by low. This is a valid - // surrogate pair. 
- code = ((code*31) + ch)*31+ch2; - downto--; - continue; - } else { - // Unpaired - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } - } - } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || - ch == 0xffff)) { - // Unpaired or 0xffff - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } - - code = (code*31) + ch; - } + // Get the text & hash of this term. + int code = termAtt.toBytesRef(utf8); int hashPos = code & postingsHashMask; // Locate RawPostingList in hash int termID = postingsHash[hashPos]; - if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) { + if (termID != -1 && !postingEquals(termID)) { // Conflict: keep searching different locations in // the hash table. final int inc = ((code>>8)+code)|1; @@ -441,61 +413,86 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { code += inc; hashPos = code & postingsHashMask; termID = postingsHash[hashPos]; - } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)); + } while (termID != -1 && !postingEquals(termID)); } if (termID == -1) { // First time we are seeing this token since we last // flushed the hash. - final int textLen1 = 1+tokenTextLen; - if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) { - if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) { + final int textLen2 = 2+utf8.length; + if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) { + // Not enough room in current block + + if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) { // Just skip this term, to remain as robust as // possible during indexing. A TokenFilter // can be inserted into the analyzer chain if // other behavior is wanted (pruning the term // to a prefix, throwing an exception, etc). - - if (docState.maxTermPrefix == null) - docState.maxTermPrefix = new String(tokenText, 0, 30); + if (docState.maxTermPrefix == null) { + final int saved = utf8.length; + try { + utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8); + docState.maxTermPrefix = utf8.toString(); + } finally { + utf8.length = saved; + } + } consumer.skippingLongTerm(); return; } - charPool.nextBuffer(); + bytePool.nextBuffer(); } // New posting termID = numPostings++; - if (termID >= postingsArray.textStarts.length) { + if (termID >= postingsArray.size) { growParallelPostingsArray(); } - if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesUsed(bytesPerPosting); - } assert termID != -1; - - final char[] text = charPool.buffer; - final int textUpto = charPool.charUpto; - postingsArray.textStarts[termID] = textUpto + charPool.charOffset; - charPool.charUpto += textLen1; - System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen); - text[textUpto+tokenTextLen] = 0xffff; - assert postingsHash[hashPos] == -1; + postingsHash[hashPos] = termID; - if (numPostings == postingsHashHalfSize) + final byte[] text = bytePool.buffer; + final int textUpto = bytePool.byteUpto; + postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset; + + // We first encode the length, followed by the UTF8 + // bytes. Length is encoded as vInt, but will consume + // 1 or 2 bytes at most (we reject too-long terms, + // above). 
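// Editor's worked example, not part of this patch: a 5-byte term stores its length
// below as the single byte 0x05, while a 200-byte term needs two bytes:
// 0x80 | (200 & 0x7f) = 0xC8 followed by 200 >> 7 = 0x01. The matching decode later
// in this file recovers (0xC8 & 0x7f) + (0x01 << 7) = 72 + 128 = 200.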
+ + // encode length @ start of bytes + if (utf8.length < 128) { + // 1 byte to store length + text[textUpto] = (byte) utf8.length; + bytePool.byteUpto += utf8.length + 1; + System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length); + } else { + // 2 byte to store length + text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f)); + text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff); + bytePool.byteUpto += utf8.length + 2; + System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length); + } + + if (numPostings == postingsHashHalfSize) { rehashPostings(2*postingsHashSize); + bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT); + } // Init stream slices - if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) + if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) { intPool.nextBuffer(); + } - if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) + if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) { bytePool.nextBuffer(); + } intUptos = intPool.buffer; intUptoStart = intPool.intUpto; @@ -577,16 +574,28 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { int code; if (perThread.primary) { final int textStart = postingsArray.textStarts[termID]; - final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK; - final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos = start; - while(text[pos] != 0xffff) - pos++; + final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK; + final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; code = 0; - while (pos > start) - code = (code*31) + text[--pos]; - } else + + final int len; + int pos; + if ((text[start] & 0x80) == 0) { + // length is 1 byte + len = text[start]; + pos = start+1; + } else { + len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7); + pos = start+2; + } + + final int endPos = pos+len; + while(pos < endPos) { + code = (code*31) + text[pos++]; + } + } else { code = postingsArray.textStarts[termID]; + } int hashPos = code & newMask; assert hashPos >= 0; @@ -603,6 +612,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { postingsHashMask = newMask; postingsHash = newHash; + postingsHashSize = newSize; postingsHashHalfSize = newSize >> 1; } diff --git a/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java b/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java index b1c3784b057..16ef4422023 100644 --- a/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java +++ b/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java @@ -17,6 +17,11 @@ package org.apache.lucene.index; * limitations under the License. 
*/ +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; + import java.io.IOException; final class TermsHashPerThread extends InvertedDocConsumerPerThread { @@ -25,30 +30,54 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread { final TermsHashConsumerPerThread consumer; final TermsHashPerThread nextPerThread; - final CharBlockPool charPool; final IntBlockPool intPool; final ByteBlockPool bytePool; + final ByteBlockPool termBytePool; final boolean primary; final DocumentsWriter.DocState docState; + // Used when comparing postings via termRefComp, in TermsHashPerField + final BytesRef tr1 = new BytesRef(); + final BytesRef tr2 = new BytesRef(); + + // Used by perField: + final BytesRef utf8 = new BytesRef(10); + + final LegacyTermAttributeWrapper legacyTermAttributeWrapper = new LegacyTermAttributeWrapper(); + + /** This class is used to wrap a legacy TermAttribute without support for {@link TermToBytesRefAttribute}. */ + @Deprecated + static class LegacyTermAttributeWrapper implements TermToBytesRefAttribute { + private TermAttribute termAtt = null; + + void setTermAttribute(TermAttribute termAtt) { + this.termAtt = termAtt; + } + + public int toBytesRef(BytesRef target) { + assert target.bytes != null : "target byteref must be != null, because utf8 is used here"; + return UnicodeUtil.UTF16toUTF8WithHash(termAtt.termBuffer(), 0, termAtt.termLength(), target); + } + } + public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) { docState = docInverterPerThread.docState; this.termsHash = termsHash; this.consumer = termsHash.consumer.addThread(this); - if (nextTermsHash != null) { - // We are primary - charPool = new CharBlockPool(termsHash.docWriter); - primary = true; - } else { - charPool = primaryPerThread.charPool; - primary = false; - } - intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations); bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations); + if (nextTermsHash != null) { + // We are primary + primary = true; + termBytePool = bytePool; + } else { + primary = false; + termBytePool = primaryPerThread.bytePool; + } + if (nextTermsHash != null) nextPerThread = nextTermsHash.addThread(docInverterPerThread, this); else @@ -97,7 +126,8 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread { intPool.reset(); bytePool.reset(); - if (primary) - charPool.reset(); + if (primary) { + bytePool.reset(); + } } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/Codec.java b/lucene/src/java/org/apache/lucene/index/codecs/Codec.java new file mode 100644 index 00000000000..42984ee33e3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/Codec.java @@ -0,0 +1,59 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.store.Directory; + +/** @lucene.experimental */ +public abstract class Codec { + + /** Unique name that's used to retrieve this codec when + * reading the index */ + public String name; + + /** Writes a new segment */ + public abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException; + + public static void debug(String s, String desc) { + if (desc != null) { + System.out.println(Thread.currentThread().getName()+ " [" + desc + "]:" + s); + } else { + System.out.println(Thread.currentThread().getName() + ": " + s); + } + } + public static void debug(String s) { + debug(s, null); + } + + /** Reads a segment. NOTE: by the time this call + * returns, it must hold open any files it will need to + * use; else, those files may be deleted. */ + public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException; + + /** Gathers files associated with this segment */ + public abstract void files(Directory dir, SegmentInfo segmentInfo, Set files) throws IOException; + + /** Records all file extensions this codec uses */ + public abstract void getExtensions(Set extensions); +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java new file mode 100644 index 00000000000..a3ae4c4f8cb --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java @@ -0,0 +1,108 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.intblock.IntBlockCodec; +import org.apache.lucene.index.codecs.preflex.PreFlexCodec; +import org.apache.lucene.index.codecs.pulsing.PulsingCodec; +import org.apache.lucene.index.codecs.sep.SepCodec; +import org.apache.lucene.index.codecs.standard.StandardCodec; + +/** Holds a set of codecs, keyed by name. 
You subclass + * this, instantiate it, and register your codecs, then + * pass this instance to IndexReader/IndexWriter (via + * package private APIs) to use different codecs when + * reading & writing segments. + * + * @lucene.experimental */ + +public abstract class CodecProvider { + + private final HashMap codecs = new HashMap(); + + private final Set knownExtensions = new HashSet(); + + private static String defaultCodec = "Standard"; + + public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock"}; + + public void register(Codec codec) { + if (codec.name == null) { + throw new IllegalArgumentException("code.name is null"); + } + + if (!codecs.containsKey(codec.name)) { + codecs.put(codec.name, codec); + codec.getExtensions(knownExtensions); + } else if (codecs.get(codec.name) != codec) { + throw new IllegalArgumentException("codec '" + codec.name + "' is already registered as a different codec instance"); + } + } + + public Collection getAllExtensions() { + return knownExtensions; + } + + public Codec lookup(String name) { + final Codec codec = (Codec) codecs.get(name); + if (codec == null) + throw new IllegalArgumentException("required codec '" + name + "' not found"); + return codec; + } + + public abstract Codec getWriter(SegmentWriteState state); + + static private final CodecProvider defaultCodecs = new DefaultCodecProvider(); + + public static CodecProvider getDefault() { + return defaultCodecs; + } + + /** Used for testing. @lucene.internal */ + public static void setDefaultCodec(String s) { + defaultCodec = s; + } + /** Used for testing. @lucene.internal */ + public static String getDefaultCodec() { + return defaultCodec; + } +} + +class DefaultCodecProvider extends CodecProvider { + DefaultCodecProvider() { + register(new StandardCodec()); + register(new IntBlockCodec()); + register(new PreFlexCodec()); + register(new PulsingCodec()); + register(new SepCodec()); + } + + @Override + public Codec getWriter(SegmentWriteState state) { + return lookup(CodecProvider.getDefaultCodec()); + //return lookup("Pulsing"); + //return lookup("Sep"); + //return lookup("IntBlock"); + } +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java new file mode 100644 index 00000000000..0c0bcf86569 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java @@ -0,0 +1,51 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
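// Editor's illustrative sketch, not part of this patch: the CodecProvider above is
// meant to be subclassed, registering codecs in the constructor. The codec class and
// name "MyCustomCodec"/"MyCustom" are made up for this example.
class MyCodecProvider extends CodecProvider {
  MyCodecProvider() {
    register(new StandardCodec());   // keep a core codec so existing segments stay readable
    register(new MyCustomCodec());   // hypothetical Codec subclass whose name is "MyCustom"
  }

  @Override
  public Codec getWriter(SegmentWriteState state) {
    return lookup("MyCustom");       // write every new segment with the custom codec
  }
}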
+ */ + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FieldsEnum; + +import java.io.IOException; + +/** Abstract API that consumes terms, doc, freq, prox and + * payloads postings. Concrete implementations of this + * actually do "something" with the postings (write it into + * the index in a specific format). + * + * @lucene.experimental + */ +public abstract class FieldsConsumer { + + /** Add a new field */ + public abstract TermsConsumer addField(FieldInfo field) throws IOException; + + /** Called when we are done adding everything. */ + public abstract void close() throws IOException; + + public void merge(MergeState mergeState, Fields fields) throws IOException { + FieldsEnum fieldsEnum = fields.iterator(); + assert fieldsEnum != null; + String field; + while((field = fieldsEnum.next()) != null) { + mergeState.fieldInfo = mergeState.fieldInfos.fieldInfo(field); + final TermsConsumer termsConsumer = addField(mergeState.fieldInfo); + termsConsumer.merge(mergeState, fieldsEnum.terms()); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/FieldsProducer.java similarity index 77% rename from lucene/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java rename to lucene/src/java/org/apache/lucene/index/codecs/FieldsProducer.java index 8118bc1baa9..a378680328e 100644 --- a/lucene/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/FieldsProducer.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,7 +17,10 @@ package org.apache.lucene.index; * limitations under the License. */ +import org.apache.lucene.index.Fields; + import java.io.IOException; +import java.io.Closeable; /** Abstract API that consumes terms, doc, freq, prox and * payloads postings. Concrete implementations of this @@ -26,11 +29,8 @@ import java.io.IOException; * * @lucene.experimental */ -abstract class FormatPostingsFieldsConsumer { - /** Add a new field */ - abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException; - - /** Called when we are done adding everything. */ - abstract void finish() throws IOException; +public abstract class FieldsProducer extends Fields implements Closeable { + public abstract void close() throws IOException; + public abstract void loadTermsIndex(int indexDivisor) throws IOException; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsAndPositionsEnum.java b/lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsAndPositionsEnum.java new file mode 100644 index 00000000000..7aed4cea9c8 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsAndPositionsEnum.java @@ -0,0 +1,121 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.MultiDocsAndPositionsEnum; +import java.io.IOException; + +/** + * Exposes flex API, merged from flex API of sub-segments, + * remapping docIDs (this is used for segment merging). + * + * @lucene.experimental + */ + +public final class MappingMultiDocsAndPositionsEnum extends DocsAndPositionsEnum { + private MultiDocsAndPositionsEnum.EnumWithSlice[] subs; + int numSubs; + int upto; + int[] currentMap; + DocsAndPositionsEnum current; + int currentBase; + int doc = -1; + private MergeState mergeState; + + MappingMultiDocsAndPositionsEnum reset(MultiDocsAndPositionsEnum postingsEnum) throws IOException { + this.numSubs = postingsEnum.getNumSubs(); + this.subs = postingsEnum.getSubs(); + upto = -1; + current = null; + return this; + } + + public void setMergeState(MergeState mergeState) { + this.mergeState = mergeState; + } + + @Override + public int freq() { + return current.freq(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (current == null) { + if (upto == numSubs-1) { + return this.doc = NO_MORE_DOCS; + } else { + upto++; + final int reader = subs[upto].slice.readerIndex; + current = subs[upto].docsAndPositionsEnum; + currentBase = mergeState.docBase[reader]; + currentMap = mergeState.docMaps[reader]; + } + } + + int doc = current.nextDoc(); + if (doc != NO_MORE_DOCS) { + if (currentMap != null) { + // compact deletions + doc = currentMap[doc]; + if (doc == -1) { + continue; + } + } + return this.doc = currentBase + doc; + } else { + current = null; + } + } + } + + @Override + public int nextPosition() throws IOException { + return current.nextPosition(); + } + + + @Override + public int getPayloadLength() { + return current.getPayloadLength(); + } + + @Override + public BytesRef getPayload() throws IOException { + return current.getPayload(); + } + + @Override + public boolean hasPayload() { + return current.hasPayload(); + } +} + diff --git a/lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsEnum.java b/lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsEnum.java new file mode 100644 index 00000000000..314d6ff89e8 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/MappingMultiDocsEnum.java @@ -0,0 +1,99 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.MultiDocsEnum; +import java.io.IOException; + +/** + * Exposes flex API, merged from flex API of sub-segments, + * remapping docIDs (this is used for segment merging). + * + * @lucene.experimental + */ + +public final class MappingMultiDocsEnum extends DocsEnum { + private MultiDocsEnum.EnumWithSlice[] subs; + int numSubs; + int upto; + int[] currentMap; + DocsEnum current; + int currentBase; + int doc = -1; + private MergeState mergeState; + + MappingMultiDocsEnum reset(MultiDocsEnum docsEnum) throws IOException { + this.numSubs = docsEnum.getNumSubs(); + this.subs = docsEnum.getSubs(); + upto = -1; + current = null; + return this; + } + + public void setMergeState(MergeState mergeState) { + this.mergeState = mergeState; + } + + @Override + public int freq() { + return current.freq(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (current == null) { + if (upto == numSubs-1) { + return this.doc = NO_MORE_DOCS; + } else { + upto++; + final int reader = subs[upto].slice.readerIndex; + current = subs[upto].docsEnum; + currentBase = mergeState.docBase[reader]; + currentMap = mergeState.docMaps[reader]; + } + } + + int doc = current.nextDoc(); + if (doc != NO_MORE_DOCS) { + if (currentMap != null) { + // compact deletions + doc = currentMap[doc]; + if (doc == -1) { + continue; + } + } + return this.doc = currentBase + doc; + } else { + current = null; + } + } + } +} + diff --git a/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java b/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java new file mode 100644 index 00000000000..d0bcd22c6f8 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java @@ -0,0 +1,42 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.Bits; +import java.util.List; + +/** Holds common state used during segment merging + * + * @lucene.experimental */ +public class MergeState { + public FieldInfos fieldInfos; + public List readers; // Readers being merged + public int readerCount; // Number of readers being merged + public int[][] docMaps; // Maps docIDs around deletions + public int[] delCounts; // Deletion count per reader + public int[] docBase; // New docID base per reader + public int mergedDocCount; // Total # merged docs + public Bits multiDeletedDocs; + + // Updated per field; + public FieldInfo fieldInfo; +} + diff --git a/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java b/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java new file mode 100644 index 00000000000..0f65c818b16 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java @@ -0,0 +1,281 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.BufferedIndexInput; +import org.apache.lucene.store.IndexInput; + +/** + * This abstract class reads skip lists with multiple levels. + * + * See {@link MultiLevelSkipListWriter} for the information about the encoding + * of the multi level skip lists. + * + * Subclasses must implement the abstract method {@link #readSkipData(int, IndexInput)} + * which defines the actual format of the skip data. + * @lucene.experimental + */ + +public abstract class MultiLevelSkipListReader { + // the maximum number of skip levels possible for this index + protected int maxNumberOfSkipLevels; + + // number of levels in this skip list + private int numberOfSkipLevels; + + // Expert: defines the number of top skip levels to buffer in memory. + // Reducing this number results in less memory usage, but possibly + // slower performance due to more random I/Os. + // Please notice that the space each level occupies is limited by + // the skipInterval. The top level can not contain more than + // skipLevel entries, the second top level can not contain more + // than skipLevel^2 entries and so forth. 
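// Editor's worked example, not part of this patch: with skipInterval=16 and a posting
// list of 1,000,000 docs, loadSkipLevels() computes floor(log(1,000,000)/log(16)) = 4
// skip levels holding roughly 62500, 3906, 244 and 15 entries from bottom to top, so
// buffering only the top level (the default numberOfLevelsToBuffer = 1 declared just
// below) keeps almost nothing extra in memory.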
+ private int numberOfLevelsToBuffer = 1; + + private int docCount; + private boolean haveSkipped; + + private IndexInput[] skipStream; // skipStream for each level + private long skipPointer[]; // the start pointer of each skip level + private int skipInterval[]; // skipInterval of each level + private int[] numSkipped; // number of docs skipped per level + + private int[] skipDoc; // doc id of current skip entry per level + private int lastDoc; // doc id of last read skip entry with docId <= target + private long[] childPointer; // child pointer of current skip entry per level + private long lastChildPointer; // childPointer of last read skip entry with docId <= target + + private boolean inputIsBuffered; + + public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { + this.skipStream = new IndexInput[maxSkipLevels]; + this.skipPointer = new long[maxSkipLevels]; + this.childPointer = new long[maxSkipLevels]; + this.numSkipped = new int[maxSkipLevels]; + this.maxNumberOfSkipLevels = maxSkipLevels; + this.skipInterval = new int[maxSkipLevels]; + this.skipStream [0]= skipStream; + this.inputIsBuffered = (skipStream instanceof BufferedIndexInput); + this.skipInterval[0] = skipInterval; + for (int i = 1; i < maxSkipLevels; i++) { + // cache skip intervals + this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval; + } + skipDoc = new int[maxSkipLevels]; + } + + + /** Returns the id of the doc to which the last call of {@link #skipTo(int)} + * has skipped. */ + public int getDoc() { + return lastDoc; + } + + + /** Skips entries to the first beyond the current whose document number is + * greater than or equal to target. Returns the current doc count. + */ + public int skipTo(int target) throws IOException { + if (!haveSkipped) { + // first time, load skip levels + loadSkipLevels(); + haveSkipped = true; + } + + // walk up the levels until highest level is found that has a skip + // for this target + int level = 0; + while (level < numberOfSkipLevels - 1 && target > skipDoc[level + 1]) { + level++; + } + + while (level >= 0) { + if (target > skipDoc[level]) { + if (!loadNextSkip(level)) { + continue; + } + } else { + // no more skips on this level, go down one level + if (level > 0 && lastChildPointer > skipStream[level - 1].getFilePointer()) { + seekChild(level - 1); + } + level--; + } + } + + return numSkipped[0] - skipInterval[0] - 1; + } + + private boolean loadNextSkip(int level) throws IOException { + // we have to skip, the target document is greater than the current + // skip list entry + setLastSkipData(level); + + numSkipped[level] += skipInterval[level]; + + if (numSkipped[level] > docCount) { + // this skip list is exhausted + skipDoc[level] = Integer.MAX_VALUE; + if (numberOfSkipLevels > level) numberOfSkipLevels = level; + return false; + } + + // read next skip entry + skipDoc[level] += readSkipData(level, skipStream[level]); + + if (level != 0) { + // read the child pointer if we are not on the leaf level + childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; + } + + return true; + + } + + /** Seeks the skip entry on the given level */ + protected void seekChild(int level) throws IOException { + skipStream[level].seek(lastChildPointer); + numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1]; + skipDoc[level] = lastDoc; + if (level > 0) { + childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; + } + } + + public void close() throws IOException { + for (int i = 1; i < 
skipStream.length; i++) { + if (skipStream[i] != null) { + skipStream[i].close(); + } + } + } + + /** initializes the reader */ + public void init(long skipPointer, int df) { + this.skipPointer[0] = skipPointer; + this.docCount = df; + Arrays.fill(skipDoc, 0); + Arrays.fill(numSkipped, 0); + Arrays.fill(childPointer, 0); + + haveSkipped = false; + for (int i = 1; i < numberOfSkipLevels; i++) { + skipStream[i] = null; + } + } + + /** Loads the skip levels */ + private void loadSkipLevels() throws IOException { + numberOfSkipLevels = docCount == 0 ? 0 : (int) Math.floor(Math.log(docCount) / Math.log(skipInterval[0])); + if (numberOfSkipLevels > maxNumberOfSkipLevels) { + numberOfSkipLevels = maxNumberOfSkipLevels; + } + + skipStream[0].seek(skipPointer[0]); + + int toBuffer = numberOfLevelsToBuffer; + + for (int i = numberOfSkipLevels - 1; i > 0; i--) { + // the length of the current level + long length = skipStream[0].readVLong(); + + // the start pointer of the current level + skipPointer[i] = skipStream[0].getFilePointer(); + if (toBuffer > 0) { + // buffer this level + skipStream[i] = new SkipBuffer(skipStream[0], (int) length); + toBuffer--; + } else { + // clone this stream, it is already at the start of the current level + skipStream[i] = (IndexInput) skipStream[0].clone(); + if (inputIsBuffered && length < BufferedIndexInput.BUFFER_SIZE) { + ((BufferedIndexInput) skipStream[i]).setBufferSize((int) length); + } + + // move base stream beyond the current level + skipStream[0].seek(skipStream[0].getFilePointer() + length); + } + } + + // use base stream for the lowest level + skipPointer[0] = skipStream[0].getFilePointer(); + } + + /** + * Subclasses must implement the actual skip data encoding in this method. + * + * @param level the level skip data shall be read from + * @param skipStream the skip stream to read from + */ + protected abstract int readSkipData(int level, IndexInput skipStream) throws IOException; + + /** Copies the values of the last read skip entry on this level */ + protected void setLastSkipData(int level) { + lastDoc = skipDoc[level]; + lastChildPointer = childPointer[level]; + } + + + /** used to buffer the top skip levels */ + private final static class SkipBuffer extends IndexInput { + private byte[] data; + private long pointer; + private int pos; + + SkipBuffer(IndexInput input, int length) throws IOException { + data = new byte[length]; + pointer = input.getFilePointer(); + input.readBytes(data, 0, length); + } + + @Override + public void close() throws IOException { + data = null; + } + + @Override + public long getFilePointer() { + return pointer + pos; + } + + @Override + public long length() { + return data.length; + } + + @Override + public byte readByte() throws IOException { + return data[pos++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + System.arraycopy(data, pos, b, offset, len); + pos += len; + } + + @Override + public void seek(long pos) throws IOException { + this.pos = (int) (pos - pointer); + } + + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListWriter.java new file mode 100644 index 00000000000..46cf791ccb3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListWriter.java @@ -0,0 +1,153 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
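A hedged sketch of how the abstract reader above might be subclassed, assuming a made-up skip format in which each skip entry is a single vInt doc delta (real codecs also store file pointers per entry; this shows only the contract):

import java.io.IOException;
import org.apache.lucene.index.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;

// Hypothetical subclass, not part of the patch: every skip entry stores only
// a vInt-encoded doc delta, which the base class accumulates into skipDoc[level].
class DocDeltaSkipListReader extends MultiLevelSkipListReader {

  DocDeltaSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
    super(skipStream, maxSkipLevels, skipInterval);
  }

  @Override
  protected int readSkipData(int level, IndexInput skipStream) throws IOException {
    // Return the doc delta for this skip entry on this level.
    return skipStream.readVInt();
  }
}

A caller would invoke init(skipFilePointer, docFreq) once per term and then skipTo(target) before scanning the postings for that term.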
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; + +/** + * This abstract class writes skip lists with multiple levels. + * + * Example for skipInterval = 3: + * c (skip level 2) + * c c c (skip level 1) + * x x x x x x x x x x (skip level 0) + * d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list) + * 3 6 9 12 15 18 21 24 27 30 (df) + * + * d - document + * x - skip data + * c - skip data with child pointer + * + * Skip level i contains every skipInterval-th entry from skip level i-1. + * Therefore the number of entries on level i is: floor(df / ((skipInterval ^ (i + 1))). + * + * Each skip entry on a level i>0 contains a pointer to the corresponding skip entry in list i-1. + * This guarantees a logarithmic amount of skips to find the target document. + * + * While this class takes care of writing the different skip levels, + * subclasses must define the actual format of the skip data. + * @lucene.experimental + */ + +public abstract class MultiLevelSkipListWriter { + // number of levels in this skip list + protected int numberOfSkipLevels; + + // the skip interval in the list with level = 0 + private int skipInterval; + + // for every skip level a different buffer is used + private RAMOutputStream[] skipBuffer; + + protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) { + this.skipInterval = skipInterval; + + // calculate the maximum number of skip levels for this document frequency + numberOfSkipLevels = df == 0 ? 0 : (int) Math.floor(Math.log(df) / Math.log(skipInterval)); + + // make sure it does not exceed maxSkipLevels + if (numberOfSkipLevels > maxSkipLevels) { + numberOfSkipLevels = maxSkipLevels; + } + } + + protected void init() { + skipBuffer = new RAMOutputStream[numberOfSkipLevels]; + for (int i = 0; i < numberOfSkipLevels; i++) { + skipBuffer[i] = new RAMOutputStream(); + } + } + + protected void resetSkip() { + // creates new buffers or empties the existing ones + if (skipBuffer == null) { + init(); + } else { + for (int i = 0; i < skipBuffer.length; i++) { + skipBuffer[i].reset(); + } + } + } + + /** + * Subclasses must implement the actual skip data encoding in this method. + * + * @param level the level skip data shall be writing for + * @param skipBuffer the skip buffer to write to + */ + protected abstract void writeSkipData(int level, IndexOutput skipBuffer) throws IOException; + + /** + * Writes the current skip data to the buffers. The current document frequency determines + * the max level is skip data is to be written to. 
+ * + * @param df the current document frequency + * @throws IOException + */ + public void bufferSkip(int df) throws IOException { + int numLevels; + + // determine max level + for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval) { + numLevels++; + } + + long childPointer = 0; + + for (int level = 0; level < numLevels; level++) { + writeSkipData(level, skipBuffer[level]); + + long newChildPointer = skipBuffer[level].getFilePointer(); + + if (level != 0) { + // store child pointers for all levels except the lowest + skipBuffer[level].writeVLong(childPointer); + } + + //remember the childPointer for the next level + childPointer = newChildPointer; + } + } + + /** + * Writes the buffered skip lists to the given output. + * + * @param output the IndexOutput the skip lists shall be written to + * @return the pointer the skip list starts + */ + public long writeSkip(IndexOutput output) throws IOException { + long skipPointer = output.getFilePointer(); + //System.out.println("skipper.writeSkip fp=" + skipPointer); + if (skipBuffer == null || skipBuffer.length == 0) return skipPointer; + + for (int level = numberOfSkipLevels - 1; level > 0; level--) { + long length = skipBuffer[level].getFilePointer(); + if (length > 0) { + output.writeVLong(length); + skipBuffer[level].writeTo(output); + } + } + skipBuffer[0].writeTo(output); + + return skipPointer; + } + +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java new file mode 100644 index 00000000000..3449ff313ed --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java @@ -0,0 +1,97 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BytesRef; + +/** + * @lucene.experimental + */ + +public abstract class PostingsConsumer { + + /** Adds a new doc in this term. Return null if this + * consumer doesn't need to see the positions for this + * doc. */ + public abstract void startDoc(int docID, int termDocFreq) throws IOException; + + public static class PostingsMergeState { + DocsEnum docsEnum; + int[] docMap; + int docBase; + } + + /** Add a new position & payload. A null payload means no + * payload; a non-null payload with zero length also + * means no payload. Caller may reuse the {@link + * BytesRef} for the payload between calls (method must + * fully consume the payload). 
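Mirroring that, a minimal writer-side sketch under the same invented doc-delta-only format; bufferSkip(df) above calls writeSkipData once per level that is due at the current document frequency. A real implementation would also override resetSkip() to clear its per-level state:

import java.io.IOException;
import org.apache.lucene.index.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.store.IndexOutput;

// Hypothetical counterpart to the reader sketch: each skip entry is one vInt
// doc delta per level; lastSkipDoc[] keeps the previous entry so deltas stay small.
class DocDeltaSkipListWriter extends MultiLevelSkipListWriter {
  private final int[] lastSkipDoc;
  private int curDoc;

  DocDeltaSkipListWriter(int skipInterval, int maxSkipLevels, int df) {
    super(skipInterval, maxSkipLevels, df);
    lastSkipDoc = new int[numberOfSkipLevels];
  }

  // Caller sets the current doc, then calls bufferSkip(df) every skipInterval docs.
  void setSkipDoc(int doc) {
    curDoc = doc;
  }

  @Override
  protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
    skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
    lastSkipDoc[level] = curDoc;
  }
}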
*/ + public abstract void addPosition(int position, BytesRef payload) throws IOException; + + /** Called when we are done adding positions & payloads + * for each doc */ + public abstract void finishDoc() throws IOException; + + /** Default merge impl: append documents, mapping around + * deletes */ + public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException { + + int df = 0; + + if (mergeState.fieldInfo.omitTermFreqAndPositions) { + while(true) { + final int doc = postings.nextDoc(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + this.startDoc(doc, postings.freq()); + this.finishDoc(); + df++; + } + } else { + final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings; + while(true) { + final int doc = postingsEnum.nextDoc(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + final int freq = postingsEnum.freq(); + this.startDoc(doc, freq); + for(int i=0;i<freq;i++) { + final int position = postingsEnum.nextPosition(); + final BytesRef payload; + if (postingsEnum.getPayloadLength() > 0) { + payload = postingsEnum.getPayload(); + } else { + payload = null; + } + this.addPosition(position, payload); + } + this.finishDoc(); + df++; + } + } + return df; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java new file mode 100644 index 00000000000..81501b54257 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java @@ -0,0 +1,99 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiDocsEnum; +import org.apache.lucene.index.MultiDocsAndPositionsEnum; + +import org.apache.lucene.util.BytesRef; + +/** + * @lucene.experimental + */ + +public abstract class TermsConsumer { + + /** Starts a new term in this field; this may be called + * with no corresponding call to finish if the term had + * no docs. */ + public abstract PostingsConsumer startTerm(BytesRef text) throws IOException; + + /** Finishes the current term; numDocs must be > 0. */ + public abstract void finishTerm(BytesRef text, int numDocs) throws IOException; + + /** Called when we are done adding terms to this field */ + public abstract void finish() throws IOException; + + /** Return the BytesRef Comparator used to sort terms + * before feeding to this API. 
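To make the PostingsConsumer contract concrete, a hypothetical driver (not in the patch) showing the startDoc/addPosition/finishDoc sequence for one term occurring in two documents; the BytesRef byte[] constructor is assumed here for the payload:

import java.io.IOException;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.util.BytesRef;

// Hypothetical driver code: writes postings for one term that occurs in
// doc 3 (freq 2, positions 5 and 9, no payloads) and doc 7 (freq 1,
// position 0 with a 2-byte payload).
final class PostingsConsumerUsageSketch {
  static void writeOneTerm(PostingsConsumer consumer) throws IOException {
    consumer.startDoc(3, 2);              // docID=3, termDocFreq=2
    consumer.addPosition(5, null);        // null payload == no payload
    consumer.addPosition(9, null);
    consumer.finishDoc();

    consumer.startDoc(7, 1);
    consumer.addPosition(0, new BytesRef(new byte[] {0x12, 0x34}));
    consumer.finishDoc();
  }
}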
*/ + public abstract Comparator getComparator() throws IOException; + + /** Default merge impl */ + private MappingMultiDocsEnum docsEnum = null; + private MappingMultiDocsAndPositionsEnum postingsEnum = null; + + public void merge(MergeState mergeState, TermsEnum termsEnum) throws IOException { + + BytesRef term; + assert termsEnum != null; + + if (mergeState.fieldInfo.omitTermFreqAndPositions) { + if (docsEnum == null) { + docsEnum = new MappingMultiDocsEnum(); + } + docsEnum.setMergeState(mergeState); + + MultiDocsEnum docsEnumIn = null; + + while((term = termsEnum.next()) != null) { + docsEnumIn = (MultiDocsEnum) termsEnum.docs(mergeState.multiDeletedDocs, docsEnumIn); + if (docsEnumIn != null) { + docsEnum.reset(docsEnumIn); + final PostingsConsumer postingsConsumer = startTerm(term); + final int numDocs = postingsConsumer.merge(mergeState, docsEnum); + if (numDocs > 0) { + finishTerm(term, numDocs); + } + } + } + } else { + if (postingsEnum == null) { + postingsEnum = new MappingMultiDocsAndPositionsEnum(); + } + postingsEnum.setMergeState(mergeState); + MultiDocsAndPositionsEnum postingsEnumIn = null; + while((term = termsEnum.next()) != null) { + postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(mergeState.multiDeletedDocs, postingsEnumIn); + if (postingsEnumIn != null) { + postingsEnum.reset(postingsEnumIn); + final PostingsConsumer postingsConsumer = startTerm(term); + final int numDocs = postingsConsumer.merge(mergeState, postingsEnum); + if (numDocs > 0) { + finishTerm(term, numDocs); + } + } + } + } + + finish(); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java new file mode 100644 index 00000000000..42636f46c8f --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java @@ -0,0 +1,190 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Naive int block API that writes vInts. This is + * expected to give poor performance; it's really only for + * testing the pluggability. One should typically use pfor instead. */ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.sep.IntIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IntsRef; + +/** Abstract base class that reads fixed-size blocks of ints + * from an IndexInput. While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexInput inside Directory. Wrapping a generic + * IndexInput will likely cost performance. 
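One level up, a hypothetical field writer driving TermsConsumer; terms must arrive in the order defined by getComparator(), and the BytesRef String constructor is assumed here purely for brevity:

import java.io.IOException;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.util.BytesRef;

// Hypothetical driver, not part of the patch: writes two terms for one field,
// assuming "apple" sorts before "banana" under the consumer's comparator.
final class TermsConsumerUsageSketch {
  static void writeOneField(TermsConsumer terms) throws IOException {
    PostingsConsumer postings = terms.startTerm(new BytesRef("apple"));
    postings.startDoc(0, 1);
    postings.addPosition(4, null);
    postings.finishDoc();
    terms.finishTerm(new BytesRef("apple"), 1);   // one doc had "apple"

    postings = terms.startTerm(new BytesRef("banana"));
    postings.startDoc(2, 1);
    postings.addPosition(0, null);
    postings.finishDoc();
    terms.finishTerm(new BytesRef("banana"), 1);

    terms.finish();                               // done with this field
  }
}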
+ * + * @lucene.experimental + */ +public abstract class FixedIntBlockIndexInput extends IntIndexInput { + + private IndexInput in; + protected int blockSize; + + protected void init(final IndexInput in) throws IOException { + this.in = in; + blockSize = in.readVInt(); + } + + @Override + public Reader reader() throws IOException { + final int[] buffer = new int[blockSize]; + final IndexInput clone = (IndexInput) in.clone(); + // TODO: can this be simplified? + return new Reader(clone, buffer, this.getBlockReader(clone, buffer)); + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public Index index() { + return new Index(); + } + + protected abstract BlockReader getBlockReader(IndexInput in, int[] buffer) throws IOException; + + public interface BlockReader { + public void readBlock() throws IOException; + } + + private static class Reader extends IntIndexInput.Reader { + private final IndexInput in; + + protected final int[] pending; + int upto; + + private boolean seekPending; + private long pendingFP; + private int pendingUpto; + private long lastBlockFP; + private final BlockReader blockReader; + private final int blockSize; + private final IntsRef bulkResult = new IntsRef(); + + public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) + throws IOException { + this.in = in; + this.pending = pending; + this.blockSize = pending.length; + bulkResult.ints = pending; + this.blockReader = blockReader; + upto = blockSize; + } + + void seek(final long fp, final int upto) { + pendingFP = fp; + pendingUpto = upto; + seekPending = true; + } + + private void maybeSeek() throws IOException { + if (seekPending) { + if (pendingFP != lastBlockFP) { + // need new block + in.seek(pendingFP); + lastBlockFP = pendingFP; + blockReader.readBlock(); + } + upto = pendingUpto; + seekPending = false; + } + } + + @Override + public int next() throws IOException { + this.maybeSeek(); + if (upto == blockSize) { + lastBlockFP = in.getFilePointer(); + blockReader.readBlock(); + upto = 0; + } + + return pending[upto++]; + } + + @Override + public IntsRef read(final int count) throws IOException { + this.maybeSeek(); + if (upto == blockSize) { + blockReader.readBlock(); + upto = 0; + } + bulkResult.offset = upto; + if (upto + count < blockSize) { + bulkResult.length = count; + upto += count; + } else { + bulkResult.length = blockSize - upto; + upto = blockSize; + } + + return bulkResult; + } + } + + private class Index extends IntIndexInput.Index { + private long fp; + private int upto; + + @Override + public void read(final IndexInput indexIn, final boolean absolute) throws IOException { + if (absolute) { + fp = indexIn.readVLong(); + upto = indexIn.readVInt(); + } else { + final long delta = indexIn.readVLong(); + if (delta == 0) { + // same block + upto += indexIn.readVInt(); + } else { + // new block + fp += delta; + upto = indexIn.readVInt(); + } + } + assert upto < blockSize; + } + + @Override + public void seek(final IntIndexInput.Reader other) throws IOException { + ((Reader) other).seek(fp, upto); + } + + @Override + public void set(final IntIndexInput.Index other) { + final Index idx = (Index) other; + fp = idx.fp; + upto = idx.upto; + } + + @Override + public Object clone() { + Index other = new Index(); + other.fp = fp; + other.upto = upto; + return other; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java 
b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java new file mode 100644 index 00000000000..65e78d12096 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java @@ -0,0 +1,118 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Naive int block API that writes vInts. This is + * expected to give poor performance; it's really only for + * testing the pluggability. One should typically use pfor instead. */ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.sep.IntIndexOutput; +import org.apache.lucene.store.IndexOutput; + +/** Abstract base class that writes fixed-size blocks of ints + * to an IndexOutput. While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexOutput inside Directory. Wrapping a generic + * IndexInput will likely cost performance. + * + * @lucene.experimental + */ +public abstract class FixedIntBlockIndexOutput extends IntIndexOutput { + + private IndexOutput out; + private int blockSize; + private int[] pending; + private int upto; + + protected void init(IndexOutput out, int fixedBlockSize) throws IOException { + blockSize = fixedBlockSize; + out.writeVInt(blockSize); + this.out = out; + pending = new int[blockSize]; + } + + protected abstract void flushBlock(int[] buffer, IndexOutput out) throws IOException; + + @Override + public Index index() throws IOException { + return new Index(); + } + + private class Index extends IntIndexOutput.Index { + long fp; + int upto; + long lastFP; + int lastUpto; + + @Override + public void mark() throws IOException { + fp = out.getFilePointer(); + upto = FixedIntBlockIndexOutput.this.upto; + } + + @Override + public void set(IntIndexOutput.Index other) throws IOException { + Index idx = (Index) other; + lastFP = fp = idx.fp; + lastUpto = upto = idx.upto; + } + + @Override + public void write(IndexOutput indexOut, boolean absolute) throws IOException { + if (absolute) { + indexOut.writeVLong(fp); + indexOut.writeVInt(upto); + } else if (fp == lastFP) { + // same block + indexOut.writeVLong(0); + assert upto >= lastUpto; + indexOut.writeVLong(upto - lastUpto); + } else { + // new block + indexOut.writeVLong(fp - lastFP); + indexOut.writeVLong(upto); + } + lastUpto = upto; + lastFP = fp; + } + } + + @Override + public void write(int v) throws IOException { + pending[upto++] = v; + if (upto == blockSize) { + flushBlock(pending, out); + upto = 0; + } + } + + @Override + public void close() throws IOException { + try { + if (upto > 0) { + // NOTE: entries in the block after current upto are + // invalid + flushBlock(pending, out); + } + } finally { + out.close(); + } + } +} diff 
--git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java new file mode 100644 index 00000000000..1650aaa74c1 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java @@ -0,0 +1,140 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.sep.SepCodec; +import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; +import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; +import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; + +/** + * @lucene.experimental + */ +public class IntBlockCodec extends Codec { + + public IntBlockCodec() { + name = "IntBlock"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + StandardPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new SimpleIntBlockFactory(1024)); + + boolean success = false; + StandardTermsIndexWriter indexWriter; + try { + indexWriter = new SimpleStandardTermsIndexWriter(state); + success = true; + } finally { + if (!success) { + postingsWriter.close(); + } + } + + success = false; + try { + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + success = true; + return ret; + } finally { + if (!success) { + try { + postingsWriter.close(); + } finally { + indexWriter.close(); + } + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + StandardPostingsReader postingsReader = new SepPostingsReaderImpl(state.dir, + state.segmentInfo, + 
state.readBufferSize, + new SimpleIntBlockFactory(1024)); + + StandardTermsIndexReader indexReader; + boolean success = false; + try { + indexReader = new SimpleStandardTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUTF16Comparator()); + success = true; + } finally { + if (!success) { + postingsReader.close(); + } + } + + success = false; + try { + FieldsProducer ret = new StandardTermsDictReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.readBufferSize, + BytesRef.getUTF8SortedAsUTF16Comparator(), + StandardCodec.TERMS_CACHE_SIZE); + success = true; + return ret; + } finally { + if (!success) { + try { + postingsReader.close(); + } finally { + indexReader.close(); + } + } + } + } + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, Set files) { + SepPostingsReaderImpl.files(segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set extensions) { + SepCodec.getSepExtensions(extensions); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockFactory.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockFactory.java new file mode 100644 index 00000000000..974a80e6a58 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockFactory.java @@ -0,0 +1,41 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
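The boolean success / try-finally pattern in fieldsConsumer and fieldsProducer above is the general close-on-failure idiom; a stripped-down sketch with placeholder resource types (nothing Lucene-specific):

import java.io.Closeable;
import java.io.IOException;

// Sketch of the "success flag" idiom used by IntBlockCodec: if opening the
// second resource throws, the first is closed instead of being leaked.
final class OpenSecondOrCloseFirst {
  interface Factory { Closeable open() throws IOException; }

  static Closeable openSecond(Closeable first, Factory factory) throws IOException {
    boolean success = false;
    try {
      final Closeable second = factory.open();
      success = true;
      return second;
    } finally {
      if (!success) {
        first.close();   // undo the part that already succeeded
      }
    }
  }
}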
+ */ + +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.codecs.sep.IntStreamFactory; +import org.apache.lucene.index.codecs.sep.IntIndexInput; +import org.apache.lucene.index.codecs.sep.IntIndexOutput; + +import java.io.IOException; + +/** @lucene.experimental */ +public class SimpleIntBlockFactory extends IntStreamFactory { + private final int blockSize; + public SimpleIntBlockFactory(int blockSize) { + this.blockSize = blockSize; + } + @Override + public IntIndexInput openInput(Directory dir, String fileName, int readBufferSize) throws IOException { + return new SimpleIntBlockIndexInput(dir, fileName, readBufferSize); + } + @Override + public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException { + return new SimpleIntBlockIndexOutput(dir, fileName, blockSize); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java new file mode 100644 index 00000000000..cb137ab6af1 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java @@ -0,0 +1,67 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Naive int block API that writes vInts. This is + * expected to give poor performance; it's really only for + * testing the pluggability. One should typically use pfor instead. */ + +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; + +import java.io.IOException; + +/** + * Don't use this class!! It naively encodes ints one vInt + * at a time. Use it only for testing. 
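For contrast with the vInt-per-int approach, a hypothetical FixedIntBlockIndexOutput subclass that writes each block as raw 32-bit ints; a matching input subclass would read blockSize ints back in its BlockReader.readBlock(). The class and file names are invented:

import java.io.IOException;
import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;

// Hypothetical codec piece, not part of the patch: every block of ints is
// written as fixed-width 4-byte values, so a block is always 4*blockSize bytes.
public class RawIntBlockIndexOutput extends FixedIntBlockIndexOutput {

  public RawIntBlockIndexOutput(Directory dir, String fileName, int blockSize) throws IOException {
    init(dir.createOutput(fileName), blockSize);
  }

  @Override
  protected void flushBlock(int[] buffer, IndexOutput out) throws IOException {
    for (int i = 0; i < buffer.length; i++) {
      out.writeInt(buffer[i]);
    }
  }
}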
+ * + * @lucene.experimental + */ +public class SimpleIntBlockIndexInput extends FixedIntBlockIndexInput { + + public SimpleIntBlockIndexInput(Directory dir, String fileName, int readBufferSize) throws IOException { + IndexInput in = dir.openInput(fileName, readBufferSize); + CodecUtil.checkHeader(in, SimpleIntBlockIndexOutput.CODEC, SimpleIntBlockIndexOutput.VERSION_START); + init(in); + } + + private static class BlockReader implements FixedIntBlockIndexInput.BlockReader { + + private final IndexInput in; + private final int[] buffer; + + public BlockReader(IndexInput in, int[] buffer) { + this.in = in; + this.buffer = buffer; + } + + public void readBlock() throws IOException { + // silly impl + for(int i=0;i files) throws IOException { + PreFlexFields.files(dir, info, files); + } + + @Override + public void getExtensions(Set extensions) { + extensions.add(FREQ_EXTENSION); + extensions.add(PROX_EXTENSION); + extensions.add(TERMS_EXTENSION); + extensions.add(TERMS_INDEX_EXTENSION); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java new file mode 100644 index 00000000000..bd3b8b44976 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -0,0 +1,488 @@ +package org.apache.lucene.index.codecs.preflex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.TreeMap; +import java.util.Comparator; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.CompoundFileReader; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** Exposes flex API on a pre-flex index, as a codec. 
+ * @lucene.experimental */ +public class PreFlexFields extends FieldsProducer { + + public TermInfosReader tis; + public final TermInfosReader tisNoIndex; + + public final IndexInput freqStream; + public final IndexInput proxStream; + final private FieldInfos fieldInfos; + private final SegmentInfo si; + final TreeMap fields = new TreeMap(); + private final Directory dir; + private final int readBufferSize; + private Directory cfsReader; + + PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) + throws IOException { + + si = info; + TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor); + if (indexDivisor == -1) { + tisNoIndex = r; + } else { + tisNoIndex = null; + tis = r; + } + this.readBufferSize = readBufferSize; + this.fieldInfos = fieldInfos; + + // make sure that all index files have been read or are kept open + // so that if an index update removes them we'll still have them + freqStream = dir.openInput(info.name + ".frq", readBufferSize); + boolean anyProx = false; + final int numFields = fieldInfos.size(); + for(int i=0;i files) throws IOException { + files.add(IndexFileNames.segmentFileName(info.name, PreFlexCodec.TERMS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, PreFlexCodec.TERMS_INDEX_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, PreFlexCodec.FREQ_EXTENSION)); + if (info.getHasProx()) { + // LUCENE-1739: for certain versions of 2.9-dev, + // hasProx would be incorrectly computed during + // indexing as true, and then stored into the segments + // file, when it should have been false. So we do the + // extra check, here: + final String prx = IndexFileNames.segmentFileName(info.name, PreFlexCodec.PROX_EXTENSION); + if (dir.fileExists(prx)) { + files.add(prx); + } + } + } + + @Override + public FieldsEnum iterator() throws IOException { + return new PreFlexFieldsEnum(); + } + + @Override + public Terms terms(String field) { + FieldInfo fi = fieldInfos.fieldInfo(field); + if (fi != null) { + return new PreTerms(fi); + } else { + return null; + } + } + + synchronized private TermInfosReader getTermsDict() { + if (tis != null) { + return tis; + } else { + return tisNoIndex; + } + } + + @Override + synchronized public void loadTermsIndex(int indexDivisor) throws IOException { + if (tis == null) { + Directory dir0; + if (si.getUseCompoundFile()) { + // In some cases, we were originally opened when CFS + // was not used, but then we are asked to open the + // terms reader with index, the segment has switched + // to CFS + + if (!(dir instanceof CompoundFileReader)) { + dir0 = cfsReader = new CompoundFileReader(dir, IndexFileNames.segmentFileName(si.name, IndexFileNames.COMPOUND_FILE_EXTENSION), readBufferSize); + } else { + dir0 = dir; + } + dir0 = cfsReader; + } else { + dir0 = dir; + } + + tis = new TermInfosReader(dir0, si.name, fieldInfos, readBufferSize, indexDivisor); + } + } + + @Override + public void close() throws IOException { + if (tis != null) { + tis.close(); + } + if (tisNoIndex != null) { + tisNoIndex.close(); + } + if (cfsReader != null) { + cfsReader.close(); + } + } + + private class PreFlexFieldsEnum extends FieldsEnum { + final Iterator it; + private final PreTermsEnum termsEnum; + private int count; + FieldInfo current; + + public PreFlexFieldsEnum() throws IOException { + it = fields.values().iterator(); + termsEnum = new PreTermsEnum(); + } + + @Override + public String next() { + if (it.hasNext()) { + count++; + 
current = it.next(); + return current.name; + } else { + return null; + } + } + + @Override + public TermsEnum terms() throws IOException { + termsEnum.reset(current, count == 1); + return termsEnum; + } + } + + private class PreTerms extends Terms { + final FieldInfo fieldInfo; + PreTerms(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + } + + @Override + public TermsEnum iterator() throws IOException { + PreTermsEnum termsEnum = new PreTermsEnum(); + termsEnum.reset(fieldInfo, false); + return termsEnum; + } + + @Override + public Comparator getComparator() { + // Pre-flex indexes always sorted in UTF16 order + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + } + + private class PreTermsEnum extends TermsEnum { + private SegmentTermEnum termEnum; + private FieldInfo fieldInfo; + private boolean skipNext; + private BytesRef current; + private final BytesRef scratchBytesRef = new BytesRef(); + + void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException { + this.fieldInfo = fieldInfo; + if (termEnum == null) { + // First time reset is called + if (isFirstField) { + termEnum = getTermsDict().terms(); + skipNext = false; + } else { + termEnum = getTermsDict().terms(new Term(fieldInfo.name, "")); + skipNext = true; + } + } else { + final Term t = termEnum.term(); + if (t != null && t.field() == fieldInfo.name) { + // No need to seek -- we have already advanced onto + // this field. We must be @ first term because + // flex API will not advance this enum further, on + // seeing a different field. + } else { + assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned + final TermInfosReader tis = getTermsDict(); + tis.seekEnum(termEnum, new Term(fieldInfo.name, "")); + } + skipNext = true; + } + } + + @Override + public Comparator getComparator() { + // Pre-flex indexes always sorted in UTF16 order + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + + @Override + public SeekStatus seek(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + skipNext = false; + final TermInfosReader tis = getTermsDict(); + final Term t0 = new Term(fieldInfo.name, term.utf8ToString()); + if (termEnum == null) { + termEnum = tis.terms(t0); + } else { + tis.seekEnum(termEnum, t0); + } + final Term t = termEnum.term(); + + final BytesRef tr; + if (t != null) { + tr = scratchBytesRef; + scratchBytesRef.copy(t.text()); + } else { + tr = null; + } + + if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) { + current = tr; + return SeekStatus.FOUND; + } else if (t == null || t.field() != fieldInfo.name) { + current = null; + return SeekStatus.END; + } else { + current = tr; + return SeekStatus.NOT_FOUND; + } + } + + @Override + public BytesRef next() throws IOException { + if (skipNext) { + skipNext = false; + if (termEnum.term() == null) { + return null; + } else { + scratchBytesRef.copy(termEnum.term().text()); + return current = scratchBytesRef; + } + } + if (termEnum.next()) { + final Term t = termEnum.term(); + if (t.field() == fieldInfo.name) { + scratchBytesRef.copy(t.text()); + current = scratchBytesRef; + return current; + } else { + assert !t.field().equals(fieldInfo.name); // make sure field name is interned + // Crossed into new field + return null; + } + } else { + return null; + } + } + + @Override + public 
BytesRef term() { + return current; + } + + @Override + public int docFreq() { + return termEnum.docFreq(); + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + if (reuse != null) { + return ((PreDocsEnum) reuse).reset(termEnum, skipDocs); + } else { + return (new PreDocsEnum()).reset(termEnum, skipDocs); + } + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (reuse != null) { + return ((PreDocsAndPositionsEnum) reuse).reset(termEnum, skipDocs); + } else { + return (new PreDocsAndPositionsEnum()).reset(termEnum, skipDocs); + } + } + } + + private final class PreDocsEnum extends DocsEnum { + final private SegmentTermDocs docs; + + PreDocsEnum() throws IOException { + docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos); + } + + public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException { + docs.setSkipDocs(skipDocs); + docs.seek(termEnum); + return this; + } + + @Override + public int nextDoc() throws IOException { + if (docs.next()) { + return docs.doc(); + } else { + return NO_MORE_DOCS; + } + } + + @Override + public int advance(int target) throws IOException { + if (docs.skipTo(target)) { + return docs.doc(); + } else { + return NO_MORE_DOCS; + } + } + + @Override + public int freq() { + return docs.freq(); + } + + @Override + public int docID() { + return docs.doc(); + } + + @Override + public int read() throws IOException { + if (bulkResult == null) { + initBulkResult(); + bulkResult.docs.ints = new int[32]; + bulkResult.freqs.ints = new int[32]; + } + return this.docs.read(bulkResult.docs.ints, bulkResult.freqs.ints); + } + } + + private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum { + final private SegmentTermPositions pos; + + PreDocsAndPositionsEnum() throws IOException { + pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos); + } + + public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException { + pos.setSkipDocs(skipDocs); + pos.seek(termEnum); + return this; + } + + @Override + public int nextDoc() throws IOException { + if (pos.next()) { + return pos.doc(); + } else { + return NO_MORE_DOCS; + } + } + + @Override + public int advance(int target) throws IOException { + if (pos.skipTo(target)) { + return pos.doc(); + } else { + return NO_MORE_DOCS; + } + } + + @Override + public int freq() { + return pos.freq(); + } + + @Override + public int docID() { + return pos.doc(); + } + + @Override + public int nextPosition() throws IOException { + return pos.nextPosition(); + } + + @Override + public int getPayloadLength() { + return pos.getPayloadLength(); + } + + @Override + public boolean hasPayload() { + return pos.isPayloadAvailable(); + } + + private BytesRef payload; + + @Override + public BytesRef getPayload() throws IOException { + final int len = pos.getPayloadLength(); + if (payload == null) { + payload = new BytesRef(); + payload.bytes = new byte[len]; + } else { + if (payload.bytes.length < len) { + payload.grow(len); + } + } + + payload.bytes = pos.getPayload(payload.bytes, 0); + payload.length = len; + return payload; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/SegmentTermDocs.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java similarity index 81% rename from lucene/src/java/org/apache/lucene/index/SegmentTermDocs.java rename to 
lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java index f4f337c87f3..55cda817aa7 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentTermDocs.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.preflex; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -18,15 +18,27 @@ package org.apache.lucene.index; */ import java.io.IOException; -import org.apache.lucene.util.BitVector; -import org.apache.lucene.store.IndexInput; -class SegmentTermDocs implements TermDocs { - protected SegmentReader parent; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.codecs.standard.DefaultSkipListReader; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; + +/** @deprecated + * @lucene.experimental */ +@Deprecated +public class SegmentTermDocs implements TermDocs { + //protected SegmentReader parent; + private final FieldInfos fieldInfos; + private final TermInfosReader tis; + protected Bits skipDocs; protected IndexInput freqStream; protected int count; protected int df; - protected BitVector deletedDocs; int doc = 0; int freq; @@ -43,6 +55,7 @@ class SegmentTermDocs implements TermDocs { protected boolean currentFieldStoresPayloads; protected boolean currentFieldOmitTermFreqAndPositions; + /* protected SegmentTermDocs(SegmentReader parent) { this.parent = parent; this.freqStream = (IndexInput) parent.core.freqStream.clone(); @@ -52,24 +65,37 @@ class SegmentTermDocs implements TermDocs { this.skipInterval = parent.core.getTermsReader().getSkipInterval(); this.maxSkipLevels = parent.core.getTermsReader().getMaxSkipLevels(); } + */ + + public SegmentTermDocs(IndexInput freqStream, TermInfosReader tis, FieldInfos fieldInfos) { + this.freqStream = (IndexInput) freqStream.clone(); + this.tis = tis; + this.fieldInfos = fieldInfos; + skipInterval = tis.getSkipInterval(); + maxSkipLevels = tis.getMaxSkipLevels(); + } public void seek(Term term) throws IOException { - TermInfo ti = parent.core.getTermsReader().get(term); + TermInfo ti = tis.get(term); seek(ti, term); } + public void setSkipDocs(Bits skipDocs) { + this.skipDocs = skipDocs; + } + public void seek(TermEnum termEnum) throws IOException { TermInfo ti; Term term; // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs - if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.core.fieldInfos) { // optimized case + if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == fieldInfos) { // optimized case SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum); term = segmentTermEnum.term(); ti = segmentTermEnum.termInfo(); } else { // punt case term = termEnum.term(); - ti = parent.core.getTermsReader().get(term); + ti = tis.get(term); } seek(ti, term); @@ -77,7 +103,7 @@ class SegmentTermDocs implements TermDocs { void seek(TermInfo ti, Term term) throws IOException { count = 0; - FieldInfo fi = parent.core.fieldInfos.fieldInfo(term.field); + FieldInfo fi = fieldInfos.fieldInfo(term.field()); currentFieldOmitTermFreqAndPositions = (fi != null) ? fi.omitTermFreqAndPositions : false; currentFieldStoresPayloads = (fi != null) ? 
fi.storePayloads : false; if (ti == null) { @@ -118,14 +144,17 @@ class SegmentTermDocs implements TermDocs { doc += docCode >>> 1; // shift off low bit if ((docCode & 1) != 0) // if low bit is set freq = 1; // freq is one - else + else { freq = freqStream.readVInt(); // else read freq + assert freq != 1; + } } count++; - if (deletedDocs == null || !deletedDocs.get(doc)) + if (skipDocs == null || !skipDocs.get(doc)) { break; + } skippingDoc(); } return true; @@ -149,7 +178,7 @@ class SegmentTermDocs implements TermDocs { freq = freqStream.readVInt(); // else read freq count++; - if (deletedDocs == null || !deletedDocs.get(doc)) { + if (skipDocs == null || !skipDocs.get(doc)) { docs[i] = doc; freqs[i] = freq; ++i; @@ -166,7 +195,7 @@ class SegmentTermDocs implements TermDocs { doc += freqStream.readVInt(); count++; - if (deletedDocs == null || !deletedDocs.get(doc)) { + if (skipDocs == null || !skipDocs.get(doc)) { docs[i] = doc; // Hardware freq to 1 when term freqs were not // stored in the index diff --git a/lucene/src/java/org/apache/lucene/index/SegmentTermEnum.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java similarity index 86% rename from lucene/src/java/org/apache/lucene/index/SegmentTermEnum.java rename to lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java index b004c59f0c3..2f1a9333f21 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentTermEnum.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.preflex; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -19,13 +19,33 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.CorruptIndexException; -final class SegmentTermEnum extends TermEnum implements Cloneable { +/** + * @deprecated No longer used with flex indexing, except for + * reading old segments + * @lucene.experimental */ + +@Deprecated +public final class SegmentTermEnum extends TermEnum implements Cloneable { private IndexInput input; FieldInfos fieldInfos; long size; long position = -1; + /** The file format version, a negative number. */ + public static final int FORMAT = -3; + + // Changed strings to true utf8 with length-in-bytes not + // length-in-chars + public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; + + // NOTE: always change this if you switch to a new format! 
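The docCode decoding in SegmentTermDocs.next() above uses the classic .frq packing: the doc delta is shifted left one bit and the low bit flags freq == 1, so the common case stores no separate freq vInt. A tiny round-trip sketch of that packing (plain ints, no vInt layer):

// Sketch of the doc-delta/freq packing that SegmentTermDocs.next() decodes.
final class DocCodeSketch {
  static int encode(int docDelta, int freq) {
    // low bit set => freq is exactly 1 and is not stored separately
    return (docDelta << 1) | (freq == 1 ? 1 : 0);
  }

  public static void main(String[] args) {
    final int docCode = encode(7, 1);
    final int docDelta = docCode >>> 1;           // shift off the low bit
    final boolean freqIsOne = (docCode & 1) != 0;
    System.out.println("delta=" + docDelta + " freqIsOne=" + freqIsOne);
  }
}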
+ public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + private TermBuffer termBuffer = new TermBuffer(); private TermBuffer prevBuffer = new TermBuffer(); private TermBuffer scanBuffer = new TermBuffer(); // used for scanning @@ -61,8 +81,8 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { format = firstInt; // check that it is a format we can understand - if (format < TermInfosWriter.FORMAT_CURRENT) - throw new CorruptIndexException("Unknown format version:" + format + " expected " + TermInfosWriter.FORMAT_CURRENT + " or higher"); + if (format < FORMAT_CURRENT) + throw new CorruptIndexException("Unknown format version:" + format + " expected " + FORMAT_CURRENT + " or higher"); size = input.readLong(); // read the size @@ -77,7 +97,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { } else { indexInterval = input.readInt(); skipInterval = input.readInt(); - if (format <= TermInfosWriter.FORMAT) { + if (format <= FORMAT) { // this new format introduces multi-level skipping maxSkipLevels = input.readInt(); } @@ -85,7 +105,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0"; assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0"; } - if (format > TermInfosWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { + if (format > FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { termBuffer.setPreUTF8Strings(); scanBuffer.setPreUTF8Strings(); prevBuffer.setPreUTF8Strings(); diff --git a/lucene/src/java/org/apache/lucene/index/SegmentTermPositions.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java similarity index 90% rename from lucene/src/java/org/apache/lucene/index/SegmentTermPositions.java rename to lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java index ae24ff9cb98..3f2c3ddcc6b 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentTermPositions.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.preflex; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,13 +17,18 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import org.apache.lucene.store.IndexInput; - import java.io.IOException; -final class SegmentTermPositions +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.store.IndexInput; + +/** @lucene.experimental */ +public final class SegmentTermPositions extends SegmentTermDocs implements TermPositions { private IndexInput proxStream; + private IndexInput proxStreamOrig; private int proxCount; private int position; @@ -37,11 +42,18 @@ extends SegmentTermDocs implements TermPositions { // for a lazy skip private long lazySkipPointer = -1; private int lazySkipProxCount = 0; - + + /* SegmentTermPositions(SegmentReader p) { super(p); this.proxStream = null; // the proxStream will be cloned lazily when nextPosition() is called for the first time } + */ + + public SegmentTermPositions(IndexInput freqStream, IndexInput proxStream, TermInfosReader tis, FieldInfos fieldInfos) { + super(freqStream, tis, fieldInfos); + this.proxStreamOrig = proxStream; // the proxStream will be cloned lazily when nextPosition() is called for the first time + } @Override final void seek(TermInfo ti, Term term) throws IOException { @@ -152,7 +164,7 @@ extends SegmentTermDocs implements TermPositions { private void lazySkip() throws IOException { if (proxStream == null) { // clone lazily - proxStream = (IndexInput) parent.core.proxStream.clone(); + proxStream = (IndexInput)proxStreamOrig.clone(); } // we might have to skip the current payload diff --git a/lucene/src/java/org/apache/lucene/index/TermBuffer.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java similarity index 83% rename from lucene/src/java/org/apache/lucene/index/TermBuffer.java rename to lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java index 983b6d3fcc6..1c7fc7a16c9 100644 --- a/lucene/src/java/org/apache/lucene/index/TermBuffer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.preflex; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -19,7 +19,11 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.FieldInfos; final class TermBuffer implements Cloneable { @@ -29,7 +33,7 @@ final class TermBuffer implements Cloneable { private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes) private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); - private UnicodeUtil.UTF8Result bytes = new UnicodeUtil.UTF8Result(); + private BytesRef bytes = new BytesRef(10); public final int compareTo(TermBuffer other) { if (field == other.field) // fields are interned @@ -72,15 +76,19 @@ final class TermBuffer implements Cloneable { if (dirty) { // Fully convert all bytes since bytes is dirty UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); - bytes.setLength(totalLength); - input.readBytes(bytes.result, start, length); - UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text); + if (bytes.bytes.length < totalLength) + bytes.bytes = new byte[totalLength]; + bytes.length = totalLength; + input.readBytes(bytes.bytes, start, length); + UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, 
text); dirty = false; } else { // Incrementally convert only the UTF8 bytes that are new: - bytes.setLength(totalLength); - input.readBytes(bytes.result, start, length); - UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text); + if (bytes.bytes.length < totalLength) + bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength); + bytes.length = totalLength; + input.readBytes(bytes.bytes, start, length); + UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); } } this.field = fieldInfos.fieldName(input.readVInt()); @@ -132,7 +140,7 @@ final class TermBuffer implements Cloneable { } catch (CloneNotSupportedException e) {} clone.dirty = true; - clone.bytes = new UnicodeUtil.UTF8Result(); + clone.bytes = new BytesRef(10); clone.text = new UnicodeUtil.UTF16Result(); clone.text.copyText(text); return clone; diff --git a/lucene/src/java/org/apache/lucene/index/TermInfo.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java similarity index 89% rename from lucene/src/java/org/apache/lucene/index/TermInfo.java rename to lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java index 475bb09175d..9244ca08b09 100644 --- a/lucene/src/java/org/apache/lucene/index/TermInfo.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.preflex; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,8 +17,12 @@ package org.apache.lucene.index; * limitations under the License. */ -/** A TermInfo is the record of information stored for a term.*/ +/** A TermInfo is the record of information stored for a + * term + * @deprecated This class is no longer used in flexible + * indexing. */ +@Deprecated class TermInfo { /** The number of documents which contain the term. */ int docFreq = 0; diff --git a/lucene/src/java/org/apache/lucene/index/TermInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java similarity index 89% rename from lucene/src/java/org/apache/lucene/index/TermInfosReader.java rename to lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java index 5a373cc169d..7a95a4270ed 100644 --- a/lucene/src/java/org/apache/lucene/index/TermInfosReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.preflex; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -19,16 +19,24 @@ package org.apache.lucene.index; import java.io.IOException; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.cache.Cache; -import org.apache.lucene.util.cache.DoubleBarrelLRUCache; import org.apache.lucene.util.CloseableThreadLocal; +import org.apache.lucene.util.cache.DoubleBarrelLRUCache; +import org.apache.lucene.util.cache.Cache; /** This stores a monotonically increasing set of pairs in a * Directory. Pairs are accessed either by Term or by ordinal position the - * set. */ - -final class TermInfosReader { + * set + * @deprecated This class has been replaced by + * FormatPostingsTermsDictReader, except for reading old segments. 
+ * @lucene.experimental + */ +@Deprecated +public final class TermInfosReader { private final Directory directory; private final String segment; private final FieldInfos fieldInfos; @@ -44,7 +52,7 @@ final class TermInfosReader { private final int totalIndexInterval; private final static int DEFAULT_CACHE_SIZE = 1024; - + // Just adds term's ord to TermInfo private final static class TermInfoAndOrd extends TermInfo { final int termOrd; @@ -55,7 +63,7 @@ final class TermInfosReader { } private final Cache termsCache = new DoubleBarrelLRUCache(DEFAULT_CACHE_SIZE); - + /** * Per-thread resources managed by ThreadLocal */ @@ -76,15 +84,15 @@ final class TermInfosReader { segment = seg; fieldInfos = fis; - origEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION), - readBufferSize), fieldInfos, false); + origEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, PreFlexCodec.TERMS_EXTENSION), + readBufferSize), fieldInfos, false); size = origEnum.size; if (indexDivisor != -1) { // Load terms index totalIndexInterval = origEnum.indexInterval * indexDivisor; - final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION), + final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, PreFlexCodec.TERMS_INDEX_EXTENSION), readBufferSize), fieldInfos, true); try { @@ -177,8 +185,8 @@ final class TermInfosReader { private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { enumerator.seek(indexPointers[indexOffset], - ((long) indexOffset * totalIndexInterval) - 1, - indexTerms[indexOffset], indexInfos[indexOffset]); + ((long) indexOffset * totalIndexInterval) - 1, + indexTerms[indexOffset], indexInfos[indexOffset]); } /** Returns the TermInfo for a Term in the set, or null. */ @@ -194,13 +202,21 @@ final class TermInfosReader { TermInfoAndOrd tiOrd = termsCache.get(term); ThreadResources resources = getThreadResources(); - + if (!mustSeekEnum && tiOrd != null) { return tiOrd; } - + + return seekEnum(resources.termEnum, term, tiOrd); + } + + TermInfo seekEnum(SegmentTermEnum enumerator, Term term) throws IOException { + return seekEnum(enumerator, term, termsCache.get(term)); + } + + TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException { + // optimize sequential access: first try scanning cached enum w/o seeking - SegmentTermEnum enumerator = resources.termEnum; if (enumerator.term() != null // term is at or past current && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) || term.compareTo(enumerator.term()) >= 0)) { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java new file mode 100644 index 00000000000..989383a155c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java @@ -0,0 +1,155 @@ +package org.apache.lucene.index.codecs.pulsing; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriterImpl; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.StandardPostingsReaderImpl; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; +import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; + +/** This codec "inlines" the postings for terms that have + * low docFreq. It wraps another codec, which is used for + * writing the non-inlined terms. + * + * Currently in only inlines docFreq=1 terms, and + * otherwise uses the normal "standard" codec. 
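To make the idea above concrete: terms whose docFreq is at or below a cutoff (the fieldsConsumer below hardwires freqCutoff = 1) keep their postings inlined next to the term, and everything else is delegated to the wrapped standard writer. The following toy, Lucene-free sketch shows only that dispatch; the FallbackWriter interface is a hypothetical stand-in for the real postings writer, and the inline-or-delegate decision is simplified to term-finish time rather than the mid-term switch the actual writer performs.

    import java.util.Arrays;
    import java.util.ArrayList;
    import java.util.List;

    /** Toy model of "pulsing": terms with docFreq <= cutoff keep their postings
     *  inline; everything else is handed to a wrapped fallback writer. */
    public class PulsingSketch {

      interface FallbackWriter {                      // stand-in for the wrapped writer
        void write(String term, List<Integer> docIDs);
      }

      static class Writer {
        final int cutoff;
        final FallbackWriter fallback;
        final List<String> inlinedTerms = new ArrayList<>();   // toy "terms dict"

        Writer(int cutoff, FallbackWriter fallback) {
          this.cutoff = cutoff;
          this.fallback = fallback;
        }

        /** Simplification: decide once the whole posting list is known. */
        void finishTerm(String term, List<Integer> docIDs) {
          if (docIDs.size() <= cutoff) {
            inlinedTerms.add(term + docIDs);   // "inline" postings next to the term
          } else {
            fallback.write(term, docIDs);      // too frequent: delegate
          }
        }
      }

      public static void main(String[] args) {
        Writer w = new Writer(1, (term, docs) ->
            System.out.println("fallback: " + term + " " + docs));
        w.finishTerm("rare", Arrays.asList(42));          // stays inlined
        w.finishTerm("common", Arrays.asList(1, 2, 3));   // delegated
        System.out.println("inlined: " + w.inlinedTerms);
      }
    }
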
+ * @lucene.experimental */ + +public class PulsingCodec extends Codec { + + public PulsingCodec() { + name = "Pulsing"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + // We wrap StandardPostingsWriterImpl, but any StandardPostingsWriter + // will work: + StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state); + + // Terms that have <= freqCutoff number of docs are + // "pulsed" (inlined): + final int freqCutoff = 1; + StandardPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter); + + // Terms dict index + StandardTermsIndexWriter indexWriter; + boolean success = false; + try { + indexWriter = new SimpleStandardTermsIndexWriter(state); + success = true; + } finally { + if (!success) { + pulsingWriter.close(); + } + } + + // Terms dict + success = false; + try { + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + success = true; + return ret; + } finally { + if (!success) { + try { + pulsingWriter.close(); + } finally { + indexWriter.close(); + } + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + + // We wrap StandardPostingsReaderImpl, but any StandardPostingsReader + // will work: + StandardPostingsReader docsReader = new StandardPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize); + StandardPostingsReader pulsingReader = new PulsingPostingsReaderImpl(docsReader); + + // Terms dict index reader + StandardTermsIndexReader indexReader; + + boolean success = false; + try { + indexReader = new SimpleStandardTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUTF16Comparator()); + success = true; + } finally { + if (!success) { + pulsingReader.close(); + } + } + + // Terms dict reader + success = false; + try { + FieldsProducer ret = new StandardTermsDictReader(indexReader, + state.dir, state.fieldInfos, state.segmentInfo.name, + pulsingReader, + state.readBufferSize, + BytesRef.getUTF8SortedAsUTF16Comparator(), + StandardCodec.TERMS_CACHE_SIZE); + success = true; + return ret; + } finally { + if (!success) { + try { + pulsingReader.close(); + } finally { + indexReader.close(); + } + } + } + } + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, Set files) throws IOException { + StandardPostingsReaderImpl.files(dir, segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set extensions) { + StandardCodec.getStandardExtensions(extensions); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java new file mode 100644 index 00000000000..d72d4e8fb48 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java @@ -0,0 +1,381 @@ +package org.apache.lucene.index.codecs.pulsing; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.codecs.standard.TermState; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Document; +import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Position; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** Concrete class that reads the current doc/freq/skip + * postings format + * @lucene.experimental */ + +// TODO: -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? + +public class PulsingPostingsReaderImpl extends StandardPostingsReader { + + // Fallback reader for non-pulsed terms: + final StandardPostingsReader wrappedPostingsReader; + int maxPulsingDocFreq; + + public PulsingPostingsReaderImpl(StandardPostingsReader wrappedPostingsReader) throws IOException { + this.wrappedPostingsReader = wrappedPostingsReader; + } + + @Override + public void init(IndexInput termsIn) throws IOException { + CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC, PulsingPostingsWriterImpl.VERSION_START); + maxPulsingDocFreq = termsIn.readVInt(); + wrappedPostingsReader.init(termsIn); + } + + private static class PulsingTermState extends TermState { + private Document docs[]; + private TermState wrappedTermState; + private boolean pendingIndexTerm; + + public Object clone() { + PulsingTermState clone; + clone = (PulsingTermState) super.clone(); + clone.docs = (Document[]) docs.clone(); + for(int i=0;i>>1; + if ((code & 1) != 0) { + doc.numPositions = 1; + } else { + doc.numPositions = termsIn.readVInt(); + } + + if (doc.numPositions > doc.positions.length) { + doc.reallocPositions(doc.numPositions); + } + + int position = 0; + int payloadLength = -1; + + for(int j=0;j>> 1; + if ((code2 & 1) != 0) { + payloadLength = termsIn.readVInt(); + } + + if (payloadLength > 0) { + if (pos.payload == null) { + pos.payload = new BytesRef(); + pos.payload.bytes = new byte[payloadLength]; + } else if (payloadLength > pos.payload.bytes.length) { + pos.payload.grow(payloadLength); + } + pos.payload.length = payloadLength; + termsIn.readBytes(pos.payload.bytes, 0, payloadLength); + } else if (pos.payload != null) { + pos.payload.length = 0; + } + } else { + position += code2; + } + pos.pos = position; + } + } + doc.docID = docID; + } + } else { + termState.wrappedTermState.docFreq = termState.docFreq; + wrappedPostingsReader.readTerm(termsIn, fieldInfo, termState.wrappedTermState, termState.pendingIndexTerm); + termState.pendingIndexTerm = false; + } + } + + // TODO: we could actually reuse, by having TL that + // holds the 
last wrapped reuse, and vice-versa + @Override + public DocsEnum docs(FieldInfo field, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException { + PulsingTermState termState = (PulsingTermState) _termState; + if (termState.docFreq <= maxPulsingDocFreq) { + if (reuse instanceof PulsingDocsEnum) { + return ((PulsingDocsEnum) reuse).reset(skipDocs, termState); + } else { + PulsingDocsEnum docsEnum = new PulsingDocsEnum(); + return docsEnum.reset(skipDocs, termState); + } + } else { + if (reuse instanceof PulsingDocsEnum) { + return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, null); + } else { + return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, reuse); + } + } + } + + // TODO: -- not great that we can't always reuse + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo field, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + PulsingTermState termState = (PulsingTermState) _termState; + if (termState.docFreq <= maxPulsingDocFreq) { + if (reuse instanceof PulsingDocsAndPositionsEnum) { + return ((PulsingDocsAndPositionsEnum) reuse).reset(skipDocs, termState); + } else { + PulsingDocsAndPositionsEnum postingsEnum = new PulsingDocsAndPositionsEnum(); + return postingsEnum.reset(skipDocs, termState); + } + } else { + if (reuse instanceof PulsingDocsAndPositionsEnum) { + return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, null); + } else { + return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, reuse); + } + } + } + + static class PulsingDocsEnum extends DocsEnum { + private int nextRead; + private Bits skipDocs; + private Document doc; + private PulsingTermState state; + + public void close() {} + + PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) { + // TODO: -- not great we have to clone here -- + // merging is wasteful; TermRangeQuery too + state = (PulsingTermState) termState.clone(); + this.skipDocs = skipDocs; + nextRead = 0; + return this; + } + + @Override + public int nextDoc() { + while(true) { + if (nextRead >= state.docFreq) { + return NO_MORE_DOCS; + } else { + doc = state.docs[nextRead++]; + if (skipDocs == null || !skipDocs.get(doc.docID)) { + return doc.docID; + } + } + } + } + + @Override + public int read() { + int i=0; + // TODO: -- ob1? 
+ initBulkResult(); + final int[] docs = bulkResult.docs.ints; + final int[] freqs = bulkResult.freqs.ints; + while(nextRead < state.docFreq) { + doc = state.docs[nextRead++]; + if (skipDocs == null || !skipDocs.get(doc.docID)) { + docs[i] = doc.docID; + freqs[i] = doc.numPositions; + i++; + } + } + return i; + } + + @Override + public int freq() { + return doc.numPositions; + } + + @Override + public int docID() { + return doc.docID; + } + + @Override + public int advance(int target) throws IOException { + int doc; + while((doc=nextDoc()) != NO_MORE_DOCS) { + if (doc >= target) + return doc; + } + return NO_MORE_DOCS; + } + } + + static class PulsingDocsAndPositionsEnum extends DocsAndPositionsEnum { + private int nextRead; + private int nextPosRead; + private Bits skipDocs; + private Document doc; + private Position pos; + private PulsingTermState state; + + // Only here to emulate limitation of standard codec, + // which only allows retrieving payload more than once + private boolean payloadRetrieved; + + public void close() {} + + PulsingDocsAndPositionsEnum reset(Bits skipDocs, PulsingTermState termState) { + // TODO: -- not great we have to clone here -- + // merging is wasteful; TermRangeQuery too + state = (PulsingTermState) termState.clone(); + this.skipDocs = skipDocs; + nextRead = 0; + nextPosRead = 0; + return this; + } + + @Override + public int nextDoc() { + while(true) { + if (nextRead >= state.docFreq) { + return NO_MORE_DOCS; + } else { + doc = state.docs[nextRead++]; + if (skipDocs == null || !skipDocs.get(doc.docID)) { + nextPosRead = 0; + return doc.docID; + } + } + } + } + + @Override + public int freq() { + return doc.numPositions; + } + + @Override + public int docID() { + return doc.docID; + } + + @Override + public int advance(int target) throws IOException { + int doc; + while((doc=nextDoc()) != NO_MORE_DOCS) { + if (doc >= target) { + return doc; + } + } + return NO_MORE_DOCS; + } + + @Override + public int nextPosition() { + assert nextPosRead < doc.numPositions; + pos = doc.positions[nextPosRead++]; + payloadRetrieved = false; + return pos.pos; + } + + @Override + public int getPayloadLength() { + return payloadRetrieved || pos.payload == null ? 0 : pos.payload.length; + } + + @Override + public boolean hasPayload() { + return !payloadRetrieved && pos.payload != null && pos.payload.length > 0; + } + + @Override + public BytesRef getPayload() { + payloadRetrieved = true; + return pos.payload; + } + } + + @Override + public void close() throws IOException { + wrappedPostingsReader.close(); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java new file mode 100644 index 00000000000..0cd0840b520 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java @@ -0,0 +1,311 @@ +package org.apache.lucene.index.codecs.pulsing; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; + +// TODO: we now pulse entirely according to docFreq of the +// term; it might be better to eg pulse by "net bytes used" +// so that a term that has only 1 doc but zillions of +// positions would not be inlined. Though this is +// presumably rare in practice... + +/** @lucene.experimental */ +public final class PulsingPostingsWriterImpl extends StandardPostingsWriter { + + final static String CODEC = "PulsedPostings"; + + // To add a new version, increment from the last one, and + // change VERSION_CURRENT to point to your new version: + final static int VERSION_START = 0; + + final static int VERSION_CURRENT = VERSION_START; + + IndexOutput termsOut; + + boolean omitTF; + boolean storePayloads; + + // Starts a new term + FieldInfo fieldInfo; + + /** @lucene.experimental */ + public static class Document { + int docID; + int termDocFreq; + int numPositions; + Position[] positions; + Document() { + positions = new Position[1]; + positions[0] = new Position(); + } + + @Override + public Object clone() { + Document doc = new Document(); + doc.docID = docID; + doc.termDocFreq = termDocFreq; + doc.numPositions = numPositions; + doc.positions = new Position[positions.length]; + for(int i = 0; i < positions.length; i++) { + doc.positions[i] = (Position) positions[i].clone(); + } + + return doc; + } + + void reallocPositions(int minSize) { + final Position[] newArray = new Position[ArrayUtil.oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(positions, 0, newArray, 0, positions.length); + for(int i=positions.length;i maxPulsingDocFreq docs + + static class Position { + BytesRef payload; + int pos; + + @Override + public Object clone() { + Position position = new Position(); + position.pos = pos; + if (payload != null) { + position.payload = new BytesRef(payload); + } + return position; + } + } + + // TODO: -- lazy init this? ie, if every single term + // was pulsed then we never need to use this fallback? 
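As an aside, reallocPositions above grows the per-document positions buffer through ArrayUtil.oversize, which allocates past the requested minimum so that repeated growth stays amortized instead of reallocating one slot at a time. A minimal self-contained sketch of that pattern follows; the 50% padding used here is only illustrative and is not the heuristic the real ArrayUtil applies.

    import java.util.Arrays;

    /** Illustrative version of the reallocPositions pattern: grow to at least
     *  minSize, but over-allocate so repeated growth is amortized. */
    public class GrowArraySketch {

      static int oversize(int minSize) {
        // Toy heuristic: at least minSize, padded by ~50%; the real
        // ArrayUtil.oversize uses a smaller factor plus alignment rules.
        return minSize + (minSize >> 1) + 1;
      }

      static int[] growPositions(int[] positions, int minSize) {
        if (positions.length >= minSize) {
          return positions;                  // already big enough
        }
        int[] newArray = new int[oversize(minSize)];
        System.arraycopy(positions, 0, newArray, 0, positions.length);
        return newArray;
      }

      public static void main(String[] args) {
        int[] pos = new int[1];
        pos = growPositions(pos, 5);         // grows once, with headroom
        System.out.println(Arrays.toString(pos) + " length=" + pos.length);
      }
    }
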
+ // Fallback writer for non-pulsed terms: + final StandardPostingsWriter wrappedPostingsWriter; + + /** If docFreq <= maxPulsingDocFreq, its postings are + * inlined into terms dict */ + public PulsingPostingsWriterImpl(int maxPulsingDocFreq, StandardPostingsWriter wrappedPostingsWriter) throws IOException { + super(); + + pendingDocs = new Document[maxPulsingDocFreq]; + for(int i=0;i= 0: "got docID=" + docID; + + if (!pulsed && pendingDocCount == pendingDocs.length) { + + // OK we just crossed the threshold, this term should + // now be written with our wrapped codec: + wrappedPostingsWriter.startTerm(); + + // Flush all buffered docs + for(int i=0;i 0) { + assert storePayloads; + wrappedPostingsWriter.addPosition(pos.pos, pos.payload); + } else { + wrappedPostingsWriter.addPosition(pos.pos, null); + } + } + wrappedPostingsWriter.finishDoc(); + } + } + + pendingDocCount = 0; + + pulsed = true; + } + + if (pulsed) { + // We've already seen too many docs for this term -- + // just forward to our fallback writer + wrappedPostingsWriter.startDoc(docID, termDocFreq); + } else { + currentDoc = pendingDocs[pendingDocCount++]; + currentDoc.docID = docID; + // TODO: -- need not store in doc? only used for alloc & assert + currentDoc.termDocFreq = termDocFreq; + if (termDocFreq > currentDoc.positions.length) { + currentDoc.reallocPositions(termDocFreq); + } + currentDoc.numPositions = 0; + } + } + + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + if (pulsed) { + wrappedPostingsWriter.addPosition(position, payload); + } else { + // just buffer up + Position pos = currentDoc.positions[currentDoc.numPositions++]; + pos.pos = position; + if (payload != null && payload.length > 0) { + if (pos.payload == null) { + pos.payload = new BytesRef(payload); + } else { + pos.payload.copy(payload); + } + } else if (pos.payload != null) { + pos.payload.length = 0; + } + } + } + + @Override + public void finishDoc() { + assert currentDoc.numPositions == currentDoc.termDocFreq; + } + + boolean pendingIsIndexTerm; + + int pulsedCount; + int nonPulsedCount; + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { + + assert docCount > 0; + + pendingIsIndexTerm |= isIndexTerm; + + if (pulsed) { + wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm); + pendingIsIndexTerm = false; + pulsedCount++; + } else { + nonPulsedCount++; + // OK, there were few enough occurrences for this + // term, so we fully inline our postings data into + // terms dict, now: + int lastDocID = 0; + for(int i=0;i 0) { + termsOut.writeBytes(pos.payload.bytes, 0, pos.payload.length); + } + } else { + termsOut.writeVInt(delta2); + } + } + } + } + } + + pendingDocCount = 0; + } + + @Override + public void close() throws IOException { + wrappedPostingsWriter.close(); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java new file mode 100644 index 00000000000..ffed7d81af6 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java @@ -0,0 +1,75 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IntsRef; + +import java.io.IOException; +import java.io.Closeable; + +/** Defines basic API for writing ints to an IndexOutput. + * IntBlockCodec interacts with this API. @see + * IntBlockReader + * + * @lucene.experimental */ +public abstract class IntIndexInput implements Closeable { + + public abstract Reader reader() throws IOException; + + public abstract void close() throws IOException; + + public abstract Index index() throws IOException; + + // TODO: -- can we simplify this? + public abstract static class Index { + + public abstract void read(IndexInput indexIn, boolean absolute) throws IOException; + + /** Seeks primary stream to the last read offset */ + public abstract void seek(IntIndexInput.Reader stream) throws IOException; + + public abstract void set(Index other); + + public abstract Object clone(); + } + + public abstract static class Reader { + + /** Reads next single int */ + public abstract int next() throws IOException; + + /** Reads next chunk of ints */ + private IntsRef bulkResult; + + /** Read up to count ints. */ + public IntsRef read(int count) throws IOException { + if (bulkResult == null) { + bulkResult = new IntsRef(); + bulkResult.ints = new int[count]; + } else { + bulkResult.grow(count); + } + for(int i=0;iNOTE: block sizes could be variable + * + * @lucene.experimental */ +public abstract class IntIndexOutput implements Closeable { + + /** Write an int to the primary file */ + public abstract void write(int v) throws IOException; + + public abstract static class Index { + + /** Internally records the current location */ + public abstract void mark() throws IOException; + + /** Copies index from other */ + public abstract void set(Index other) throws IOException; + + /** Writes "location" of current output pointer of primary + * output to different output (out) */ + public abstract void write(IndexOutput indexOut, boolean absolute) throws IOException; + } + + /** If you are indexing the primary output file, call + * this and interact with the returned IndexWriter. */ + public abstract Index index() throws IOException; + + public abstract void close() throws IOException; +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/IntStreamFactory.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/IntStreamFactory.java new file mode 100644 index 00000000000..da91f2b4c5e --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/IntStreamFactory.java @@ -0,0 +1,33 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.BufferedIndexInput; + +import java.io.IOException; + +/** @lucene.experimental */ +public abstract class IntStreamFactory { + public IntIndexInput openInput(Directory dir, String fileName) throws IOException { + return openInput(dir, fileName, BufferedIndexInput.BUFFER_SIZE); + } + + public abstract IntIndexInput openInput(Directory dir, String fileName, int readBufferSize) throws IOException; + public abstract IntIndexOutput createOutput(Directory dir, String fileName) throws IOException; +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java new file mode 100644 index 00000000000..545d638fc61 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java @@ -0,0 +1,150 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; +import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; + +/** @lucene.experimental */ +public class SepCodec extends Codec { + + public SepCodec() { + name = "Sep"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + + StandardPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new SingleIntFactory()); + + boolean success = false; + StandardTermsIndexWriter indexWriter; + try { + indexWriter = new SimpleStandardTermsIndexWriter(state); + success = true; + } finally { + if (!success) { + postingsWriter.close(); + } + } + + success = false; + try { + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + success = true; + return ret; + } finally { + if (!success) { + try { + postingsWriter.close(); + } finally { + indexWriter.close(); + } + } + } + } + + final static String DOC_EXTENSION = "doc"; + final static String SKIP_EXTENSION = "skp"; + final static String FREQ_EXTENSION = "frq"; + final static String POS_EXTENSION = "pos"; + final static String PAYLOAD_EXTENSION = "pyl"; + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + + StandardPostingsReader postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize, new SingleIntFactory()); + + StandardTermsIndexReader indexReader; + boolean success = false; + try { + indexReader = new SimpleStandardTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUTF16Comparator()); + success = true; + } finally { + if (!success) { + postingsReader.close(); + } + } + + success = false; + try { + FieldsProducer ret = new StandardTermsDictReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.readBufferSize, + BytesRef.getUTF8SortedAsUTF16Comparator(), + StandardCodec.TERMS_CACHE_SIZE); + success = true; + return ret; + } finally { + if (!success) { + try { + postingsReader.close(); + } finally { + indexReader.close(); + } + } + } + } + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, Set files) { + SepPostingsReaderImpl.files(segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set extensions) { + 
getSepExtensions(extensions); + } + + public static void getSepExtensions(Set extensions) { + extensions.add(DOC_EXTENSION); + extensions.add(FREQ_EXTENSION); + extensions.add(SKIP_EXTENSION); + extensions.add(POS_EXTENSION); + extensions.add(PAYLOAD_EXTENSION); + StandardTermsDictReader.getExtensions(extensions); + SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + } +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java new file mode 100644 index 00000000000..089e5c8b51c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java @@ -0,0 +1,679 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.TermState; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** Concrete class that reads the current doc/freq/skip + * postings format. + * + * @lucene.experimental + */ + +// TODO: -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? 
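Before the reader class, it may help to spell out the file layout this codec implies: each kind of postings data gets its own stream and extension (doc and skp always; frq, pos and pyl only when the segment stores positions) rather than interleaving docs and freqs in a single file. The small sketch below just enumerates those per-segment names; the name + "." + extension form is an assumption for illustration, since the real code builds names through IndexFileNames.segmentFileName.

    import java.util.ArrayList;
    import java.util.List;

    /** Lists the per-segment files a sep-style codec would use, one per stream. */
    public class SepFilesSketch {

      // Extensions as declared in SepCodec above.
      static final String[] ALWAYS    = {"doc", "skp"};
      static final String[] WITH_PROX = {"frq", "pos", "pyl"};

      static List<String> files(String segmentName, boolean hasProx) {
        List<String> files = new ArrayList<>();
        for (String ext : ALWAYS) {
          files.add(segmentName + "." + ext);
        }
        if (hasProx) {                        // freq/pos/payload streams only exist
          for (String ext : WITH_PROX) {      // when the segment stores positions
            files.add(segmentName + "." + ext);
          }
        }
        return files;
      }

      public static void main(String[] args) {
        System.out.println(files("_0", true));  // [_0.doc, _0.skp, _0.frq, _0.pos, _0.pyl]
      }
    }
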
+ +public class SepPostingsReaderImpl extends StandardPostingsReader { + + final IntIndexInput freqIn; + final IntIndexInput docIn; + final IntIndexInput posIn; + final IndexInput payloadIn; + final IndexInput skipIn; + + int skipInterval; + int maxSkipLevels; + + public SepPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize, IntStreamFactory intFactory) throws IOException { + + boolean success = false; + try { + + final String docFileName = IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.DOC_EXTENSION); + docIn = intFactory.openInput(dir, docFileName); + + skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.SKIP_EXTENSION), readBufferSize); + + if (segmentInfo.getHasProx()) { + freqIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.FREQ_EXTENSION)); + posIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.POS_EXTENSION), readBufferSize); + payloadIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.PAYLOAD_EXTENSION), readBufferSize); + } else { + posIn = null; + payloadIn = null; + freqIn = null; + } + success = true; + } finally { + if (!success) { + close(); + } + } + } + + public static void files(SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.DOC_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.SKIP_EXTENSION)); + + if (segmentInfo.getHasProx()) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.FREQ_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.POS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.PAYLOAD_EXTENSION)); + } + } + + @Override + public void init(IndexInput termsIn) throws IOException { + // Make sure we are talking to the matching past writer + CodecUtil.checkHeader(termsIn, SepPostingsWriterImpl.CODEC, SepPostingsWriterImpl.VERSION_START); + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + } + + @Override + public void close() throws IOException { + try { + if (freqIn != null) + freqIn.close(); + } finally { + try { + if (docIn != null) + docIn.close(); + } finally { + try { + if (skipIn != null) + skipIn.close(); + } finally { + try { + if (posIn != null) { + posIn.close(); + } + } finally { + if (payloadIn != null) { + payloadIn.close(); + } + } + } + } + } + } + + private static class SepTermState extends TermState { + IntIndexInput.Index docIndex; + IntIndexInput.Index freqIndex; + IntIndexInput.Index posIndex; + long skipOffset; + long payloadOffset; + + public Object clone() { + SepTermState other = (SepTermState) super.clone(); + other.docIndex = (IntIndexInput.Index) docIndex.clone(); + if (freqIndex != null) { + other.freqIndex = (IntIndexInput.Index) freqIndex.clone(); + } + if (posIndex != null) { + other.posIndex = (IntIndexInput.Index) posIndex.clone(); + } + return other; + } + + public void copy(TermState _other) { + super.copy(_other); + SepTermState other = (SepTermState) _other; + docIndex.set(other.docIndex); + if (other.posIndex != null) { + if (posIndex == null) { + posIndex = (IntIndexInput.Index) other.posIndex.clone(); + } else { + posIndex.set(other.posIndex); + } + } + if (other.freqIndex != null) { + if (freqIndex == null) { + freqIndex = (IntIndexInput.Index) other.freqIndex.clone(); + } else { + freqIndex.set(other.freqIndex); + } + } + skipOffset = other.skipOffset; 
+ payloadOffset = other.payloadOffset; + } + } + + @Override + public TermState newTermState() throws IOException { + final SepTermState state = new SepTermState(); + state.docIndex = docIn.index(); + return state; + } + + @Override + public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException { + final SepTermState termState = (SepTermState) _termState; + + // read freq index + if (!fieldInfo.omitTermFreqAndPositions) { + if (termState.freqIndex == null) { + assert isIndexTerm; + termState.freqIndex = freqIn.index(); + termState.posIndex = posIn.index(); + } + termState.freqIndex.read(termsIn, isIndexTerm); + } + + // read doc index + termState.docIndex.read(termsIn, isIndexTerm); + + // read skip index + if (isIndexTerm) { + termState.skipOffset = termsIn.readVLong(); + } else if (termState.docFreq >= skipInterval) { + termState.skipOffset += termsIn.readVLong(); + } + + // read pos, payload index + if (!fieldInfo.omitTermFreqAndPositions) { + termState.posIndex.read(termsIn, isIndexTerm); + final long v = termsIn.readVLong(); + if (isIndexTerm) { + termState.payloadOffset = v; + } else { + termState.payloadOffset += v; + } + } + } + + @Override + public DocsEnum docs(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException { + final SepTermState termState = (SepTermState) _termState; + if (reuse == null) { + return (new SepDocsEnum()).init(fieldInfo, termState, skipDocs); + } else { + return ((SepDocsEnum) reuse).init(fieldInfo, termState, skipDocs); + } + } + + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + assert !fieldInfo.omitTermFreqAndPositions; + final SepTermState termState = (SepTermState) _termState; + if (reuse == null) { + return (new SepDocsAndPositionsEnum()).init(fieldInfo, termState, skipDocs); + } else { + return ((SepDocsAndPositionsEnum) reuse).init(fieldInfo, termState, skipDocs); + } + } + + class SepDocsEnum extends DocsEnum { + int docFreq; + int doc; + int count; + int freq; + long freqStart; + + // TODO: -- should we do omitTF with 2 different enum classes? + private boolean omitTF; + private boolean storePayloads; + private Bits skipDocs; + private final IntIndexInput.Reader docReader; + private final IntIndexInput.Reader freqReader; + private long skipOffset; + + private final IntIndexInput.Index docIndex; + private final IntIndexInput.Index freqIndex; + private final IntIndexInput.Index posIndex; + + // TODO: -- should we do hasProx with 2 different enum classes? + + boolean skipped; + SepSkipListReader skipper; + + SepDocsEnum() throws IOException { + docReader = docIn.reader(); + docIndex = docIn.index(); + if (freqIn != null) { + freqReader = freqIn.reader(); + freqIndex = freqIn.index(); + } else { + freqReader = null; + freqIndex = null; + } + if (posIn != null) { + posIndex = posIn.index(); // only init this so skipper can read it + } else { + posIndex = null; + } + } + + SepDocsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException { + this.skipDocs = skipDocs; + omitTF = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + + // TODO: can't we only do this if consumer + // skipped consuming the previous docs? 
+ docIndex.set(termState.docIndex); + docIndex.seek(docReader); + + skipOffset = termState.skipOffset; + + if (!omitTF) { + freqIndex.set(termState.freqIndex); + freqIndex.seek(freqReader); + } else { + freq = 1; + } + docFreq = termState.docFreq; + count = 0; + doc = 0; + skipped = false; + + return this; + } + + @Override + public int nextDoc() throws IOException { + + while(true) { + if (count == docFreq) { + return doc = NO_MORE_DOCS; + } + + count++; + + // Decode next doc + doc += docReader.next(); + + if (!omitTF) { + freq = freqReader.next(); + } + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + return doc; + } + + @Override + public int read() throws IOException { + // TODO: -- switch to bulk read api in IntIndexInput + final int[] docs = bulkResult.docs.ints; + final int[] freqs = bulkResult.freqs.ints; + int i = 0; + final int length = docs.length; + while (i < length && count < docFreq) { + count++; + // manually inlined call to next() for speed + doc += docReader.next(); + if (!omitTF) { + freq = freqReader.next(); + } + + if (skipDocs == null || !skipDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + i++; + } + } + return i; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? + + if (docFreq >= skipInterval) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This DocsEnum has never done any skipping + skipper = new SepSkipListReader((IndexInput) skipIn.clone(), + freqIn, + docIn, + posIn, + maxSkipLevels, skipInterval); + + } + + if (!skipped) { + // We haven't yet skipped for this posting + skipper.init(skipOffset, + docIndex, + freqIndex, + posIndex, + 0, + docFreq, + storePayloads); + skipper.setOmitTF(omitTF); + + skipped = true; + } + + final int newCount = skipper.skipTo(target); + + if (newCount > count) { + + // Skipper did move + if (!omitTF) { + skipper.getFreqIndex().seek(freqReader); + } + skipper.getDocIndex().seek(docReader); + count = newCount; + doc = skipper.getDoc(); + } + } + + // Now, linear scan for the rest: + do { + if (nextDoc() == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + } while (target > doc); + + return doc; + } + } + + class SepDocsAndPositionsEnum extends DocsAndPositionsEnum { + int docFreq; + int doc; + int count; + int freq; + long freqStart; + + private boolean storePayloads; + private Bits skipDocs; + private final IntIndexInput.Reader docReader; + private final IntIndexInput.Reader freqReader; + private final IntIndexInput.Reader posReader; + private final IndexInput payloadIn; + private long skipOffset; + + private final IntIndexInput.Index docIndex; + private final IntIndexInput.Index freqIndex; + private final IntIndexInput.Index posIndex; + private long payloadOffset; + + private int pendingPosCount; + private int position; + private int payloadLength; + private long pendingPayloadBytes; + + private boolean skipped; + private SepSkipListReader skipper; + private boolean payloadPending; + private boolean posSeekPending; + + SepDocsAndPositionsEnum() throws IOException { + docReader = docIn.reader(); + docIndex = docIn.index(); + freqReader = freqIn.reader(); + freqIndex = freqIn.index(); + posReader = posIn.reader(); + posIndex = posIn.index(); + payloadIn = (IndexInput) SepPostingsReaderImpl.this.payloadIn.clone(); + } + + SepDocsAndPositionsEnum 
init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException { + this.skipDocs = skipDocs; + storePayloads = fieldInfo.storePayloads; + + // TODO: can't we only do this if consumer + // skipped consuming the previous docs? + docIndex.set(termState.docIndex); + docIndex.seek(docReader); + + freqIndex.set(termState.freqIndex); + freqIndex.seek(freqReader); + + posIndex.set(termState.posIndex); + posSeekPending = true; + //posIndex.seek(posReader); + + skipOffset = termState.skipOffset; + payloadOffset = termState.payloadOffset; + //payloadIn.seek(payloadOffset); + + docFreq = termState.docFreq; + count = 0; + doc = 0; + pendingPosCount = 0; + pendingPayloadBytes = 0; + skipped = false; + + return this; + } + + @Override + public int nextDoc() throws IOException { + + while(true) { + if (count == docFreq) { + return doc = NO_MORE_DOCS; + } + + count++; + + // TODO: maybe we should do the 1-bit trick for encoding + // freq=1 case? + + // Decode next doc + doc += docReader.next(); + + freq = freqReader.next(); + + pendingPosCount += freq; + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + position = 0; + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? + + if (docFreq >= skipInterval) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This DocsEnum has never done any skipping + skipper = new SepSkipListReader((IndexInput) skipIn.clone(), + freqIn, + docIn, + posIn, + maxSkipLevels, skipInterval); + } + + if (!skipped) { + // We haven't yet skipped for this posting + skipper.init(skipOffset, + docIndex, + freqIndex, + posIndex, + payloadOffset, + docFreq, + storePayloads); + + skipped = true; + } + + final int newCount = skipper.skipTo(target); + + if (newCount > count) { + + // Skipper did move + skipper.getFreqIndex().seek(freqReader); + skipper.getDocIndex().seek(docReader); + //skipper.getPosIndex().seek(posReader); + posIndex.set(skipper.getPosIndex()); + posSeekPending = true; + count = newCount; + doc = skipper.getDoc(); + //payloadIn.seek(skipper.getPayloadPointer()); + payloadOffset = skipper.getPayloadPointer(); + pendingPosCount = 0; + pendingPayloadBytes = 0; + payloadPending = false; + payloadLength = skipper.getPayloadLength(); + } + } + + // Now, linear scan for the rest: + do { + if (nextDoc() == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + } while (target > doc); + + return doc; + } + + @Override + public int nextPosition() throws IOException { + if (posSeekPending) { + posIndex.seek(posReader); + payloadIn.seek(payloadOffset); + posSeekPending = false; + } + + // scan over any docs that were iterated without their + // positions + while (pendingPosCount > freq) { + final int code = posReader.next(); + if (storePayloads) { + if ((code & 1) != 0) { + // Payload length has changed + payloadLength = posReader.next(); + assert payloadLength >= 0; + } + } + pendingPosCount--; + payloadPending = true; + position = 0; + pendingPayloadBytes += payloadLength; + } + + final int code = posReader.next(); + if (storePayloads) { + if ((code & 1) != 0) { + // Payload length has changed + payloadLength = posReader.next(); + assert payloadLength >= 0; + } + position += code >> 1; + } else { + position += code; + } + + pendingPayloadBytes += payloadLength; + payloadPending = 
payloadLength > 0; + pendingPosCount--; + payloadPending = true; + assert pendingPosCount >= 0; + return position; + } + + @Override + public int getPayloadLength() { + return payloadLength; + } + + private BytesRef payload; + + @Override + public BytesRef getPayload() throws IOException { + if (!payloadPending) { + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + } + + assert pendingPayloadBytes >= payloadLength; + + if (pendingPayloadBytes > payloadLength) { + payloadIn.seek(payloadIn.getFilePointer() + (pendingPayloadBytes - payloadLength)); + } + + if (payload == null) { + payload = new BytesRef(); + payload.bytes = new byte[payloadLength]; + } else if (payload.bytes.length < payloadLength) { + payload.grow(payloadLength); + } + + payloadIn.readBytes(payload.bytes, 0, payloadLength); + payloadPending = false; + payload.length = payloadLength; + pendingPayloadBytes = 0; + return payload; + } + + @Override + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java new file mode 100644 index 00000000000..36c3fd834a1 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java @@ -0,0 +1,287 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** Writes frq to .frq, docs to .doc, pos to .pos, payloads + * to .pyl, skip data to .skp + * + * @lucene.experimental */ +public final class SepPostingsWriterImpl extends StandardPostingsWriter { + final static String CODEC = "SepDocFreqSkip"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final IntIndexOutput freqOut; + final IntIndexOutput.Index freqIndex; + + final IntIndexOutput posOut; + final IntIndexOutput.Index posIndex; + + final IntIndexOutput docOut; + final IntIndexOutput.Index docIndex; + + final IndexOutput payloadOut; + + final IndexOutput skipOut; + IndexOutput termsOut; + + final SepSkipListWriter skipListWriter; + final int skipInterval; + final int maxSkipLevels; + final int totalNumDocs; + + boolean storePayloads; + boolean omitTF; + + // Starts a new term + long lastSkipStart; + + FieldInfo fieldInfo; + + int lastPayloadLength; + int lastPosition; + long payloadStart; + long lastPayloadStart; + int lastDocID; + int df; + + public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException { + super(); + + final String docFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.DOC_EXTENSION); + state.flushedFiles.add(docFileName); + docOut = factory.createOutput(state.directory, docFileName); + docIndex = docOut.index(); + + if (state.fieldInfos.hasProx()) { + final String frqFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.FREQ_EXTENSION); + state.flushedFiles.add(frqFileName); + freqOut = factory.createOutput(state.directory, frqFileName); + freqIndex = freqOut.index(); + + final String posFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.POS_EXTENSION); + posOut = factory.createOutput(state.directory, posFileName); + state.flushedFiles.add(posFileName); + posIndex = posOut.index(); + + // TODO: -- only if at least one field stores payloads? 
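+      // Hypothetical example: for a segment named "_1" the call below resolves to
+      // "_1.pyl", the payload file described in the class javadoc ("payloads to .pyl").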
+ final String payloadFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.PAYLOAD_EXTENSION); + state.flushedFiles.add(payloadFileName); + payloadOut = state.directory.createOutput(payloadFileName); + + } else { + freqOut = null; + freqIndex = null; + posOut = null; + posIndex = null; + payloadOut = null; + } + + final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.SKIP_EXTENSION); + state.flushedFiles.add(skipFileName); + skipOut = state.directory.createOutput(skipFileName); + + totalNumDocs = state.numDocs; + + // TODO: -- abstraction violation + skipListWriter = new SepSkipListWriter(state.skipInterval, + state.maxSkipLevels, + state.numDocs, + freqOut, docOut, + posOut, payloadOut); + + skipInterval = state.skipInterval; + maxSkipLevels = state.maxSkipLevels; + } + + @Override + public void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); + // TODO: -- just ask skipper to "start" here + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + } + + @Override + public void startTerm() throws IOException { + docIndex.mark(); + if (!omitTF) { + freqIndex.mark(); + posIndex.mark(); + payloadStart = payloadOut.getFilePointer(); + lastPayloadLength = -1; + } + skipListWriter.resetSkip(docIndex, freqIndex, posIndex); + } + + // TODO: -- should we NOT reuse across fields? would + // be cleaner + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes + @Override + public void setField(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTF = fieldInfo.omitTermFreqAndPositions; + skipListWriter.setOmitTF(omitTF); + storePayloads = !omitTF && fieldInfo.storePayloads; + } + + + /** Adds a new doc in this term. If this returns null + * then we just skip consuming positions/payloads. */ + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + + final int delta = docID - lastDocID; + + if (docID < 0 || (df > 0 && delta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); + } + + if ((++df % skipInterval) == 0) { + // TODO: -- awkward we have to make these two + // separate calls to skipper + skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + lastDocID = docID; + docOut.write(delta); + if (!omitTF) { + freqOut.write(termDocFreq); + } + } + + /** Add a new position & payload */ + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + assert !omitTF; + + final int delta = position - lastPosition; + lastPosition = position; + + if (storePayloads) { + final int payloadLength = payload == null ? 0 : payload.length; + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + // TODO: explore whether we get better compression + // by not storing payloadLength into prox stream? 
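+        // Worked example with hypothetical values: delta=3 and a changed
+        // payloadLength=5 writes (3<<1)|1 = 7 followed by 5; an unchanged length
+        // would instead write just 3<<1 = 6 below (no length VInt).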
+ posOut.write((delta<<1)|1); + posOut.write(payloadLength); + } else { + posOut.write(delta << 1); + } + + if (payloadLength > 0) { + payloadOut.writeBytes(payload.bytes, payload.offset, payloadLength); + } + } else { + posOut.write(delta); + } + + lastPosition = position; + } + + /** Called when we are done adding positions & payloads */ + @Override + public void finishDoc() { + lastPosition = 0; + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { + + long skipPos = skipOut.getFilePointer(); + + // TODO: -- wasteful we are counting this in two places? + assert docCount > 0; + assert docCount == df; + + // TODO: -- only do this if once (consolidate the + // conditional things that are written) + if (!omitTF) { + freqIndex.write(termsOut, isIndexTerm); + } + docIndex.write(termsOut, isIndexTerm); + + if (df >= skipInterval) { + skipListWriter.writeSkip(skipOut); + } + + if (isIndexTerm) { + termsOut.writeVLong(skipPos); + lastSkipStart = skipPos; + } else if (df >= skipInterval) { + termsOut.writeVLong(skipPos-lastSkipStart); + lastSkipStart = skipPos; + } + + if (!omitTF) { + posIndex.write(termsOut, isIndexTerm); + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(payloadStart); + } else { + termsOut.writeVLong(payloadStart-lastPayloadStart); + } + lastPayloadStart = payloadStart; + } + + lastDocID = 0; + df = 0; + } + + @Override + public void close() throws IOException { + try { + docOut.close(); + } finally { + try { + skipOut.close(); + } finally { + if (freqOut != null) { + try { + freqOut.close(); + } finally { + try { + posOut.close(); + } finally { + payloadOut.close(); + } + } + } + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java new file mode 100644 index 00000000000..3c8e324aa2d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java @@ -0,0 +1,205 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.index.codecs.MultiLevelSkipListReader; + +/** + * Implements the skip list reader for the default posting list format + * that stores positions and payloads. + * + * @lucene.experimental + */ + +// TODO: rewrite this as recursive classes? 
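+// How callers use this reader (see SepPostingsReaderImpl.advance above): skipTo(target)
+// advances the skip levels, then getDocIndex()/getFreqIndex()/getPosIndex() are seek'd
+// against the per-stream readers before the linear scan over nextDoc() resumes.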
+class SepSkipListReader extends MultiLevelSkipListReader { + private boolean currentFieldStoresPayloads; + private IntIndexInput.Index freqIndex[]; + private IntIndexInput.Index docIndex[]; + private IntIndexInput.Index posIndex[]; + private long payloadPointer[]; + private int payloadLength[]; + + private final IntIndexInput.Index lastFreqIndex; + private final IntIndexInput.Index lastDocIndex; + // TODO: -- make private again + final IntIndexInput.Index lastPosIndex; + + private long lastPayloadPointer; + private int lastPayloadLength; + + SepSkipListReader(IndexInput skipStream, + IntIndexInput freqIn, + IntIndexInput docIn, + IntIndexInput posIn, + int maxSkipLevels, + int skipInterval) + throws IOException { + super(skipStream, maxSkipLevels, skipInterval); + if (freqIn != null) { + freqIndex = new IntIndexInput.Index[maxSkipLevels]; + } + docIndex = new IntIndexInput.Index[maxSkipLevels]; + if (posIn != null) { + posIndex = new IntIndexInput.Index[maxNumberOfSkipLevels]; + } + for(int i=0;i 0) { + if (freqIndex != null) { + freqIndex[level-1].set(freqIndex[level]); + } + docIndex[level-1].set(docIndex[level]); + if (posIndex != null) { + posIndex[level-1].set(posIndex[level]); + } + } + } + + IntIndexInput.Index getFreqIndex() { + return lastFreqIndex; + } + + IntIndexInput.Index getPosIndex() { + return lastPosIndex; + } + + IntIndexInput.Index getDocIndex() { + return lastDocIndex; + } + + @Override + protected int readSkipData(int level, IndexInput skipStream) throws IOException { + int delta; + if (currentFieldStoresPayloads) { + // the current field stores payloads. + // if the doc delta is odd then we have + // to read the current payload length + // because it differs from the length of the + // previous payload + delta = skipStream.readVInt(); + if ((delta & 1) != 0) { + payloadLength[level] = skipStream.readVInt(); + } + delta >>>= 1; + } else { + delta = skipStream.readVInt(); + } + if (!omitTF) { + freqIndex[level].read(skipStream, false); + } + docIndex[level].read(skipStream, false); + if (!omitTF) { + posIndex[level].read(skipStream, false); + payloadPointer[level] += skipStream.readVInt(); + } + + return delta; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java new file mode 100644 index 00000000000..866bac6e390 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java @@ -0,0 +1,197 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.codecs.MultiLevelSkipListWriter; + +// TODO: -- skip data should somehow be more local to the +// particular stream (doc, freq, pos, payload) + +/** + * Implements the skip list writer for the default posting list format + * that stores positions and payloads. + * + * @lucene.experimental + */ +class SepSkipListWriter extends MultiLevelSkipListWriter { + private int[] lastSkipDoc; + private int[] lastSkipPayloadLength; + private long[] lastSkipPayloadPointer; + + private IntIndexOutput.Index[] docIndex; + private IntIndexOutput.Index[] freqIndex; + private IntIndexOutput.Index[] posIndex; + + private IntIndexOutput freqOutput; + // TODO: -- private again + IntIndexOutput posOutput; + // TODO: -- private again + IndexOutput payloadOutput; + + private int curDoc; + private boolean curStorePayloads; + private int curPayloadLength; + private long curPayloadPointer; + + SepSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, + IntIndexOutput freqOutput, + IntIndexOutput docOutput, + IntIndexOutput posOutput, + IndexOutput payloadOutput) + throws IOException { + super(skipInterval, numberOfSkipLevels, docCount); + + this.freqOutput = freqOutput; + this.posOutput = posOutput; + this.payloadOutput = payloadOutput; + + lastSkipDoc = new int[numberOfSkipLevels]; + lastSkipPayloadLength = new int[numberOfSkipLevels]; + // TODO: -- also cutover normal IndexOutput to use getIndex()? + lastSkipPayloadPointer = new long[numberOfSkipLevels]; + + freqIndex = new IntIndexOutput.Index[numberOfSkipLevels]; + docIndex = new IntIndexOutput.Index[numberOfSkipLevels]; + posIndex = new IntIndexOutput.Index[numberOfSkipLevels]; + + for(int i=0;i DocSkip, FreqSkip, ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // DocSkip records the document number before every SkipInterval th document in TermFreqs. + // Document numbers are represented as differences from the previous value in the sequence. + // Case 2: current field stores payloads + // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // PayloadLength --> VInt + // In this case DocSkip/2 is the difference between + // the current and the previous value. If DocSkip + // is odd, then a PayloadLength encoded as VInt follows, + // if DocSkip is even, then it is assumed that the + // current payload length equals the length at the previous + // skip point + + assert !omitTF || !curStorePayloads; + + if (curStorePayloads) { + int delta = curDoc - lastSkipDoc[level]; + if (curPayloadLength == lastSkipPayloadLength[level]) { + // the current payload length equals the length at the previous skip point, + // so we don't store the length again + skipBuffer.writeVInt(delta << 1); + } else { + // the payload length is different from the previous one. We shift the DocSkip, + // set the lowest bit and store the current payload length as VInt. 
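+          // Worked example with hypothetical values: doc delta 4 with a new payload
+          // length 7 is stored as writeVInt((4<<1)|1) = 9 followed by writeVInt(7);
+          // with an unchanged length only writeVInt(4<<1) = 8 would be written.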
+ skipBuffer.writeVInt(delta << 1 | 1); + skipBuffer.writeVInt(curPayloadLength); + lastSkipPayloadLength[level] = curPayloadLength; + } + } else { + // current field does not store payloads + skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); + } + + if (!omitTF) { + freqIndex[level].mark(); + freqIndex[level].write(skipBuffer, false); + } + docIndex[level].mark(); + docIndex[level].write(skipBuffer, false); + if (!omitTF) { + posIndex[level].mark(); + posIndex[level].write(skipBuffer, false); + skipBuffer.writeVInt((int) (curPayloadPointer - lastSkipPayloadPointer[level])); + } + + lastSkipDoc[level] = curDoc; + lastSkipPayloadPointer[level] = curPayloadPointer; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntFactory.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntFactory.java new file mode 100644 index 00000000000..02a70f0371d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntFactory.java @@ -0,0 +1,33 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.Directory; +import java.io.IOException; + +/** @lucene.experimental */ +public class SingleIntFactory extends IntStreamFactory { + @Override + public IntIndexInput openInput(Directory dir, String fileName, int readBufferSize) throws IOException { + return new SingleIntIndexInput(dir, fileName, readBufferSize); + } + @Override + public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException { + return new SingleIntIndexOutput(dir, fileName); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java new file mode 100644 index 00000000000..c2e03c68edb --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java @@ -0,0 +1,114 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.CodecUtil; + +/** Reads IndexInputs written with {@link + * SingleIntIndexOutput}. NOTE: this class is just for + * demonstration purposes (it is a very slow way to read a + * block of ints). + * + * @lucene.experimental + */ +public class SingleIntIndexInput extends IntIndexInput { + private final IndexInput in; + + public SingleIntIndexInput(Directory dir, String fileName, int readBufferSize) + throws IOException { + in = dir.openInput(fileName, readBufferSize); + CodecUtil.checkHeader(in, SingleIntIndexOutput.CODEC, SingleIntIndexOutput.VERSION_START); + } + + @Override + public Reader reader() throws IOException { + return new Reader((IndexInput) in.clone()); + } + + @Override + public void close() throws IOException { + in.close(); + } + + public static class Reader extends IntIndexInput.Reader { + // clone: + private final IndexInput in; + + public Reader(IndexInput in) { + this.in = in; + } + + /** Reads next single int */ + @Override + public int next() throws IOException { + return in.readVInt(); + } + } + + class Index extends IntIndexInput.Index { + private long fp; + // nocommit: only for asserts + boolean first = true; + + @Override + public void read(IndexInput indexIn, boolean absolute) + throws IOException { + if (absolute) { + fp = indexIn.readVLong(); + first = false; + } else { + assert !first; + fp += indexIn.readVLong(); + } + } + + @Override + public void set(IntIndexInput.Index other) { + fp = ((Index) other).fp; + first = false; + } + + @Override + public void seek(IntIndexInput.Reader other) throws IOException { + ((Reader) other).in.seek(fp); + } + + @Override + public String toString() { + return Long.toString(fp); + } + + @Override + public Object clone() { + Index other = new Index(); + other.first = first; + other.fp = fp; + return other; + } + } + + @Override + public Index index() { + return new Index(); + } +} + diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java new file mode 100644 index 00000000000..ebfb36c599f --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java @@ -0,0 +1,84 @@ +package org.apache.lucene.index.codecs.sep; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.CodecUtil; + +import java.io.IOException; + +/** Writes ints directly to the file (not in blocks) as + * vInt. + * + * @lucene.experimental +*/ +public class SingleIntIndexOutput extends IntIndexOutput { + private final IndexOutput out; + final static String CODEC = "SINGLE_INTS"; + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + public SingleIntIndexOutput(Directory dir, String fileName) throws IOException { + out = dir.createOutput(fileName); + CodecUtil.writeHeader(out, CODEC, VERSION_CURRENT); + } + + /** Write an int to the primary file */ + @Override + public void write(int v) throws IOException { + out.writeVInt(v); + } + + @Override + public Index index() { + return new Index(); + } + + @Override + public void close() throws IOException { + out.close(); + } + + private class Index extends IntIndexOutput.Index { + long fp; + long lastFP; + @Override + public void mark() { + fp = out.getFilePointer(); + } + @Override + public void set(IntIndexOutput.Index other) { + lastFP = fp = ((Index) other).fp; + } + @Override + public void write(IndexOutput indexOut, boolean absolute) + throws IOException { + if (absolute) { + indexOut.writeVLong(fp); + } else { + indexOut.writeVLong(fp - lastFP); + } + lastFP = fp; + } + @Override + public String toString() { + return Long.toString(fp); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/DefaultSkipListReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListReader.java similarity index 87% rename from lucene/src/java/org/apache/lucene/index/DefaultSkipListReader.java rename to lucene/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListReader.java index 3c6e8568b51..32f52c8faeb 100644 --- a/lucene/src/java/org/apache/lucene/index/DefaultSkipListReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListReader.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.standard; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,14 +20,15 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.Arrays; +import org.apache.lucene.index.codecs.MultiLevelSkipListReader; import org.apache.lucene.store.IndexInput; /** * Implements the skip list reader for the default posting list format * that stores positions and payloads. 
- * + * @lucene.experimental */ -class DefaultSkipListReader extends MultiLevelSkipListReader { +public class DefaultSkipListReader extends MultiLevelSkipListReader { private boolean currentFieldStoresPayloads; private long freqPointer[]; private long proxPointer[]; @@ -38,14 +39,14 @@ class DefaultSkipListReader extends MultiLevelSkipListReader { private int lastPayloadLength; - DefaultSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { + public DefaultSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { super(skipStream, maxSkipLevels, skipInterval); freqPointer = new long[maxSkipLevels]; proxPointer = new long[maxSkipLevels]; payloadLength = new int[maxSkipLevels]; } - - void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) { + + public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) { super.init(skipPointer, df); this.currentFieldStoresPayloads = storesPayloads; lastFreqPointer = freqBasePointer; @@ -58,20 +59,20 @@ class DefaultSkipListReader extends MultiLevelSkipListReader { /** Returns the freq pointer of the doc to which the last call of * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ - long getFreqPointer() { + public long getFreqPointer() { return lastFreqPointer; } /** Returns the prox pointer of the doc to which the last call of * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ - long getProxPointer() { + public long getProxPointer() { return lastProxPointer; } /** Returns the payload length of the payload stored just before * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)} * has skipped. */ - int getPayloadLength() { + public int getPayloadLength() { return lastPayloadLength; } diff --git a/lucene/src/java/org/apache/lucene/index/DefaultSkipListWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListWriter.java similarity index 89% rename from lucene/src/java/org/apache/lucene/index/DefaultSkipListWriter.java rename to lucene/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListWriter.java index 1ca28330aff..c2435c878bd 100644 --- a/lucene/src/java/org/apache/lucene/index/DefaultSkipListWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListWriter.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.index.codecs.standard; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,14 +21,15 @@ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.codecs.MultiLevelSkipListWriter; /** * Implements the skip list writer for the default posting list format * that stores positions and payloads. 
- * + * @lucene.experimental */ -class DefaultSkipListWriter extends MultiLevelSkipListWriter { +public class DefaultSkipListWriter extends MultiLevelSkipListWriter { private int[] lastSkipDoc; private int[] lastSkipPayloadLength; private long[] lastSkipFreqPointer; @@ -42,8 +43,8 @@ class DefaultSkipListWriter extends MultiLevelSkipListWriter { private int curPayloadLength; private long curFreqPointer; private long curProxPointer; - - DefaultSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, IndexOutput freqOutput, IndexOutput proxOutput) { + + public DefaultSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, IndexOutput freqOutput, IndexOutput proxOutput) { super(skipInterval, numberOfSkipLevels, docCount); this.freqOutput = freqOutput; this.proxOutput = proxOutput; @@ -54,18 +55,10 @@ class DefaultSkipListWriter extends MultiLevelSkipListWriter { lastSkipProxPointer = new long[numberOfSkipLevels]; } - void setFreqOutput(IndexOutput freqOutput) { - this.freqOutput = freqOutput; - } - - void setProxOutput(IndexOutput proxOutput) { - this.proxOutput = proxOutput; - } - /** * Sets the values for the current skip data. */ - void setSkipData(int doc, boolean storePayloads, int payloadLength) { + public void setSkipData(int doc, boolean storePayloads, int payloadLength) { this.curDoc = doc; this.curStorePayloads = storePayloads; this.curPayloadLength = payloadLength; @@ -73,9 +66,9 @@ class DefaultSkipListWriter extends MultiLevelSkipListWriter { if (proxOutput != null) this.curProxPointer = proxOutput.getFilePointer(); } - + @Override - protected void resetSkip() { + public void resetSkip() { super.resetSkip(); Arrays.fill(lastSkipDoc, 0); Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list @@ -127,7 +120,6 @@ class DefaultSkipListWriter extends MultiLevelSkipListWriter { skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level])); lastSkipDoc[level] = curDoc; - //System.out.println("write doc at level " + level + ": " + curDoc); lastSkipFreqPointer[level] = curFreqPointer; lastSkipProxPointer[level] = curProxPointer; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesReader.java new file mode 100644 index 00000000000..59a654cc1f3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesReader.java @@ -0,0 +1,48 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +// Handles reading incremental UTF8 encoded terms +final class DeltaBytesReader { + final BytesRef term = new BytesRef(); + final IndexInput in; + + DeltaBytesReader(IndexInput in) { + this.in = in; + term.bytes = new byte[10]; + } + + void reset(BytesRef text) { + term.copy(text); + } + + void read() throws IOException { + final int start = in.readVInt(); + final int suffix = in.readVInt(); + assert start <= term.length: "start=" + start + " length=" + term.length; + final int newLength = start+suffix; + term.grow(newLength); + in.readBytes(term.bytes, start, suffix); + term.length = newLength; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java new file mode 100644 index 00000000000..4277efd9214 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java @@ -0,0 +1,67 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +final class DeltaBytesWriter { + + private byte[] lastBytes = new byte[10]; + private int lastLength; + final IndexOutput out; + + DeltaBytesWriter(IndexOutput out) { + this.out = out; + } + + void reset() { + lastLength = 0; + } + + void write(BytesRef text) throws IOException { + int start = 0; + int upto = text.offset; + final int length = text.length; + final byte[] bytes = text.bytes; + + final int limit = length < lastLength ? length : lastLength; + while(start < limit) { + if (bytes[upto] != lastBytes[start]) + break; + start++; + upto++; + } + + final int suffix = length - start; + out.writeVInt(start); // prefix + out.writeVInt(suffix); // suffix + out.writeBytes(bytes, upto, suffix); + if (lastBytes.length < length) { + lastBytes = ArrayUtil.grow(lastBytes, length); + } + // TODO: is this copy really necessary? I don't think + // caller actually modifies these bytes, so we can save + // by reference? 
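+    // Illustration of the prefix/suffix encoding above (hypothetical terms): after
+    // writing "foobar", writing "foozle" emits prefix=3, suffix=3 and the bytes "zle",
+    // which is why lastBytes must be kept in sync with the previous full term here.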
+ System.arraycopy(bytes, upto, lastBytes, start, suffix); + lastLength = length; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java new file mode 100644 index 00000000000..b3ddc986ef5 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java @@ -0,0 +1,129 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.CloseableThreadLocal; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.IndexInput; + +import java.util.List; +import java.util.ArrayList; +import java.io.Closeable; +import java.io.IOException; + +/** Represents a logical byte[] as a series of pages. You + * can write-once into the logical byte[], using copy, and + * then retrieve slices (BytesRef) into it using fill. */ +class PagedBytes implements Closeable { + private final List blocks = new ArrayList(); + private final int blockSize; + private final int blockBits; + private final int blockMask; + private int upto; + private byte[] currentBlock; + private final CloseableThreadLocal threadBuffers = new CloseableThreadLocal(); + + private static final byte[] EMPTY_BYTES = new byte[0]; + + /** 1< 0) { + int left = blockSize - upto; + if (left == 0) { + if (currentBlock != null) { + blocks.add(currentBlock); + } + currentBlock = new byte[blockSize]; + upto = 0; + left = blockSize; + } + if (left < byteCount) { + in.readBytes(currentBlock, upto, left, false); + upto = blockSize; + byteCount -= left; + } else { + in.readBytes(currentBlock, upto, (int) byteCount, false); + upto += byteCount; + byteCount = 0; + } + } + } + + /** Commits final byte[], trimming it if necessary. */ + public void finish() { + if (upto < blockSize) { + final byte[] newBlock = new byte[upto]; + System.arraycopy(currentBlock, 0, newBlock, 0, upto); + currentBlock = newBlock; + } + if (currentBlock == null) { + currentBlock = EMPTY_BYTES; + } + blocks.add(currentBlock); + currentBlock = null; + } + + public long getPointer() { + if (currentBlock == null) { + return 0; + } else { + return (blocks.size() * ((long) blockSize)) + upto; + } + } + + /** Get a slice out of the byte array. 
*/ + public void fill(BytesRef b, long start, int length) { + assert length >= 0: "length=" + length; + final int index = (int) (start >> blockBits); + final int offset = (int) (start & blockMask); + b.length = length; + if (blockSize - offset >= length) { + // Within block + b.bytes = blocks.get(index); + b.offset = offset; + } else { + // Split + byte[] buffer = threadBuffers.get(); + if (buffer == null) { + buffer = new byte[length]; + threadBuffers.set(buffer); + } else if (buffer.length < length) { + buffer = ArrayUtil.grow(buffer, length); + threadBuffers.set(buffer); + } + b.bytes = buffer; + b.offset = 0; + System.arraycopy(blocks.get(index), offset, buffer, 0, blockSize-offset); + System.arraycopy(blocks.get(1+index), 0, buffer, blockSize-offset, length-(blockSize-offset)); + } + } + + public void close() { + threadBuffers.close(); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java new file mode 100644 index 00000000000..00261968950 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java @@ -0,0 +1,442 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.packed.PackedInts; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Collection; +import java.util.Comparator; +import java.io.IOException; + +/** + * Uses a simplistic format to record terms dict index + * information. Limititations: + * + * - Index for all fields is loaded entirely into RAM up + * front + * - Index is stored in RAM using shared byte[] that + * wastefully expand every term. Using FST to share + * common prefix & suffix would save RAM. + * - Index is taken at regular numTerms (every 128 by + * default); might be better to do it by "net docFreqs" + * encountered, so that for spans of low-freq terms we + * take index less often. + * + * A better approach might be something similar to how + * postings are encoded, w/ multi-level skips. Ie, load all + * terms index data into memory, as a single large compactly + * encoded stream (eg delta bytes + delta offset). Index + * that w/ multi-level skipper. Then to look up a term is + * the equivalent binary search, using the skipper instead, + * while data remains compressed in memory. 
+ */ + +import org.apache.lucene.index.IndexFileNames; + +/** @lucene.experimental */ +public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader { + + // NOTE: long is overkill here, since this number is 128 + // by default and only indexDivisor * 128 if you change + // the indexDivisor at search time. But, we use this in a + // number of places to multiply out the actual ord, and we + // will overflow int during those multiplies. So to avoid + // having to upgrade each multiple to long in multiple + // places (error proned), we use long here: + private long totalIndexInterval; + + private int indexDivisor; + final private int indexInterval; + + // Closed if indexLoaded is true: + final private IndexInput in; + private volatile boolean indexLoaded; + + private final Comparator termComp; + + private final static int PAGED_BYTES_BITS = 15; + + // all fields share this single logical byte[] + private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS); + + final HashMap fields = new HashMap(); + + public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator termComp) + throws IOException { + + this.termComp = termComp; + + IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_INDEX_EXTENSION)); + + boolean success = false; + + try { + CodecUtil.checkHeader(in, SimpleStandardTermsIndexWriter.CODEC_NAME, SimpleStandardTermsIndexWriter.VERSION_START); + + final long dirOffset = in.readLong(); + + indexInterval = in.readInt(); + this.indexDivisor = indexDivisor; + + if (indexDivisor == -1) { + totalIndexInterval = indexInterval; + } else { + // In case terms index gets loaded, later, on demand + totalIndexInterval = indexInterval * indexDivisor; + } + + // Read directory + in.seek(dirOffset); + + final int numFields = in.readInt(); + + for(int i=0;i= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment; + if (numIndexTerms > 0) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart)); + } + } + success = true; + } finally { + if (indexDivisor != -1) { + in.close(); + this.in = null; + if (success) { + indexLoaded = true; + } + termBytes.finish(); + } else { + this.in = in; + } + } + } + + private final class FieldIndexReader extends FieldReader { + + final private FieldInfo fieldInfo; + + private volatile CoreFieldIndex coreIndex; + + private final IndexInput in; + + private final long indexStart; + private final long termsStart; + private final long packedIndexStart; + private final long packedOffsetsStart; + + private final int numIndexTerms; + + public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart, + long packedOffsetsStart) throws IOException { + + this.fieldInfo = fieldInfo; + this.in = in; + this.termsStart = termsStart; + this.indexStart = indexStart; + this.packedIndexStart = packedIndexStart; + this.packedOffsetsStart = packedOffsetsStart; + this.numIndexTerms = numIndexTerms; + + // We still create the indexReader when indexDivisor + // is -1, so that StandardTermsDictReader can call + // isIndexTerm for each field: + if (indexDivisor != -1) { + coreIndex = new CoreFieldIndex(indexStart, + termsStart, + packedIndexStart, + packedOffsetsStart, + numIndexTerms); + + } 
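+      // When indexDivisor == -1, coreIndex stays null here; loadTermsIndex() below
+      // builds it on demand, and the getIndexOffset methods require that it has been
+      // loaded first.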
+ } + + public void loadTermsIndex() throws IOException { + if (coreIndex == null) { + coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms); + } + } + + @Override + public boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) { + if (onlyLoaded) { + return ord % totalIndexInterval == 0; + } else { + return ord % indexInterval == 0; + } + } + + @Override + public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException { + if (coreIndex == null) { + throw new IllegalStateException("terms index was not loaded"); + } else { + return coreIndex.nextIndexTerm(ord, result); + } + } + + @Override + public final void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException { + // You must call loadTermsIndex if you had specified -1 for indexDivisor + if (coreIndex == null) { + throw new IllegalStateException("terms index was not loaded"); + } + coreIndex.getIndexOffset(term, result); + } + + @Override + public final void getIndexOffset(long ord, TermsIndexResult result) throws IOException { + // You must call loadTermsIndex if you had specified -1 for indexDivisor + if (coreIndex == null) { + throw new IllegalStateException("terms index was not loaded"); + } + coreIndex.getIndexOffset(ord, result); + } + + private final class CoreFieldIndex { + + final private long termBytesStart; + + // offset into index termBytes + final PackedInts.Reader termOffsets; + + // index pointers into main terms dict + final PackedInts.Reader termsDictOffsets; + + final int numIndexTerms; + + final long termsStart; + + public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException { + + this.termsStart = termsStart; + termBytesStart = termBytes.getPointer(); + + IndexInput clone = (IndexInput) in.clone(); + clone.seek(indexStart); + + // -1 is passed to mean "don't load term index", but + // if we are then later loaded it's overwritten with + // a real value + assert indexDivisor > 0; + + this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor; + + assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor; + + if (indexDivisor == 1) { + // Default (load all index terms) is fast -- slurp in the images from disk: + + try { + final long numTermBytes = packedIndexStart - indexStart; + termBytes.copy(clone, numTermBytes); + + // records offsets into main terms dict file + termsDictOffsets = PackedInts.getReader(clone); + assert termsDictOffsets.size() == numIndexTerms; + + // records offsets into byte[] term data + termOffsets = PackedInts.getReader(clone); + assert termOffsets.size() == 1+numIndexTerms; + } finally { + clone.close(); + } + } else { + // Get packed iterators + final IndexInput clone1 = (IndexInput) in.clone(); + final IndexInput clone2 = (IndexInput) in.clone(); + + try { + // Subsample the index terms + clone1.seek(packedIndexStart); + final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1); + + clone2.seek(packedOffsetsStart); + final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2); + + // TODO: often we can get by w/ fewer bits per + // value, below.. .but this'd be more complex: + // we'd have to try @ fewer bits and then grow + // if we overflowed it. 
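+            // Example with hypothetical settings: indexInterval=128 (the default noted
+            // in the class javadoc) and indexDivisor=2 keep only every 2nd indexed term
+            // below, so loaded index terms end up totalIndexInterval=256 terms apart.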
+ + PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue()); + PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue()); + + termsDictOffsets = termsDictOffsetsM; + termOffsets = termOffsetsM; + + int upto = 0; + + long termOffsetUpto = 0; + + while(upto < this.numIndexTerms) { + // main file offset copies straight over + termsDictOffsetsM.set(upto, termsDictOffsetsIter.next()); + + termOffsetsM.set(upto, termOffsetUpto); + upto++; + + long termOffset = termOffsetsIter.next(); + long nextTermOffset = termOffsetsIter.next(); + final int numTermBytes = (int) (nextTermOffset - termOffset); + + clone.seek(indexStart + termOffset); + assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length(); + assert indexStart + termOffset + numTermBytes < clone.length(); + + termBytes.copy(clone, numTermBytes); + termOffsetUpto += numTermBytes; + + // skip terms: + termsDictOffsetsIter.next(); + for(int i=0;i= lo) { + int mid = (lo + hi) >>> 1; + + final long offset = termOffsets.get(mid); + final int length = (int) (termOffsets.get(1+mid) - offset); + termBytes.fill(result.term, termBytesStart + offset, length); + + int delta = termComp.compare(term, result.term); + if (delta < 0) { + hi = mid - 1; + } else if (delta > 0) { + lo = mid + 1; + } else { + assert mid >= 0; + result.position = mid*totalIndexInterval; + result.offset = termsStart + termsDictOffsets.get(mid); + return; + } + } + if (hi < 0) { + assert hi == -1; + hi = 0; + } + + final long offset = termOffsets.get(hi); + final int length = (int) (termOffsets.get(1+hi) - offset); + termBytes.fill(result.term, termBytesStart + offset, length); + + result.position = hi*totalIndexInterval; + result.offset = termsStart + termsDictOffsets.get(hi); + } + + public final void getIndexOffset(long ord, TermsIndexResult result) throws IOException { + int idx = (int) (ord / totalIndexInterval); + // caller must ensure ord is in bounds + assert idx < numIndexTerms; + fillResult(idx, result); + } + } + } + + @Override + public void loadTermsIndex(int indexDivisor) throws IOException { + if (!indexLoaded) { + + this.indexDivisor = indexDivisor; + this.totalIndexInterval = indexInterval * indexDivisor; + + Iterator it = fields.values().iterator(); + while(it.hasNext()) { + it.next().loadTermsIndex(); + } + + indexLoaded = true; + in.close(); + termBytes.finish(); + } + } + + @Override + public FieldReader getField(FieldInfo fieldInfo) { + return fields.get(fieldInfo); + } + + public static void files(Directory dir, SegmentInfo info, Collection files) { + files.add(IndexFileNames.segmentFileName(info.name, StandardCodec.TERMS_INDEX_EXTENSION)); + } + + public static void getIndexExtensions(Collection extensions) { + extensions.add(StandardCodec.TERMS_INDEX_EXTENSION); + } + + @Override + public void getExtensions(Collection extensions) { + getIndexExtensions(extensions); + } + + @Override + public void close() throws IOException { + if (in != null && !indexLoaded) { + in.close(); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java new file mode 100644 index 00000000000..09bfbcd37a3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java @@ -0,0 +1,186 @@ +package 
org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +import java.util.List; +import java.util.ArrayList; +import java.io.IOException; + +/** @lucene.experimental */ +public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter { + final private IndexOutput out; + + final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX"; + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final private int termIndexInterval; + + private final List fields = new ArrayList(); + private final FieldInfos fieldInfos; // unread + private IndexOutput termsOut; + + public SimpleStandardTermsIndexWriter(SegmentWriteState state) throws IOException { + final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, StandardCodec.TERMS_INDEX_EXTENSION); + state.flushedFiles.add(indexFileName); + termIndexInterval = state.termIndexInterval; + out = state.directory.createOutput(indexFileName); + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + fieldInfos = state.fieldInfos; + + // Placeholder for dir offset + out.writeLong(0); + out.writeInt(termIndexInterval); + } + + @Override + public void setTermsOutput(IndexOutput termsOut) { + this.termsOut = termsOut; + } + + @Override + public FieldWriter addField(FieldInfo field) { + SimpleFieldWriter writer = new SimpleFieldWriter(field); + fields.add(writer); + return writer; + } + + private class SimpleFieldWriter extends FieldWriter { + final FieldInfo fieldInfo; + int numIndexTerms; + final long indexStart; + final long termsStart; + long packedIndexStart; + long packedOffsetsStart; + private int numTerms; + + // TODO: we could conceivably make a PackedInts wrapper + // that auto-grows... 
then we wouldn't force 6 bytes RAM + // per index term: + private short[] termLengths; + private int[] termsPointerDeltas; + private long lastTermsPointer; + private long totTermLength; + + SimpleFieldWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + indexStart = out.getFilePointer(); + termsStart = lastTermsPointer = termsOut.getFilePointer(); + termLengths = new short[0]; + termsPointerDeltas = new int[0]; + } + + @Override + public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException { + // First term is first indexed term: + if (0 == (numTerms++ % termIndexInterval)) { + + // write full bytes + out.writeBytes(text.bytes, text.offset, text.length); + + if (termLengths.length == numIndexTerms) { + termLengths = ArrayUtil.grow(termLengths); + } + if (termsPointerDeltas.length == numIndexTerms) { + termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas); + } + + // save delta terms pointer + final long fp = termsOut.getFilePointer(); + termsPointerDeltas[numIndexTerms] = (int) (fp - lastTermsPointer); + lastTermsPointer = fp; + + // save term length (in bytes) + assert text.length <= Short.MAX_VALUE; + termLengths[numIndexTerms] = (short) text.length; + + totTermLength += text.length; + + numIndexTerms++; + return true; + } else { + return false; + } + } + + @Override + public void finish() throws IOException { + + // write primary terms dict offsets + packedIndexStart = out.getFilePointer(); + + final long maxValue = termsOut.getFilePointer(); + PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue)); + + // relative to our indexStart + long upto = 0; + for(int i=0;i files) throws IOException { + StandardPostingsReaderImpl.files(dir, segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set extensions) { + getStandardExtensions(extensions); + } + + public static void getStandardExtensions(Set extensions) { + extensions.add(FREQ_EXTENSION); + extensions.add(PROX_EXTENSION); + StandardTermsDictReader.getExtensions(extensions); + SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java new file mode 100644 index 00000000000..ede7c877526 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java @@ -0,0 +1,56 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Closeable; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; + +/** StandardTermsDictReader interacts with a single instance + * of this to manage creation of {@link DocsEnum} and + * {@link DocsAndPositionsEnum} instances. It provides an + * IndexInput (termsIn) where this class may read any + * previously stored data that it had written in its + * corresponding {@link StandardPostingsWriter} at indexing + * time. + * @lucene.experimental */ + +public abstract class StandardPostingsReader implements Closeable { + + public abstract void init(IndexInput termsIn) throws IOException; + + /** Return a newly created empty TermState */ + public abstract TermState newTermState() throws IOException; + + public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState state, boolean isIndexTerm) throws IOException; + + /** Must fully consume state, since after this call that + * TermState may be reused. */ + public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException; + + /** Must fully consume state, since after this call that + * TermState may be reused. */ + public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; + + public abstract void close() throws IOException; +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java new file mode 100644 index 00000000000..b23ca647c80 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java @@ -0,0 +1,594 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** Concrete class that reads the current doc/freq/skip + * postings format. 
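Editorial sketch, not part of the patch: how a terms dictionary might drive the abstract StandardPostingsReader above for a single term. The docFreq, termsIn, fieldInfo and skipDocs arguments are placeholders for values the terms dict reader would normally supply.

import java.io.IOException;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
import org.apache.lucene.index.codecs.standard.TermState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;

class PostingsReadSketch {
  // Pulls the term's metadata from the terms dict stream, then asks the
  // postings reader for a DocsEnum over that term's postings.
  static DocsEnum docsForTerm(StandardPostingsReader postings, IndexInput termsIn,
                              FieldInfo fieldInfo, int docFreq, boolean isIndexTerm,
                              Bits skipDocs) throws IOException {
    TermState state = postings.newTermState();
    state.docFreq = docFreq;            // normally read from the terms dict entry itself
    postings.readTerm(termsIn, fieldInfo, state, isIndexTerm);
    return postings.docs(fieldInfo, state, skipDocs, null);
  }
}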
+ * @lucene.experimental */ + +public class StandardPostingsReaderImpl extends StandardPostingsReader { + + private final IndexInput freqIn; + private final IndexInput proxIn; + + int skipInterval; + int maxSkipLevels; + + public StandardPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, StandardCodec.FREQ_EXTENSION), + readBufferSize); + if (segmentInfo.getHasProx()) { + boolean success = false; + try { + proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, StandardCodec.PROX_EXTENSION), + readBufferSize); + success = true; + } finally { + if (!success) { + freqIn.close(); + } + } + } else { + proxIn = null; + } + } + + public static void files(Directory dir, SegmentInfo segmentInfo, Collection files) throws IOException { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, StandardCodec.FREQ_EXTENSION)); + if (segmentInfo.getHasProx()) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, StandardCodec.PROX_EXTENSION)); + } + } + + @Override + public void init(IndexInput termsIn) throws IOException { + + // Make sure we are talking to the matching past writer + CodecUtil.checkHeader(termsIn, StandardPostingsWriterImpl.CODEC, StandardPostingsWriterImpl.VERSION_START); + + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + } + + private static class DocTermState extends TermState { + long freqOffset; + long proxOffset; + int skipOffset; + + public Object clone() { + DocTermState other = (DocTermState) super.clone(); + other.freqOffset = freqOffset; + other.proxOffset = proxOffset; + other.skipOffset = skipOffset; + return other; + } + + public void copy(TermState _other) { + super.copy(_other); + DocTermState other = (DocTermState) _other; + freqOffset = other.freqOffset; + proxOffset = other.proxOffset; + skipOffset = other.skipOffset; + } + + public String toString() { + return super.toString() + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " skipOffset=" + skipOffset; + } + } + + @Override + public TermState newTermState() { + return new DocTermState(); + } + + @Override + public void close() throws IOException { + try { + if (freqIn != null) { + freqIn.close(); + } + } finally { + if (proxIn != null) { + proxIn.close(); + } + } + } + + @Override + public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) + throws IOException { + + final DocTermState docTermState = (DocTermState) termState; + + if (isIndexTerm) { + docTermState.freqOffset = termsIn.readVLong(); + } else { + docTermState.freqOffset += termsIn.readVLong(); + } + + if (docTermState.docFreq >= skipInterval) { + docTermState.skipOffset = termsIn.readVInt(); + } else { + docTermState.skipOffset = 0; + } + + if (!fieldInfo.omitTermFreqAndPositions) { + if (isIndexTerm) { + docTermState.proxOffset = termsIn.readVLong(); + } else { + docTermState.proxOffset += termsIn.readVLong(); + } + } + } + + @Override + public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException { + final SegmentDocsEnum docsEnum; + if (reuse == null) { + docsEnum = new SegmentDocsEnum(freqIn); + } else { + docsEnum = (SegmentDocsEnum) reuse; + } + return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws 
IOException { + if (fieldInfo.omitTermFreqAndPositions) { + return null; + } + final SegmentDocsAndPositionsEnum docsEnum; + if (reuse == null) { + docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn); + } else { + docsEnum = (SegmentDocsAndPositionsEnum) reuse; + } + return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs); + } + + // Decodes only docs + private class SegmentDocsEnum extends DocsEnum { + final IndexInput freqIn; + + boolean omitTF; // does current field omit term freq? + boolean storePayloads; // does current field store payloads? + + int limit; // number of docs in this posting + int ord; // how many docs we've read + int doc; // doc we last read + int freq; // freq we last read + + Bits skipDocs; + + long freqOffset; + int skipOffset; + + boolean skipped; + DefaultSkipListReader skipper; + + public SegmentDocsEnum(IndexInput freqIn) throws IOException { + this.freqIn = (IndexInput) freqIn.clone(); + } + + public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException { + omitTF = fieldInfo.omitTermFreqAndPositions; + if (omitTF) { + freq = 1; + } + storePayloads = fieldInfo.storePayloads; + this.skipDocs = skipDocs; + freqOffset = termState.freqOffset; + skipOffset = termState.skipOffset; + + // TODO: for full enum case (eg segment merging) this + // seek is unnecessary; maybe we can avoid in such + // cases + freqIn.seek(termState.freqOffset); + limit = termState.docFreq; + ord = 0; + doc = 0; + + skipped = false; + + return this; + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (ord == limit) { + return doc = NO_MORE_DOCS; + } + + ord++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + } + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + return doc; + } + + @Override + public int read() throws IOException { + + final int[] docs = bulkResult.docs.ints; + final int[] freqs = bulkResult.freqs.ints; + int i = 0; + final int length = docs.length; + while (i < length && ord < limit) { + ord++; + // manually inlined call to next() for speed + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + } + + if (skipDocs == null || !skipDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + ++i; + } + } + + return i; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int advance(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
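A minimal sketch (illustrative only, not part of the patch) of the doc/freq encoding that nextDoc() above decodes: each posting stores the doc-ID delta shifted left by one bit, with the low bit flagging an implicit freq of one. Shown here collecting vInt values into a list instead of writing to an IndexOutput.

import java.util.ArrayList;
import java.util.List;

class DocFreqEncodingSketch {
  // Encodes one posting for a field that does not omit term frequencies.
  static List<Integer> encode(int docID, int lastDocID, int freq) {
    List<Integer> vints = new ArrayList<Integer>();
    int delta = docID - lastDocID;
    if (freq == 1) {
      vints.add((delta << 1) | 1);   // low bit set: freq is implicitly one
    } else {
      vints.add(delta << 1);         // low bit clear: freq follows as its own vInt
      vints.add(freq);
    }
    return vints;
  }
}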
+ + if (skipOffset > 0) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped since reset() was called, so now we + // load the skip data for this posting + + skipper.init(freqOffset + skipOffset, + freqOffset, 0, + limit, storePayloads); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + if (newOrd > ord) { + // Skipper moved + + ord = newOrd; + doc = skipper.getDoc(); + freqIn.seek(skipper.getFreqPointer()); + } + } + + // scan for the rest: + do { + nextDoc(); + } while (target > doc); + + return doc; + } + } + + // Decodes docs & positions + private class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum { + private final IndexInput freqIn; + private final IndexInput proxIn; + + boolean storePayloads; // does current field store payloads? + + int limit; // number of docs in this posting + int ord; // how many docs we've read + int doc; // doc we last read + int freq; // freq we last read + int position; + + Bits skipDocs; + + long freqOffset; + int skipOffset; + long proxOffset; + + int posPendingCount; + int payloadLength; + boolean payloadPending; + + boolean skipped; + DefaultSkipListReader skipper; + private BytesRef payload; + private long lazyProxPointer; + + public SegmentDocsAndPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException { + this.freqIn = (IndexInput) freqIn.clone(); + this.proxIn = (IndexInput) proxIn.clone(); + } + + public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException { + assert !fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + if (storePayloads && payload == null) { + payload = new BytesRef(); + payload.bytes = new byte[1]; + } + + this.skipDocs = skipDocs; + + // TODO: for full enum case (eg segment merging) this + // seek is unnecessary; maybe we can avoid in such + // cases + freqIn.seek(termState.freqOffset); + lazyProxPointer = termState.proxOffset; + + limit = termState.docFreq; + ord = 0; + doc = 0; + position = 0; + + skipped = false; + posPendingCount = 0; + payloadPending = false; + + freqOffset = termState.freqOffset; + proxOffset = termState.proxOffset; + skipOffset = termState.skipOffset; + + return this; + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (ord == limit) { + return doc = NO_MORE_DOCS; + } + + ord++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + posPendingCount += freq; + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + position = 0; + + return doc; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int advance(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
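For illustration only (not part of this commit): the caller-side contract that this advance() implementation serves. Consumers may mix advance(target) and nextDoc(), and both return NO_MORE_DOCS once the posting list is exhausted.

import java.io.IOException;

import org.apache.lucene.index.DocsEnum;

class DocsEnumWalkSketch {
  // Prints every matching doc at or beyond minDoc.
  static void walkFrom(DocsEnum docsEnum, int minDoc) throws IOException {
    int doc = docsEnum.advance(minDoc);      // jump to the first doc >= minDoc
    while (doc != DocsEnum.NO_MORE_DOCS) {
      System.out.println("doc=" + doc + " freq=" + docsEnum.freq());
      doc = docsEnum.nextDoc();              // then scan sequentially
    }
  }
}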
+ + if (skipOffset > 0) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped, since reset() was called, so now we + // load the skip data for this posting + + skipper.init(freqOffset+skipOffset, + freqOffset, proxOffset, + limit, storePayloads); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + if (newOrd > ord) { + // Skipper moved + ord = newOrd; + doc = skipper.getDoc(); + freqIn.seek(skipper.getFreqPointer()); + lazyProxPointer = skipper.getProxPointer(); + posPendingCount = 0; + position = 0; + payloadPending = false; + payloadLength = skipper.getPayloadLength(); + } + } + + // Now, linear scan for the rest: + do { + nextDoc(); + } while (target > doc); + + return doc; + } + + public int nextPosition() throws IOException { + + if (lazyProxPointer != -1) { + proxIn.seek(lazyProxPointer); + lazyProxPointer = -1; + } + + if (payloadPending && payloadLength > 0) { + // payload of last position as never retrieved -- skip it + proxIn.seek(proxIn.getFilePointer() + payloadLength); + payloadPending = false; + } + + // scan over any docs that were iterated without their positions + while(posPendingCount > freq) { + + final int code = proxIn.readVInt(); + + if (storePayloads) { + if ((code & 1) != 0) { + // new payload length + payloadLength = proxIn.readVInt(); + assert payloadLength >= 0; + } + assert payloadLength != -1; + proxIn.seek(proxIn.getFilePointer() + payloadLength); + } + + posPendingCount--; + position = 0; + payloadPending = false; + } + + // read next position + if (storePayloads) { + + if (payloadPending && payloadLength > 0) { + // payload wasn't retrieved for last position + proxIn.seek(proxIn.getFilePointer()+payloadLength); + } + + final int code = proxIn.readVInt(); + if ((code & 1) != 0) { + // new payload length + payloadLength = proxIn.readVInt(); + assert payloadLength >= 0; + } + assert payloadLength != -1; + + payloadPending = true; + position += code >>> 1; + } else { + position += proxIn.readVInt(); + } + + posPendingCount--; + + assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount; + + return position; + } + + /** Returns length of payload at current position */ + public int getPayloadLength() { + assert lazyProxPointer == -1; + assert posPendingCount < freq; + return payloadLength; + } + + /** Returns the payload at this position, or null if no + * payload was indexed. 
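A rough sketch, not part of the patch, of consuming positions and payloads for the current document, assuming nextPosition(), hasPayload() and getPayload() are exposed on the abstract DocsAndPositionsEnum as they are on the concrete enum above. freq() bounds the number of nextPosition() calls, and getPayload() may be called at most once per position.

import java.io.IOException;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.util.BytesRef;

class PositionsWalkSketch {
  static void readPositions(DocsAndPositionsEnum postings) throws IOException {
    final int freq = postings.freq();
    for (int i = 0; i < freq; i++) {
      final int position = postings.nextPosition();
      if (postings.hasPayload()) {
        final BytesRef payload = postings.getPayload();   // at most once per position
        System.out.println("pos=" + position + " payloadLength=" + payload.length);
      } else {
        System.out.println("pos=" + position);
      }
    }
  }
}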
*/ + public BytesRef getPayload() throws IOException { + assert lazyProxPointer == -1; + assert posPendingCount < freq; + if (!payloadPending) { + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + } + if (payloadLength > payload.bytes.length) { + payload.grow(payloadLength); + } + proxIn.readBytes(payload.bytes, 0, payloadLength); + payload.length = payloadLength; + payloadPending = false; + + return payload; + } + + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java new file mode 100644 index 00000000000..784c4a3cc70 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java @@ -0,0 +1,43 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Closeable; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.codecs.PostingsConsumer; + +/** + * @lucene.experimental + */ + +public abstract class StandardPostingsWriter extends PostingsConsumer implements Closeable { + + public abstract void start(IndexOutput termsOut) throws IOException; + + public abstract void startTerm() throws IOException; + + /** Finishes the current term */ + public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException; + + public abstract void setField(FieldInfo fieldInfo); + + public abstract void close() throws IOException; +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriterImpl.java new file mode 100644 index 00000000000..ed4d771f78f --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriterImpl.java @@ -0,0 +1,234 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Consumes doc & freq, writing them using the current + * index file format */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** @lucene.experimental */ +public final class StandardPostingsWriterImpl extends StandardPostingsWriter { + final static String CODEC = "StandardPostingsWriterImpl"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final IndexOutput freqOut; + final IndexOutput proxOut; + final DefaultSkipListWriter skipListWriter; + final int skipInterval; + final int maxSkipLevels; + final int totalNumDocs; + IndexOutput termsOut; + + boolean omitTermFreqAndPositions; + boolean storePayloads; + // Starts a new term + long lastFreqStart; + long freqStart; + long lastProxStart; + long proxStart; + FieldInfo fieldInfo; + int lastPayloadLength; + int lastPosition; + + public StandardPostingsWriterImpl(SegmentWriteState state) throws IOException { + super(); + String fileName = IndexFileNames.segmentFileName(state.segmentName, StandardCodec.FREQ_EXTENSION); + state.flushedFiles.add(fileName); + freqOut = state.directory.createOutput(fileName); + + if (state.fieldInfos.hasProx()) { + // At least one field does not omit TF, so create the + // prox file + fileName = IndexFileNames.segmentFileName(state.segmentName, StandardCodec.PROX_EXTENSION); + state.flushedFiles.add(fileName); + proxOut = state.directory.createOutput(fileName); + } else { + // Every field omits TF so we will write no prox file + proxOut = null; + } + + totalNumDocs = state.numDocs; + + skipListWriter = new DefaultSkipListWriter(state.skipInterval, + state.maxSkipLevels, + state.numDocs, + freqOut, + proxOut); + + skipInterval = state.skipInterval; + maxSkipLevels = state.maxSkipLevels; + } + + @Override + public void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + } + + @Override + public void startTerm() { + freqStart = freqOut.getFilePointer(); + if (proxOut != null) { + proxStart = proxOut.getFilePointer(); + // force first payload to write its length + lastPayloadLength = -1; + } + skipListWriter.resetSkip(); + } + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes + @Override + public void setField(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + } + + int lastDocID; + int df; + + /** Adds a new doc in this term. If this returns null + * then we just skip consuming positions/payloads. 
*/ + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + + final int delta = docID - lastDocID; + + if (docID < 0 || (df > 0 && delta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); + } + + if ((++df % skipInterval) == 0) { + skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; + + lastDocID = docID; + if (omitTermFreqAndPositions) { + freqOut.writeVInt(delta); + } else if (1 == termDocFreq) { + freqOut.writeVInt((delta<<1) | 1); + } else { + freqOut.writeVInt(delta<<1); + freqOut.writeVInt(termDocFreq); + } + + lastPosition = 0; + } + + /** Add a new position & payload */ + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true"; + assert proxOut != null; + + final int delta = position - lastPosition; + + assert delta > 0 || position == 0 || position == -1: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) + + lastPosition = position; + + if (storePayloads) { + final int payloadLength = payload == null ? 0 : payload.length; + + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + proxOut.writeVInt((delta<<1)|1); + proxOut.writeVInt(payloadLength); + } else { + proxOut.writeVInt(delta << 1); + } + + if (payloadLength > 0) { + proxOut.writeBytes(payload.bytes, payload.offset, payloadLength); + } + } else { + proxOut.writeVInt(delta); + } + } + + @Override + public void finishDoc() { + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { + assert docCount > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? + assert docCount == df; + + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(freqStart); + } else { + // Write delta between seek points + termsOut.writeVLong(freqStart - lastFreqStart); + } + + lastFreqStart = freqStart; + + if (df >= skipInterval) { + termsOut.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart)); + } + + if (!omitTermFreqAndPositions) { + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(proxStart); + } else { + // Write delta between seek points + termsOut.writeVLong(proxStart - lastProxStart); + } + lastProxStart = proxStart; + } + + lastDocID = 0; + df = 0; + } + + @Override + public void close() throws IOException { + try { + freqOut.close(); + } finally { + if (proxOut != null) { + proxOut.close(); + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java new file mode 100644 index 00000000000..f7d66b0d867 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java @@ -0,0 +1,480 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Closeable; +import java.util.Collection; +import java.util.Iterator; +import java.util.TreeMap; +import java.util.Comparator; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.cache.Cache; +import org.apache.lucene.util.cache.DoubleBarrelLRUCache; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** Handles a terms dict, but decouples all details of + * doc/freqs/positions reading to an instance of {@link + * StandardPostingsReader}. This class is reusable for + * codecs that use a different format for + * docs/freqs/positions (though codecs are also free to + * make their own terms dict impl). + * + *

    This class also interacts with an instance of {@link + * StandardTermsIndexReader}, to abstract away the specific + * implementation of the terms dict index. + * @lucene.experimental */ + +public class StandardTermsDictReader extends FieldsProducer { + // Open input to the main terms dict file (_X.tis) + private final IndexInput in; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + private final StandardPostingsReader postingsReader; + + private final TreeMap fields = new TreeMap(); + + // Comparator that orders our terms + private final Comparator termComp; + + // Caches the most recently looked-up field + terms: + private final Cache termsCache; + + // Reads the terms index + private StandardTermsIndexReader indexReader; + + // Used as key for the terms cache + private static class FieldAndTerm { + String field; + BytesRef term; + + public FieldAndTerm() { + } + + public FieldAndTerm(FieldAndTerm other) { + field = other.field; + term = new BytesRef(other.term); + } + + public boolean equals(Object _other) { + FieldAndTerm other = (FieldAndTerm) _other; + return other.field == field && term.bytesEquals(other.term); + } + + public int hashCode() { + return field.hashCode() * 31 + term.hashCode(); + } + } + + public StandardTermsDictReader(StandardTermsIndexReader indexReader, Directory dir, FieldInfos fieldInfos, String segment, StandardPostingsReader postingsReader, int readBufferSize, + Comparator termComp, int termsCacheSize) + throws IOException { + + this.postingsReader = postingsReader; + termsCache = new DoubleBarrelLRUCache(termsCacheSize); + + this.termComp = termComp; + + in = dir.openInput(IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_EXTENSION), + readBufferSize); + + boolean success = false; + try { + CodecUtil.checkHeader(in, StandardTermsDictWriter.CODEC_NAME, StandardTermsDictWriter.VERSION_CURRENT); + + final long dirOffset = in.readLong(); + + // Have PostingsReader init itself + postingsReader.init(in); + + // Read per-field details + in.seek(dirOffset); + + final int numFields = in.readInt(); + + for(int i=0;i= 0; + final long termsStartPointer = in.readLong(); + final StandardTermsIndexReader.FieldReader fieldIndexReader; + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + fieldIndexReader = indexReader.getField(fieldInfo); + if (numTerms > 0) { + assert !fields.containsKey(fieldInfo.name); + fields.put(fieldInfo.name, new FieldReader(fieldIndexReader, fieldInfo, numTerms, termsStartPointer)); + } + } + success = true; + } finally { + if (!success) { + in.close(); + } + } + + this.indexReader = indexReader; + } + + @Override + public void loadTermsIndex(int indexDivisor) throws IOException { + indexReader.loadTermsIndex(indexDivisor); + } + + @Override + public void close() throws IOException { + try { + try { + if (indexReader != null) { + indexReader.close(); + } + } finally { + // null so if an app hangs on to us (ie, we are not + // GCable, despite being closed) we still free most + // ram + indexReader = null; + if (in != null) { + in.close(); + } + } + } finally { + try { + if (postingsReader != null) { + postingsReader.close(); + } + } finally { + for(FieldReader field : fields.values()) { + field.close(); + } + } + } + } + + public static void files(Directory dir, SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, StandardCodec.TERMS_EXTENSION)); + } + + public static void getExtensions(Collection extensions) { + 
extensions.add(StandardCodec.TERMS_EXTENSION); + } + + @Override + public FieldsEnum iterator() { + return new TermFieldsEnum(); + } + + @Override + public Terms terms(String field) throws IOException { + return fields.get(field); + } + + // Iterates through all fields + private class TermFieldsEnum extends FieldsEnum { + final Iterator it; + FieldReader current; + + TermFieldsEnum() { + it = fields.values().iterator(); + } + + @Override + public String next() { + if (it.hasNext()) { + current = it.next(); + return current.fieldInfo.name; + } else { + current = null; + return null; + } + } + + @Override + public TermsEnum terms() throws IOException { + return current.iterator(); + } + } + + private class FieldReader extends Terms implements Closeable { + final long numTerms; + final FieldInfo fieldInfo; + final long termsStartPointer; + final StandardTermsIndexReader.FieldReader indexReader; + + FieldReader(StandardTermsIndexReader.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer) { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + this.termsStartPointer = termsStartPointer; + this.indexReader = fieldIndexReader; + } + + @Override + public Comparator getComparator() { + return termComp; + } + + public void close() { + super.close(); + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(); + } + + @Override + public long getUniqueTermCount() { + return numTerms; + } + + // Iterates through terms in this field + private class SegmentTermsEnum extends TermsEnum { + private final IndexInput in; + private final DeltaBytesReader bytesReader; + private final TermState state; + private boolean seekPending; + private final StandardTermsIndexReader.TermsIndexResult indexResult = new StandardTermsIndexReader.TermsIndexResult(); + private final FieldAndTerm fieldTerm = new FieldAndTerm(); + + SegmentTermsEnum() throws IOException { + in = (IndexInput) StandardTermsDictReader.this.in.clone(); + in.seek(termsStartPointer); + bytesReader = new DeltaBytesReader(in); + fieldTerm.field = fieldInfo.name; + state = postingsReader.newTermState(); + state.ord = -1; + } + + @Override + public Comparator getComparator() { + return termComp; + } + + /** Seeks until the first term that's >= the provided + * text; returns SeekStatus.FOUND if the exact term + * is found, SeekStatus.NOT_FOUND if a different term + * was found, SeekStatus.END if we hit EOF */ + @Override + public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + // Check cache + fieldTerm.term = term; + TermState cachedState; + if (useCache) { + cachedState = termsCache.get(fieldTerm); + if (cachedState != null) { + state.copy(cachedState); + seekPending = true; + bytesReader.term.copy(term); + return SeekStatus.FOUND; + } + } else { + cachedState = null; + } + + boolean doSeek = true; + + if (state.ord != -1) { + // we are positioned + + final int cmp = termComp.compare(bytesReader.term, term); + + if (cmp == 0) { + // already at the requested term + return SeekStatus.FOUND; + } + + if (cmp < 0 && + indexReader.nextIndexTerm(state.ord, indexResult) && + termComp.compare(indexResult.term, term) > 0) { + // Optimization: requested term is within the + // same index block we are now in; skip seeking + // (but do scanning): + doSeek = false; + } + } + + // Used only for assert: + final long startOrd; + + if (doSeek) { + + // As index to find biggest index term that's <= + // our text: + 
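Before the getIndexOffset call below, a rough sketch (illustrative only, not part of the patch) of the two-level lookup it performs: binary-search the in-memory index terms for the largest entry <= the target, then scan the terms file sequentially from that entry's offset. indexTerms and indexOffsets are hypothetical stand-ins for the loaded terms index.

import java.util.Arrays;

class TermsIndexLookupSketch {
  // indexTerms holds every termIndexInterval'th term, in sorted order;
  // indexOffsets holds the matching offsets into the terms dict file.
  static long scanStartOffset(String[] indexTerms, long[] indexOffsets, String target) {
    int pos = Arrays.binarySearch(indexTerms, target);
    if (pos < 0) {
      pos = -pos - 2;                  // largest indexed term that is <= target
    }
    // scan from the start if target precedes all index terms
    return pos < 0 ? indexOffsets[0] : indexOffsets[pos];
  }
}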
indexReader.getIndexOffset(term, indexResult); + + in.seek(indexResult.offset); + seekPending = false; + + // NOTE: the first next() after an index seek is + // wasteful, since it redundantly reads the same + // bytes into the buffer. We could avoid storing + // those bytes in the primary file, but then when + // scanning over an index term we'd have to + // special case it: + bytesReader.reset(indexResult.term); + + state.ord = indexResult.position-1; + assert state.ord >= -1: "ord=" + state.ord; + + startOrd = indexResult.position; + } else { + startOrd = -1; + } + + // Now scan: + while(next() != null) { + final int cmp = termComp.compare(bytesReader.term, term); + if (cmp == 0) { + + if (doSeek && useCache) { + // Store in cache + FieldAndTerm entryKey = new FieldAndTerm(fieldTerm); + cachedState = (TermState) state.clone(); + // this is fp after current term + cachedState.filePointer = in.getFilePointer(); + termsCache.put(entryKey, cachedState); + } + + return SeekStatus.FOUND; + } else if (cmp > 0) { + return SeekStatus.NOT_FOUND; + } + + // The purpose of the terms dict index is to seek + // the enum to the closest index term before the + // term we are looking for. So, we should never + // cross another index term (besides the first + // one) while we are scanning: + assert state.ord == startOrd || !indexReader.isIndexTerm(state.ord, state.docFreq, true); + } + + return SeekStatus.END; + } + + @Override + public SeekStatus seek(long ord) throws IOException { + + // TODO: should we cache term lookup by ord as well...? + + if (ord >= numTerms) { + state.ord = numTerms-1; + return SeekStatus.END; + } + + indexReader.getIndexOffset(ord, indexResult); + in.seek(indexResult.offset); + seekPending = false; + + // NOTE: the first next() after an index seek is + // wasteful, since it redundantly reads the same + // bytes into the buffer + bytesReader.reset(indexResult.term); + + state.ord = indexResult.position-1; + assert state.ord >= -1: "ord=" + state.ord; + + // Now, scan: + int left = (int) (ord - state.ord); + while(left > 0) { + final BytesRef term = next(); + assert term != null; + left--; + } + + // always found + return SeekStatus.FOUND; + } + + @Override + public BytesRef term() { + return bytesReader.term; + } + + @Override + public long ord() { + return state.ord; + } + + @Override + public BytesRef next() throws IOException { + + if (seekPending) { + seekPending = false; + in.seek(state.filePointer); + } + + if (state.ord >= numTerms-1) { + return null; + } + + bytesReader.read(); + state.docFreq = in.readVInt(); + + // TODO: would be cleaner, but space-wasting, to + // simply record a bit into each index entry as to + // whether it's an index entry or not, rather than + // re-compute that information... 
or, possibly store + // a "how many terms until next index entry" in each + // index entry, but that'd require some tricky + // lookahead work when writing the index + postingsReader.readTerm(in, + fieldInfo, state, + indexReader.isIndexTerm(1+state.ord, state.docFreq, false)); + + state.ord++; + + return bytesReader.term; + } + + @Override + public int docFreq() { + return state.docFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse); + assert docsEnum != null; + return docsEnum; + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (fieldInfo.omitTermFreqAndPositions) { + return null; + } else { + return postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse); + } + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java new file mode 100644 index 00000000000..fe58dc638d2 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java @@ -0,0 +1,176 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Comparator; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.CodecUtil; + +/** + * Writes terms dict and interacts with docs/positions + * consumers to write the postings files. + * + * The [new] terms dict format is field-centric: each field + * has its own section in the file. Fields are written in + * UTF16 string comparison order. Within each field, each + * term's text is written in UTF16 string comparison order. 
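Editorial sketch, not part of the patch: the call sequence an indexer drives against this FieldsConsumer API for a single field holding one term. Positions and payloads are elided, and all arguments are placeholders.

import java.io.IOException;

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.util.BytesRef;

class TermsDictWriteSketch {
  // docIDs must be sorted; each doc gets a term frequency of 1 for brevity.
  static void writeSingleTermField(FieldsConsumer consumer, FieldInfo fieldInfo,
                                   BytesRef term, int[] docIDs) throws IOException {
    TermsConsumer terms = consumer.addField(fieldInfo);
    PostingsConsumer postings = terms.startTerm(term);
    for (int docID : docIDs) {
      postings.startDoc(docID, 1);
      postings.finishDoc();
    }
    terms.finishTerm(term, docIDs.length);
    terms.finish();
  }
}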
+ * @lucene.experimental + */ + +public class StandardTermsDictWriter extends FieldsConsumer { + + final static String CODEC_NAME = "STANDARD_TERMS_DICT"; + + // Initial format + public static final int VERSION_START = 0; + + public static final int VERSION_CURRENT = VERSION_START; + + private final DeltaBytesWriter termWriter; + + final IndexOutput out; + final StandardPostingsWriter postingsWriter; + final FieldInfos fieldInfos; + FieldInfo currentField; + private final StandardTermsIndexWriter indexWriter; + private final List fields = new ArrayList(); + private final Comparator termComp; + + public StandardTermsDictWriter(StandardTermsIndexWriter indexWriter, SegmentWriteState state, StandardPostingsWriter postingsWriter, Comparator termComp) throws IOException { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, StandardCodec.TERMS_EXTENSION); + this.indexWriter = indexWriter; + this.termComp = termComp; + out = state.directory.createOutput(termsFileName); + indexWriter.setTermsOutput(out); + state.flushedFiles.add(termsFileName); + + fieldInfos = state.fieldInfos; + + // Count indexed fields up front + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + + out.writeLong(0); // leave space for end index pointer + + termWriter = new DeltaBytesWriter(out); + currentField = null; + this.postingsWriter = postingsWriter; + + postingsWriter.start(out); // have consumer write its format/header + } + + @Override + public TermsConsumer addField(FieldInfo field) { + assert currentField == null || currentField.name.compareTo(field.name) < 0; + currentField = field; + StandardTermsIndexWriter.FieldWriter fieldIndexWriter = indexWriter.addField(field); + TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter); + fields.add(terms); + return terms; + } + + @Override + public void close() throws IOException { + + try { + final int fieldCount = fields.size(); + + final long dirStart = out.getFilePointer(); + + out.writeInt(fieldCount); + for(int i=0;i getComparator() { + return termComp; + } + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + postingsWriter.startTerm(); + return postingsWriter; + } + + @Override + public void finishTerm(BytesRef text, int numDocs) throws IOException { + + assert numDocs > 0; + + final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs); + + termWriter.write(text); + out.writeVInt(numDocs); + + postingsWriter.finishTerm(numDocs, isIndexTerm); + numTerms++; + } + + // Finishes all terms in this field + @Override + public void finish() throws IOException { + fieldIndexWriter.finish(); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexReader.java new file mode 100644 index 00000000000..26bc0303787 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexReader.java @@ -0,0 +1,76 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; +import java.util.Collection; + + +// TODO +// - allow for non-regular index intervals? eg with a +// long string of rare terms, you don't need such +// frequent indexing + +/** + * TermsDictReader interacts with an instance of this class + * to manage its terms index. The writer must accept + * indexed terms (many pairs of CharSequence text + long + * fileOffset), and then this reader must be able to + * retrieve the nearest index term to a provided term + * text. + * @lucene.experimental */ + +public abstract class StandardTermsIndexReader { + + static class TermsIndexResult { + long position; + final BytesRef term = new BytesRef(); + long offset; + }; + + public abstract class FieldReader { + /** Returns position of "largest" index term that's <= + * text. Returned TermsIndexResult may be reused + * across calls. This resets internal state, and + * expects that you'll then scan the file and + * sequentially call isIndexTerm for each term + * encountered. */ + public abstract void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException; + + public abstract void getIndexOffset(long ord, TermsIndexResult result) throws IOException; + + /** Call this sequentially for each term encoutered, + * after calling {@link #getIndexOffset}. */ + public abstract boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) throws IOException; + + /** Finds the next index term, after the specified + * ord. Returns true if one exists. */ + public abstract boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException; + } + + public abstract FieldReader getField(FieldInfo fieldInfo); + + public abstract void loadTermsIndex(int indexDivisor) throws IOException; + + public abstract void close() throws IOException; + + public abstract void getExtensions(Collection extensions); +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java new file mode 100644 index 00000000000..6c64c7fec68 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java @@ -0,0 +1,38 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.BytesRef; +import java.io.IOException; + +/** @lucene.experimental */ +public abstract class StandardTermsIndexWriter { + + public abstract void setTermsOutput(IndexOutput out); + + public abstract class FieldWriter { + public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException; + public abstract void finish() throws IOException; + } + + public abstract FieldWriter addField(FieldInfo fieldInfo); + + public abstract void close() throws IOException; +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java new file mode 100644 index 00000000000..24974caae89 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java @@ -0,0 +1,54 @@ +package org.apache.lucene.index.codecs.standard; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.DocsEnum; // for javadocs + +/** + * Holds all state required for {@link StandardPostingsReader} + * to produce a {@link DocsEnum} without re-seeking the + * terms dict. + * @lucene.experimental + */ + +public class TermState implements Cloneable { + public long ord; // ord for this term + public long filePointer; // fp into the terms dict primary file (_X.tis) + public int docFreq; // how many docs have this term + + public void copy(TermState other) { + ord = other.ord; + filePointer = other.filePointer; + docFreq = other.docFreq; + } + + @Override + public Object clone() { + try { + return super.clone(); + } catch (CloneNotSupportedException cnse) { + // should not happen + throw new RuntimeException(cnse); + } + } + + @Override + public String toString() { + return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java new file mode 100644 index 00000000000..2be034a35b1 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java @@ -0,0 +1,151 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.SpecialOperations; + +/** + * A {@link Query} that will match terms against a finite-state machine. + *

    + * This query will match documents that contain terms accepted by a given + * finite-state machine. The automaton can be constructed with the + * {@link org.apache.lucene.util.automaton} API. Alternatively, it can be + * created from a regular expression with {@link RegexpQuery} or from + * the standard Lucene wildcard syntax with {@link WildcardQuery}. + *

    + * <p>
    + * When the query is executed, it will create an equivalent minimal DFA of the + * finite-state machine, and will enumerate the term dictionary in an + * intelligent way to reduce the number of comparisons. For example: the regular + * expression of <code>[dl]og?</code> will make approximately four comparisons: + * <code>do, dog, lo, and log.</code> + *
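For illustration only (not part of this commit): building an automaton that accepts exactly dog and log and wrapping it in an AutomatonQuery; the "body" field name and the term text are placeholders.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;

class AutomatonQuerySketch {
  static Query dogOrLog() {
    // Union of two singleton automata: the language {"dog", "log"}.
    Automaton a = BasicOperations.union(
        BasicAutomata.makeString("dog"),
        BasicAutomata.makeString("log"));
    return new AutomatonQuery(new Term("body", "[dl]og"), a);
  }
}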

    + * @lucene.experimental + */ +public class AutomatonQuery extends MultiTermQuery { + /** the automaton to match index terms against */ + protected Automaton automaton; + /** term containing the field, and possibly some pattern structure */ + protected Term term; + + /** + * Create a new AutomatonQuery from an {@link Automaton}. + * + * @param term Term containing field and possibly some pattern structure. The + * term text is ignored. + * @param automaton Automaton to run, terms that are accepted are considered a + * match. + */ + public AutomatonQuery(Term term, Automaton automaton) { + super(term.field()); + this.term = term; + this.automaton = automaton; + MinimizationOperations.minimize(automaton); + } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + // matches nothing + if (BasicOperations.isEmpty(automaton)) { + return TermsEnum.EMPTY; + } + + // matches all possible strings + if (BasicOperations.isTotal(automaton)) { + return MultiFields.getTerms(reader, getField()).iterator(); + } + + // matches a fixed string in singleton representation + String singleton = automaton.getSingleton(); + if (singleton != null) + return new SingleTermsEnum(reader, term.createTerm(singleton)); + + // matches a fixed string in expanded representation + String commonPrefix = SpecialOperations.getCommonPrefix(automaton); + if (automaton.equals(BasicAutomata.makeString(commonPrefix))) { + return new SingleTermsEnum(reader, term.createTerm(commonPrefix)); + } + + // matches a constant prefix + Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata + .makeString(commonPrefix), BasicAutomata.makeAnyString()); + if (automaton.equals(prefixAutomaton)) { + return new PrefixTermsEnum(reader, term.createTerm(commonPrefix)); + } + + return new AutomatonTermsEnum(automaton, term, reader); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((automaton == null) ? 0 : automaton.hashCode()); + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (!super.equals(obj)) + return false; + if (getClass() != obj.getClass()) + return false; + AutomatonQuery other = (AutomatonQuery) obj; + if (automaton == null) { + if (other.automaton != null) + return false; + } else if (!automaton.equals(other.automaton)) + return false; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + if (!term.field().equals(field)) { + buffer.append(term.field()); + buffer.append(":"); + } + buffer.append(getClass().getSimpleName()); + buffer.append(" {"); + buffer.append('\n'); + buffer.append(automaton.toString()); + buffer.append("}"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java b/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java new file mode 100644 index 00000000000..43cda5ef62a --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java @@ -0,0 +1,377 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +/** + * A FilteredTermsEnum that enumerates terms based upon what is accepted by a + * DFA. + *

+ * The algorithm is as follows: + *

      + *
    1. As long as matches are successful, keep reading sequentially. + *
    2. When a match fails, skip to the next string in lexicographic order that + * does not enter a reject state. + *
    + *

+ * The algorithm does not attempt to skip all the way to the next string that is + * fully accepted: this is not possible when the language accepted by the + * FSM is infinite (e.g. when the pattern uses the * operator). A toy sketch of the + * two steps above follows. + *
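To make the two steps above concrete, here is a small, self-contained toy (an illustration only, not part of this patch): a sorted TreeSet stands in for the term dictionary, a regex with the constant prefix "ban" stands in for the DFA, and the ceiling() call plays the role of the seek in step 2; every name in it is invented for the example.

    import java.util.List;
    import java.util.TreeSet;
    import java.util.regex.Pattern;

    public class SkipEnumDemo {
      public static void main(String[] args) {
        TreeSet<String> dict = new TreeSet<>(List.of(
            "apple", "apply", "banana", "band", "bandana", "can"));
        Pattern accept = Pattern.compile("ban.*"); // stand-in for the automaton
        String prefix = "ban";                     // lower bound implied by the automaton

        String term = dict.first();
        while (term != null) {
          if (accept.matcher(term).matches()) {
            System.out.println("hit: " + term);
            term = dict.higher(term);             // step 1: sequential read
          } else if (term.compareTo(prefix) < 0) {
            term = dict.ceiling(prefix);          // step 2: seek past the rejecting run
          } else {
            term = dict.higher(term);             // no tighter bound known, just advance
          }
        }
      }
    }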

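For orientation, a hedged usage sketch built only from constructors and methods visible in this patch (AutomatonQuery, AutomatonTermsEnum, BasicAutomata, BasicOperations, the BytesRef-returning next()); the reader variable, the "body" field and the "lucen" prefix are placeholders, not taken from the patch:

    // assumes an already-open IndexReader `reader` and the imports used
    // elsewhere in this patch (org.apache.lucene.util.automaton.*, etc.)
    Automaton a = BasicOperations.concatenate(
        BasicAutomata.makeString("lucen"), BasicAutomata.makeAnyString());

    // as a query: getTermsEnum() above recognizes this prefix shape and
    // hands back a PrefixTermsEnum instead of the generic AutomatonTermsEnum
    Query q = new AutomatonQuery(new Term("body", "lucen"), a);

    // or drive the terms enumeration directly
    TermsEnum te = new AutomatonTermsEnum(a, new Term("body", ""), reader);
    for (BytesRef t = te.next(); t != null; t = te.next()) {
      System.out.println(t.utf8ToString());
    }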
    + * @lucene.experimental + */ +public class AutomatonTermsEnum extends FilteredTermsEnum { + // the object-oriented form of the DFA + private final Automaton automaton; + // a tableized array-based form of the DFA + private final RunAutomaton runAutomaton; + // common suffix of the automaton + private final BytesRef commonSuffixRef; + // true if the automaton accepts a finite language + private final boolean finite; + // array of sorted transitions for each state, indexed by state number + private final Transition[][] allTransitions; + // for path tracking: each long records gen when we last + // visited the state; we use gens to avoid having to clear + private final long[] visited; + private long curGen; + // used for unicode conversion from BytesRef byte[] to char[] + private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + // the reference used for seeking forwards through the term dictionary + private final BytesRef seekBytesRef = new BytesRef(10); + // true if we are enumerating an infinite portion of the DFA. + // in this case it is faster to drive the query based on the terms dictionary. + // when this is true, linearUpperBound indicate the end of range + // of terms where we should simply do sequential reads instead. + private boolean linear = false; + private final BytesRef linearUpperBound = new BytesRef(10); + private final UnicodeUtil.UTF16Result linearUpperBoundUTF16 = new UnicodeUtil.UTF16Result(); + private final Comparator termComp; + + /** + * Expert ctor: + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + *

    + * @lucene.internal Use the public ctor instead. This constructor allows the + * (dangerous) option of passing in a pre-compiled RunAutomaton. If you use + * this ctor and compile your own RunAutomaton, you are responsible for + * ensuring it is in sync with the Automaton object, including internal + * State numbering, or you will get undefined behavior. + *

    + * @param preCompiled optional pre-compiled RunAutomaton (can be null) + * @param finite true if the automaton accepts a finite language + */ + AutomatonTermsEnum(Automaton automaton, RunAutomaton preCompiled, + Term queryTerm, IndexReader reader, boolean finite) + throws IOException { + super(reader, queryTerm.field()); + this.automaton = automaton; + this.finite = finite; + + /* + * tableize the automaton. this also ensures it is deterministic, and has no + * transitions to dead states. it also invokes Automaton.setStateNumbers to + * number the original states (this is how they are tableized) + */ + if (preCompiled == null) + runAutomaton = new RunAutomaton(this.automaton); + else + runAutomaton = preCompiled; + + commonSuffixRef = finite ? null : new BytesRef(getValidUTF16Suffix(SpecialOperations + .getCommonSuffix(automaton))); + + // build a cache of sorted transitions for every state + allTransitions = new Transition[runAutomaton.getSize()][]; + for (State state : this.automaton.getStates()) + allTransitions[state.getNumber()] = state.getSortedTransitionArray(false); + // used for path tracking, where each bit is a numbered state. + visited = new long[runAutomaton.getSize()]; + + setUseTermsCache(finite); + termComp = getComparator(); + } + + /** + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + *

    + * It will automatically calculate whether or not the automaton is finite + */ + public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader) + throws IOException { + this(automaton, null, queryTerm, reader, SpecialOperations.isFinite(automaton)); + } + + /** + * Returns true if the term matches the automaton. Also stashes away the term + * to assist with smart enumeration. + */ + @Override + protected AcceptStatus accept(final BytesRef term) { + if (commonSuffixRef == null || term.endsWith(commonSuffixRef)) { + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + if (runAutomaton.run(utf16.result, 0, utf16.length)) + return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; + else + return (linear && termComp.compare(term, linearUpperBound) < 0) ? + AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; + } else { + return (linear && termComp.compare(term, linearUpperBound) < 0) ? + AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; + } + } + + @Override + protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { + if (term == null) { + // return the empty term, as its valid + if (runAutomaton.run("")) { + seekBytesRef.copy(""); + return seekBytesRef; + } + + utf16.copyText(""); + } else { + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + } + + // seek to the next possible string; + if (nextString()) { + // reposition + if (linear) + setLinear(infinitePosition); + UnicodeUtil.nextValidUTF16String(utf16); + UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, seekBytesRef); + return seekBytesRef; + } + // no more possible strings can match + return null; + } + + // this instance prevents unicode conversion during backtracking, + // we can just call setLinear once at the end. + int infinitePosition; + + /** + * Sets the enum to operate in linear fashion, as we have found + * a looping transition at position + */ + private void setLinear(int position) { + int state = runAutomaton.getInitialState(); + char maxInterval = 0xffff; + for (int i = 0; i < position; i++) + state = runAutomaton.step(state, utf16.result[i]); + for (int i = 0; i < allTransitions[state].length; i++) { + Transition t = allTransitions[state][i]; + if (t.getMin() <= utf16.result[position] && utf16.result[position] <= t.getMax()) { + maxInterval = t.getMax(); + break; + } + } + // 0xffff terms don't get the optimization... not worth the trouble. + if (maxInterval < 0xffff) + maxInterval++; + int length = position + 1; /* position + maxTransition */ + if (linearUpperBoundUTF16.result.length < length) + linearUpperBoundUTF16.result = new char[length]; + System.arraycopy(utf16.result, 0, linearUpperBoundUTF16.result, 0, position); + linearUpperBoundUTF16.result[position] = maxInterval; + linearUpperBoundUTF16.setLength(length); + UnicodeUtil.nextValidUTF16String(linearUpperBoundUTF16); + UnicodeUtil.UTF16toUTF8(linearUpperBoundUTF16.result, 0, length, linearUpperBound); + } + + /** + * Increments the utf16 buffer to the next String in lexicographic order after s that will not put + * the machine into a reject state. If such a string does not exist, returns + * false. + * + * The correctness of this method depends upon the automaton being deterministic, + * and having no transitions to dead states. 
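As a side note on the mechanics (an editor's toy, not part of the patch; plain chars, no surrogate handling), the char-level fallback that backtrack() further below performs when the current prefix cannot be extended amounts to: increment the last character that can still grow, and drop everything after it.

    class NextCandidateDemo {
      /** Next string, in lexicographic order, once all extensions of `rejected` are exhausted. */
      static String nextCandidate(String rejected) {
        char[] buf = rejected.toCharArray();
        for (int i = buf.length - 1; i >= 0; i--) {
          if (buf[i] != '\uFFFF') {            // this position can still be incremented
            buf[i]++;
            return new String(buf, 0, i + 1);  // truncate the now-stale suffix
          }
        }
        return null;                           // every position was already at the maximum
      }

      public static void main(String[] args) {
        System.out.println(nextCandidate("abz"));     // prints "ab{"
        System.out.println(nextCandidate("a\uFFFF")); // prints "b"
      }
    }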
+ * + * @return true if more possible solutions exist for the DFA + */ + private boolean nextString() { + int state; + int pos = 0; + + while (true) { + curGen++; + linear = false; + state = runAutomaton.getInitialState(); + // walk the automaton until a character is rejected. + for (pos = 0; pos < utf16.length; pos++) { + visited[state] = curGen; + int nextState = runAutomaton.step(state, utf16.result[pos]); + if (nextState == -1) + break; + // we found a loop, record it for faster enumeration + if (!finite && !linear && visited[nextState] == curGen) { + linear = true; + infinitePosition = pos; + } + state = nextState; + } + + // take the useful portion, and the last non-reject state, and attempt to + // append characters that will match. + if (nextString(state, pos)) { + return true; + } else { /* no more solutions exist from this useful portion, backtrack */ + if (!backtrack(pos)) /* no more solutions at all */ + return false; + else if (runAutomaton.run(utf16.result, 0, utf16.length)) + /* String is good to go as-is */ + return true; + /* else advance further */ + } + } + } + + /** + * Returns the next String in lexicographic order that will not put + * the machine into a reject state. + * + * This method traverses the DFA from the given position in the String, + * starting at the given state. + * + * If this cannot satisfy the machine, returns false. This method will + * walk the minimal path, in lexicographic order, as long as possible. + * + * If this method returns false, then there might still be more solutions, + * it is necessary to backtrack to find out. + * + * @param state current non-reject state + * @param position useful portion of the string + * @return true if more possible solutions exist for the DFA from this + * position + */ + private boolean nextString(int state, int position) { + /* + * the next lexicographic character must be greater than the existing + * character, if it exists. + */ + char c = 0; + if (position < utf16.length) { + c = utf16.result[position]; + // if the next character is U+FFFF and is not part of the useful portion, + // then by definition it puts us in a reject state, and therefore this + // path is dead. there cannot be any higher transitions. backtrack. + if (c == '\uFFFF') + return false; + else + c++; + } + + utf16.setLength(position); + visited[state] = curGen; + + Transition transitions[] = allTransitions[state]; + + // find the minimal path (lexicographic order) that is >= c + + for (int i = 0; i < transitions.length; i++) { + Transition transition = transitions[i]; + if (transition.getMax() >= c) { + char nextChar = (char) Math.max(c, transition.getMin()); + // append either the next sequential char, or the minimum transition + utf16.setLength(utf16.length + 1); + utf16.result[utf16.length - 1] = nextChar; + state = transition.getDest().getNumber(); + /* + * as long as is possible, continue down the minimal path in + * lexicographic order. if a loop or accept state is encountered, stop. + */ + while (visited[state] != curGen && !runAutomaton.isAccept(state)) { + visited[state] = curGen; + /* + * Note: we work with a DFA with no transitions to dead states. + * so the below is ok, if it is not an accept state, + * then there MUST be at least one transition. 
+ */ + transition = allTransitions[state][0]; + state = transition.getDest().getNumber(); + // we found a loop, record it for faster enumeration + if (!finite && !linear && visited[state] == curGen) { + linear = true; + infinitePosition = utf16.length; + } + // append the minimum transition + utf16.setLength(utf16.length + 1); + utf16.result[utf16.length - 1] = transition.getMin(); + } + return true; + } + } + return false; + } + + /** + * Attempts to backtrack thru the string after encountering a dead end + * at some given position. Returns false if no more possible strings + * can match. + * + * @param position current position in the input String + * @return true if more possible solutions exist for the DFA + */ + private boolean backtrack(int position) { + while (position > 0) { + char nextChar = utf16.result[position - 1]; + // if a character is U+FFFF its a dead-end too, + // because there is no higher character in UTF-16 sort order. + if (nextChar != '\uFFFF') { + nextChar++; + utf16.result[position - 1] = nextChar; + utf16.setLength(position); + return true; + } + position--; + } + return false; /* all solutions exhausted */ + } + + /** + * if the suffix starts with a low surrogate, remove it. + * This won't be quite as efficient, but can be converted to valid UTF-8 + * + * This isn't nearly as complex as cleanupPosition, because its not + * going to use this suffix to walk any path thru the terms. + * + */ + private String getValidUTF16Suffix(String suffix) { + if (suffix != null && suffix.length() > 0 && + Character.isLowSurrogate(suffix.charAt(0))) + return suffix.substring(1); + else + return suffix; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java b/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java index 3f2ade7a38f..3e8a6e16319 100644 --- a/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java +++ b/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java @@ -161,8 +161,8 @@ public class ConstantScoreQuery extends Query { /** Prints a user-readable version of this query. */ @Override public String toString(String field) { - return "ConstantScore(" + filter.toString() - + (getBoost()==1.0 ? ")" : "^" + getBoost()); + return "ConstantScore(" + filter.toString() + ")" + + (getBoost()==1.0 ? "" : "^" + getBoost()); } /** Returns true if o is equal to this. 
*/ diff --git a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java index 9d56d95f2f7..55825b48ad2 100644 --- a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -22,9 +22,9 @@ import org.apache.lucene.index.*; final class ExactPhraseScorer extends PhraseScorer { - ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, + ExactPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets, Similarity similarity, byte[] norms) { - super(weight, tps, offsets, similarity, norms); + super(weight, postings, offsets, similarity, norms); } @Override @@ -42,11 +42,11 @@ final class ExactPhraseScorer extends PhraseScorer { int freq = 0; do { // find position w/ all terms while (first.position < last.position) { // scan forward in first - do { - if (!first.nextPosition()) - return freq; - } while (first.position < last.position); - firstToLast(); + do { + if (!first.nextPosition()) + return freq; + } while (first.position < last.position); + firstToLast(); } freq++; // all equal: a match } while (last.nextPosition()); diff --git a/lucene/src/java/org/apache/lucene/search/FieldCache.java b/lucene/src/java/org/apache/lucene/search/FieldCache.java index b928176f6e6..3f09fd8f206 100644 --- a/lucene/src/java/org/apache/lucene/search/FieldCache.java +++ b/lucene/src/java/org/apache/lucene/search/FieldCache.java @@ -20,6 +20,7 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.analysis.NumericTokenStream; // for javadocs @@ -100,7 +101,7 @@ public interface FieldCache { */ public interface ByteParser extends Parser { /** Return a single Byte representation of this field's value. */ - public byte parseByte(String string); + public byte parseByte(BytesRef term); } /** Interface to parse shorts from document fields. @@ -108,7 +109,7 @@ public interface FieldCache { */ public interface ShortParser extends Parser { /** Return a short representation of this field's value. */ - public short parseShort(String string); + public short parseShort(BytesRef term); } /** Interface to parse ints from document fields. @@ -116,7 +117,7 @@ public interface FieldCache { */ public interface IntParser extends Parser { /** Return an integer representation of this field's value. */ - public int parseInt(String string); + public int parseInt(BytesRef term); } /** Interface to parse floats from document fields. @@ -124,7 +125,7 @@ public interface FieldCache { */ public interface FloatParser extends Parser { /** Return an float representation of this field's value. */ - public float parseFloat(String string); + public float parseFloat(BytesRef term); } /** Interface to parse long from document fields. @@ -132,7 +133,7 @@ public interface FieldCache { */ public interface LongParser extends Parser { /** Return an long representation of this field's value. */ - public long parseLong(String string); + public long parseLong(BytesRef term); } /** Interface to parse doubles from document fields. @@ -140,16 +141,20 @@ public interface FieldCache { */ public interface DoubleParser extends Parser { /** Return an long representation of this field's value. 
*/ - public double parseDouble(String string); + public double parseDouble(BytesRef term); } /** Expert: The cache used internally by sorting and range query classes. */ public static FieldCache DEFAULT = new FieldCacheImpl(); - + /** The default parser for byte values, which are encoded by {@link Byte#toString(byte)} */ public static final ByteParser DEFAULT_BYTE_PARSER = new ByteParser() { - public byte parseByte(String value) { - return Byte.parseByte(value); + public byte parseByte(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Byte.parseByte(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_BYTE_PARSER; @@ -162,8 +167,12 @@ public interface FieldCache { /** The default parser for short values, which are encoded by {@link Short#toString(short)} */ public static final ShortParser DEFAULT_SHORT_PARSER = new ShortParser() { - public short parseShort(String value) { - return Short.parseShort(value); + public short parseShort(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Short.parseShort(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_SHORT_PARSER; @@ -176,8 +185,12 @@ public interface FieldCache { /** The default parser for int values, which are encoded by {@link Integer#toString(int)} */ public static final IntParser DEFAULT_INT_PARSER = new IntParser() { - public int parseInt(String value) { - return Integer.parseInt(value); + public int parseInt(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Integer.parseInt(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_INT_PARSER; @@ -190,8 +203,12 @@ public interface FieldCache { /** The default parser for float values, which are encoded by {@link Float#toString(float)} */ public static final FloatParser DEFAULT_FLOAT_PARSER = new FloatParser() { - public float parseFloat(String value) { - return Float.parseFloat(value); + public float parseFloat(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Float.parseFloat(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_FLOAT_PARSER; @@ -204,8 +221,12 @@ public interface FieldCache { /** The default parser for long values, which are encoded by {@link Long#toString(long)} */ public static final LongParser DEFAULT_LONG_PARSER = new LongParser() { - public long parseLong(String value) { - return Long.parseLong(value); + public long parseLong(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... 
but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Long.parseLong(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_LONG_PARSER; @@ -218,8 +239,12 @@ public interface FieldCache { /** The default parser for double values, which are encoded by {@link Double#toString(double)} */ public static final DoubleParser DEFAULT_DOUBLE_PARSER = new DoubleParser() { - public double parseDouble(String value) { - return Double.parseDouble(value); + public double parseDouble(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Double.parseDouble(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_DOUBLE_PARSER; @@ -231,15 +256,14 @@ public interface FieldCache { }; /** - * A parser instance for int values encoded by {@link NumericUtils#intToPrefixCoded(int)}, e.g. when indexed + * A parser instance for int values encoded by {@link NumericUtils}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. */ public static final IntParser NUMERIC_UTILS_INT_PARSER=new IntParser(){ - public int parseInt(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT; - if (shift>0 && shift<=31) + public int parseInt(BytesRef term) { + if (NumericUtils.getPrefixCodedIntShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.prefixCodedToInt(val); + return NumericUtils.prefixCodedToInt(term); } protected Object readResolve() { return NUMERIC_UTILS_INT_PARSER; @@ -255,11 +279,10 @@ public interface FieldCache { * via {@link NumericField}/{@link NumericTokenStream}. */ public static final FloatParser NUMERIC_UTILS_FLOAT_PARSER=new FloatParser(){ - public float parseFloat(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT; - if (shift>0 && shift<=31) + public float parseFloat(BytesRef term) { + if (NumericUtils.getPrefixCodedIntShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(val)); + return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(term)); } protected Object readResolve() { return NUMERIC_UTILS_FLOAT_PARSER; @@ -271,15 +294,14 @@ public interface FieldCache { }; /** - * A parser instance for long values encoded by {@link NumericUtils#longToPrefixCoded(long)}, e.g. when indexed + * A parser instance for long values encoded by {@link NumericUtils}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. */ public static final LongParser NUMERIC_UTILS_LONG_PARSER = new LongParser(){ - public long parseLong(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG; - if (shift>0 && shift<=63) + public long parseLong(BytesRef term) { + if (NumericUtils.getPrefixCodedLongShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.prefixCodedToLong(val); + return NumericUtils.prefixCodedToLong(term); } protected Object readResolve() { return NUMERIC_UTILS_LONG_PARSER; @@ -295,11 +317,10 @@ public interface FieldCache { * via {@link NumericField}/{@link NumericTokenStream}. 
*/ public static final DoubleParser NUMERIC_UTILS_DOUBLE_PARSER = new DoubleParser(){ - public double parseDouble(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG; - if (shift>0 && shift<=63) + public double parseDouble(BytesRef term) { + if (NumericUtils.getPrefixCodedLongShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(val)); + return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(term)); } protected Object readResolve() { return NUMERIC_UTILS_DOUBLE_PARSER; diff --git a/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java b/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java index 929c9195235..bb11d162bca 100644 --- a/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java +++ b/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java @@ -26,9 +26,12 @@ import java.util.Map; import java.util.WeakHashMap; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.FieldCacheSanityChecker; @@ -277,22 +280,29 @@ class FieldCacheImpl implements FieldCache { return wrapper.getBytes(reader, field, FieldCache.DEFAULT_BYTE_PARSER); } final byte[] retArray = new byte[reader.maxDoc()]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - byte termval = parser.parseByte(term.text()); - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final byte termval = parser.parseByte(term); + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } return retArray; } @@ -324,22 +334,29 @@ class FieldCacheImpl implements FieldCache { return wrapper.getShorts(reader, field, FieldCache.DEFAULT_SHORT_PARSER); } final short[] retArray = new short[reader.maxDoc()]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - short termval = parser.parseShort(term.text()); - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = 
termsEnum.next(); + if (term == null) { + break; + } + final short termval = parser.parseShort(term); + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } return retArray; } @@ -375,27 +392,41 @@ class FieldCacheImpl implements FieldCache { } } int[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - int termval = parser.parseInt(term.text()); - if (retArray == null) // late init - retArray = new int[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final int termval = parser.parseInt(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new int[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } - if (retArray == null) // no values + + if (retArray == null) { + // no values retArray = new int[reader.maxDoc()]; + } return retArray; } } @@ -431,29 +462,43 @@ class FieldCacheImpl implements FieldCache { } catch (NumberFormatException ne) { return wrapper.getFloats(reader, field, NUMERIC_UTILS_FLOAT_PARSER); } - } - float[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - float termval = parser.parseFloat(term.text()); - if (retArray == null) // late init - retArray = new float[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; - } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); } - if (retArray == null) // no values + float[] retArray = null; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final float termval = parser.parseFloat(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new float[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } + } + } catch (StopFillCacheException stop) { + } + } + + if (retArray == null) { + // no values retArray = new 
float[reader.maxDoc()]; + } return retArray; } } @@ -487,27 +532,41 @@ class FieldCacheImpl implements FieldCache { } } long[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term(field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - long termval = parser.parseLong(term.text()); - if (retArray == null) // late init - retArray = new long[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final long termval = parser.parseLong(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new long[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } - if (retArray == null) // no values + + if (retArray == null) { + // no values retArray = new long[reader.maxDoc()]; + } return retArray; } } @@ -543,24 +602,35 @@ class FieldCacheImpl implements FieldCache { } } double[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - double termval = parser.parseDouble(term.text()); - if (retArray == null) // late init - retArray = new double[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final double termval = parser.parseDouble(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new double[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } if (retArray == null) // no values retArray = new double[reader.maxDoc()]; @@ -584,21 +654,27 @@ class FieldCacheImpl implements FieldCache { throws IOException { String field = StringHelper.intern(entryKey.field); final String[] retArray = new String[reader.maxDoc()]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - String termval = term.text(); - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != 
null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; } - } while (termEnum.next()); - } finally { - termDocs.close(); - termEnum.close(); + docs = termsEnum.docs(delDocs, docs); + final String termval = term.utf8ToString(); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } + } } return retArray; } @@ -621,8 +697,10 @@ class FieldCacheImpl implements FieldCache { String field = StringHelper.intern(entryKey.field); final int[] retArray = new int[reader.maxDoc()]; String[] mterms = new String[reader.maxDoc()+1]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); + + //System.out.println("FC: getStringIndex field=" + field); + Terms terms = MultiFields.getTerms(reader, field); + int t = 0; // current term number // an entry for documents that have no terms in this field @@ -631,24 +709,31 @@ class FieldCacheImpl implements FieldCache { // needs to change as well. mterms[t++] = null; - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - - // store term text - mterms[t] = term.text(); - - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = t; + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; } + // store term text + mterms[t] = term.utf8ToString(); + //System.out.println("FC: ord=" + t + " term=" + term.toBytesString()); + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + //System.out.println("FC: docID=" + docID); + retArray[docID] = t; + } t++; - } while (termEnum.next()); - } finally { - termDocs.close(); - termEnum.close(); + } } if (t == 0) { @@ -658,16 +743,17 @@ class FieldCacheImpl implements FieldCache { } else if (t < mterms.length) { // if there are less terms than documents, // trim off the dead array space - String[] terms = new String[t]; - System.arraycopy (mterms, 0, terms, 0, t); - mterms = terms; + String[] newTerms = new String[t]; + System.arraycopy (mterms, 0, newTerms, 0, t); + mterms = newTerms; } StringIndex value = new StringIndex (retArray, mterms); + //System.out.println("FC: done\n"); return value; } } - + private volatile PrintStream infoStream; public void setInfoStream(PrintStream stream) { diff --git a/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java b/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java index 067f881d39d..8003c81d874 100644 --- a/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java +++ b/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java @@ -19,8 +19,9 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.Bits; import org.apache.lucene.document.NumericField; // for javadocs /** @@ -119,9 +120,9 @@ public abstract class FieldCacheRangeFilter extends Filter { assert inclusiveLowerPoint > 0 && 
inclusiveUpperPoint > 0; - // for this DocIdSet, we never need to use TermDocs, + // for this DocIdSet, we can ignore deleted docs // because deleted docs have an order of 0 (null entry in StringIndex) - return new FieldCacheDocIdSet(reader, false) { + return new FieldCacheDocIdSet(reader, true) { @Override final boolean matchDoc(int doc) { return fcsi.order[doc] >= inclusiveLowerPoint && fcsi.order[doc] <= inclusiveUpperPoint; @@ -171,8 +172,8 @@ public abstract class FieldCacheRangeFilter extends Filter { return DocIdSet.EMPTY_DOCIDSET; final byte[] values = FieldCache.DEFAULT.getBytes(reader, field, (FieldCache.ByteParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { + // we only respect deleted docs if the range contains 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -222,8 +223,8 @@ public abstract class FieldCacheRangeFilter extends Filter { return DocIdSet.EMPTY_DOCIDSET; final short[] values = FieldCache.DEFAULT.getShorts(reader, field, (FieldCache.ShortParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -273,8 +274,8 @@ public abstract class FieldCacheRangeFilter extends Filter { return DocIdSet.EMPTY_DOCIDSET; final int[] values = FieldCache.DEFAULT.getInts(reader, field, (FieldCache.IntParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -324,8 +325,8 @@ public abstract class FieldCacheRangeFilter extends Filter { return DocIdSet.EMPTY_DOCIDSET; final long[] values = FieldCache.DEFAULT.getLongs(reader, field, (FieldCache.LongParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0L && inclusiveUpperPoint >= 0L)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0L && inclusiveUpperPoint >= 0L)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -379,8 +380,8 @@ public abstract class FieldCacheRangeFilter extends Filter { return DocIdSet.EMPTY_DOCIDSET; final float[] values = FieldCache.DEFAULT.getFloats(reader, field, (FieldCache.FloatParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0.0f && inclusiveUpperPoint >= 0.0f)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0.0f && 
inclusiveUpperPoint >= 0.0f)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -434,8 +435,8 @@ public abstract class FieldCacheRangeFilter extends Filter { return DocIdSet.EMPTY_DOCIDSET; final double[] values = FieldCache.DEFAULT.getDoubles(reader, field, (FieldCache.DoubleParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0.0 && inclusiveUpperPoint >= 0.0)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0.0 && inclusiveUpperPoint >= 0.0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -503,99 +504,81 @@ public abstract class FieldCacheRangeFilter extends Filter { static abstract class FieldCacheDocIdSet extends DocIdSet { private final IndexReader reader; - private boolean mayUseTermDocs; - - FieldCacheDocIdSet(IndexReader reader, boolean mayUseTermDocs) { + private boolean canIgnoreDeletedDocs; + + FieldCacheDocIdSet(IndexReader reader, boolean canIgnoreDeletedDocs) { this.reader = reader; - this.mayUseTermDocs = mayUseTermDocs; + this.canIgnoreDeletedDocs = canIgnoreDeletedDocs; } - - /** this method checks, if a doc is a hit, should throw AIOBE, when position invalid */ + + /** + * this method checks, if a doc is a hit, should throw AIOBE, when position + * invalid + */ abstract boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException; - - /** this DocIdSet is cacheable, if it works solely with FieldCache and no TermDocs */ + + /** + * this DocIdSet is cacheable, if it can ignore deletions + */ @Override public boolean isCacheable() { - return !(mayUseTermDocs && reader.hasDeletions()); + return canIgnoreDeletedDocs || !reader.hasDeletions(); } @Override public DocIdSetIterator iterator() throws IOException { // Synchronization needed because deleted docs BitVector // can change after call to hasDeletions until TermDocs creation. - // We only use an iterator with termDocs, when this was requested (e.g. range contains 0) + // We only use an iterator with termDocs, when this was requested (e.g. + // range contains 0) // and the index has deletions - final TermDocs termDocs; - synchronized(reader) { - termDocs = isCacheable() ? 
null : reader.termDocs(null); + + final Bits skipDocs; + synchronized (reader) { + if (isCacheable()) { + skipDocs = null; + } else { + skipDocs = MultiFields.getDeletedDocs(reader); + } } - if (termDocs != null) { - // a DocIdSetIterator using TermDocs to iterate valid docIds - return new DocIdSetIterator() { - private int doc = -1; - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { + final int maxDoc = reader.maxDoc(); + + // a DocIdSetIterator generating docIds by + // incrementing a variable & checking skipDocs - + return new DocIdSetIterator() { + private int doc = -1; + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() { + try { do { - if (!termDocs.next()) - return doc = NO_MORE_DOCS; - } while (!matchDoc(doc = termDocs.doc())); + doc++; + } while ((skipDocs != null && doc < maxDoc && skipDocs.get(doc)) + || !matchDoc(doc)); return doc; + } catch (ArrayIndexOutOfBoundsException e) { + return doc = NO_MORE_DOCS; } - - @Override - public int advance(int target) throws IOException { - if (!termDocs.skipTo(target)) - return doc = NO_MORE_DOCS; - while (!matchDoc(doc = termDocs.doc())) { - if (!termDocs.next()) - return doc = NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + try { + doc = target; + while (!matchDoc(doc)) { + doc++; } return doc; + } catch (ArrayIndexOutOfBoundsException e) { + return doc = NO_MORE_DOCS; } - }; - } else { - // a DocIdSetIterator generating docIds by incrementing a variable - - // this one can be used if there are no deletions are on the index - return new DocIdSetIterator() { - private int doc = -1; - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() { - try { - do { - doc++; - } while (!matchDoc(doc)); - return doc; - } catch (ArrayIndexOutOfBoundsException e) { - return doc = NO_MORE_DOCS; - } - } - - @Override - public int advance(int target) { - try { - doc = target; - while (!matchDoc(doc)) { - doc++; - } - return doc; - } catch (ArrayIndexOutOfBoundsException e) { - return doc = NO_MORE_DOCS; - } - } - }; - } + + } + }; } } diff --git a/lucene/src/java/org/apache/lucene/search/FilteredTermEnum.java b/lucene/src/java/org/apache/lucene/search/FilteredTermEnum.java index 5780fa17777..4933a903e88 100644 --- a/lucene/src/java/org/apache/lucene/search/FilteredTermEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FilteredTermEnum.java @@ -24,7 +24,11 @@ import org.apache.lucene.index.TermEnum; /** Abstract class for enumerating a subset of all terms.

    Term enumerations are always ordered by Term.compareTo(). Each term in - the enumeration is greater than all that precede it. */ + the enumeration is greater than all that precede it. + + @deprecated Switch to {@link FilteredTermsEnum} instead. +*/ +@Deprecated public abstract class FilteredTermEnum extends TermEnum { /** the current term */ protected Term currentTerm = null; @@ -37,7 +41,14 @@ public abstract class FilteredTermEnum extends TermEnum { /** Equality compare on the term */ protected abstract boolean termCompare(Term term); - /** Equality measure on the term */ + /** Equality measure on the term, it is in reality a boost + * factor and used like so in {@link MultiTermQuery}, + * so the name is wrong. + * @deprecated Use {@link MultiTermQuery.BoostAttribute} + * together with {@link FilteredTermsEnum}. For example + * see {@link FuzzyTermsEnum} + */ + @Deprecated public abstract float difference(); /** Indicates the end of the enumeration has been reached */ diff --git a/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java new file mode 100644 index 00000000000..737a839895d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java @@ -0,0 +1,233 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Bits; + +/** + * Abstract class for enumerating a subset of all terms. + * + *

    Term enumerations are always ordered by + * {@link #getComparator}. Each term in the enumeration is + * greater than all that precede it.

    + *

    Please note: Consumers of this enum cannot + * call {@code seek()}, it is forward only; it throws + * {@link UnsupportedOperationException} when a seeking method + * is called. + */ +public abstract class FilteredTermsEnum extends TermsEnum { + + private BytesRef initialSeekTerm = null; + private boolean doSeek = true; + private BytesRef actualTerm = null; + private boolean useTermsCache = false; + + private final TermsEnum tenum; + + /** Return value, if term should be accepted or the iteration should + * {@code END}. The {@code *_SEEK} values denote, that after handling the current term + * the enum should call {@link #nextSeekTerm} and step forward. + * @see #accept(BytesRef) + */ + protected static enum AcceptStatus {YES, YES_AND_SEEK, NO, NO_AND_SEEK, END}; + + /** Return if term is accepted, not accepted or the iteration should ended + * (and possibly seek). + */ + protected abstract AcceptStatus accept(BytesRef term) throws IOException; + + /** + * Creates a filtered {@link TermsEnum} for the given field name and reader. + */ + public FilteredTermsEnum(final IndexReader reader, final String field) throws IOException { + final Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + tenum = terms.iterator(); + } else { + tenum = null; + } + } + + /** + * Creates a filtered {@link TermsEnum} on a terms enum. + * @param tenum the terms enumeration to filter, if {@code null} this is the null iterator. + */ + public FilteredTermsEnum(final TermsEnum tenum) { + this.tenum = tenum; + } + + /** + * Use this method to set the initial {@link BytesRef} + * to seek before iterating. This is a convenience method for + * subclasses that do not override {@link #nextSeekTerm}. + * If the initial seek term is {@code null} (default), + * the enum is empty. + *

You can only use this method if you keep the default + * implementation of {@link #nextSeekTerm}. + */ + protected final void setInitialSeekTerm(BytesRef term) throws IOException { + this.initialSeekTerm = term; + } + + /** On the first call to {@link #next} or if {@link #accept} returns + * {@link AcceptStatus#YES_AND_SEEK} or {@link AcceptStatus#NO_AND_SEEK}, + * this method will be called to possibly seek the underlying TermsEnum + * to a new position. + * On the first call, {@code currentTerm} will be {@code null}; later + * calls will provide the term the underlying enum is positioned at. + * By default this method returns the initial seek term only once + * and then {@code null}, so no repositioning is ever done. + *

Override this method if you want a more sophisticated TermsEnum + * that repositions the iterator during enumeration. + * If this method always returns {@code null}, the enum is empty. + *
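As an illustration of the accept/seek protocol described above, a hedged sketch of a minimal subclass (an editor's example, not part of this patch; it assumes a BytesRef.startsWith method analogous to the endsWith used by AutomatonTermsEnum earlier in this patch): it seeks straight to a prefix via the default nextSeekTerm implementation and ends the enumeration once terms sort past that prefix.

    class PrefixOnlyTermsEnum extends FilteredTermsEnum {
      private final BytesRef prefix;

      PrefixOnlyTermsEnum(TermsEnum tenum, BytesRef prefix) throws IOException {
        super(tenum);
        this.prefix = prefix;
        setInitialSeekTerm(prefix);   // the default nextSeekTerm() hands this back exactly once
      }

      @Override
      protected AcceptStatus accept(BytesRef term) {
        // terms arrive in sorted order, so the first term without the prefix ends the enum
        return term.startsWith(prefix) ? AcceptStatus.YES : AcceptStatus.END;
      }
    }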

    Please note: This method should always provide a greater term + * than the last enumerated term, else the behaviour of this enum + * violates the contract for TermsEnums. + */ + protected BytesRef nextSeekTerm(final BytesRef currentTerm) throws IOException { + final BytesRef t = initialSeekTerm; + initialSeekTerm = null; + return t; + } + + /** Expert: enable or disable the terms cache when seeking. */ + protected final void setUseTermsCache(boolean useTermsCache) { + this.useTermsCache = useTermsCache; + } + + /** Expert: enable or disable the terms cache when seeking. */ + protected final boolean getUseTermsCache() { + return useTermsCache; + } + + /** + * Returns the related attributes, the returned {@link AttributeSource} + * is shared with the delegate {@code TermsEnum}. + */ + @Override + public AttributeSource attributes() { + /* if we have no tenum, we return a new attributes instance, + * to prevent NPE in subclasses that use attributes. + * in all other cases we share the attributes with our delegate. */ + return (tenum == null) ? super.attributes() : tenum.attributes(); + } + + @Override + public BytesRef term() throws IOException { + assert tenum != null; + return tenum.term(); + } + + @Override + public Comparator getComparator() throws IOException { + return (tenum == null) ? null : tenum.getComparator(); + } + + @Override + public int docFreq() { + assert tenum != null; + return tenum.docFreq(); + } + + /** This enum does not support seeking! + * @throws UnsupportedOperationException + */ + @Override + public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); + } + + /** This enum does not support seeking! + * @throws UnsupportedOperationException + */ + @Override + public SeekStatus seek(long ord) throws IOException { + throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); + } + + @Override + public long ord() throws IOException { + assert tenum != null; + return tenum.ord(); + } + + @Override + public DocsEnum docs(Bits bits, DocsEnum reuse) throws IOException { + assert tenum != null; + return tenum.docs(bits, reuse); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse) throws IOException { + assert tenum != null; + return tenum.docsAndPositions(bits, reuse); + } + + @Override + public BytesRef next() throws IOException { + if (tenum == null) + return null; + for (;;) { + // Seek or forward the iterator + if (doSeek) { + doSeek = false; + final BytesRef t = nextSeekTerm(actualTerm); + if (t == null || tenum.seek(t, useTermsCache) == SeekStatus.END) { + // no more terms to seek to or enum exhausted + return null; + } + actualTerm = tenum.term(); + } else { + actualTerm = tenum.next(); + if (actualTerm == null) { + // enum exhausted + return null; + } + } + + // check if term is accepted + switch (accept(actualTerm)) { + case YES_AND_SEEK: + doSeek = true; + // term accepted, but we need to seek so fall-through + case YES: + // term accepted + return actualTerm; + case NO_AND_SEEK: + // invalid term, seek next time + doSeek = true; + break; + case END: + // we are supposed to end the enum + return null; + } + } + } + +} diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java index 85d52350ea4..2bd9faf3e99 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java +++ 
b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -19,6 +19,7 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; @@ -68,6 +69,7 @@ public class FuzzyQuery extends MultiTermQuery { */ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength, int maxExpansions) { + super(term.field()); this.term = term; if (minimumSimilarity >= 1.0f) @@ -127,7 +129,7 @@ public class FuzzyQuery extends MultiTermQuery { return prefixLength; } - @Override + @Override @Deprecated protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { if (!termLongEnough) { // can only match if it's exact return new SingleTermEnum(reader, term); @@ -135,6 +137,14 @@ public class FuzzyQuery extends MultiTermQuery { return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength); } + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + if (!termLongEnough) { // can only match if it's exact + return new SingleTermsEnum(reader, term); + } + return new FuzzyTermsEnum(reader, getTerm(), minimumSimilarity, prefixLength); + } + /** * Returns the pattern term. */ diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java index 788e397edd4..c65256c6fa5 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java @@ -27,7 +27,10 @@ import org.apache.lucene.index.Term; * *

    Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. + * + * @deprecated Please use {@link FuzzyTermsEnum} instead. */ +@Deprecated public final class FuzzyTermEnum extends FilteredTermEnum { /* Allows us save time required to create a new array @@ -136,7 +139,8 @@ public final class FuzzyTermEnum extends FilteredTermEnum { return false; } - /** {@inheritDoc} */ + /** @deprecated Use {@link MultiTermQuery.BoostAttribute} together with {@link FuzzyTermsEnum} */ + @Deprecated @Override public final float difference() { return (similarity - minimumSimilarity) * scale_factor; diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java new file mode 100644 index 00000000000..8d4cbb028bf --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -0,0 +1,555 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.RunAutomaton; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +/** Subclass of TermsEnum for enumerating all terms that are similar + * to the specified filter term. + * + *

    Term enumerations are always ordered by + * {@link #getComparator}. Each term in the enumeration is + * greater than all that precede it.
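For illustration, a usage sketch of the new enum (editorial, not part of this patch; the reader, field, and term values are hypothetical — the per-term boost is exposed through the BoostAttribute described in MultiTermQuery further below):

  static void printFuzzyMatches(IndexReader reader) throws IOException {
    // enumerate terms in "body" within similarity 0.5 of "lucene", no required prefix
    FuzzyTermsEnum fuzzyEnum = new FuzzyTermsEnum(reader, new Term("body", "lucene"), 0.5f, 0);
    MultiTermQuery.BoostAttribute boostAtt =
        fuzzyEnum.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
    BytesRef matched;
    while ((matched = fuzzyEnum.next()) != null) {
      // the boost reflects how close the matched term is to the query term
      System.out.println(matched.utf8ToString() + " boost=" + boostAtt.getBoost());
    }
  }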

    + */ +public final class FuzzyTermsEnum extends TermsEnum { + private TermsEnum actualEnum; + private MultiTermQuery.BoostAttribute actualBoostAtt; + + private final MultiTermQuery.BoostAttribute boostAtt = + attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + + private float bottom = boostAtt.getMaxNonCompetitiveBoost(); + + private final float minSimilarity; + private final float scale_factor; + + private final int termLength; + + private int maxEdits; + private List automata; + private List runAutomata; + + private final IndexReader reader; + private final Term term; + private final int realPrefixLength; + + /** + * Constructor for enumeration of all terms from specified reader which share a prefix of + * length prefixLength with term and which have a fuzzy similarity > + * minSimilarity. + *

    + * After calling the constructor the enumeration is already pointing to the first + * valid term if such a term exists. + * + * @param reader Delivers terms. + * @param term Pattern term. + * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. + * @param prefixLength Length of required common prefix. Default value is 0. + * @throws IOException + */ + public FuzzyTermsEnum(IndexReader reader, Term term, + final float minSimilarity, final int prefixLength) throws IOException { + if (minSimilarity >= 1.0f) + throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1"); + else if (minSimilarity < 0.0f) + throw new IllegalArgumentException("minimumSimilarity cannot be less than 0"); + if(prefixLength < 0) + throw new IllegalArgumentException("prefixLength cannot be less than 0"); + this.reader = reader; + this.term = term; + //The prefix could be longer than the word. + //It's kind of silly though. It means we must match the entire word. + this.termLength = term.text().length(); + this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength; + this.minSimilarity = minSimilarity; + this.scale_factor = 1.0f / (1.0f - minSimilarity); + + // calculate the maximum k edits for this similarity + maxEdits = initialMaxDistance(minSimilarity, termLength); + + TermsEnum subEnum = getAutomatonEnum(maxEdits, null); + setEnum(subEnum != null ? subEnum : + new LinearFuzzyTermsEnum(reader, term, minSimilarity, prefixLength)); + } + + /** + * return an automata-based enum for matching up to editDistance from + * lastTerm, if possible + */ + private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) + throws IOException { + initAutomata(editDistance); + if (automata != null && editDistance < automata.size()) { + return new AutomatonFuzzyTermsEnum(automata.get(editDistance), term, + reader, minSimilarity, runAutomata.subList(0, editDistance + 1) + .toArray(new RunAutomaton[0]), lastTerm); + } else { + return null; + } + } + + /** initialize levenshtein DFAs up to maxDistance, if possible */ + private void initAutomata(int maxDistance) { + if (automata == null && + maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { + LevenshteinAutomata builder = + new LevenshteinAutomata(term.text().substring(realPrefixLength)); + automata = new ArrayList(maxDistance); + runAutomata = new ArrayList(maxDistance); + for (int i = 0; i <= maxDistance; i++) { + Automaton a = builder.toAutomaton(i); + // constant prefix + if (realPrefixLength > 0) { + Automaton prefix = BasicAutomata.makeString( + term.text().substring(0, realPrefixLength)); + a = BasicOperations.concatenate(prefix, a); + } + automata.add(a); + runAutomata.add(new RunAutomaton(a)); + } + } + } + + /** swap in a new actual enum to proxy to */ + private void setEnum(TermsEnum actualEnum) { + this.actualEnum = actualEnum; + this.actualBoostAtt = actualEnum.attributes().addAttribute( + MultiTermQuery.BoostAttribute.class); + } + + /** + * fired when the max non-competitive boost has changed. this is the hook to + * swap in a smarter actualEnum + */ + private void bottomChanged(float boostValue, BytesRef lastTerm) + throws IOException { + int oldMaxEdits = maxEdits; + + // as long as the max non-competitive boost is >= the max boost + // for some edit distance, keep dropping the max edit distance. 
+ while (maxEdits > 0 && boostValue >= calculateMaxBoost(maxEdits)) + maxEdits--; + + if (oldMaxEdits != maxEdits) { // the maximum n has changed + TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm); + if (newEnum != null) { + setEnum(newEnum); + } + } + // TODO, besides changing linear -> automaton, and swapping in a smaller + // automaton, we can also use this information to optimize the linear case + // itself: re-init maxDistances so the fast-fail happens for more terms due + // to the now stricter constraints. + } + + // for some raw min similarity and input term length, the maximum # of edits + private int initialMaxDistance(float minimumSimilarity, int termLen) { + return (int) ((1-minimumSimilarity) * termLen); + } + + // for some number of edits, the maximum possible scaled boost + private float calculateMaxBoost(int nEdits) { + final float similarity = 1.0f - ((float) nEdits / (float) (termLength)); + return (similarity - minSimilarity) * scale_factor; + } + + @Override + public BytesRef next() throws IOException { + BytesRef term = actualEnum.next(); + boostAtt.setBoost(actualBoostAtt.getBoost()); + + final float bottom = boostAtt.getMaxNonCompetitiveBoost(); + if (bottom != this.bottom) { + this.bottom = bottom; + // clone the term before potentially doing something with it + // this is a rare but wonderful occurrence anyway + bottomChanged(bottom, term == null ? null : (BytesRef) term.clone()); + } + + return term; + } + + // proxy all other enum calls to the actual enum + @Override + public int docFreq() { + return actualEnum.docFreq(); + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + return actualEnum.docs(skipDocs, reuse); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, + DocsAndPositionsEnum reuse) throws IOException { + return actualEnum.docsAndPositions(skipDocs, reuse); + } + + @Override + public Comparator getComparator() throws IOException { + return actualEnum.getComparator(); + } + + @Override + public long ord() throws IOException { + return actualEnum.ord(); + } + + @Override + public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { + return actualEnum.seek(text, useCache); + } + + @Override + public SeekStatus seek(long ord) throws IOException { + return actualEnum.seek(ord); + } + + @Override + public BytesRef term() throws IOException { + return actualEnum.term(); + } +} + +/** + * Implement fuzzy enumeration with automaton. + *

    + * This is the fastest method as opposed to LinearFuzzyTermsEnum: + * as enumeration is logarithmic to the number of terms (instead of linear) + * and comparison is linear to length of the term (rather than quadratic) + */ +final class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum { + private final RunAutomaton matchers[]; + // used for unicode conversion from BytesRef byte[] to char[] + private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + + private final float minimumSimilarity; + private final float scale_factor; + + private final int fullSearchTermLength; + private final BytesRef termRef; + + private final BytesRef lastTerm; + private final MultiTermQuery.BoostAttribute boostAtt = + attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + + public AutomatonFuzzyTermsEnum(Automaton automaton, Term queryTerm, + IndexReader reader, float minSimilarity, RunAutomaton matchers[], BytesRef lastTerm) throws IOException { + super(automaton, matchers[matchers.length - 1], queryTerm, reader, true); + this.minimumSimilarity = minSimilarity; + this.scale_factor = 1.0f / (1.0f - minimumSimilarity); + this.matchers = matchers; + this.lastTerm = lastTerm; + termRef = new BytesRef(queryTerm.text()); + fullSearchTermLength = queryTerm.text().length(); + } + + /** finds the smallest Lev(n) DFA that accepts the term. */ + @Override + protected AcceptStatus accept(BytesRef term) { + if (term.equals(termRef)) { // ed = 0 + boostAtt.setBoost(1.0F); + return AcceptStatus.YES_AND_SEEK; + } + + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + + // TODO: benchmark doing this backwards + for (int i = 1; i < matchers.length; i++) + if (matchers[i].run(utf16.result, 0, utf16.length)) { + final float similarity = 1.0f - ((float) i / (float) + (Math.min(utf16.length, fullSearchTermLength))); + if (similarity > minimumSimilarity) { + boostAtt.setBoost((float) ((similarity - minimumSimilarity) * scale_factor)); + return AcceptStatus.YES_AND_SEEK; + } else { + return AcceptStatus.NO_AND_SEEK; + } + } + + return AcceptStatus.NO_AND_SEEK; + } + + /** defers to superclass, except can start at an arbitrary location */ + @Override + protected BytesRef nextSeekTerm(BytesRef term) throws IOException { + if (term == null) + term = lastTerm; + return super.nextSeekTerm(term); + } +} + +/** + * Implement fuzzy enumeration with linear brute force. + */ +final class LinearFuzzyTermsEnum extends FilteredTermsEnum { + + /* This should be somewhere around the average long word. + * If it is longer, we waste time and space. If it is shorter, we waste a + * little bit of time growing the array as we encounter longer words. + */ + private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19; + + /* Allows us save time required to create a new array + * every time similarity is called. + */ + private int[][] d; + + private final char[] text; + private final int prefixLen; + + private final float minimumSimilarity; + private final float scale_factor; + private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX]; + + private final MultiTermQuery.BoostAttribute boostAtt = + attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + + /** + * Constructor for enumeration of all terms from specified reader which share a prefix of + * length prefixLength with term and which have a fuzzy similarity > + * minSimilarity. + *

    + * After calling the constructor the enumeration is already pointing to the first + * valid term if such a term exists. + * + * @param reader Delivers terms. + * @param term Pattern term. + * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. + * @param prefixLength Length of required common prefix. Default value is 0. + * @throws IOException + */ + public LinearFuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException { + super(reader, term.field()); + + if (minSimilarity >= 1.0f) + throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1"); + else if (minSimilarity < 0.0f) + throw new IllegalArgumentException("minimumSimilarity cannot be less than 0"); + if(prefixLength < 0) + throw new IllegalArgumentException("prefixLength cannot be less than 0"); + + this.minimumSimilarity = minSimilarity; + this.scale_factor = 1.0f / (1.0f - minimumSimilarity); + + //The prefix could be longer than the word. + //It's kind of silly though. It means we must match the entire word. + final int fullSearchTermLength = term.text().length(); + final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength; + + this.text = term.text().substring(realPrefixLength).toCharArray(); + final String prefix = term.text().substring(0, realPrefixLength); + prefixBytesRef = new BytesRef(prefix); + prefixLen = prefix.length(); + initializeMaxDistances(); + this.d = initDistanceArray(); + + setInitialSeekTerm(prefixBytesRef); + } + + private final BytesRef prefixBytesRef; + // used for unicode conversion from BytesRef byte[] to char[] + private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + + /** + * The termCompare method in FuzzyTermEnum uses Levenshtein distance to + * calculate the distance between the given term and the comparing term. + */ + @Override + protected final AcceptStatus accept(BytesRef term) { + if (term.startsWith(prefixBytesRef)) { + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + final float similarity = similarity(utf16.result, prefixLen, utf16.length - prefixLen); + if (similarity > minimumSimilarity) { + boostAtt.setBoost((float)((similarity - minimumSimilarity) * scale_factor)); + return AcceptStatus.YES; + } else return AcceptStatus.NO; + } else { + return AcceptStatus.END; + } + } + + /****************************** + * Compute Levenshtein distance + ******************************/ + + /** + * Finds and returns the smallest of three integers + */ + private static final int min(int a, int b, int c) { + final int t = (a < b) ? a : b; + return (t < c) ? t : c; + } + + private final int[][] initDistanceArray(){ + return new int[this.text.length + 1][TYPICAL_LONGEST_WORD_IN_INDEX]; + } + + /** + *

Similarity returns a number that is 1.0f or less (including negative numbers) + * based on how similar the Term is to a target term. It returns + * exactly 0.0f when + *

    +   *    editDistance > maximumEditDistance
    + * Otherwise it returns: + *
    +   *    1 - (editDistance / length)
+ * where length is the length of the shortest term (text or target), including the + * identical prefix, and editDistance is the Levenshtein distance for + * the two words.
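As a worked illustration of the formula above (values chosen only for this example): comparing text "lucene" against target "lucena" with no prefix gives editDistance = 1 and length = 6, so the method returns 1 - (1 / 6) ≈ 0.83, which passes a minimumSimilarity of 0.5.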

    + * + *

    Embedded within this algorithm is a fail-fast Levenshtein distance + * algorithm. The fail-fast algorithm differs from the standard Levenshtein + * distance algorithm in that it is aborted if it is discovered that the + * minimum distance between the words is greater than some threshold. + * + *

    To calculate the maximum distance threshold we use the following formula: + *

    +   *     (1 - minimumSimilarity) * length
+ * where length is the length of the shortest term plus the length of any prefix (the + * prefix itself is not part of the similarity comparison). This formula was derived by + * solving for the maximum value of distance for which the following statements return false: + *
    +   *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
    +   *   return (similarity > minimumSimilarity);
    + * where distance is the Levenshtein distance for the two words. + *
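As a worked example of this threshold (illustrative numbers only): with minimumSimilarity = 0.5, a 6-character query term and no prefix, the maximum distance is (int) ((1 - 0.5) * 6) = 3. A 7-character candidate (|7 - 6| = 1 <= 3) is still worth computing, while a 10-character candidate (|10 - 6| = 4 > 3) is rejected by the fail-fast check below without running the distance calculation at all.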

    + *

    Levenshtein distance (also known as edit distance) is a measure of similarity + * between two strings where the distance is measured as the number of character + * deletions, insertions or substitutions required to transform one string to + * the other string. + * @param target the target word or phrase + * @return the similarity, 0.0 or less indicates that it matches less than the required + * threshold and 1.0 indicates that the text and target are identical + */ + private final float similarity(final char[] target, int offset, int length) { + final int m = length; + final int n = text.length; + if (n == 0) { + //we don't have anything to compare. That means if we just add + //the letters for m we get the new word + return prefixLen == 0 ? 0.0f : 1.0f - ((float) m / prefixLen); + } + if (m == 0) { + return prefixLen == 0 ? 0.0f : 1.0f - ((float) n / prefixLen); + } + + final int maxDistance = getMaxDistance(m); + + if (maxDistance < Math.abs(m-n)) { + //just adding the characters of m to n or vice-versa results in + //too many edits + //for example "pre" length is 3 and "prefixes" length is 8. We can see that + //given this optimal circumstance, the edit distance cannot be less than 5. + //which is 8-3 or more precisely Math.abs(3-8). + //if our maximum edit distance is 4, then we can discard this word + //without looking at it. + return 0.0f; + } + + //let's make sure we have enough room in our array to do the distance calculations. + if (d[0].length <= m) { + growDistanceArray(m); + } + + // init matrix d + for (int i = 0; i <= n; i++) d[i][0] = i; + for (int j = 0; j <= m; j++) d[0][j] = j; + + // start computing edit distance + for (int i = 1; i <= n; i++) { + int bestPossibleEditDistance = m; + final char s_i = text[i - 1]; + for (int j = 1; j <= m; j++) { + if (s_i != target[offset+j-1]) { + d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1; + } + else { + d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]); + } + bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]); + } + + //After calculating row i, the best possible edit distance + //can be found by found by finding the smallest value in a given column. + //If the bestPossibleEditDistance is greater than the max distance, abort. + + if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater + //the closest the target can be to the text is just too far away. + //this target is leaving the party early. + return 0.0f; + } + } + + // this will return less than 0.0 when the edit distance is + // greater than the number of characters in the shorter word. + // but this was the formula that was previously used in FuzzyTermEnum, + // so it has not been changed (even though minimumSimilarity must be + // greater than 0.0) + return 1.0f - ((float)d[n][m] / (float) (prefixLen + Math.min(n, m))); + } + + /** + * Grow the second dimension of the array, so that we can calculate the + * Levenshtein difference. + */ + private void growDistanceArray(int m) { + for (int i = 0; i < d.length; i++) { + d[i] = new int[m+1]; + } + } + + /** + * The max Distance is the maximum Levenshtein distance for the text + * compared to some other value that results in score that is + * better than the minimum similarity. + * @param m the length of the "other value" + * @return the maximum levenshtein distance that we care about + */ + private final int getMaxDistance(int m) { + return (m < maxDistances.length) ? 
maxDistances[m] : calculateMaxDistance(m); + } + + private void initializeMaxDistances() { + for (int i = 0; i < maxDistances.length; i++) { + maxDistances[i] = calculateMaxDistance(i); + } + } + + private int calculateMaxDistance(int m) { + return (int) ((1-minimumSimilarity) * (Math.min(text.length, m) + prefixLen)); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java b/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java index 6e447afc0d1..bbddfa6a575 100644 --- a/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java @@ -19,8 +19,9 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.Bits; import java.util.Set; import java.io.IOException; @@ -45,16 +46,18 @@ public class MatchAllDocsQuery extends Query { } private class MatchAllScorer extends Scorer { - final TermDocs termDocs; final float score; final byte[] norms; private int doc = -1; + private final int maxDoc; + private final Bits delDocs; MatchAllScorer(IndexReader reader, Similarity similarity, Weight w, byte[] norms) throws IOException { super(similarity); - this.termDocs = reader.termDocs(null); + delDocs = MultiFields.getDeletedDocs(reader); score = w.getValue(); + maxDoc = reader.maxDoc(); this.norms = norms; } @@ -65,7 +68,14 @@ public class MatchAllDocsQuery extends Query { @Override public int nextDoc() throws IOException { - return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS; + doc++; + while(delDocs != null && doc < maxDoc && delDocs.get(doc)) { + doc++; + } + if (doc == maxDoc) { + doc = NO_MORE_DOCS; + } + return doc; } @Override @@ -75,7 +85,8 @@ public class MatchAllDocsQuery extends Query { @Override public int advance(int target) throws IOException { - return doc = termDocs.skipTo(target) ? 
termDocs.doc() : NO_MORE_DOCS; + doc = target-1; + return nextDoc(); } } diff --git a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java index fbb5cc1d029..98b00b3e122 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -21,10 +21,14 @@ import java.io.IOException; import java.util.*; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultipleTermPositions; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.Bits; /** * MultiPhraseQuery is a generalized version of PhraseQuery, with an added @@ -167,27 +171,31 @@ public class MultiPhraseQuery extends Query { if (termArrays.size() == 0) // optimize zero-term case return null; - TermPositions[] tps = new TermPositions[termArrays.size()]; - for (int i=0; i 1) - p = new MultipleTermPositions(reader, terms); - else - p = reader.termPositions(terms[0]); + final DocsAndPositionsEnum postingsEnum; + if (terms.length > 1) { + postingsEnum = new UnionDocsAndPositionsEnum(reader, terms); + } else { + postingsEnum = reader.termPositionsEnum(MultiFields.getDeletedDocs(reader), + terms[0].field(), + new BytesRef(terms[0].text())); + } - if (p == null) + if (postingsEnum == null) { return null; + } - tps[i] = p; + postings[i] = postingsEnum; } if (slop == 0) - return new ExactPhraseScorer(this, tps, getPositions(), similarity, + return new ExactPhraseScorer(this, postings, getPositions(), similarity, reader.norms(field)); else - return new SloppyPhraseScorer(this, tps, getPositions(), similarity, + return new SloppyPhraseScorer(this, postings, getPositions(), similarity, slop, reader.norms(field)); } @@ -370,3 +378,169 @@ public class MultiPhraseQuery extends Query { return true; } } + +/** + * Takes the logical union of multiple DocsEnum iterators. 
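For context, this union enum is what backs a MultiPhraseQuery position that carries several alternative terms; a query-side sketch (editorial, field and terms are illustrative):

  MultiPhraseQuery query = new MultiPhraseQuery();
  query.add(new Term("body", "quick"));
  // two alternatives at the second position; their postings are merged by
  // UnionDocsAndPositionsEnum when the scorer is created
  query.add(new Term[] { new Term("body", "fox"), new Term("body", "foxes") });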
+ */ + +// TODO: if ever we allow subclassing of the *PhraseScorer +class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { + + private static final class DocsQueue extends PriorityQueue { + DocsQueue(List docsEnums) throws IOException { + initialize(docsEnums.size()); + + Iterator i = docsEnums.iterator(); + while (i.hasNext()) { + DocsAndPositionsEnum postings = (DocsAndPositionsEnum) i.next(); + if (postings.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { + add(postings); + } + } + } + + final public DocsEnum peek() { + return top(); + } + + @Override + public final boolean lessThan(DocsAndPositionsEnum a, DocsAndPositionsEnum b) { + return a.docID() < b.docID(); + } + } + + private static final class IntQueue { + private int _arraySize = 16; + private int _index = 0; + private int _lastIndex = 0; + private int[] _array = new int[_arraySize]; + + final void add(int i) { + if (_lastIndex == _arraySize) + growArray(); + + _array[_lastIndex++] = i; + } + + final int next() { + return _array[_index++]; + } + + final void sort() { + Arrays.sort(_array, _index, _lastIndex); + } + + final void clear() { + _index = 0; + _lastIndex = 0; + } + + final int size() { + return (_lastIndex - _index); + } + + private void growArray() { + int[] newArray = new int[_arraySize * 2]; + System.arraycopy(_array, 0, newArray, 0, _arraySize); + _array = newArray; + _arraySize *= 2; + } + } + + private int _doc; + private int _freq; + private DocsQueue _queue; + private IntQueue _posList; + + public UnionDocsAndPositionsEnum(IndexReader indexReader, Term[] terms) throws IOException { + List docsEnums = new LinkedList(); + final Bits delDocs = MultiFields.getDeletedDocs(indexReader); + for (int i = 0; i < terms.length; i++) { + DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs, + terms[i].field(), + new BytesRef(terms[i].text())); + if (postings != null) { + docsEnums.add(postings); + } + } + + _queue = new DocsQueue(docsEnums); + _posList = new IntQueue(); + } + + @Override + public final int nextDoc() throws IOException { + if (_queue.size() == 0) { + return NO_MORE_DOCS; + } + + // TODO: move this init into positions(): if the search + // doesn't need the positions for this doc then don't + // waste CPU merging them: + _posList.clear(); + _doc = _queue.top().docID(); + + // merge sort all positions together + DocsAndPositionsEnum postings; + do { + postings = _queue.top(); + + final int freq = postings.freq(); + for (int i = 0; i < freq; i++) { + _posList.add(postings.nextPosition()); + } + + if (postings.nextDoc() != NO_MORE_DOCS) { + _queue.updateTop(); + } else { + _queue.pop(); + } + } while (_queue.size() > 0 && _queue.top().docID() == _doc); + + _posList.sort(); + _freq = _posList.size(); + + return _doc; + } + + @Override + public int nextPosition() { + return _posList.next(); + } + + @Override + public int getPayloadLength() { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef getPayload() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasPayload() { + throw new UnsupportedOperationException(); + } + + @Override + public final int advance(int target) throws IOException { + while (_queue.top() != null && target > _queue.top().docID()) { + DocsAndPositionsEnum postings = _queue.pop(); + if (postings.advance(target) != NO_MORE_DOCS) { + _queue.add(postings); + } + } + return nextDoc(); + } + + @Override + public final int freq() { + return _freq; + } + + @Override + public final int docID() { + return _doc; + } 
+} diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java index 7c4a5808e94..67a80da2489 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -24,17 +24,24 @@ import java.util.PriorityQueue; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; - +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.Terms; import org.apache.lucene.queryParser.QueryParser; // for javadoc +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.VirtualMethod; /** * An abstract {@link Query} that matches documents * containing a subset of terms provided by a {@link - * FilteredTermEnum} enumeration. + * FilteredTermsEnum} enumeration. * *

    This query cannot be used directly; you must subclass - * it and define {@link #getEnum} to provide a {@link - * FilteredTermEnum} that iterates through the terms to be + * it and define {@link #getTermsEnum} to provide a {@link + * FilteredTermsEnum} that iterates through the terms to be * matched. * *
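For illustration, a minimal subclass under the new API could look like the following sketch (editorial, not part of the patch; the class name is hypothetical, and it reuses the PrefixTermsEnum introduced later in this patch):

  import java.io.IOException;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermsEnum;
  import org.apache.lucene.search.MultiTermQuery;
  import org.apache.lucene.search.PrefixTermsEnum;

  public class SimplePrefixQuery extends MultiTermQuery {
    private final Term prefix;

    public SimplePrefixQuery(Term prefix) {
      super(prefix.field());   // the new ctor pins the query to a single field
      this.prefix = prefix;
    }

    @Override
    protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
      // PrefixTermsEnum is already positioned on the first matching term,
      // as getTermsEnum() requires
      return new PrefixTermsEnum(reader, prefix);
    }

    @Override
    public String toString(String field) {
      return (getField().equals(field) ? "" : getField() + ":") + prefix.text() + "*";
    }
  }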

    NOTE: if {@link #setRewriteMethod} is either @@ -61,8 +68,90 @@ import org.apache.lucene.queryParser.QueryParser; // for javadoc * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} by default. */ public abstract class MultiTermQuery extends Query { + protected final String field; protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; transient int numberOfTerms = 0; + + /** @deprecated remove when getEnum is removed */ + private static final VirtualMethod getEnumMethod = + new VirtualMethod(MultiTermQuery.class, "getEnum", IndexReader.class); + /** @deprecated remove when getEnum is removed */ + private static final VirtualMethod getTermsEnumMethod = + new VirtualMethod(MultiTermQuery.class, "getTermsEnum", IndexReader.class); + /** @deprecated remove when getEnum is removed */ + final boolean hasNewAPI = + VirtualMethod.compareImplementationDistance(getClass(), + getTermsEnumMethod, getEnumMethod) >= 0; // its ok for both to be overridden + + /** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum} + * and update the boost on each returned term. This enables to control the boost factor + * for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or + * {@link TopTermsBooleanQueryRewrite} mode. + * {@link FuzzyQuery} is using this to take the edit distance into account. + */ + public static interface BoostAttribute extends Attribute { + /** Sets the boost in this attribute */ + public void setBoost(float boost); + /** Retrieves the boost, default is {@code 1.0f}. */ + public float getBoost(); + /** Sets the maximum boost for terms that would never get + * into the priority queue of {@link MultiTermQuery.TopTermsBooleanQueryRewrite}. + * This value is not changed by {@link AttributeImpl#clear} + * and not used in {@code equals()} and {@code hashCode()}. + * Do not change the value in the {@link TermsEnum}! + */ + public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost); + /** Retrieves the maximum boost that is not competitive, + * default is megative infinity. You can use this boost value + * as a hint when writing the {@link TermsEnum}. + */ + public float getMaxNonCompetitiveBoost(); + } + + /** Implementation class for {@link BoostAttribute}. */ + public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute { + private float boost = 1.0f, maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; + + public void setBoost(float boost) { + this.boost = boost; + } + + public float getBoost() { + return boost; + } + + public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { + this.maxNonCompetitiveBoost = maxNonCompetitiveBoost; + } + + public float getMaxNonCompetitiveBoost() { + return maxNonCompetitiveBoost; + } + + @Override + public void clear() { + boost = 1.0f; + } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other instanceof BoostAttributeImpl) + return ((BoostAttributeImpl) other).boost == boost; + return false; + } + + @Override + public int hashCode() { + return Float.floatToIntBits(boost); + } + + @Override + public void copyTo(AttributeImpl target) { + ((BoostAttribute) target).setBoost(boost); + } + } /** Abstract class that defines how the query is rewritten. 
*/ public static abstract class RewriteMethod implements Serializable { @@ -100,30 +189,79 @@ public abstract class MultiTermQuery extends Query { private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - final FilteredTermEnum enumerator = query.getEnum(reader); - int count = 0; - try { - do { - Term t = enumerator.term(); - if (t != null) { - if (collector.collect(t, enumerator.difference())) { - count++; - } else { - break; - } + + if (query.hasNewAPI) { + + if (query.field == null) { + throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); + } + + final Fields fields = MultiFields.getFields(reader); + if (fields == null) { + // reader has no fields + return 0; + } + + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + return 0; + } + + final TermsEnum termsEnum = query.getTermsEnum(reader); + assert termsEnum != null; + + if (termsEnum == TermsEnum.EMPTY) + return 0; + final BoostAttribute boostAtt = + termsEnum.attributes().addAttribute(BoostAttribute.class); + collector.boostAtt = boostAtt; + int count = 0; + BytesRef term; + final Term placeholderTerm = new Term(query.field); + while ((term = termsEnum.next()) != null) { + if (collector.collect(placeholderTerm.createTerm(term.utf8ToString()), boostAtt.getBoost())) { + count++; + } else { + break; } - } while (enumerator.next()); - } finally { - enumerator.close(); + } + collector.boostAtt = null; + return count; + } else { + // deprecated case + final FilteredTermEnum enumerator = query.getEnum(reader); + int count = 0; + try { + do { + Term t = enumerator.term(); + if (t != null) { + if (collector.collect(t, enumerator.difference())) { + count++; + } else { + break; + } + } + } while (enumerator.next()); + } finally { + enumerator.close(); + } + return count; } - return count; } - protected interface TermCollector { + protected static abstract class TermCollector { + /** this field is only set if a boostAttribute is used (e.g. {@link FuzzyTermsEnum}) */ + private BoostAttribute boostAtt = null; + /** return false to stop collecting */ - boolean collect(Term t, float boost) throws IOException; + public abstract boolean collect(Term t, float boost) throws IOException; + + /** set the minimum boost as a hint for the term producer */ + protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { + if (boostAtt != null) boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); + } } - } private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite { @@ -207,6 +345,7 @@ public abstract class MultiTermQuery extends Query { stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); + setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); return true; } @@ -338,6 +477,7 @@ public abstract class MultiTermQuery extends Query { public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { Query result = super.rewrite(reader, query); assert result instanceof BooleanQuery; + // TODO: if empty boolean query return NullQuery? 
if (!((BooleanQuery) result).clauses().isEmpty()) { // strip the scores off result = new ConstantScoreQuery(new QueryWrapperFilter(result)); @@ -448,7 +588,7 @@ public abstract class MultiTermQuery extends Query { } } - private static final class CutOffTermCollector implements TermCollector { + private static final class CutOffTermCollector extends TermCollector { CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) { this.reader = reader; this.docCountCutoff = docCountCutoff; @@ -465,6 +605,7 @@ public abstract class MultiTermQuery extends Query { // should not be costly, because 1) the // query/filter will load the TermInfo when it // runs, and 2) the terms dict has a cache: + // @deprecated: in 4.0 use BytesRef for collectTerms() docVisitCount += reader.docFreq(t); return true; } @@ -538,12 +679,44 @@ public abstract class MultiTermQuery extends Query { * Constructs a query matching terms that cannot be represented with a single * Term. */ - public MultiTermQuery() { + public MultiTermQuery(final String field) { + this.field = field; } - /** Construct the enumeration to be used, expanding the pattern term. */ - protected abstract FilteredTermEnum getEnum(IndexReader reader) - throws IOException; + /** + * Constructs a query matching terms that cannot be represented with a single + * Term. + * @deprecated Use {@link #MultiTermQuery(String)}, as the flex branch can + * only work on one field per terms enum. If you override + * {@link #getTermsEnum(IndexReader)}, you cannot use this ctor. + */ + @Deprecated + public MultiTermQuery() { + this(null); + } + + /** Returns the field name for this query */ + public final String getField() { return field; } + + /** Construct the enumeration to be used, expanding the + * pattern term. + * @deprecated Please override {@link #getTermsEnum} instead */ + @Deprecated + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Construct the enumeration to be used, expanding the + * pattern term. This method should only be called if + * the field exists (ie, implementations can assume the + * field does exist). This method should not return null + * (should instead return {@link TermsEnum#EMPTY} if no + * terms match). The TermsEnum must already be + * positioned to the first matching term. */ + // TODO 4.0: make this method abstract + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + throw new UnsupportedOperationException(); + } /** * Expert: Return the number of unique terms visited during execution of the query. @@ -602,8 +775,8 @@ public abstract class MultiTermQuery extends Query { final int prime = 31; int result = 1; result = prime * result + Float.floatToIntBits(getBoost()); - result = prime * result; - result += rewriteMethod.hashCode(); + result = prime * result + rewriteMethod.hashCode(); + if (field != null) result = prime * result + field.hashCode(); return result; } @@ -621,7 +794,7 @@ public abstract class MultiTermQuery extends Query { if (!rewriteMethod.equals(other.rewriteMethod)) { return false; } - return true; + return (other.field == null ? 
field == null : other.field.equals(field)); } } diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java b/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java index bbf80f5df4c..e24f097d8f8 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java @@ -21,9 +21,15 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Bits; /** * A wrapper for {@link MultiTermQuery}, that exposes its @@ -70,6 +76,9 @@ public class MultiTermQueryWrapperFilter extends Filte public final int hashCode() { return query.hashCode(); } + + /** Returns the field name for this query */ + public final String getField() { return query.getField(); } /** * Expert: Return the number of unique terms visited during execution of the filter. @@ -95,49 +104,101 @@ public class MultiTermQueryWrapperFilter extends Filte } /** - * Returns a DocIdSet with documents that should be - * permitted in search results. + * Returns a DocIdSet with documents that should be permitted in search + * results. */ @Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException { - final TermEnum enumerator = query.getEnum(reader); - try { - // if current term in enum is null, the enum is empty -> shortcut - if (enumerator.term() == null) + if (query.hasNewAPI) { + if (query.field == null) { + throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); + } + + final Fields fields = MultiFields.getFields(reader); + if (fields == null) { + // reader has no fields return DocIdSet.EMPTY_DOCIDSET; - // else fill into a OpenBitSet - final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); - final int[] docs = new int[32]; - final int[] freqs = new int[32]; - TermDocs termDocs = reader.termDocs(); - try { + } + + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + return DocIdSet.EMPTY_DOCIDSET; + } + + final TermsEnum termsEnum = query.getTermsEnum(reader); + assert termsEnum != null; + if (termsEnum.next() != null) { + // fill into a OpenBitSet + final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); int termCount = 0; + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docsEnum = null; do { - Term term = enumerator.term(); - if (term == null) - break; termCount++; - termDocs.seek(term); + // System.out.println(" iter termCount=" + termCount + " term=" + + // enumerator.term().toBytesString()); + docsEnum = termsEnum.docs(delDocs, docsEnum); + final DocsEnum.BulkReadResult result = docsEnum.getBulkResult(); while (true) { - final int count = termDocs.read(docs, freqs); + final int count = docsEnum.read(); if (count != 0) { - for(int i=0;i=1"); - this.field = StringHelper.intern(field); this.precisionStep = precisionStep; this.valSize = valSize; this.min = min; @@ -299,14 +299,14 @@ public final class NumericRangeQuery extends MultiTermQuery { ) { return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 
32, min, max, minInclusive, maxInclusive); } - - @Override - protected FilteredTermEnum getEnum(final IndexReader reader) throws IOException { - return new NumericRangeTermEnum(reader); - } - /** Returns the field name for this query */ - public String getField() { return field; } + @Override @SuppressWarnings("unchecked") + protected TermsEnum getTermsEnum(final IndexReader reader) throws IOException { + // very strange: java.lang.Number itsself is not Comparable, but all subclasses used here are + return (min != null && max != null && ((Comparable) min).compareTo(max) > 0) ? + TermsEnum.EMPTY : + new NumericRangeTermsEnum(reader); + } /** Returns true if the lower endpoint is inclusive */ public boolean includesMin() { return minInclusive; } @@ -323,7 +323,7 @@ public final class NumericRangeQuery extends MultiTermQuery { @Override public String toString(final String field) { final StringBuilder sb = new StringBuilder(); - if (!this.field.equals(field)) sb.append(this.field).append(':'); + if (!getField().equals(field)) sb.append(getField()).append(':'); return sb.append(minInclusive ? '[' : '{') .append((min == null) ? "*" : min.toString()) .append(" TO ") @@ -341,7 +341,6 @@ public final class NumericRangeQuery extends MultiTermQuery { if (o instanceof NumericRangeQuery) { final NumericRangeQuery q=(NumericRangeQuery)o; return ( - field==q.field && (q.min == null ? min == null : q.min.equals(min)) && (q.max == null ? max == null : q.max.equals(max)) && minInclusive == q.minInclusive && @@ -355,29 +354,22 @@ public final class NumericRangeQuery extends MultiTermQuery { @Override public final int hashCode() { int hash = super.hashCode(); - hash += field.hashCode()^0x4565fd66 + precisionStep^0x64365465; + hash += precisionStep^0x64365465; if (min != null) hash += min.hashCode()^0x14fa55fb; if (max != null) hash += max.hashCode()^0x733fa5fe; return hash + (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+ (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe); } - - // field must be interned after reading from stream - private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException { - in.defaultReadObject(); - field = StringHelper.intern(field); - } // members (package private, to be also fast accessible by NumericRangeTermEnum) - String field; final int precisionStep, valSize; final T min, max; final boolean minInclusive,maxInclusive; /** - * Subclass of FilteredTermEnum for enumerating all terms that match the - * sub-ranges for trie range queries. + * Subclass of FilteredTermsEnum for enumerating all terms that match the + * sub-ranges for trie range queries, using flex API. *
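For context, the range enum below is driven by queries built through the factory methods above, e.g. (editorial sketch; field name and bounds are illustrative):

  // all documents whose int-valued "price" field lies in [10, 20]
  Query priceRange = NumericRangeQuery.newIntRange("price", 10, 20, true, true);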

    * WARNING: This term enumeration is not guaranteed to be always ordered by * {@link Term#compareTo}. @@ -385,16 +377,15 @@ public final class NumericRangeQuery extends MultiTermQuery { * {@link NumericUtils#splitIntRange} generates the sub-ranges. For * {@link MultiTermQuery} ordering is not relevant. */ - private final class NumericRangeTermEnum extends FilteredTermEnum { + private final class NumericRangeTermsEnum extends FilteredTermsEnum { - private final IndexReader reader; - private final LinkedList rangeBounds = new LinkedList(); - private final Term termTemplate = new Term(field); - private String currentUpperBound = null; + private BytesRef currentLowerBound, currentUpperBound; - NumericRangeTermEnum(final IndexReader reader) throws IOException { - this.reader = reader; - + private final LinkedList rangeBounds = new LinkedList(); + private final Comparator termComp; + + NumericRangeTermsEnum(final IndexReader reader) throws IOException { + super(reader, getField()); switch (valSize) { case 64: { // lower @@ -423,7 +414,7 @@ public final class NumericRangeQuery extends MultiTermQuery { NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() { @Override - public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + public final void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { rangeBounds.add(minPrefixCoded); rangeBounds.add(maxPrefixCoded); } @@ -458,7 +449,7 @@ public final class NumericRangeQuery extends MultiTermQuery { NumericUtils.splitIntRange(new NumericUtils.IntRangeBuilder() { @Override - public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + public final void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { rangeBounds.add(minPrefixCoded); rangeBounds.add(maxPrefixCoded); } @@ -470,85 +461,32 @@ public final class NumericRangeQuery extends MultiTermQuery { // should never happen throw new IllegalArgumentException("valSize must be 32 or 64"); } - - // seek to first term - next(); - } - @Override - public float difference() { - return 1.0f; + termComp = getComparator(); } - /** this is a dummy, it is not used by this class. */ @Override - protected boolean endEnum() { - throw new UnsupportedOperationException("not implemented"); - } - - /** this is a dummy, it is not used by this class. */ - @Override - protected void setEnum(TermEnum tenum) { - throw new UnsupportedOperationException("not implemented"); - } - - /** - * Compares if current upper bound is reached. - * In contrast to {@link FilteredTermEnum}, a return value - * of false ends iterating the current enum - * and forwards to the next sub-range. - */ - @Override - protected boolean termCompare(Term term) { - return (term.field() == field && term.text().compareTo(currentUpperBound) <= 0); - } - - /** Increments the enumeration to the next element. True if one exists. 
*/ - @Override - public boolean next() throws IOException { - // if a current term exists, the actual enum is initialized: - // try change to next term, if no such term exists, fall-through - if (currentTerm != null) { - assert actualEnum != null; - if (actualEnum.next()) { - currentTerm = actualEnum.term(); - if (termCompare(currentTerm)) - return true; - } - } - - // if all above fails, we go forward to the next enum, - // if one is available - currentTerm = null; - while (rangeBounds.size() >= 2) { + protected final BytesRef nextSeekTerm(BytesRef term) throws IOException { + if (rangeBounds.size() >= 2) { assert rangeBounds.size() % 2 == 0; - // close the current enum and read next bounds - if (actualEnum != null) { - actualEnum.close(); - actualEnum = null; - } - final String lowerBound = rangeBounds.removeFirst(); + + this.currentLowerBound = rangeBounds.removeFirst(); + assert currentUpperBound == null || termComp.compare(currentUpperBound, currentLowerBound) <= 0 : + "The current upper bound must be <= the new lower bound"; + this.currentUpperBound = rangeBounds.removeFirst(); - // create a new enum - actualEnum = reader.terms(termTemplate.createTerm(lowerBound)); - currentTerm = actualEnum.term(); - if (currentTerm != null && termCompare(currentTerm)) - return true; - // clear the current term for next iteration - currentTerm = null; + return currentLowerBound; } // no more sub-range enums available - assert rangeBounds.size() == 0 && currentTerm == null; - return false; + assert rangeBounds.size() == 0; + return null; } - - /** Closes the enumeration to further activity, freeing resources. */ + @Override - public void close() throws IOException { - rangeBounds.clear(); - currentUpperBound = null; - super.close(); + protected AcceptStatus accept(BytesRef term) { + return (currentUpperBound != null && termComp.compare(term, currentUpperBound) <= 0) ? + AcceptStatus.YES : AcceptStatus.NO_AND_SEEK; } } diff --git a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java index 0b0af0af8af..303cbd166b1 100644 --- a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java +++ b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java @@ -28,40 +28,33 @@ final class PhrasePositions { int position; // position in doc int count; // remaining pos in this doc int offset; // position in phrase - TermPositions tp; // stream of positions - PhrasePositions next; // used to make lists + final DocsAndPositionsEnum postings; // stream of docs & positions + PhrasePositions next; // used to make lists boolean repeats; // there's other pp for same term (e.g. 
query="1st word 2nd word"~1) - PhrasePositions(TermPositions t, int o) { - tp = t; + PhrasePositions(DocsAndPositionsEnum postings, int o) { + this.postings = postings; offset = o; } final boolean next() throws IOException { // increments to next doc - if (!tp.next()) { - tp.close(); // close stream - doc = Integer.MAX_VALUE; // sentinel value + doc = postings.nextDoc(); + if (doc == postings.NO_MORE_DOCS) { return false; } - doc = tp.doc(); - position = 0; return true; } final boolean skipTo(int target) throws IOException { - if (!tp.skipTo(target)) { - tp.close(); // close stream - doc = Integer.MAX_VALUE; // sentinel value + doc = postings.advance(target); + if (doc == postings.NO_MORE_DOCS) { return false; } - doc = tp.doc(); - position = 0; return true; } - final void firstPosition() throws IOException { - count = tp.freq(); // read first pos + count = postings.freq(); // read first pos nextPosition(); } @@ -73,7 +66,7 @@ final class PhrasePositions { */ final boolean nextPosition() throws IOException { if (count-- > 0) { // read subsequent pos's - position = tp.nextPosition() - offset; + position = postings.nextPosition() - offset; return true; } else return false; diff --git a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java index 08a01c9e1ce..6d7229760bd 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java @@ -22,10 +22,13 @@ import java.util.Set; import java.util.ArrayList; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.Bits; /** A Query that matches documents containing a particular sequence of terms. * A PhraseQuery is built by QueryParser for input like "new york". @@ -150,20 +153,35 @@ public class PhraseQuery extends Query { if (terms.size() == 0) // optimize zero-term case return null; - TermPositions[] tps = new TermPositions[terms.size()]; + DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[terms.size()]; + final Bits delDocs = MultiFields.getDeletedDocs(reader); for (int i = 0; i < terms.size(); i++) { - TermPositions p = reader.termPositions(terms.get(i)); - if (p == null) - return null; - tps[i] = p; + final Term t = terms.get(i); + final BytesRef text = new BytesRef(t.text()); + DocsAndPositionsEnum postingsEnum = MultiFields.getTermPositionsEnum(reader, + delDocs, + t.field(), + text); + // PhraseQuery on a field that did not index + // positions. 
+ if (postingsEnum == null) { + if (MultiFields.getTermDocsEnum(reader, delDocs, t.field(), text) != null) { + // term does exist, but has no positions + throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); + } else { + // term does not exist + return null; + } + } + postings[i] = postingsEnum; } if (slop == 0) // optimize exact case - return new ExactPhraseScorer(this, tps, getPositions(), similarity, + return new ExactPhraseScorer(this, postings, getPositions(), similarity, reader.norms(field)); else return - new SloppyPhraseScorer(this, tps, getPositions(), similarity, slop, + new SloppyPhraseScorer(this, postings, getPositions(), similarity, slop, reader.norms(field)); } diff --git a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java index dab6d896302..ab15ae926c3 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java @@ -19,7 +19,7 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsAndPositionsEnum; /** Expert: Scoring functionality for phrase queries. *
    A document is considered matching if it contains the phrase-query terms @@ -43,7 +43,7 @@ abstract class PhraseScorer extends Scorer { private float freq; //phrase frequency in current doc as computed by phraseFreq(). - PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, + PhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets, Similarity similarity, byte[] norms) { super(similarity); this.norms = norms; @@ -55,8 +55,8 @@ abstract class PhraseScorer extends Scorer { // reflects the phrase offset: pp.pos = tp.pos - offset. // this allows to easily identify a matching (exact) phrase // when all PhrasePositions have exactly the same position. - for (int i = 0; i < tps.length; i++) { - PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]); + for (int i = 0; i < postings.length; i++) { + PhrasePositions pp = new PhrasePositions(postings[i], offsets[i]); if (last != null) { // add next to end of list last.next = pp; } else { @@ -65,7 +65,7 @@ abstract class PhraseScorer extends Scorer { last = pp; } - pq = new PhraseQueue(tps.length); // construct empty pq + pq = new PhraseQueue(postings.length); // construct empty pq first.doc = -1; } diff --git a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java index 0b671770b69..9ef09138662 100644 --- a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java @@ -20,7 +20,10 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing terms with a specified prefix. A PrefixQuery @@ -34,23 +37,34 @@ public class PrefixQuery extends MultiTermQuery { /** Constructs a query for terms starting with prefix. */ public PrefixQuery(Term prefix) { + super(prefix.field()); this.prefix = prefix; } /** Returns the prefix of this query. */ public Term getPrefix() { return prefix; } - @Override + @Override @Deprecated protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new PrefixTermEnum(reader, prefix); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + if (prefix.text().length() == 0) { + // no prefix -- match all terms for this field: + final Terms terms = MultiFields.getTerms(reader, getField()); + return (terms != null) ? terms.iterator() : TermsEnum.EMPTY; + } + return new PrefixTermsEnum(reader, prefix); + } /** Prints a user-readable version of this query. */ @Override public String toString(String field) { StringBuilder buffer = new StringBuilder(); - if (!prefix.field().equals(field)) { - buffer.append(prefix.field()); + if (!getField().equals(field)) { + buffer.append(getField()); buffer.append(":"); } buffer.append(prefix.text()); diff --git a/lucene/src/java/org/apache/lucene/search/PrefixTermEnum.java b/lucene/src/java/org/apache/lucene/search/PrefixTermEnum.java index 6ba6208c4fc..4aa28003935 100644 --- a/lucene/src/java/org/apache/lucene/search/PrefixTermEnum.java +++ b/lucene/src/java/org/apache/lucene/search/PrefixTermEnum.java @@ -29,7 +29,9 @@ import org.apache.lucene.index.Term; * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. 
* + * @deprecated Use {@link PrefixTermsEnum} instead. */ +@Deprecated public class PrefixTermEnum extends FilteredTermEnum { private final Term prefix; diff --git a/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java new file mode 100644 index 00000000000..650dbd03edd --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java @@ -0,0 +1,50 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; + +/** + * Subclass of FilteredTermEnum for enumerating all terms that match the + * specified prefix filter term. + *

+ * Term enumerations are always ordered by
+ * {@link #getComparator}. Each term in the enumeration is
+ * greater than all that precede it.
    + */ +public class PrefixTermsEnum extends FilteredTermsEnum { + + private final BytesRef prefixRef; + + public PrefixTermsEnum(IndexReader reader, Term prefix) throws IOException { + super(reader, prefix.field()); + setInitialSeekTerm(prefixRef = new BytesRef(prefix.text())); + } + + @Override + protected AcceptStatus accept(BytesRef term) { + if (term.startsWith(prefixRef)) { + return AcceptStatus.YES; + } else { + return AcceptStatus.END; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/src/java/org/apache/lucene/search/RegexpQuery.java new file mode 100644 index 00000000000..19dafa328d7 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/RegexpQuery.java @@ -0,0 +1,106 @@ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.index.Term; + +import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonProvider; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A fast regular expression query based on the + * {@link org.apache.lucene.util.automaton} package. + *
+ * <ul>
+ * <li>Comparisons are fast
+ * <li>The term dictionary is enumerated in an intelligent way, to avoid
+ * comparisons. See {@link AutomatonQuery} for more details.
+ * </ul>
+ * <p>
    + * The supported syntax is documented in the {@link RegExp} class. + * Note this might be different than other regular expression implementations. + * For some alternatives with different syntax, look under contrib/regex + *
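As a usage sketch (field name, pattern, and the surrounding IndexSearcher are illustrative; RegExp.ALL is the default flag set used by the one-argument constructor below):

// Hedged usage sketch, not part of the patch.
RegexpQuery query = new RegexpQuery(new Term("body", "lu.*ne"));
// equivalent to: new RegexpQuery(new Term("body", "lu.*ne"), RegExp.ALL)
TopDocs hits = searcher.search(query, 10);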

    + *

    + * Note this query can be slow, as it needs to iterate over many terms. In order + * to prevent extremely slow RegexpQueries, a Regexp term should not start with + * the expression .* + * + * @see RegExp + * @lucene.experimental + */ +public class RegexpQuery extends AutomatonQuery { + /** + * A provider that provides no named automata + */ + private static AutomatonProvider defaultProvider = new AutomatonProvider() { + public Automaton getAutomaton(String name) throws IOException { + return null; + } + }; + + /** + * Constructs a query for terms matching term. + *

+ * By default, all regular expression features are enabled.
+ *
    + * + * @param term regular expression. + */ + public RegexpQuery(Term term) { + this(term, RegExp.ALL); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp features from {@link RegExp} + */ + public RegexpQuery(Term term, int flags) { + this(term, flags, defaultProvider); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp features from {@link RegExp} + * @param provider custom AutomatonProvider for named automata + */ + public RegexpQuery(Term term, int flags, AutomatonProvider provider) { + super(term, new RegExp(term.text(), flags).toAutomaton(provider)); + } + + /** Prints a user-readable version of this query. */ + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + if (!term.field().equals(field)) { + buffer.append(term.field()); + buffer.append(":"); + } + buffer.append(term.text()); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/Similarity.java b/lucene/src/java/org/apache/lucene/search/Similarity.java index c2705d84e59..0d414d63e67 100644 --- a/lucene/src/java/org/apache/lucene/search/Similarity.java +++ b/lucene/src/java/org/apache/lucene/search/Similarity.java @@ -857,6 +857,7 @@ public abstract class Similarity implements Serializable { * @return An implementation dependent float to be used as a scoring factor * */ + // TODO: maybe switch this API to BytesRef? public float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length) { return 1; diff --git a/lucene/src/java/org/apache/lucene/search/SingleTermEnum.java b/lucene/src/java/org/apache/lucene/search/SingleTermEnum.java index 441b72b55fc..332381531bd 100644 --- a/lucene/src/java/org/apache/lucene/search/SingleTermEnum.java +++ b/lucene/src/java/org/apache/lucene/search/SingleTermEnum.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.Term; * but want to preserve MultiTermQuery semantics such as * {@link MultiTermQuery#rewriteMethod}. */ +@Deprecated public class SingleTermEnum extends FilteredTermEnum { private Term singleTerm; private boolean endEnum = false; diff --git a/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java new file mode 100644 index 00000000000..1e5acdae119 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java @@ -0,0 +1,53 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; + +/** + * Subclass of FilteredTermsEnum for enumerating a single term. + *

+ * This can be used by {@link MultiTermQuery}s that need only visit one term,
+ * but want to preserve MultiTermQuery semantics such as
+ * {@link MultiTermQuery#rewriteMethod}.
+ */
+public final class SingleTermsEnum extends FilteredTermsEnum {
+  private final BytesRef singleRef;
+
+  /**
+   * Creates a new SingleTermsEnum.
+   *
    + * After calling the constructor the enumeration is already pointing to the term, + * if it exists. + */ + public SingleTermsEnum(IndexReader reader, Term singleTerm) throws IOException { + super(reader, singleTerm.field()); + singleRef = new BytesRef(singleTerm.text()); + setInitialSeekTerm(singleRef); + } + + @Override + protected AcceptStatus accept(BytesRef term) { + return term.equals(singleRef) ? AcceptStatus.YES : AcceptStatus.END; + } + +} diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index d2e03e8ead3..decf1c84e8c 100644 --- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -17,7 +17,7 @@ package org.apache.lucene.search; * limitations under the License. */ -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsAndPositionsEnum; import java.io.IOException; import java.util.HashMap; @@ -28,9 +28,9 @@ final class SloppyPhraseScorer extends PhraseScorer { private PhrasePositions tmpPos[]; // for flipping repeating pps. private boolean checkedRepeats; - SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, + SloppyPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets, Similarity similarity, int slop, byte[] norms) { - super(weight, tps, offsets, similarity, norms); + super(weight, postings, offsets, similarity, norms); this.slop = slop; } diff --git a/lucene/src/java/org/apache/lucene/search/TermQuery.java b/lucene/src/java/org/apache/lucene/search/TermQuery.java index bdc85af939d..7fdf3e82395 100644 --- a/lucene/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java @@ -20,8 +20,10 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; @@ -71,12 +73,14 @@ public class TermQuery extends Query { @Override public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { - TermDocs termDocs = reader.termDocs(term); - - if (termDocs == null) + // NOTE: debateably, the caller should never pass in a + // multi reader... 
+ DocsEnum docs = MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + if (docs == null) { return null; + } - return new TermScorer(this, termDocs, similarity, reader.norms(term.field())); + return new TermScorer(this, docs, similarity, reader.norms(term.field())); } @Override @@ -114,15 +118,12 @@ public class TermQuery extends Query { Explanation tfExplanation = new Explanation(); int tf = 0; - TermDocs termDocs = reader.termDocs(term); - if (termDocs != null) { - try { - if (termDocs.skipTo(doc) && termDocs.doc() == doc) { - tf = termDocs.freq(); + DocsEnum docs = reader.termDocsEnum(MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + if (docs != null) { + int newDoc = docs.advance(doc); + if (newDoc == doc) { + tf = docs.freq(); } - } finally { - termDocs.close(); - } tfExplanation.setValue(similarity.tf(tf)); tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")"); } else { diff --git a/lucene/src/java/org/apache/lucene/search/TermRangeFilter.java b/lucene/src/java/org/apache/lucene/search/TermRangeFilter.java index 923fb8be049..b6da2410827 100644 --- a/lucene/src/java/org/apache/lucene/search/TermRangeFilter.java +++ b/lucene/src/java/org/apache/lucene/search/TermRangeFilter.java @@ -87,9 +87,6 @@ public class TermRangeFilter extends MultiTermQueryWrapperFilter public static TermRangeFilter More(String fieldName, String lowerTerm) { return new TermRangeFilter(fieldName, lowerTerm, null, true, false); } - - /** Returns the field name for this filter */ - public String getField() { return query.getField(); } /** Returns the lower value of this range filter */ public String getLowerTerm() { return query.getLowerTerm(); } diff --git a/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java b/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java index 383883dac08..045e6891cb7 100644 --- a/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.text.Collator; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; /** @@ -41,7 +44,6 @@ public class TermRangeQuery extends MultiTermQuery { private String lowerTerm; private String upperTerm; private Collator collator; - private String field; private boolean includeLower; private boolean includeUpper; @@ -104,7 +106,7 @@ public class TermRangeQuery extends MultiTermQuery { */ public TermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper, Collator collator) { - this.field = field; + super(field); this.lowerTerm = lowerTerm; this.upperTerm = upperTerm; this.includeLower = includeLower; @@ -112,9 +114,6 @@ public class TermRangeQuery extends MultiTermQuery { this.collator = collator; } - /** Returns the field name for this query */ - public String getField() { return field; } - /** Returns the lower value of this range query */ public String getLowerTerm() { return lowerTerm; } @@ -130,12 +129,33 @@ public class TermRangeQuery extends MultiTermQuery { /** Returns the collator used to determine range inclusion, if any. 
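The MultiFields/DocsEnum access that replaces TermDocs in the TermQuery changes above follows a simple pattern; a hedged sketch (field and term text are illustrative, Bits is org.apache.lucene.util.Bits):

// Sketch of the flex postings access used by TermQuery.scorer above.
void visitPostings(IndexReader reader, String field, String text) throws IOException {
  Bits delDocs = MultiFields.getDeletedDocs(reader);
  DocsEnum docs = MultiFields.getTermDocsEnum(reader, delDocs, field, new BytesRef(text));
  if (docs == null) {
    return;                                        // term does not occur in this field
  }
  int doc;
  while ((doc = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
    int freq = docs.freq();                        // within-document term frequency
    // ... collect or score doc here (illustrative) ...
  }
}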
*/ public Collator getCollator() { return collator; } - @Override + @Override @Deprecated protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new TermRangeTermEnum(reader, field, lowerTerm, upperTerm, includeLower, includeUpper, collator); } + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + if (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) { + return TermsEnum.EMPTY; + } + if ((lowerTerm == null || (collator == null && includeLower && "".equals(lowerTerm))) && upperTerm == null) { + // NOTE: debateably, the caller should never pass in a + // multi reader... + final Terms terms = MultiFields.getTerms(reader, field); + return (terms != null) ? terms.iterator() : null; + } + return new TermRangeTermsEnum(reader, field, + lowerTerm, upperTerm, includeLower, includeUpper, collator); + } + + /** @deprecated */ + @Deprecated + public String field() { + return getField(); + } + /** Prints a user-readable version of this query. */ @Override public String toString(String field) { @@ -158,7 +178,6 @@ public class TermRangeQuery extends MultiTermQuery { final int prime = 31; int result = super.hashCode(); result = prime * result + ((collator == null) ? 0 : collator.hashCode()); - result = prime * result + ((field == null) ? 0 : field.hashCode()); result = prime * result + (includeLower ? 1231 : 1237); result = prime * result + (includeUpper ? 1231 : 1237); result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.hashCode()); @@ -180,11 +199,6 @@ public class TermRangeQuery extends MultiTermQuery { return false; } else if (!collator.equals(other.collator)) return false; - if (field == null) { - if (other.field != null) - return false; - } else if (!field.equals(other.field)) - return false; if (includeLower != other.includeLower) return false; if (includeUpper != other.includeUpper) diff --git a/lucene/src/java/org/apache/lucene/search/TermRangeTermEnum.java b/lucene/src/java/org/apache/lucene/search/TermRangeTermEnum.java index edd0bba8ee7..ac28464de1d 100644 --- a/lucene/src/java/org/apache/lucene/search/TermRangeTermEnum.java +++ b/lucene/src/java/org/apache/lucene/search/TermRangeTermEnum.java @@ -31,7 +31,9 @@ import org.apache.lucene.util.StringHelper; * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. * @since 2.9 + * @deprecated Please switch to {@link TermRangeTermsEnum} */ +@Deprecated public class TermRangeTermEnum extends FilteredTermEnum { private Collator collator = null; diff --git a/lucene/src/java/org/apache/lucene/search/TermRangeTermsEnum.java b/lucene/src/java/org/apache/lucene/search/TermRangeTermsEnum.java new file mode 100644 index 00000000000..4ccfe51274d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/TermRangeTermsEnum.java @@ -0,0 +1,132 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.text.Collator; +import java.util.Comparator; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.BytesRef; + +/** + * Subclass of FilteredTermEnum for enumerating all terms that match the + * specified range parameters. + *

+ * Term enumerations are always ordered by
+ * {@link #getComparator}. Each term in the enumeration is
+ * greater than all that precede it.
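A usage sketch of the range query this enum backs (field and bounds are illustrative; a null Collator selects the field's native term ordering, as in the accept logic below):

// Hedged usage sketch, not part of the patch: terms in ["2009", "2010") on "date".
TermRangeQuery range = new TermRangeQuery("date", "2009", "2010",
                                          true, false, null);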
    + */ +public class TermRangeTermsEnum extends FilteredTermsEnum { + + private Collator collator; + private String upperTermText; + private String lowerTermText; + private boolean includeLower; + private boolean includeUpper; + final private BytesRef lowerBytesRef; + final private BytesRef upperBytesRef; + private final Comparator termComp; + + /** + * Enumerates all terms greater/equal than lowerTerm + * but less/equal than upperTerm. + * + * If an endpoint is null, it is said to be "open". Either or both + * endpoints may be open. Open endpoints may not be exclusive + * (you can't select all but the first or last term without + * explicitly specifying the term to exclude.) + * + * @param reader + * @param field + * An interned field that holds both lower and upper terms. + * @param lowerTermText + * The term text at the lower end of the range + * @param upperTermText + * The term text at the upper end of the range + * @param includeLower + * If true, the lowerTerm is included in the range. + * @param includeUpper + * If true, the upperTerm is included in the range. + * @param collator + * The collator to use to collate index Terms, to determine their + * membership in the range bounded by lowerTerm and + * upperTerm. + * + * @throws IOException + */ + public TermRangeTermsEnum(IndexReader reader, String field, String lowerTermText, String upperTermText, + boolean includeLower, boolean includeUpper, Collator collator) throws IOException { + super(reader, field); + this.collator = collator; + this.upperTermText = upperTermText; + this.lowerTermText = lowerTermText; + this.includeLower = includeLower; + this.includeUpper = includeUpper; + + // do a little bit of normalization... + // open ended range queries should always be inclusive. + if (this.lowerTermText == null) { + this.lowerTermText = ""; + this.includeLower = true; + } + lowerBytesRef = new BytesRef(this.lowerTermText); + + if (this.upperTermText == null) { + this.includeUpper = true; + upperBytesRef = null; + } else { + upperBytesRef = new BytesRef(upperTermText); + } + + BytesRef startBytesRef = (collator == null) ? lowerBytesRef : new BytesRef(""); + setInitialSeekTerm(startBytesRef); + termComp = getComparator(); + } + + @Override + protected AcceptStatus accept(BytesRef term) { + if (collator == null) { + if (!this.includeLower && term.equals(lowerBytesRef)) + return AcceptStatus.NO; + // Use this field's default sort ordering + if (upperBytesRef != null) { + final int cmp = termComp.compare(upperBytesRef, term); + /* + * if beyond the upper term, or is exclusive and this is equal to + * the upper term, break out + */ + if ((cmp < 0) || + (!includeUpper && cmp==0)) { + return AcceptStatus.END; + } + } + return AcceptStatus.YES; + } else { + if ((includeLower + ? collator.compare(term.utf8ToString(), lowerTermText) >= 0 + : collator.compare(term.utf8ToString(), lowerTermText) > 0) + && (upperTermText == null + || (includeUpper + ? 
collator.compare(term.utf8ToString(), upperTermText) <= 0 + : collator.compare(term.utf8ToString(), upperTermText) < 0))) { + return AcceptStatus.YES; + } + return AcceptStatus.NO; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/search/TermScorer.java b/lucene/src/java/org/apache/lucene/search/TermScorer.java index d450295afc2..6c7ff6bcab1 100644 --- a/lucene/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/src/java/org/apache/lucene/search/TermScorer.java @@ -19,25 +19,26 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.DocsEnum; /** Expert: A Scorer for documents matching a Term. */ final class TermScorer extends Scorer { - private Weight weight; - private TermDocs termDocs; + private DocsEnum docsEnum; private byte[] norms; private float weightValue; private int doc = -1; + private int freq; - private final int[] docs = new int[32]; // buffered doc numbers - private final int[] freqs = new int[32]; // buffered term freqs private int pointer; private int pointerMax; private static final int SCORE_CACHE_SIZE = 32; private float[] scoreCache = new float[SCORE_CACHE_SIZE]; + private int[] docs; + private int[] freqs; + private final DocsEnum.BulkReadResult bulkResult; /** * Construct a TermScorer. @@ -52,13 +53,14 @@ final class TermScorer extends Scorer { * @param norms * The field norms of the document fields for the Term. */ - TermScorer(Weight weight, TermDocs td, Similarity similarity, byte[] norms) { + TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { super(similarity); this.weight = weight; - this.termDocs = td; + this.docsEnum = td; this.norms = norms; this.weightValue = weight.getValue(); + bulkResult = td.getBulkResult(); for (int i = 0; i < SCORE_CACHE_SIZE; i++) scoreCache[i] = getSimilarity().tf(i) * weightValue; @@ -69,62 +71,69 @@ final class TermScorer extends Scorer { score(c, Integer.MAX_VALUE, nextDoc()); } + private final void refillBuffer() throws IOException { + pointerMax = docsEnum.read(); // refill + docs = bulkResult.docs.ints; + freqs = bulkResult.freqs.ints; + } + // firstDocID is ignored since nextDoc() sets 'doc' @Override protected boolean score(Collector c, int end, int firstDocID) throws IOException { c.setScorer(this); while (doc < end) { // for docs in window c.collect(doc); // collect score - if (++pointer >= pointerMax) { - pointerMax = termDocs.read(docs, freqs); // refill buffers + refillBuffer(); if (pointerMax != 0) { pointer = 0; } else { - termDocs.close(); // close stream - doc = Integer.MAX_VALUE; // set to sentinel value + doc = NO_MORE_DOCS; // set to sentinel value return false; } } doc = docs[pointer]; + freq = freqs[pointer]; } return true; } @Override - public int docID() { return doc; } + public int docID() { + return doc; + } /** * Advances to the next document matching the query.
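The refillBuffer()/read() calls above pull document numbers and frequencies in bulk; the same pattern sketched in isolation (names follow the patch, the consumer is illustrative):

// Hedged sketch of bulk postings reading as used by TermScorer above.
DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
int count;
while ((count = docsEnum.read()) != 0) {           // 0 means the enum is exhausted
  int[] docs = bulk.docs.ints;
  int[] freqs = bulk.freqs.ints;
  for (int i = 0; i < count; i++) {
    collect(docs[i], freqs[i]);                    // hypothetical consumer
  }
}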
    * The iterator over the matching documents is buffered using * {@link TermDocs#read(int[],int[])}. * - * @return the document matching the query or -1 if there are no more documents. + * @return the document matching the query or NO_MORE_DOCS if there are no more documents. */ @Override public int nextDoc() throws IOException { pointer++; if (pointer >= pointerMax) { - pointerMax = termDocs.read(docs, freqs); // refill buffer + refillBuffer(); if (pointerMax != 0) { pointer = 0; } else { - termDocs.close(); // close stream return doc = NO_MORE_DOCS; } } doc = docs[pointer]; + freq = freqs[pointer]; + assert doc != NO_MORE_DOCS; return doc; } @Override public float score() { - assert doc != -1; - int f = freqs[pointer]; + assert doc != NO_MORE_DOCS; float raw = // compute tf(f)*weight - f < SCORE_CACHE_SIZE // check cache - ? scoreCache[f] // cache hit - : getSimilarity().tf(f)*weightValue; // cache miss + freq < SCORE_CACHE_SIZE // check cache + ? scoreCache[freq] // cache hit + : getSimilarity().tf(freq)*weightValue; // cache miss return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[doc]); // normalize for field } @@ -132,34 +141,34 @@ final class TermScorer extends Scorer { /** * Advances to the first match beyond the current whose document number is * greater than or equal to a given target.
    - * The implementation uses {@link TermDocs#skipTo(int)}. + * The implementation uses {@link DocsEnum#advance(int)}. * * @param target * The target document number. - * @return the matching document or -1 if none exist. + * @return the matching document or NO_MORE_DOCS if none exist. */ @Override public int advance(int target) throws IOException { // first scan in cache for (pointer++; pointer < pointerMax; pointer++) { if (docs[pointer] >= target) { + freq = freqs[pointer]; return doc = docs[pointer]; } } - // not found in cache, seek underlying stream - boolean result = termDocs.skipTo(target); - if (result) { - pointerMax = 1; - pointer = 0; - docs[pointer] = doc = termDocs.doc(); - freqs[pointer] = termDocs.freq(); + // not found in readahead cache, seek underlying stream + int newDoc = docsEnum.advance(target); + //System.out.println("ts.advance docsEnum=" + docsEnum); + if (newDoc != DocsEnum.NO_MORE_DOCS) { + doc = newDoc; + freq = docsEnum.freq(); } else { doc = NO_MORE_DOCS; } return doc; } - + /** Returns a string representation of this TermScorer. */ @Override public String toString() { return "scorer(" + weight + ")"; } diff --git a/lucene/src/java/org/apache/lucene/search/WildcardQuery.java b/lucene/src/java/org/apache/lucene/search/WildcardQuery.java index f6c42c3983b..bd8d9b0c622 100644 --- a/lucene/src/java/org/apache/lucene/search/WildcardQuery.java +++ b/lucene/src/java/org/apache/lucene/search/WildcardQuery.java @@ -19,101 +19,95 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; /** Implements the wildcard search query. Supported wildcards are *, which * matches any character sequence (including the empty one), and ?, * which matches any single character. Note this query can be slow, as it * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries, - * a Wildcard term should not start with one of the wildcards * or - * ?. + * a Wildcard term should not start with the wildcard * * *
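These wildcard semantics map directly onto the automaton package in the rewritten WildcardQuery below; as a worked example, the pattern te?t* is the concatenation of makeChar('t'), makeChar('e'), makeAnyChar(), makeChar('t') and makeAnyString(). A hedged usage sketch:

// Hedged usage sketch, not part of the patch.
WildcardQuery wq = new WildcardQuery(new Term("body", "te?t*"));
// toAutomaton below builds: concatenate(makeChar('t'), makeChar('e'),
//                                       makeAnyChar(), makeChar('t'), makeAnyString())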

    This query uses the {@link * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * rewrite method. * - * @see WildcardTermEnum */ -public class WildcardQuery extends MultiTermQuery { - private boolean termContainsWildcard; - private boolean termIsPrefix; - protected Term term; - - public WildcardQuery(Term term) { - this.term = term; - String text = term.text(); - this.termContainsWildcard = (text.indexOf('*') != -1) - || (text.indexOf('?') != -1); - this.termIsPrefix = termContainsWildcard - && (text.indexOf('?') == -1) - && (text.indexOf('*') == text.length() - 1); - } + * @see AutomatonQuery + */ +public class WildcardQuery extends AutomatonQuery { + /** String equality with support for wildcards */ + public static final char WILDCARD_STRING = '*'; - @Override - protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { - if (termContainsWildcard) - return new WildcardTermEnum(reader, getTerm()); - else - return new SingleTermEnum(reader, getTerm()); + /** Char equality with support for wildcards */ + public static final char WILDCARD_CHAR = '?'; + + /** + * Constructs a query for terms matching term. + */ + public WildcardQuery(Term term) { + super(term, toAutomaton(term)); } + /** + * Convert Lucene wildcard syntax into an automaton. + */ + static Automaton toAutomaton(Term wildcardquery) { + List automata = new ArrayList(); + + String wildcardText = wildcardquery.text(); + + for (int i = 0; i < wildcardText.length(); i++) { + final char c = wildcardText.charAt(i); + switch(c) { + case WILDCARD_STRING: + automata.add(BasicAutomata.makeAnyString()); + break; + case WILDCARD_CHAR: + automata.add(BasicAutomata.makeAnyChar()); + break; + default: + automata.add(BasicAutomata.makeChar(c)); + } + } + + return BasicOperations.concatenate(automata); + } + + @Override @Deprecated + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new WildcardTermEnum(reader, term); + } + + // we override this method, else backwards layer in MTQ will prefer getEnum! + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return super.getTermsEnum(reader); + } + /** * Returns the pattern term. */ public Term getTerm() { return term; } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - if (termIsPrefix) { - MultiTermQuery rewritten = new PrefixQuery(term.createTerm(term.text() - .substring(0, term.text().indexOf('*')))); - rewritten.setBoost(getBoost()); - rewritten.setRewriteMethod(getRewriteMethod()); - return rewritten; - } else { - return super.rewrite(reader); - } - } /** Prints a user-readable version of this query. */ @Override public String toString(String field) { StringBuilder buffer = new StringBuilder(); - if (!term.field().equals(field)) { - buffer.append(term.field()); + if (!getField().equals(field)) { + buffer.append(getField()); buffer.append(":"); } buffer.append(term.text()); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } - - @Override - public int hashCode() { - final int prime = 31; - int result = super.hashCode(); - result = prime * result + ((term == null) ? 
0 : term.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (!super.equals(obj)) - return false; - if (getClass() != obj.getClass()) - return false; - WildcardQuery other = (WildcardQuery) obj; - if (term == null) { - if (other.term != null) - return false; - } else if (!term.equals(other.term)) - return false; - return true; - } - } diff --git a/lucene/src/java/org/apache/lucene/search/WildcardTermEnum.java b/lucene/src/java/org/apache/lucene/search/WildcardTermEnum.java index 82d60fdac11..7e58c3b31df 100644 --- a/lucene/src/java/org/apache/lucene/search/WildcardTermEnum.java +++ b/lucene/src/java/org/apache/lucene/search/WildcardTermEnum.java @@ -28,7 +28,9 @@ import org.apache.lucene.index.Term; *

    * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. + * @deprecated Please use {@link AutomatonTermsEnum} instead. */ +@Deprecated public class WildcardTermEnum extends FilteredTermEnum { final Term searchTerm; final String field; @@ -91,8 +93,8 @@ public class WildcardTermEnum extends FilteredTermEnum { * String equality with support for wildcards ********************************************/ - public static final char WILDCARD_STRING = '*'; - public static final char WILDCARD_CHAR = '?'; + public static final char WILDCARD_STRING = WildcardQuery.WILDCARD_STRING; + public static final char WILDCARD_CHAR = WildcardQuery.WILDCARD_CHAR; /** * Determines if a word matches a wildcard pattern. diff --git a/lucene/src/java/org/apache/lucene/search/function/MultiValueSource.java b/lucene/src/java/org/apache/lucene/search/function/MultiValueSource.java new file mode 100644 index 00000000000..39991fae253 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/function/MultiValueSource.java @@ -0,0 +1,136 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Explanation; + +/** This class wraps another ValueSource, but protects + * against accidental double RAM usage in FieldCache when + * a composite reader is passed to {@link #getValues}. + * + *

+ * NOTE: this class adds a CPU penalty to every
+ * lookup, as it must resolve the incoming document to the
+ * right sub-reader using a binary search.
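A usage sketch of the wrapper (the wrapped ValueSource, reader and docId are assumed to exist):

// Hedged usage sketch, not part of the patch.
ValueSource safe = new MultiValueSource(someValueSource);  // someValueSource is illustrative
DocValues values = safe.getValues(reader);                 // reader may be composite or atomic
float v = values.floatVal(docId);                          // resolved against the right sub-reader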
    + * + * @deprecated This class is temporary, to ease the + * migration to segment-based searching. Please change your + * code to not pass composite readers to these APIs. */ + +@Deprecated +public final class MultiValueSource extends ValueSource { + + final ValueSource other; + public MultiValueSource(ValueSource other) { + this.other = other; + } + + @Override + public DocValues getValues(IndexReader reader) throws IOException { + + IndexReader[] subReaders = reader.getSequentialSubReaders(); + if (subReaders != null) { + // This is a composite reader + return new MultiDocValues(subReaders); + } else { + // Already an atomic reader -- just delegate + return other.getValues(reader); + } + } + + @Override + public String description() { + return other.description(); + } + + @Override + public boolean equals(Object o) { + if (o instanceof MultiValueSource) { + return ((MultiValueSource) o).other.equals(other); + } else { + return false; + } + } + + @Override + public int hashCode() { + return 31 * other.hashCode(); + } + + private final class MultiDocValues extends DocValues { + + final DocValues[] docValues; + final int[] docStarts; + + MultiDocValues(IndexReader[] subReaders) throws IOException { + docValues = new DocValues[subReaders.length]; + docStarts = new int[subReaders.length]; + int base = 0; + for(int i=0;i terms) { // no terms involved here @@ -127,7 +128,8 @@ public class ValueSourceQuery extends Query { private class ValueSourceScorer extends Scorer { private final float qWeight; private final DocValues vals; - private final TermDocs termDocs; + private final Bits delDocs; + private final int maxDoc; private int doc = -1; // constructor @@ -136,28 +138,37 @@ public class ValueSourceQuery extends Query { qWeight = w.getValue(); // this is when/where the values are first created. vals = valSrc.getValues(reader); - termDocs = reader.termDocs(null); + delDocs = MultiFields.getDeletedDocs(reader); + maxDoc = reader.maxDoc(); } @Override public int nextDoc() throws IOException { - return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS; + doc++; + while (delDocs != null && doc < maxDoc && delDocs.get(doc)) { + doc++; + } + if (doc == maxDoc) { + doc = NO_MORE_DOCS; + } + return doc; } - + @Override public int docID() { return doc; } - + @Override public int advance(int target) throws IOException { - return doc = termDocs.skipTo(target) ? 
termDocs.doc() : NO_MORE_DOCS; + doc = target - 1; + return nextDoc(); } /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */ @Override public float score() throws IOException { - return qWeight * vals.floatVal(termDocs.doc()); + return qWeight * vals.floatVal(doc); } } diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java index 5ca45e90fec..5a22e7c1f65 100644 --- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java @@ -18,8 +18,8 @@ package org.apache.lucene.search.payloads; */ import org.apache.lucene.index.Term; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermPositions; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; @@ -30,6 +30,7 @@ import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.search.spans.SpanScorer; +import org.apache.lucene.util.BytesRef; import java.io.IOException; @@ -80,16 +81,15 @@ public class PayloadTermQuery extends SpanTermQuery { } protected class PayloadTermSpanScorer extends SpanScorer { - // TODO: is this the best way to allocate this? - protected byte[] payload = new byte[256]; - protected TermPositions positions; + protected BytesRef payload; protected float payloadScore; protected int payloadsSeen; + private final TermSpans termSpans; public PayloadTermSpanScorer(TermSpans spans, Weight weight, Similarity similarity, byte[] norms) throws IOException { super(spans, weight, similarity, norms); - positions = spans.getPositions(); + termSpans = spans; } @Override @@ -115,12 +115,24 @@ public class PayloadTermQuery extends SpanTermQuery { } protected void processPayload(Similarity similarity) throws IOException { - if (positions.isPayloadAvailable()) { - payload = positions.getPayload(payload, 0); - payloadScore = function.currentScore(doc, term.field(), - spans.start(), spans.end(), payloadsSeen, payloadScore, - similarity.scorePayload(doc, term.field(), spans.start(), spans - .end(), payload, 0, positions.getPayloadLength())); + final DocsAndPositionsEnum postings = termSpans.getPostings(); + if (postings.hasPayload()) { + payload = postings.getPayload(); + if (payload != null) { + payloadScore = function.currentScore(doc, term.field(), + spans.start(), spans.end(), payloadsSeen, payloadScore, + similarity.scorePayload(doc, term.field(), spans.start(), + spans.end(), payload.bytes, + payload.offset, + payload.length)); + } else { + payloadScore = function.currentScore(doc, term.field(), + spans.start(), spans.end(), payloadsSeen, payloadScore, + similarity.scorePayload(doc, term.field(), spans.start(), + spans.end(), null, + 0, + 0)); + } payloadsSeen++; } else { diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java index 2808c4b65a9..40d5a885639 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java @@ -19,6 +19,9 @@ package org.apache.lucene.search.spans; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import 
org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; @@ -39,7 +42,7 @@ public class SpanTermQuery extends SpanQuery { @Override public void extractTerms(Set terms) { - terms.add(term); + terms.add(term); } @Override @@ -80,7 +83,24 @@ public class SpanTermQuery extends SpanQuery { @Override public Spans getSpans(final IndexReader reader) throws IOException { - return new TermSpans(reader.termPositions(term), term); - } + // NOTE: debateably, the caller should never pass in a + // multi reader... + final BytesRef textBytes = new BytesRef(term.text()); + final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(reader, + MultiFields.getDeletedDocs(reader), + term.field(), + textBytes); + if (postings != null) { + return new TermSpans(postings, term); + } else { + if (MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), textBytes) != null) { + // term does exist, but has no positions + throw new IllegalStateException("field \"" + term.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run SpanTermQuery (term=" + term.text() + ")"); + } else { + // term does not exist + return TermSpans.EMPTY_TERM_SPANS; + } + } + } } diff --git a/lucene/src/java/org/apache/lucene/search/spans/Spans.java b/lucene/src/java/org/apache/lucene/search/spans/Spans.java index 04c38e63cbf..2d21e8ef79f 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/Spans.java +++ b/lucene/src/java/org/apache/lucene/search/spans/Spans.java @@ -83,5 +83,4 @@ public abstract class Spans { * @return true if there is a payload available at this position that can be loaded */ public abstract boolean isPayloadAvailable(); - } diff --git a/lucene/src/java/org/apache/lucene/search/spans/TermSpans.java b/lucene/src/java/org/apache/lucene/search/spans/TermSpans.java index 091762a6b23..2ce409eeca9 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/TermSpans.java +++ b/lucene/src/java/org/apache/lucene/search/spans/TermSpans.java @@ -17,7 +17,9 @@ package org.apache.lucene.search.spans; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.Collections; @@ -28,49 +30,53 @@ import java.util.Collection; * Public for extension only */ public class TermSpans extends Spans { - protected TermPositions positions; - protected Term term; + protected final DocsAndPositionsEnum postings; + protected final Term term; protected int doc; protected int freq; protected int count; protected int position; - - public TermSpans(TermPositions positions, Term term) throws IOException { - - this.positions = positions; + public TermSpans(DocsAndPositionsEnum postings, Term term) throws IOException { + this.postings = postings; this.term = term; doc = -1; } + // only for EmptyTermSpans (below) + TermSpans() { + term = null; + postings = null; + } + @Override public boolean next() throws IOException { if (count == freq) { - if (!positions.next()) { - doc = Integer.MAX_VALUE; + if (postings == null) { return false; } - doc = positions.doc(); - freq = positions.freq(); + doc = postings.nextDoc(); + if (doc == DocsAndPositionsEnum.NO_MORE_DOCS) { + return false; + } + freq = postings.freq(); count = 0; } - position = 
positions.nextPosition(); + position = postings.nextPosition(); count++; return true; } @Override public boolean skipTo(int target) throws IOException { - if (!positions.skipTo(target)) { - doc = Integer.MAX_VALUE; + doc = postings.advance(target); + if (doc == DocsAndPositionsEnum.NO_MORE_DOCS) { return false; } - doc = positions.doc(); - freq = positions.freq(); + freq = postings.freq(); count = 0; - - position = positions.nextPosition(); + position = postings.nextPosition(); count++; return true; @@ -94,15 +100,21 @@ public class TermSpans extends Spans { // TODO: Remove warning after API has been finalized @Override public Collection getPayload() throws IOException { - byte [] bytes = new byte[positions.getPayloadLength()]; - bytes = positions.getPayload(bytes, 0); + final BytesRef payload = postings.getPayload(); + final byte[] bytes; + if (payload != null) { + bytes = new byte[payload.length]; + System.arraycopy(payload.bytes, payload.offset, bytes, 0, payload.length); + } else { + bytes = null; + } return Collections.singletonList(bytes); } // TODO: Remove warning after API has been finalized @Override public boolean isPayloadAvailable() { - return positions.isPayloadAvailable(); + return postings.hasPayload(); } @Override @@ -111,8 +123,47 @@ public class TermSpans extends Spans { (doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position); } - - public TermPositions getPositions() { - return positions; + public DocsAndPositionsEnum getPostings() { + return postings; } + + private static final class EmptyTermSpans extends TermSpans { + + @Override + public boolean next() { + return false; + } + + @Override + public boolean skipTo(int target) { + return false; + } + + @Override + public int doc() { + return DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public int start() { + return -1; + } + + @Override + public int end() { + return -1; + } + + @Override + public Collection getPayload() { + return null; + } + + @Override + public boolean isPayloadAvailable() { + return false; + } + } + + public static final TermSpans EMPTY_TERM_SPANS = new EmptyTermSpans(); } diff --git a/lucene/src/java/org/apache/lucene/store/DataInput.java b/lucene/src/java/org/apache/lucene/store/DataInput.java new file mode 100644 index 00000000000..e175be6ae71 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/store/DataInput.java @@ -0,0 +1,239 @@ +package org.apache.lucene.store; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Abstract base class for performing read operations of Lucene's low-level + * data types. 
+ */ +public abstract class DataInput implements Cloneable { + private byte[] bytes; // used by readString() + private char[] chars; // used by readModifiedUTF8String() + private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format + + /** Reads and returns a single byte. + * @see DataOutput#writeByte(byte) + */ + public abstract byte readByte() throws IOException; + + /** Reads a specified number of bytes into an array at the specified offset. + * @param b the array to read bytes into + * @param offset the offset in the array to start storing bytes + * @param len the number of bytes to read + * @see DataOutput#writeBytes(byte[],int) + */ + public abstract void readBytes(byte[] b, int offset, int len) + throws IOException; + + /** Reads a specified number of bytes into an array at the + * specified offset with control over whether the read + * should be buffered (callers who have their own buffer + * should pass in "false" for useBuffer). Currently only + * {@link BufferedIndexInput} respects this parameter. + * @param b the array to read bytes into + * @param offset the offset in the array to start storing bytes + * @param len the number of bytes to read + * @param useBuffer set to false if the caller will handle + * buffering. + * @see DataOutput#writeBytes(byte[],int) + */ + public void readBytes(byte[] b, int offset, int len, boolean useBuffer) + throws IOException + { + // Default to ignoring useBuffer entirely + readBytes(b, offset, len); + } + + /** Reads two bytes and returns a short. + * @see DataOutput#writeByte(byte) + */ + public short readShort() throws IOException { + return (short) (((readByte() & 0xFF) << 8) | (readByte() & 0xFF)); + } + + /** Reads four bytes and returns an int. + * @see DataOutput#writeInt(int) + */ + public int readInt() throws IOException { + return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) + | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); + } + + /** Reads an int stored in variable-length format. Reads between one and + * five bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataOutput#writeVInt(int) + */ + public int readVInt() throws IOException { + byte b = readByte(); + int i = b & 0x7F; + for (int shift = 7; (b & 0x80) != 0; shift += 7) { + b = readByte(); + i |= (b & 0x7F) << shift; + } + return i; + } + + /** Reads eight bytes and returns a long. + * @see DataOutput#writeLong(long) + */ + public long readLong() throws IOException { + return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); + } + + /** Reads a long stored in variable-length format. Reads between one and + * nine bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. */ + public long readVLong() throws IOException { + byte b = readByte(); + long i = b & 0x7F; + for (int shift = 7; (b & 0x80) != 0; shift += 7) { + b = readByte(); + i |= (b & 0x7FL) << shift; + } + return i; + } + + /** Call this if readString should read characters stored + * in the old modified UTF8 format (length in java chars + * and java's modified UTF8 encoding). This is used for + * indices written pre-2.4 See LUCENE-510 for details. */ + public void setModifiedUTF8StringsMode() { + preUTF8Strings = true; + } + + /** Reads a string. 
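A worked example of the variable-length format read by readVInt above (seven payload bits per byte, with the high bit marking a continuation byte), followed by a round trip through a Directory; the directory type and file name are illustrative:

// writeVInt(300): 300 = 0b10_0101100
//   byte 1: 0x2C | 0x80 = 0xAC   (low 7 bits, continuation bit set)
//   byte 2: 0x02                 (remaining bits, no continuation)
// readVInt(): 0xAC -> 44; continuation bit set, read 0x02 -> 44 | (2 << 7) = 300
Directory dir = new RAMDirectory();
IndexOutput out = dir.createOutput("vint.bin");
out.writeVInt(300);
out.close();
IndexInput in = dir.openInput("vint.bin");
int value = in.readVInt();                       // value == 300
in.close();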
+ * @see DataOutput#writeString(String) + */ + public String readString() throws IOException { + if (preUTF8Strings) + return readModifiedUTF8String(); + int length = readVInt(); + if (bytes == null || length > bytes.length) { + bytes = new byte[ArrayUtil.oversize(length, 1)]; + } + readBytes(bytes, 0, length); + return new String(bytes, 0, length, "UTF-8"); + } + + private String readModifiedUTF8String() throws IOException { + int length = readVInt(); + if (chars == null || length > chars.length) { + chars = new char[ArrayUtil.oversize(length, RamUsageEstimator.NUM_BYTES_CHAR)]; + } + readChars(chars, 0, length); + return new String(chars, 0, length); + } + + /** Reads Lucene's old "modified UTF-8" encoded + * characters into an array. + * @param buffer the array to read characters into + * @param start the offset in the array to start storing characters + * @param length the number of characters to read + * @see DataOutput#writeChars(String,int,int) + * @deprecated -- please use readString or readBytes + * instead, and construct the string + * from those utf8 bytes + */ + @Deprecated + public void readChars(char[] buffer, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + byte b = readByte(); + if ((b & 0x80) == 0) + buffer[i] = (char)(b & 0x7F); + else if ((b & 0xE0) != 0xE0) { + buffer[i] = (char)(((b & 0x1F) << 6) + | (readByte() & 0x3F)); + } else { + buffer[i] = (char)(((b & 0x0F) << 12) + | ((readByte() & 0x3F) << 6) + | (readByte() & 0x3F)); + } + } + } + + /** + * Expert + * + * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still + * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything + * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine + * how many more bytes to read + * @param length The number of chars to read + * @deprecated this method operates on old "modified utf8" encoded + * strings + */ + @Deprecated + public void skipChars(int length) throws IOException{ + for (int i = 0; i < length; i++) { + byte b = readByte(); + if ((b & 0x80) == 0){ + //do nothing, we only need one byte + } else if ((b & 0xE0) != 0xE0) { + readByte();//read an additional byte + } else { + //read two additional bytes. + readByte(); + readByte(); + } + } + } + + /** Returns a clone of this stream. + * + *

+ * Clones of a stream access the same data, and are positioned at the same
+ * point as the stream they were cloned from.
+ *
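A small sketch of that contract (file name as in the earlier sketch; the cast reflects the Object-returning clone() below):

IndexInput in = dir.openInput("vint.bin");
in.readVInt();                                   // advance the original
IndexInput clone = (IndexInput) in.clone();      // positioned where 'in' is now
clone.seek(0);                                   // clones may then move independently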
    Expert: Subclasses must ensure that clones may be positioned at + * different points in the input from each other and from the stream they + * were cloned from. + */ + @Override + public Object clone() { + DataInput clone = null; + try { + clone = (DataInput)super.clone(); + } catch (CloneNotSupportedException e) {} + + clone.bytes = null; + clone.chars = null; + + return clone; + } + + public Map readStringStringMap() throws IOException { + final Map map = new HashMap(); + final int count = readInt(); + for(int i=0;i> 24)); + writeByte((byte)(i >> 16)); + writeByte((byte)(i >> 8)); + writeByte((byte) i); + } + + /** Writes an int in a variable-length format. Writes between one and + * five bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataInput#readVInt() + */ + public void writeVInt(int i) throws IOException { + while ((i & ~0x7F) != 0) { + writeByte((byte)((i & 0x7f) | 0x80)); + i >>>= 7; + } + writeByte((byte)i); + } + + /** Writes a long as eight bytes. + * @see DataInput#readLong() + */ + public void writeLong(long i) throws IOException { + writeInt((int) (i >> 32)); + writeInt((int) i); + } + + /** Writes an long in a variable-length format. Writes between one and five + * bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataInput#readVLong() + */ + public void writeVLong(long i) throws IOException { + while ((i & ~0x7F) != 0) { + writeByte((byte)((i & 0x7f) | 0x80)); + i >>>= 7; + } + writeByte((byte)i); + } + + /** Writes a string. + * @see DataInput#readString() + */ + public void writeString(String s) throws IOException { + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); + writeVInt(utf8Result.length); + writeBytes(utf8Result.bytes, 0, utf8Result.length); + } + + /** Writes a sub sequence of characters from s as the old + * format (modified UTF-8 encoded bytes). + * @param s the source of the characters + * @param start the first character in the sequence + * @param length the number of characters in the sequence + * @deprecated -- please pre-convert to utf8 bytes + * instead or use {@link #writeString} + */ + @Deprecated + public void writeChars(String s, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + final int code = s.charAt(i); + if (code >= 0x01 && code <= 0x7F) + writeByte((byte)code); + else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else { + writeByte((byte)(0xE0 | (code >>> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); + } + } + } + + /** Writes a sub sequence of characters from char[] as + * the old format (modified UTF-8 encoded bytes). 
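Unlike these deprecated char-oriented writers, the writeString/readString pair shown earlier stores a VInt byte length followed by standard UTF-8 bytes; a minimal round trip (stream variables as in the previous sketches):

out.writeString("Köln");            // writeVInt(5), then 5 UTF-8 bytes ('ö' takes two)
String s = in.readString();         // reads the length, then decodes the UTF-8 bytes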
+ * @param s the source of the characters + * @param start the first character in the sequence + * @param length the number of characters in the sequence + * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} + */ + @Deprecated + public void writeChars(char[] s, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + final int code = s[i]; + if (code >= 0x01 && code <= 0x7F) + writeByte((byte)code); + else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else { + writeByte((byte)(0xE0 | (code >>> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); + } + } + } + + private static int COPY_BUFFER_SIZE = 16384; + private byte[] copyBuffer; + + /** Copy numBytes bytes from input to ourself. */ + public void copyBytes(DataInput input, long numBytes) throws IOException { + assert numBytes >= 0: "numBytes=" + numBytes; + long left = numBytes; + if (copyBuffer == null) + copyBuffer = new byte[COPY_BUFFER_SIZE]; + while(left > 0) { + final int toCopy; + if (left > COPY_BUFFER_SIZE) + toCopy = COPY_BUFFER_SIZE; + else + toCopy = (int) left; + input.readBytes(copyBuffer, 0, toCopy); + writeBytes(copyBuffer, 0, toCopy); + left -= toCopy; + } + } + + public void writeStringStringMap(Map map) throws IOException { + if (map == null) { + writeInt(0); + } else { + writeInt(map.size()); + for(final Map.Entry entry: map.entrySet()) { + writeString(entry.getKey()); + writeString(entry.getValue()); + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/store/Directory.java b/lucene/src/java/org/apache/lucene/store/Directory.java index e4dd2fb01f3..8ab8d07b308 100644 --- a/lucene/src/java/org/apache/lucene/store/Directory.java +++ b/lucene/src/java/org/apache/lucene/store/Directory.java @@ -19,15 +19,9 @@ package org.apache.lucene.store; import java.io.IOException; import java.io.Closeable; -import java.util.Collection; -import java.util.Collections; +import java.util.Collection; // for javadocs -import java.util.ArrayList; -import static java.util.Arrays.asList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import org.apache.lucene.index.IndexFileNameFilter; +import java.util.Arrays; import org.apache.lucene.util.IOUtils; /** A Directory is a flat list of files. Files may be written once, when they @@ -200,14 +194,7 @@ public abstract class Directory implements Closeable { * @param to destination directory */ public final void copyTo(Directory to) throws IOException { - List filenames = new ArrayList(); - IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); - - for (String name : listAll()) - if (filter.accept(null, name)) - filenames.add(name); - - copyTo(to, filenames); + copyTo(to, Arrays.asList(listAll())); } /** diff --git a/lucene/src/java/org/apache/lucene/store/IndexInput.java b/lucene/src/java/org/apache/lucene/store/IndexInput.java index 2095d518a10..1268c93191d 100644 --- a/lucene/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/src/java/org/apache/lucene/store/IndexInput.java @@ -17,187 +17,14 @@ package org.apache.lucene.store; * limitations under the License. 
*/ -import java.io.IOException; import java.io.Closeable; -import java.util.Map; -import java.util.HashMap; - -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; +import java.io.IOException; /** Abstract base class for input from a file in a {@link Directory}. A * random-access input stream. Used for all Lucene index input operations. * @see Directory */ -public abstract class IndexInput implements Cloneable,Closeable { - private byte[] bytes; // used by readString() - private char[] chars; // used by readModifiedUTF8String() - private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format - - /** Reads and returns a single byte. - * @see IndexOutput#writeByte(byte) - */ - public abstract byte readByte() throws IOException; - - /** Reads a specified number of bytes into an array at the specified offset. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @see IndexOutput#writeBytes(byte[],int) - */ - public abstract void readBytes(byte[] b, int offset, int len) - throws IOException; - - /** Reads a specified number of bytes into an array at the - * specified offset with control over whether the read - * should be buffered (callers who have their own buffer - * should pass in "false" for useBuffer). Currently only - * {@link BufferedIndexInput} respects this parameter. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @param useBuffer set to false if the caller will handle - * buffering. - * @see IndexOutput#writeBytes(byte[],int) - */ - public void readBytes(byte[] b, int offset, int len, boolean useBuffer) - throws IOException - { - // Default to ignoring useBuffer entirely - readBytes(b, offset, len); - } - - /** Reads four bytes and returns an int. - * @see IndexOutput#writeInt(int) - */ - public int readInt() throws IOException { - return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) - | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); - } - - /** Reads an int stored in variable-length format. Reads between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexOutput#writeVInt(int) - */ - public int readVInt() throws IOException { - byte b = readByte(); - int i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7F) << shift; - } - return i; - } - - /** Reads eight bytes and returns a long. - * @see IndexOutput#writeLong(long) - */ - public long readLong() throws IOException { - return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); - } - - /** Reads a long stored in variable-length format. Reads between one and - * nine bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. */ - public long readVLong() throws IOException { - byte b = readByte(); - long i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7FL) << shift; - } - return i; - } - - /** Call this if readString should read characters stored - * in the old modified UTF8 format (length in java chars - * and java's modified UTF8 encoding). This is used for - * indices written pre-2.4 See LUCENE-510 for details. */ - public void setModifiedUTF8StringsMode() { - preUTF8Strings = true; - } - - /** Reads a string. 
- * @see IndexOutput#writeString(String) - */ - public String readString() throws IOException { - if (preUTF8Strings) - return readModifiedUTF8String(); - int length = readVInt(); - if (bytes == null || length > bytes.length) { - bytes = new byte[ArrayUtil.oversize(length, 1)]; - } - readBytes(bytes, 0, length); - return new String(bytes, 0, length, "UTF-8"); - } - - private String readModifiedUTF8String() throws IOException { - int length = readVInt(); - if (chars == null || length > chars.length) { - chars = new char[ArrayUtil.oversize(length, RamUsageEstimator.NUM_BYTES_CHAR)]; - } - readChars(chars, 0, length); - return new String(chars, 0, length); - } - - /** Reads Lucene's old "modified UTF-8" encoded - * characters into an array. - * @param buffer the array to read characters into - * @param start the offset in the array to start storing characters - * @param length the number of characters to read - * @see IndexOutput#writeChars(String,int,int) - * @deprecated -- please use readString or readBytes - * instead, and construct the string - * from those utf8 bytes - */ - @Deprecated - public void readChars(char[] buffer, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - byte b = readByte(); - if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); - else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else { - buffer[i] = (char)(((b & 0x0F) << 12) - | ((readByte() & 0x3F) << 6) - | (readByte() & 0x3F)); - } - } - } - - /** - * Expert - * - * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still - * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything - * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine - * how many more bytes to read - * @param length The number of chars to read - * @deprecated this method operates on old "modified utf8" encoded - * strings - */ - @Deprecated - public void skipChars(int length) throws IOException{ - for (int i = 0; i < length; i++) { - byte b = readByte(); - if ((b & 0x80) == 0){ - //do nothing, we only need one byte - } else if ((b & 0xE0) != 0xE0) { - readByte();//read an additional byte - } else { - //read two additional bytes. - readByte(); - readByte(); - } - } - } - - +public abstract class IndexInput extends DataInput implements Cloneable,Closeable { /** Closes the stream to further operations. */ public abstract void close() throws IOException; @@ -214,38 +41,4 @@ public abstract class IndexInput implements Cloneable,Closeable { /** The number of bytes in the file. */ public abstract long length(); - - /** Returns a clone of this stream. - * - *

    Clones of a stream access the same data, and are positioned at the same - * point as the stream they were cloned from. - * - *

    Expert: Subclasses must ensure that clones may be positioned at - * different points in the input from each other and from the stream they - * were cloned from. - */ - @Override - public Object clone() { - IndexInput clone = null; - try { - clone = (IndexInput)super.clone(); - } catch (CloneNotSupportedException e) {} - - clone.bytes = null; - clone.chars = null; - - return clone; - } - - public Map readStringStringMap() throws IOException { - final Map map = new HashMap(); - final int count = readInt(); - for(int i=0;i> 24)); - writeByte((byte)(i >> 16)); - writeByte((byte)(i >> 8)); - writeByte((byte) i); - } - - /** Writes an int in a variable-length format. Writes between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVInt() - */ - public void writeVInt(int i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a long as eight bytes. - * @see IndexInput#readLong() - */ - public void writeLong(long i) throws IOException { - writeInt((int) (i >> 32)); - writeInt((int) i); - } - - /** Writes an long in a variable-length format. Writes between one and five - * bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVLong() - */ - public void writeVLong(long i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a string. - * @see IndexInput#readString() - */ - public void writeString(String s) throws IOException { - UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); - writeVInt(utf8Result.length); - writeBytes(utf8Result.result, 0, utf8Result.length); - } - - /** Writes a sub sequence of characters from s as the old - * format (modified UTF-8 encoded bytes). - * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes - * instead or use {@link #writeString} - */ - @Deprecated - public void writeChars(String s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = s.charAt(i); - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - /** Writes a sub sequence of characters from char[] as - * the old format (modified UTF-8 encoded bytes). 
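The read/write primitives removed from IndexInput and IndexOutput in the hunks above now live on the new DataInput/DataOutput base classes (IndexInput extends DataInput above; IndexOutput extends DataOutput in the next hunk). One practical consequence is that serialization helpers can be written once against the base classes and reused for any stream. A small hedged sketch; TaggedIntIO and its field layout are illustrative, not part of this patch:

import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

final class TaggedIntIO {
  // Works for anything backed by a DataOutput: an index file, a RAM buffer, etc.
  static void write(DataOutput out, String tag, int value) throws IOException {
    out.writeString(tag);   // VInt length followed by UTF-8 bytes
    out.writeVInt(value);   // one to five bytes, depending on magnitude
  }

  static int read(DataInput in, String expectedTag) throws IOException {
    if (!expectedTag.equals(in.readString())) {
      throw new IOException("unexpected tag");
    }
    return in.readVInt();
  }
}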
- * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} - */ - @Deprecated - public void writeChars(char[] s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = s[i]; - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - private static int COPY_BUFFER_SIZE = 16384; - private byte[] copyBuffer; - - /** Copy numBytes bytes from input to ourself. */ - public void copyBytes(IndexInput input, long numBytes) throws IOException { - assert numBytes >= 0: "numBytes=" + numBytes; - long left = numBytes; - if (copyBuffer == null) - copyBuffer = new byte[COPY_BUFFER_SIZE]; - while(left > 0) { - final int toCopy; - if (left > COPY_BUFFER_SIZE) - toCopy = COPY_BUFFER_SIZE; - else - toCopy = (int) left; - input.readBytes(copyBuffer, 0, toCopy); - writeBytes(copyBuffer, 0, toCopy); - left -= toCopy; - } - } +public abstract class IndexOutput extends DataOutput implements Closeable { /** Forces any buffered output to be written. */ public abstract void flush() throws IOException; @@ -211,16 +58,4 @@ public abstract class IndexOutput implements Closeable { * @param length file length */ public void setLength(long length) throws IOException {} - - public void writeStringStringMap(Map map) throws IOException { - if (map == null) { - writeInt(0); - } else { - writeInt(map.size()); - for(final Map.Entry entry: map.entrySet()) { - writeString(entry.getKey()); - writeString(entry.getValue()); - } - } - } } diff --git a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java index eef023fbf46..2f4c74e1e49 100644 --- a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java @@ -232,6 +232,29 @@ public final class ArrayUtil { return currentSize; } + public static short[] grow(short[] array, int minSize) { + if (array.length < minSize) { + short[] newArray = new short[oversize(minSize, RamUsageEstimator.NUM_BYTES_SHORT)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static short[] grow(short[] array) { + return grow(array, 1 + array.length); + } + + public static short[] shrink(short[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT); + if (newSize != array.length) { + short[] newArray = new short[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + public static int[] grow(int[] array, int minSize) { if (array.length < minSize) { int[] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_INT)]; diff --git a/lucene/src/java/org/apache/lucene/util/BitVector.java b/lucene/src/java/org/apache/lucene/util/BitVector.java index 29f4e05e871..f9dbeb36542 100644 --- a/lucene/src/java/org/apache/lucene/util/BitVector.java +++ b/lucene/src/java/org/apache/lucene/util/BitVector.java @@ -32,7 +32,7 @@ import 
org.apache.lucene.store.IndexOutput;

  • store and load, as bit set or d-gaps, depending on sparseness;
  • */ -public final class BitVector implements Cloneable { +public final class BitVector implements Cloneable, Bits { private byte[] bits; private int size; @@ -110,6 +110,11 @@ public final class BitVector implements Cloneable { return size; } + // @Override -- not until Java 1.6 + public int length() { + return size; + } + /** Returns the total number of one bits in this vector. This is efficiently computed and cached, so that, if the vector is not changed, no recomputation is done for repeated calls. */ diff --git a/lucene/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java b/lucene/src/java/org/apache/lucene/util/Bits.java similarity index 66% rename from lucene/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java rename to lucene/src/java/org/apache/lucene/util/Bits.java index 03a6e95dd9e..3df71d0a14a 100644 --- a/lucene/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java +++ b/lucene/src/java/org/apache/lucene/util/Bits.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,18 +17,13 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.io.IOException; - /** * @lucene.experimental */ -abstract class FormatPostingsDocsConsumer { +public interface Bits { + public boolean get(int index); + public int length(); - /** Adds a new doc in this term. If this returns null - * then we just skip consuming positions/payloads. */ - abstract FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException; - - /** Called when we are done adding docs to this term */ - abstract void finish() throws IOException; + public static final Bits[] EMPTY_ARRAY = new Bits[0]; } diff --git a/lucene/src/java/org/apache/lucene/util/BitsSlice.java b/lucene/src/java/org/apache/lucene/util/BitsSlice.java new file mode 100644 index 00000000000..2c3211c6cad --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/BitsSlice.java @@ -0,0 +1,46 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Exposes a slice of an existing Bits as a new Bits. */ + +public final class BitsSlice implements Bits { + private final Bits parent; + private final int start; + private final int length; + + // start is inclusive; end is exclusive (length = end-start) + public BitsSlice(Bits parent, ReaderUtil.Slice slice) { + this.parent = parent; + this.start = slice.start; + this.length = slice.length; + assert length >= 0: "length=" + length; + } + + public boolean get(int doc) { + if (doc >= length) { + throw new RuntimeException("doc " + doc + " is out of bounds 0 .. 
" + (length-1)); + } + assert doc < length: "doc=" + doc + " length=" + length; + return parent.get(doc+start); + } + + public int length() { + return length; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/BytesRef.java b/lucene/src/java/org/apache/lucene/util/BytesRef.java new file mode 100644 index 00000000000..b41831c408d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java @@ -0,0 +1,250 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Comparator; +import java.io.UnsupportedEncodingException; + +/** Represents byte[], as a slice (offset + length) into an + * existing byte[]. + * + * @lucene.experimental */ +public final class BytesRef { + + public byte[] bytes; + public int offset; + public int length; + + public BytesRef() { + } + + public BytesRef(byte[] bytes, int offset, int length) { + this.bytes = bytes; + this.offset = offset; + this.length = length; + } + + public BytesRef(byte[] bytes) { + this.bytes = bytes; + this.offset = 0; + this.length = bytes.length; + } + + public BytesRef(int capacity) { + this.bytes = new byte[capacity]; + } + + /** + * @param text Initialize the byte[] from the UTF8 bytes + * for the provided Sring. This must be well-formed + * unicode text, with no unpaired surrogates or U+FFFF. + */ + public BytesRef(CharSequence text) { + copy(text); + } + + public BytesRef(BytesRef other) { + copy(other); + } + + /** + * Copies the UTF8 bytes for this string. + * + * @param text Must be well-formed unicode text, with no + * unpaired surrogates or invalid UTF16 code units. + */ + public void copy(CharSequence text) { + // TODO: new byte[10] is waste of resources; it should + // simply allocate text.length()*4 like UnicodeUtil. + // Ideally, I would remove this here and add a + // null-check in UnicodeUtil. (Uwe) + if (bytes == null) { + bytes = new byte[10]; + } + UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this); + } + + public boolean bytesEquals(BytesRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final byte[] otherBytes = other.bytes; + final int end = offset + length; + for(int upto=offset;uptoIt is defined as: + *
    +   *  int hash = 0;
    +   *  for (int i = offset; i < offset + length; i++) {
    +   *    hash = 31*hash + bytes[i];
    +   *  }
    +   * 
    + */ + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + final int end = offset + length; + for(int i=offset;i offset) { + sb.append(' '); + } + sb.append(Integer.toHexString(bytes[i]&0xff)); + } + sb.append(']'); + return sb.toString(); + } + + public void copy(BytesRef other) { + if (bytes == null) { + bytes = new byte[other.length]; + } else { + bytes = ArrayUtil.grow(bytes, other.length); + } + System.arraycopy(other.bytes, other.offset, bytes, 0, other.length); + length = other.length; + offset = 0; + } + + public void grow(int newLength) { + bytes = ArrayUtil.grow(bytes, newLength); + } + + private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); + + public static Comparator getUTF8SortedAsUTF16Comparator() { + return utf8SortedAsUTF16SortOrder; + } + + private static class UTF8SortedAsUTF16Comparator implements Comparator { + // Only singleton + private UTF8SortedAsUTF16Comparator() {}; + + public int compare(BytesRef a, BytesRef b) { + + final byte[] aBytes = a.bytes; + int aUpto = a.offset; + final byte[] bBytes = b.bytes; + int bUpto = b.offset; + + final int aStop; + if (a.length < b.length) { + aStop = aUpto + a.length; + } else { + aStop = aUpto + b.length; + } + + while(aUpto < aStop) { + int aByte = aBytes[aUpto++] & 0xff; + int bByte = bBytes[bUpto++] & 0xff; + + if (aByte != bByte) { + + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + + public boolean equals(Object other) { + return this == other; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/util/CodecUtil.java b/lucene/src/java/org/apache/lucene/util/CodecUtil.java new file mode 100644 index 00000000000..3c5e3e63534 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/CodecUtil.java @@ -0,0 +1,72 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
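A short usage sketch for the new BytesRef class above; the demo class and literal values are made up, while the constructors, bytesEquals, utf8ToString and getUTF8SortedAsUTF16Comparator are the members added by this patch:

import java.util.Arrays;

import org.apache.lucene.util.BytesRef;

public class BytesRefDemo {
  public static void main(String[] args) {
    BytesRef a = new BytesRef("lucene");                 // UTF-8 bytes of the string
    BytesRef b = new BytesRef("lucene");
    System.out.println(a.bytesEquals(b));                // true
    System.out.println(a.hashCode() == b.hashCode());    // true

    BytesRef[] terms = { new BytesRef("z"), new BytesRef("a"), new BytesRef("\uD835\uDD0A") };
    // Sorts UTF-8 encoded terms in the order the old UTF-16 char[] terms used to sort in.
    Arrays.sort(terms, BytesRef.getUTF8SortedAsUTF16Comparator());
    System.out.println(terms[0].utf8ToString());         // "a"
  }
}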
+ */ + + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.index.CorruptIndexException; + +import java.io.IOException; + +/** + * @lucene.experimental + */ + +public final class CodecUtil { + private final static int CODEC_MAGIC = 0x3fd76c17; + + public static void writeHeader(IndexOutput out, String codec, int version) + throws IOException { + final long start = out.getFilePointer(); + out.writeInt(CODEC_MAGIC); + out.writeString(codec); + out.writeInt(version); + + // We require this so we can easily pre-compute header length + if (out.getFilePointer()-start != codec.length()+9) { + throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]"); + } + } + + public static int headerLength(String codec) { + return 9+codec.length(); + } + + public static int checkHeader(IndexInput in, String codec, int maxVersion) + throws IOException { + + // Safety to guard against reading a bogus string: + final int actualHeader = in.readInt(); + if (actualHeader != CODEC_MAGIC) { + throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC); + } + + final String actualCodec = in.readString(); + if (!actualCodec.equals(codec)) { + throw new CorruptIndexException("codec mismatch: actual codec=" + actualCodec + " vs expected codec=" + codec); + } + + final int actualVersion = in.readInt(); + if (actualVersion > maxVersion) { + throw new CorruptIndexException("version " + actualVersion + " is too new (expected <= version " + maxVersion + ")"); + } + + return actualVersion; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/IntsRef.java b/lucene/src/java/org/apache/lucene/util/IntsRef.java new file mode 100644 index 00000000000..78ea03ae443 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/IntsRef.java @@ -0,0 +1,96 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Represents int[], as a slice (offset + length) into an + * existing int[]. 
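The CodecUtil helpers introduced above can be exercised with a simple header round trip. A hedged sketch: the RAMDirectory usage and the file name are illustrative, and createOutput/openInput are assumed to have their current trunk signatures; only writeHeader, checkHeader and headerLength come from this patch:

import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CodecUtil;

public class CodecHeaderDemo {
  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory();

    IndexOutput out = dir.createOutput("demo.hdr");
    CodecUtil.writeHeader(out, "DemoCodec", 1);               // magic int + codec name + version int
    out.close();

    IndexInput in = dir.openInput("demo.hdr");
    int version = CodecUtil.checkHeader(in, "DemoCodec", 1);  // throws CorruptIndexException on mismatch
    // headerLength("DemoCodec") = 4 (magic) + 1 (VInt length) + 9 (name bytes) + 4 (version) = 18
    System.out.println("version=" + version + " headerLength=" + CodecUtil.headerLength("DemoCodec"));
    in.close();
  }
}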
+ * + * @lucene.internal */ +public final class IntsRef { + + public int[] ints; + public int offset; + public int length; + + public IntsRef() { + } + + public IntsRef(int[] ints, int offset, int length) { + this.ints = ints; + this.offset = offset; + this.length = length; + } + + public IntsRef(IntsRef other) { + copy(other); + } + + @Override + public Object clone() { + return new IntsRef(this); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + final int end = offset + length; + for(int i = offset; i < end; i++) { + result = prime * result + ints[i]; + } + return result; + } + + @Override + public boolean equals(Object other) { + return this.intsEquals((IntsRef) other); + } + + public boolean intsEquals(IntsRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final int[] otherInts = other.ints; + final int end = offset + length; + for(int upto=offset;uptoNOTE: This is very costly, as every lookup must + * do a binary search to locate the right sub-reader. + * + * @lucene.experimental + */ + +public final class MultiBits implements Bits { + private final Bits[] subs; + + // length is 1+subs.length (the last entry has the maxDoc): + private final int[] starts; + + public MultiBits(List bits, List starts) { + assert starts.size() == 1+bits.size(); + this.subs = bits.toArray(Bits.EMPTY_ARRAY); + this.starts = new int[starts.size()]; + for(int i=0;i>>= shift; - while (nChars>=1) { - // Store 7 bits per character for good efficiency when UTF-8 encoding. - // The whole number is right-justified so that lucene can prefix-encode - // the terms more efficiently. - buffer[nChars--] = (char)(sortableBits & 0x7f); + while (nChars > 0) { + // Store 7 bits per byte for compatibility + // with UTF-8 encoding of terms + bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f); sortableBits >>>= 7; } - return len; + // calculate hash + for (int i = 1; i < bytes.length; i++) { + hash = 31*hash + bytes.bytes[i]; + } + return hash; } /** - * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link LongRangeBuilder}. * @param val the numeric value * @param shift how many bits to strip from the right - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String longToPrefixCoded(final long val, final int shift) { - final char[] buffer = new char[BUF_SIZE_LONG]; - final int len = longToPrefixCoded(val, shift, buffer); - return new String(buffer, 0, len); - } + final BytesRef buffer = new BytesRef(BUF_SIZE_LONG); + longToPrefixCoded(val, shift, buffer); + return buffer.utf8ToString(); + }*/ /** * This is a convenience method, that returns prefix coded bits of a long without * reducing the precision. It can be used to store the full precision value as a * stored field in index. *

    To decode, use {@link #prefixCodedToLong}. - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String longToPrefixCoded(final long val) { return longToPrefixCoded(val, 0); - } + }*/ /** - * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link NumericTokenStream}. * @param val the numeric value * @param shift how many bits to strip from the right - * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT} - * length - * @return number of chars written to buffer + * @param bytes will contain the encoded value + * @return the hash code for indexing (TermsHash) */ - public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) { + public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) { if (shift>31 || shift<0) throw new IllegalArgumentException("Illegal shift value, must be 0..31"); - int nChars = (31-shift)/7 + 1, len = nChars+1; - buffer[0] = (char)(SHIFT_START_INT + shift); + if (bytes.bytes == null) { + bytes.bytes = new byte[NumericUtils.BUF_SIZE_INT]; + } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_INT) { + bytes.grow(NumericUtils.BUF_SIZE_INT); + } + int hash, nChars = (31-shift)/7 + 1; + bytes.length = nChars+1; + bytes.bytes[0] = (byte) (hash = (SHIFT_START_INT + shift)); int sortableBits = val ^ 0x80000000; sortableBits >>>= shift; - while (nChars>=1) { - // Store 7 bits per character for good efficiency when UTF-8 encoding. - // The whole number is right-justified so that lucene can prefix-encode - // the terms more efficiently. - buffer[nChars--] = (char)(sortableBits & 0x7f); + while (nChars > 0) { + // Store 7 bits per byte for compatibility + // with UTF-8 encoding of terms + bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f); sortableBits >>>= 7; } - return len; + // calculate hash + for (int i = 1; i < bytes.length; i++) { + hash = 31*hash + bytes.bytes[i]; + } + return hash; } /** - * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link IntRangeBuilder}. * @param val the numeric value * @param shift how many bits to strip from the right - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String intToPrefixCoded(final int val, final int shift) { - final char[] buffer = new char[BUF_SIZE_INT]; - final int len = intToPrefixCoded(val, shift, buffer); - return new String(buffer, 0, len); - } + final BytesRef buffer = new BytesRef(BUF_SIZE_INT); + intToPrefixCoded(val, shift, buffer); + return buffer.utf8ToString(); + }*/ /** * This is a convenience method, that returns prefix coded bits of an int without * reducing the precision. It can be used to store the full precision value as a * stored field in index. *

    To decode, use {@link #prefixCodedToInt}. - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String intToPrefixCoded(final int val) { return intToPrefixCoded(val, 0); - } + }*/ /** * Returns a long from prefixCoded characters. @@ -198,51 +223,97 @@ public final class NumericUtils { * @throws NumberFormatException if the supplied string is * not correctly prefix encoded. * @see #longToPrefixCoded(long) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static long prefixCodedToLong(final String prefixCoded) { - final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG; - if (shift>63 || shift<0) - throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)"); + return prefixCodedToLong(new BytesRef(prefixCoded)); + }*/ + + /** + * Returns the shift value from a prefix encoded {@code long}. + * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. + */ + public static int getPrefixCodedLongShift(final BytesRef val) { + final int shift = val.bytes[val.offset] - SHIFT_START_LONG; + if (shift > 63 || shift < 0) + throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)"); + return shift; + } + + /** + * Returns the shift value from a prefix encoded {@code int}. + * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. + */ + public static int getPrefixCodedIntShift(final BytesRef val) { + final int shift = val.bytes[val.offset] - SHIFT_START_INT; + if (shift > 31 || shift < 0) + throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)"); + return shift; + } + + /** + * Returns a long from prefixCoded bytes. + * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode a term's value. + * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. + * @see #longToPrefixCoded(long,int,BytesRef) + */ + public static long prefixCodedToLong(final BytesRef val) { long sortableBits = 0L; - for (int i=1, len=prefixCoded.length(); i0x7f) { + final byte b = val.bytes[i]; + if (b < 0) { throw new NumberFormatException( - "Invalid prefixCoded numerical value representation (char "+ - Integer.toHexString(ch)+" at position "+i+" is invalid)" + "Invalid prefixCoded numerical value representation (byte "+ + Integer.toHexString(b&0xff)+" at position "+(i-val.offset)+" is invalid)" ); } - sortableBits |= ch; + sortableBits |= b; } - return (sortableBits << shift) ^ 0x8000000000000000L; + return (sortableBits << getPrefixCodedLongShift(val)) ^ 0x8000000000000000L; } /** * Returns an int from prefixCoded characters. * Rightmost bits will be zero for lower precision codes. - * This method can be used to decode e.g. a stored field. + * This method can be used to decode a term's value. * @throws NumberFormatException if the supplied string is * not correctly prefix encoded. * @see #intToPrefixCoded(int) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static int prefixCodedToInt(final String prefixCoded) { - final int shift = prefixCoded.charAt(0)-SHIFT_START_INT; - if (shift>31 || shift<0) - throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)"); + return prefixCodedToInt(new BytesRef(prefixCoded)); + }*/ + + /** + * Returns an int from prefixCoded bytes. 
+ * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode a term's value. + * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. + * @see #intToPrefixCoded(int,int,BytesRef) + */ + public static int prefixCodedToInt(final BytesRef val) { int sortableBits = 0; - for (int i=1, len=prefixCoded.length(); i0x7f) { + final byte b = val.bytes[i]; + if (b < 0) { throw new NumberFormatException( - "Invalid prefixCoded numerical value representation (char "+ - Integer.toHexString(ch)+" at position "+i+" is invalid)" + "Invalid prefixCoded numerical value representation (byte "+ + Integer.toHexString(b&0xff)+" at position "+(i-val.offset)+" is invalid)" ); } - sortableBits |= ch; + sortableBits |= b; } - return (sortableBits << shift) ^ 0x80000000; + return (sortableBits << getPrefixCodedIntShift(val)) ^ 0x80000000; } /** @@ -261,10 +332,12 @@ public final class NumericUtils { /** * Convenience method: this just returns: * longToPrefixCoded(doubleToSortableLong(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String doubleToPrefixCoded(double val) { return longToPrefixCoded(doubleToSortableLong(val)); - } + }*/ /** * Converts a sortable long back to a double. @@ -278,10 +351,12 @@ public final class NumericUtils { /** * Convenience method: this just returns: * sortableLongToDouble(prefixCodedToLong(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static double prefixCodedToDouble(String val) { return sortableLongToDouble(prefixCodedToLong(val)); - } + }*/ /** * Converts a float value to a sortable signed int. @@ -299,10 +374,12 @@ public final class NumericUtils { /** * Convenience method: this just returns: * intToPrefixCoded(floatToSortableInt(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String floatToPrefixCoded(float val) { return intToPrefixCoded(floatToSortableInt(val)); - } + }*/ /** * Converts a sortable int back to a float. @@ -316,16 +393,18 @@ public final class NumericUtils { /** * Convenience method: this just returns: * sortableIntToFloat(prefixCodedToInt(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static float prefixCodedToFloat(String val) { return sortableIntToFloat(prefixCodedToInt(val)); - } + }*/ /** - * Expert: Splits a long range recursively. + * Splits a long range recursively. * You may implement a builder that adds clauses to a * {@link org.apache.lucene.search.BooleanQuery} for each call to its - * {@link LongRangeBuilder#addRange(String,String)} + * {@link LongRangeBuilder#addRange(BytesRef,BytesRef)} * method. *
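With the encode and decode halves of the BytesRef-based prefix coding now both shown above, a full-precision round trip looks like the sketch below (assuming BUF_SIZE_INT stays public as before; the demo class itself is made up):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;

public class PrefixCodingDemo {
  public static void main(String[] args) {
    BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
    int termHash = NumericUtils.intToPrefixCoded(42, 0, bytes);      // shift=0 keeps full precision
    System.out.println("hash=" + termHash
        + " shift=" + NumericUtils.getPrefixCodedIntShift(bytes)     // 0
        + " value=" + NumericUtils.prefixCodedToInt(bytes));         // 42
  }
}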

    This method is used by {@link NumericRangeQuery}. */ @@ -336,10 +415,10 @@ public final class NumericUtils { } /** - * Expert: Splits an int range recursively. + * Splits an int range recursively. * You may implement a builder that adds clauses to a * {@link org.apache.lucene.search.BooleanQuery} for each call to its - * {@link IntRangeBuilder#addRange(String,String)} + * {@link IntRangeBuilder#addRange(BytesRef,BytesRef)} * method. *

    This method is used by {@link NumericRangeQuery}. */ @@ -412,10 +491,10 @@ public final class NumericUtils { } /** - * Expert: Callback for {@link #splitLongRange}. + * Callback for {@link #splitLongRange}. * You need to overwrite only one of the methods. - *

NOTE: This is a very low-level interface, - * the method signatures may change in later versions. + * @lucene.internal + * @since 2.9, API changed in a non-backwards-compatible way in 3.1 */ public static abstract class LongRangeBuilder { @@ -423,7 +502,7 @@ * Overwrite this method, if you like to receive the already prefix encoded range bounds. * You can directly build classical (inclusive) range queries from them. */ - public void addRange(String minPrefixCoded, String maxPrefixCoded) { + public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { throw new UnsupportedOperationException(); } @@ -432,16 +511,19 @@

NOTE: This is a very low-level interface, - * the method signatures may change in later versions. + * @lucene.internal + * @since 2.9, API changed in a non-backwards-compatible way in 3.1 */ public static abstract class IntRangeBuilder { @@ -449,7 +531,7 @@ * Overwrite this method, if you like to receive the already prefix encoded range bounds. * You can directly build classical range (inclusive) queries from them. */ - public void addRange(String minPrefixCoded, String maxPrefixCoded) { + public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { throw new UnsupportedOperationException(); } @@ -458,7 +540,10 @@ * You can use this for e.g. debugging purposes (print out range bounds). */ public void addRange(final int min, final int max, final int shift) { - addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift)); + final BytesRef minBytes = new BytesRef(BUF_SIZE_INT), maxBytes = new BytesRef(BUF_SIZE_INT); + intToPrefixCoded(min, shift, minBytes); + intToPrefixCoded(max, shift, maxBytes); + addRange(minBytes, maxBytes); } } diff --git a/lucene/src/java/org/apache/lucene/util/OpenBitSet.java b/lucene/src/java/org/apache/lucene/util/OpenBitSet.java index 6ee6b42b8af..9815ad7dec0 100644 --- a/lucene/src/java/org/apache/lucene/util/OpenBitSet.java +++ b/lucene/src/java/org/apache/lucene/util/OpenBitSet.java @@ -75,7 +75,7 @@ Test system: AMD Opteron, 64 bit linux, Sun Java 1.5_06 -server -Xbatch -Xmx64M */ -public class OpenBitSet extends DocIdSet implements Cloneable, Serializable { +public class OpenBitSet extends DocIdSet implements Bits, Cloneable, Serializable { protected long[] bits; protected int wlen; // number of words (elements) used in the array @@ -132,6 +132,11 @@ public class OpenBitSet extends DocIdSet implements Cloneable, Serializable { return capacity(); } + // @Override -- not until Java 1.6 + public int length() { + return bits.length << 6; + } + /** Returns true if there are no set bits */ public boolean isEmpty() { return cardinality()==0; } diff --git a/lucene/src/java/org/apache/lucene/util/RamUsageEstimator.java b/lucene/src/java/org/apache/lucene/util/RamUsageEstimator.java index 2313c409b89..40a65a79895 100644 --- a/lucene/src/java/org/apache/lucene/util/RamUsageEstimator.java +++ b/lucene/src/java/org/apache/lucene/util/RamUsageEstimator.java @@ -37,6 +37,16 @@ import java.util.*; * @lucene.internal */ public final class RamUsageEstimator { + + public final static int NUM_BYTES_SHORT = 2; + public final static int NUM_BYTES_INT = 4; + public final static int NUM_BYTES_LONG = 8; + public final static int NUM_BYTES_FLOAT = 4; + public final static int NUM_BYTES_DOUBLE = 8; + public final static int NUM_BYTES_OBJ_HEADER = 8; + public final static int NUM_BYTES_OBJ_REF = Constants.JRE_IS_64BIT ? 8 : 4; + public final static int NUM_BYTES_ARRAY_HEADER = NUM_BYTES_OBJ_HEADER + NUM_BYTES_INT + NUM_BYTES_OBJ_REF; + private MemoryModel memoryModel; private final Map seen; @@ -47,11 +57,6 @@ public final class RamUsageEstimator { public final static int NUM_BYTES_OBJECT_REF = Constants.JRE_IS_64BIT ?
8 : 4; public final static int NUM_BYTES_CHAR = 2; - public final static int NUM_BYTES_SHORT = 2; - public final static int NUM_BYTES_INT = 4; - public final static int NUM_BYTES_LONG = 8; - public final static int NUM_BYTES_FLOAT = 4; - public final static int NUM_BYTES_DOUBLE = 8; private boolean checkInterned; diff --git a/lucene/src/java/org/apache/lucene/util/ReaderUtil.java b/lucene/src/java/org/apache/lucene/util/ReaderUtil.java index 62749707d01..f6ecfbbd982 100644 --- a/lucene/src/java/org/apache/lucene/util/ReaderUtil.java +++ b/lucene/src/java/org/apache/lucene/util/ReaderUtil.java @@ -19,6 +19,7 @@ package org.apache.lucene.util; import java.util.ArrayList; import java.util.List; +import java.io.IOException; import org.apache.lucene.index.IndexReader; @@ -29,24 +30,87 @@ import org.apache.lucene.index.IndexReader; */ public class ReaderUtil { + public static class Slice { + public static final Slice[] EMPTY_ARRAY = new Slice[0]; + public final int start; + public final int length; + public final int readerIndex; + + public Slice(int start, int length, int readerIndex) { + this.start = start; + this.length = length; + this.readerIndex = readerIndex; + } + + public String toString() { + return "slice start=" + start + " length=" + length; + } + } + /** - * Gathers sub-readers from reader into a List. + * Gathers sub-readers from reader into a List. See + * {@link Gather} for are more general way to gather + * whatever you need to, per reader. + * + * @lucene.experimental * * @param allSubReaders * @param reader */ - public static void gatherSubReaders(List allSubReaders, IndexReader reader) { - IndexReader[] subReaders = reader.getSequentialSubReaders(); - if (subReaders == null) { - // Add the reader itself, and do not recurse - allSubReaders.add(reader); - } else { - for (int i = 0; i < subReaders.length; i++) { - gatherSubReaders(allSubReaders, subReaders[i]); - } + + public static void gatherSubReaders(final List allSubReaders, IndexReader reader) { + try { + new Gather(reader) { + @Override + protected void add(int base, IndexReader r) { + allSubReaders.add(r); + } + }.run(); + } catch (IOException ioe) { + // won't happen + throw new RuntimeException(ioe); } } + /** Recursively visits all sub-readers of a reader. You + * should subclass this and override the add method to + * gather what you need. + * + * @lucene.experimental */ + public static abstract class Gather { + private final IndexReader topReader; + + public Gather(IndexReader r) { + topReader = r; + } + + public int run() throws IOException { + return run(0, topReader); + } + + public int run(int docBase) throws IOException { + return run(docBase, topReader); + } + + private int run(int base, IndexReader reader) throws IOException { + IndexReader[] subReaders = reader.getSequentialSubReaders(); + if (subReaders == null) { + // atomic reader + add(base, reader); + base += reader.maxDoc(); + } else { + // composite reader + for (int i = 0; i < subReaders.length; i++) { + base = run(base, subReaders[i]); + } + } + + return base; + } + + protected abstract void add(int base, IndexReader r) throws IOException; + } + /** * Returns sub IndexReader that contains the given document id. 
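The new ReaderUtil.Gather above recursively visits every atomic sub-reader along with its starting doc base. A hedged usage sketch (GatherDemo is a made-up wrapper; Gather, run and add are the members shown in the hunk above):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.ReaderUtil;

public class GatherDemo {
  public static List<String> describe(IndexReader topReader) throws IOException {
    final List<String> out = new ArrayList<String>();
    new ReaderUtil.Gather(topReader) {
      @Override
      protected void add(int base, IndexReader r) {
        out.add("docBase=" + base + " maxDoc=" + r.maxDoc());
      }
    }.run();   // returns the total number of documents visited
    return out;
  }
}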
* diff --git a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java index c0c9d9161be..109b1888ea3 100644 --- a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -71,21 +71,6 @@ final public class UnicodeUtil { private static final long HALF_SHIFT = 10; private static final long HALF_MASK = 0x3FFL; - /** - * @lucene.internal - */ - public static final class UTF8Result { - public byte[] result = new byte[10]; - public int length; - - public void setLength(int newLength) { - if (result.length < newLength) { - result = ArrayUtil.grow(result, newLength); - } - length = newLength; - } - } - /** * @lucene.internal */ @@ -94,10 +79,15 @@ final public class UnicodeUtil { public int[] offsets = new int[10]; public int length; + /* + public String toString() { + return new String(result, 0, length); + } + */ + public void setLength(int newLength) { - if (result.length < newLength) { + if (result.length < newLength) result = ArrayUtil.grow(result, newLength); - } length = newLength; } @@ -105,80 +95,89 @@ final public class UnicodeUtil { setLength(other.length); System.arraycopy(other.result, 0, result, 0, length); } - } - /** Encode characters from a char[] source, starting at - * offset and stopping when the character 0xffff is seen. - * Returns the number of bytes written to bytesOut. */ - public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) { - - int upto = 0; - int i = offset; - byte[] out = result.result; - - while(true) { - - final int code = (int) source[i++]; - - if (upto+4 > out.length) { - out = result.result = ArrayUtil.grow(out, upto+4); - } - if (code < 0x80) - out[upto++] = (byte) code; - else if (code < 0x800) { - out[upto++] = (byte) (0xC0 | (code >> 6)); - out[upto++] = (byte)(0x80 | (code & 0x3F)); - } else if (code < 0xD800 || code > 0xDFFF) { - if (code == 0xffff) - // END - break; - out[upto++] = (byte)(0xE0 | (code >> 12)); - out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); - out[upto++] = (byte)(0x80 | (code & 0x3F)); - } else { - // surrogate pair - // confirm valid high surrogate - if (code < 0xDC00 && source[i] != 0xffff) { - int utf32 = (int) source[i]; - // confirm valid low surrogate and write pair - if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { - utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); - i++; - out[upto++] = (byte)(0xF0 | (utf32 >> 18)); - out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); - out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); - out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); - continue; - } - } - // replace unpaired surrogate or out-of-order low surrogate - // with substitution character - out[upto++] = (byte) 0xEF; - out[upto++] = (byte) 0xBF; - out[upto++] = (byte) 0xBD; - } + public void copyText(String other) { + final int otherLength = other.length(); + setLength(otherLength); + other.getChars(0, otherLength, result, 0); + length = otherLength; } - //assert matches(source, offset, i-offset-1, out, upto); - result.length = upto; } /** Encode characters from a char[] source, starting at - * offset for length chars. Returns the number of bytes - * written to bytesOut. */ - public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) { - + * offset for length chars. 
Returns a hash of the resulting bytes */ + public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) { + int hash = 0; int upto = 0; int i = offset; final int end = offset + length; - byte[] out = result.result; + byte[] out = result.bytes; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)]; + result.offset = 0; while(i < end) { final int code = (int) source[i++]; - if (upto+4 > out.length) { - out = result.result = ArrayUtil.grow(out, upto+4); + if (code < 0x80) { + hash = 31*hash + (out[upto++] = (byte) code); + } else if (code < 0x800) { + hash = 31*hash + (out[upto++] = (byte) (0xC0 | (code >> 6))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F))); + } else if (code < 0xD800 || code > 0xDFFF) { + hash = 31*hash + (out[upto++] = (byte)(0xE0 | (code >> 12))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F))); + } else { + // surrogate pair + // confirm valid high surrogate + if (code < 0xDC00 && i < end) { + int utf32 = (int) source[i]; + // confirm valid low surrogate and write pair + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { + utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); + i++; + hash = 31*hash + (out[upto++] = (byte)(0xF0 | (utf32 >> 18))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | (utf32 & 0x3F))); + continue; + } + } + // replace unpaired surrogate or out-of-order low surrogate + // with substitution character + hash = 31*hash + (out[upto++] = (byte) 0xEF); + hash = 31*hash + (out[upto++] = (byte) 0xBF); + hash = 31*hash + (out[upto++] = (byte) 0xBD); } + } + //assert matches(source, offset, length, out, upto); + result.length = upto; + return hash; + } + + /** Encode characters from a char[] source, starting at + * offset for length chars. Returns the number of bytes + * written to bytesOut. */ + public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) { + + int upto = 0; + int i = offset; + final int end = offset + length; + byte[] out = result.bytes; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)]; + result.offset = 0; + + while(i < end) { + + final int code = (int) source[i++]; + if (code < 0x80) out[upto++] = (byte) code; else if (code < 0x800) { @@ -191,7 +190,7 @@ final public class UnicodeUtil { } else { // surrogate pair // confirm valid high surrogate - if (code < 0xDC00 && i < end && source[i] != 0xffff) { + if (code < 0xDC00 && i < end) { int utf32 = (int) source[i]; // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { @@ -218,18 +217,20 @@ final public class UnicodeUtil { /** Encode characters from this String, starting at offset * for length characters. Returns the number of bytes * written to bytesOut. 
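The char[] overload of UTF16toUTF8 above now writes into a BytesRef and re-allocates the target array to the four-bytes-per-char worst case when it is too small. A small hedged sketch (the demo class and sample text are made up):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;

public class Utf8Demo {
  public static void main(String[] args) {
    char[] text = "abc\uD835\uDD0A".toCharArray();        // three ASCII chars plus one surrogate pair
    BytesRef utf8 = new BytesRef(text.length * 4);         // pre-size to the worst case
    UnicodeUtil.UTF16toUTF8(text, 0, text.length, utf8);
    System.out.println(utf8.length);                       // 7: three single-byte chars + 4 bytes for U+1D50A
  }
}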
*/ - public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) { + public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) { final int end = offset + length; - byte[] out = result.result; + byte[] out = result.bytes; + result.offset = 0; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)]; int upto = 0; for(int i=offset;i out.length) { - out = result.result = ArrayUtil.grow(out, upto+4); - } if (code < 0x80) out[upto++] = (byte) code; else if (code < 0x800) { @@ -332,6 +333,71 @@ final public class UnicodeUtil { result.length = outUpto; } + /** + * Get the next valid UTF-16 String in UTF-16 order. + *

    + * If the input String is already valid, it is returned. + * Otherwise the next String in code unit order is returned. + *

    + * @param s input String (possibly with unpaired surrogates) + * @return next valid UTF-16 String in UTF-16 order + */ + public static String nextValidUTF16String(String s) { + if (validUTF16String(s)) + return s; + else { + UTF16Result chars = new UTF16Result(); + chars.copyText(s); + nextValidUTF16String(chars); + return new String(chars.result, 0, chars.length); + } + } + + public static void nextValidUTF16String(UTF16Result s) { + final int size = s.length; + for (int i = 0; i < size; i++) { + char ch = s.result[i]; + if (ch >= UnicodeUtil.UNI_SUR_HIGH_START + && ch <= UnicodeUtil.UNI_SUR_HIGH_END) { + if (i < size - 1) { + i++; + char nextCH = s.result[i]; + if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START + && nextCH <= UnicodeUtil.UNI_SUR_LOW_END) { + // Valid surrogate pair + } else + // Unmatched high surrogate + if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) { // SMP not enumerated + s.setLength(i + 1); + s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START; + return; + } else { // SMP already enumerated + if (s.result[i - 1] == UnicodeUtil.UNI_SUR_HIGH_END) { + s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1); + s.setLength(i); + } else { + s.result[i - 1]++; + s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START; + s.setLength(i + 1); + } + return; + } + } else { + // Unmatched high surrogate in final position, SMP not yet enumerated + s.setLength(i + 2); + s.result[i + 1] = (char) UnicodeUtil.UNI_SUR_LOW_START; + return; + } + } else if (ch >= UnicodeUtil.UNI_SUR_LOW_START + && ch <= UnicodeUtil.UNI_SUR_LOW_END) { + // Unmatched low surrogate, SMP already enumerated + s.setLength(i + 1); + s.result[i] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1); + return; + } + } + } + // Only called from assert /* private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) { @@ -386,8 +452,8 @@ final public class UnicodeUtil { return false; } } - - public static final boolean validUTF16String(String s) { + */ + public static final boolean validUTF16String(CharSequence s) { final int size = s.length(); for(int i=0;i + * Class invariants: + *
      + *
    • An automaton is either represented explicitly (with {@link State} and + * {@link Transition} objects) or with a singleton string (see + * {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton + * is known to accept exactly one string. (Implicitly, all states and + * transitions of an automaton are reachable from its initial state.) + *
    • Automata are always reduced (see {@link #reduce()}) and have no + * transitions to dead states (see {@link #removeDeadTransitions()}). + *
    • If an automaton is nondeterministic, then {@link #isDeterministic()} + * returns false (but the converse is not required). + *
    • Automata provided as input to operations are generally assumed to be + * disjoint. + *
    + *

    + * If the states or transitions are manipulated manually, the + * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods + * should be used afterwards to restore representation invariants that are + * assumed by the built-in automata operations. + * + *

    + * @lucene.experimental + */ +public class Automaton implements Serializable, Cloneable { + + /** + * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of + * the most generally efficient algorithms that exist. + * + * @see #setMinimization(int) + */ + public static final int MINIMIZE_HOPCROFT = 2; + + /** Selects minimization algorithm (default: MINIMIZE_HOPCROFT). */ + static int minimization = MINIMIZE_HOPCROFT; + + /** Initial state of this automaton. */ + State initial; + + /** + * If true, then this automaton is definitely deterministic (i.e., there are + * no choices for any run, but a run may crash). + */ + boolean deterministic; + + /** Extra data associated with this automaton. */ + transient Object info; + + /** + * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)} + */ + int hash_code; + + /** Singleton string. Null if not applicable. */ + String singleton; + + /** Minimize always flag. */ + static boolean minimize_always = false; + + /** + * Selects whether operations may modify the input automata (default: + * false). + */ + static boolean allow_mutation = false; + + /** + * Constructs a new automaton that accepts the empty language. Using this + * constructor, automata can be constructed manually from {@link State} and + * {@link Transition} objects. + * + * @see #setInitialState(State) + * @see State + * @see Transition + */ + public Automaton() { + initial = new State(); + deterministic = true; + singleton = null; + } + + boolean isDebug() { + return System.getProperty("dk.brics.automaton.debug") != null; + } + + /** + * Selects minimization algorithm (default: MINIMIZE_HOPCROFT). + * + * @param algorithm minimization algorithm + */ + static public void setMinimization(int algorithm) { + minimization = algorithm; + } + + /** + * Sets or resets minimize always flag. If this flag is set, then + * {@link MinimizationOperations#minimize(Automaton)} will automatically be + * invoked after all operations that otherwise may produce non-minimal + * automata. By default, the flag is not set. + * + * @param flag if true, the flag is set + */ + static public void setMinimizeAlways(boolean flag) { + minimize_always = flag; + } + + /** + * Sets or resets allow mutate flag. If this flag is set, then all automata + * operations may modify automata given as input; otherwise, operations will + * always leave input automata languages unmodified. By default, the flag is + * not set. + * + * @param flag if true, the flag is set + * @return previous value of the flag + */ + static public boolean setAllowMutate(boolean flag) { + boolean b = allow_mutation; + allow_mutation = flag; + return b; + } + + /** + * Returns the state of the allow mutate flag. If this flag is set, then all + * automata operations may modify automata given as input; otherwise, + * operations will always leave input automata languages unmodified. By + * default, the flag is not set. + * + * @return current value of the flag + */ + static boolean getAllowMutate() { + return allow_mutation; + } + + void checkMinimizeAlways() { + if (minimize_always) MinimizationOperations.minimize(this); + } + + boolean isSingleton() { + return singleton != null; + } + + /** + * Returns the singleton string for this automaton. An automaton that accepts + * exactly one string may be represented in singleton mode. In that + * case, this method may be used to obtain the string. + * + * @return string, null if this automaton is not in singleton mode. 
+ */ + public String getSingleton() { + return singleton; + } + + /** + * Sets initial state. + * + * @param s state + */ + public void setInitialState(State s) { + initial = s; + singleton = null; + } + + /** + * Gets initial state. + * + * @return state + */ + public State getInitialState() { + expandSingleton(); + return initial; + } + + /** + * Returns deterministic flag for this automaton. + * + * @return true if the automaton is definitely deterministic, false if the + * automaton may be nondeterministic + */ + public boolean isDeterministic() { + return deterministic; + } + + /** + * Sets deterministic flag for this automaton. This method should (only) be + * used if automata are constructed manually. + * + * @param deterministic true if the automaton is definitely deterministic, + * false if the automaton may be nondeterministic + */ + public void setDeterministic(boolean deterministic) { + this.deterministic = deterministic; + } + + /** + * Associates extra information with this automaton. + * + * @param info extra information + */ + public void setInfo(Object info) { + this.info = info; + } + + /** + * Returns extra information associated with this automaton. + * + * @return extra information + * @see #setInfo(Object) + */ + public Object getInfo() { + return info; + } + + /** + * Returns the set of states that are reachable from the initial state. + * + * @return set of {@link State} objects + */ + public Set getStates() { + expandSingleton(); + Set visited; + if (isDebug()) visited = new LinkedHashSet(); + else visited = new HashSet(); + LinkedList worklist = new LinkedList(); + worklist.add(initial); + visited.add(initial); + while (worklist.size() > 0) { + State s = worklist.removeFirst(); + Collection tr; + if (isDebug()) tr = s.getSortedTransitions(false); + else tr = s.transitions; + for (Transition t : tr) + if (!visited.contains(t.to)) { + visited.add(t.to); + worklist.add(t.to); + } + } + return visited; + } + + /** + * Returns the set of reachable accept states. + * + * @return set of {@link State} objects + */ + public Set getAcceptStates() { + expandSingleton(); + HashSet accepts = new HashSet(); + HashSet visited = new HashSet(); + LinkedList worklist = new LinkedList(); + worklist.add(initial); + visited.add(initial); + while (worklist.size() > 0) { + State s = worklist.removeFirst(); + if (s.accept) accepts.add(s); + for (Transition t : s.transitions) + if (!visited.contains(t.to)) { + visited.add(t.to); + worklist.add(t.to); + } + } + return accepts; + } + + /** + * Assigns consecutive numbers to the given states. + */ + static void setStateNumbers(Set states) { + int number = 0; + for (State s : states) + s.number = number++; + } + + /** + * Adds transitions to explicit crash state to ensure that transition function + * is total. + */ + void totalize() { + State s = new State(); + s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s)); + for (State p : getStates()) { + int maxi = Character.MIN_VALUE; + for (Transition t : p.getSortedTransitions(false)) { + if (t.min > maxi) p.transitions.add(new Transition((char) maxi, + (char) (t.min - 1), s)); + if (t.max + 1 > maxi) maxi = t.max + 1; + } + if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition( + (char) maxi, Character.MAX_VALUE, s)); + } + } + + /** + * Restores representation invariant. This method must be invoked before any + * built-in automata operation is performed if automaton states or transitions + * are manipulated manually. 
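// Editor's note: illustrative sketch, not part of this patch. It shows the manual
// construction workflow the class javadoc describes; variable names are illustrative.
Automaton a = new Automaton();                 // starts out accepting the empty language
State accept = new State();
accept.setAccept(true);
// accept exactly one lowercase ASCII letter
a.getInitialState().addTransition(new Transition('a', 'z', accept));
a.restoreInvariant();                          // re-establish invariants after hand edits
a.setDeterministic(true);                      // no state has overlapping transitions here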
+ * + * @see #setDeterministic(boolean) + */ + public void restoreInvariant() { + removeDeadTransitions(); + } + + /** + * Reduces this automaton. An automaton is "reduced" by combining overlapping + * and adjacent edge intervals with same destination. + */ + public void reduce() { + if (isSingleton()) return; + Set states = getStates(); + setStateNumbers(states); + for (State s : states) { + List st = s.getSortedTransitions(true); + s.resetTransitions(); + State p = null; + int min = -1, max = -1; + for (Transition t : st) { + if (p == t.to) { + if (t.min <= max + 1) { + if (t.max > max) max = t.max; + } else { + if (p != null) s.transitions.add(new Transition((char) min, + (char) max, p)); + min = t.min; + max = t.max; + } + } else { + if (p != null) s.transitions.add(new Transition((char) min, + (char) max, p)); + p = t.to; + min = t.min; + max = t.max; + } + } + if (p != null) s.transitions + .add(new Transition((char) min, (char) max, p)); + } + } + + /** + * Returns sorted array of all interval start points. + */ + char[] getStartPoints() { + Set pointset = new HashSet(); + for (State s : getStates()) { + pointset.add(Character.MIN_VALUE); + for (Transition t : s.transitions) { + pointset.add(t.min); + if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1)); + } + } + char[] points = new char[pointset.size()]; + int n = 0; + for (Character m : pointset) + points[n++] = m; + Arrays.sort(points); + return points; + } + + /** + * Returns the set of live states. A state is "live" if an accept state is + * reachable from it. + * + * @return set of {@link State} objects + */ + public Set getLiveStates() { + expandSingleton(); + return getLiveStates(getStates()); + } + + private Set getLiveStates(Set states) { + HashMap> map = new HashMap>(); + for (State s : states) + map.put(s, new HashSet()); + for (State s : states) + for (Transition t : s.transitions) + map.get(t.to).add(s); + Set live = new HashSet(getAcceptStates()); + LinkedList worklist = new LinkedList(live); + while (worklist.size() > 0) { + State s = worklist.removeFirst(); + for (State p : map.get(s)) + if (!live.contains(p)) { + live.add(p); + worklist.add(p); + } + } + return live; + } + + /** + * Removes transitions to dead states and calls {@link #reduce()} and + * {@link #clearHashCode()}. (A state is "dead" if no accept state is + * reachable from it.) + */ + public void removeDeadTransitions() { + clearHashCode(); + if (isSingleton()) return; + Set states = getStates(); + Set live = getLiveStates(states); + for (State s : states) { + Set st = s.transitions; + s.resetTransitions(); + for (Transition t : st) + if (live.contains(t.to)) s.transitions.add(t); + } + reduce(); + } + + /** + * Returns a sorted array of transitions for each state (and sets state + * numbers). + */ + static Transition[][] getSortedTransitions(Set states) { + setStateNumbers(states); + Transition[][] transitions = new Transition[states.size()][]; + for (State s : states) + transitions[s.number] = s.getSortedTransitionArray(false); + return transitions; + } + + /** + * Expands singleton representation to normal representation. Does nothing if + * not in singleton representation. 
+ */ + public void expandSingleton() { + if (isSingleton()) { + State p = new State(); + initial = p; + for (int i = 0; i < singleton.length(); i++) { + State q = new State(); + p.transitions.add(new Transition(singleton.charAt(i), q)); + p = q; + } + p.accept = true; + deterministic = true; + singleton = null; + } + } + + /** + * Returns the number of states in this automaton. + */ + public int getNumberOfStates() { + if (isSingleton()) return singleton.length() + 1; + return getStates().size(); + } + + /** + * Returns the number of transitions in this automaton. This number is counted + * as the total number of edges, where one edge may be a character interval. + */ + public int getNumberOfTransitions() { + if (isSingleton()) return singleton.length(); + int c = 0; + for (State s : getStates()) + c += s.transitions.size(); + return c; + } + + /** + * Returns true if the language of this automaton is equal to the language of + * the given automaton. Implemented using hashCode and + * subsetOf. + */ + @Override + public boolean equals(Object obj) { + if (obj == this) return true; + if (!(obj instanceof Automaton)) return false; + Automaton a = (Automaton) obj; + if (isSingleton() && a.isSingleton()) return singleton.equals(a.singleton); + return hashCode() == a.hashCode() && BasicOperations.subsetOf(this, a) + && BasicOperations.subsetOf(a, this); + } + + /** + * Returns hash code for this automaton. The hash code is based on the number + * of states and transitions in the minimized automaton. Invoking this method + * may involve minimizing the automaton. + */ + @Override + public int hashCode() { + if (hash_code == 0) MinimizationOperations.minimize(this); + return hash_code; + } + + /** + * Must be invoked when the stored hash code may no longer be valid. + */ + void clearHashCode() { + hash_code = 0; + } + + /** + * Returns a string representation of this automaton. + */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + if (isSingleton()) { + b.append("singleton: "); + for (char c : singleton.toCharArray()) + Transition.appendCharString(c, b); + b.append("\n"); + } else { + Set states = getStates(); + setStateNumbers(states); + b.append("initial state: ").append(initial.number).append("\n"); + for (State s : states) + b.append(s.toString()); + } + return b.toString(); + } + + /** + * Returns Graphviz Dot representation of this automaton. + */ + public String toDot() { + StringBuilder b = new StringBuilder("digraph Automaton {\n"); + b.append(" rankdir = LR;\n"); + Set states = getStates(); + setStateNumbers(states); + for (State s : states) { + b.append(" ").append(s.number); + if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n"); + else b.append(" [shape=circle,label=\"\"];\n"); + if (s == initial) { + b.append(" initial [shape=plaintext,label=\"\"];\n"); + b.append(" initial -> ").append(s.number).append("\n"); + } + for (Transition t : s.transitions) { + b.append(" ").append(s.number); + t.appendDot(b); + } + } + return b.append("}\n").toString(); + } + + /** + * Returns a clone of this automaton, expands if singleton. + */ + Automaton cloneExpanded() { + Automaton a = clone(); + a.expandSingleton(); + return a; + } + + /** + * Returns a clone of this automaton unless allow_mutation is + * set, expands if singleton. + */ + Automaton cloneExpandedIfRequired() { + if (allow_mutation) { + expandSingleton(); + return this; + } else return cloneExpanded(); + } + + /** + * Returns a clone of this automaton. 
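// Editor's note: illustrative sketch, not part of this patch. equals() compares languages
// rather than state graphs: hashCode() minimizes both automata and containment is then
// checked in both directions with subsetOf().
Automaton range = BasicAutomata.makeCharRange('a', 'b');
Automaton union = BasicAutomata.makeChar('a').union(BasicAutomata.makeChar('b'));
boolean sameLanguage = range.equals(union);    // true, although the two graphs differ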
+ */ + @Override + public Automaton clone() { + try { + Automaton a = (Automaton) super.clone(); + if (!isSingleton()) { + HashMap m = new HashMap(); + Set states = getStates(); + for (State s : states) + m.put(s, new State()); + for (State s : states) { + State p = m.get(s); + p.accept = s.accept; + if (s == initial) a.initial = p; + for (Transition t : s.transitions) + p.transitions.add(new Transition(t.min, t.max, m.get(t.to))); + } + } + return a; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + /** + * Returns a clone of this automaton, or this automaton itself if + * allow_mutation flag is set. + */ + Automaton cloneIfRequired() { + if (allow_mutation) return this; + else return clone(); + } + + /** + * See {@link BasicOperations#concatenate(Automaton, Automaton)}. + */ + public Automaton concatenate(Automaton a) { + return BasicOperations.concatenate(this, a); + } + + /** + * See {@link BasicOperations#concatenate(List)}. + */ + static public Automaton concatenate(List l) { + return BasicOperations.concatenate(l); + } + + /** + * See {@link BasicOperations#optional(Automaton)}. + */ + public Automaton optional() { + return BasicOperations.optional(this); + } + + /** + * See {@link BasicOperations#repeat(Automaton)}. + */ + public Automaton repeat() { + return BasicOperations.repeat(this); + } + + /** + * See {@link BasicOperations#repeat(Automaton, int)}. + */ + public Automaton repeat(int min) { + return BasicOperations.repeat(this, min); + } + + /** + * See {@link BasicOperations#repeat(Automaton, int, int)}. + */ + public Automaton repeat(int min, int max) { + return BasicOperations.repeat(this, min, max); + } + + /** + * See {@link BasicOperations#complement(Automaton)}. + */ + public Automaton complement() { + return BasicOperations.complement(this); + } + + /** + * See {@link BasicOperations#minus(Automaton, Automaton)}. + */ + public Automaton minus(Automaton a) { + return BasicOperations.minus(this, a); + } + + /** + * See {@link BasicOperations#intersection(Automaton, Automaton)}. + */ + public Automaton intersection(Automaton a) { + return BasicOperations.intersection(this, a); + } + + /** + * See {@link BasicOperations#subsetOf(Automaton, Automaton)}. + */ + public boolean subsetOf(Automaton a) { + return BasicOperations.subsetOf(this, a); + } + + /** + * See {@link BasicOperations#union(Automaton, Automaton)}. + */ + public Automaton union(Automaton a) { + return BasicOperations.union(this, a); + } + + /** + * See {@link BasicOperations#union(Collection)}. + */ + static public Automaton union(Collection l) { + return BasicOperations.union(l); + } + + /** + * See {@link BasicOperations#determinize(Automaton)}. + */ + public void determinize() { + BasicOperations.determinize(this); + } + + /** + * See {@link BasicOperations#isEmptyString(Automaton)}. + */ + public boolean isEmptyString() { + return BasicOperations.isEmptyString(this); + } + + /** + * See {@link MinimizationOperations#minimize(Automaton)}. Returns the + * automaton being given as argument. 
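// Editor's note: illustrative sketch, not part of this patch. The delegating instance
// methods above let language-level composition read fluently; minimize(Automaton)
// minimizes in place and returns its argument.
Automaton ab  = BasicAutomata.makeString("ab");
Automaton cd  = BasicAutomata.makeString("cd");
Automaton any = ab.union(cd).repeat();            // ("ab"|"cd")* as an NFA
any.determinize();                                // subset construction, in place
Automaton min = Automaton.minimize(any);          // same instance, now minimal
boolean ok = BasicOperations.run(min, "abcdab");  // true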
+ */ + public static Automaton minimize(Automaton a) { + MinimizationOperations.minimize(a); + return a; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java b/lucene/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java new file mode 100644 index 00000000000..26812d0d53c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java @@ -0,0 +1,50 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; + +/** + * Automaton provider for RegExp. + * {@link RegExp#toAutomaton(AutomatonProvider)} + * + * @lucene.experimental + */ +public interface AutomatonProvider { + + /** + * Returns automaton of the given name. + * + * @param name automaton name + * @return automaton + * @throws IOException if errors occur + */ + public Automaton getAutomaton(String name) throws IOException; +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java b/lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java new file mode 100644 index 00000000000..082e45dca29 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java @@ -0,0 +1,242 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
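// Editor's note: illustrative sketch, not part of this patch. A minimal AutomatonProvider
// backed by a map, suitable for RegExp#toAutomaton(AutomatonProvider); the class name and
// the registered automaton are hypothetical.
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

class MapAutomatonProvider implements AutomatonProvider {
  private final Map<String, Automaton> named = new HashMap<String, Automaton>();

  MapAutomatonProvider() {
    named.put("digit", BasicAutomata.makeCharRange('0', '9'));
  }

  public Automaton getAutomaton(String name) throws IOException {
    Automaton a = named.get(name);
    if (a == null) throw new IOException("no automaton registered under: " + name);
    return a;
  }
}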
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.Collection; + +/** + * Construction of basic automata. + * + * @lucene.experimental + */ +final public class BasicAutomata { + + private BasicAutomata() {} + + /** + * Returns a new (deterministic) automaton with the empty language. + */ + public static Automaton makeEmpty() { + Automaton a = new Automaton(); + State s = new State(); + a.initial = s; + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts only the empty string. + */ + public static Automaton makeEmptyString() { + Automaton a = new Automaton(); + a.singleton = ""; + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts all strings. + */ + public static Automaton makeAnyString() { + Automaton a = new Automaton(); + State s = new State(); + a.initial = s; + s.accept = true; + s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s)); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts any single character. + */ + public static Automaton makeAnyChar() { + return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE); + } + + /** + * Returns a new (deterministic) automaton that accepts a single character of + * the given value. + */ + public static Automaton makeChar(char c) { + Automaton a = new Automaton(); + a.singleton = Character.toString(c); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts a single char whose + * value is in the given interval (including both end points). + */ + public static Automaton makeCharRange(char min, char max) { + if (min == max) return makeChar(min); + Automaton a = new Automaton(); + State s1 = new State(); + State s2 = new State(); + a.initial = s1; + s2.accept = true; + if (min <= max) s1.transitions.add(new Transition(min, max, s2)); + a.deterministic = true; + return a; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of length + * x.substring(n).length(). + */ + private static State anyOfRightLength(String x, int n) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else s.addTransition(new Transition('0', '9', anyOfRightLength(x, n + 1))); + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value at least + * x.substring(n) and length x.substring(n).length(). 
+ */ + private static State atLeast(String x, int n, Collection initials, + boolean zeros) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + if (zeros) initials.add(s); + char c = x.charAt(n); + s.addTransition(new Transition(c, atLeast(x, n + 1, initials, zeros + && c == '0'))); + if (c < '9') s.addTransition(new Transition((char) (c + 1), '9', + anyOfRightLength(x, n + 1))); + } + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value at most + * x.substring(n) and length x.substring(n).length(). + */ + private static State atMost(String x, int n) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + char c = x.charAt(n); + s.addTransition(new Transition(c, atMost(x, (char) n + 1))); + if (c > '0') s.addTransition(new Transition('0', (char) (c - 1), + anyOfRightLength(x, n + 1))); + } + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value between + * x.substring(n) and y.substring(n) and of length x.substring(n).length() + * (which must be equal to y.substring(n).length()). + */ + private static State between(String x, String y, int n, + Collection initials, boolean zeros) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + if (zeros) initials.add(s); + char cx = x.charAt(n); + char cy = y.charAt(n); + if (cx == cy) s.addTransition(new Transition(cx, between(x, y, n + 1, + initials, zeros && cx == '0'))); + else { // cx0, use fixed number of digits (strings must be prefixed + * by 0's to obtain the right length) - otherwise, the number of + * digits is not fixed + * @exception IllegalArgumentException if min>max or if numbers in the + * interval cannot be expressed with the given fixed number of + * digits + */ + public static Automaton makeInterval(int min, int max, int digits) + throws IllegalArgumentException { + Automaton a = new Automaton(); + String x = Integer.toString(min); + String y = Integer.toString(max); + if (min > max || (digits > 0 && y.length() > digits)) throw new IllegalArgumentException(); + int d; + if (digits > 0) d = digits; + else d = y.length(); + StringBuilder bx = new StringBuilder(); + for (int i = x.length(); i < d; i++) + bx.append('0'); + bx.append(x); + x = bx.toString(); + StringBuilder by = new StringBuilder(); + for (int i = y.length(); i < d; i++) + by.append('0'); + by.append(y); + y = by.toString(); + Collection initials = new ArrayList(); + a.initial = between(x, y, 0, initials, digits <= 0); + if (digits <= 0) { + ArrayList pairs = new ArrayList(); + for (State p : initials) + if (a.initial != p) pairs.add(new StatePair(a.initial, p)); + BasicOperations.addEpsilons(a, pairs); + a.initial.addTransition(new Transition('0', a.initial)); + a.deterministic = false; + } else a.deterministic = true; + a.checkMinimizeAlways(); + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts the single given + * string. + */ + public static Automaton makeString(String s) { + Automaton a = new Automaton(); + a.singleton = s; + a.deterministic = true; + return a; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java b/lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java new file mode 100644 index 00000000000..4eb23b6e0b9 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java @@ -0,0 +1,625 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. 
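// Editor's note: illustrative sketch, not part of this patch. Typical use of the factory
// methods in BasicAutomata; makeInterval builds its decimal-range automaton from the
// anyOfRightLength/atLeast/atMost/between helpers, and with digits == 0 the number of
// digits is not fixed (leading zeros are optional).
Automaton word  = BasicAutomata.makeString("lucene");      // singleton representation
Automaton lower = BasicAutomata.makeCharRange('a', 'z');   // any single char in [a-z]
Automaton octet = BasicAutomata.makeInterval(0, 255, 0);   // decimal values 0..255
boolean ok = BasicOperations.run(octet, "42");             // true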
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Basic automata operations. + * + * @lucene.experimental + */ +final public class BasicOperations { + + private BasicOperations() {} + + /** + * Returns an automaton that accepts the concatenation of the languages of the + * given automata. + *
    + * Complexity: linear in number of states. + */ + static public Automaton concatenate(Automaton a1, Automaton a2) { + if (a1.isSingleton() && a2.isSingleton()) return BasicAutomata + .makeString(a1.singleton + a2.singleton); + // adding epsilon transitions with the NFA concatenation algorithm + // in this case always produces a resulting DFA, preventing expensive + // redundant determinize() calls for this common case. + boolean deterministic = a1.isSingleton() && a2.isDeterministic(); + if (a1 == a2) { + a1 = a1.cloneExpanded(); + a2 = a2.cloneExpanded(); + } else { + a1 = a1.cloneExpandedIfRequired(); + a2 = a2.cloneExpandedIfRequired(); + } + for (State s : a1.getAcceptStates()) { + s.accept = false; + s.addEpsilon(a2.initial); + } + a1.deterministic = deterministic; + a1.clearHashCode(); + a1.checkMinimizeAlways(); + return a1; + } + + /** + * Returns an automaton that accepts the concatenation of the languages of the + * given automata. + *
    + * Complexity: linear in total number of states. + */ + static public Automaton concatenate(List l) { + if (l.isEmpty()) return BasicAutomata.makeEmptyString(); + boolean all_singleton = true; + for (Automaton a : l) + if (!a.isSingleton()) { + all_singleton = false; + break; + } + if (all_singleton) { + StringBuilder b = new StringBuilder(); + for (Automaton a : l) + b.append(a.singleton); + return BasicAutomata.makeString(b.toString()); + } else { + for (Automaton a : l) + if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty(); + Set ids = new HashSet(); + for (Automaton a : l) + ids.add(System.identityHashCode(a)); + boolean has_aliases = ids.size() != l.size(); + Automaton b = l.get(0); + if (has_aliases) b = b.cloneExpanded(); + else b = b.cloneExpandedIfRequired(); + Set ac = b.getAcceptStates(); + boolean first = true; + for (Automaton a : l) + if (first) first = false; + else { + if (a.isEmptyString()) continue; + Automaton aa = a; + if (has_aliases) aa = aa.cloneExpanded(); + else aa = aa.cloneExpandedIfRequired(); + Set ns = aa.getAcceptStates(); + for (State s : ac) { + s.accept = false; + s.addEpsilon(aa.initial); + if (s.accept) ns.add(s); + } + ac = ns; + } + b.deterministic = false; + b.clearHashCode(); + b.checkMinimizeAlways(); + return b; + } + } + + /** + * Returns an automaton that accepts the union of the empty string and the + * language of the given automaton. + *
    + * Complexity: linear in number of states. + */ + static public Automaton optional(Automaton a) { + a = a.cloneExpandedIfRequired(); + State s = new State(); + s.addEpsilon(a.initial); + s.accept = true; + a.initial = s; + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + return a; + } + + /** + * Returns an automaton that accepts the Kleene star (zero or more + * concatenated repetitions) of the language of the given automaton. Never + * modifies the input automaton language. + *
    + * Complexity: linear in number of states. + */ + static public Automaton repeat(Automaton a) { + a = a.cloneExpanded(); + State s = new State(); + s.accept = true; + s.addEpsilon(a.initial); + for (State p : a.getAcceptStates()) + p.addEpsilon(s); + a.initial = s; + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + return a; + } + + /** + * Returns an automaton that accepts min or more concatenated + * repetitions of the language of the given automaton. + *
    + * Complexity: linear in number of states and in min. + */ + static public Automaton repeat(Automaton a, int min) { + if (min == 0) return repeat(a); + List as = new ArrayList(); + while (min-- > 0) + as.add(a); + as.add(repeat(a)); + return concatenate(as); + } + + /** + * Returns an automaton that accepts between min and + * max (including both) concatenated repetitions of the language + * of the given automaton. + *
    + * Complexity: linear in number of states and in min and + * max. + */ + static public Automaton repeat(Automaton a, int min, int max) { + if (min > max) return BasicAutomata.makeEmpty(); + max -= min; + a.expandSingleton(); + Automaton b; + if (min == 0) b = BasicAutomata.makeEmptyString(); + else if (min == 1) b = a.clone(); + else { + List as = new ArrayList(); + while (min-- > 0) + as.add(a); + b = concatenate(as); + } + if (max > 0) { + Automaton d = a.clone(); + while (--max > 0) { + Automaton c = a.clone(); + for (State p : c.getAcceptStates()) + p.addEpsilon(d.initial); + d = c; + } + for (State p : b.getAcceptStates()) + p.addEpsilon(d.initial); + b.deterministic = false; + b.clearHashCode(); + b.checkMinimizeAlways(); + } + return b; + } + + /** + * Returns a (deterministic) automaton that accepts the complement of the + * language of the given automaton. + *
    + * Complexity: linear in number of states (if already deterministic). + */ + static public Automaton complement(Automaton a) { + a = a.cloneExpandedIfRequired(); + a.determinize(); + a.totalize(); + for (State p : a.getStates()) + p.accept = !p.accept; + a.removeDeadTransitions(); + return a; + } + + /** + * Returns a (deterministic) automaton that accepts the intersection of the + * language of a1 and the complement of the language of + * a2. As a side-effect, the automata may be determinized, if not + * already deterministic. + *
    + * Complexity: quadratic in number of states (if already deterministic). + */ + static public Automaton minus(Automaton a1, Automaton a2) { + if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata + .makeEmpty(); + if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired(); + if (a1.isSingleton()) { + if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty(); + else return a1.cloneIfRequired(); + } + return intersection(a1, a2.complement()); + } + + /** + * Returns an automaton that accepts the intersection of the languages of the + * given automata. Never modifies the input automata languages. + *
    + * Complexity: quadratic in number of states. + */ + static public Automaton intersection(Automaton a1, Automaton a2) { + if (a1.isSingleton()) { + if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired(); + else return BasicAutomata.makeEmpty(); + } + if (a2.isSingleton()) { + if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired(); + else return BasicAutomata.makeEmpty(); + } + if (a1 == a2) return a1.cloneIfRequired(); + Transition[][] transitions1 = Automaton + .getSortedTransitions(a1.getStates()); + Transition[][] transitions2 = Automaton + .getSortedTransitions(a2.getStates()); + Automaton c = new Automaton(); + LinkedList worklist = new LinkedList(); + HashMap newstates = new HashMap(); + StatePair p = new StatePair(c.initial, a1.initial, a2.initial); + worklist.add(p); + newstates.put(p, p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + p.s.accept = p.s1.accept && p.s2.accept; + Transition[] t1 = transitions1[p.s1.number]; + Transition[] t2 = transitions2[p.s2.number]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) + b2++; + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) + if (t2[n2].max >= t1[n1].min) { + StatePair q = new StatePair(t1[n1].to, t2[n2].to); + StatePair r = newstates.get(q); + if (r == null) { + q.s = new State(); + worklist.add(q); + newstates.put(q, q); + r = q; + } + char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; + char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; + p.s.transitions.add(new Transition(min, max, r.s)); + } + } + } + c.deterministic = a1.deterministic && a2.deterministic; + c.removeDeadTransitions(); + c.checkMinimizeAlways(); + return c; + } + + /** + * Returns true if the language of a1 is a subset of the language + * of a2. As a side-effect, a2 is determinized if + * not already marked as deterministic. + *
    + * Complexity: quadratic in number of states. + */ + public static boolean subsetOf(Automaton a1, Automaton a2) { + if (a1 == a2) return true; + if (a1.isSingleton()) { + if (a2.isSingleton()) return a1.singleton.equals(a2.singleton); + return BasicOperations.run(a2, a1.singleton); + } + a2.determinize(); + Transition[][] transitions1 = Automaton + .getSortedTransitions(a1.getStates()); + Transition[][] transitions2 = Automaton + .getSortedTransitions(a2.getStates()); + LinkedList worklist = new LinkedList(); + HashSet visited = new HashSet(); + StatePair p = new StatePair(a1.initial, a2.initial); + worklist.add(p); + visited.add(p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + if (p.s1.accept && !p.s2.accept) return false; + Transition[] t1 = transitions1[p.s1.number]; + Transition[] t2 = transitions2[p.s2.number]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) + b2++; + int min1 = t1[n1].min, max1 = t1[n1].max; + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { + if (t2[n2].min > min1) return false; + if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1; + else { + min1 = Character.MAX_VALUE; + max1 = Character.MIN_VALUE; + } + StatePair q = new StatePair(t1[n1].to, t2[n2].to); + if (!visited.contains(q)) { + worklist.add(q); + visited.add(q); + } + } + if (min1 <= max1) return false; + } + } + return true; + } + + /** + * Returns an automaton that accepts the union of the languages of the given + * automata. + *
    + * Complexity: linear in number of states. + */ + public static Automaton union(Automaton a1, Automaton a2) { + if ((a1.isSingleton() && a2.isSingleton() && a1.singleton + .equals(a2.singleton)) + || a1 == a2) return a1.cloneIfRequired(); + if (a1 == a2) { + a1 = a1.cloneExpanded(); + a2 = a2.cloneExpanded(); + } else { + a1 = a1.cloneExpandedIfRequired(); + a2 = a2.cloneExpandedIfRequired(); + } + State s = new State(); + s.addEpsilon(a1.initial); + s.addEpsilon(a2.initial); + a1.initial = s; + a1.deterministic = false; + a1.clearHashCode(); + a1.checkMinimizeAlways(); + return a1; + } + + /** + * Returns an automaton that accepts the union of the languages of the given + * automata. + *
    + * Complexity: linear in number of states. + */ + public static Automaton union(Collection l) { + Set ids = new HashSet(); + for (Automaton a : l) + ids.add(System.identityHashCode(a)); + boolean has_aliases = ids.size() != l.size(); + State s = new State(); + for (Automaton b : l) { + if (BasicOperations.isEmpty(b)) continue; + Automaton bb = b; + if (has_aliases) bb = bb.cloneExpanded(); + else bb = bb.cloneExpandedIfRequired(); + s.addEpsilon(bb.initial); + } + Automaton a = new Automaton(); + a.initial = s; + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + return a; + } + + /** + * Determinizes the given automaton. + *
    + * Complexity: exponential in number of states. + */ + public static void determinize(Automaton a) { + if (a.deterministic || a.isSingleton()) return; + Set initialset = new HashSet(); + initialset.add(a.initial); + determinize(a, initialset); + } + + /** + * Determinizes the given automaton using the given set of initial states. + */ + static void determinize(Automaton a, Set initialset) { + char[] points = a.getStartPoints(); + // subset construction + Map,Set> sets = new HashMap,Set>(); + LinkedList> worklist = new LinkedList>(); + Map,State> newstate = new HashMap,State>(); + sets.put(initialset, initialset); + worklist.add(initialset); + a.initial = new State(); + newstate.put(initialset, a.initial); + while (worklist.size() > 0) { + Set s = worklist.removeFirst(); + State r = newstate.get(s); + for (State q : s) + if (q.accept) { + r.accept = true; + break; + } + for (int n = 0; n < points.length; n++) { + Set p = new HashSet(); + for (State q : s) + for (Transition t : q.transitions) + if (t.min <= points[n] && points[n] <= t.max) p.add(t.to); + if (!sets.containsKey(p)) { + sets.put(p, p); + worklist.add(p); + newstate.put(p, new State()); + } + State q = newstate.get(p); + char min = points[n]; + char max; + if (n + 1 < points.length) max = (char) (points[n + 1] - 1); + else max = Character.MAX_VALUE; + r.transitions.add(new Transition(min, max, q)); + } + } + a.deterministic = true; + a.removeDeadTransitions(); + } + + /** + * Adds epsilon transitions to the given automaton. This method adds extra + * character interval transitions that are equivalent to the given set of + * epsilon transitions. + * + * @param pairs collection of {@link StatePair} objects representing pairs of + * source/destination states where epsilon transitions should be + * added + */ + public static void addEpsilons(Automaton a, Collection pairs) { + a.expandSingleton(); + HashMap> forward = new HashMap>(); + HashMap> back = new HashMap>(); + for (StatePair p : pairs) { + HashSet to = forward.get(p.s1); + if (to == null) { + to = new HashSet(); + forward.put(p.s1, to); + } + to.add(p.s2); + HashSet from = back.get(p.s2); + if (from == null) { + from = new HashSet(); + back.put(p.s2, from); + } + from.add(p.s1); + } + // calculate epsilon closure + LinkedList worklist = new LinkedList(pairs); + HashSet workset = new HashSet(pairs); + while (!worklist.isEmpty()) { + StatePair p = worklist.removeFirst(); + workset.remove(p); + HashSet to = forward.get(p.s2); + HashSet from = back.get(p.s1); + if (to != null) { + for (State s : to) { + StatePair pp = new StatePair(p.s1, s); + if (!pairs.contains(pp)) { + pairs.add(pp); + forward.get(p.s1).add(s); + back.get(s).add(p.s1); + worklist.add(pp); + workset.add(pp); + if (from != null) { + for (State q : from) { + StatePair qq = new StatePair(q, p.s1); + if (!workset.contains(qq)) { + worklist.add(qq); + workset.add(qq); + } + } + } + } + } + } + } + // add transitions + for (StatePair p : pairs) + p.s1.addEpsilon(p.s2); + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + } + + /** + * Returns true if the given automaton accepts the empty string and nothing + * else. + */ + public static boolean isEmptyString(Automaton a) { + if (a.isSingleton()) return a.singleton.length() == 0; + else return a.initial.accept && a.initial.transitions.isEmpty(); + } + + /** + * Returns true if the given automaton accepts no strings. 
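// Editor's note: illustrative sketch, not part of this patch (java.util imports assumed).
// Hand-added epsilon transitions, rewritten by addEpsilons() into ordinary character
// transitions, followed by determinization via the subset construction above.
Automaton a = BasicAutomata.makeString("ab");
List<StatePair> eps = new ArrayList<StatePair>();
for (State s : a.getAcceptStates())                // expands the singleton as a side effect
  eps.add(new StatePair(s, a.getInitialState()));  // epsilon: accept state -> initial state
BasicOperations.addEpsilons(a, eps);               // language is now ("ab")+
BasicOperations.determinize(a);                    // a is a DFA again
boolean ok = BasicOperations.run(a, "abab");       // true; the empty string is not accepted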
+ */ + public static boolean isEmpty(Automaton a) { + if (a.isSingleton()) return false; + return !a.initial.accept && a.initial.transitions.isEmpty(); + } + + /** + * Returns true if the given automaton accepts all strings. + */ + public static boolean isTotal(Automaton a) { + if (a.isSingleton()) return false; + if (a.initial.accept && a.initial.transitions.size() == 1) { + Transition t = a.initial.transitions.iterator().next(); + return t.to == a.initial && t.min == Character.MIN_VALUE + && t.max == Character.MAX_VALUE; + } + return false; + } + + /** + * Returns true if the given string is accepted by the automaton. + *
    + * Complexity: linear in the length of the string. + *
    + * Note: for full performance, use the {@link RunAutomaton} class. + */ + public static boolean run(Automaton a, String s) { + if (a.isSingleton()) return s.equals(a.singleton); + if (a.deterministic) { + State p = a.initial; + for (int i = 0; i < s.length(); i++) { + State q = p.step(s.charAt(i)); + if (q == null) return false; + p = q; + } + return p.accept; + } else { + Set states = a.getStates(); + Automaton.setStateNumbers(states); + LinkedList pp = new LinkedList(); + LinkedList pp_other = new LinkedList(); + BitSet bb = new BitSet(states.size()); + BitSet bb_other = new BitSet(states.size()); + pp.add(a.initial); + ArrayList dest = new ArrayList(); + boolean accept = a.initial.accept; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + accept = false; + pp_other.clear(); + bb_other.clear(); + for (State p : pp) { + dest.clear(); + p.step(c, dest); + for (State q : dest) { + if (q.accept) accept = true; + if (!bb_other.get(q.number)) { + bb_other.set(q.number); + pp_other.add(q); + } + } + } + LinkedList tp = pp; + pp = pp_other; + pp_other = tp; + BitSet tb = bb; + bb = bb_other; + bb_other = tb; + } + return accept; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java b/lucene/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java new file mode 100644 index 00000000000..cdb23550219 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java @@ -0,0 +1,117 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The following code was generated with the moman/finenight pkg +// This package is available under the MIT License, see NOTICE.txt +// for more details. 
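// Editor's note: illustrative sketch, not part of this patch. The predicates above in use:
// run() walks the DFA directly or, when the automaton is not marked deterministic,
// simulates it with the bit-set worklist; subsetOf() determinizes its second argument.
Automaton lower  = BasicAutomata.makeCharRange('a', 'z').repeat();  // [a-z]*
Automaton abStar = BasicAutomata.makeString("ab").repeat();         // (ab)*
boolean member   = BasicOperations.run(lower, "abba");              // true
boolean subset   = BasicOperations.subsetOf(abStar, lower);         // true
boolean total    = BasicOperations.isTotal(lower);                  // false: not every string is accepted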
+ +import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription; + +/** Parametric description for generating a Levenshtein automaton of degree 1 */ +class Lev1ParametricDescription extends ParametricDescription { + + @Override + int transition(int absState, int position, int vector) { + // null absState should never be passed in + assert absState != -1; + + // decode absState -> state, offset + int state = absState/(w+1); + int offset = absState%(w+1); + assert offset >= 0; + + if (position == w) { + if (state < 2) { + final int loc = vector * 2 + state; + offset += unpack(offsetIncrs0, loc, 1); + state = unpack(toStates0, loc, 2)-1; + } + } else if (position == w-1) { + if (state < 3) { + final int loc = vector * 3 + state; + offset += unpack(offsetIncrs1, loc, 1); + state = unpack(toStates1, loc, 2)-1; + } + } else if (position == w-2) { + if (state < 5) { + final int loc = vector * 5 + state; + offset += unpack(offsetIncrs2, loc, 2); + state = unpack(toStates2, loc, 3)-1; + } + } else { + if (state < 5) { + final int loc = vector * 5 + state; + offset += unpack(offsetIncrs3, loc, 2); + state = unpack(toStates3, loc, 3)-1; + } + } + + if (state == -1) { + // null state + return -1; + } else { + // translate back to abs + return state*(w+1)+offset; + } + } + + // 1 vectors; 2 states per vector; array length = 2 + private final static long[] toStates0 = new long[] /*2 bits per value */ { + 0x2L + }; + private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ { + 0x0L + }; + + // 2 vectors; 3 states per vector; array length = 6 + private final static long[] toStates1 = new long[] /*2 bits per value */ { + 0xa43L + }; + private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ { + 0x38L + }; + + // 4 vectors; 5 states per vector; array length = 20 + private final static long[] toStates2 = new long[] /*3 bits per value */ { + 0x4da292442420003L + }; + private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ { + 0x5555528000L + }; + + // 8 vectors; 5 states per vector; array length = 40 + private final static long[] toStates3 = new long[] /*3 bits per value */ { + 0x14d0812112018003L,0xb1a29b46d48a49L + }; + private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ { + 0x555555e80a0f0000L,0x5555L + }; + + // state map + // 0 -> [(0, 0)] + // 1 -> [(0, 1)] + // 2 -> [(0, 1), (1, 1)] + // 3 -> [(0, 1), (1, 1), (2, 1)] + // 4 -> [(0, 1), (2, 1)] + + + public Lev1ParametricDescription(int w) { + super(w, 1, new int[] {0,1,0,-1,-1}); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java b/lucene/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java new file mode 100644 index 00000000000..4bd52084246 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java @@ -0,0 +1,217 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The following code was generated with the moman/finenight pkg +// This package is available under the MIT License, see NOTICE.txt +// for more details. + +import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription; + +/** Parametric description for generating a Levenshtein automaton of degree 2 */ +class Lev2ParametricDescription extends ParametricDescription { + + @Override + int transition(int absState, int position, int vector) { + // null absState should never be passed in + assert absState != -1; + + // decode absState -> state, offset + int state = absState/(w+1); + int offset = absState%(w+1); + assert offset >= 0; + + if (position == w) { + if (state < 3) { + final int loc = vector * 3 + state; + offset += unpack(offsetIncrs0, loc, 1); + state = unpack(toStates0, loc, 2)-1; + } + } else if (position == w-1) { + if (state < 5) { + final int loc = vector * 5 + state; + offset += unpack(offsetIncrs1, loc, 1); + state = unpack(toStates1, loc, 3)-1; + } + } else if (position == w-2) { + if (state < 11) { + final int loc = vector * 11 + state; + offset += unpack(offsetIncrs2, loc, 2); + state = unpack(toStates2, loc, 4)-1; + } + } else if (position == w-3) { + if (state < 21) { + final int loc = vector * 21 + state; + offset += unpack(offsetIncrs3, loc, 2); + state = unpack(toStates3, loc, 5)-1; + } + } else if (position == w-4) { + if (state < 30) { + final int loc = vector * 30 + state; + offset += unpack(offsetIncrs4, loc, 3); + state = unpack(toStates4, loc, 5)-1; + } + } else { + if (state < 30) { + final int loc = vector * 30 + state; + offset += unpack(offsetIncrs5, loc, 3); + state = unpack(toStates5, loc, 5)-1; + } + } + + if (state == -1) { + // null state + return -1; + } else { + // translate back to abs + return state*(w+1)+offset; + } + } + + // 1 vectors; 3 states per vector; array length = 3 + private final static long[] toStates0 = new long[] /*2 bits per value */ { + 0x23L + }; + private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ { + 0x0L + }; + + // 2 vectors; 5 states per vector; array length = 10 + private final static long[] toStates1 = new long[] /*3 bits per value */ { + 0x1a68c105L + }; + private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ { + 0x3e0L + }; + + // 4 vectors; 11 states per vector; array length = 44 + private final static long[] toStates2 = new long[] /*4 bits per value */ { + 0x6280b80804280405L,0x2323432321608282L,0x523434543213L + }; + private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ { + 0x5555502220000800L,0x555555L + }; + + // 8 vectors; 21 states per vector; array length = 168 + private final static long[] toStates3 = new long[] /*5 bits per value */ { + 0x40300c0108801005L,0x80202a8208801000L,0x4021006280a0288dL,0x30482184802d8414L, + 0x5990240880010460L,0x191a28118330900L,0x310c413204c1104L,0x8625084811c4710dL, + 0xa92a398e2188231aL,0x104e351c4a508ca4L,0x21208511c8341483L,0xe6290620946a1910L, + 0xd47221423216a4a0L,0x28L + }; + private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ { + 
0x33300030c2000800L,0x32828088800c3cfL,0x5555550cace32320L,0x5555555555555555L, + 0x5555555555555555L,0x5555L + }; + + // 16 vectors; 30 states per vector; array length = 480 + private final static long[] toStates4 = new long[] /*5 bits per value */ { + 0x80300c0108801005L,0x88210802000L,0x44200401400000L,0x7ae3b88621185c07L, + 0x101500042100404L,0x20803140501446cL,0x40100420006c2122L,0x490140511b004054L, + 0x8401f2e3c086411L,0x120861200b100822L,0x641102400081180cL,0x4802c40100001088L, + 0x8c21195607048418L,0x1421014245bc3f2L,0x23450230661200b1L,0x2108664118240803L, + 0x8c1984802c802004L,0xbc3e28c41150d140L,0xc4120102209421dL,0x7884c11c4710d031L, + 0x210842109031bc62L,0xd21484360c431044L,0x9c265293a3a6e741L,0x1cc710c41109ce70L, + 0x1bce27a846525495L,0x3105425094a108c7L,0x6f735e95254731c4L,0x9ee7a9c234a9393aL, + 0x144720d0520c4150L,0x211051bc646084c2L,0x3614831048220842L,0x93a460e742351488L, + 0xc4120a2e70a24656L,0x284642d4941cc520L,0x4094a210c51bce46L,0xb525073148310502L, + 0x24356939460f7358L,0x4098e7aaL + }; + private final static long[] offsetIncrs4 = new long[] /*3 bits per value */ { + 0xc0602000010000L,0xa000040000000001L,0x248204041248L,0xb0180c06c3618618L, + 0x238d861860001861L,0x41040061c6e06041L,0x4004900c2402400L,0x409489001041001L, + 0x4184184004148124L,0x1041b4980c24c3L,0xd26040938d061061L,0x2492492492494146L, + 0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L, + 0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L, + 0x2492492492492492L,0x9249249249249249L,0x24924924L + }; + + // 32 vectors; 30 states per vector; array length = 960 + private final static long[] toStates5 = new long[] /*5 bits per value */ { + 0x80300c0108801005L,0x88210802000L,0x42200401400000L,0xa088201000300c03L, + 0x100510842108428L,0x2188461701c01108L,0x108401011eb8eeL,0x85c0700442004014L, + 0x88267ae3b886211L,0x1446c01015108842L,0xc212202080314050L,0x405440100420006L, + 0x10201c50140511b0L,0x942528423b08888L,0x240501446c010155L,0x21007cb8f0219045L, + 0x511b004054402088L,0x2e3c086411490140L,0x200b50904428823fL,0x400081180c120861L, + 0x100001088641102L,0x46030482184802c4L,0x9ce8990840980030L,0x21061200b709c210L, + 0xf0fca308465581c1L,0x802c405084050916L,0xc211956070484184L,0x9e4209ee65bc3f28L, + 0x3450230661200b70L,0x1086641182408032L,0xc1984802c8020042L,0x86098201c8d1408L, + 0xb88a22529ce399L,0x1045434502306612L,0x4088250876f0f8a3L,0xd1408c1984802c80L, + 0xee3dbc3e28c41150L,0xd0310c4188984429L,0xbc627884c11c4710L,0x1044210842109031L, + 0x21704711c4340c43L,0xbdef7bdf0c7a18b4L,0x85210d8310c41ef7L,0x994a4e8e9b9d074L, + 0x60c4310442739c27L,0x3a3a6e741d214843L,0x41ef77bdf77de529L,0x8465254951cc710cL, + 0x94a108c71bce27aL,0x5254731c43105425L,0xdb1c7a38b4a15949L,0xc710c41cf73dce7bL, + 0xe4e9bdcd7a54951cL,0x5427b9ea708d2a4L,0x735e95254731c431L,0xbd677db4a9393a6fL, + 0x4720d0520c41cf75L,0x1051bc646084c214L,0x1483104822084221L,0x193821708511c834L, + 0x1bf6fdef6f7f147aL,0xd08d45220d8520c4L,0x9c289195a4e91839L,0x488361483104828bL, + 0xe5693a460e742351L,0x520c41bf71bdf717L,0xe46284642d4941ccL,0x5024094a210c51bcL, + 0x590b525073148310L,0xce6f7b147a3938a1L,0x941cc520c41f77ddL,0xd5a4e5183dcd62d4L, + 0x48310502639ea890L,0x460f7358b5250731L,0xf779bd6717b56939L + }; + private final static long[] offsetIncrs5 = new long[] /*3 bits per value */ { + 0xc0602000010000L,0x8000040000000001L,0xb6db6d4030180L,0x810104922800010L, + 0x248a000040000092L,0x618000b649654041L,0x861b0180c06c3618L,0x301b0d861860001L, + 
0x61861800075d6ed6L,0x1871b8181048e3L,0xe56041238d861860L,0x40240041040075c6L, + 0x4100104004900c2L,0x55b5240309009001L,0x1025224004104005L,0x10410010520490L, + 0x55495240409489L,0x4980c24c34184184L,0x30d061061001041bL,0x184005556d260309L, + 0x51b4981024e34184L,0x40938d0610610010L,0x492492495546d260L,0x2492492492492492L, + 0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L, + 0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L, + 0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L, + 0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L, + 0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L, + 0x2492492492492492L + }; + + // state map + // 0 -> [(0, 0)] + // 1 -> [(0, 2)] + // 2 -> [(0, 1)] + // 3 -> [(0, 2), (1, 2)] + // 4 -> [(0, 1), (1, 1)] + // 5 -> [(0, 2), (2, 1)] + // 6 -> [(0, 1), (2, 2)] + // 7 -> [(0, 2), (1, 2), (2, 2)] + // 8 -> [(0, 1), (2, 1)] + // 9 -> [(0, 2), (2, 2)] + // 10 -> [(0, 1), (1, 1), (2, 1)] + // 11 -> [(0, 2), (1, 2), (2, 2), (3, 2)] + // 12 -> [(0, 2), (2, 1), (3, 1)] + // 13 -> [(0, 2), (3, 2)] + // 14 -> [(0, 2), (2, 2), (3, 2)] + // 15 -> [(0, 2), (1, 2), (3, 1)] + // 16 -> [(0, 2), (1, 2), (3, 2)] + // 17 -> [(0, 1), (2, 2), (3, 2)] + // 18 -> [(0, 2), (3, 1)] + // 19 -> [(0, 1), (3, 2)] + // 20 -> [(0, 1), (1, 1), (3, 2)] + // 21 -> [(0, 2), (2, 1), (4, 2)] + // 22 -> [(0, 2), (1, 2), (4, 2)] + // 23 -> [(0, 2), (1, 2), (3, 2), (4, 2)] + // 24 -> [(0, 2), (2, 2), (4, 2)] + // 25 -> [(0, 2), (2, 2), (3, 2), (4, 2)] + // 26 -> [(0, 2), (3, 2), (4, 2)] + // 27 -> [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)] + // 28 -> [(0, 2), (4, 2)] + // 29 -> [(0, 2), (1, 2), (2, 2), (4, 2)] + + + public Lev2ParametricDescription(int w) { + super(w, 2, new int[] {0,2,1,1,0,-1,0,0,-1,0,-1,-1,-2,-1,-1,-2,-1,-1,-2,-1,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2}); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java new file mode 100644 index 00000000000..49cf7c81a17 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java @@ -0,0 +1,258 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Iterator; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +/** + * Class to construct DFAs that match a word within some edit distance. + *
    + * Implements the algorithm described in: + * Schulz and Mihov: Fast String Correction with Levenshtein Automata + *

    + * @lucene.experimental + */ +public class LevenshteinAutomata { + /** @lucene.internal */ + public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; + /* input word */ + final String input; + final char word[]; + /* the automata alphabet. */ + final char alphabet[]; + + /* the unicode ranges outside of alphabet */ + final char rangeLower[]; + final char rangeUpper[]; + int numRanges = 0; + + ParametricDescription descriptions[]; + + /** + * Create a new LevenshteinAutomata for some input String. + */ + public LevenshteinAutomata(String input) { + this.input = input; + this.word = input.toCharArray(); + + // calculate the alphabet + SortedSet set = new TreeSet(); + for (int i = 0; i < word.length; i++) + set.add(word[i]); + alphabet = new char[set.size()]; + Iterator iterator = set.iterator(); + for (int i = 0; i < alphabet.length; i++) + alphabet[i] = iterator.next(); + + rangeLower = new char[alphabet.length + 2]; + rangeUpper = new char[alphabet.length + 2]; + // calculate the unicode range intervals that exclude the alphabet + // these are the ranges for all unicode characters not in the alphabet + int lower = 0; + for (int i = 0; i < alphabet.length; i++) { + char higher = alphabet[i]; + if (higher > lower) { + rangeLower[numRanges] = (char) lower; + rangeUpper[numRanges] = (char) (higher - 1); + numRanges++; + } + lower = higher + 1; + } + /* add the final endpoint */ + if (lower <= 0xFFFF) { + rangeLower[numRanges] = (char) lower; + rangeUpper[numRanges] = '\uFFFF'; + numRanges++; + } + + descriptions = new ParametricDescription[] { + null, /* for n=0, we do not need to go through the trouble */ + new Lev1ParametricDescription(input.length()), + new Lev2ParametricDescription(input.length()), + }; + } + + /** + * Compute a DFA that accepts all strings within an edit distance of n. + *
+ *
+ * All automata have the following properties:
+ *
+ *   - They are deterministic (DFA).
+ *   - There are no transitions to dead states.
+ *   - They are not minimal (some transitions could be combined).
+ *
+ * (A brief usage sketch follows below.)
+ *
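Editor's illustration, not part of the original patch: a minimal usage sketch of the API above. The word "fuzzy" and the probe strings are arbitrary examples, and RunAutomaton is the matcher wrapper added later in this same patch.

    // build a DFA accepting every string within edit distance 2 of "fuzzy"
    LevenshteinAutomata builder = new LevenshteinAutomata("fuzzy");
    Automaton a = builder.toAutomaton(2);        // returns null if n > MAXIMUM_SUPPORTED_DISTANCE
    RunAutomaton matcher = new RunAutomaton(a);  // compile the DFA for fast matching
    matcher.run("fuzzy");    // true  (distance 0)
    matcher.run("fuzy");     // true  (one deletion, distance 1)
    matcher.run("buzzing");  // false (distance greater than 2)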
    + */ + public Automaton toAutomaton(int n) { + if (n == 0) + return BasicAutomata.makeString(input); + + if (n >= descriptions.length) + return null; + + final int range = 2*n+1; + ParametricDescription description = descriptions[n]; + // the number of states is based on the length of the word and n + State states[] = new State[description.size()]; + // create all states, and mark as accept states if appropriate + for (int i = 0; i < states.length; i++) { + states[i] = new State(); + states[i].setAccept(description.isAccept(i)); + } + // create transitions from state to state + for (int k = 0; k < states.length; k++) { + final int xpos = description.getPosition(k); + if (xpos < 0) + continue; + final int end = xpos + Math.min(word.length - xpos, range); + + for (int x = 0; x < alphabet.length; x++) { + final char ch = alphabet[x]; + // get the characteristic vector at this position wrt ch + final int cvec = getVector(ch, xpos, end); + int dest = description.transition(k, xpos, cvec); + if (dest >= 0) + states[k].addTransition(new Transition(ch, states[dest])); + } + // add transitions for all other chars in unicode + // by definition, their characteristic vectors are always 0, + // because they do not exist in the input string. + int dest = description.transition(k, xpos, 0); // by definition + if (dest >= 0) + for (int r = 0; r < numRanges; r++) + states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest])); + } + + Automaton a = new Automaton(); + a.setInitialState(states[0]); + a.setDeterministic(true); + // we need not trim transitions to dead states, as they are not created. + // a.restoreInvariant(); + return a; + } + + /** + * Get the characteristic vector X(x, V) + * where V is substring(pos, end) + */ + int getVector(char x, int pos, int end) { + int vector = 0; + for (int i = pos; i < end; i++) { + vector <<= 1; + if (word[i] == x) + vector |= 1; + } + return vector; + } + + /** + * A ParametricDescription describes the structure of a Levenshtein DFA for some degree n. + *
+ *
+ * There are four components of a parametric description, all parameterized on the
+ * length of the word w:
+ *
+ *   1. The number of states: {@link #size()}
+ *   2. The set of final states: {@link #isAccept(int)}
+ *   3. The transition function: {@link #transition(int, int, int)}
+ *   4. Minimal boundary function: {@link #getPosition(int)}
+ *
+ * (A small decoding example follows below.)
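For illustration only (not from the patch): the absolute state numbering used by the methods below packs a parametric state together with a position offset, so decoding is plain integer arithmetic. The concrete numbers here are made up for a word of length w = 5.

    // hypothetical decoding of absolute state 13 when w = 5
    int w = 5;
    int absState = 13;
    int state  = absState / (w + 1);   // 13 / 6 = 2  -> parametric state #2
    int offset = absState % (w + 1);   // 13 % 6 = 1  -> getPosition() returns 1
    // isAccept(absState) then checks: w - offset + minErrors[state] <= n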
    + */ + static abstract class ParametricDescription { + protected final int w; + protected final int n; + private final int[] minErrors; + + ParametricDescription(int w, int n, int[] minErrors) { + this.w = w; + this.n = n; + this.minErrors = minErrors; + } + + /** + * Return the number of states needed to compute a Levenshtein DFA + */ + int size() { + return minErrors.length * (w+1); + }; + + /** + * Returns true if the state in any Levenshtein DFA is an accept state (final state). + */ + boolean isAccept(int absState) { + // decode absState -> state, offset + int state = absState/(w+1); + int offset = absState%(w+1); + assert offset >= 0; + return w - offset + minErrors[state] <= n; + } + + /** + * Returns the position in the input word for a given state. + * This is the minimal boundary for the state. + */ + int getPosition(int absState) { + return absState % (w+1); + } + + /** + * Returns the state number for a transition from the given state, + * assuming position and characteristic vector vector + */ + abstract int transition(int state, int position, int vector); + + private final static long[] MASKS = new long[] {0x1,0x3,0x7,0xf, + 0x1f,0x3f,0x7f,0xff, + 0x1ff,0x3ff,0x7ff,0xfff, + 0x1fff,0x3fff,0x7fff,0xffff, + 0x1ffff,0x3ffff,0x7ffff,0xfffff, + 0x1fffff,0x3fffff,0x7fffff,0xffffff, + 0x1ffffff,0x3ffffff,0x7ffffff,0xfffffff, + 0x1fffffff,0x3fffffff,0x7fffffffL,0xffffffffL, + 0x1ffffffffL,0x3ffffffffL,0x7ffffffffL,0xfffffffffL, + 0x1fffffffffL,0x3fffffffffL,0x7fffffffffL,0xffffffffffL, + 0x1ffffffffffL,0x3ffffffffffL,0x7ffffffffffL,0xfffffffffffL, + 0x1fffffffffffL,0x3fffffffffffL,0x7fffffffffffL,0xffffffffffffL, + 0x1ffffffffffffL,0x3ffffffffffffL,0x7ffffffffffffL,0xfffffffffffffL, + 0x1fffffffffffffL,0x3fffffffffffffL,0x7fffffffffffffL,0xffffffffffffffL, + 0x1ffffffffffffffL,0x3ffffffffffffffL,0x7ffffffffffffffL,0xfffffffffffffffL, + 0x1fffffffffffffffL,0x3fffffffffffffffL,0x7fffffffffffffffL}; + + protected int unpack(long[] data, int index, int bitsPerValue) { + final long bitLoc = bitsPerValue * index; + final int dataLoc = (int) (bitLoc >> 6); + final int bitStart = (int) (bitLoc & 63); + //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue); + if (bitStart + bitsPerValue <= 64) { + // not split + return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]); + } else { + // split + final int part = 64-bitStart; + return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) + + ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part)); + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java b/lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java new file mode 100644 index 00000000000..7f25f5566c3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java @@ -0,0 +1,275 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.Set; + +/** + * Operations for minimizing automata. + * + * @lucene.experimental + */ +final public class MinimizationOperations { + + private MinimizationOperations() {} + + /** + * Minimizes (and determinizes if not already deterministic) the given + * automaton. + * + * @see Automaton#setMinimization(int) + */ + public static void minimize(Automaton a) { + if (!a.isSingleton()) { + minimizeHopcroft(a); + } + // recompute hash code + a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2; + if (a.hash_code == 0) a.hash_code = 1; + } + + private static void initialize(ArrayList list, int size) { + for (int i = 0; i < size; i++) + list.add(null); + } + + /** + * Minimizes the given automaton using Hopcroft's algorithm. + */ + public static void minimizeHopcroft(Automaton a) { + a.determinize(); + Set tr = a.initial.getTransitions(); + if (tr.size() == 1) { + Transition t = tr.iterator().next(); + if (t.to == a.initial && t.min == Character.MIN_VALUE + && t.max == Character.MAX_VALUE) return; + } + a.totalize(); + // make arrays for numbered states and effective alphabet + Set ss = a.getStates(); + State[] states = new State[ss.size()]; + int number = 0; + for (State q : ss) { + states[number] = q; + q.number = number++; + } + char[] sigma = a.getStartPoints(); + // initialize data structures + ArrayList>> reverse = new ArrayList>>(); + for (int q = 0; q < states.length; q++) { + ArrayList> v = new ArrayList>(); + initialize(v, sigma.length); + reverse.add(v); + } + boolean[][] reverse_nonempty = new boolean[states.length][sigma.length]; + ArrayList> partition = new ArrayList>(); + initialize(partition, states.length); + int[] block = new int[states.length]; + StateList[][] active = new StateList[states.length][sigma.length]; + StateListNode[][] active2 = new StateListNode[states.length][sigma.length]; + LinkedList pending = new LinkedList(); + boolean[][] pending2 = new boolean[sigma.length][states.length]; + ArrayList split = new ArrayList(); + boolean[] split2 = new boolean[states.length]; + ArrayList refine = new ArrayList(); + boolean[] refine2 = new boolean[states.length]; + ArrayList> splitblock = new ArrayList>(); + initialize(splitblock, states.length); + for (int q = 0; q < states.length; q++) { + splitblock.set(q, new ArrayList()); + partition.set(q, new LinkedList()); + for (int x = 0; x < sigma.length; x++) { + reverse.get(q).set(x, new LinkedList()); + active[q][x] = new StateList(); + } + } + // find initial partition and reverse edges + for (int q = 0; q < 
states.length; q++) { + State qq = states[q]; + int j; + if (qq.accept) j = 0; + else j = 1; + partition.get(j).add(qq); + block[qq.number] = j; + for (int x = 0; x < sigma.length; x++) { + char y = sigma[x]; + State p = qq.step(y); + reverse.get(p.number).get(x).add(qq); + reverse_nonempty[p.number][x] = true; + } + } + // initialize active sets + for (int j = 0; j <= 1; j++) + for (int x = 0; x < sigma.length; x++) + for (State qq : partition.get(j)) + if (reverse_nonempty[qq.number][x]) active2[qq.number][x] = active[j][x] + .add(qq); + // initialize pending + for (int x = 0; x < sigma.length; x++) { + int a0 = active[0][x].size; + int a1 = active[1][x].size; + int j; + if (a0 <= a1) j = 0; + else j = 1; + pending.add(new IntPair(j, x)); + pending2[x][j] = true; + } + // process pending until fixed point + int k = 2; + while (!pending.isEmpty()) { + IntPair ip = pending.removeFirst(); + int p = ip.n1; + int x = ip.n2; + pending2[x][p] = false; + // find states that need to be split off their blocks + for (StateListNode m = active[p][x].first; m != null; m = m.next) + for (State s : reverse.get(m.q.number).get(x)) + if (!split2[s.number]) { + split2[s.number] = true; + split.add(s); + int j = block[s.number]; + splitblock.get(j).add(s); + if (!refine2[j]) { + refine2[j] = true; + refine.add(j); + } + } + // refine blocks + for (int j : refine) { + if (splitblock.get(j).size() < partition.get(j).size()) { + LinkedList b1 = partition.get(j); + LinkedList b2 = partition.get(k); + for (State s : splitblock.get(j)) { + b1.remove(s); + b2.add(s); + block[s.number] = k; + for (int c = 0; c < sigma.length; c++) { + StateListNode sn = active2[s.number][c]; + if (sn != null && sn.sl == active[j][c]) { + sn.remove(); + active2[s.number][c] = active[k][c].add(s); + } + } + } + // update pending + for (int c = 0; c < sigma.length; c++) { + int aj = active[j][c].size; + int ak = active[k][c].size; + if (!pending2[c][j] && 0 < aj && aj <= ak) { + pending2[c][j] = true; + pending.add(new IntPair(j, c)); + } else { + pending2[c][k] = true; + pending.add(new IntPair(k, c)); + } + } + k++; + } + for (State s : splitblock.get(j)) + split2[s.number] = false; + refine2[j] = false; + splitblock.get(j).clear(); + } + split.clear(); + refine.clear(); + } + // make a new state for each equivalence class, set initial state + State[] newstates = new State[k]; + for (int n = 0; n < newstates.length; n++) { + State s = new State(); + newstates[n] = s; + for (State q : partition.get(n)) { + if (q == a.initial) a.initial = s; + s.accept = q.accept; + s.number = q.number; // select representative + q.number = n; + } + } + // build transitions and set acceptance + for (int n = 0; n < newstates.length; n++) { + State s = newstates[n]; + s.accept = states[s.number].accept; + for (Transition t : states[s.number].transitions) + s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number])); + } + a.removeDeadTransitions(); + } + + static class IntPair { + + int n1, n2; + + IntPair(int n1, int n2) { + this.n1 = n1; + this.n2 = n2; + } + } + + static class StateList { + + int size; + + StateListNode first, last; + + StateListNode add(State q) { + return new StateListNode(q, this); + } + } + + static class StateListNode { + + State q; + + StateListNode next, prev; + + StateList sl; + + StateListNode(State q, StateList sl) { + this.q = q; + this.sl = sl; + if (sl.size++ == 0) sl.first = sl.last = this; + else { + sl.last.next = this; + prev = sl.last; + sl.last = this; + } + } + + void remove() { + sl.size--; + if 
(sl.first == this) sl.first = next; + else prev.next = next; + if (sl.last == this) sl.last = prev; + else next.prev = prev; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/src/java/org/apache/lucene/util/automaton/RegExp.java new file mode 100644 index 00000000000..75f30f810d7 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -0,0 +1,1000 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Regular Expression extension to Automaton. + *
+ *
+ * Regular expressions are built from the following abstract syntax:
+ *
+ *   regexp        ::=  unionexp
+ *                  |
+ *   unionexp      ::=  interexp | unionexp                (union)
+ *                  |   interexp
+ *   interexp      ::=  concatexp & interexp               (intersection)   [OPTIONAL]
+ *                  |   concatexp
+ *   concatexp     ::=  repeatexp concatexp                (concatenation)
+ *                  |   repeatexp
+ *   repeatexp     ::=  repeatexp ?                        (zero or one occurrence)
+ *                  |   repeatexp *                        (zero or more occurrences)
+ *                  |   repeatexp +                        (one or more occurrences)
+ *                  |   repeatexp {n}                      (n occurrences)
+ *                  |   repeatexp {n,}                     (n or more occurrences)
+ *                  |   repeatexp {n,m}                    (n to m occurrences, including both)
+ *                  |   complexp
+ *   complexp      ::=  ~ complexp                         (complement)   [OPTIONAL]
+ *                  |   charclassexp
+ *   charclassexp  ::=  [ charclasses ]                    (character class)
+ *                  |   [^ charclasses ]                   (negated character class)
+ *                  |   simpleexp
+ *   charclasses   ::=  charclass charclasses
+ *                  |   charclass
+ *   charclass     ::=  charexp - charexp                  (character range, including end-points)
+ *                  |   charexp
+ *   simpleexp     ::=  charexp
+ *                  |   .                                  (any single character)
+ *                  |   #                                  (the empty language)   [OPTIONAL]
+ *                  |   @                                  (any string)   [OPTIONAL]
+ *                  |   " <Unicode string without double-quotes> "   (a string)
+ *                  |   ( )                                (the empty string)
+ *                  |   ( unionexp )                       (precedence override)
+ *                  |   < <identifier> >                   (named automaton)   [OPTIONAL]
+ *                  |   <n-m>                              (numerical interval)   [OPTIONAL]
+ *   charexp       ::=  <Unicode character>                (a single non-reserved character)
+ *                  |   \ <Unicode character>              (a single character)
+ *
    + * The productions marked [OPTIONAL] are only allowed if + * specified by the syntax flags passed to the RegExp constructor. + * The reserved characters used in the (enabled) syntax must be escaped with + * backslash (\) or double-quotes ("..."). (In + * contrast to other regexp syntaxes, this is required also in character + * classes.) Be aware that dash (-) has a special meaning in + * charclass expressions. An identifier is a string not containing right + * angle bracket (>) or dash (-). Numerical + * intervals are specified by non-negative decimal integers and include both end + * points, and if n and m have the same number + * of digits, then the conforming strings must have that length (i.e. prefixed + * by 0's). + * + * @lucene.experimental + */ +public class RegExp { + + enum Kind { + REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL + } + + /** + * Syntax flag, enables intersection (&). + */ + public static final int INTERSECTION = 0x0001; + + /** + * Syntax flag, enables complement (~). + */ + public static final int COMPLEMENT = 0x0002; + + /** + * Syntax flag, enables empty language (#). + */ + public static final int EMPTY = 0x0004; + + /** + * Syntax flag, enables anystring (@). + */ + public static final int ANYSTRING = 0x0008; + + /** + * Syntax flag, enables named automata (<identifier>). + */ + public static final int AUTOMATON = 0x0010; + + /** + * Syntax flag, enables numerical intervals ( + * <n-m>). + */ + public static final int INTERVAL = 0x0020; + + /** + * Syntax flag, enables all optional regexp syntax. + */ + public static final int ALL = 0xffff; + + /** + * Syntax flag, enables no optional regexp syntax. + */ + public static final int NONE = 0x0000; + + private static boolean allow_mutation = false; + + Kind kind; + RegExp exp1, exp2; + String s; + char c; + int min, max, digits; + char from, to; + + String b; + int flags; + int pos; + + RegExp() {} + + /** + * Constructs new RegExp from a string. Same as + * RegExp(s, ALL). + * + * @param s regexp string + * @exception IllegalArgumentException if an error occured while parsing the + * regular expression + */ + public RegExp(String s) throws IllegalArgumentException { + this(s, ALL); + } + + /** + * Constructs new RegExp from a string. + * + * @param s regexp string + * @param syntax_flags boolean 'or' of optional syntax constructs to be + * enabled + * @exception IllegalArgumentException if an error occured while parsing the + * regular expression + */ + public RegExp(String s, int syntax_flags) throws IllegalArgumentException { + b = s; + flags = syntax_flags; + RegExp e; + if (s.length() == 0) e = makeString(""); + else { + e = parseUnionExp(); + if (pos < b.length()) throw new IllegalArgumentException( + "end-of-string expected at position " + pos); + } + kind = e.kind; + exp1 = e.exp1; + exp2 = e.exp2; + this.s = e.s; + c = e.c; + min = e.min; + max = e.max; + digits = e.digits; + from = e.from; + to = e.to; + b = null; + } + + /** + * Constructs new Automaton from this RegExp. Same + * as toAutomaton(null) (empty automaton map). + */ + public Automaton toAutomaton() { + return toAutomatonAllowMutate(null, null); + } + + /** + * Constructs new Automaton from this RegExp. The + * constructed automaton is minimal and deterministic and has no transitions + * to dead states. 
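A small usage sketch (editor's addition, not in the patch); the pattern and probe strings are arbitrary examples, and RunAutomaton is the matcher class added later in this patch.

    // parse a pattern, compile it to a minimal DFA, and match a few strings
    RegExp re = new RegExp("ab(c|d)*");          // ALL optional syntax enabled by default
    Automaton a = re.toAutomaton();
    RunAutomaton matcher = new RunAutomaton(a);
    matcher.run("abcdcd");   // true
    matcher.run("abx");      // false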
+ * + * @param automaton_provider provider of automata for named identifiers + * @exception IllegalArgumentException if this regular expression uses a named + * identifier that is not available from the automaton provider + */ + public Automaton toAutomaton(AutomatonProvider automaton_provider) + throws IllegalArgumentException { + return toAutomatonAllowMutate(null, automaton_provider); + } + + /** + * Constructs new Automaton from this RegExp. The + * constructed automaton is minimal and deterministic and has no transitions + * to dead states. + * + * @param automata a map from automaton identifiers to automata (of type + * Automaton). + * @exception IllegalArgumentException if this regular expression uses a named + * identifier that does not occur in the automaton map + */ + public Automaton toAutomaton(Map automata) + throws IllegalArgumentException { + return toAutomatonAllowMutate(automata, null); + } + + /** + * Sets or resets allow mutate flag. If this flag is set, then automata + * construction uses mutable automata, which is slightly faster but not thread + * safe. By default, the flag is not set. + * + * @param flag if true, the flag is set + * @return previous value of the flag + */ + public boolean setAllowMutate(boolean flag) { + boolean b = allow_mutation; + allow_mutation = flag; + return b; + } + + private Automaton toAutomatonAllowMutate(Map automata, + AutomatonProvider automaton_provider) throws IllegalArgumentException { + boolean b = false; + if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe + Automaton a = toAutomaton(automata, automaton_provider); + if (allow_mutation) Automaton.setAllowMutate(b); + return a; + } + + private Automaton toAutomaton(Map automata, + AutomatonProvider automaton_provider) throws IllegalArgumentException { + List list; + Automaton a = null; + switch (kind) { + case REGEXP_UNION: + list = new ArrayList(); + findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider); + findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider); + a = BasicOperations.union(list); + MinimizationOperations.minimize(a); + break; + case REGEXP_CONCATENATION: + list = new ArrayList(); + findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider); + findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider); + a = BasicOperations.concatenate(list); + MinimizationOperations.minimize(a); + break; + case REGEXP_INTERSECTION: + a = exp1.toAutomaton(automata, automaton_provider).intersection( + exp2.toAutomaton(automata, automaton_provider)); + MinimizationOperations.minimize(a); + break; + case REGEXP_OPTIONAL: + a = exp1.toAutomaton(automata, automaton_provider).optional(); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT: + a = exp1.toAutomaton(automata, automaton_provider).repeat(); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT_MIN: + a = exp1.toAutomaton(automata, automaton_provider).repeat(min); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT_MINMAX: + a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max); + MinimizationOperations.minimize(a); + break; + case REGEXP_COMPLEMENT: + a = exp1.toAutomaton(automata, automaton_provider).complement(); + MinimizationOperations.minimize(a); + break; + case REGEXP_CHAR: + a = BasicAutomata.makeChar(c); + break; + case REGEXP_CHAR_RANGE: + a = BasicAutomata.makeCharRange(from, to); + break; + case REGEXP_ANYCHAR: + a = BasicAutomata.makeAnyChar(); + break; + 
case REGEXP_EMPTY: + a = BasicAutomata.makeEmpty(); + break; + case REGEXP_STRING: + a = BasicAutomata.makeString(s); + break; + case REGEXP_ANYSTRING: + a = BasicAutomata.makeAnyString(); + break; + case REGEXP_AUTOMATON: + Automaton aa = null; + if (automata != null) aa = automata.get(s); + if (aa == null && automaton_provider != null) try { + aa = automaton_provider.getAutomaton(s); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + if (aa == null) throw new IllegalArgumentException("'" + s + + "' not found"); + a = aa.clone(); // always clone here (ignore allow_mutate) + break; + case REGEXP_INTERVAL: + a = BasicAutomata.makeInterval(min, max, digits); + break; + } + return a; + } + + private void findLeaves(RegExp exp, Kind kind, List list, + Map automata, AutomatonProvider automaton_provider) { + if (exp.kind == kind) { + findLeaves(exp.exp1, kind, list, automata, automaton_provider); + findLeaves(exp.exp2, kind, list, automata, automaton_provider); + } else list.add(exp.toAutomaton(automata, automaton_provider)); + } + + /** + * Constructs string from parsed regular expression. + */ + @Override + public String toString() { + return toStringBuilder(new StringBuilder()).toString(); + } + + StringBuilder toStringBuilder(StringBuilder b) { + switch (kind) { + case REGEXP_UNION: + b.append("("); + exp1.toStringBuilder(b); + b.append("|"); + exp2.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_CONCATENATION: + exp1.toStringBuilder(b); + exp2.toStringBuilder(b); + break; + case REGEXP_INTERSECTION: + b.append("("); + exp1.toStringBuilder(b); + b.append("&"); + exp2.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_OPTIONAL: + b.append("("); + exp1.toStringBuilder(b); + b.append(")?"); + break; + case REGEXP_REPEAT: + b.append("("); + exp1.toStringBuilder(b); + b.append(")*"); + break; + case REGEXP_REPEAT_MIN: + b.append("("); + exp1.toStringBuilder(b); + b.append("){").append(min).append(",}"); + break; + case REGEXP_REPEAT_MINMAX: + b.append("("); + exp1.toStringBuilder(b); + b.append("){").append(min).append(",").append(max).append("}"); + break; + case REGEXP_COMPLEMENT: + b.append("~("); + exp1.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_CHAR: + b.append("\\").append(c); + break; + case REGEXP_CHAR_RANGE: + b.append("[\\").append(from).append("-\\").append(to).append("]"); + break; + case REGEXP_ANYCHAR: + b.append("."); + break; + case REGEXP_EMPTY: + b.append("#"); + break; + case REGEXP_STRING: + b.append("\"").append(s).append("\""); + break; + case REGEXP_ANYSTRING: + b.append("@"); + break; + case REGEXP_AUTOMATON: + b.append("<").append(s).append(">"); + break; + case REGEXP_INTERVAL: + String s1 = Integer.toString(min); + String s2 = Integer.toString(max); + b.append("<"); + if (digits > 0) for (int i = s1.length(); i < digits; i++) + b.append('0'); + b.append(s1).append("-"); + if (digits > 0) for (int i = s2.length(); i < digits; i++) + b.append('0'); + b.append(s2).append(">"); + break; + } + return b; + } + + /** + * Returns set of automaton identifiers that occur in this regular expression. 
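For example (editor's sketch, not part of the patch): with the AUTOMATON syntax flag enabled, a pattern that references a named automaton reports that identifier.

    // "<other>" names an external automaton; getIdentifiers() collects such names
    RegExp re = new RegExp("<other>suffix", RegExp.AUTOMATON);
    re.getIdentifiers();     // -> a set containing "other"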
+ */ + public Set getIdentifiers() { + HashSet set = new HashSet(); + getIdentifiers(set); + return set; + } + + void getIdentifiers(Set set) { + switch (kind) { + case REGEXP_UNION: + case REGEXP_CONCATENATION: + case REGEXP_INTERSECTION: + exp1.getIdentifiers(set); + exp2.getIdentifiers(set); + break; + case REGEXP_OPTIONAL: + case REGEXP_REPEAT: + case REGEXP_REPEAT_MIN: + case REGEXP_REPEAT_MINMAX: + case REGEXP_COMPLEMENT: + exp1.getIdentifiers(set); + break; + case REGEXP_AUTOMATON: + set.add(s); + break; + default: + } + } + + static RegExp makeUnion(RegExp exp1, RegExp exp2) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_UNION; + r.exp1 = exp1; + r.exp2 = exp2; + return r; + } + + static RegExp makeConcatenation(RegExp exp1, RegExp exp2) { + if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) + && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString( + exp1, exp2); + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_CONCATENATION; + if (exp1.kind == Kind.REGEXP_CONCATENATION + && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING) + && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) { + r.exp1 = exp1.exp1; + r.exp2 = makeString(exp1.exp2, exp2); + } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) + && exp2.kind == Kind.REGEXP_CONCATENATION + && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) { + r.exp1 = makeString(exp1, exp2.exp1); + r.exp2 = exp2.exp2; + } else { + r.exp1 = exp1; + r.exp2 = exp2; + } + return r; + } + + static private RegExp makeString(RegExp exp1, RegExp exp2) { + StringBuilder b = new StringBuilder(); + if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s); + else b.append(exp1.c); + if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s); + else b.append(exp2.c); + return makeString(b.toString()); + } + + static RegExp makeIntersection(RegExp exp1, RegExp exp2) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_INTERSECTION; + r.exp1 = exp1; + r.exp2 = exp2; + return r; + } + + static RegExp makeOptional(RegExp exp) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_OPTIONAL; + r.exp1 = exp; + return r; + } + + static RegExp makeRepeat(RegExp exp) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_REPEAT; + r.exp1 = exp; + return r; + } + + static RegExp makeRepeat(RegExp exp, int min) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_REPEAT_MIN; + r.exp1 = exp; + r.min = min; + return r; + } + + static RegExp makeRepeat(RegExp exp, int min, int max) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_REPEAT_MINMAX; + r.exp1 = exp; + r.min = min; + r.max = max; + return r; + } + + static RegExp makeComplement(RegExp exp) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_COMPLEMENT; + r.exp1 = exp; + return r; + } + + static RegExp makeChar(char c) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_CHAR; + r.c = c; + return r; + } + + static RegExp makeCharRange(char from, char to) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_CHAR_RANGE; + r.from = from; + r.to = to; + return r; + } + + static RegExp makeAnyChar() { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_ANYCHAR; + return r; + } + + static RegExp makeEmpty() { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_EMPTY; + return r; + } + + static RegExp makeString(String s) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_STRING; + r.s = s; + return r; + } + + static RegExp makeAnyString() { + RegExp r = new RegExp(); + r.kind = 
Kind.REGEXP_ANYSTRING; + return r; + } + + static RegExp makeAutomaton(String s) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_AUTOMATON; + r.s = s; + return r; + } + + static RegExp makeInterval(int min, int max, int digits) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_INTERVAL; + r.min = min; + r.max = max; + r.digits = digits; + return r; + } + + private boolean peek(String s) { + return more() && s.indexOf(b.charAt(pos)) != -1; + } + + private boolean match(char c) { + if (pos >= b.length()) return false; + if (b.charAt(pos) == c) { + pos++; + return true; + } + return false; + } + + private boolean more() { + return pos < b.length(); + } + + private char next() throws IllegalArgumentException { + if (!more()) throw new IllegalArgumentException("unexpected end-of-string"); + return b.charAt(pos++); + } + + private boolean check(int flag) { + return (flags & flag) != 0; + } + + final RegExp parseUnionExp() throws IllegalArgumentException { + RegExp e = parseInterExp(); + if (match('|')) e = makeUnion(e, parseUnionExp()); + return e; + } + + final RegExp parseInterExp() throws IllegalArgumentException { + RegExp e = parseConcatExp(); + if (check(INTERSECTION) && match('&')) e = makeIntersection(e, + parseInterExp()); + return e; + } + + final RegExp parseConcatExp() throws IllegalArgumentException { + RegExp e = parseRepeatExp(); + if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation( + e, parseConcatExp()); + return e; + } + + final RegExp parseRepeatExp() throws IllegalArgumentException { + RegExp e = parseComplExp(); + while (peek("?*+{")) { + if (match('?')) e = makeOptional(e); + else if (match('*')) e = makeRepeat(e); + else if (match('+')) e = makeRepeat(e, 1); + else if (match('{')) { + int start = pos; + while (peek("0123456789")) + next(); + if (start == pos) throw new IllegalArgumentException( + "integer expected at position " + pos); + int n = Integer.parseInt(b.substring(start, pos)); + int m = -1; + if (match(',')) { + start = pos; + while (peek("0123456789")) + next(); + if (start != pos) m = Integer.parseInt(b.substring(start, pos)); + } else m = n; + if (!match('}')) throw new IllegalArgumentException( + "expected '}' at position " + pos); + if (m == -1) e = makeRepeat(e, n); + else e = makeRepeat(e, n, m); + } + } + return e; + } + + final RegExp parseComplExp() throws IllegalArgumentException { + if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp()); + else return parseCharClassExp(); + } + + final RegExp parseCharClassExp() throws IllegalArgumentException { + if (match('[')) { + boolean negate = false; + if (match('^')) negate = true; + RegExp e = parseCharClasses(); + if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e)); + if (!match(']')) throw new IllegalArgumentException( + "expected ']' at position " + pos); + return e; + } else return parseSimpleExp(); + } + + final RegExp parseCharClasses() throws IllegalArgumentException { + RegExp e = parseCharClass(); + while (more() && !peek("]")) + e = makeUnion(e, parseCharClass()); + return e; + } + + final RegExp parseCharClass() throws IllegalArgumentException { + char c = parseCharExp(); + if (match('-')) return makeCharRange(c, parseCharExp()); + else return makeChar(c); + } + + final RegExp parseSimpleExp() throws IllegalArgumentException { + if (match('.')) return makeAnyChar(); + else if (check(EMPTY) && match('#')) return makeEmpty(); + else if (check(ANYSTRING) && match('@')) return makeAnyString(); + else if (match('"')) { + 
int start = pos; + while (more() && !peek("\"")) + next(); + if (!match('"')) throw new IllegalArgumentException( + "expected '\"' at position " + pos); + return makeString(b.substring(start, pos - 1)); + } else if (match('(')) { + if (match(')')) return makeString(""); + RegExp e = parseUnionExp(); + if (!match(')')) throw new IllegalArgumentException( + "expected ')' at position " + pos); + return e; + } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) { + int start = pos; + while (more() && !peek(">")) + next(); + if (!match('>')) throw new IllegalArgumentException( + "expected '>' at position " + pos); + String s = b.substring(start, pos - 1); + int i = s.indexOf('-'); + if (i == -1) { + if (!check(AUTOMATON)) throw new IllegalArgumentException( + "interval syntax error at position " + (pos - 1)); + return makeAutomaton(s); + } else { + if (!check(INTERVAL)) throw new IllegalArgumentException( + "illegal identifier at position " + (pos - 1)); + try { + if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) throw new NumberFormatException(); + String smin = s.substring(0, i); + String smax = s.substring(i + 1, s.length()); + int imin = Integer.parseInt(smin); + int imax = Integer.parseInt(smax); + int digits; + if (smin.length() == smax.length()) digits = smin.length(); + else digits = 0; + if (imin > imax) { + int t = imin; + imin = imax; + imax = t; + } + return makeInterval(imin, imax, digits); + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "interval syntax error at position " + (pos - 1)); + } + } + } else return makeChar(parseCharExp()); + } + + final char parseCharExp() throws IllegalArgumentException { + match('\\'); + return next(); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java new file mode 100644 index 00000000000..f760772073a --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java @@ -0,0 +1,210 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Set; + +/** + * Finite-state automaton with fast run operation. + * + * @lucene.experimental + */ +public final class RunAutomaton implements Serializable { + + final int size; + final boolean[] accept; + final int initial; + final int[] transitions; // delta(state,c) = transitions[state*points.length + + // getCharClass(c)] + final char[] points; // char interval start points + final int[] classmap; // map from char number to class class + + /** + * Returns a string representation of this automaton. + */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("initial state: ").append(initial).append("\n"); + for (int i = 0; i < size; i++) { + b.append("state " + i); + if (accept[i]) b.append(" [accept]:\n"); + else b.append(" [reject]:\n"); + for (int j = 0; j < points.length; j++) { + int k = transitions[i * points.length + j]; + if (k != -1) { + char min = points[j]; + char max; + if (j + 1 < points.length) max = (char) (points[j + 1] - 1); + else max = Character.MAX_VALUE; + b.append(" "); + Transition.appendCharString(min, b); + if (min != max) { + b.append("-"); + Transition.appendCharString(max, b); + } + b.append(" -> ").append(k).append("\n"); + } + } + } + return b.toString(); + } + + /** + * Returns number of states in automaton. + */ + public int getSize() { + return size; + } + + /** + * Returns acceptance status for given state. + */ + public boolean isAccept(int state) { + return accept[state]; + } + + /** + * Returns initial state. + */ + public int getInitialState() { + return initial; + } + + /** + * Returns array of character class interval start points. The array should + * not be modified by the caller. + */ + public char[] getCharIntervals() { + return points.clone(); + } + + /** + * Gets character class of given char. + */ + int getCharClass(char c) { + return SpecialOperations.findIndex(c, points); + } + + /** + * Constructs a new RunAutomaton from a deterministic + * Automaton. + * + * @param a an automaton + */ + public RunAutomaton(Automaton a) { + a.determinize(); + points = a.getStartPoints(); + Set states = a.getStates(); + Automaton.setStateNumbers(states); + initial = a.initial.number; + size = states.size(); + accept = new boolean[size]; + transitions = new int[size * points.length]; + for (int n = 0; n < size * points.length; n++) + transitions[n] = -1; + for (State s : states) { + int n = s.number; + accept[n] = s.accept; + for (int c = 0; c < points.length; c++) { + State q = s.step(points[c]); + if (q != null) transitions[n * points.length + c] = q.number; + } + } + /* + * Set alphabet table for optimal run performance. + */ + classmap = new int[Character.MAX_VALUE + 1]; + int i = 0; + for (int j = 0; j <= Character.MAX_VALUE; j++) { + if (i + 1 < points.length && j == points[i + 1]) i++; + classmap[j] = i; + } + } + + /** + * Returns the state obtained by reading the given char from the given state. + * Returns -1 if not obtaining any such state. (If the original + * Automaton had no dead states, -1 is returned here if and only + * if a dead state is entered in an equivalent automaton with a total + * transition function.) + */ + public int step(int state, char c) { + return transitions[state * points.length + classmap[c]]; + } + + /** + * Returns true if the given string is accepted by this automaton. 
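A worked example (editor's addition, not in the patch) for the longest-run variant further below, run(String, int): with a matcher for the pattern "ab+" (built as in the earlier sketches), the call returns the length of the longest accepted run starting at the given offset.

    // matcher accepts the language of "ab+" (one 'a' followed by one or more 'b')
    RunAutomaton matcher = new RunAutomaton(new RegExp("ab+").toAutomaton());
    matcher.run("xabbby", 1);   // -> 4, the longest accepted run is "abbb"
    matcher.run("xabbby", 0);   // -> -1, no accepted run starts at 'x'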
+ */ + public boolean run(String s) { + int p = initial; + int l = s.length(); + for (int i = 0; i < l; i++) { + p = step(p, s.charAt(i)); + if (p == -1) return false; + } + return accept[p]; + } + + /** + * Returns true if the given string is accepted by this automaton + */ + public boolean run(char[] s, int offset, int length) { + int p = initial; + int l = offset + length; + for (int i = offset; i < l; i++) { + p = step(p, s[i]); + if (p == -1) return false; + } + return accept[p]; + } + + /** + * Returns the length of the longest accepted run of the given string starting + * at the given offset. + * + * @param s the string + * @param offset offset into s where the run starts + * @return length of the longest accepted run, -1 if no run is accepted + */ + public int run(String s, int offset) { + int p = initial; + int l = s.length(); + int max = -1; + for (int r = 0; offset <= l; offset++, r++) { + if (accept[p]) max = r; + if (offset == l) break; + p = step(p, s.charAt(offset)); + if (p == -1) break; + } + return max; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java b/lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java new file mode 100644 index 00000000000..f2a492ccedf --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java @@ -0,0 +1,179 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +/** + * Special automata operations. + * + * @lucene.experimental + */ +final public class SpecialOperations { + + private SpecialOperations() {} + + /** + * Finds the largest entry whose value is less than or equal to c, or 0 if + * there is no such entry. 
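A worked example (not part of the patch) of the contract described above; the points array is made up, and the calls assume same-package access since findIndex is package-private.

    char[] points = {'a', 'e', 'k', 'p'};          // sorted character-class start points
    SpecialOperations.findIndex('g', points);      // -> 1  ('e' is the largest point <= 'g')
    SpecialOperations.findIndex('k', points);      // -> 2  (exact match)
    SpecialOperations.findIndex('Z', points);      // -> 0  (no point <= 'Z'; falls back to 0)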
+ */ + static int findIndex(char c, char[] points) { + int a = 0; + int b = points.length; + while (b - a > 1) { + int d = (a + b) >>> 1; + if (points[d] > c) b = d; + else if (points[d] < c) a = d; + else return d; + } + return a; + } + + /** + * Returns true if the language of this automaton is finite. + */ + public static boolean isFinite(Automaton a) { + if (a.isSingleton()) return true; + return isFinite(a.initial, new HashSet()); + } + + /** + * Checks whether there is a loop containing s. (This is sufficient since + * there are never transitions to dead states.) + */ + private static boolean isFinite(State s, HashSet path) { + path.add(s); + for (Transition t : s.transitions) + if (path.contains(t.to) || !isFinite(t.to, path)) return false; + path.remove(s); + return true; + } + + /** + * Returns the longest string that is a prefix of all accepted strings and + * visits each state at most once. + * + * @return common prefix + */ + public static String getCommonPrefix(Automaton a) { + if (a.isSingleton()) return a.singleton; + StringBuilder b = new StringBuilder(); + HashSet visited = new HashSet(); + State s = a.initial; + boolean done; + do { + done = true; + visited.add(s); + if (!s.accept && s.transitions.size() == 1) { + Transition t = s.transitions.iterator().next(); + if (t.min == t.max && !visited.contains(t.to)) { + b.append(t.min); + s = t.to; + done = false; + } + } + } while (!done); + return b.toString(); + } + + /** + * Returns the longest string that is a suffix of all accepted strings and + * visits each state at most once. + * + * @return common suffix + */ + public static String getCommonSuffix(Automaton a) { + if (a.isSingleton()) // if singleton, the suffix is the string itself. + return a.singleton; + + // reverse the language of the automaton, then reverse its common prefix. + Automaton r = a.clone(); + reverse(r); + r.determinize(); + return reverseUnicode3(SpecialOperations.getCommonPrefix(r)); + } + + /** + * Reverses the language of the given (non-singleton) automaton while returning + * the set of new initial states. + */ + private static Set reverse(Automaton a) { + a.expandSingleton(); + // reverse all edges + HashMap> m = new HashMap>(); + Set states = a.getStates(); + Set accept = a.getAcceptStates(); + for (State r : states) { + m.put(r, new HashSet()); + r.accept = false; + } + for (State r : states) + for (Transition t : r.getTransitions()) + m.get(t.to).add(new Transition(t.min, t.max, r)); + for (State r : states) + r.transitions = m.get(r); + // make new initial+final states + a.initial.accept = true; + a.initial = new State(); + for (State r : accept) + a.initial.addEpsilon(r); // ensures that all initial states are reachable + a.deterministic = false; + return accept; + } + + /** + * Intentionally use a unicode 3 reverse. + * This is because we are only going to reverse it again... + */ + private static String reverseUnicode3( final String input ){ + char[] charInput = input.toCharArray(); + reverseUnicode3(charInput, 0, charInput.length); + return new String(charInput); + } + + /** + * Intentionally use a unicode 3 reverse. + * This is because it is only used by getCommonSuffix(), + * which will reverse the entire FSM using code unit reversal, + * so we must then reverse its common prefix back using the + * same code unit reversal. 
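A brief usage sketch of the prefix/suffix helpers above (editor's illustration; the two strings are arbitrary, and the collection overload of BasicOperations.union plus determinize() are assumed to be the same entry points used elsewhere in this patch).

    // union of two string automata, determinized so the prefix walk sees a DFA
    Automaton a = BasicOperations.union(Arrays.asList(
        BasicAutomata.makeString("rethinking"),
        BasicAutomata.makeString("relinking")));
    a.determinize();
    SpecialOperations.getCommonPrefix(a);   // -> "re"
    SpecialOperations.getCommonSuffix(a);   // -> "inking"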
+ */ + private static void reverseUnicode3(char[] buffer, int start, int len){ + if (len <= 1) return; + int num = len>>1; + for (int i = start; i < ( start + num ); i++) { + char c = buffer[i]; + buffer[i] = buffer[start * 2 + len - i - 1]; + buffer[start * 2 + len - i - 1] = c; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/State.java b/lucene/src/java/org/apache/lucene/util/automaton/State.java new file mode 100644 index 00000000000..da33711968d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/State.java @@ -0,0 +1,209 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Automaton state. + * + * @lucene.experimental + */ +public class State implements Serializable, Comparable { + + boolean accept; + Set transitions; + + int number; + + int id; + static int next_id; + + /** + * Constructs a new state. Initially, the new state is a reject state. + */ + public State() { + resetTransitions(); + id = next_id++; + } + + /** + * Resets transition set. + */ + final void resetTransitions() { + transitions = new HashSet(); + } + + /** + * Returns the set of outgoing transitions. Subsequent changes are reflected + * in the automaton. + * + * @return transition set + */ + public Set getTransitions() { + return transitions; + } + + /** + * Adds an outgoing transition. + * + * @param t transition + */ + public void addTransition(Transition t) { + transitions.add(t); + } + + /** + * Sets acceptance for this state. + * + * @param accept if true, this state is an accept state + */ + public void setAccept(boolean accept) { + this.accept = accept; + } + + /** + * Returns acceptance status. + * + * @return true is this is an accept state + */ + public boolean isAccept() { + return accept; + } + + /** + * Performs lookup in transitions, assuming determinism. 
+ * + * @param c character to look up + * @return destination state, null if no matching outgoing transition + * @see #step(char, Collection) + */ + public State step(char c) { + for (Transition t : transitions) + if (t.min <= c && c <= t.max) return t.to; + return null; + } + + /** + * Performs lookup in transitions, allowing nondeterminism. + * + * @param c character to look up + * @param dest collection where destination states are stored + * @see #step(char) + */ + public void step(char c, Collection dest) { + for (Transition t : transitions) + if (t.min <= c && c <= t.max) dest.add(t.to); + } + + void addEpsilon(State to) { + if (to.accept) accept = true; + for (Transition t : to.transitions) + transitions.add(t); + } + + /** + * Returns transitions sorted by (min, reverse max, to) or (to, min, reverse + * max) + */ + public Transition[] getSortedTransitionArray(boolean to_first) { + Transition[] e = transitions.toArray(new Transition[transitions.size()]); + Arrays.sort(e, new TransitionComparator(to_first)); + return e; + } + + /** + * Returns sorted list of outgoing transitions. + * + * @param to_first if true, order by (to, min, reverse max); otherwise (min, + * reverse max, to) + * @return transition list + */ + public List getSortedTransitions(boolean to_first) { + return Arrays.asList(getSortedTransitionArray(to_first)); + } + + + /** + * Return this state's number. + *

    + * Expert: Will be useless unless {@link Automaton#setStateNumbers(Set)} + * has been called first to number the states. + * @return the number + */ + public int getNumber() { + return number; + } + + /** + * Returns string describing this state. Normally invoked via + * {@link Automaton#toString()}. + */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("state ").append(number); + if (accept) b.append(" [accept]"); + else b.append(" [reject]"); + b.append(":\n"); + for (Transition t : transitions) + b.append(" ").append(t.toString()).append("\n"); + return b.toString(); + } + + /** + * Compares this object with the specified object for order. States are + * ordered by the time of construction. + */ + public int compareTo(State s) { + return s.id - id; + } + + /** + * See {@link java.lang.Object#equals(java.lang.Object)}. + */ + @Override + public boolean equals(Object obj) { + return super.equals(obj); + } + + /** + * See {@link java.lang.Object#hashCode()}. + */ + @Override + public int hashCode() { + return super.hashCode(); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/StatePair.java b/lucene/src/java/org/apache/lucene/util/automaton/StatePair.java new file mode 100644 index 00000000000..4124b9e8b93 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/StatePair.java @@ -0,0 +1,101 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +/** + * Pair of states. + * + * @lucene.experimental + */ +public class StatePair { + State s; + State s1; + State s2; + + StatePair(State s, State s1, State s2) { + this.s = s; + this.s1 = s1; + this.s2 = s2; + } + + /** + * Constructs a new state pair. + * + * @param s1 first state + * @param s2 second state + */ + public StatePair(State s1, State s2) { + this.s1 = s1; + this.s2 = s2; + } + + /** + * Returns first component of this pair. + * + * @return first state + */ + public State getFirstState() { + return s1; + } + + /** + * Returns second component of this pair. 
+ * + * @return second state + */ + public State getSecondState() { + return s2; + } + + /** + * Checks for equality. + * + * @param obj object to compare with + * @return true if obj represents the same pair of states as this + * pair + */ + @Override + public boolean equals(Object obj) { + if (obj instanceof StatePair) { + StatePair p = (StatePair) obj; + return p.s1 == s1 && p.s2 == s2; + } else return false; + } + + /** + * Returns hash code. + * + * @return hash code + */ + @Override + public int hashCode() { + return s1.hashCode() + s2.hashCode(); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/src/java/org/apache/lucene/util/automaton/Transition.java new file mode 100644 index 00000000000..55fe7aee9e1 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/Transition.java @@ -0,0 +1,174 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; + +/** + * Automaton transition. + *
<p>
    + * A transition, which belongs to a source state, consists of a Unicode + * character interval and a destination state. + * + * @lucene.experimental + */ +public class Transition implements Serializable, Cloneable { + + /* + * CLASS INVARIANT: min<=max + */ + + char min; + char max; + + State to; + + /** + * Constructs a new singleton interval transition. + * + * @param c transition character + * @param to destination state + */ + public Transition(char c, State to) { + min = max = c; + this.to = to; + } + + /** + * Constructs a new transition. Both end points are included in the interval. + * + * @param min transition interval minimum + * @param max transition interval maximum + * @param to destination state + */ + public Transition(char min, char max, State to) { + if (max < min) { + char t = max; + max = min; + min = t; + } + this.min = min; + this.max = max; + this.to = to; + } + + /** Returns minimum of this transition interval. */ + public char getMin() { + return min; + } + + /** Returns maximum of this transition interval. */ + public char getMax() { + return max; + } + + /** Returns destination of this transition. */ + public State getDest() { + return to; + } + + /** + * Checks for equality. + * + * @param obj object to compare with + * @return true if obj is a transition with same character interval + * and destination state as this transition. + */ + @Override + public boolean equals(Object obj) { + if (obj instanceof Transition) { + Transition t = (Transition) obj; + return t.min == min && t.max == max && t.to == to; + } else return false; + } + + /** + * Returns hash code. The hash code is based on the character interval (not + * the destination state). + * + * @return hash code + */ + @Override + public int hashCode() { + return min * 2 + max * 3; + } + + /** + * Clones this transition. + * + * @return clone with same character interval and destination state + */ + @Override + public Transition clone() { + try { + return (Transition) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + static void appendCharString(char c, StringBuilder b) { + if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.append(c); + else { + b.append("\\u"); + String s = Integer.toHexString(c); + if (c < 0x10) b.append("000").append(s); + else if (c < 0x100) b.append("00").append(s); + else if (c < 0x1000) b.append("0").append(s); + else b.append(s); + } + } + + /** + * Returns a string describing this state. Normally invoked via + * {@link Automaton#toString()}. + */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + appendCharString(min, b); + if (min != max) { + b.append("-"); + appendCharString(max, b); + } + b.append(" -> ").append(to.number); + return b.toString(); + } + + void appendDot(StringBuilder b) { + b.append(" -> ").append(to.number).append(" [label=\""); + appendCharString(min, b); + if (min != max) { + b.append("-"); + appendCharString(max, b); + } + b.append("\"]\n"); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java b/lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java new file mode 100644 index 00000000000..1cb9f4df6b8 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java @@ -0,0 +1,75 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Comparator; + +/** + * Comparator for state {@link Transition}s that orders Unicode char range + * transitions in lexicographic order. + * + * @lucene.experimental + */ +class TransitionComparator implements Comparator<Transition>, Serializable { + + boolean to_first; + + TransitionComparator(boolean to_first) { + this.to_first = to_first; + } + + /** + * Compares by (min, reverse max, to) or (to, min, reverse max). + */ + public int compare(Transition t1, Transition t2) { + if (to_first) { + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + } + if (t1.min < t2.min) return -1; + if (t1.min > t2.min) return 1; + if (t1.max > t2.max) return -1; + if (t1.max < t2.max) return 1; + if (!to_first) { + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + } + return 0; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/createLevAutomata.py b/lucene/src/java/org/apache/lucene/util/automaton/createLevAutomata.py new file mode 100644 index 00000000000..d997ecb8e9b --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/createLevAutomata.py @@ -0,0 +1,492 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note, this file is known to work with rev 115 of the moman +# repository (http://bitbucket.org/jpbarrette/moman/overview) +# +# See also: http://sites.google.com/site/rrettesite/moman + +import math +import os +import sys +sys.path.insert(0, 'moman/finenight/python') +try: + from possibleStates import genTransitions +except ImportError: + from finenight.possibleStates import genTransitions + +MODE = 'array' +PACKED = True +WORD = 64 +LOG2_WORD = int(math.log(WORD)/math.log(2)) +#MODE = 'switch' + +class LineOutput: + + def __init__(self, indent=''): + self.l = [] + self._indent = self.startIndent = indent + self.inComment = False + + def __call__(self, s, indent=0): + if s.find('}') != -1: + assert self._indent != self.startIndent + self._indent = self._indent[:-2] + + if indent != 0: + indent0 = ' ' * (len(self._indent)/2+indent) + else: + indent0 = self._indent + + if s.find('/*') != -1: + if s.find('*/') == -1: + self.inComment = True + elif s.find('*/') != -1: + self.inComment = True + + if self.inComment: + self.l.append(indent0 + s) + else: + self.l.append(indent0 + s.lstrip()) + + self.inComment = self.inComment and s.find('*/') == -1 + + if s.find('{') != -1: + self._indent += ' ' + + def __str__(self): + if True: + assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \ + (len(self._indent), len(self.startIndent)) + return '\n'.join(self.l) + + def indent(self): + self._indent += ' ' + + def outdent(self): + assert self._indent != self.startIndent + self._indent = self._indent[:-2] + +def charVarNumber(charVar): + """ + Maps binary number (eg [1, 0, 1]) to its decimal value (5). + """ + + p = 1 + sum = 0 + downTo = len(charVar)-1 + while downTo >= 0: + sum += p * int(charVar[downTo]) + p *= 2 + downTo -= 1 + return sum + +def main(): + + if len(sys.argv) != 2: + print + print 'Usage: python -u %s N' % sys.argv[0] + print + print 'NOTE: the resulting .java file is created in the current working dir!' + print + sys.exit(1) + + n = int(sys.argv[1]) + + tables = genTransitions(n) + + stateMap = {} + + # init null state + stateMap['[]'] = -1 + + # init start state + stateMap['[(0, 0)]'] = 0 + + w = LineOutput() + + w('package org.apache.lucene.util.automaton;') + w('') + w('/**') + w(' * Licensed to the Apache Software Foundation (ASF) under one or more') + w(' * contributor license agreements. See the NOTICE file distributed with') + w(' * this work for additional information regarding copyright ownership.') + w(' * The ASF licenses this file to You under the Apache License, Version 2.0') + w(' * (the "License"); you may not use this file except in compliance with') + w(' * the License. 
You may obtain a copy of the License at') + w(' *') + w(' * http://www.apache.org/licenses/LICENSE-2.0') + w(' *') + w(' * Unless required by applicable law or agreed to in writing, software') + w(' * distributed under the License is distributed on an "AS IS" BASIS,') + w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.') + w(' * See the License for the specific language governing permissions and') + w(' * limitations under the License.') + w(' */') + w('') + w('// The following code was generated with the moman/finenight pkg') + w('// This package is available under the MIT License, see NOTICE.txt') + w('// for more details.') + w('') + w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;') + w('') + w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n) + className = 'Lev%dParametricDescription' % n + + w('class %s extends ParametricDescription {' % className) + + w('') + w('@Override') + w('int transition(int absState, int position, int vector) {') + + w(' // null absState should never be passed in') + w(' assert absState != -1;') + + w('') + w(' // decode absState -> state, offset') + w(' int state = absState/(w+1);') + w(' int offset = absState%(w+1);') + w(' assert offset >= 0;') + w('') + + machines = [] + + for i, map in enumerate(tables): + if i == 0: + w('if (position == w) {') + elif i == len(tables)-1: + w('} else {') + else: + w('} else if (position == w-%d) {' % i) + + if i != 0 and MODE == 'switch': + w('switch(vector) {') + + l = map.items() + l.sort() + + numCasesPerVector = None + numVectors = len(l) + + if MODE == 'array': + toStateArray = [] + toOffsetIncrArray = [] + + for charVar, states in l: + + # somehow it's a string: + charVar = eval(charVar) + + if i != 0 and MODE == 'switch': + w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar]))) + w.indent() + + l = states.items() + + byFromState = {} + + # first pass to assign states + byAction = {} + for s, (toS, offset) in l: + state = str(s) + if state == '[]': + # don't waste code on the null state + continue + + toState = str(toS) + if state not in stateMap: + stateMap[state] = len(stateMap)-1 + if toState not in stateMap: + stateMap[toState] = len(stateMap)-1 + + byFromState[stateMap[state]] = (1+stateMap[toState], offset) + + fromStateDesc = ', '.join([str(x) for x in eval(s)]) + toStateDesc = ', '.join([str(x) for x in toS]) + + tup = (stateMap[toState], toStateDesc, offset) + if tup not in byAction: + byAction[tup] = [] + byAction[tup].append((fromStateDesc, stateMap[state])) + + if numCasesPerVector is None: + numCasesPerVector = len(l)-1 + else: + # we require this to be uniform... empirically it seems to be! + assert numCasesPerVector == len(l)-1 + + if MODE == 'array': + + for s in range(numCasesPerVector): + toState, offsetIncr = byFromState[s] + toStateArray.append(toState) + toOffsetIncrArray.append(offsetIncr) + + else: + + # render switches + w('switch(state) { // %s cases' % len(l)) + + for (toState, toStateDesc, offset), lx in byAction.items(): + for fromStateDesc, fromState in lx: + w('case %s: // %s' % (fromState, fromStateDesc)) + w.indent() + w(' state = %s; // %s' % (toState, toStateDesc)) + if offset > 0: + w(' offset += %s;' % offset) + w('break;') + w.outdent() + + w('}') + if i != 0: + w('break;') + w.outdent() + + if MODE == 'array': + # strangely state can come in wildly out of bounds.... 
+ w(' if (state < %d) {' % numCasesPerVector) + w(' final int loc = vector * %d + state;' % numCasesPerVector) + if PACKED: + w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i)) + w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i)) + else: + w(' offset += offsetIncrs%d[loc];' % i) + w(' state = toStates%d[loc]-1;' % i) + w(' }') + elif i != 0: + w('}') + + machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors)) + + # ends switch statement for machine + w('}') + + w('') + + w(' if (state == -1) {') + w(' // null state') + w(' return -1;') + w(' } else {') + w(' // translate back to abs') + w(' return state*(w+1)+offset;') + w(' }') + + # ends transition method + w('}') + + subs = [] + if MODE == 'array': + w.indent() + for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines): + w('') + w.outdent() + w('// %d vectors; %d states per vector; array length = %d' % \ + (numVectors, numCasesPerVector, numVectors*numCasesPerVector)) + w.indent() + if PACKED: + # pack in python + l, nbits = pack(toStateArray) + subs.append(('NBITSSTATES%d' % i, str(nbits))) + w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \ + (i, nbits, renderList([hex(long(x)) for x in l]))) + + l, nbits = pack(toOffsetIncrsArray) + subs.append(('NBITSOFFSET%d' % i, str(nbits))) + w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \ + (i, nbits, renderList([hex(long(x)) for x in l]))) + else: + w(' private final static int[] toStates%d = new int[] %s;' % \ + (i, renderList([str(x) for x in toStateArray]))) + w(' private final static int[] offsetIncrs%d = new int[] %s;' % \ + (i, renderList([str(x) for x in toStateArray]))) + w.outdent() + + stateMap2 = dict([[v,k] for k,v in stateMap.items()]) + w('') + w('// state map') + sum = 0 + minErrors = [] + for i in xrange(len(stateMap2)-1): + w('// %s -> %s' % (i, stateMap2[i])) + v = eval(stateMap2[i]) + minError = min([-i+e for i, e in v]) + c = len(v) + sum += c + minErrors.append(minError) + w('') + + w.indent() + #w('private final static int[] minErrors = new int[] {%s};' % ','.join([str(x) for x in minErrors])) + + w.outdent() + + w('') + w(' public %s(int w) {' % className) + w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1) + w(' }') + + if 0: + w('') + w('@Override') + w('public int size() { // this can now move up?') + w(' return %d*(w+1);' % (len(stateMap2)-1)) + w('}') + + w('') + w('@Override') + w('public int getPosition(int absState) { // this can now move up?') + w(' return absState % (w+1);') + w('}') + + w('') + w('@Override') + w('public boolean isAccept(int absState) { // this can now move up?') + w(' // decode absState -> state, offset') + w(' int state = absState/(w+1);') + w(' if (true || state < minErrors.length) {') + w(' int offset = absState%(w+1);') + w(' assert offset >= 0;') + w(' return w - offset + minErrors[state] <= %d;' % n) + w(' } else {') + w(' return false;') + w(' }') + w('}') + + if MODE == 'array' and PACKED: + + # we moved into super class + if False: + w('') + + v = 2 + l = [] + for i in range(63): + l.append(hex(v-1)) + v *= 2 + + w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1) + w('') + + # unpack in java + w('private int unpack(long[] data, int index, int bitsPerValue) {') + w(' final long bitLoc = bitsPerValue * index;') + w(' final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD) + w(' final int 
bitStart = (int) (bitLoc & %d);' % (WORD-1)) + w(' //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);') + w(' if (bitStart + bitsPerValue <= %d) {' % WORD) + w(' // not split') + w(' return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);') + w(' } else {') + w(' // split') + w(' final int part = %d-bitStart;' % WORD) + w(' return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +') + w(' ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1) + w(' }') + w('}') + + # class + w('}') + w('') + + fileOut = '%s.java' % className + + s = str(w) + for sub, repl in subs: + s = s.replace(sub, repl) + + open(fileOut, 'wb').write(s) + + print 'Wrote %s [%d lines; %.1f KB]' % \ + (fileOut, len(w.l), os.path.getsize(fileOut)/1024.) + +def renderList(l): + lx = [' '] + for i in xrange(len(l)): + if i > 0: + lx.append(',') + if i % 4 == 0: + lx.append('\n ') + lx.append(l[i]) + return '{\n%s\n }' % ''.join(lx) + +MASKS = [] +v = 2 +for i in xrange(63): + MASKS.append(v-1) + v *= 2 + +# packs into longs; returns long[], numBits +def pack(l): + maxV = max(l) + bitsPerValue = max(1, int(math.ceil(math.log(maxV+1)/math.log(2.0)))) + + bitsLeft = WORD + pendingValue = 0 + + packed = [] + for i in xrange(len(l)): + v = l[i] + if pendingValue > 0: + bitsUsed = math.ceil(math.log(pendingValue)/math.log(2.0)) + assert bitsUsed <= (WORD-bitsLeft), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (bitsLeft, WORD, bitsLeft, WORD-bitsLeft, bitsUsed) + + if bitsLeft >= bitsPerValue: + pendingValue += v << (WORD-bitsLeft) + bitsLeft -= bitsPerValue + if bitsLeft == 0: + packed.append(pendingValue) + bitsLeft = WORD + pendingValue = 0 + else: + # split + + # bottom bitsLeft go in current word: + pendingValue += (v & MASKS[bitsLeft-1]) << (WORD-bitsLeft) + packed.append(pendingValue) + + pendingValue = v >> bitsLeft + bitsLeft = WORD - (bitsPerValue-bitsLeft) + + if bitsLeft < WORD: + packed.append(pendingValue) + + # verify(l, packed, bitsPerValue) + + return packed, bitsPerValue + +def verify(data, packedData, bitsPerValue): + for i in range(len(data)): + assert data[i] == unpack(packedData, i, bitsPerValue) + +def unpack(data, index, bitsPerValue): + bitLoc = bitsPerValue * index + dataLoc = int(bitLoc >> LOG2_WORD) + bitStart = int(bitLoc & (WORD-1)) + if bitStart + bitsPerValue <= WORD: + # not split + return int(((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1])) + else: + # split + part = WORD-bitStart; + return int((((data[dataLoc] >> bitStart) & MASKS[part-1]) + + ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part))) + +if __name__ == '__main__': + if not __debug__: + print + print 'ERROR: please run without -O' + print + sys.exit(1) + main() diff --git a/lucene/src/java/org/apache/lucene/util/automaton/package.html b/lucene/src/java/org/apache/lucene/util/automaton/package.html new file mode 100644 index 00000000000..0ac5d805241 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/package.html @@ -0,0 +1,47 @@ + + + + +Finite-state automaton for regular expressions. +
<p>
    +This package contains a full DFA/NFA implementation with Unicode +alphabet and support for all standard (and a number of non-standard) +regular expression operations. +
<p>
    +The most commonly used functionality is located in the classes +{@link org.apache.lucene.util.automaton.Automaton} and +{@link org.apache.lucene.util.automaton.RegExp}. +
<p>
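A minimal sketch of that usage, stepping a string through a determinized automaton by hand: the RegExp and Automaton calls below follow the dk.brics API this package is ported from and are assumed here rather than taken from this part of the patch; State.step(char) is the method added above.
<pre>
  // Illustrative sketch only; RegExp, Automaton and isAccept() are assumed API,
  // while State.step(char) is defined in State.java earlier in this patch.
  Automaton a = new RegExp("ab.*c").toAutomaton();   // assumed deterministic here
  State s = a.getInitialState();
  for (char c : "abxyzc".toCharArray()) {
    s = s.step(c);          // follow the matching outgoing transition, if any
    if (s == null) break;   // no transition accepts c: the string is rejected
  }
  boolean accepted = s != null && s.isAccept();
</pre>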
    +For more information, go to the package home page at +http://www.brics.dk/automaton/. +@lucene.experimental + + diff --git a/lucene/src/java/org/apache/lucene/util/packed/Direct16.java b/lucene/src/java/org/apache/lucene/util/packed/Direct16.java new file mode 100644 index 00000000000..c97cace0247 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/Direct16.java @@ -0,0 +1,86 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Direct wrapping of 16 bit values to a backing array of shorts. + */ + +class Direct16 extends PackedInts.ReaderImpl + implements PackedInts.Mutable { + private short[] blocks; + private static final int BITS_PER_VALUE = 16; + + public Direct16(int valueCount) { + super(valueCount, BITS_PER_VALUE); + blocks = new short[valueCount]; + } + + public Direct16(IndexInput in, int valueCount) throws IOException { + super(valueCount, BITS_PER_VALUE); + short[] blocks = new short[valueCount]; + for(int i=0;i

    + * Note: The blocks are used directly, so changes to the given block will + * affect the structure. + * @param blocks used as the internal backing array. + */ + public Direct16(short[] blocks) { + super(blocks.length, BITS_PER_VALUE); + this.blocks = blocks; + } + + public long get(final int index) { + return 0xFFFFL & blocks[index]; + } + + public void set(final int index, final long value) { + blocks[index] = (short)(value & 0xFFFF); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + blocks.length * RamUsageEstimator.NUM_BYTES_SHORT; + } + + public void clear() { + Arrays.fill(blocks, (short)0); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/Direct32.java b/lucene/src/java/org/apache/lucene/util/packed/Direct32.java new file mode 100644 index 00000000000..da3b33c5017 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/Direct32.java @@ -0,0 +1,82 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Direct wrapping of 32 bit values to a backing array of ints. + */ + +class Direct32 extends PackedInts.ReaderImpl + implements PackedInts.Mutable { + private int[] blocks; + private static final int BITS_PER_VALUE = 32; + + public Direct32(int valueCount) { + super(valueCount, BITS_PER_VALUE); + blocks = new int[valueCount]; + } + + public Direct32(IndexInput in, int valueCount) throws IOException { + super(valueCount, BITS_PER_VALUE); + int[] blocks = new int[valueCount]; + for(int i=0;i

+ * Note: The blocks are used directly, so changes to the given block will + * affect the structure. + * @param blocks used as the internal backing array. + */ + public Direct32(int[] blocks) { + super(blocks.length, BITS_PER_VALUE); + this.blocks = blocks; + } + + public long get(final int index) { + return 0xFFFFFFFFL & blocks[index]; + } + + public void set(final int index, final long value) { + blocks[index] = (int)(value & 0xFFFFFFFF); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + blocks.length * RamUsageEstimator.NUM_BYTES_INT; + } + + public void clear() { + Arrays.fill(blocks, 0); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/Direct64.java b/lucene/src/java/org/apache/lucene/util/packed/Direct64.java new file mode 100644 index 00000000000..cb6b9df343a --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/Direct64.java @@ -0,0 +1,79 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Direct wrapping of 64 bit values to a backing array of longs. + */ + +class Direct64 extends PackedInts.ReaderImpl + implements PackedInts.Mutable { + private long[] blocks; + private static final int BITS_PER_VALUE = 64; + + public Direct64(int valueCount) { + super(valueCount, BITS_PER_VALUE); + blocks = new long[valueCount]; + } + + public Direct64(IndexInput in, int valueCount) throws IOException { + super(valueCount, BITS_PER_VALUE); + long[] blocks = new long[valueCount]; + for(int i=0;i

    + * Note: The blocks are used directly, so changes to the given block will + * affect the structure. + * @param blocks used as the internal backing array. + */ + public Direct64(long[] blocks) { + super(blocks.length, BITS_PER_VALUE); + this.blocks = blocks; + } + + public long get(final int index) { + return blocks[index]; + } + + public void set(final int index, final long value) { + blocks[index] = value; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + blocks.length * RamUsageEstimator.NUM_BYTES_LONG; + } + + public void clear() { + Arrays.fill(blocks, 0L); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/Direct8.java b/lucene/src/java/org/apache/lucene/util/packed/Direct8.java new file mode 100644 index 00000000000..128323b8667 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/Direct8.java @@ -0,0 +1,86 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Direct wrapping of 8 bit values to a backing array of bytes. + */ + +class Direct8 extends PackedInts.ReaderImpl + implements PackedInts.Mutable { + private byte[] blocks; + private static final int BITS_PER_VALUE = 8; + + public Direct8(int valueCount) { + super(valueCount, BITS_PER_VALUE); + blocks = new byte[valueCount]; + } + + public Direct8(IndexInput in, int valueCount) + throws IOException { + super(valueCount, BITS_PER_VALUE); + byte[] blocks = new byte[valueCount]; + for(int i=0;i

    + * Note: The blocks are used directly, so changes to the given block will + * affect the structure. + * @param blocks used as the internal backing array. + */ + public Direct8(byte[] blocks) { + super(blocks.length, BITS_PER_VALUE); + this.blocks = blocks; + } + + public long get(final int index) { + return 0xFFL & blocks[index]; + } + + public void set(final int index, final long value) { + blocks[index] = (byte)(value & 0xFF); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + blocks.length; + } + + public void clear() { + Arrays.fill(blocks, (byte)0); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/Packed32.java b/lucene/src/java/org/apache/lucene/util/packed/Packed32.java new file mode 100644 index 00000000000..2979786a532 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/Packed32.java @@ -0,0 +1,221 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Space optimized random access capable array of values with a fixed number of + * bits. The maximum number of bits/value is 31. Use {@link Packed64} for higher + * numbers. + *
<p>
    + * The implementation strives to avoid conditionals and expensive operations, + * sacrificing code clarity to achieve better performance. + */ + +class Packed32 extends PackedInts.ReaderImpl implements PackedInts.Mutable { + static final int BLOCK_SIZE = 32; // 32 = int, 64 = long + static final int BLOCK_BITS = 5; // The #bits representing BLOCK_SIZE + static final int MOD_MASK = BLOCK_SIZE - 1; // x % BLOCK_SIZE + + private static final int ENTRY_SIZE = BLOCK_SIZE + 1; + private static final int FAC_BITPOS = 3; + + /* + * In order to make an efficient value-getter, conditionals should be + * avoided. A value can be positioned inside of a block, requiring shifting + * left or right or it can span two blocks, requiring a left-shift on the + * first block and a right-shift on the right block. + *
<p>
    + * By always shifting the first block both left and right, we get exactly + * the right bits. By always shifting the second block right and applying + * a mask, we get the right bits there. After that, we | the two bitsets. + */ + private static final int[][] SHIFTS = + new int[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + private static final int[][] MASKS = new int[ENTRY_SIZE][ENTRY_SIZE]; + + static { // Generate shifts + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int[] currentShifts = SHIFTS[elementBits]; + int base = bitPos * FAC_BITPOS; + currentShifts[base ] = bitPos; + currentShifts[base + 1] = BLOCK_SIZE - elementBits; + if (bitPos <= BLOCK_SIZE - elementBits) { // Single block + currentShifts[base + 2] = 0; + MASKS[elementBits][bitPos] = 0; + } else { // Two blocks + int rBits = elementBits - (BLOCK_SIZE - bitPos); + currentShifts[base + 2] = BLOCK_SIZE - rBits; + MASKS[elementBits][bitPos] = ~(~0 << rBits); + } + } + } + } + + /* + * The setter requires more masking than the getter. + */ + private static final int[][] WRITE_MASKS = + new int[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + static { + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + int elementPosMask = ~(~0 << elementBits); + int[] currentShifts = SHIFTS[elementBits]; + int[] currentMasks = WRITE_MASKS[elementBits]; + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int base = bitPos * FAC_BITPOS; + currentMasks[base ] =~((elementPosMask + << currentShifts[base + 1]) + >>> currentShifts[base]); + currentMasks[base+1] = ~(elementPosMask + << currentShifts[base + 2]); + currentMasks[base+2] = currentShifts[base + 2] == 0 ? 0 : ~0; + } + } + } + + /* The bits */ + private int[] blocks; + + // Cached calculations + private int maxPos; // blocks.length * BLOCK_SIZE / bitsPerValue - 1 + private int[] shifts; // The shifts for the current bitsPerValue + private int[] readMasks; + private int[] writeMasks; + + /** + * Creates an array with the internal structures adjusted for the given + * limits and initialized to 0. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * Note: bitsPerValue >32 is not supported by this implementation. + */ + public Packed32(int valueCount, int bitsPerValue) { + this(new int[(int)(((long)valueCount) * bitsPerValue / BLOCK_SIZE + 2)], + valueCount, bitsPerValue); + } + + /** + * Creates an array with content retrieved from the given IndexInput. + * @param in an IndexInput, positioned at the start of Packed64-content. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @throws java.io.IOException if the values for the backing array could not + * be retrieved. + */ + public Packed32(IndexInput in, int valueCount, int bitsPerValue) + throws IOException { + super(valueCount, bitsPerValue); + int size = size(bitsPerValue, valueCount); + blocks = new int[size + 1]; // +1 due to non-conditional tricks + // TODO: find a faster way to bulk-read ints... + for(int i = 0 ; i < size ; i++) { + blocks[i] = in.readInt(); + } + if (size % 2 == 1) { + in.readInt(); // Align to long + } + updateCached(); + } + + private static int size(int bitsPerValue, int valueCount) { + final long totBitCount = (long) valueCount * bitsPerValue; + return (int) (totBitCount/32 + ((totBitCount % 32 == 0 ) ? 
0:1)); + } + + + /** + * Creates an array backed by the given blocks. + *
<p>
    + * Note: The blocks are used directly, so changes to the given block will + * affect the Packed32-structure. + * @param blocks used as the internal backing array. + * @param valueCount the number of values. + * @param bitsPerValue the number of bits available for any given value. + * Note: bitsPerValue >32 is not supported by this implementation. + */ + public Packed32(int[] blocks, int valueCount, int bitsPerValue) { + // TODO: Check that blocks.length is sufficient for holding length values + super(valueCount, bitsPerValue); + if (bitsPerValue > 31) { + throw new IllegalArgumentException(String.format( + "This array only supports values of 31 bits or less. The " + + "required number of bits was %d. The Packed64 " + + "implementation allows values with more than 31 bits", + bitsPerValue)); + } + this.blocks = blocks; + updateCached(); + } + + private void updateCached() { + readMasks = MASKS[bitsPerValue]; + maxPos = (int)((((long)blocks.length) * BLOCK_SIZE / bitsPerValue) - 2); + shifts = SHIFTS[bitsPerValue]; + writeMasks = WRITE_MASKS[bitsPerValue]; + } + + /** + * @param index the position of the value. + * @return the value at the given index. + */ + public long get(final int index) { + final long majorBitPos = index * bitsPerValue; + final int elementPos = (int)(majorBitPos >>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + + final int base = bitPos * FAC_BITPOS; + + return ((blocks[elementPos] << shifts[base]) >>> shifts[base+1]) | + ((blocks[elementPos+1] >>> shifts[base+2]) & readMasks[bitPos]); + } + + public void set(final int index, final long value) { + final int intValue = (int)value; + final long majorBitPos = index * bitsPerValue; + final int elementPos = (int)(majorBitPos >>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + final int base = bitPos * FAC_BITPOS; + + blocks[elementPos ] = (blocks[elementPos ] & writeMasks[base]) + | (intValue << shifts[base + 1] >>> shifts[base]); + blocks[elementPos+1] = (blocks[elementPos+1] & writeMasks[base+1]) + | ((intValue << shifts[base + 2]) + & writeMasks[base+2]); + } + + public void clear() { + Arrays.fill(blocks, 0); + } + + public String toString() { + return "Packed32(bitsPerValue=" + bitsPerValue + ", maxPos=" + maxPos + + ", elements.length=" + blocks.length + ")"; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + blocks.length * RamUsageEstimator.NUM_BYTES_INT; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/Packed64.java b/lucene/src/java/org/apache/lucene/util/packed/Packed64.java new file mode 100644 index 00000000000..9cdde874820 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/Packed64.java @@ -0,0 +1,211 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Space optimized random access capable array of values with a fixed number of + * bits. For 32 bits/value and less, performance on 32 bit machines is not + * optimal. Consider using {@link Packed32} for such a setup. + *
<p>
    + * The implementation strives to avoid conditionals and expensive operations, + * sacrificing code clarity to achieve better performance. + */ + +class Packed64 extends PackedInts.ReaderImpl implements PackedInts.Mutable { + static final int BLOCK_SIZE = 64; // 32 = int, 64 = long + static final int BLOCK_BITS = 6; // The #bits representing BLOCK_SIZE + static final int MOD_MASK = BLOCK_SIZE - 1; // x % BLOCK_SIZE + + private static final int ENTRY_SIZE = BLOCK_SIZE + 1; + private static final int FAC_BITPOS = 3; + + /* + * In order to make an efficient value-getter, conditionals should be + * avoided. A value can be positioned inside of a block, requiring shifting + * left or right or it can span two blocks, requiring a left-shift on the + * first block and a right-shift on the right block. + *
<p>
    + * By always shifting the first block both left and right, we get exactly + * the right bits. By always shifting the second block right and applying + * a mask, we get the right bits there. After that, we | the two bitsets. + */ + private static final int[][] SHIFTS = + new int[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + //new int[BLOCK_SIZE+1][BLOCK_SIZE][BLOCK_SIZE+1]; + private static final long[][] MASKS = new long[ENTRY_SIZE][ENTRY_SIZE]; + + static { // Generate shifts + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int[] currentShifts = SHIFTS[elementBits]; + int base = bitPos * FAC_BITPOS; + currentShifts[base ] = bitPos; + currentShifts[base + 1] = BLOCK_SIZE - elementBits; + if (bitPos <= BLOCK_SIZE - elementBits) { // Single block + currentShifts[base + 2] = 0; + MASKS[elementBits][bitPos] = 0; + } else { // Two blocks + int rBits = elementBits - (BLOCK_SIZE - bitPos); + currentShifts[base + 2] = BLOCK_SIZE - rBits; + MASKS[elementBits][bitPos] = ~(~0L << rBits); + } + } + } + } + + /* + * The setter requires more masking than the getter. + */ + private static final long[][] WRITE_MASKS = + new long[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + static { + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + long elementPosMask = ~(~0L << elementBits); + int[] currentShifts = SHIFTS[elementBits]; + long[] currentMasks = WRITE_MASKS[elementBits]; + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int base = bitPos * FAC_BITPOS; + currentMasks[base ] =~((elementPosMask + << currentShifts[base + 1]) + >>> currentShifts[base]); + currentMasks[base+1] = ~(elementPosMask + << currentShifts[base + 2]); + currentMasks[base+2] = currentShifts[base + 2] == 0 ? 0 : ~0; + } + } + } + + /* The bits */ + private long[] blocks; + + // Cached calculations + private int maxPos; // blocks.length * BLOCK_SIZE / elementBits - 1 + private int[] shifts; // The shifts for the current elementBits + private long[] readMasks; + private long[] writeMasks; + + /** + * Creates an array with the internal structures adjusted for the given + * limits and initialized to 0. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + */ + public Packed64(int valueCount, int bitsPerValue) { + // TODO: Test for edge-cases (2^31 values, 63 bitsPerValue) + // +2 due to the avoid-conditionals-trick. The last entry is always 0 + this(new long[(int)((long)valueCount * bitsPerValue / BLOCK_SIZE + 2)], + valueCount, bitsPerValue); + } + + + /** + * Creates an array backed by the given blocks. + *
<p>
+ * Note: The blocks are used directly, so changes to the given block will + * affect the Packed64-structure. + * @param blocks used as the internal backing array. Note that the last + * element cannot be addressed directly. + * @param valueCount the number of values. + * @param bitsPerValue the number of bits available for any given value. + */ + public Packed64(long[] blocks, int valueCount, int bitsPerValue) { + super(valueCount, bitsPerValue); + this.blocks = blocks; + updateCached(); + } + + /** + * Creates an array with content retrieved from the given IndexInput. + * @param in an IndexInput, positioned at the start of Packed64-content. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @throws java.io.IOException if the values for the backing array could not + * be retrieved. + */ + public Packed64(IndexInput in, int valueCount, int bitsPerValue) + throws IOException { + super(valueCount, bitsPerValue); + int size = size(valueCount, bitsPerValue); + blocks = new long[size+1]; // +1 due to non-conditional tricks + // TODO: find a faster way to bulk-read longs... + for(int i=0;i>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + + final int base = bitPos * FAC_BITPOS; + + return ((blocks[elementPos] << shifts[base]) >>> shifts[base+1]) | + ((blocks[elementPos+1] >>> shifts[base+2]) & readMasks[bitPos]); + } + + public void set(final int index, final long value) { + final long majorBitPos = index * bitsPerValue; + final int elementPos = (int)(majorBitPos >>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + final int base = bitPos * FAC_BITPOS; + + blocks[elementPos ] = (blocks[elementPos ] & writeMasks[base]) + | (value << shifts[base + 1] >>> shifts[base]); + blocks[elementPos+1] = (blocks[elementPos+1] & writeMasks[base+1]) + | ((value << shifts[base + 2]) & writeMasks[base+2]); + } + + public String toString() { + return "Packed64(bitsPerValue=" + bitsPerValue + ", size=" + + size() + ", maxPos=" + maxPos + + ", elements.length=" + blocks.length + ")"; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + blocks.length * RamUsageEstimator.NUM_BYTES_LONG; + } + + public void clear() { + Arrays.fill(blocks, 0L); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/PackedInts.java b/lucene/src/java/org/apache/lucene/util/packed/PackedInts.java new file mode 100644 index 00000000000..405431b8de7 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/PackedInts.java @@ -0,0 +1,296 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import java.io.Closeable; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.Constants; + +import java.io.IOException; + +/** + * Simplistic compression for array of unsigned long values. + * Each value is >= 0 and <= a specified maximum value. The + * values are stored as packed ints, with each value + * consuming a fixed number of bits. + * + * @lucene.internal + */ + +public class PackedInts { + + private final static String CODEC_NAME = "PackedInts"; + private final static int VERSION_START = 0; + private final static int VERSION_CURRENT = VERSION_START; + + /** + * A read-only random access array of positive integers. + * @lucene.internal + */ + public static interface Reader { + /** + * @param index the position of the wanted value. + * @return the value at the stated index. + */ + long get(int index); + + /** + * @return the number of bits used to store any given value. + * Note: This does not imply that memory usage is + * {@code bitsPerValue * #values} as implementations are free to + * use non-space-optimal packing of bits. + */ + int getBitsPerValue(); + + /** + * @return the number of values. + */ + int size(); + } + + /** + * Run-once iterator interface, to decode previously saved PackedInts. + */ + public static interface ReaderIterator extends Closeable { + /** Returns next value */ + long next() throws IOException; + /** Returns number of bits per value */ + int getBitsPerValue(); + /** Returns number of values */ + int size(); + } + + /** + * A packed integer array that can be modified. + * @lucene.internal + */ + public static interface Mutable extends Reader { + /** + * Set the value at the given index in the array. + * @param index where the value should be positioned. + * @param value a value conforming to the constraints set by the array. + */ + void set(int index, long value); + + /** + * Sets all values to 0. + */ + + void clear(); + } + + /** + * A simple base for Readers that keeps track of valueCount and bitsPerValue. + * @lucene.internal + */ + public static abstract class ReaderImpl implements Reader { + protected final int bitsPerValue; + protected final int valueCount; + + protected ReaderImpl(int valueCount, int bitsPerValue) { + this.bitsPerValue = bitsPerValue; + assert bitsPerValue > 0 && bitsPerValue <= 64 : "bitsPerValue=" + bitsPerValue; + this.valueCount = valueCount; + } + + public int getBitsPerValue() { + return bitsPerValue; + } + + public int size() { + return valueCount; + } + + public long getMaxValue() { // Convenience method + return maxValue(bitsPerValue); + } + } + + /** A write-once Writer. + * @lucene.internal + */ + public static abstract class Writer { + protected final IndexOutput out; + protected final int bitsPerValue; + protected final int valueCount; + + protected Writer(IndexOutput out, int valueCount, int bitsPerValue) + throws IOException { + assert bitsPerValue <= 64; + + this.out = out; + this.valueCount = valueCount; + this.bitsPerValue = bitsPerValue; + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + out.writeVInt(bitsPerValue); + out.writeVInt(valueCount); + } + + public abstract void add(long v) throws IOException; + public abstract void finish() throws IOException; + } + + /** + * Retrieve PackedInt data from the IndexInput and return a packed int + * structure based on it. + * @param in positioned at the beginning of a stored packed int structure. 
+ * @return a read only random access capable array of positive integers. + * @throws IOException if the structure could not be retrieved. + * @lucene.internal + */ + public static Reader getReader(IndexInput in) throws IOException { + CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START); + final int bitsPerValue = in.readVInt(); + assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; + final int valueCount = in.readVInt(); + + switch (bitsPerValue) { + case 8: + return new Direct8(in, valueCount); + case 16: + return new Direct16(in, valueCount); + case 32: + return new Direct32(in, valueCount); + case 64: + return new Direct64(in, valueCount); + default: + if (Constants.JRE_IS_64BIT || bitsPerValue >= 32) { + return new Packed64(in, valueCount, bitsPerValue); + } else { + return new Packed32(in, valueCount, bitsPerValue); + } + } + } + + /** + * Retrieve PackedInts as a {@link ReaderIterator} + * @param in positioned at the beginning of a stored packed int structure. + * @return an iterator to access the values + * @throws IOException if the structure could not be retrieved. + * @lucene.internal + */ + public static ReaderIterator getReaderIterator(IndexInput in) throws IOException { + CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START); + final int bitsPerValue = in.readVInt(); + assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; + final int valueCount = in.readVInt(); + + return new PackedReaderIterator(bitsPerValue, valueCount, in); + } + + /** + * Create a packed integer array with the given amount of values initialized + * to 0. the valueCount and the bitsPerValue cannot be changed after creation. + * All Mutables known by this factory are kept fully in RAM. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @return a mutable packed integer array. + * @throws java.io.IOException if the Mutable could not be created. With the + * current implementations, this never happens, but the method + * signature allows for future persistence-backed Mutables. + * @lucene.internal + */ + public static Mutable getMutable( + int valueCount, int bitsPerValue) throws IOException { + switch (bitsPerValue) { + case 8: + return new Direct8(valueCount); + case 16: + return new Direct16(valueCount); + case 32: + return new Direct32(valueCount); + case 64: + return new Direct64(valueCount); + default: + if (Constants.JRE_IS_64BIT || bitsPerValue >= 32) { + return new Packed64(valueCount, bitsPerValue); + } else { + return new Packed32(valueCount, bitsPerValue); + } + } + } + + /** + * Create a packed integer array writer for the given number of values at the + * given bits/value. Writers append to the given IndexOutput and has very + * low memory overhead. + * @param out the destination for the produced bits. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @return a Writer ready for receiving values. + * @throws IOException if bits could not be written to out. + * @lucene.internal + */ + public static Writer getWriter(IndexOutput out, int valueCount, int bitsPerValue) + throws IOException { + return new PackedWriter(out, valueCount, bitsPerValue); + } + + /** Returns how many bits are required to hold values up + * to and including maxValue + * @param maxValue the maximum value tha should be representable. + * @return the amount of bits needed to represent values from 0 to maxValue. 
+ * @lucene.internal + */ + public static int bitsRequired(long maxValue) { + // Very high long values does not translate well to double, so we do an + // explicit check for the edge cases + if (maxValue > 0x3FFFFFFFFFFFFFFFL) { + return 63; + } if (maxValue > 0x1FFFFFFFFFFFFFFFL) { + return 62; + } + return Math.max(1, (int) Math.ceil(Math.log(1+maxValue)/Math.log(2.0))); + } + + /** + * Calculates the maximum unsigned long that can be expressed with the given + * number of bits. + * @param bitsPerValue the number of bits available for any given value. + * @return the maximum value for the given bits. + * @lucene.internal + */ + public static long maxValue(int bitsPerValue) { + return bitsPerValue == 64 ? Long.MAX_VALUE : ~(~0L << bitsPerValue); + } + + /** Rounds bitsPerValue up to 8, 16, 32 or 64. */ + public static int getNextFixedSize(int bitsPerValue) { + if (bitsPerValue <= 8) { + return 8; + } else if (bitsPerValue <= 16) { + return 16; + } else if (bitsPerValue <= 32) { + return 32; + } else { + return 64; + } + } + + /** Possibly wastes some storage in exchange for faster lookups */ + public static int getRoundedFixedSize(int bitsPerValue) { + if (bitsPerValue > 58 || (bitsPerValue < 32 && bitsPerValue > 29)) { // 10% space-waste is ok + return getNextFixedSize(bitsPerValue); + } else { + return bitsPerValue; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/PackedReaderIterator.java b/lucene/src/java/org/apache/lucene/util/packed/PackedReaderIterator.java new file mode 100644 index 00000000000..271ec73bc51 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/PackedReaderIterator.java @@ -0,0 +1,84 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.store.IndexInput; + +import java.io.IOException; + +class PackedReaderIterator implements PackedInts.ReaderIterator { + private long pending; + private int pendingBitsLeft; + private final IndexInput in; + private final int bitsPerValue; + private final int valueCount; + + // masks[n-1] masks for bottom n bits + private final long[] masks; + + public PackedReaderIterator(int bitsPerValue, int valueCount, IndexInput in) + throws IOException { + + this.valueCount = valueCount; + this.bitsPerValue = bitsPerValue; + + this.in = in; + + masks = new long[bitsPerValue]; + + long v = 1; + for (int i = 0; i < bitsPerValue; i++) { + v *= 2; + masks[i] = v - 1; + } + } + + public int getBitsPerValue() { + return bitsPerValue; + } + + public int size() { + return valueCount; + } + + public long next() throws IOException { + if (pendingBitsLeft == 0) { + pending = in.readLong(); + pendingBitsLeft = 64; + } + + if (pendingBitsLeft >= bitsPerValue) { + // not split + final long result = (pending >> (pendingBitsLeft - bitsPerValue)) & masks[bitsPerValue-1]; + pendingBitsLeft -= bitsPerValue; + return result; + } else { + // split + final int bits1 = bitsPerValue - pendingBitsLeft; + final long result1 = (pending & masks[pendingBitsLeft-1]) << bits1; + pending = in.readLong(); + final long result2 = (pending >> (64 - bits1)) & masks[bits1-1]; + pendingBitsLeft = 64 + pendingBitsLeft - bitsPerValue; + return result1 | result2; + } + } + + public void close() throws IOException { + in.close(); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/PackedWriter.java b/lucene/src/java/org/apache/lucene/util/packed/PackedWriter.java new file mode 100644 index 00000000000..0cf054991ba --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/PackedWriter.java @@ -0,0 +1,113 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; + +// Packs high order byte first, to match +// IndexOutput.writeInt/Long/Short byte order + +/** + * Generic writer for space-optimal packed values. The resulting bits can be + * used directly by Packed32, Packed64 and PackedDirect* and will always be + * long-aligned. 
+ */ + +class PackedWriter extends PackedInts.Writer { + private long pending; + private int pendingBitPos; + + // masks[n-1] masks for bottom n bits + private final long[] masks; + private int written = 0; + + public PackedWriter(IndexOutput out, int valueCount, int bitsPerValue) + throws IOException { + super(out, valueCount, bitsPerValue); + + pendingBitPos = 64; + masks = new long[bitsPerValue - 1]; + + long v = 1; + for (int i = 0; i < bitsPerValue - 1; i++) { + v *= 2; + masks[i] = v - 1; + } + } + + /** + * Do not call this after finish + */ + @Override + public void add(long v) throws IOException { + assert v <= PackedInts.maxValue(bitsPerValue) : "v=" + v + + " maxValue=" + PackedInts.maxValue(bitsPerValue); + assert v >= 0; + //System.out.println(" packedw add v=" + v + " pendingBitPos=" + pendingBitPos); + + // TODO + if (pendingBitPos >= bitsPerValue) { + // not split + + // write-once, so we can |= w/o first masking to 0s + pending |= v << (pendingBitPos - bitsPerValue); + if (pendingBitPos == bitsPerValue) { + // flush + out.writeLong(pending); + pending = 0; + pendingBitPos = 64; + } else { + pendingBitPos -= bitsPerValue; + } + + } else { + // split + + // write top pendingBitPos bits of value into bottom bits of pending + pending |= (v >> (bitsPerValue - pendingBitPos)) & masks[pendingBitPos - 1]; + //System.out.println(" part1 (v >> " + (bitsPerValue - pendingBitPos) + ") & " + masks[pendingBitPos-1]); + + // flush + out.writeLong(pending); + + // write bottom (bitsPerValue - pendingBitPos) bits of value into top bits of pending + pendingBitPos = 64 - bitsPerValue + pendingBitPos; + //System.out.println(" part2 v << " + pendingBitPos); + pending = (v << pendingBitPos); + } + written++; + } + + @Override + public void finish() throws IOException { + while (written < valueCount) { + add(0L); // Auto flush + } + + if (pendingBitPos != 64) { + out.writeLong(pending); + } + } + + public String toString() { + return "PackedWriter(written " + written + "/" + valueCount + " with " + + bitsPerValue + " bits/value)"; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/packed/package.html b/lucene/src/java/org/apache/lucene/util/packed/package.html new file mode 100644 index 00000000000..b98aa234276 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/packed/package.html @@ -0,0 +1,16 @@ + + + + + +

    + The packed package provides random access capable arrays of positive longs. + The implementations provide different trade-offs between memory usage and + access speed. The standard usage scenario is replacing large int or long + arrays in order to reduce the memory footprint. +
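    + As a quick, illustrative sketch (not part of the patch sources), the sizing helpers on the PackedInts factory can be used to pick a bit width up front; only the static methods shown in this package are assumed:
    + <pre>
    +   int bitsPerValue = PackedInts.bitsRequired(1000);            // 10 bits are enough for values up to 1000
    +   long ceiling     = PackedInts.maxValue(bitsPerValue);        // 1023, the largest value 10 bits can hold
    +   int fixedWidth   = PackedInts.getNextFixedSize(bitsPerValue); // 16, the next byte/short/int/long width
    + </pre>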

    + The main access point is the {@link org.apache.lucene.util.packed.PackedInts} factory. +
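    + A minimal write/read round trip, sketched for illustration only: it assumes the Writer base class emits the same codec header that getReaderIterator checks, uses RAMDirectory purely as scratch storage, and relies on the usual org.apache.lucene.store imports:
    + <pre>
    +   long[] values = {3, 7, 42, 1001};
    +   int bitsPerValue = PackedInts.bitsRequired(1001);
    +   Directory dir = new RAMDirectory();
    +   IndexOutput out = dir.createOutput("packed.bin");
    +   PackedInts.Writer writer = PackedInts.getWriter(out, values.length, bitsPerValue);
    +   for (long v : values) {
    +     writer.add(v);
    +   }
    +   writer.finish();                 // pads the trailing long so the stream stays long-aligned
    +   out.close();
    +
    +   IndexInput in = dir.openInput("packed.bin");
    +   PackedInts.ReaderIterator it = PackedInts.getReaderIterator(in);
    +   for (int i = 0; i &lt; it.size(); i++) {
    +     assert it.next() == values[i];
    +   }
    +   it.close();                      // also closes the underlying IndexInput
    + </pre>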

    + + + diff --git a/lucene/src/test/org/apache/lucene/TestDemo.java b/lucene/src/test/org/apache/lucene/TestDemo.java index 6db6943f746..e938533e6cb 100644 --- a/lucene/src/test/org/apache/lucene/TestDemo.java +++ b/lucene/src/test/org/apache/lucene/TestDemo.java @@ -24,11 +24,13 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; @@ -54,7 +56,8 @@ public class TestDemo extends LuceneTestCase { TEST_VERSION_CURRENT, analyzer).setMaxFieldLength(25000)); Document doc = new Document(); - String text = "This is the text to be indexed."; + String longTerm = "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. " + longTerm; doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.ANALYZED)); iwriter.addDocument(doc); @@ -62,15 +65,17 @@ public class TestDemo extends LuceneTestCase { // Now search the index: IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true + + assertEquals(1, isearcher.search(new TermQuery(new Term("fieldname", longTerm)), 1).totalHits); // Parse a simple query that searches for "text": QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "fieldname", analyzer); Query query = parser.parse("text"); - ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(query, null, 1).scoreDocs; assertEquals(1, hits.length); // Iterate through the results: for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); - assertEquals("This is the text to be indexed.", hitDoc.get("fieldname")); + assertEquals(text, hitDoc.get("fieldname")); } isearcher.close(); directory.close(); diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java new file mode 100644 index 00000000000..e68f4a4fd21 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -0,0 +1,887 @@ +package org.apache.lucene; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.*; +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.lucene.search.*; +import org.apache.lucene.analysis.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.standard.*; +import org.apache.lucene.index.codecs.pulsing.*; +import org.apache.lucene.store.*; +import java.util.*; +import java.io.*; + +/* Intentionally outside of oal.index to verify fully + external codecs work fine */ + +public class TestExternalCodecs extends LuceneTestCase { + + // For fun, test that we can override how terms are + // sorted, and basic things still work -- this comparator + // sorts in reversed unicode code point order: + private static final Comparator reverseUnicodeComparator = new Comparator() { + public int compare(BytesRef t1, BytesRef t2) { + byte[] b1 = t1.bytes; + byte[] b2 = t2.bytes; + int b1Stop; + int b1Upto = t1.offset; + int b2Upto = t2.offset; + if (t1.length < t2.length) { + b1Stop = t1.offset + t1.length; + } else { + b1Stop = t1.offset + t2.length; + } + while(b1Upto < b1Stop) { + final int bb1 = b1[b1Upto++] & 0xff; + final int bb2 = b2[b2Upto++] & 0xff; + if (bb1 != bb2) { + //System.out.println("cmp 1=" + t1 + " 2=" + t2 + " return " + (bb2-bb1)); + return bb2 - bb1; + } + } + + // One is prefix of another, or they are equal + return t2.length-t1.length; + } + + public boolean equals(Object other) { + return this == other; + } + }; + + // TODO + // - good improvement would be to write through to disk, + // and then load into ram from disk + public static class RAMOnlyCodec extends Codec { + + // Postings state: + static class RAMPostings extends FieldsProducer { + final Map fieldToTerms = new TreeMap(); + + @Override + public Terms terms(String field) { + return fieldToTerms.get(field); + } + + @Override + public FieldsEnum iterator() { + return new RAMFieldsEnum(this); + } + + @Override + public void close() { + } + + @Override + public void loadTermsIndex(int indexDivisor) { + } + } + + static class RAMField extends Terms { + final String field; + final SortedMap termToDocs = new TreeMap(); + RAMField(String field) { + this.field = field; + } + + @Override + public long getUniqueTermCount() { + return termToDocs.size(); + } + + @Override + public TermsEnum iterator() { + return new RAMTermsEnum(RAMOnlyCodec.RAMField.this); + } + + @Override + public Comparator getComparator() { + return reverseUnicodeComparator; + } + } + + static class RAMTerm { + final String term; + final List docs = new ArrayList(); + public RAMTerm(String term) { + this.term = term; + } + } + + static class RAMDoc { + final int docID; + final int[] positions; + public RAMDoc(int docID, int freq) { + this.docID = docID; + positions = new int[freq]; + } + } + + // Classes for writing to the postings state + private static class RAMFieldsConsumer extends FieldsConsumer { + + private final RAMPostings postings; + private final RAMTermsConsumer termsConsumer = new RAMTermsConsumer(); + + public RAMFieldsConsumer(RAMPostings postings) { + this.postings = postings; + } + + @Override + public TermsConsumer addField(FieldInfo field) { + RAMField ramField = new RAMField(field.name); + postings.fieldToTerms.put(field.name, ramField); + termsConsumer.reset(ramField); + return termsConsumer; + } + + @Override + public void close() { + // TODO: finalize stuff + } + } + + private static class RAMTermsConsumer extends TermsConsumer { + private RAMField field; + private final RAMPostingsWriterImpl postingsWriter = new 
RAMPostingsWriterImpl(); + RAMTerm current; + + void reset(RAMField field) { + this.field = field; + } + + @Override + public PostingsConsumer startTerm(BytesRef text) { + final String term = text.utf8ToString(); + current = new RAMTerm(term); + postingsWriter.reset(current); + return postingsWriter; + } + + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + + @Override + public void finishTerm(BytesRef text, int numDocs) { + assert numDocs > 0; + assert numDocs == current.docs.size(); + field.termToDocs.put(current.term, current); + } + + @Override + public void finish() { + } + } + + public static class RAMPostingsWriterImpl extends PostingsConsumer { + private RAMTerm term; + private RAMDoc current; + private int posUpto = 0; + + public void reset(RAMTerm term) { + this.term = term; + } + + @Override + public void startDoc(int docID, int freq) { + current = new RAMDoc(docID, freq); + term.docs.add(current); + posUpto = 0; + } + + @Override + public void addPosition(int position, BytesRef payload) { + if (payload != null) { + throw new UnsupportedOperationException("can't handle payloads"); + } + current.positions[posUpto++] = position; + } + + @Override + public void finishDoc() { + assert posUpto == current.positions.length; + } + } + + + // Classes for reading from the postings state + static class RAMFieldsEnum extends FieldsEnum { + private final RAMPostings postings; + private final Iterator it; + private String current; + + public RAMFieldsEnum(RAMPostings postings) { + this.postings = postings; + this.it = postings.fieldToTerms.keySet().iterator(); + } + + @Override + public String next() { + if (it.hasNext()) { + current = it.next(); + } else { + current = null; + } + return current; + } + + @Override + public TermsEnum terms() { + return new RAMTermsEnum(postings.fieldToTerms.get(current)); + } + } + + static class RAMTermsEnum extends TermsEnum { + Iterator it; + String current; + private final RAMField ramField; + + public RAMTermsEnum(RAMField field) { + this.ramField = field; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + + @Override + public BytesRef next() { + if (it == null) { + if (current == null) { + it = ramField.termToDocs.keySet().iterator(); + } else { + it = ramField.termToDocs.tailMap(current).keySet().iterator(); + } + } + if (it.hasNext()) { + current = it.next(); + return new BytesRef(current); + } else { + return null; + } + } + + @Override + public SeekStatus seek(BytesRef term, boolean useCache) { + current = term.utf8ToString(); + it = null; + if (ramField.termToDocs.containsKey(current)) { + return SeekStatus.FOUND; + } else { + if (current.compareTo(ramField.termToDocs.lastKey()) > 0) { + return SeekStatus.END; + } else { + return SeekStatus.NOT_FOUND; + } + } + } + + @Override + public SeekStatus seek(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef term() { + // TODO: reuse BytesRef + return new BytesRef(current); + } + + @Override + public int docFreq() { + return ramField.termToDocs.get(current).docs.size(); + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { + return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) { + return new 
RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), skipDocs); + } + } + + private static class RAMDocsEnum extends DocsEnum { + private final RAMTerm ramTerm; + private final Bits skipDocs; + private RAMDoc current; + int upto = -1; + int posUpto = 0; + + public RAMDocsEnum(RAMTerm ramTerm, Bits skipDocs) { + this.ramTerm = ramTerm; + this.skipDocs = skipDocs; + } + + @Override + public int advance(int targetDocID) { + do { + nextDoc(); + } while (upto < ramTerm.docs.size() && current.docID < targetDocID); + return NO_MORE_DOCS; + } + + // TODO: override bulk read, for better perf + @Override + public int nextDoc() { + while(true) { + upto++; + if (upto < ramTerm.docs.size()) { + current = ramTerm.docs.get(upto); + if (skipDocs == null || !skipDocs.get(current.docID)) { + posUpto = 0; + return current.docID; + } + } else { + return NO_MORE_DOCS; + } + } + } + + @Override + public int freq() { + return current.positions.length; + } + + @Override + public int docID() { + return current.docID; + } + } + + private static class RAMDocsAndPositionsEnum extends DocsAndPositionsEnum { + private final RAMTerm ramTerm; + private final Bits skipDocs; + private RAMDoc current; + int upto = -1; + int posUpto = 0; + + public RAMDocsAndPositionsEnum(RAMTerm ramTerm, Bits skipDocs) { + this.ramTerm = ramTerm; + this.skipDocs = skipDocs; + } + + @Override + public int advance(int targetDocID) { + do { + nextDoc(); + } while (upto < ramTerm.docs.size() && current.docID < targetDocID); + return NO_MORE_DOCS; + } + + // TODO: override bulk read, for better perf + @Override + public int nextDoc() { + while(true) { + upto++; + if (upto < ramTerm.docs.size()) { + current = ramTerm.docs.get(upto); + if (skipDocs == null || !skipDocs.get(current.docID)) { + posUpto = 0; + return current.docID; + } + } else { + return NO_MORE_DOCS; + } + } + } + + @Override + public int freq() { + return current.positions.length; + } + + @Override + public int docID() { + return current.docID; + } + + @Override + public int nextPosition() { + return current.positions[posUpto++]; + } + + @Override + public boolean hasPayload() { + return false; + } + + @Override + public int getPayloadLength() { + return 0; + } + + @Override + public BytesRef getPayload() { + return null; + } + } + + // Holds all indexes created + private final Map state = new HashMap(); + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState writeState) { + RAMPostings postings = new RAMPostings(); + RAMFieldsConsumer consumer = new RAMFieldsConsumer(postings); + synchronized(state) { + state.put(writeState.segmentName, postings); + } + return consumer; + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState readState) + throws IOException { + return state.get(readState.segmentInfo.name); + } + + @Override + public void getExtensions(Set extensions) { + } + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, Set files) { + } + } + + /** Simple Codec that dispatches field-specific codecs. + * You must ensure every field you index has a Codec, or + * the defaultCodec is non null. 
Also, the separate + * codecs cannot conflict on file names.*/ + // TODO: promote to core + public static class PerFieldCodecWrapper extends Codec { + private final Map fields = new IdentityHashMap(); + private final Codec defaultCodec; + + public PerFieldCodecWrapper(Codec defaultCodec) { + name = "PerField"; + this.defaultCodec = defaultCodec; + } + + public void add(String field, Codec codec) { + fields.put(field, codec); + } + + Codec getCodec(String field) { + Codec codec = fields.get(field); + if (codec != null) { + return codec; + } else { + return defaultCodec; + } + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new FieldsWriter(state); + } + + private class FieldsWriter extends FieldsConsumer { + private final SegmentWriteState state; + private final Map codecs = new HashMap(); + private final Set fieldsSeen = new TreeSet(); + + public FieldsWriter(SegmentWriteState state) { + this.state = state; + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + fieldsSeen.add(field.name); + Codec codec = getCodec(field.name); + + FieldsConsumer fields = codecs.get(codec); + if (fields == null) { + fields = codec.fieldsConsumer(state); + codecs.put(codec, fields); + } + return fields.addField(field); + } + + @Override + public void close() throws IOException { + Iterator it = codecs.values().iterator(); + IOException err = null; + while(it.hasNext()) { + try { + it.next().close(); + } catch (IOException ioe) { + // keep first IOException we hit but keep + // closing the rest + if (err == null) { + err = ioe; + } + } + } + if (err != null) { + throw err; + } + } + } + + private class FieldsReader extends FieldsProducer { + + private final Set fields = new TreeSet(); + private final Map codecs = new HashMap(); + + public FieldsReader(Directory dir, FieldInfos fieldInfos, + SegmentInfo si, int readBufferSize, + int indexDivisor) throws IOException { + + final int fieldCount = fieldInfos.size(); + for(int i=0;i it; + private String current; + + public FieldsIterator() { + it = fields.iterator(); + } + + @Override + public String next() { + if (it.hasNext()) { + current = it.next(); + } else { + current = null; + } + + return current; + } + + @Override + public TermsEnum terms() throws IOException { + Terms terms = codecs.get(getCodec(current)).terms(current); + if (terms != null) { + return terms.iterator(); + } else { + return null; + } + } + } + + @Override + public FieldsEnum iterator() throws IOException { + return new FieldsIterator(); + } + + @Override + public Terms terms(String field) throws IOException { + Codec codec = getCodec(field); + + FieldsProducer fields = codecs.get(codec); + assert fields != null; + return fields.terms(field); + } + + @Override + public void close() throws IOException { + Iterator it = codecs.values().iterator(); + IOException err = null; + while(it.hasNext()) { + try { + it.next().close(); + } catch (IOException ioe) { + // keep first IOException we hit but keep + // closing the rest + if (err == null) { + err = ioe; + } + } + } + if (err != null) { + throw err; + } + } + + @Override + public void loadTermsIndex(int indexDivisor) throws IOException { + Iterator it = codecs.values().iterator(); + while(it.hasNext()) { + it.next().loadTermsIndex(indexDivisor); + } + } + } + + public FieldsProducer fieldsProducer(SegmentReadState state) + throws IOException { + return new FieldsReader(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, 
state.termsIndexDivisor); + } + + @Override + public void files(Directory dir, SegmentInfo info, Set files) throws IOException { + Iterator it = fields.values().iterator(); + Set seen = new HashSet(); + while(it.hasNext()) { + final Codec codec = it.next(); + if (!seen.contains(codec)) { + seen.add(codec); + codec.files(dir, info, files); + } + } + } + + @Override + public void getExtensions(Set extensions) { + Iterator it = fields.values().iterator(); + while(it.hasNext()) { + final Codec codec = it.next(); + codec.getExtensions(extensions); + } + } + } + + public static class MyCodecs extends CodecProvider { + PerFieldCodecWrapper perField; + + MyCodecs() { + Codec ram = new RAMOnlyCodec(); + Codec pulsing = new PulsingReverseTermsCodec(); + perField = new PerFieldCodecWrapper(ram); + perField.add("field2", pulsing); + perField.add("id", pulsing); + register(perField); + } + + @Override + public Codec getWriter(SegmentWriteState state) { + return perField; + } + } + + // copied from PulsingCodec, just changing the terms + // comparator + private static class PulsingReverseTermsCodec extends Codec { + + public PulsingReverseTermsCodec() { + name = "PulsingReverseTerms"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state); + + // Terms that have <= freqCutoff number of docs are + // "pulsed" (inlined): + final int freqCutoff = 1; + StandardPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter); + + // Terms dict index + StandardTermsIndexWriter indexWriter; + boolean success = false; + try { + indexWriter = new SimpleStandardTermsIndexWriter(state); + success = true; + } finally { + if (!success) { + pulsingWriter.close(); + } + } + + // Terms dict + success = false; + try { + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator); + success = true; + return ret; + } finally { + if (!success) { + try { + pulsingWriter.close(); + } finally { + indexWriter.close(); + } + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + + StandardPostingsReader docsReader = new StandardPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize); + StandardPostingsReader pulsingReader = new PulsingPostingsReaderImpl(docsReader); + + // Terms dict index reader + StandardTermsIndexReader indexReader; + + boolean success = false; + try { + indexReader = new SimpleStandardTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + reverseUnicodeComparator); + success = true; + } finally { + if (!success) { + pulsingReader.close(); + } + } + + // Terms dict reader + success = false; + try { + FieldsProducer ret = new StandardTermsDictReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + pulsingReader, + state.readBufferSize, + reverseUnicodeComparator, + StandardCodec.TERMS_CACHE_SIZE); + success = true; + return ret; + } finally { + if (!success) { + try { + pulsingReader.close(); + } finally { + indexReader.close(); + } + } + } + } + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, Set files) throws IOException { + StandardPostingsReaderImpl.files(dir, segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set 
extensions) { + StandardCodec.getStandardExtensions(extensions); + } + } + + + // tests storing "id" and "field2" fields as pulsing codec, + // whose term sort is backwards unicode code point, and + // storing "field1" as a custom entirely-in-RAM codec + public void testPerFieldCodec() throws Exception { + + final int NUM_DOCS = 173; + + Directory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, + new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer()).setCodecProvider(new MyCodecs())); + + w.setMergeFactor(3); + Document doc = new Document(); + // uses default codec: + doc.add(new Field("field1", "this field uses the standard codec as the test", Field.Store.NO, Field.Index.ANALYZED)); + // uses pulsing codec: + doc.add(new Field("field2", "this field uses the pulsing codec as the test", Field.Store.NO, Field.Index.ANALYZED)); + + Field idField = new Field("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED); + doc.add(idField); + for(int i=0;i 1); + // test each segment + for(int i=0;i allTerms = new ArrayList(); + //System.out.println("TEST: now verify!!"); + testStraightEnum(r); + testRandomSkips(rand, r); + testRandomSeeks(rand, r); + testBogusFieldTerms(rand, r); + } + + private static void testBogusFieldTerms(Random rand, IndexReader r) throws Exception { + final Fields fields = MultiFields.getFields(r); + if (fields == null) { + return; + } + for(int i=0;i<10;i++) { + final String f = "bogus" + rand.nextInt() + "reallybogus"; + Terms terms = fields.terms(f); + assertTrue(terms == null || terms.iterator().next() == null); + } + } + + private static void testStraightEnum(IndexReader r) throws Exception { + + // straight enum of fields/terms/docs/positions + TermEnum termEnum = r.terms(); + final Fields fields = MultiFields.getFields(r); + if (fields == null) { + return; + } + FieldsEnum fieldsEnum = fields.iterator(); + while(true) { + final String field = fieldsEnum.next(); + if (field == null) { + boolean result = termEnum.next(); + if (result) { + System.out.println("got unexpected term=" + termEnum.term() + " termEnum=" + termEnum); + } + assertFalse(result); + break; + } + TermsEnum terms = fieldsEnum.terms(); + DocsAndPositionsEnum postings = null; + DocsEnum docsEnum = null; + final TermPositions termPos = r.termPositions(); + while(true) { + final BytesRef termRef = terms.next(); + if (termRef == null) { + break; + } else { + assertTrue(termEnum.next()); + Term t = termEnum.term(); + assertEquals(t.field(), field); + assertEquals(t.text(), termRef.utf8ToString()); + assertEquals(termEnum.docFreq(), terms.docFreq()); + //allTerms.add(t); + + postings = terms.docsAndPositions(MultiFields.getDeletedDocs(r), postings); + docsEnum = terms.docs(MultiFields.getDeletedDocs(r), docsEnum); + + final DocsEnum docs; + if (postings != null) { + docs = postings; + } else { + docs = docsEnum; + } + + termPos.seek(t); + while(true) { + final int doc = docs.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + assertFalse(termPos.next()); + break; + } else { + assertTrue(termPos.next()); + assertEquals(termPos.doc(), doc); + assertEquals(termPos.freq(), docs.freq()); + final int freq = docs.freq(); + if (postings == null) { + assertEquals(1, freq); + // Old API did not always do this, + // specifically in the MultiTermPositions + // case when some segs omit positions and + // some don't + //assertEquals(0, termPos.nextPosition()); + assertEquals(false, termPos.isPayloadAvailable()); + } else { + for(int i=0;i commitUserData) throws IOException { + 
r.doCommit(commitUserData); + } + + protected void doClose() throws IOException { + r.doClose(); + } + + public Collection getFieldNames(FieldOption fldOption) { + return r.getFieldNames(fldOption); + } + } + + public static void main(String[] args) throws Exception { + //Directory dir = FSDirectory.open(new File("/x/lucene/wiki.5M/index")); + Directory dir = FSDirectory.open(new File("/x/lucene/flex.wiki.1M/index")); + verifyFlexVsPreFlex(new Random(), dir); + dir.close(); + } +} \ No newline at end of file diff --git a/lucene/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java b/lucene/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java index 7300c62c5bd..392e077045a 100755 --- a/lucene/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java +++ b/lucene/src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.search.PhraseQuery; @@ -47,6 +48,7 @@ public class TestAddIndexesNoOptimize extends LuceneTestCase { addDocs(writer, 100); assertEquals(100, writer.maxDoc()); writer.close(); + _TestUtil.checkIndex(dir); writer = newWriter(aux, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT)).setOpenMode(OpenMode.CREATE)); ((LogMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(false); // use one without a compound file @@ -68,6 +70,7 @@ public class TestAddIndexesNoOptimize extends LuceneTestCase { writer.addIndexesNoOptimize(new Directory[] { aux, aux2 }); assertEquals(190, writer.maxDoc()); writer.close(); + _TestUtil.checkIndex(dir); // make sure the old index is correct verifyNumDocs(aux, 40); @@ -128,12 +131,13 @@ public class TestAddIndexesNoOptimize extends LuceneTestCase { public void testWithPendingDeletes() throws IOException { // main directory - Directory dir = new RAMDirectory(); + Directory dir = new MockRAMDirectory(); // auxiliary directory - Directory aux = new RAMDirectory(); + Directory aux = new MockRAMDirectory(); setUpDirs(dir, aux); IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT)).setOpenMode(OpenMode.APPEND)); + writer.addIndexesNoOptimize(new Directory[] {aux}); // Adds 10 docs, then replaces them with another 10 diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 9587d6494b1..75ddaa4f103 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -26,6 +26,7 @@ import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.OutputStream; import java.util.Arrays; +import java.util.Random; import java.util.Enumeration; import java.util.List; import java.util.ArrayList; @@ -39,14 +40,18 @@ import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import 
org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.BytesRef; /* Verify we can read the pre-2.1 file format, do searches @@ -134,6 +139,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { "24.nocfs", "29.cfs", "29.nocfs", + "30.cfs", + "30.nocfs", }; private void assertCompressedFields29(Directory dir, boolean shouldStillBeCompressed) throws IOException { @@ -201,14 +208,19 @@ public class TestBackwardsCompatibility extends LuceneTestCase { } } - public void testOptimizeOldIndex() throws IOException { + public void testOptimizeOldIndex() throws Exception { int hasTested29 = 0; + + Random rand = newRandom(); for(int i=0;i= 3.0 + if (oldNames[i].compareTo("30.") < 0) continue; + + unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); + String fullPath = fullDir(oldNames[i]); + Directory dir = FSDirectory.open(new File(fullPath)); + IndexSearcher searcher = new IndexSearcher(dir, true); + + for (int id=10; id<15; id++) { + ScoreDoc[] hits = searcher.search(NumericRangeQuery.newIntRange("trieInt", 4, Integer.valueOf(id), Integer.valueOf(id), true, true), 100).scoreDocs; + assertEquals("wrong number of hits", 1, hits.length); + Document d = searcher.doc(hits[0].doc); + assertEquals(String.valueOf(id), d.get("id")); + + hits = searcher.search(NumericRangeQuery.newLongRange("trieLong", 4, Long.valueOf(id), Long.valueOf(id), true, true), 100).scoreDocs; + assertEquals("wrong number of hits", 1, hits.length); + d = searcher.doc(hits[0].doc); + assertEquals(String.valueOf(id), d.get("id")); + } + + // check that also lower-precision fields are ok + ScoreDoc[] hits = searcher.search(NumericRangeQuery.newIntRange("trieInt", 4, Integer.MIN_VALUE, Integer.MAX_VALUE, false, false), 100).scoreDocs; + assertEquals("wrong number of hits", 34, hits.length); + + hits = searcher.search(NumericRangeQuery.newLongRange("trieLong", 4, Long.MIN_VALUE, Long.MAX_VALUE, false, false), 100).scoreDocs; + assertEquals("wrong number of hits", 34, hits.length); + + // check decoding into field cache + int[] fci = FieldCache.DEFAULT.getInts(searcher.getIndexReader(), "trieInt"); + for (int val : fci) { + assertTrue("value in id bounds", val >= 0 && val < 35); + } + + long[] fcl = FieldCache.DEFAULT.getLongs(searcher.getIndexReader(), "trieLong"); + for (long val : fcl) { + assertTrue("value in id bounds", val >= 0L && val < 35L); + } + + searcher.close(); + dir.close(); + rmDir(oldNames[i]); + } + } + } diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java new file mode 100644 index 00000000000..48ac0f71b56 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -0,0 +1,614 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Random; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.index.codecs.sep.SepCodec; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.MultiCodecTestCase; +import org.apache.lucene.util.Version; + +// TODO: test multiple codecs here? + +// TODO +// - test across fields +// - fix this test to run once for all codecs +// - make more docs per term, to test > 1 level skipping +// - test all combinations of payloads/not and omitTF/not +// - test w/ different indexDivisor +// - test field where payload length rarely changes +// - 0-term fields +// - seek/skip to same term/doc i'm already on +// - mix in deleted docs +// - seek, skip beyond end -- assert returns false +// - seek, skip to things that don't exist -- ensure it +// goes to 1 before next one known to exist +// - skipTo(term) +// - skipTo(doc) + +public class TestCodecs extends MultiCodecTestCase { + + private Random RANDOM; + private static String[] fieldNames = new String[] {"one", "two", "three", "four"}; + + private final static int NUM_TEST_ITER = 4000; + private final static int NUM_TEST_THREADS = 3; + private final static int NUM_FIELDS = 4; + private final static int NUM_TERMS_RAND = 50; // must be > 16 to test skipping + private final static int DOC_FREQ_RAND = 500; // must be > 16 to test skipping + private final static int TERM_DOC_FREQ_RAND = 20; + + // start is inclusive and end is exclusive + public int nextInt(final int start, final int end) { + return start + RANDOM.nextInt(end-start); + } + + private int nextInt(final int lim) { + return RANDOM.nextInt(lim); + } + + char[] getRandomText() { + + final int len = 1+this.nextInt(10); + final char[] buffer = new char[len+1]; + for(int i=0;i termsSeen = new HashSet(); + + for(int i=0;i=0;i--) { + assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(new BytesRef(field.terms[i].text2))); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + } + + // Seek to each term by ord, backwards + for(int i=field.terms.length-1;i>=0;i--) { + assertEquals(Thread.currentThread().getName() + ": 
field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + } + + // Seek to non-existent empty-string term + status = termsEnum.seek(new BytesRef("")); + assertNotNull(status); + assertEquals(status, TermsEnum.SeekStatus.NOT_FOUND); + + // Make sure we're now pointing to first term + assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[0].text2))); + + // Test docs enum + termsEnum.seek(new BytesRef("")); + upto = 0; + do { + term = field.terms[upto]; + if (TestCodecs.this.nextInt(3) == 1) { + final DocsEnum docs = termsEnum.docs(null, null); + final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(null, null); + + final DocsEnum docsEnum; + if (postings != null) { + docsEnum = postings; + } else { + docsEnum = docs; + } + int upto2 = -1; + while(upto2 < term.docs.length-1) { + // Maybe skip: + final int left = term.docs.length-upto2; + int doc; + if (TestCodecs.this.nextInt(3) == 1 && left >= 1) { + final int inc = 1+TestCodecs.this.nextInt(left-1); + upto2 += inc; + if (TestCodecs.this.nextInt(2) == 1) { + doc = docsEnum.advance(term.docs[upto2]); + assertEquals(term.docs[upto2], doc); + } else { + doc = docsEnum.advance(1+term.docs[upto2]); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + // skipped past last doc + assert upto2 == term.docs.length-1; + break; + } else { + // skipped to next doc + assert upto2 < term.docs.length-1; + if (doc >= term.docs[1+upto2]) { + upto2++; + } + } + } + } else { + doc = docsEnum.nextDoc(); + assertTrue(doc != -1); + upto2++; + } + assertEquals(term.docs[upto2], doc); + if (!field.omitTF) { + assertEquals(term.positions[upto2].length, docsEnum.freq()); + if (TestCodecs.this.nextInt(2) == 1) { + this.verifyPositions(term.positions[upto2], postings); + } + } + } + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc()); + } + upto++; + + } while (termsEnum.next() != null); + + assertEquals(upto, field.terms.length); + } + } + } + + private void write(final FieldInfos fieldInfos, final Directory dir, final FieldData[] fields) throws Throwable { + + final int termIndexInterval = this.nextInt(13, 27); + + final SegmentWriteState state = new SegmentWriteState(null, dir, SEGMENT, fieldInfos, null, 10000, 10000, termIndexInterval, + CodecProvider.getDefault()); + + final FieldsConsumer consumer = state.codec.fieldsConsumer(state); + Arrays.sort(fields); + for (final FieldData field : fields) { + field.write(consumer); + } + consumer.close(); + } +} diff --git a/lucene/src/test/org/apache/lucene/index/TestDoc.java b/lucene/src/test/org/apache/lucene/index/TestDoc.java index a7ce98f4cf4..394e1f06e9b 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/src/test/org/apache/lucene/index/TestDoc.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.index.codecs.CodecProvider; /** JUnit adaptation of an older test case DocTest. 
*/ @@ -185,20 +186,24 @@ public class TestDoc extends LuceneTestCase { SegmentReader r1 = SegmentReader.get(true, si1, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); SegmentReader r2 = SegmentReader.get(true, si2, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); - SegmentMerger merger = new SegmentMerger(si1.dir, merged); + SegmentMerger merger = new SegmentMerger(si1.dir, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL, merged, null, CodecProvider.getDefault()); merger.add(r1); merger.add(r2); merger.merge(); merger.closeReaders(); + final SegmentInfo info = new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, + useCompoundFile, true, -1, null, false, merger.hasProx(), + merger.getCodec()); + if (useCompoundFile) { - List filesToDelete = merger.createCompoundFile(merged + ".cfs"); + List filesToDelete = merger.createCompoundFile(merged + ".cfs", info); for (final String fileToDelete : filesToDelete) si1.dir.deleteFile(fileToDelete); } - return new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, useCompoundFile, true); + return info; } diff --git a/lucene/src/test/org/apache/lucene/index/TestFlex.java b/lucene/src/test/org/apache/lucene/index/TestFlex.java new file mode 100644 index 00000000000..0219488d59e --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/TestFlex.java @@ -0,0 +1,84 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.*; +import java.util.*; +import org.apache.lucene.store.*; +import org.apache.lucene.search.*; +import org.apache.lucene.analysis.*; +import org.apache.lucene.document.*; +import org.apache.lucene.util.*; + +public class TestFlex extends LuceneTestCase { + + // Test non-flex API emulated on flex index + public void testNonFlex() throws Exception { + Directory d = new MockRAMDirectory(); + + final int DOC_COUNT = 177; + + IndexWriter w = new IndexWriter(d, new WhitespaceAnalyzer(), + IndexWriter.MaxFieldLength.UNLIMITED); + + for(int iter=0;iter<2;iter++) { + if (iter == 0) { + w.setMaxBufferedDocs(7); + Document doc = new Document(); + doc.add(new Field("field1", "this is field1", Field.Store.NO, Field.Index.ANALYZED)); + doc.add(new Field("field2", "this is field2", Field.Store.NO, Field.Index.ANALYZED)); + doc.add(new Field("field3", "aaa", Field.Store.NO, Field.Index.ANALYZED)); + doc.add(new Field("field4", "bbb", Field.Store.NO, Field.Index.ANALYZED)); + for(int i=0;i 0) { - s += "\n "; - } - s += l[i]; - } - return s; - } - public void testOpenReaderAfterDelete() throws IOException { File dirFile = new File(TEMP_DIR, "deletetest"); Directory dir = FSDirectory.open(dirFile); @@ -1410,7 +1383,7 @@ public class TestIndexReader extends LuceneTestCase writer.close(); SegmentInfos sis = new SegmentInfos(); - sis.read(d); + sis.read(d, CodecProvider.getDefault()); IndexReader r = IndexReader.open(d, false); IndexCommit c = r.getIndexCommit(); @@ -1597,6 +1570,7 @@ public class TestIndexReader extends LuceneTestCase // LUCENE-1579: Ensure that on a cloned reader, segments // reuse the doc values arrays in FieldCache public void testFieldCacheReuseAfterClone() throws Exception { + //Codec.DEBUG = true; Directory dir = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); Document doc = new Document(); @@ -1750,7 +1724,6 @@ public class TestIndexReader extends LuceneTestCase } catch (IllegalStateException ise) { // expected } - assertFalse(((SegmentReader) r.getSequentialSubReaders()[0]).termsIndexLoaded()); assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor()); writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); @@ -1763,10 +1736,13 @@ public class TestIndexReader extends LuceneTestCase IndexReader[] subReaders = r2.getSequentialSubReaders(); assertEquals(2, subReaders.length); for(int i=0;i<2;i++) { - assertFalse(((SegmentReader) subReaders[i]).termsIndexLoaded()); + try { + subReaders[i].docFreq(new Term("field", "f")); + fail("did not hit expected exception"); + } catch (IllegalStateException ise) { + // expected + } } - r2.close(); - dir.close(); } // LUCENE-2046 diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java index 1db7a98f48b..f7a8855c1f4 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java @@ -863,6 +863,8 @@ public class TestIndexReaderReopen extends LuceneTestCase { assertReaderClosed(reader, true, true); assertReaderClosed(firstReader, true, true); + FlexTestUtil.verifyFlexVsPreFlex(rnd, dir); + dir.close(); } diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java index 
ba9bf34cfeb..e320e273200 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -23,11 +23,13 @@ import java.io.IOException; import java.io.PrintStream; import java.io.Reader; import java.io.StringReader; +import java.util.List; import java.util.ArrayList; import java.util.Arrays; +import java.util.Set; +import java.util.HashSet; import java.util.HashMap; import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Random; import java.util.concurrent.atomic.AtomicBoolean; @@ -49,6 +51,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; @@ -72,6 +75,7 @@ import org.apache.lucene.store.SingleInstanceLockFactory; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.ThreadInterruptedException; +import org.apache.lucene.util.BytesRef; public class TestIndexWriter extends LuceneTestCase { public TestIndexWriter(String name) { @@ -525,7 +529,7 @@ public class TestIndexWriter extends LuceneTestCase { String[] startFiles = dir.listAll(); SegmentInfos infos = new SegmentInfos(); infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); + new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null, CodecProvider.getDefault()); String[] endFiles = dir.listAll(); Arrays.sort(startFiles); @@ -544,13 +548,12 @@ public class TestIndexWriter extends LuceneTestCase { IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT))); - char[] chars = new char[DocumentsWriter.CHAR_BLOCK_SIZE-1]; + char[] chars = new char[DocumentsWriter.MAX_TERM_LENGTH_UTF8]; Arrays.fill(chars, 'x'); Document doc = new Document(); final String bigTerm = new String(chars); - // Max length term is 16383, so this contents produces - // a too-long term: + // This produces a too-long term: String contents = "abc xyz x" + bigTerm + " another term"; doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc); @@ -3306,7 +3309,7 @@ public class TestIndexWriter extends LuceneTestCase { // LUCENE-510 public void testAllUnicodeChars() throws Throwable { - UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); + BytesRef utf8 = new BytesRef(10); UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); char[] chars = new char[2]; for(int ch=0;ch<0x0010FFFF;ch++) { @@ -3326,16 +3329,16 @@ public class TestIndexWriter extends LuceneTestCase { UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8); String s1 = new String(chars, 0, len); - String s2 = new String(utf8.result, 0, utf8.length, "UTF-8"); + String s2 = new String(utf8.bytes, 0, utf8.length, "UTF-8"); assertEquals("codepoint " + ch, s1, s2); - UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16); + UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16); assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length)); byte[] b = s1.getBytes("UTF-8"); assertEquals(utf8.length, b.length); for(int j=0;j allTerms, boolean isTop) throws IOException { + 
TermsEnum terms = MultiFields.getFields(r).terms("f").iterator(); + + char[] last = new char[2]; + int lastLength = 0; + + Set seenTerms = new HashSet(); + + UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + while(true) { + final BytesRef term = terms.next(); + if (term == null) { + break; + } + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + assertTrue(utf16.length <= 2); + + // Make sure last term comes before current one, in + // UTF16 sort order + int i = 0; + for(i=0;i it = seenTerms.iterator(); + while(it.hasNext()) { + BytesRef tr = new BytesRef(it.next()); + assertEquals("seek failed for term=" + termDesc(tr.utf8ToString()), + TermsEnum.SeekStatus.FOUND, + terms.seek(tr)); + } + } + + private final String asUnicodeChar(char c) { + return "U+" + Integer.toHexString(c); + } + + private final String termDesc(String s) { + final String s0; + assertTrue(s.length() <= 2); + if (s.length() == 1) { + s0 = asUnicodeChar(s.charAt(0)); + } else { + s0 = asUnicodeChar(s.charAt(0)) + "," + asUnicodeChar(s.charAt(1)); + } + return s0; + } + + // Make sure terms, including ones with surrogate pairs, + // sort in UTF16 sort order by default + public void testTermUTF16SortOrder() throws Throwable { + Directory dir = new MockRAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + Document d = new Document(); + // Single segment + Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED); + d.add(f); + char[] chars = new char[2]; + Random rnd = newRandom(); + final Set allTerms = new HashSet(); + + for(int i=0;i<200;i++) { + + final String s; + if (rnd.nextBoolean()) { + // Single char + if (rnd.nextBoolean()) { + // Above surrogates + chars[0] = (char) getInt(rnd, 1+UnicodeUtil.UNI_SUR_LOW_END, 0xffff); + } else { + // Below surrogates + chars[0] = (char) getInt(rnd, 0, UnicodeUtil.UNI_SUR_HIGH_START-1); + } + s = new String(chars, 0, 1); + } else { + // Surrogate pair + chars[0] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_HIGH_START, UnicodeUtil.UNI_SUR_HIGH_END); + assertTrue(((int) chars[0]) >= UnicodeUtil.UNI_SUR_HIGH_START && ((int) chars[0]) <= UnicodeUtil.UNI_SUR_HIGH_END); + chars[1] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_LOW_START, UnicodeUtil.UNI_SUR_LOW_END); + s = new String(chars, 0, 2); + } + allTerms.add(s); + f.setValue(s); + + //System.out.println("add " + termDesc(s)); + writer.addDocument(d); + + if ((1+i) % 42 == 0) { + writer.commit(); + } + } + + IndexReader r = writer.getReader(); + + // Test each sub-segment + final IndexReader[] subs = r.getSequentialSubReaders(); + assertEquals(5, subs.length); + for(int i=0;i 256 + for(int i=0;i<300;i++) { + s.append(' ').append(""+i); + } + Document d = new Document(); + Field f = new Field("field", s.toString(), Field.Store.NO, Field.Index.ANALYZED); + d.add(f); + w.addDocument(d); + IndexReader r = w.getReader(2).getSequentialSubReaders()[0]; + TermsEnum t = r.fields().terms("field").iterator(); + int count = 0; + while(t.next() != null) { + final DocsEnum docs = t.docs(null, null); + assertEquals(0, docs.nextDoc()); + assertEquals(DocsEnum.NO_MORE_DOCS, docs.nextDoc()); + count++; + } + assertEquals(300, count); + r.close(); + w.close(); + dir.close(); + } + public void testDeleteUnusedFiles() throws Exception { for(int iter=0;iter<2;iter++) { diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java index e8f144e1b72..20f6d35f4db 
100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.index.DocumentsWriter.IndexingChain; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; @@ -81,6 +82,7 @@ public class TestIndexWriterConfig extends LuceneTestCaseJ4 { assertEquals(IndexWriterConfig.DEFAULT_READER_POOLING, conf.getReaderPooling()); assertTrue(DocumentsWriter.defaultIndexingChain == conf.getIndexingChain()); assertNull(conf.getMergedSegmentWarmer()); + assertEquals(IndexWriterConfig.DEFAULT_CODEC_PROVIDER, CodecProvider.getDefault()); assertEquals(IndexWriterConfig.DEFAULT_MAX_THREAD_STATES, conf.getMaxThreadStates()); assertEquals(LogByteSizeMergePolicy.class, conf.getMergePolicy().getClass()); @@ -101,6 +103,7 @@ public class TestIndexWriterConfig extends LuceneTestCaseJ4 { getters.add("getMaxBufferedDocs"); getters.add("getIndexingChain"); getters.add("getMergedSegmentWarmer"); + getters.add("getCodecProvider"); getters.add("getMergePolicy"); getters.add("getMaxThreadStates"); getters.add("getReaderPooling"); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java index 3e0b57a873a..c5e3383f76d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java @@ -18,7 +18,6 @@ package org.apache.lucene.index; */ import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; @@ -770,30 +769,22 @@ public class TestIndexWriterDelete extends LuceneTestCase { } } - String[] startFiles = dir.listAll(); - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); - String[] endFiles = dir.listAll(); - - if (!Arrays.equals(startFiles, endFiles)) { - fail("docswriter abort() failed to delete unreferenced files:\n before delete:\n " - + arrayToString(startFiles) + "\n after delete:\n " - + arrayToString(endFiles)); - } - + TestIndexWriter.assertNoUnreferencedFiles(dir, "docsWriter.abort() failed to delete unreferenced files"); modifier.close(); - } - private String arrayToString(String[] l) { - String s = ""; - for (int i = 0; i < l.length; i++) { - if (i > 0) { - s += "\n "; - } - s += l[i]; + public void testDeleteNullQuery() throws IOException { + Directory dir = new MockRAMDirectory(); + IndexWriter modifier = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + + for (int i = 0; i < 5; i++) { + addDoc(modifier, i, 2*i); } - return s; + + modifier.deleteDocuments(new TermQuery(new Term("nada", "nada"))); + modifier.commit(); + assertEquals(5, modifier.numDocs()); + modifier.close(); + dir.close(); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java index 511fdbe57c6..5ec3fe32764 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java +++ 
b/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java @@ -84,7 +84,6 @@ public class TestIndexWriterReader extends LuceneTestCase { // get a reader IndexReader r1 = writer.getReader(); - assertTrue(r1.isCurrent()); String id10 = r1.document(10).getField("id").stringValue(); @@ -92,20 +91,15 @@ public class TestIndexWriterReader extends LuceneTestCase { newDoc.removeField("id"); newDoc.add(new Field("id", Integer.toString(8000), Store.YES, Index.NOT_ANALYZED)); writer.updateDocument(new Term("id", id10), newDoc); - assertFalse(r1.isCurrent()); IndexReader r2 = writer.getReader(); - assertTrue(r2.isCurrent()); assertEquals(0, count(new Term("id", id10), r2)); assertEquals(1, count(new Term("id", Integer.toString(8000)), r2)); r1.close(); writer.close(); - assertTrue(r2.isCurrent()); IndexReader r3 = IndexReader.open(dir1, true); - assertTrue(r3.isCurrent()); - assertTrue(r2.isCurrent()); assertEquals(0, count(new Term("id", id10), r3)); assertEquals(1, count(new Term("id", Integer.toString(8000)), r3)); @@ -149,18 +143,9 @@ public class TestIndexWriterReader extends LuceneTestCase { createIndexNoClose(!optimize, "index2", writer2); writer2.close(); - IndexReader r0 = writer.getReader(); - assertTrue(r0.isCurrent()); writer.addIndexesNoOptimize(new Directory[] { dir2 }); - assertFalse(r0.isCurrent()); - r0.close(); IndexReader r1 = writer.getReader(); - assertTrue(r1.isCurrent()); - - writer.commit(); - assertTrue(r1.isCurrent()); - assertEquals(200, r1.maxDoc()); int index2df = r1.docFreq(new Term("indexname", "index2")); diff --git a/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java b/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java index f4ccee3441c..88f9d30ff5f 100755 --- a/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java +++ b/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java @@ -48,7 +48,7 @@ public class TestLazyProxSkipping extends LuceneTestCase { @Override public IndexInput openInput(String name) throws IOException { IndexInput ii = super.openInput(name); - if (name.endsWith(".prx")) { + if (name.endsWith(".prx") || name.endsWith(".pos") ) { // we decorate the proxStream with a wrapper class that allows to count the number of calls of seek() ii = new SeeksCountingStream(ii); } @@ -107,7 +107,7 @@ public class TestLazyProxSkipping extends LuceneTestCase { // check if the number of calls of seek() does not exceed the number of hits assertTrue(this.seeksCounter > 0); - assertTrue(this.seeksCounter <= numHits + 1); + assertTrue("seeksCounter=" + this.seeksCounter + " numHits=" + numHits, this.seeksCounter <= numHits + 1); } public void testLazySkipping() throws IOException { diff --git a/lucene/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java b/lucene/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java index 4485fd74cf8..e387735829b 100644 --- a/lucene/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java +++ b/lucene/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java @@ -29,8 +29,9 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; import org.apache.lucene.util.LuceneTestCase; /** @@ -42,8 +43,18 @@ import org.apache.lucene.util.LuceneTestCase; * */ public class 
TestMultiLevelSkipList extends LuceneTestCase { + + class CountingRAMDirectory extends MockRAMDirectory { + public IndexInput openInput(String fileName) throws IOException { + IndexInput in = super.openInput(fileName); + if (fileName.endsWith(".frq")) + in = new CountingStream(in); + return in; + } + } + public void testSimpleSkip() throws IOException { - RAMDirectory dir = new RAMDirectory(); + Directory dir = new CountingRAMDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())); Term term = new Term("test", "a"); for (int i = 0; i < 5000; i++) { @@ -56,9 +67,8 @@ public class TestMultiLevelSkipList extends LuceneTestCase { writer.close(); IndexReader reader = SegmentReader.getOnlySegmentReader(dir); - SegmentTermPositions tp = (SegmentTermPositions) reader.termPositions(); - tp.freqStream = new CountingStream(tp.freqStream); - + TermPositions tp = reader.termPositions(); + for (int i = 0; i < 2; i++) { counter = 0; tp.seek(term); diff --git a/lucene/src/test/org/apache/lucene/index/TestNorms.java b/lucene/src/test/org/apache/lucene/index/TestNorms.java index 9d1f2683609..a1be843c53f 100755 --- a/lucene/src/test/org/apache/lucene/index/TestNorms.java +++ b/lucene/src/test/org/apache/lucene/index/TestNorms.java @@ -186,6 +186,7 @@ public class TestNorms extends LuceneTestCase { assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001); } } + ir.close(); } private void addDocs(Directory dir, int ndocs, boolean compound) throws IOException { diff --git a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java index d8dda086f42..f020323a982 100644 --- a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java +++ b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java @@ -19,6 +19,7 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.Collection; +import java.util.Random; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -26,13 +27,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Searcher; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockRAMDirectory; @@ -85,20 +80,26 @@ public class TestOmitTf extends LuceneTestCase { // keep things constant d = new Document(); - // Reverese + // Reverse f1.setOmitTermFreqAndPositions(true); d.add(f1); f2.setOmitTermFreqAndPositions(false); d.add(f2); + Random rnd = newRandom(); + writer.addDocument(d); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); + // force merge writer.optimize(); // flush writer.close(); _TestUtil.checkIndex(ram); + FlexTestUtil.verifyFlexVsPreFlex(rnd, ram); + SegmentReader reader = SegmentReader.getOnlySegmentReader(ram); FieldInfos fi = reader.fieldInfos(); assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f1").omitTermFreqAndPositions); @@ -144,8 +145,12 @@ public class TestOmitTf extends LuceneTestCase { 
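The CountingRAMDirectory above replaces the old trick of reaching into SegmentTermPositions.freqStream: instead of patching reader internals, the test decorates Directory.openInput so the stream for a particular file extension can be wrapped and observed. A hedged sketch of the same decorator pattern (class and field names here are illustrative, not from the patch):

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.MockRAMDirectory;

// Illustrative decorator: counts bytes read from ".frq" files opened through this directory.
class ByteCountingDirectory extends MockRAMDirectory {
  long bytesRead;

  @Override
  public IndexInput openInput(String name) throws IOException {
    IndexInput in = super.openInput(name);
    return name.endsWith(".frq") ? new CountingInput(in) : in;
  }

  private class CountingInput extends IndexInput {
    private final IndexInput delegate;
    CountingInput(IndexInput delegate) { this.delegate = delegate; }

    @Override public byte readByte() throws IOException { bytesRead++; return delegate.readByte(); }
    @Override public void readBytes(byte[] b, int offset, int len) throws IOException {
      bytesRead += len;
      delegate.readBytes(b, offset, len);
    }
    @Override public void seek(long pos) throws IOException { delegate.seek(pos); }
    @Override public long getFilePointer() { return delegate.getFilePointer(); }
    @Override public long length() { return delegate.length(); }
    @Override public void close() throws IOException { delegate.close(); }
  }
}

The same approach is what TestLazyProxSkipping above uses to count seeks on the ".prx"/".pos" streams.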
for(int i=0;i<30;i++) writer.addDocument(d); + Random rnd = newRandom(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); + // force merge writer.optimize(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); // flush writer.close(); @@ -289,6 +294,15 @@ public class TestOmitTf extends LuceneTestCase { TermQuery q3 = new TermQuery(c); TermQuery q4 = new TermQuery(d); + PhraseQuery pq = new PhraseQuery(); + pq.add(a); + pq.add(c); + try { + searcher.search(pq, 10); + fail("did not hit expected exception"); + } catch (IllegalStateException ise) { + // expected + } searcher.search(q1, new CountingHitCollector() { @@ -380,7 +394,7 @@ public class TestOmitTf extends LuceneTestCase { super.collect(doc); } }); - assertTrue(15 == CountingHitCollector.getCount()); + assertEquals(15, CountingHitCollector.getCount()); searcher.close(); dir.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestPayloads.java b/lucene/src/test/org/apache/lucene/index/TestPayloads.java index ce6640ed69f..3c395a22b1b 100644 --- a/lucene/src/test/org/apache/lucene/index/TestPayloads.java +++ b/lucene/src/test/org/apache/lucene/index/TestPayloads.java @@ -39,7 +39,8 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; @@ -99,7 +100,7 @@ public class TestPayloads extends LuceneTestCase { // payload bit in the FieldInfo public void testPayloadFieldBit() throws Exception { rnd = newRandom(); - Directory ram = new RAMDirectory(); + Directory ram = new MockRAMDirectory(); PayloadAnalyzer analyzer = new PayloadAnalyzer(); IndexWriter writer = new IndexWriter(ram, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); Document d = new Document(); @@ -139,6 +140,9 @@ public class TestPayloads extends LuceneTestCase { analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1); analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3); writer.addDocument(d); + + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); + // force merge writer.optimize(); // flush @@ -149,14 +153,15 @@ public class TestPayloads extends LuceneTestCase { assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads); - reader.close(); + reader.close(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, ram); } // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory public void testPayloadsEncoding() throws Exception { rnd = newRandom(); // first perform the test using a RAMDirectory - Directory dir = new RAMDirectory(); + Directory dir = new MockRAMDirectory(); performTest(dir); // now use a FSDirectory and repeat same test @@ -215,7 +220,9 @@ public class TestPayloads extends LuceneTestCase { writer.addDocument(d); } + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); writer.optimize(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); // flush writer.close(); @@ -260,11 +267,17 @@ public class TestPayloads extends LuceneTestCase { TermPositions tp = reader.termPositions(terms[0]); tp.next(); tp.nextPosition(); + // NOTE: prior rev of this test was 
failing to first + // call next here: + tp.next(); // now we don't read this payload tp.nextPosition(); assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); byte[] payload = tp.getPayload(null, 0); assertEquals(payload[0], payloadData[numTerms]); + // NOTE: prior rev of this test was failing to first + // call next here: + tp.next(); tp.nextPosition(); // we don't read this payload and skip to a different document @@ -321,7 +334,9 @@ public class TestPayloads extends LuceneTestCase { writer.addDocument(d); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); writer.optimize(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); // flush writer.close(); @@ -469,7 +484,7 @@ public class TestPayloads extends LuceneTestCase { final int numDocs = 50; final ByteArrayPool pool = new ByteArrayPool(numThreads, 5); - Directory dir = new RAMDirectory(); + Directory dir = new MockRAMDirectory(); final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); final String field = "test"; @@ -563,13 +578,13 @@ public class TestPayloads extends LuceneTestCase { } } - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); + private BytesRef utf8Result = new BytesRef(10); synchronized String bytesToString(byte[] bytes) { String s = new String(bytes); UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); try { - return new String(utf8Result.result, 0, utf8Result.length, "UTF-8"); + return new String(utf8Result.bytes, 0, utf8Result.length, "UTF-8"); } catch (UnsupportedEncodingException uee) { return null; } diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java index 6643c956698..d98194d927d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -18,9 +18,11 @@ package org.apache.lucene.index; */ import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.document.Document; +import org.apache.lucene.index.codecs.CodecProvider; import java.io.IOException; import java.util.Collection; @@ -63,14 +65,16 @@ public class TestSegmentMerger extends LuceneTestCase { } public void testMerge() throws IOException { - SegmentMerger merger = new SegmentMerger(mergedDir, mergedSegment); + SegmentMerger merger = new SegmentMerger(mergedDir, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL, mergedSegment, null, CodecProvider.getDefault()); merger.add(reader1); merger.add(reader2); int docsMerged = merger.merge(); merger.closeReaders(); assertTrue(docsMerged == 2); //Should be able to open a new SegmentReader against the new directory - SegmentReader mergedReader = SegmentReader.get(true, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, true), IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); + SegmentReader mergedReader = SegmentReader.get(false, mergedDir, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, true, + -1, null, false, merger.hasProx(), merger.getCodec()), BufferedIndexInput.BUFFER_SIZE, true, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null); + assertTrue(mergedReader != null); assertTrue(mergedReader.numDocs() == 2); Document newDoc1 = mergedReader.document(0); diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java 
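The bytesToString change above tracks the move from UnicodeUtil.UTF8Result to BytesRef as the carrier for UTF-8 encoded term bytes. A small round-trip sketch using only the calls that appear in this patch:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;

// Round-trips a String through the UTF-8 form that the flex API passes around as BytesRef.
class BytesRefRoundTrip {
  public static void main(String[] args) {
    String s = "ab\uD866\uDF05";        // includes a supplementary character
    BytesRef utf8 = new BytesRef(10);   // reusable scratch buffer; grown as needed
    UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
    String back = utf8.utf8ToString();  // decodes bytes[offset..offset+length) as UTF-8
    System.out.println(s.equals(back)); // expected: true
  }
}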
b/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java index 901791c1257..3a855f16541 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java @@ -136,6 +136,9 @@ public class TestSegmentReader extends LuceneTestCase { TermPositions positions = reader.termPositions(); assertTrue(positions != null); positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); + // NOTE: prior rev of this test was failing to first + // call next here: + assertTrue(positions.next()); assertTrue(positions.doc() == 0); assertTrue(positions.nextPosition() >= 0); } diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentTermDocs.java b/lucene/src/test/org/apache/lucene/index/TestSegmentTermDocs.java index c13438e61d2..22c90fa3d10 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentTermDocs.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentTermDocs.java @@ -56,13 +56,13 @@ public class TestSegmentTermDocs extends LuceneTestCase { SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); assertEquals(indexDivisor, reader.getTermInfosIndexDivisor()); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); - if (segTermDocs.next() == true) - { - int docId = segTermDocs.doc(); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); + if (termDocs.next() == true) { + int docId = termDocs.doc(); assertTrue(docId == 0); - int freq = segTermDocs.freq(); + int freq = termDocs.freq(); assertTrue(freq == 3); } reader.close(); @@ -77,18 +77,20 @@ public class TestSegmentTermDocs extends LuceneTestCase { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - segTermDocs.seek(new Term("textField2", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("textField2", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - segTermDocs.seek(new Term("junk", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("junk", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java b/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java index c760c9fb446..e06fa91ce30 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java @@ -67,14 +67,16 @@ public class TestSegmentTermEnum extends LuceneTestCase { addDoc(writer, "aaa bbb"); writer.close(); SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); - SegmentTermEnum termEnum = (SegmentTermEnum) reader.terms(); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.term().text()); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.prev().text()); - assertEquals("bbb", 
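The rewritten assertions above go through the public TermDocs interface rather than the SegmentTermDocs implementation class. For reference, the general pattern for enumerating the postings of a single term through that interface, as these tests now do:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

class TermDocsExample {
  // Prints every document (and its within-document frequency) for one term.
  static void dumpPostings(IndexReader reader, Term term) throws IOException {
    TermDocs termDocs = reader.termDocs();
    termDocs.seek(term);            // position the enumeration on this term
    while (termDocs.next()) {       // returns false once the postings are exhausted
      System.out.println("doc=" + termDocs.doc() + " freq=" + termDocs.freq());
    }
    termDocs.close();
  }
}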
termEnum.term().text()); - assertFalse(termEnum.next()); - assertEquals("bbb", termEnum.prev().text()); + TermsEnum terms = reader.fields().terms("content").iterator(); + assertNotNull(terms.next()); + assertEquals("aaa", terms.term().utf8ToString()); + assertNotNull(terms.next()); + long ordB = terms.ord(); + assertEquals("bbb", terms.term().utf8ToString()); + assertNull(terms.next()); + + assertEquals(TermsEnum.SeekStatus.FOUND, terms.seek(ordB)); + assertEquals("bbb", terms.term().utf8ToString()); } private void verifyDocFreq() diff --git a/lucene/src/test/org/apache/lucene/index/TestStressIndexing.java b/lucene/src/test/org/apache/lucene/index/TestStressIndexing.java index 53de4bd3e4b..1bc66a190ee 100644 --- a/lucene/src/test/org/apache/lucene/index/TestStressIndexing.java +++ b/lucene/src/test/org/apache/lucene/index/TestStressIndexing.java @@ -26,7 +26,7 @@ import org.apache.lucene.search.*; import java.util.Random; import java.io.File; -public class TestStressIndexing extends LuceneTestCase { +public class TestStressIndexing extends MultiCodecTestCase { private Random RANDOM; private static abstract class TimedThread extends Thread { @@ -152,6 +152,8 @@ public class TestStressIndexing extends LuceneTestCase { modifier.close(); + FlexTestUtil.verifyFlexVsPreFlex(RANDOM, directory); + for(int i=0;i docs = indexRandom(nThreads, iter, range, dir1, maxThreadStates, doReaderPooling); + //System.out.println("TEST: index serial"); indexSerial(docs, dir2); + //System.out.println("TEST: verify"); verifyEquals(dir1, dir2, "id"); + + FlexTestUtil.verifyFlexVsPreFlex(r, dir1); + FlexTestUtil.verifyFlexVsPreFlex(r, dir2); } } @@ -216,7 +225,7 @@ public class TestStressIndexing2 extends LuceneTestCase { threads[i].join(); } - // w.optimize(); + //w.optimize(); w.close(); for (int i=0; iNu2*s_NtvhVvE8OxL^XS&xL81`04 z(@BZwb%=o| zxa&(S#=3n>mfLj$EtS>JPK{req%O~t$gPoiFN^S&oMcx}UmkG*xT0S^#!x9CvP3_XZB9}QqeIWJfZdV6K z94y$9D9N400VD|*M2?uV#CTQ<+Y{KEwb?b^t&cWVHM?S_I;~JcEq*$kL2s@x&kO*i zB7n3}v7Du%Xjf5-C{$E#$-A~yOVObFiSTog$pn3ESM}XLB(n!u9qNY(^-yGTn*?UH zPbp7)vyZDb)4rMNm!l8!ky_`hRGoAj8<8sKuA3=BUe^H^p3`7p&d^)mPztCsnE*M) z$~qbckVChNMG4zQ?<~pq=HrZq0!;5L+9s>C5XA0rw3~?86Ho7}H`)n2C~2c;4r2WT z64$^kx+c1)3(UZ`9G?Wr_O&2Tm7IAthPROpY4X9PR;A}kybM-0s}JtXs*KmIUOpt$ zIiW65cTr9NUO33h$JFKjGj)ALYuVH4LXdOlT^>04kyc*$)2BihO2K^xyJ2<}s{Aku zkN0}7Q}ccE;RRZwNHPXP2t@ZnwDHIG0yVHA)wSi}40#TcuzD&F-8N))W3aj!ocV52t*PUlB@~C~DBE zHdW6c14}d~hHY=VZ7_BO5ohxB$^dV2Iy3{DwTC@fg3w%z#6j`-qo6tBLqe>Nms{Ia zxRDHDORF%pd6^sjMtH+1ac9ONeWb*7gLi{+<67cqW$PSViVYEx;BDSvCcJxDLN)fY z-$$U;!A!=cPPwkBL&jK1fWM=q?-%W=XF|7KD@@}aO%~}P^X6pyKSvDA5GX9Ko@ zX$u%a=O18~<2pS?K%-w!P5v8I3Fd;ZAVC;V5VmD@f7?TNc%ZWfz;G_y?7p+zt&XLS zme~B1SlX=EBt&eI3^Q-AbNcoZ50TccUJhnnXWy~`+wLzvzc{zrb`v@K{(HIA-Ie7e zzz@CwW6Wm_WZ=%$(m|Q@Q!IL!#fiiRhtmw6U9>~p;mL*;a1G?GCLw2)CW*Q8h)~8% z#5n^xYi+^9&`D9&&^w9^#N65B>IyFzbe4M+R|?orsA?Q^ljADCx>Up05WtatNb%_m z_?FJfosGey`1_+p@-3Zj3JY4>IRDH+-(M#{d}IBW^qMU1UF7g!-+z)`_?JnKdg(f% zK-KNH^W5AmZNfpqT}je?9#F&w4(m8eOkqG8$w{Hc5ejFy#pj`piDU_!2T-;Wv=tO# zPALF4<$}XJRU>Z1%j3g7TO0HB>*13f|JBtl&NGx{sPymnbO~LtifnsZSQ%d7 z)QuIIh{LctDBp(2&I}ng7v6AL!6M~kf}y~O^8)8TE)5;@?Pko|AC4+QbP6B|2T5*j za?KyTX}c-rShF{Rv+UbQ#KoLqOZDwY)`_tk@DcaK2b8J~cjYOucXA`vya(yvB3t)D zbvaarz#+q@t|VloL#HMHgSbHiml$;l)%D1}NdCyuNFGcYW8^2}w1wIldYxeo;24#S zGWad0j51wSToEJHkjs)@GB^TcJU#qSPMonoRj!D+V2*FDp!il*N;G43_H7Z%Vv40o zCDiF?D0K1*J?ResA0yjO&NEpc3-m#{SNR2}K<`($>1CdGu;3w=A+Viz6WEKD)T-9H zqBGV=&}~a|%U4$4&%R_r_9ERb0XP^+3$!F^xuERpe74&8ER)CHTKVVM#8iPO2R0Ro zEslW4GWKkpWkOC1@pG6dOf-f~MK-g0ZoRRr#qhTI#G1QvV^YIs`JpZEq4knzp)YS1 
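The rewritten TestSegmentTermEnum assertions show the core of the flex enumeration API: TermsEnum.next() returns each term as a BytesRef (null at the end), ord() reports the term's ordinal where the codec supports it, and seek() repositions by ordinal or by term. The same loop in isolation, as a sketch for a single-segment reader (composite readers go through MultiFields.getFields, as in the UTF-16 test earlier):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class FlexTermsExample {
  // Walks every term of one field with the flex API; assumes the field exists
  // and that reader is a single-segment (atomic) reader.
  static void dumpTerms(IndexReader reader, String field) throws IOException {
    TermsEnum termsEnum = reader.fields().terms(field).iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {   // null signals exhaustion
      System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
    }
  }
}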
[GIT binary patch data omitted] literal 0 HcmV?d00001 diff --git a/lucene/src/test/org/apache/lucene/index/index.30.nocfs.zip b/lucene/src/test/org/apache/lucene/index/index.30.nocfs.zip new file mode 100644 index 0000000000000000000000000000000000000000..28cd83bc04055aac54dc051a28ad257c3fa08c8b GIT binary patch literal 8953 [GIT binary patch data omitted]
zPiwBl%|RWDP9P$tXXt8!@%hHZ87lo1<;NTEB-kmYX&BJ^T;|8o-@V03L<;4kELttk zVL9IkSN*pN_q(egIQ$$4_gAih>T|4F_1~pg^|{jQs96SK3#i$;Y0WaoTxb?`{jHQ@ zKw_-=@E9zZL<0ECqvi114QO;wr!W|Op7$wD69N)}K!m~J7B-8by}I~rCE0IvcjnJf zt9m{m;wn|w@@iOei}cwY_C04)qpDA(_Bs>dGtQQ;B{hLaRSk<_P zjPm__Ekx|`hrNl#j}>w*CvV*1kXW?#%={2!MXbJ2Owsv2I$V$+4^vIc6UBLo@D?}F z?_;5KGbQw`*m%UF;|hSTtWQB^h6 z`0^)v8_H!q9DeT*?5U`V?{Br}7F_eF9JjiNFfOg%f3K<3({$Z!VfIUmnx8cUG3CI# z48v9j4Lb-on_m~4Er^D4xYl=)*Z8fHr>9-SGHRzz@SQ9?z~O%;c?6WG3WsUT$EbJN z(vFin`Y0p{*%Km?kRX_l5XMKok2WO*ak{fNInYNK97JwrAa2|l#=F4ZnjcCMO5aX5al<6`>8p@p$HTS0Z~8!K~Cc0DQHJgRY-F+(&0K3x@CuWXD^cE>Yh=^51o%qBuGJU}mM0Y|%fAqVS*ip+xFj1@ zc_BHXwY7CM8bvVFfqHep>8AXp$(hyW{cp#v`y_^z?XT;Se4~A0)J)@)H~>vs(*Mts z(cxG64ZSUWPAT}M@@KjkB40jmw06~VPlwbc5D_6Gu;5$h;gD{Gki;U|)_UGj%@&Jq zdbGaQ)lDejW`UmMNbXY6n@TNvyj|WbNh93zD0?KOM)!icC)`WE5f zWEXM_AhLLWo{0sFvI=34EQ_Sbc-f01zcin0h!P_{eo3fZ9#nt3@ut4>CgEHD%Zx&^ ztUuWpMa|20hKo57mN`qHwklr~4fl34?(Od_`|Huc@Ncj(UmKIxCmuI>zb-l0{dTWP zCn_>#VZp5)8<$tbj`=T5ELLo~`T$b<9!kDdcHNjhp|gK(!lO6 zqcOp#`XS((5I^p2Ne#i{a78qnT_xl|7ue7wq97vVufk9)`T%OhItQ{-j+>LWO@L>F%wBrd z*(}4#6tue)HPG8l+45M6<2{3W)>`sq2}`RoHS{$8kguwAtU0D%<@iUjl1|KWyjIk>1+sHVHDG?Seyw`v{s_n+>K(wgzmb$$Ui~QWsP> zbX6*SQPov`aCsn58Q72d-i$qEKGo5TpVM7%9%dJuqe$jtv3H4CSu&syCRw0Fxnr@L zH!JJ@6pNu!?pW-XZ&sE)Xc3IK7~15H#a;?CSztgwb#y(GH7Qf>SnS)>{@4KMKW2FbuV`;_LeCtt8JPE+9`J|_EsM&%NIP`z&TDf{ z=RVbc;Ewg<6U-gF0Wnx}v5!qm)|~94?7Ydi=42n^nXF&fNA^mebsxK@SzMoB?A0if z#XRv7z^d!t+VuYwcMAhHRiSd16zl~i>m+T#t2TqheNwR3kgSueV6uK8DcB20*0}h= z>pvLRoFqj6ll2Qp!Cn`##&vs|#dT7!*E6gv_xa!)r>Yz7_L04oVP&14X8pv9X0CJW zVmYbNpF;&9K<(877M literal 0 HcmV?d00001 diff --git a/lucene/src/test/org/apache/lucene/search/CheckHits.java b/lucene/src/test/org/apache/lucene/search/CheckHits.java index 75fd4e21100..47806010751 100644 --- a/lucene/src/test/org/apache/lucene/search/CheckHits.java +++ b/lucene/src/test/org/apache/lucene/search/CheckHits.java @@ -33,7 +33,7 @@ public class CheckHits { * different order of operations from the actual scoring method ... 
* this allows for a small amount of variation */ - public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.00005f; + public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.0002f; /** * Tests that all documents up to maxDoc which are *not* in the diff --git a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java index 1d19029ef52..291970399ac 100644 --- a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java +++ b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java @@ -24,7 +24,8 @@ import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; /** @@ -202,7 +203,7 @@ final class JustCompileSearch { static final class JustCompileExtendedFieldCacheLongParser implements FieldCache.LongParser { - public long parseLong(String string) { + public long parseLong(BytesRef string) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @@ -210,7 +211,7 @@ final class JustCompileSearch { static final class JustCompileExtendedFieldCacheDoubleParser implements FieldCache.DoubleParser { - public double parseDouble(String string) { + public double parseDouble(BytesRef term) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @@ -318,9 +319,9 @@ final class JustCompileSearch { static final class JustCompilePhraseScorer extends PhraseScorer { - JustCompilePhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, + JustCompilePhraseScorer(Weight weight, DocsAndPositionsEnum[] docs, int[] offsets, Similarity similarity, byte[] norms) { - super(weight, tps, offsets, similarity, norms); + super(weight, docs, offsets, similarity, norms); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/QueryUtils.java b/lucene/src/test/org/apache/lucene/search/QueryUtils.java index cfbb708e98c..81c581f35c5 100644 --- a/lucene/src/test/org/apache/lucene/search/QueryUtils.java +++ b/lucene/src/test/org/apache/lucene/search/QueryUtils.java @@ -391,7 +391,6 @@ public class QueryUtils { } @Override public void collect(int doc) throws IOException { - //System.out.println("doc="+doc); float score = scorer.score(); try { diff --git a/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java new file mode 100644 index 00000000000..585e705c909 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java @@ -0,0 +1,210 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
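The JustCompileSearch changes track the FieldCache parser interfaces, whose parse methods now receive the indexed term as a BytesRef rather than a String. A sketch of a trivial but working parser under the new signature, decoding the term bytes before parsing:

import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.BytesRef;

// Illustrative parser for the post-flex FieldCache API: the term arrives as
// UTF-8 bytes and is decoded to text before numeric parsing.
class SimpleLongParser implements FieldCache.LongParser {
  public long parseLong(BytesRef term) {
    return Long.parseLong(term.utf8ToString());
  }
}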
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collections; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; + +public class TestAutomatonQuery extends LuceneTestCase { + private IndexSearcher searcher; + + private final String FN = "field"; + + public void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer( + Version.LUCENE_CURRENT, Collections.emptySet()), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + Field titleField = new Field("title", "some title", Field.Store.NO, + Field.Index.ANALYZED); + Field field = new Field(FN, "this is document one 2345", Field.Store.NO, + Field.Index.ANALYZED); + Field footerField = new Field("footer", "a footer", Field.Store.NO, + Field.Index.ANALYZED); + doc.add(titleField); + doc.add(field); + doc.add(footerField); + writer.addDocument(doc); + field.setValue("some text from doc two, a short piece. 5678.91"); + writer.addDocument(doc); + field.setValue("doc three has some different stuff" + + ": with numbers 1234 5678.9 and letter b"); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory, true); + } + + public void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + private Term newTerm(String value) { + return new Term(FN, value); + } + + private int automatonQueryNrHits(AutomatonQuery query) throws IOException { + return searcher.search(query, 5).totalHits; + } + + private void assertAutomatonHits(int expected, Automaton automaton) + throws IOException { + AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton); + + query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + assertEquals(expected, automatonQueryNrHits(query)); + } + + /** + * Test some very simple automata. 
+ */ + public void testBasicAutomata() throws IOException { + assertAutomatonHits(0, BasicAutomata.makeEmpty()); + assertAutomatonHits(0, BasicAutomata.makeEmptyString()); + assertAutomatonHits(2, BasicAutomata.makeAnyChar()); + assertAutomatonHits(3, BasicAutomata.makeAnyString()); + assertAutomatonHits(2, BasicAutomata.makeString("doc")); + assertAutomatonHits(1, BasicAutomata.makeChar('a')); + assertAutomatonHits(2, BasicAutomata.makeCharRange('a', 'b')); + assertAutomatonHits(2, BasicAutomata.makeInterval(1233, 2346, 0)); + assertAutomatonHits(1, BasicAutomata.makeInterval(0, 2000, 0)); + assertAutomatonHits(2, BasicOperations.union(BasicAutomata.makeChar('a'), + BasicAutomata.makeChar('b'))); + assertAutomatonHits(0, BasicOperations.intersection(BasicAutomata + .makeChar('a'), BasicAutomata.makeChar('b'))); + assertAutomatonHits(1, BasicOperations.minus(BasicAutomata.makeCharRange('a', 'b'), + BasicAutomata.makeChar('a'))); + } + + /** + * Test that a nondeterministic automaton works correctly. (It should will be + * determinized) + */ + public void testNFA() throws IOException { + // accept this or three, the union is an NFA (two transitions for 't' from + // initial state) + Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"), + BasicAutomata.makeString("three")); + assertAutomatonHits(2, nfa); + } + + public void testEquals() { + AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), BasicAutomata + .makeString("foobar")); + // reference to a1 + AutomatonQuery a2 = a1; + // same as a1 (accepts the same language, same term) + AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"), BasicOperations + .concatenate(BasicAutomata.makeString("foo"), BasicAutomata + .makeString("bar"))); + // different than a1 (same term, but different language) + AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), BasicAutomata + .makeString("different")); + // different than a1 (different term, same language) + AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), BasicAutomata + .makeString("foobar")); + + assertEquals(a1, a2); + assertEquals(a1.hashCode(), a2.hashCode()); + + assertEquals(a1, a3); + assertEquals(a1.hashCode(), a3.hashCode()); + + assertEquals(a1.toString(), a3.toString()); + + // different class + AutomatonQuery w1 = new WildcardQuery(newTerm("foobar")); + // different class + AutomatonQuery w2 = new RegexpQuery(newTerm("foobar")); + + assertFalse(a1.equals(w1)); + assertFalse(a1.equals(w2)); + assertFalse(w1.equals(w2)); + assertFalse(a1.equals(a4)); + assertFalse(a1.equals(a5)); + assertFalse(a1.equals(null)); + } + + /** + * Test that rewriting to a single term works as expected, preserves + * MultiTermQuery semantics. + */ + public void testRewriteSingleTerm() throws IOException { + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), BasicAutomata + .makeString("piece")); + assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof SingleTermsEnum); + assertEquals(1, automatonQueryNrHits(aq)); + } + + /** + * Test that rewriting to a prefix query works as expected, preserves + * MultiTermQuery semantics. 
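The cases above combine the BasicAutomata factories with BasicOperations and wrap the result in an AutomatonQuery, checking every rewrite method. Outside a test the same building blocks compose directly; a short sketch (the field name and pivot term are placeholders, not from the patch):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;

class AutomatonQueryExample {
  // Matches terms starting with "do": the prefix automaton is the concatenation
  // of a fixed string with "any string", the same shape testRewritePrefix uses.
  static AutomatonQuery prefixQuery() {
    Automaton prefix = BasicOperations.concatenate(
        BasicAutomata.makeString("do"), BasicAutomata.makeAnyString());
    AutomatonQuery q = new AutomatonQuery(new Term("field", "do*"), prefix);
    // Any of the MultiTermQuery rewrite methods exercised above may be set here.
    q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
    return q;
  }
}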
+ */ + public void testRewritePrefix() throws IOException { + Automaton pfx = BasicAutomata.makeString("do"); + pfx.expandSingleton(); // expand singleton representation for testing + Automaton prefixAutomaton = BasicOperations.concatenate(pfx, BasicAutomata + .makeAnyString()); + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton); + assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); + assertEquals(3, automatonQueryNrHits(aq)); + } + + /** + * Test handling of the empty language + */ + public void testEmptyOptimization() throws IOException { + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), BasicAutomata + .makeEmpty()); + // not yet available: assertTrue(aq.getEnum(searcher.getIndexReader()) + // instanceof EmptyTermEnum); + assertSame(TermsEnum.EMPTY, aq.getTermsEnum(searcher.getIndexReader())); + assertEquals(0, automatonQueryNrHits(aq)); + } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new AutomatonQuery(newTerm("bogus"), BasicAutomata + .makeString("piece")).hasNewAPI); + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java b/lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java new file mode 100644 index 00000000000..9257da2f4d3 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java @@ -0,0 +1,178 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Test the automaton query for several unicode corner cases, + * specifically enumerating strings/indexes containing supplementary characters, + * and the differences between UTF-8/UTF-32 and UTF-16 binary sort order. 
+ */ +public class TestAutomatonQueryUnicode extends LuceneTestCase { + private IndexSearcher searcher; + + private final String FN = "field"; + + public void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new KeywordAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + Field titleField = new Field("title", "some title", Field.Store.NO, + Field.Index.ANALYZED); + Field field = new Field(FN, "", Field.Store.NO, + Field.Index.ANALYZED); + Field footerField = new Field("footer", "a footer", Field.Store.NO, + Field.Index.ANALYZED); + doc.add(titleField); + doc.add(field); + doc.add(footerField); + field.setValue("\uD866\uDF05abcdef"); + writer.addDocument(doc); + field.setValue("\uD866\uDF06ghijkl"); + writer.addDocument(doc); + // this sorts before the previous two in UTF-8/UTF-32, but after in UTF-16!!! + field.setValue("\uFB94mnopqr"); + writer.addDocument(doc); + field.setValue("\uFB95stuvwx"); // this one too. + writer.addDocument(doc); + field.setValue("a\uFFFCbc"); + writer.addDocument(doc); + field.setValue("a\uFFFDbc"); + writer.addDocument(doc); + field.setValue("a\uFFFEbc"); + writer.addDocument(doc); + field.setValue("a\uFB94bc"); + writer.addDocument(doc); + field.setValue("bacadaba"); + writer.addDocument(doc); + field.setValue("\uFFFD"); + writer.addDocument(doc); + field.setValue("\uFFFD\uD866\uDF05"); + writer.addDocument(doc); + field.setValue("\uFFFD\uFFFD"); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory, true); + } + + public void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + private Term newTerm(String value) { + return new Term(FN, value); + } + + private int automatonQueryNrHits(AutomatonQuery query) throws IOException { + return searcher.search(query, 5).totalHits; + } + + private void assertAutomatonHits(int expected, Automaton automaton) + throws IOException { + AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton); + + query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + assertEquals(expected, automatonQueryNrHits(query)); + } + + /** + * Test that AutomatonQuery interacts with lucene's sort order correctly. + * + * This expression matches something either starting with the arabic + * presentation forms block, or a supplementary character. + */ + public void testSortOrder() throws IOException { + Automaton a = new RegExp("((\uD866\uDF05)|\uFB94).*").toAutomaton(); + assertAutomatonHits(2, a); + } + + /** + * Test that AutomatonQuery properly seeks to supplementary characters. + * Transitions are modeled as UTF-16 code units, so without special handling + * by default it will try to seek to a lead surrogate with some DFAs + */ + public void testSeekSurrogate() throws IOException { + Automaton a = new RegExp("\uD866[a\uDF05\uFB93][a-z]{0,5}[fl]").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try seeking to an ending lead surrogate. 
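These seek tests revolve around supplementary characters such as U+29B05, which Java stores as two UTF-16 code units (a lead surrogate followed by a trail surrogate); the enumerator must not stop on the lead unit alone. A plain-Java reminder of that representation, illustrative only:

// Shows how the supplementary character written as \uD866\uDF05 in the test data
// above is represented in a Java String.
class SurrogatePairDemo {
  public static void main(String[] args) {
    String s = new String(Character.toChars(0x29B05));
    System.out.println(s.length());                              // 2: two UTF-16 code units
    System.out.println(Integer.toHexString(s.charAt(0)));        // d866 (lead/high surrogate)
    System.out.println(Integer.toHexString(s.charAt(1)));        // df05 (trail/low surrogate)
    System.out.println(s.codePointCount(0, s.length()));         // 1: a single code point
    System.out.println(Character.isHighSurrogate(s.charAt(0)));  // true
  }
}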
+ */ + public void testSeekSurrogate2() throws IOException { + Automaton a = new RegExp("\uD866(\uDF06ghijkl)?").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try seeking to an starting trail surrogate. + */ + public void testSeekSurrogate3() throws IOException { + Automaton a = new RegExp("[\uDF06\uFB94]mnopqr").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try seeking to an medial/final trail surrogate. + */ + public void testSeekSurrogate4() throws IOException { + Automaton a = new RegExp("a[\uDF06\uFB94]bc").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Ensure the 'constant suffix' does not contain a leading trail surrogate. + */ + public void testSurrogateSuffix() throws IOException { + Automaton a = new RegExp(".*[\uD865\uD866]\uDF06ghijkl").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try when the constant suffix is only a leading trail surrogate. + * instead this must use an empty suffix. + */ + public void testSurrogateSuffix2() throws IOException { + Automaton a = new RegExp(".*\uDF05").toAutomaton(); + assertAutomatonHits(1, a); + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java b/lucene/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java index da9097ac3a2..569c2673319 100644 --- a/lucene/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java +++ b/lucene/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java @@ -67,7 +67,7 @@ public class TestCachingWrapperFilter extends LuceneTestCase { if (originalSet.isCacheable()) { assertEquals("Cached DocIdSet must be of same class like uncached, if cacheable", originalSet.getClass(), cachedSet.getClass()); } else { - assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI); + assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI || cachedSet == DocIdSet.EMPTY_DOCIDSET); } } diff --git a/lucene/src/test/org/apache/lucene/search/TestFilteredSearch.java b/lucene/src/test/org/apache/lucene/search/TestFilteredSearch.java index 53613015b43..fa98a427f2a 100644 --- a/lucene/src/test/org/apache/lucene/search/TestFilteredSearch.java +++ b/lucene/src/test/org/apache/lucene/search/TestFilteredSearch.java @@ -62,7 +62,7 @@ public class TestFilteredSearch extends LuceneTestCase { searchFiltered(writer, directory, filter, enforceSingleSegment); } - public void searchFiltered(IndexWriter writer, Directory directory, Filter filter, boolean optimize) { + public void searchFiltered(IndexWriter writer, Directory directory, SimpleDocIdSetFilter filter, boolean optimize) { try { for (int i = 0; i < 60; i++) {//Simple docs Document doc = new Document(); @@ -78,6 +78,7 @@ public class TestFilteredSearch extends LuceneTestCase { IndexSearcher indexSearcher = new IndexSearcher(directory, true); + filter.setTopReader(indexSearcher.getIndexReader()); ScoreDoc[] hits = indexSearcher.search(booleanQuery, filter, 1000).scoreDocs; assertEquals("Number of matched documents", 1, hits.length); @@ -89,29 +90,35 @@ public class TestFilteredSearch extends LuceneTestCase { } public static final class SimpleDocIdSetFilter extends Filter { - private int docBase; private final int[] docs; private int index; + private IndexReader topReader; public SimpleDocIdSetFilter(int[] docs) { this.docs = docs; } + + public void setTopReader(IndexReader r) { + topReader = r; + } + @Override public DocIdSet 
getDocIdSet(IndexReader reader) { final OpenBitSet set = new OpenBitSet(); + int docBase = topReader.getSubReaderDocBase(reader); final int limit = docBase+reader.maxDoc(); for (;index < docs.length; index++) { final int docId = docs[index]; if(docId > limit) break; - set.set(docId-docBase); + if (docId >= docBase) { + set.set(docId-docBase); + } } - docBase = limit; return set.isEmpty()?null:set; } public void reset(){ index = 0; - docBase = 0; } } diff --git a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java index 22e948cf35a..93b7de9448a 100644 --- a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -23,17 +23,17 @@ import java.io.IOException; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockRAMDirectory; -import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; /** * Tests {@link FuzzyQuery}. @@ -378,5 +378,10 @@ public class TestFuzzyQuery extends LuceneTestCase { doc.add(new Field("field", text, Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new FuzzyQuery(new Term("dummy", "dummy")).hasNewAPI); + } } diff --git a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java new file mode 100644 index 00000000000..4e63d9edc81 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java @@ -0,0 +1,142 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
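The SimpleDocIdSetFilter change reflects that getDocIdSet is now invoked once per sub-reader, so a filter that holds top-level document IDs must translate them into segment-relative IDs using the segment's doc base. A sketch of that translation following the updated test (setTopReader and getSubReaderDocBase are taken from it; the class name here is illustrative):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;

// Accepts a fixed set of top-level docIDs; each per-segment call keeps only the
// IDs that fall inside that segment and rebases them to segment-relative IDs.
class TopLevelDocIdFilter extends Filter {
  private final int[] topLevelDocs;   // top-level document IDs to accept
  private IndexReader topReader;      // must be set before searching, as in the test

  TopLevelDocIdFilter(int[] topLevelDocs) { this.topLevelDocs = topLevelDocs; }

  void setTopReader(IndexReader r) { topReader = r; }

  @Override
  public DocIdSet getDocIdSet(IndexReader segmentReader) {
    final OpenBitSet bits = new OpenBitSet();
    final int docBase = topReader.getSubReaderDocBase(segmentReader);
    final int limit = docBase + segmentReader.maxDoc();
    for (int doc : topLevelDocs) {
      if (doc >= docBase && doc < limit) {
        bits.set(doc - docBase);       // rebase to this segment
      }
    }
    return bits.isEmpty() ? null : bits;
  }
}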
+ */ + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Tests the results of fuzzy against pre-recorded output + * The format of the file is the following: + * + * Header Row: # of bits: generate 2^n sequential documents + * with a value of Integer.toBinaryString + * + * Entries: an entry is a param spec line, a resultCount line, and + * then 'resultCount' results lines. The results lines are in the + * expected order. + * + * param spec line: a comma-separated list of params to FuzzyQuery + * (query, prefixLen, pqSize, minScore) + * query = query text as a number (expand with Integer.toBinaryString) + * prefixLen = prefix length + * pqSize = priority queue maximum size for TopTermsBoostOnlyBooleanQueryRewrite + * minScore = minimum similarity + * + * resultCount line: total number of expected hits. + * + * results line: comma-separated docID, score pair + **/ +public class TestFuzzyQuery2 extends LuceneTestCase { + /** epsilon for score comparisons */ + static final float epsilon = 0.00001f; + + public void testFromTestData() throws Exception { + InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt"); + BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + + int bits = Integer.parseInt(reader.readLine()); + int terms = (int) Math.pow(2, bits); + + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), + IndexWriter.MaxFieldLength.UNLIMITED); + + Document doc = new Document(); + Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + + for (int i = 0; i < terms; i++) { + field.setValue(Integer.toBinaryString(i)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(dir); + String line; + while ((line = reader.readLine()) != null) { + String params[] = line.split(","); + String query = Integer.toBinaryString(Integer.parseInt(params[0])); + int prefix = Integer.parseInt(params[1]); + int pqSize = Integer.parseInt(params[2]); + float minScore = Float.parseFloat(params[3]); + FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix); + q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize)); + int expectedResults = Integer.parseInt(reader.readLine()); + TopDocs docs = searcher.search(q, expectedResults); + assertEquals(expectedResults, docs.totalHits); + for (int i = 0; i < expectedResults; i++) { + String scoreDoc[] = reader.readLine().split(","); + assertEquals(Integer.parseInt(scoreDoc[0]), docs.scoreDocs[i].doc); + assertEquals(Float.parseFloat(scoreDoc[1]), docs.scoreDocs[i].score, epsilon); + } + } + searcher.close(); + dir.close(); + } + + /* Code to generate test data + public static void main(String args[]) throws Exception { + int bits = 3; + System.out.println(bits); + int terms = (int) Math.pow(2, bits); + + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), + IndexWriter.MaxFieldLength.UNLIMITED); + + Document doc = new Document(); + Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); + 
doc.add(field); + + for (int i = 0; i < terms; i++) { + field.setValue(Integer.toBinaryString(i)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(dir); + for (int prefix = 0; prefix < bits; prefix++) + for (int pqsize = 1; pqsize <= terms; pqsize++) + for (float minscore = 0.1F; minscore < 1F; minscore += 0.2F) + for (int query = 0; query < terms; query++) { + FuzzyQuery q = new FuzzyQuery( + new Term("field", Integer.toBinaryString(query)), minscore, prefix); + q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqsize)); + System.out.println(query + "," + prefix + "," + pqsize + "," + minscore); + TopDocs docs = searcher.search(q, terms); + System.out.println(docs.totalHits); + for (int i = 0; i < docs.totalHits; i++) + System.out.println(docs.scoreDocs[i].doc + "," + docs.scoreDocs[i].score); + } + } + */ +} diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java index 35fe89998e0..637143593f7 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java @@ -22,14 +22,17 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; + import java.io.IOException; +import java.util.HashSet; import java.util.LinkedList; import java.util.Collections; @@ -45,7 +48,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase } public void testPhrasePrefix() throws IOException { - RAMDirectory indexStore = new RAMDirectory(); + MockRAMDirectory indexStore = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(TEST_VERSION_CURRENT, new SimpleAnalyzer(TEST_VERSION_CURRENT))); add("blueberry pie", writer); add("blueberry strudel", writer); @@ -101,6 +104,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase termsWithPrefix.add(te.term()); } } while (te.next()); + ir.close(); query3.add(termsWithPrefix.toArray(new Term[0])); query3.add(new Term("body", "pizza")); @@ -139,7 +143,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase // and all terms required. // The contained PhraseMultiQuery must contain exactly one term array. 
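MultiPhraseQuery, exercised above, generalizes PhraseQuery by allowing several alternative terms at one phrase position; testPhrasePrefix fills that position by expanding a prefix against the index terms. A condensed sketch of the pattern (field and prefix values are illustrative):

import java.io.IOException;
import java.util.LinkedList;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.MultiPhraseQuery;

class MultiPhraseExample {
  // Builds a query matching "blueberry <any term starting with 'pi'>".
  static MultiPhraseQuery buildQuery(IndexReader reader) throws IOException {
    LinkedList<Term> termsWithPrefix = new LinkedList<Term>();
    TermEnum te = reader.terms(new Term("body", "pi"));   // positioned at the first term >= "pi"
    do {
      Term t = te.term();
      if (t != null && "body".equals(t.field()) && t.text().startsWith("pi")) {
        termsWithPrefix.add(t);
      }
    } while (te.next());
    te.close();

    MultiPhraseQuery q = new MultiPhraseQuery();
    q.add(new Term("body", "blueberry"));          // exact term at position 0
    q.add(termsWithPrefix.toArray(new Term[0]));   // any of the expansions at position 1
    return q;
  }
}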
- RAMDirectory indexStore = new RAMDirectory(); + MockRAMDirectory indexStore = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(TEST_VERSION_CURRENT, new SimpleAnalyzer(TEST_VERSION_CURRENT))); add("blueberry pie", writer); add("blueberry chewing gum", writer); @@ -164,10 +168,11 @@ public class TestMultiPhraseQuery extends LuceneTestCase assertEquals("Wrong number of hits", 2, hits.length); searcher.close(); + indexStore.close(); } public void testPhrasePrefixWithBooleanQuery() throws IOException { - RAMDirectory indexStore = new RAMDirectory(); + MockRAMDirectory indexStore = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig( TEST_VERSION_CURRENT, new StandardAnalyzer( TEST_VERSION_CURRENT, Collections.emptySet()))); @@ -190,6 +195,23 @@ public class TestMultiPhraseQuery extends LuceneTestCase ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; assertEquals("Wrong number of hits", 0, hits.length); searcher.close(); + indexStore.close(); + } + + public void testNoDocs() throws Exception { + MockRAMDirectory indexStore = new MockRAMDirectory(); + IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(Version.LUCENE_CURRENT, new HashSet(0)), true, IndexWriter.MaxFieldLength.LIMITED); + add("a note", "note", writer); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(indexStore, true); + + MultiPhraseQuery q = new MultiPhraseQuery(); + q.add(new Term("body", "a")); + q.add(new Term[] { new Term("body", "nope"), new Term("body", "nope") }); + assertEquals("Wrong number of hits", 0, searcher.search(q, null, 1).totalHits); + searcher.close(); + indexStore.close(); } public void testHashCodeAndEquals(){ diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryBWComp.java b/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryBWComp.java new file mode 100644 index 00000000000..2778dcb9f15 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryBWComp.java @@ -0,0 +1,239 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCaseJ4; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import static org.junit.Assert.*; + +/** + * Test MultiTermQuery api backwards compat + * @deprecated Remove test when old API is no longer supported + */ +@Deprecated +public class TestMultiTermQueryBWComp extends LuceneTestCaseJ4 { + private static RAMDirectory dir; + private static Searcher searcher; + private static final String FIELD = "test"; + + /** + * Test that the correct method (getTermsEnum/getEnum) is called. + */ + @Test + public void testEnumMethod() throws IOException { + assertAPI("old", new OldAPI(FIELD)); + assertAPI("new", new NewAPI(FIELD)); + assertAPI("new", new BothAPI(FIELD)); + + assertAPI("old2", new OldExtendsOldAPI(FIELD)); + assertAPI("old2", new OldExtendsNewAPI(FIELD)); + assertAPI("old2", new OldExtendsBothAPI(FIELD)); + + assertAPI("new2", new NewExtendsOldAPI(FIELD)); + assertAPI("new2", new NewExtendsNewAPI(FIELD)); + assertAPI("new2", new NewExtendsBothAPI(FIELD)); + + assertAPI("new2", new BothExtendsOldAPI(FIELD)); + assertAPI("new2", new BothExtendsNewAPI(FIELD)); + assertAPI("new2", new BothExtendsBothAPI(FIELD)); + } + + private static void assertAPI(String expected, Query query) throws IOException { + TopDocs td = searcher.search(query, 25); + assertEquals(1, td.totalHits); + Document doc = searcher.doc(td.scoreDocs[0].doc); + assertEquals(expected, doc.get(FIELD)); + } + + private class OldAPI extends MultiTermQuery { + OldAPI(String field) { super(field); } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old")); + } + + @Override + public String toString(String field) { return null; } + } + + private class NewAPI extends MultiTermQuery { + NewAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new")); + } + + @Override + public String toString(String field) { return null; } + } + + private class BothAPI extends MultiTermQuery { + BothAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new")); + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old")); + } + + @Override + public String toString(String field) { return null; } + } + + private class OldExtendsOldAPI extends OldAPI { + OldExtendsOldAPI(String field) { super(field); } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old2")); + } + } + + private class OldExtendsNewAPI extends NewAPI { + OldExtendsNewAPI(String field) { super(field); } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old2")); + } + } + + private class OldExtendsBothAPI 
extends BothAPI { + OldExtendsBothAPI(String field) { super(field); } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old2")); + } + } + + private class NewExtendsOldAPI extends OldAPI { + NewExtendsOldAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new2")); + } + } + + private class NewExtendsNewAPI extends NewAPI { + NewExtendsNewAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new2")); + } + } + + private class NewExtendsBothAPI extends BothAPI { + NewExtendsBothAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new2")); + } + } + + private class BothExtendsOldAPI extends OldAPI { + BothExtendsOldAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new2")); + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old2")); + } + } + + private class BothExtendsNewAPI extends NewAPI { + BothExtendsNewAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new2")); + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old2")); + } + } + + private class BothExtendsBothAPI extends BothAPI { + BothExtendsBothAPI(String field) { super(field); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SingleTermsEnum(reader, new Term(FIELD, "new2")); + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new SingleTermEnum(reader, new Term(FIELD, "old2")); + } + } + + @BeforeClass + public static void beforeClass() throws Exception { + dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, + new WhitespaceAnalyzer(TEST_VERSION_CURRENT), true, + IndexWriter.MaxFieldLength.LIMITED); + + String values[] = { "old", "old2", "new", "new2" }; + for (String value : values) { + Document doc = new Document(); + doc.add(new Field(FIELD, value, + Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(dir, true); + } + + @AfterClass + public static void afterClass() throws Exception { + searcher.close(); + searcher = null; + dir.close(); + dir = null; + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java b/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java index 423053f42c7..20d755bd3b9 100644 --- a/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java +++ b/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java @@ -24,9 +24,11 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriter; +import 
org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCaseJ4; import org.apache.lucene.util.NumericUtils; @@ -331,9 +333,15 @@ public class TestNumericRangeQuery32 extends LuceneTestCaseJ4 { if (lower>upper) { int a=lower; lower=upper; upper=a; } + final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_INT), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_INT); + NumericUtils.intToPrefixCoded(lower, 0, lowerBytes); + NumericUtils.intToPrefixCoded(upper, 0, upperBytes); + // TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string! + final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString(); + // test inclusive range NumericRangeQuery tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, true); - TermRangeQuery cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), true, true); + TermRangeQuery cq=new TermRangeQuery(field, lowerString, upperString, true, true); TopDocs tTopDocs = searcher.search(tq, 1); TopDocs cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -341,7 +349,7 @@ public class TestNumericRangeQuery32 extends LuceneTestCaseJ4 { termCountC += cq.getTotalNumberOfTerms(); // test exclusive range tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, false, false); - cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), false, false); + cq=new TermRangeQuery(field, lowerString, upperString, false, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -349,7 +357,7 @@ public class TestNumericRangeQuery32 extends LuceneTestCaseJ4 { termCountC += cq.getTotalNumberOfTerms(); // test left exclusive range tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, false, true); - cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), false, true); + cq=new TermRangeQuery(field, lowerString, upperString, false, true); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -357,7 +365,7 @@ public class TestNumericRangeQuery32 extends LuceneTestCaseJ4 { termCountC += cq.getTotalNumberOfTerms(); // test right exclusive range tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, false); - cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), true, false); + cq=new TermRangeQuery(field, lowerString, upperString, true, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -549,23 +557,24 @@ public class TestNumericRangeQuery32 extends LuceneTestCaseJ4 { } private void testEnum(int lower, int upper) throws Exception { - NumericRangeQuery q = 
NumericRangeQuery.newIntRange("field4", 4, lower, upper, true, true); - FilteredTermEnum termEnum = q.getEnum(searcher.getIndexReader()); - try { - int count = 0; - do { - final Term t = termEnum.term(); - if (t != null) { - final int val = NumericUtils.prefixCodedToInt(t.text()); - assertTrue("value not in bounds", val >= lower && val <= upper); - count++; - } else break; - } while (termEnum.next()); - assertFalse(termEnum.next()); - if (VERBOSE) System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + "] contained " + count + " terms."); - } finally { - termEnum.close(); - } + NumericRangeQuery q = NumericRangeQuery.newIntRange("field4", 4, + lower, upper, true, true); + TermsEnum termEnum = q.getTermsEnum(searcher.getIndexReader()); + int count = 0; + while (termEnum.next() != null) { + final BytesRef t = termEnum.term(); + if (t != null) { + final int val = NumericUtils.prefixCodedToInt(t); + assertTrue("value not in bounds " + val + " >= " + lower + " && " + + val + " <= " + upper, val >= lower && val <= upper); + count++; + } else + break; + } + assertNull(termEnum.next()); + if (VERBOSE) System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + + "] contained " + count + " terms."); + } @Test diff --git a/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java b/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java index c4c46ba351f..2fa92b45bde 100644 --- a/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java +++ b/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java @@ -26,6 +26,7 @@ import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCaseJ4; import org.apache.lucene.util.NumericUtils; @@ -350,9 +351,15 @@ public class TestNumericRangeQuery64 extends LuceneTestCaseJ4 { if (lower>upper) { long a=lower; lower=upper; upper=a; } + final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); + NumericUtils.longToPrefixCoded(lower, 0, lowerBytes); + NumericUtils.longToPrefixCoded(upper, 0, upperBytes); + // TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string! 
+ final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString(); + // test inclusive range NumericRangeQuery tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, true); - TermRangeQuery cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), true, true); + TermRangeQuery cq=new TermRangeQuery(field, lowerString, upperString, true, true); TopDocs tTopDocs = searcher.search(tq, 1); TopDocs cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -360,7 +367,7 @@ public class TestNumericRangeQuery64 extends LuceneTestCaseJ4 { termCountC += cq.getTotalNumberOfTerms(); // test exclusive range tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, false); - cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), false, false); + cq=new TermRangeQuery(field, lowerString, upperString, false, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -368,7 +375,7 @@ public class TestNumericRangeQuery64 extends LuceneTestCaseJ4 { termCountC += cq.getTotalNumberOfTerms(); // test left exclusive range tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, true); - cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), false, true); + cq=new TermRangeQuery(field, lowerString, upperString, false, true); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -376,7 +383,7 @@ public class TestNumericRangeQuery64 extends LuceneTestCaseJ4 { termCountC += cq.getTotalNumberOfTerms(); // test right exclusive range tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, false); - cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), true, false); + cq=new TermRangeQuery(field, lowerString, upperString, true, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -583,4 +590,9 @@ public class TestNumericRangeQuery64 extends LuceneTestCaseJ4 { // difference to int range is tested in TestNumericRangeQuery32 } + @Test @Deprecated + public void testBackwardsLayer() { + assertTrue(NumericRangeQuery.newLongRange("dummy", null, null, true, true).hasNewAPI); + } + } diff --git a/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java b/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java index e4e12aa349a..2a525718664 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java +++ b/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.StringReader; import java.util.Collection; import java.util.Collections; +import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; @@ -61,6 +62,8 @@ import org.apache.lucene.util.LuceneTestCase; */ public class TestPositionIncrement extends 
LuceneTestCase { + final static boolean VERBOSE = false; + public void testSetPosition() throws Exception { Analyzer analyzer = new Analyzer() { @Override @@ -242,8 +245,8 @@ public class TestPositionIncrement extends LuceneTestCase { IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, new TestPayloadAnalyzer())); Document doc = new Document(); - doc.add(new Field("content", - new StringReader("a a b c d e a f g h i j a b k k"))); + doc.add(new Field("content", new StringReader( + "a a b c d e a f g h i j a b k k"))); writer.addDocument(doc); IndexReader r = writer.getReader(); @@ -271,30 +274,43 @@ public class TestPositionIncrement extends LuceneTestCase { count = 0; boolean sawZero = false; - //System.out.println("\ngetPayloadSpans test"); + if (VERBOSE) { + System.out.println("\ngetPayloadSpans test"); + } Spans pspans = snq.getSpans(is.getIndexReader()); while (pspans.next()) { - //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end()); + if (VERBOSE) { + System.out.println("doc " + pspans.doc() + ": span " + pspans.start() + + " to " + pspans.end()); + } Collection payloads = pspans.getPayload(); sawZero |= pspans.start() == 0; - count += payloads.size(); + for (@SuppressWarnings("unused") byte[] bytes : payloads) { + count++; + if (!VERBOSE) { + // do nothing + } else { + System.out.println(" payload: " + new String((byte[]) bytes)); + } + } } assertEquals(5, count); assertTrue(sawZero); - //System.out.println("\ngetSpans test"); + // System.out.println("\ngetSpans test"); Spans spans = snq.getSpans(is.getIndexReader()); count = 0; sawZero = false; while (spans.next()) { count++; sawZero |= spans.start() == 0; - //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end()); + // System.out.println(spans.doc() + " - " + spans.start() + " - " + + // spans.end()); } assertEquals(4, count); assertTrue(sawZero); - - //System.out.println("\nPayloadSpanUtil test"); + + // System.out.println("\nPayloadSpanUtil test"); sawZero = false; PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader()); @@ -355,7 +371,9 @@ class PayloadFilter extends TokenFilter { } posIncrAttr.setPositionIncrement(posIncr); pos += posIncr; - // System.out.println("term=" + termAttr.term() + " pos=" + pos); + if (TestPositionIncrement.VERBOSE) { + System.out.println("term=" + termAttr.term() + " pos=" + pos); + } i++; return true; } else { diff --git a/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java b/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java index abf5f18f232..7a130c26680 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java @@ -53,5 +53,15 @@ public class TestPrefixQuery extends LuceneTestCase { query = new PrefixQuery(new Term("category", "/Computers/Mac")); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals("One in /Computers/Mac", 1, hits.length); + + query = new PrefixQuery(new Term("category", "")); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("everything", 3, hits.length); + } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new PrefixQuery(new Term("dummy", "dummy")).hasNewAPI); } } diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java new file mode 100644 index 
00000000000..f69c2ac52cb --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java @@ -0,0 +1,128 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonProvider; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Some simple regex tests, mostly converted from contrib's TestRegexQuery. + */ +public class TestRegexpQuery extends LuceneTestCase { + private IndexSearcher searcher; + private final String FN = "field"; + + public void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), + true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field(FN, + "the quick brown fox jumps over the lazy ??? dog 493432 49344", + Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory, true); + } + + public void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + private Term newTerm(String value) { + return new Term(FN, value); + } + + private int regexQueryNrHits(String regex) throws IOException { + RegexpQuery query = new RegexpQuery(newTerm(regex)); + return searcher.search(query, 5).totalHits; + } + + public void testRegex1() throws IOException { + assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); + } + + public void testRegex2() throws IOException { + assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); + } + + public void testRegex3() throws IOException { + assertEquals(0, regexQueryNrHits("q.[aeiou]c")); + } + + public void testNumericRange() throws IOException { + assertEquals(1, regexQueryNrHits("<420000-600000>")); + assertEquals(0, regexQueryNrHits("<493433-600000>")); + } + + public void testRegexComplement() throws IOException { + assertEquals(1, regexQueryNrHits("4934~[3]")); + // not the empty lang, i.e. 
match all docs + assertEquals(1, regexQueryNrHits("~#")); + } + + public void testCustomProvider() throws IOException { + AutomatonProvider myProvider = new AutomatonProvider() { + // automaton that matches quick, brown, or bob + private Automaton quickBrownAutomaton = BasicOperations.union(Arrays + .asList(new Automaton[] {BasicAutomata.makeString("quick"), + BasicAutomata.makeString("brown"), + BasicAutomata.makeString("bob")})); + + public Automaton getAutomaton(String name) throws IOException { + if (name.equals("quickBrown")) return quickBrownAutomaton; + else return null; + } + }; + RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL, + myProvider); + assertEquals(1, searcher.search(query, 5).totalHits); + } + + /** + * Test a corner case for backtracking: In this case the term dictionary has + * 493432 followed by 49344. When backtracking from 49343... to 4934, it's + * necessary to test that 4934 itself is ok before trying to append more + * characters. + */ + public void testBacktracking() throws IOException { + assertEquals(1, regexQueryNrHits("4934[314]")); + } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new RegexpQuery(newTerm(".*")).hasNewAPI); + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java new file mode 100644 index 00000000000..b1b1fa58f57 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java @@ -0,0 +1,144 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Random; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Create an index with terms from 0000-9999. + * Generates random regexps according to simple patterns, + * and validates the correct number of hits are returned. 
+ */ +public class TestRegexpRandom extends LuceneTestCase { + private Searcher searcher; + private Random random; + + @Override + protected void setUp() throws Exception { + super.setUp(); + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + + Document doc = new Document(); + Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + + NumberFormat df = new DecimalFormat("0000"); + for (int i = 0; i < 10000; i++) { + field.setValue(df.format(i)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(dir); + } + + private char N() { + return (char) (0x30 + random.nextInt(10)); + } + + private String fillPattern(String wildcardPattern) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < wildcardPattern.length(); i++) { + switch(wildcardPattern.charAt(i)) { + case 'N': + sb.append(N()); + break; + default: + sb.append(wildcardPattern.charAt(i)); + } + } + return sb.toString(); + } + + private void assertPatternHits(String pattern, int numHits) throws Exception { + Query wq = new RegexpQuery(new Term("field", fillPattern(pattern))); + TopDocs docs = searcher.search(wq, 25); + assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits); + } + + @Override + protected void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + public void testRegexps() throws Exception { + random = newRandom(System.nanoTime()); + for (int i = 0; i < 100; i++) { + assertPatternHits("NNNN", 1); + assertPatternHits(".NNN", 10); + assertPatternHits("N.NN", 10); + assertPatternHits("NN.N", 10); + assertPatternHits("NNN.", 10); + } + + for (int i = 0; i < 10; i++) { + assertPatternHits(".{1,2}NN", 100); + assertPatternHits("N.{1,2}N", 100); + assertPatternHits("NN.{1,2}", 100); + assertPatternHits(".{1,3}N", 1000); + assertPatternHits("N.{1,3}", 1000); + assertPatternHits(".{1,4}", 10000); + + assertPatternHits("NNN[3-7]", 5); + assertPatternHits("NN[2-6][3-7]", 25); + assertPatternHits("N[1-5][2-6][3-7]", 125); + assertPatternHits("[0-4][3-7][4-8][5-9]", 625); + assertPatternHits("[3-7][2-6][0-4]N", 125); + assertPatternHits("[2-6][3-7]NN", 25); + assertPatternHits("[3-7]NNN", 5); + + assertPatternHits("NNN.*", 10); + assertPatternHits("NN.*", 100); + assertPatternHits("N.*", 1000); + assertPatternHits(".*", 10000); + + assertPatternHits(".*NNN", 10); + assertPatternHits(".*NN", 100); + assertPatternHits(".*N", 1000); + + assertPatternHits("N.*NN", 10); + assertPatternHits("NN.*N", 10); + + // combo of ? and * operators + assertPatternHits(".NN.*", 100); + assertPatternHits("N.N.*", 100); + assertPatternHits("NN..*", 100); + assertPatternHits(".N..*", 1000); + assertPatternHits("N...*", 1000); + + assertPatternHits(".*NN.", 100); + assertPatternHits(".*N..", 1000); + assertPatternHits(".*...", 10000); + assertPatternHits(".*.N.", 1000); + assertPatternHits(".*..N", 1000); + } + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java new file mode 100644 index 00000000000..14e8ed80c23 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java @@ -0,0 +1,221 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Random; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.RunAutomaton; + +/** + * Create an index with random unicode terms + * Generates random regexps, and validates against a simple impl. + */ +public class TestRegexpRandom2 extends LuceneTestCase { + private IndexSearcher searcher; + private Random random; + + @Override + protected void setUp() throws Exception { + super.setUp(); + random = newRandom(System.nanoTime()); + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), + IndexWriter.MaxFieldLength.UNLIMITED); + + Document doc = new Document(); + Field field = new Field("field", "", Field.Store.YES, Field.Index.ANALYZED); + doc.add(field); + + for (int i = 0; i < 1000; i++) { + field.setValue(randomString()); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(dir); + } + + @Override + protected void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + /** a stupid regexp query that just blasts thru the terms */ + private class DumbRegexpQuery extends MultiTermQuery { + private final Automaton automaton; + + DumbRegexpQuery(Term term) { + super(term.field()); + RegExp re = new RegExp(term.text()); + automaton = re.toAutomaton(); + } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return new SimpleAutomatonTermsEnum(reader, field); + } + + private class SimpleAutomatonTermsEnum extends FilteredTermsEnum { + RunAutomaton runAutomaton = new RunAutomaton(automaton); + UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + + private SimpleAutomatonTermsEnum(IndexReader reader, String field) throws IOException { + super(reader, field); + setInitialSeekTerm(new BytesRef("")); + } + + @Override + protected AcceptStatus accept(BytesRef term) throws IOException { + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + return runAutomaton.run(utf16.result, 0, utf16.length) ? 
+ AcceptStatus.YES : AcceptStatus.NO; + } + } + + @Override + public String toString(String field) { + return field.toString() + automaton.toString(); + } + } + + /** test a bunch of random regular expressions */ + public void testRegexps() throws Exception { + for (int i = 0; i < 500; i++) + assertSame(randomRegex()); + } + + /** check that the # of hits is the same as from a very + * simple regexpquery implementation. + */ + private void assertSame(String regexp) throws IOException { + // we will generate some illegal syntax regular expressions... + try { + new RegExp(regexp).toAutomaton(); + } catch (Exception e) { + return; + } + + // we will also generate some undefined unicode queries + if (!UnicodeUtil.validUTF16String(regexp)) + return; + + RegexpQuery smart = new RegexpQuery(new Term("field", regexp)); + DumbRegexpQuery dumb = new DumbRegexpQuery(new Term("field", regexp)); + + // we can't compare the two if automaton rewrites to a simpler enum. + // for example: "a\uda07\udcc7?.*?" gets rewritten to a simpler query: + // a\uda07* prefixquery. Prefixquery then does the "wrong" thing, which + // isn't really wrong as the query was undefined to begin with... but not + // automatically comparable. + if (!(smart.getTermsEnum(searcher.getIndexReader()) instanceof AutomatonTermsEnum)) + return; + + TopDocs smartDocs = searcher.search(smart, 25); + TopDocs dumbDocs = searcher.search(dumb, 25); + + assertEquals(dumbDocs.totalHits, smartDocs.totalHits); + } + + char buffer[] = new char[20]; + + // start is inclusive and end is exclusive + public int nextInt(int start, int end) { + return start + random.nextInt(end - start); + } + + public String randomString() { + final int end = random.nextInt(20); + if (buffer.length < 1 + end) { + char[] newBuffer = new char[(int) ((1 + end) * 1.25)]; + System.arraycopy(buffer, 0, newBuffer, 0, buffer.length); + buffer = newBuffer; + } + for (int i = 0; i < end - 1; i++) { + int t = random.nextInt(6); + if (0 == t && i < end - 1) { + // Make a surrogate pair + // High surrogate + buffer[i++] = (char) nextInt(0xd800, 0xdc00); + // Low surrogate + buffer[i] = (char) nextInt(0xdc00, 0xe000); + } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80); + else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800); + else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800); + else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff); + else if (5 == t) { + // Illegal unpaired surrogate + if (random.nextBoolean()) buffer[i] = (char) nextInt(0xd800, 0xdc00); + else buffer[i] = (char) nextInt(0xdc00, 0xe000); + } + } + return new String(buffer, 0, end); + } + + // a random string biased towards populating a ton of operators + public String randomRegex() { + final int end = random.nextInt(20); + if (buffer.length < 1 + end) { + char[] newBuffer = new char[(int) ((1 + end) * 1.25)]; + System.arraycopy(buffer, 0, newBuffer, 0, buffer.length); + buffer = newBuffer; + } + for (int i = 0; i < end - 1; i++) { + int t = random.nextInt(10); + if (0 == t && i < end - 1) { + // Make a surrogate pair + // High surrogate + buffer[i++] = (char) nextInt(0xd800, 0xdc00); + // Low surrogate + buffer[i] = (char) nextInt(0xdc00, 0xe000); + } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80); + else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800); + else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800); + else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff); + else if (5 == t) { + // Illegal unpaired surrogate + if (random.nextBoolean()) buffer[i] = (char) 
nextInt(0xd800, 0xdc00); + else buffer[i] = (char) nextInt(0xdc00, 0xe000); + } else if (6 == t) { + buffer[i] = '.'; + } else if (7 == t) { + buffer[i] = '?'; + } else if (8 == t) { + buffer[i] = '*'; + } else if (9 == t) { + buffer[i] = '+'; + } + } + return new String(buffer, 0, end); + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestSort.java b/lucene/src/test/org/apache/lucene/search/TestSort.java index 6b3dad43564..b39b7829999 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSort.java +++ b/lucene/src/test/org/apache/lucene/search/TestSort.java @@ -37,6 +37,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogMergePolicy; import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.FieldValueHitQueue.Entry; @@ -277,7 +278,7 @@ public class TestSort extends LuceneTestCase implements Serializable { sort.setSort( new SortField("string", SortField.STRING), new SortField("string2", SortField.STRING, true), - SortField.FIELD_DOC ); + SortField.FIELD_DOC); result = searcher.search(new MatchAllDocsQuery(), null, 500, sort).scoreDocs; @@ -337,8 +338,8 @@ public class TestSort extends LuceneTestCase implements Serializable { sort.setSort (new SortField ("parser", new FieldCache.IntParser(){ - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; } }), SortField.FIELD_DOC ); assertMatches (full, queryA, sort, "JIHGFEDCBA"); @@ -346,8 +347,8 @@ public class TestSort extends LuceneTestCase implements Serializable { fc.purgeAllCaches(); sort.setSort (new SortField ("parser", new FieldCache.FloatParser(){ - public final float parseFloat(final String val) { - return (float) Math.sqrt( val.charAt(0) ); + public final float parseFloat(final BytesRef term) { + return (float) Math.sqrt( term.bytes[term.offset] ); } }), SortField.FIELD_DOC ); assertMatches (full, queryA, sort, "JIHGFEDCBA"); @@ -355,8 +356,8 @@ public class TestSort extends LuceneTestCase implements Serializable { fc.purgeAllCaches(); sort.setSort (new SortField ("parser", new FieldCache.LongParser(){ - public final long parseLong(final String val) { - return (val.charAt(0)-'A') * 1234567890L; + public final long parseLong(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 1234567890L; } }), SortField.FIELD_DOC ); assertMatches (full, queryA, sort, "JIHGFEDCBA"); @@ -364,8 +365,8 @@ public class TestSort extends LuceneTestCase implements Serializable { fc.purgeAllCaches(); sort.setSort (new SortField ("parser", new FieldCache.DoubleParser(){ - public final double parseDouble(final String val) { - return Math.pow( val.charAt(0), (val.charAt(0)-'A') ); + public final double parseDouble(final BytesRef term) { + return Math.pow( term.bytes[term.offset], (term.bytes[term.offset]-'A') ); } }), SortField.FIELD_DOC ); assertMatches (full, queryA, sort, "JIHGFEDCBA"); @@ -373,8 +374,8 @@ public class TestSort extends LuceneTestCase implements Serializable { fc.purgeAllCaches(); sort.setSort (new SortField ("parser", new FieldCache.ByteParser(){ - public final byte parseByte(final String val) { - return (byte) (val.charAt(0)-'A'); + public final byte parseByte(final BytesRef term) { + return (byte) (term.bytes[term.offset]-'A'); } }), 
SortField.FIELD_DOC ); assertMatches (full, queryA, sort, "JIHGFEDCBA"); @@ -382,8 +383,8 @@ public class TestSort extends LuceneTestCase implements Serializable { fc.purgeAllCaches(); sort.setSort (new SortField ("parser", new FieldCache.ShortParser(){ - public final short parseShort(final String val) { - return (short) (val.charAt(0)-'A'); + public final short parseShort(final BytesRef term) { + return (short) (term.bytes[term.offset]-'A'); } }), SortField.FIELD_DOC ); assertMatches (full, queryA, sort, "JIHGFEDCBA"); @@ -443,8 +444,8 @@ public class TestSort extends LuceneTestCase implements Serializable { @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { docValues = FieldCache.DEFAULT.getInts(reader, "parser", new FieldCache.IntParser() { - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; } }); } diff --git a/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java b/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java index 42ba25c8a04..a75831a2d83 100644 --- a/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java @@ -92,6 +92,25 @@ public class TestTermRangeQuery extends LuceneTestCase { assertEquals("C added - A, B, C in range", 3, hits.length); searcher.close(); } + + public void testAllDocs() throws Exception { + initializeIndex(new String[]{"A", "B", "C", "D"}); + IndexSearcher searcher = new IndexSearcher(dir, true); + TermRangeQuery query = new TermRangeQuery("content", null, null, true, true); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length); + query = new TermRangeQuery("content", null, null, false, false); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length); + query = new TermRangeQuery("content", "", null, true, false); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length); + // and now another one + query = new TermRangeQuery("content", "B", null, true, false); + assertTrue(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(3, searcher.search(query, null, 1000).scoreDocs.length); + searcher.close(); + } /** This test should not be here, but it tests the fuzzy query rewrite mode (TOP_TERMS_SCORING_BOOLEAN_REWRITE) * with constant score and checks, that only the lower end of terms is put into the range */ @@ -402,4 +421,9 @@ public class TestTermRangeQuery extends LuceneTestCase { //assertEquals("C added => A,B,,C in range", 3, hits.length()); searcher.close(); } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new TermRangeQuery("dummy", null, null, true, true).hasNewAPI); + } } diff --git a/lucene/src/test/org/apache/lucene/search/TestTermScorer.java b/lucene/src/test/org/apache/lucene/search/TestTermScorer.java index 48292505e7c..f14ff0de445 100644 --- a/lucene/src/test/org/apache/lucene/search/TestTermScorer.java +++ b/lucene/src/test/org/apache/lucene/search/TestTermScorer.java @@ -71,9 +71,8 @@ public class TestTermScorer extends LuceneTestCase Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = 
new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); //we have 2 documents with the term all in them, one document for all the other values final List docs = new ArrayList(); //must call next first @@ -137,9 +136,8 @@ public class TestTermScorer extends LuceneTestCase Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertTrue("score is not correct", ts.score() == 1.6931472f); assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); @@ -147,16 +145,15 @@ public class TestTermScorer extends LuceneTestCase assertTrue("next returned a doc and it should not have", ts.nextDoc() == DocIdSetIterator.NO_MORE_DOCS); } - public void testSkipTo() throws Exception { + public void testAdvance() throws Exception { Term allTerm = new Term(FIELD, "all"); TermQuery termQuery = new TermQuery(allTerm); Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); assertTrue("Didn't skip", ts.advance(3) != DocIdSetIterator.NO_MORE_DOCS); //The next doc should be doc 5 assertTrue("doc should be number 5", ts.docID() == 5); diff --git a/lucene/src/test/org/apache/lucene/search/TestWildcard.java b/lucene/src/test/org/apache/lucene/search/TestWildcard.java index 7cd5ce02470..3edbfb8be52 100644 --- a/lucene/src/test/org/apache/lucene/search/TestWildcard.java +++ b/lucene/src/test/org/apache/lucene/search/TestWildcard.java @@ -24,6 +24,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.Index; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; @@ -121,30 +122,12 @@ public class TestWildcard MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*")); assertMatches(searcher, wq, 2); - MultiTermQuery expected = new PrefixQuery(new Term("field", "prefix")); - wq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - wq.setBoost(0.1F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); + assertTrue(wq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); - wq.setBoost(0.2F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); - wq.setBoost(0.3F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); - wq.setBoost(0.4F); - expected.setRewriteMethod(wq.getRewriteMethod()); - 
expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); + wq = new WildcardQuery(new Term("field", "*")); + assertMatches(searcher, wq, 2); + assertFalse(wq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); + assertFalse(wq.getTermsEnum(searcher.getIndexReader()) instanceof AutomatonTermsEnum); } /** @@ -326,5 +309,62 @@ public class TestWildcard searcher.close(); } + @Deprecated + private static final class OldWildcardQuery extends MultiTermQuery { + final Term term; + OldWildcardQuery(Term term) { + this.term = term; + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new WildcardTermEnum(reader, term); + } + + @Override + public String toString(String field) { + return "OldWildcard(" + term.toString()+ ")"; + } + } + + @Deprecated + public void testDeprecatedTermEnum() throws Exception { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals"}); + IndexSearcher searcher = new IndexSearcher(indexStore, true); + Query query1 = new TermQuery(new Term("body", "metal")); + Query query2 = new OldWildcardQuery(new Term("body", "metal*")); + Query query3 = new OldWildcardQuery(new Term("body", "m*tal")); + Query query4 = new OldWildcardQuery(new Term("body", "m*tal*")); + Query query5 = new OldWildcardQuery(new Term("body", "m*tals")); + + BooleanQuery query6 = new BooleanQuery(); + query6.add(query5, BooleanClause.Occur.SHOULD); + + BooleanQuery query7 = new BooleanQuery(); + query7.add(query3, BooleanClause.Occur.SHOULD); + query7.add(query5, BooleanClause.Occur.SHOULD); + + // Queries do not automatically lower-case search terms: + Query query8 = new OldWildcardQuery(new Term("body", "M*tal*")); + + assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 2); + assertMatches(searcher, query3, 1); + assertMatches(searcher, query4, 2); + assertMatches(searcher, query5, 1); + assertMatches(searcher, query6, 1); + assertMatches(searcher, query7, 2); + assertMatches(searcher, query8, 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tall")), 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal")), 1); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal*")), 2); + } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new WildcardQuery(new Term("body", "metal*")).hasNewAPI); + assertFalse(new OldWildcardQuery(new Term("body", "metal*")).hasNewAPI); + } } diff --git a/lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java b/lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java new file mode 100644 index 00000000000..1262dd04c5f --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java @@ -0,0 +1,137 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Random; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Create an index with terms from 0000-9999. + * Generates random wildcards according to patterns, + * and validates the correct number of hits are returned. + */ +public class TestWildcardRandom extends LuceneTestCase { + private Searcher searcher; + private Random random; + + @Override + protected void setUp() throws Exception { + super.setUp(); + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), + IndexWriter.MaxFieldLength.UNLIMITED); + + Document doc = new Document(); + Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + + NumberFormat df = new DecimalFormat("0000"); + for (int i = 0; i < 10000; i++) { + field.setValue(df.format(i)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(dir); + } + + private char N() { + return (char) (0x30 + random.nextInt(10)); + } + + private String fillPattern(String wildcardPattern) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < wildcardPattern.length(); i++) { + switch(wildcardPattern.charAt(i)) { + case 'N': + sb.append(N()); + break; + default: + sb.append(wildcardPattern.charAt(i)); + } + } + return sb.toString(); + } + + private void assertPatternHits(String pattern, int numHits) throws Exception { + Query wq = new WildcardQuery(new Term("field", fillPattern(pattern))); + TopDocs docs = searcher.search(wq, 25); + assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits); + } + + @Override + protected void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + public void testWildcards() throws Exception { + random = newRandom(System.nanoTime()); + for (int i = 0; i < 100; i++) { + assertPatternHits("NNNN", 1); + assertPatternHits("?NNN", 10); + assertPatternHits("N?NN", 10); + assertPatternHits("NN?N", 10); + assertPatternHits("NNN?", 10); + } + + for (int i = 0; i < 10; i++) { + assertPatternHits("??NN", 100); + assertPatternHits("N??N", 100); + assertPatternHits("NN??", 100); + assertPatternHits("???N", 1000); + assertPatternHits("N???", 1000); + assertPatternHits("????", 10000); + + assertPatternHits("NNN*", 10); + assertPatternHits("NN*", 100); + assertPatternHits("N*", 1000); + assertPatternHits("*", 10000); + + assertPatternHits("*NNN", 10); + assertPatternHits("*NN", 100); + assertPatternHits("*N", 1000); + + assertPatternHits("N*NN", 10); + assertPatternHits("NN*N", 10); + + // combo of ? 
and * operators + assertPatternHits("?NN*", 100); + assertPatternHits("N?N*", 100); + assertPatternHits("NN?*", 100); + assertPatternHits("?N?*", 1000); + assertPatternHits("N??*", 1000); + + assertPatternHits("*NN?", 100); + assertPatternHits("*N??", 1000); + assertPatternHits("*???", 10000); + assertPatternHits("*?N?", 1000); + assertPatternHits("*??N", 1000); + } + } +} diff --git a/lucene/src/test/org/apache/lucene/search/function/TestOrdValues.java b/lucene/src/test/org/apache/lucene/search/function/TestOrdValues.java index 8f06b92f51a..2e0692088d7 100644 --- a/lucene/src/test/org/apache/lucene/search/function/TestOrdValues.java +++ b/lucene/src/test/org/apache/lucene/search/function/TestOrdValues.java @@ -62,9 +62,9 @@ public class TestOrdValues extends FunctionTestSetup { IndexSearcher s = new IndexSearcher(dir, true); ValueSource vs; if (inOrder) { - vs = new OrdFieldSource(field); + vs = new MultiValueSource(new OrdFieldSource(field)); } else { - vs = new ReverseOrdFieldSource(field); + vs = new MultiValueSource(new ReverseOrdFieldSource(field)); } Query q = new ValueSourceQuery(vs); diff --git a/lucene/src/test/org/apache/lucene/search/function/TestValueSource.java b/lucene/src/test/org/apache/lucene/search/function/TestValueSource.java new file mode 100644 index 00000000000..a296a0f8686 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/function/TestValueSource.java @@ -0,0 +1,64 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.*; +import org.apache.lucene.store.*; +import org.apache.lucene.search.*; +import org.apache.lucene.search.function.*; +import org.apache.lucene.analysis.*; +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; + +public class TestValueSource extends LuceneTestCase { + + public void testMultiValueSource() throws Exception { + Directory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + Document doc = new Document(); + Field f = new Field("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED); + doc.add(f); + + for(int i=0;i<17;i++) { + f.setValue(""+i); + w.addDocument(doc); + w.commit(); + } + + IndexReader r = w.getReader(); + w.close(); + + assertTrue(r.getSequentialSubReaders().length > 1); + + ValueSource s1 = new IntFieldSource("field"); + DocValues v1 = s1.getValues(r); + DocValues v2 = new MultiValueSource(s1).getValues(r); + + for(int i=0;i> it = clone.getAttributeClassesIterator(); - assertEquals("TermAttribute must be the first attribute", TermAttribute.class, it.next()); + assertEquals("FlagsAttribute must be the first attribute", FlagsAttribute.class, it.next()); assertEquals("TypeAttribute must be the second attribute", TypeAttribute.class, it.next()); assertFalse("No more attributes", it.hasNext()); - final TermAttribute termAtt2 = clone.getAttribute(TermAttribute.class); + final FlagsAttribute flagsAtt2 = clone.getAttribute(FlagsAttribute.class); final TypeAttribute typeAtt2 = clone.getAttribute(TypeAttribute.class); - assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt); + assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt); assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt); - assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt); + assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt); assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt); // test copy back - termAtt2.setTermBuffer("OtherTerm"); + flagsAtt2.setFlags(4711); typeAtt2.setType("OtherType"); clone.copyTo(src); - assertEquals("TermAttribute of original must now contain updated term", "OtherTerm", termAtt.term()); + assertEquals("FlagsAttribute of original must now contain updated term", 4711, flagsAtt.getFlags()); assertEquals("TypeAttribute of original must now contain updated type", "OtherType", typeAtt.type()); // verify again: - assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt); + assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt); assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt); - assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt); + assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt); assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt); } public void testToStringAndMultiAttributeImplementations() { AttributeSource src = new AttributeSource(); - TermAttribute termAtt = src.addAttribute(TermAttribute.class); + CharTermAttribute termAtt = src.addAttribute(CharTermAttribute.class); TypeAttribute typeAtt = src.addAttribute(TypeAttribute.class); - 
termAtt.setTermBuffer("TestTerm"); + termAtt.append("TestTerm"); typeAtt.setType("TestType"); assertEquals("Attributes should appear in original order", "("+termAtt.toString()+","+typeAtt.toString()+")", src.toString()); Iterator it = src.getAttributeImplsIterator(); @@ -125,23 +125,23 @@ public class TestAttributeSource extends LuceneTestCase { src = new AttributeSource(); src.addAttributeImpl(new Token()); - // this should not add a new attribute as Token implements TermAttribute, too - termAtt = src.addAttribute(TermAttribute.class); - assertTrue("TermAttribute should be implemented by Token", termAtt instanceof Token); + // this should not add a new attribute as Token implements CharTermAttribute, too + termAtt = src.addAttribute(CharTermAttribute.class); + assertTrue("CharTermAttribute should be implemented by Token", termAtt instanceof Token); // get the Token attribute and check, that it is the only one it = src.getAttributeImplsIterator(); Token tok = (Token) it.next(); assertFalse("There should be only one attribute implementation instance", it.hasNext()); - termAtt.setTermBuffer("TestTerm"); + termAtt.setEmpty().append("TestTerm"); assertEquals("Token should only printed once", "("+tok.toString()+")", src.toString()); } public void testDefaultAttributeFactory() throws Exception { AttributeSource src = new AttributeSource(); - assertTrue("TermAttribute is not implemented by TermAttributeImpl", - src.addAttribute(TermAttribute.class) instanceof TermAttributeImpl); + assertTrue("CharTermAttribute is not implemented by CharTermAttributeImpl", + src.addAttribute(CharTermAttribute.class) instanceof CharTermAttributeImpl); assertTrue("OffsetAttribute is not implemented by OffsetAttributeImpl", src.addAttribute(OffsetAttribute.class) instanceof OffsetAttributeImpl); assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl", diff --git a/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java b/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java index 8e82720e140..e59727fcd3d 100644 --- a/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java +++ b/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java @@ -25,33 +25,37 @@ public class TestNumericUtils extends LuceneTestCase { public void testLongConversionAndOrdering() throws Exception { // generate a series of encoded longs, each numerical one bigger than the one before - String last=null; + BytesRef last=null, act=new BytesRef(NumericUtils.BUF_SIZE_LONG); for (long l=-100000L; l<100000L; l++) { - String act=NumericUtils.longToPrefixCoded(l); + NumericUtils.longToPrefixCoded(l, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last", last.compareTo(act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is back and forward conversion works assertEquals("forward and back conversion should generate same long", l, NumericUtils.prefixCodedToLong(act)); // next step - last=act; + last = act; + act = new BytesRef(NumericUtils.BUF_SIZE_LONG); } } public void testIntConversionAndOrdering() throws Exception { // generate a series of encoded ints, each numerical one bigger than the one before - String last=null; + BytesRef last=null, act=new BytesRef(NumericUtils.BUF_SIZE_INT); for (int i=-100000; i<100000; i++) { - String act=NumericUtils.intToPrefixCoded(i); + 
NumericUtils.intToPrefixCoded(i, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last", last.compareTo(act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is back and forward conversion works assertEquals("forward and back conversion should generate same int", i, NumericUtils.prefixCodedToInt(act)); // next step last=act; + act = new BytesRef(NumericUtils.BUF_SIZE_INT); } } @@ -60,10 +64,11 @@ public class TestNumericUtils extends LuceneTestCase { Long.MIN_VALUE, Long.MIN_VALUE+1, Long.MIN_VALUE+2, -5003400000000L, -4000L, -3000L, -2000L, -1000L, -1L, 0L, 1L, 10L, 300L, 50006789999999999L, Long.MAX_VALUE-2, Long.MAX_VALUE-1, Long.MAX_VALUE }; - String[] prefixVals=new String[vals.length]; + BytesRef[] prefixVals=new BytesRef[vals.length]; for (int i=0; i 0) { + assertTrue(automata[n-1].subsetOf(automata[n])); + assertNotSame(automata[n-1], automata[n]); + } + // special checks for specific n + switch(n) { + case 0: + // easy, matches the string itself + assertEquals(BasicAutomata.makeString(s), automata[0]); + break; + case 1: + // generate a lev1 naively, and check the accepted lang is the same. + assertEquals(naiveLev1(s), automata[1]); + break; + default: + assertBruteForce(s, automata[n], n); + break; + } + } + } + + /** + * Return an automaton that accepts all 1-character insertions, deletions, and + * substitutions of s. + */ + private Automaton naiveLev1(String s) { + Automaton a = BasicAutomata.makeString(s); + a = BasicOperations.union(a, insertionsOf(s)); + MinimizationOperations.minimize(a); + a = BasicOperations.union(a, deletionsOf(s)); + MinimizationOperations.minimize(a); + a = BasicOperations.union(a, substitutionsOf(s)); + MinimizationOperations.minimize(a); + + return a; + } + + /** + * Return an automaton that accepts all 1-character insertions of s (inserting + * one character) + */ + private Automaton insertionsOf(String s) { + List list = new ArrayList(); + + for (int i = 0; i <= s.length(); i++) { + Automaton a = BasicAutomata.makeString(s.substring(0, i)); + a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar()); + a = BasicOperations.concatenate(a, BasicAutomata.makeString(s + .substring(i))); + list.add(a); + } + + Automaton a = BasicOperations.union(list); + MinimizationOperations.minimize(a); + return a; + } + + /** + * Return an automaton that accepts all 1-character deletions of s (deleting + * one character). 
+ */ + private Automaton deletionsOf(String s) { + List list = new ArrayList(); + + for (int i = 0; i < s.length(); i++) { + Automaton a = BasicAutomata.makeString(s.substring(0, i)); + a = BasicOperations.concatenate(a, BasicAutomata.makeString(s + .substring(i + 1))); + a.expandSingleton(); + list.add(a); + } + + Automaton a = BasicOperations.union(list); + MinimizationOperations.minimize(a); + return a; + } + + /** + * Return an automaton that accepts all 1-character substitutions of s + * (replacing one character) + */ + private Automaton substitutionsOf(String s) { + List list = new ArrayList(); + + for (int i = 0; i < s.length(); i++) { + Automaton a = BasicAutomata.makeString(s.substring(0, i)); + a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar()); + a = BasicOperations.concatenate(a, BasicAutomata.makeString(s + .substring(i + 1))); + list.add(a); + } + + Automaton a = BasicOperations.union(list); + MinimizationOperations.minimize(a); + return a; + } + + private void assertBruteForce(String input, Automaton dfa, int distance) { + RunAutomaton ra = new RunAutomaton(dfa); + int maxLen = input.length() + distance + 1; + int maxNum = (int) Math.pow(2, maxLen); + for (int i = 0; i < maxNum; i++) { + String encoded = Integer.toString(i, 2); + boolean accepts = ra.run(encoded); + if (accepts) { + assertTrue(getDistance(input, encoded) <= distance); + } else { + assertTrue(getDistance(input, encoded) > distance); + } + } + } + + //***************************** + // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String) + //***************************** + private int getDistance (String target, String other) { + char[] sa; + int n; + int p[]; //'previous' cost array, horizontally + int d[]; // cost array, horizontally + int _d[]; //placeholder to assist in swapping p and d + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length()+1 by t.length()+1, + we maintain two single-dimensional arrays of length s.length()+1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + sa = target.toCharArray(); + n = sa.length; + p = new int[n+1]; + d = new int[n+1]; + + final int m = other.length(); + if (n == 0 || m == 0) { + if (n == m) { + return 0; + } + else { + return Math.max(n, m); + } + } + + + // indexes into strings s and t + int i; // iterates through s + int j; // iterates through t + + char t_j; // jth character of t + + int cost; // cost + + for (i = 0; i<=n; i++) { + p[i] = i; + } + + for (j = 1; j<=m; j++) { + t_j = other.charAt(j-1); + d[0] = j; + + for (i=1; i<=n; i++) { + cost = sa[i-1]==t_j ? 
0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return Math.abs(p[n]); + } +} diff --git a/lucene/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/src/test/org/apache/lucene/util/packed/TestPackedInts.java new file mode 100644 index 00000000000..27898f62839 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -0,0 +1,225 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.*; +import org.apache.lucene.util.LuceneTestCase; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.io.IOException; + +public class TestPackedInts extends LuceneTestCase { + + private Random rnd; + + public void testBitsRequired() throws Exception { + assertEquals(61, PackedInts.bitsRequired((long)Math.pow(2, 61)-1)); + assertEquals(61, PackedInts.bitsRequired(0x1FFFFFFFFFFFFFFFL)); + assertEquals(62, PackedInts.bitsRequired(0x3FFFFFFFFFFFFFFFL)); + assertEquals(63, PackedInts.bitsRequired(0x7FFFFFFFFFFFFFFFL)); + } + + public void testMaxValues() throws Exception { + assertEquals("1 bit -> max == 1", + 1, PackedInts.maxValue(1)); + assertEquals("2 bit -> max == 3", + 3, PackedInts.maxValue(2)); + assertEquals("8 bit -> max == 255", + 255, PackedInts.maxValue(8)); + assertEquals("63 bit -> max == Long.MAX_VALUE", + Long.MAX_VALUE, PackedInts.maxValue(63)); + assertEquals("64 bit -> max == Long.MAX_VALUE (same as for 63 bit)", + Long.MAX_VALUE, PackedInts.maxValue(64)); + } + + public void testPackedInts() throws IOException { + rnd = newRandom(); + for(int iter=0;iter<5;iter++) { + long ceil = 2; + for(int nbits=1;nbits<63;nbits++) { + final int valueCount = 100+rnd.nextInt(500); + final Directory d = new MockRAMDirectory(); + + IndexOutput out = d.createOutput("out.bin"); + PackedInts.Writer w = PackedInts.getWriter( + out, valueCount, nbits); + + final long[] values = new long[valueCount]; + for(int i=0;i packedInts = + createPackedInts(VALUE_COUNT, BITS_PER_VALUE); + for (PackedInts.Mutable packedInt: packedInts) { + for (int i = 0 ; i < packedInt.size() ; i++) { + packedInt.set(i, i+1); + } + } + assertListEquality(packedInts); + } + + public void testRandomEquality() { + final int[] VALUE_COUNTS = new int[]{0, 1, 5, 8, 100, 500}; + final int MIN_BITS_PER_VALUE = 1; + final int MAX_BITS_PER_VALUE = 64; + + rnd = newRandom(); + + for (int valueCount: VALUE_COUNTS) { + for (int 
bitsPerValue = MIN_BITS_PER_VALUE ; + bitsPerValue <= MAX_BITS_PER_VALUE ; + bitsPerValue++) { + assertRandomEquality(valueCount, bitsPerValue, rnd.nextLong()); + } + } + } + + private void assertRandomEquality(int valueCount, int bitsPerValue, long randomSeed) { + List packedInts = createPackedInts(valueCount, bitsPerValue); + for (PackedInts.Mutable packedInt: packedInts) { + try { + fill(packedInt, (long)(Math.pow(2, bitsPerValue)-1), randomSeed); + } catch (Exception e) { + e.printStackTrace(System.err); + fail(String.format( + "Exception while filling %s: valueCount=%d, bitsPerValue=%s", + packedInt.getClass().getSimpleName(), + valueCount, bitsPerValue)); + } + } + assertListEquality(packedInts); + } + + private List createPackedInts( + int valueCount, int bitsPerValue) { + List packedInts = new ArrayList(); + if (bitsPerValue <= 8) { + packedInts.add(new Direct8(valueCount)); + } + if (bitsPerValue <= 16) { + packedInts.add(new Direct16(valueCount)); + } + if (bitsPerValue <= 31) { + packedInts.add(new Packed32(valueCount, bitsPerValue)); + } + if (bitsPerValue <= 32) { + packedInts.add(new Direct32(valueCount)); + } + if (bitsPerValue <= 63) { + packedInts.add(new Packed64(valueCount, bitsPerValue)); + } + packedInts.add(new Direct64(valueCount)); + return packedInts; + } + + private void fill(PackedInts.Mutable packedInt, long maxValue, long randomSeed) { + Random rnd2 = new Random(randomSeed); + maxValue++; + for (int i = 0 ; i < packedInt.size() ; i++) { + long value = Math.abs(rnd2.nextLong() % maxValue); + packedInt.set(i, value); + assertEquals(String.format( + "The set/get of the value at index %d should match for %s", + i, packedInt.getClass().getSimpleName()), + value, packedInt.get(i)); + } + } + + private void assertListEquality( + List packedInts) { + assertListEquality("", packedInts); + } + + private void assertListEquality( + String message, List packedInts) { + if (packedInts.size() == 0) { + return; + } + PackedInts.Reader base = packedInts.get(0); + int valueCount = base.size(); + for (PackedInts.Reader packedInt: packedInts) { + assertEquals(message + ". The number of values should be the same ", + valueCount, packedInt.size()); + } + for (int i = 0 ; i < valueCount ; i++) { + for (int j = 1 ; j < packedInts.size() ; j++) { + assertEquals(String.format( + "%s. 
The value at index %d should be the same for %s and %s", + message, i, base.getClass().getSimpleName(), + packedInts.get(j).getClass().getSimpleName()), + base.get(i), packedInts.get(j).get(i)); + } + } + } + + public void testSingleValue() throws Exception { + Directory dir = new MockRAMDirectory(); + IndexOutput out = dir.createOutput("out"); + PackedInts.Writer w = PackedInts.getWriter(out, 1, 8); + w.add(17); + w.finish(); + final long end = out.getFilePointer(); + out.close(); + + IndexInput in = dir.openInput("out"); + PackedInts.Reader r = PackedInts.getReader(in); + assertEquals(end, in.getFilePointer()); + in.close(); + + dir.close(); + } +} diff --git a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java index e67f339e5b7..beb8bab5e70 100644 --- a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java +++ b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java @@ -20,12 +20,8 @@ import org.apache.commons.io.IOUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.SolrParams; @@ -139,15 +135,29 @@ public class AnalysisRequestHandler extends RequestHandlerBase { // outer is namedList since order of tokens is important NamedList> tokens = new NamedList>(); // TODO: support custom attributes - TermAttribute termAtt = (TermAttribute) tstream.addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) tstream.addAttribute(OffsetAttribute.class); - TypeAttribute typeAtt = (TypeAttribute) tstream.addAttribute(TypeAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tstream.addAttribute(PositionIncrementAttribute.class); + TermAttribute termAtt = null; + TermToBytesRefAttribute bytesAtt = null; + if (tstream.hasAttribute(TermAttribute.class)) { + termAtt = tstream.getAttribute(TermAttribute.class); + } else if (tstream.hasAttribute(TermToBytesRefAttribute.class)) { + bytesAtt = tstream.getAttribute(TermToBytesRefAttribute.class); + } + final OffsetAttribute offsetAtt = tstream.addAttribute(OffsetAttribute.class); + final TypeAttribute typeAtt = tstream.addAttribute(TypeAttribute.class); + final PositionIncrementAttribute posIncAtt = tstream.addAttribute(PositionIncrementAttribute.class); + final BytesRef bytes = new BytesRef(); while (tstream.incrementToken()) { NamedList token = new SimpleOrderedMap(); tokens.add("token", token); - token.add("value", new String(termAtt.termBuffer(), 0, termAtt.termLength())); + if (termAtt != null) { + token.add("value", termAtt.term()); + } + if (bytesAtt != null) { + bytesAtt.toBytesRef(bytes); + // TODO: This is incorrect when numeric fields change in later lucene versions. It should use BytesRef directly! 
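The handler change here reads a token's text either from the legacy char-based TermAttribute or, when the stream only exposes the flex TermToBytesRefAttribute (as NumericTokenStream now does), from a reusable BytesRef. Below is a minimal standalone consumer sketch of that pattern; the class name TermDumper, the printTerms helper, and the use of WhitespaceAnalyzer are illustrative assumptions, while the attribute calls themselves are the ones the patch introduces.

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

public class TermDumper {
  // Illustrative helper (not part of the patch): print each term of the stream,
  // reading it as bytes when only TermToBytesRefAttribute is available.
  static void printTerms(TokenStream ts) throws Exception {
    TermAttribute termAtt = null;
    TermToBytesRefAttribute bytesAtt = null;
    if (ts.hasAttribute(TermAttribute.class)) {
      termAtt = ts.getAttribute(TermAttribute.class);
    } else if (ts.hasAttribute(TermToBytesRefAttribute.class)) {
      bytesAtt = ts.getAttribute(TermToBytesRefAttribute.class);
    }
    final BytesRef bytes = new BytesRef();
    ts.reset();
    while (ts.incrementToken()) {
      if (termAtt != null) {
        System.out.println(termAtt.term());        // char-based term text
      } else if (bytesAtt != null) {
        bytesAtt.toBytesRef(bytes);                // fill the reusable BytesRef
        System.out.println(bytes.utf8ToString());  // lossy for non-UTF-8 terms, as the TODO above notes
      }
    }
    ts.end();
    ts.close();
  }

  public static void main(String[] args) throws Exception {
    Reader r = new StringReader("hello flex world");
    printTerms(new WhitespaceAnalyzer().tokenStream("field", r));
  }
}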
+ token.add("value", bytes.utf8ToString()); + } token.add("start", offsetAtt.startOffset()); token.add("end", offsetAtt.endOffset()); token.add("posInc", posIncAtt.getPositionIncrement()); diff --git a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java index f8f9529b375..1b2e8b14dcb 100644 --- a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java +++ b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java @@ -22,12 +22,8 @@ import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.util.BytesRef; import org.apache.solr.analysis.CharFilterFactory; import org.apache.solr.analysis.TokenFilterFactory; import org.apache.solr.analysis.TokenizerChain; @@ -147,25 +143,33 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase { */ private List analyzeTokenStream(TokenStream tokenStream) { List tokens = new ArrayList(); - - // TODO change this API to support custom attributes - TermAttribute termAtt = (TermAttribute) - tokenStream.addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) - tokenStream.addAttribute(OffsetAttribute.class); - TypeAttribute typeAtt = (TypeAttribute) - tokenStream.addAttribute(TypeAttribute.class); - FlagsAttribute flagsAtt = (FlagsAttribute) - tokenStream.addAttribute(FlagsAttribute.class); - PayloadAttribute payloadAtt = (PayloadAttribute) - tokenStream.addAttribute(PayloadAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) - tokenStream.addAttribute(PositionIncrementAttribute.class); + // TODO change this API to support custom attributes + TermAttribute termAtt = null; + TermToBytesRefAttribute bytesAtt = null; + if (tokenStream.hasAttribute(TermAttribute.class)) { + termAtt = tokenStream.getAttribute(TermAttribute.class); + } else if (tokenStream.hasAttribute(TermToBytesRefAttribute.class)) { + bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class); + } + final OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); + final TypeAttribute typeAtt = tokenStream.addAttribute(TypeAttribute.class); + final PositionIncrementAttribute posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); + final FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class); + final PayloadAttribute payloadAtt = tokenStream.addAttribute(PayloadAttribute.class); + + final BytesRef bytes = new BytesRef(); try { while (tokenStream.incrementToken()) { Token token = new Token(); - token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength()); + if (termAtt != null) { + token.setTermBuffer(termAtt.term()); + } + if (bytesAtt != null) { + bytesAtt.toBytesRef(bytes); + // TODO: This is incorrect when numeric fields change in later lucene versions. It should use BytesRef directly! 
+ token.setTermBuffer(bytes.utf8ToString()); + } token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); token.setType(typeAtt.type()); token.setFlags(flagsAtt.getFlags()); diff --git a/solr/src/java/org/apache/solr/response/PHPSerializedResponseWriter.java b/solr/src/java/org/apache/solr/response/PHPSerializedResponseWriter.java index 3ca560eb285..2bfa205b9c7 100755 --- a/solr/src/java/org/apache/solr/response/PHPSerializedResponseWriter.java +++ b/solr/src/java/org/apache/solr/response/PHPSerializedResponseWriter.java @@ -23,6 +23,7 @@ import java.util.*; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.util.NamedList; @@ -80,12 +81,12 @@ public class PHPSerializedResponseWriter implements QueryResponseWriter { class PHPSerializedWriter extends JSONWriter { final private boolean CESU8; - final UnicodeUtil.UTF8Result utf8; + final BytesRef utf8; public PHPSerializedWriter(Writer writer, SolrQueryRequest req, SolrQueryResponse rsp, boolean CESU8) { super(writer, req, rsp); this.CESU8 = CESU8; - this.utf8 = CESU8 ? null : new UnicodeUtil.UTF8Result(); + this.utf8 = CESU8 ? null : new BytesRef(10); // never indent serialized PHP data doIndent = false; } diff --git a/solr/src/java/org/apache/solr/schema/TrieDateField.java b/solr/src/java/org/apache/solr/schema/TrieDateField.java index 8379431aa89..0f313e47262 100755 --- a/solr/src/java/org/apache/solr/schema/TrieDateField.java +++ b/solr/src/java/org/apache/solr/schema/TrieDateField.java @@ -32,6 +32,7 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.Query; import org.apache.lucene.search.NumericRangeQuery; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.NumericTokenStream; @@ -126,7 +127,10 @@ public class TrieDateField extends DateField { @Override public String readableToIndexed(String val) { - return NumericUtils.longToPrefixCoded(super.parseMath(null, val).getTime()); + // TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts! 
+ BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); + NumericUtils.longToPrefixCoded(super.parseMath(null, val).getTime(), 0, bytes); + return bytes.utf8ToString(); } @Override @@ -142,7 +146,8 @@ public class TrieDateField extends DateField { } @Override - public String indexedToReadable(String indexedForm) { + public String indexedToReadable(String _indexedForm) { + final BytesRef indexedForm = new BytesRef(_indexedForm); return super.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ); } diff --git a/solr/src/java/org/apache/solr/schema/TrieField.java b/solr/src/java/org/apache/solr/schema/TrieField.java index e5b10678ed5..850bdbe12c1 100644 --- a/solr/src/java/org/apache/solr/schema/TrieField.java +++ b/solr/src/java/org/apache/solr/schema/TrieField.java @@ -19,6 +19,7 @@ package org.apache.solr.schema; import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.Field; import org.apache.lucene.search.*; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.NumericTokenStream; @@ -322,20 +323,28 @@ public class TrieField extends FieldType { @Override public String readableToIndexed(String val) { + // TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts! + BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); switch (type) { case INTEGER: - return NumericUtils.intToPrefixCoded(Integer.parseInt(val)); + NumericUtils.intToPrefixCoded(Integer.parseInt(val), 0, bytes); + break; case FLOAT: - return NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(val))); + NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(val)), 0, bytes); + break; case LONG: - return NumericUtils.longToPrefixCoded(Long.parseLong(val)); + NumericUtils.longToPrefixCoded(Long.parseLong(val), 0, bytes); + break; case DOUBLE: - return NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(val))); + NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(val)), 0, bytes); + break; case DATE: - return NumericUtils.longToPrefixCoded(dateField.parseMath(null, val).getTime()); + NumericUtils.longToPrefixCoded(dateField.parseMath(null, val).getTime(), 0, bytes); + break; default: throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type); } + return bytes.utf8ToString(); } @@ -371,7 +380,8 @@ public class TrieField extends FieldType { } @Override - public String indexedToReadable(String indexedForm) { + public String indexedToReadable(String _indexedForm) { + final BytesRef indexedForm = new BytesRef(_indexedForm); switch (type) { case INTEGER: return Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) ); diff --git a/solr/src/test/org/apache/solr/search/TestDocSet.java b/solr/src/test/org/apache/solr/search/TestDocSet.java index 920f5946fa2..0c061affd64 100644 --- a/solr/src/test/org/apache/solr/search/TestDocSet.java +++ b/solr/src/test/org/apache/solr/search/TestDocSet.java @@ -355,7 +355,7 @@ public class TestDocSet extends TestCase { return r; } - public IndexReader dummyMultiReader(int nSeg, int maxDoc) { + public IndexReader dummyMultiReader(int nSeg, int maxDoc) throws IOException { if (nSeg==1 && rand.nextBoolean()) return dummyIndexReader(rand.nextInt(maxDoc)); IndexReader[] subs = new IndexReader[rand.nextInt(nSeg)+1];
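The new TestPackedInts above exercises a write/read round trip of packed integers through a Directory. Below is a condensed standalone version of the same round trip, assuming the PackedInts API the test uses (bitsRequired, getWriter/add/finish, getReader/get); the class name PackedIntsDemo, the file name, and the sample values are illustrative, and a plain RAMDirectory stands in for the test's MockRAMDirectory.

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.packed.PackedInts;

public class PackedIntsDemo {
  public static void main(String[] args) throws Exception {
    long[] values = { 17, 42, 255, 0, 9 };
    // bitsRequired picks the narrowest width that can hold the largest value.
    int bitsPerValue = PackedInts.bitsRequired(255);

    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("packed.bin");
    PackedInts.Writer writer = PackedInts.getWriter(out, values.length, bitsPerValue);
    for (long v : values) {
      writer.add(v);
    }
    writer.finish();
    out.close();

    IndexInput in = dir.openInput("packed.bin");
    PackedInts.Reader reader = PackedInts.getReader(in);
    for (int i = 0; i < reader.size(); i++) {
      System.out.println(i + " -> " + reader.get(i));  // same values come back, in order
    }
    in.close();
    dir.close();
  }
}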