LUCENE-5915: remove pulsing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1621961 13f79535-47bb-0310-9956-ffa450edef68

parent ec0a99552d
commit 7e69874ea5
Modified: lucene/CHANGES.txt
@@ -88,6 +88,8 @@ Other
 * LUCENE-5858: Moved compatibility codecs to 'lucene-backward-codecs.jar'.
   (Adrien Grand, Robert Muir)
 
+* LUCENE-5915: Remove Pulsing postings format. (Robert Muir)
+
 ======================= Lucene 4.11.0 =======================
 
 New Features
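For context, pulsing was an opt-in postings format: before this commit an application chose it per field by overriding the codec's postings-format lookup. A minimal sketch of that pre-removal wiring, assuming the Lucene 4.10 default codec (Lucene410Codec) and the Pulsing41PostingsFormat deleted below; the field name "id" is illustrative, not from this commit:

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene410.Lucene410Codec;
import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;

// Sketch only: opts the "id" field into pulsing; every other field keeps the default.
public class PulsingPerFieldCodec extends Lucene410Codec {
  private final PostingsFormat pulsing = new Pulsing41PostingsFormat(); // inlines docFreq==1 terms

  @Override
  public PostingsFormat getPostingsFormatForField(String field) {
    return "id".equals(field) ? pulsing : super.getPostingsFormatForField(field);
  }
}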
Modified: org/apache/lucene/codecs/memory/DirectPostingsFormat.java
@@ -839,7 +839,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
 
     @Override
     public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) {
-      // TODO: implement reuse, something like Pulsing:
+      // TODO: implement reuse
       // it's hairy!
 
       if (terms[termOrd] instanceof LowFreqTerm) {
@@ -916,7 +916,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
         return null;
       }
 
-      // TODO: implement reuse, something like Pulsing:
+      // TODO: implement reuse
       // it's hairy!
 
       if (terms[termOrd] instanceof LowFreqTerm) {
@@ -1437,7 +1437,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
 
     @Override
     public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) {
-      // TODO: implement reuse, something like Pulsing:
+      // TODO: implement reuse
       // it's hairy!
 
      if (terms[termOrd] instanceof LowFreqTerm) {
@@ -1473,7 +1473,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
         return null;
       }
 
-      // TODO: implement reuse, something like Pulsing:
+      // TODO: implement reuse
       // it's hairy!
 
       if (terms[termOrd] instanceof LowFreqTerm) {
Deleted: org/apache/lucene/codecs/memory/FSTOrdPulsing41PostingsFormat.java
@@ -1,88 +0,0 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsBaseFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/** FSTOrd + Pulsing41
 *  @lucene.experimental */

public class FSTOrdPulsing41PostingsFormat extends PostingsFormat {
  private final PostingsBaseFormat wrappedPostingsBaseFormat;
  private final int freqCutoff;

  public FSTOrdPulsing41PostingsFormat() {
    this(1);
  }

  public FSTOrdPulsing41PostingsFormat(int freqCutoff) {
    super("FSTOrdPulsing41");
    this.wrappedPostingsBaseFormat = new Lucene41PostingsBaseFormat();
    this.freqCutoff = freqCutoff;
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docsWriter = null;
    PostingsWriterBase pulsingWriter = null;

    boolean success = false;
    try {
      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
      pulsingWriter = new PulsingPostingsWriter(state, freqCutoff, docsWriter);
      FieldsConsumer ret = new FSTOrdTermsWriter(state, pulsingWriter);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
      }
    }
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase docsReader = null;
    PostingsReaderBase pulsingReader = null;
    boolean success = false;
    try {
      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
      pulsingReader = new PulsingPostingsReader(state, docsReader);
      FieldsProducer ret = new FSTOrdTermsReader(state, pulsingReader);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
      }
    }
  }
}
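Both fieldsConsumer and fieldsProducer above use Lucene's "success flag" idiom: resources are closed only on the failure path, because on success ownership transfers to the returned consumer/producer. A self-contained sketch of the same idiom with plain java.io types; open and combine are hypothetical stand-ins for postingsWriterBase(...) and the terms writer:

import java.io.Closeable;
import java.io.IOException;

public class SuccessFlagIdiom {
  // Hypothetical stand-ins for the two resources being chained:
  static Closeable open(String name) throws IOException { return () -> {}; }
  static Closeable combine(Closeable a, Closeable b) { return () -> { a.close(); b.close(); }; }

  static Closeable openBoth() throws IOException {
    Closeable a = null, b = null;
    boolean success = false;
    try {
      a = open("docs");
      b = open("pulsing");
      Closeable ret = combine(a, b);
      success = true;
      return ret;                 // success: caller now owns a and b via ret
    } finally {
      if (!success) {             // failure: close what we opened, suppressing errors,
        for (Closeable c : new Closeable[] {a, b}) {   // like IOUtils.closeWhileHandlingException
          if (c != null) {
            try { c.close(); } catch (IOException suppressed) {}
          }
        }
      }
    }
  }
}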
Deleted: org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java
@@ -1,89 +0,0 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsBaseFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/** FST + Pulsing41, test only, since
 *  FST does no delta encoding here!
 *  @lucene.experimental */

public class FSTPulsing41PostingsFormat extends PostingsFormat {
  private final PostingsBaseFormat wrappedPostingsBaseFormat;
  private final int freqCutoff;

  public FSTPulsing41PostingsFormat() {
    this(1);
  }

  public FSTPulsing41PostingsFormat(int freqCutoff) {
    super("FSTPulsing41");
    this.wrappedPostingsBaseFormat = new Lucene41PostingsBaseFormat();
    this.freqCutoff = freqCutoff;
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docsWriter = null;
    PostingsWriterBase pulsingWriter = null;

    boolean success = false;
    try {
      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
      pulsingWriter = new PulsingPostingsWriter(state, freqCutoff, docsWriter);
      FieldsConsumer ret = new FSTTermsWriter(state, pulsingWriter);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
      }
    }
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase docsReader = null;
    PostingsReaderBase pulsingReader = null;
    boolean success = false;
    try {
      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
      pulsingReader = new PulsingPostingsReader(state, docsReader);
      FieldsProducer ret = new FSTTermsReader(state, pulsingReader);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
      }
    }
  }
}
Deleted: org/apache/lucene/codecs/pulsing/Pulsing41PostingsFormat.java
@@ -1,45 +0,0 @@
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs

/**
 * Concrete pulsing implementation over {@link Lucene41PostingsFormat}.
 *
 * @lucene.experimental
 */
public class Pulsing41PostingsFormat extends PulsingPostingsFormat {

  /** Inlines docFreq=1 terms, otherwise uses the normal "Lucene41" format. */
  public Pulsing41PostingsFormat() {
    this(1);
  }

  /** Inlines docFreq=<code>freqCutoff</code> terms, otherwise uses the normal "Lucene41" format. */
  public Pulsing41PostingsFormat(int freqCutoff) {
    this(freqCutoff, BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /** Inlines docFreq=<code>freqCutoff</code> terms, otherwise uses the normal "Lucene41" format. */
  public Pulsing41PostingsFormat(int freqCutoff, int minBlockSize, int maxBlockSize) {
    super("Pulsing41", new Lucene41PostingsBaseFormat(), freqCutoff, minBlockSize, maxBlockSize);
  }
}
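A quick sketch of what the three constructors above expose. The literal values are illustrative; 25 and 48 mirror BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE and DEFAULT_MAX_BLOCK_SIZE, which the shorter constructors fall back to:

import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;

public class Pulsing41Construction {
  public static void main(String[] args) {
    new Pulsing41PostingsFormat();            // cutoff 1: inline only docFreq==1 terms
    new Pulsing41PostingsFormat(2);           // inline terms whose occurrence count is <= 2
    new Pulsing41PostingsFormat(2, 25, 48);   // also pass the BlockTree min/max block sizes explicitly
  }
}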
Deleted: org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java
@@ -1,119 +0,0 @@
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsBaseFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/** This postings format "inlines" the postings for terms that have
 *  low docFreq.  It wraps another postings format, which is used for
 *  writing the non-inlined terms.
 *
 *  @lucene.experimental */

public abstract class PulsingPostingsFormat extends PostingsFormat {

  private final int freqCutoff;
  private final int minBlockSize;
  private final int maxBlockSize;
  private final PostingsBaseFormat wrappedPostingsBaseFormat;

  public PulsingPostingsFormat(String name, PostingsBaseFormat wrappedPostingsBaseFormat, int freqCutoff) {
    this(name, wrappedPostingsBaseFormat, freqCutoff, BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /** Terms with freq <= freqCutoff are inlined into terms
   *  dict. */
  public PulsingPostingsFormat(String name, PostingsBaseFormat wrappedPostingsBaseFormat, int freqCutoff, int minBlockSize, int maxBlockSize) {
    super(name);
    this.freqCutoff = freqCutoff;
    this.minBlockSize = minBlockSize;
    assert minBlockSize > 1;
    this.maxBlockSize = maxBlockSize;
    this.wrappedPostingsBaseFormat = wrappedPostingsBaseFormat;
  }

  @Override
  public String toString() {
    return getName() + "(freqCutoff=" + freqCutoff + " minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docsWriter = null;

    // Terms that have <= freqCutoff number of docs are
    // "pulsed" (inlined):
    PostingsWriterBase pulsingWriter = null;

    // Terms dict
    boolean success = false;
    try {
      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);

      // Terms that have <= freqCutoff number of docs are
      // "pulsed" (inlined):
      pulsingWriter = new PulsingPostingsWriter(state, freqCutoff, docsWriter);
      FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
      }
    }
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase docsReader = null;
    PostingsReaderBase pulsingReader = null;

    boolean success = false;
    try {
      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
      pulsingReader = new PulsingPostingsReader(state, docsReader);
      FieldsProducer ret = new BlockTreeTermsReader(
                                                    state.directory, state.fieldInfos, state.segmentInfo,
                                                    pulsingReader,
                                                    state.context,
                                                    state.segmentSuffix);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
      }
    }
  }

  public int getFreqCutoff() {
    return freqCutoff;
  }
}
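The inline/delegate decision the javadoc above describes reduces to a single comparison, and the reader's decodeTerm (below) makes the mirror-image check at read time. A minimal standalone restatement, with illustrative counts:

public class PulsingDecision {
  /** count is totalTermFreq when positions are indexed, else docFreq. */
  static boolean inline(long count, int freqCutoff) {
    return count <= freqCutoff;
  }

  public static void main(String[] args) {
    int cutoff = 1;                            // Pulsing41PostingsFormat's default
    System.out.println(inline(1, cutoff));     // true: primary-key-style term, inlined in the terms dict
    System.out.println(inline(1000, cutoff));  // false: left to the wrapped Lucene41 format
  }
}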
Deleted: org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java
@@ -1,661 +0,0 @@
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.IdentityHashMap;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

/** Concrete class that reads the current doc/freq/skip
 *  postings format
 *  @lucene.experimental */

// TODO: -- should we switch "hasProx" higher up?  and
// create two separate docs readers, one that also reads
// prox and one that doesn't?

public class PulsingPostingsReader extends PostingsReaderBase {

  // Fallback reader for non-pulsed terms:
  final PostingsReaderBase wrappedPostingsReader;
  final SegmentReadState segmentState;
  int maxPositions;
  int version;
  TreeMap<Integer, Integer> fields;

  public PulsingPostingsReader(SegmentReadState state, PostingsReaderBase wrappedPostingsReader) {
    this.wrappedPostingsReader = wrappedPostingsReader;
    this.segmentState = state;
  }

  @Override
  public void init(IndexInput termsIn) throws IOException {
    version = CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC,
                                    PulsingPostingsWriter.VERSION_START,
                                    PulsingPostingsWriter.VERSION_CURRENT);
    maxPositions = termsIn.readVInt();
    wrappedPostingsReader.init(termsIn);
    if (wrappedPostingsReader instanceof PulsingPostingsReader ||
        version < PulsingPostingsWriter.VERSION_META_ARRAY) {
      fields = null;
    } else {
      fields = new TreeMap<>();
      String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, PulsingPostingsWriter.SUMMARY_EXTENSION);
      IndexInput in = null;
      try {
        in = segmentState.directory.openInput(summaryFileName, segmentState.context);
        CodecUtil.checkHeader(in, PulsingPostingsWriter.CODEC, version,
                              PulsingPostingsWriter.VERSION_CURRENT);
        int numField = in.readVInt();
        for (int i = 0; i < numField; i++) {
          int fieldNum = in.readVInt();
          int longsSize = in.readVInt();
          fields.put(fieldNum, longsSize);
        }
      } finally {
        IOUtils.closeWhileHandlingException(in);
      }
    }
  }

  private static class PulsingTermState extends BlockTermState {
    private boolean absolute = false;
    private long[] longs;
    private byte[] postings;
    private int postingsSize;                     // -1 if this term was not inlined
    private BlockTermState wrappedTermState;

    @Override
    public PulsingTermState clone() {
      PulsingTermState clone;
      clone = (PulsingTermState) super.clone();
      if (postingsSize != -1) {
        clone.postings = new byte[postingsSize];
        System.arraycopy(postings, 0, clone.postings, 0, postingsSize);
      } else {
        assert wrappedTermState != null;
        clone.wrappedTermState = (BlockTermState) wrappedTermState.clone();
        clone.absolute = absolute;
        if (longs != null) {
          clone.longs = new long[longs.length];
          System.arraycopy(longs, 0, clone.longs, 0, longs.length);
        }
      }
      return clone;
    }

    @Override
    public void copyFrom(TermState _other) {
      super.copyFrom(_other);
      PulsingTermState other = (PulsingTermState) _other;
      postingsSize = other.postingsSize;
      if (other.postingsSize != -1) {
        if (postings == null || postings.length < other.postingsSize) {
          postings = new byte[ArrayUtil.oversize(other.postingsSize, 1)];
        }
        System.arraycopy(other.postings, 0, postings, 0, other.postingsSize);
      } else {
        wrappedTermState.copyFrom(other.wrappedTermState);
      }
    }

    @Override
    public String toString() {
      if (postingsSize == -1) {
        return "PulsingTermState: not inlined: wrapped=" + wrappedTermState;
      } else {
        return "PulsingTermState: inlined size=" + postingsSize + " " + super.toString();
      }
    }
  }

  @Override
  public BlockTermState newTermState() throws IOException {
    PulsingTermState state = new PulsingTermState();
    state.wrappedTermState = wrappedPostingsReader.newTermState();
    return state;
  }

  @Override
  public void decodeTerm(long[] empty, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException {
    //System.out.println("PR nextTerm");
    PulsingTermState termState = (PulsingTermState) _termState;
    assert empty.length == 0;
    termState.absolute = termState.absolute || absolute;
    // if we have positions, its total TF, otherwise its computed based on docFreq.
    long count = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? termState.totalTermFreq : termState.docFreq;
    //System.out.println("  count=" + count + " threshold=" + maxPositions);

    if (count <= maxPositions) {
      // Inlined into terms dict -- just read the byte[] blob in,
      // but don't decode it now (we only decode when a DocsEnum
      // or D&PEnum is pulled):
      termState.postingsSize = in.readVInt();
      if (termState.postings == null || termState.postings.length < termState.postingsSize) {
        termState.postings = new byte[ArrayUtil.oversize(termState.postingsSize, 1)];
      }
      // TODO: sort of silly to copy from one big byte[]
      // (the blob holding all inlined terms' blobs for
      // current term block) into another byte[] (just the
      // blob for this term)...
      in.readBytes(termState.postings, 0, termState.postingsSize);
      //System.out.println("  inlined bytes=" + termState.postingsSize);
      termState.absolute = termState.absolute || absolute;
    } else {
      //System.out.println("  not inlined");
      final int longsSize = fields == null ? 0 : fields.get(fieldInfo.number);
      if (termState.longs == null) {
        termState.longs = new long[longsSize];
      }
      for (int i = 0; i < longsSize; i++) {
        termState.longs[i] = in.readVLong();
      }
      termState.postingsSize = -1;
      termState.wrappedTermState.docFreq = termState.docFreq;
      termState.wrappedTermState.totalTermFreq = termState.totalTermFreq;
      wrappedPostingsReader.decodeTerm(termState.longs, in, fieldInfo, termState.wrappedTermState, termState.absolute);
      termState.absolute = false;
    }
  }

  @Override
  public DocsEnum docs(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
    PulsingTermState termState = (PulsingTermState) _termState;
    if (termState.postingsSize != -1) {
      PulsingDocsEnum postings;
      if (reuse instanceof PulsingDocsEnum) {
        postings = (PulsingDocsEnum) reuse;
        if (!postings.canReuse(field)) {
          postings = new PulsingDocsEnum(field);
        }
      } else {
        // the 'reuse' is actually the wrapped enum
        PulsingDocsEnum previous = (PulsingDocsEnum) getOther(reuse);
        if (previous != null && previous.canReuse(field)) {
          postings = previous;
        } else {
          postings = new PulsingDocsEnum(field);
        }
      }
      if (reuse != postings) {
        setOther(postings, reuse); // postings.other = reuse
      }
      return postings.reset(liveDocs, termState);
    } else {
      if (reuse instanceof PulsingDocsEnum) {
        DocsEnum wrapped = wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, getOther(reuse), flags);
        setOther(wrapped, reuse); // wrapped.other = reuse
        return wrapped;
      } else {
        return wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, reuse, flags);
      }
    }
  }

  @Override
  public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse,
                                               int flags) throws IOException {

    final PulsingTermState termState = (PulsingTermState) _termState;

    if (termState.postingsSize != -1) {
      PulsingDocsAndPositionsEnum postings;
      if (reuse instanceof PulsingDocsAndPositionsEnum) {
        postings = (PulsingDocsAndPositionsEnum) reuse;
        if (!postings.canReuse(field)) {
          postings = new PulsingDocsAndPositionsEnum(field);
        }
      } else {
        // the 'reuse' is actually the wrapped enum
        PulsingDocsAndPositionsEnum previous = (PulsingDocsAndPositionsEnum) getOther(reuse);
        if (previous != null && previous.canReuse(field)) {
          postings = previous;
        } else {
          postings = new PulsingDocsAndPositionsEnum(field);
        }
      }
      if (reuse != postings) {
        setOther(postings, reuse); // postings.other = reuse
      }
      return postings.reset(liveDocs, termState);
    } else {
      if (reuse instanceof PulsingDocsAndPositionsEnum) {
        DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse),
                                                                              flags);
        setOther(wrapped, reuse); // wrapped.other = reuse
        return wrapped;
      } else {
        return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse, flags);
      }
    }
  }

  private static class PulsingDocsEnum extends DocsEnum {
    private byte[] postingsBytes;
    private final ByteArrayDataInput postings = new ByteArrayDataInput();
    private final IndexOptions indexOptions;
    private final boolean storePayloads;
    private final boolean storeOffsets;
    private Bits liveDocs;
    private int docID = -1;
    private int accum;
    private int freq;
    private int payloadLength;
    private int cost;

    public PulsingDocsEnum(FieldInfo fieldInfo) {
      indexOptions = fieldInfo.getIndexOptions();
      storePayloads = fieldInfo.hasPayloads();
      storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }

    public PulsingDocsEnum reset(Bits liveDocs, PulsingTermState termState) {
      //System.out.println("PR docsEnum termState=" + termState + " docFreq=" + termState.docFreq);
      assert termState.postingsSize != -1;

      // Must make a copy of termState's byte[] so that if
      // app does TermsEnum.next(), this DocsEnum is not affected
      if (postingsBytes == null) {
        postingsBytes = new byte[termState.postingsSize];
      } else if (postingsBytes.length < termState.postingsSize) {
        postingsBytes = ArrayUtil.grow(postingsBytes, termState.postingsSize);
      }
      System.arraycopy(termState.postings, 0, postingsBytes, 0, termState.postingsSize);
      postings.reset(postingsBytes, 0, termState.postingsSize);
      docID = -1;
      accum = 0;
      freq = 1;
      cost = termState.docFreq;
      payloadLength = 0;
      this.liveDocs = liveDocs;
      return this;
    }

    boolean canReuse(FieldInfo fieldInfo) {
      return indexOptions == fieldInfo.getIndexOptions() && storePayloads == fieldInfo.hasPayloads();
    }

    @Override
    public int nextDoc() throws IOException {
      //System.out.println("PR nextDoc this= "+ this);
      while(true) {
        if (postings.eof()) {
          //System.out.println("PR   END");
          return docID = NO_MORE_DOCS;
        }

        final int code = postings.readVInt();
        //System.out.println("  read code=" + code);
        if (indexOptions == IndexOptions.DOCS_ONLY) {
          accum += code;
        } else {
          accum += code >>> 1;            // shift off low bit
          if ((code & 1) != 0) {          // if low bit is set
            freq = 1;                     // freq is one
          } else {
            freq = postings.readVInt();   // else read freq
          }

          if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
            // Skip positions
            if (storePayloads) {
              for(int pos=0;pos<freq;pos++) {
                final int posCode = postings.readVInt();
                if ((posCode & 1) != 0) {
                  payloadLength = postings.readVInt();
                }
                if (storeOffsets && (postings.readVInt() & 1) != 0) {
                  // new offset length
                  postings.readVInt();
                }
                if (payloadLength != 0) {
                  postings.skipBytes(payloadLength);
                }
              }
            } else {
              for(int pos=0;pos<freq;pos++) {
                // TODO: skipVInt
                postings.readVInt();
                if (storeOffsets && (postings.readVInt() & 1) != 0) {
                  // new offset length
                  postings.readVInt();
                }
              }
            }
          }
        }

        if (liveDocs == null || liveDocs.get(accum)) {
          return (docID = accum);
        }
      }
    }

    @Override
    public int freq() throws IOException {
      return freq;
    }

    @Override
    public int docID() {
      return docID;
    }

    @Override
    public int advance(int target) throws IOException {
      return docID = slowAdvance(target);
    }

    @Override
    public long cost() {
      return cost;
    }
  }

  private static class PulsingDocsAndPositionsEnum extends DocsAndPositionsEnum {
    private byte[] postingsBytes;
    private final ByteArrayDataInput postings = new ByteArrayDataInput();
    private final boolean storePayloads;
    private final boolean storeOffsets;
    // note: we could actually reuse across different options, if we passed this to reset()
    // and re-init'ed storeOffsets accordingly (made it non-final)
    private final IndexOptions indexOptions;

    private Bits liveDocs;
    private int docID = -1;
    private int accum;
    private int freq;
    private int posPending;
    private int position;
    private int payloadLength;
    private BytesRefBuilder payload;
    private int startOffset;
    private int offsetLength;

    private boolean payloadRetrieved;
    private int cost;

    public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo) {
      indexOptions = fieldInfo.getIndexOptions();
      storePayloads = fieldInfo.hasPayloads();
      storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }

    boolean canReuse(FieldInfo fieldInfo) {
      return indexOptions == fieldInfo.getIndexOptions() && storePayloads == fieldInfo.hasPayloads();
    }

    public PulsingDocsAndPositionsEnum reset(Bits liveDocs, PulsingTermState termState) {
      assert termState.postingsSize != -1;
      if (postingsBytes == null) {
        postingsBytes = new byte[termState.postingsSize];
      } else if (postingsBytes.length < termState.postingsSize) {
        postingsBytes = ArrayUtil.grow(postingsBytes, termState.postingsSize);
      }
      System.arraycopy(termState.postings, 0, postingsBytes, 0, termState.postingsSize);
      postings.reset(postingsBytes, 0, termState.postingsSize);
      this.liveDocs = liveDocs;
      payloadLength = 0;
      posPending = 0;
      docID = -1;
      accum = 0;
      cost = termState.docFreq;
      startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
      offsetLength = 0;
      //System.out.println("PR d&p reset storesPayloads=" + storePayloads + " bytes=" + bytes.length + " this=" + this);
      return this;
    }

    @Override
    public int nextDoc() throws IOException {
      //System.out.println("PR d&p nextDoc this=" + this);

      while(true) {
        //System.out.println("  cycle skip posPending=" + posPending);

        skipPositions();

        if (postings.eof()) {
          //System.out.println("PR   END");
          return docID = NO_MORE_DOCS;
        }

        final int code = postings.readVInt();
        accum += code >>> 1;            // shift off low bit
        if ((code & 1) != 0) {          // if low bit is set
          freq = 1;                     // freq is one
        } else {
          freq = postings.readVInt();   // else read freq
        }
        posPending = freq;
        startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored

        if (liveDocs == null || liveDocs.get(accum)) {
          //System.out.println("  return docID=" + docID + " freq=" + freq);
          position = 0;
          return (docID = accum);
        }
      }
    }

    @Override
    public int freq() throws IOException {
      return freq;
    }

    @Override
    public int docID() {
      return docID;
    }

    @Override
    public int advance(int target) throws IOException {
      return docID = slowAdvance(target);
    }

    @Override
    public int nextPosition() throws IOException {
      //System.out.println("PR d&p nextPosition posPending=" + posPending + " vs freq=" + freq);

      assert posPending > 0;
      posPending--;

      if (storePayloads) {
        if (!payloadRetrieved) {
          //System.out.println("PR     skip payload=" + payloadLength);
          postings.skipBytes(payloadLength);
        }
        final int code = postings.readVInt();
        //System.out.println("PR     code=" + code);
        if ((code & 1) != 0) {
          payloadLength = postings.readVInt();
          //System.out.println("PR     new payload len=" + payloadLength);
        }
        position += code >>> 1;
        payloadRetrieved = false;
      } else {
        position += postings.readVInt();
      }

      if (storeOffsets) {
        int offsetCode = postings.readVInt();
        if ((offsetCode & 1) != 0) {
          // new offset length
          offsetLength = postings.readVInt();
        }
        startOffset += offsetCode >>> 1;
      }

      //System.out.println("PR d&p nextPos return pos=" + position + " this=" + this);
      return position;
    }

    @Override
    public int startOffset() {
      return startOffset;
    }

    @Override
    public int endOffset() {
      return startOffset + offsetLength;
    }

    private void skipPositions() throws IOException {
      while(posPending != 0) {
        nextPosition();
      }
      if (storePayloads && !payloadRetrieved) {
        //System.out.println("  skip payload len=" + payloadLength);
        postings.skipBytes(payloadLength);
        payloadRetrieved = true;
      }
    }

    @Override
    public BytesRef getPayload() throws IOException {
      //System.out.println("PR  getPayload payloadLength=" + payloadLength + " this=" + this);
      if (payloadRetrieved) {
        return payload.get();
      } else if (storePayloads && payloadLength > 0) {
        payloadRetrieved = true;
        if (payload == null) {
          payload = new BytesRefBuilder();
        }
        payload.grow(payloadLength);
        postings.readBytes(payload.bytes(), 0, payloadLength);
        payload.setLength(payloadLength);
        return payload.get();
      } else {
        return null;
      }
    }

    @Override
    public long cost() {
      return cost;
    }
  }

  @Override
  public void close() throws IOException {
    wrappedPostingsReader.close();
  }

  /** for a docsenum, gets the 'other' reused enum.
   * Example: Pulsing(Standard).
   * when doing a term range query you are switching back and forth
   * between Pulsing and Standard
   *
   * The way the reuse works is that Pulsing.other = Standard and
   * Standard.other = Pulsing.
   */
  private DocsEnum getOther(DocsEnum de) {
    if (de == null) {
      return null;
    } else {
      final AttributeSource atts = de.attributes();
      return atts.addAttribute(PulsingEnumAttribute.class).enums().get(this);
    }
  }

  /**
   * for a docsenum, sets the 'other' reused enum.
   * see getOther for an example.
   */
  private DocsEnum setOther(DocsEnum de, DocsEnum other) {
    final AttributeSource atts = de.attributes();
    return atts.addAttribute(PulsingEnumAttribute.class).enums().put(this, other);
  }

  /**
   * A per-docsenum attribute that stores additional reuse information
   * so that pulsing enums can keep a reference to their wrapped enums,
   * and vice versa. this way we can always reuse.
   *
   * @lucene.internal */
  public static interface PulsingEnumAttribute extends Attribute {
    public Map<PulsingPostingsReader,DocsEnum> enums();
  }

  /**
   * Implementation of {@link PulsingEnumAttribute} for reuse of
   * wrapped postings readers underneath pulsing.
   *
   * @lucene.internal */
  public static final class PulsingEnumAttributeImpl extends AttributeImpl implements PulsingEnumAttribute {
    // we could store 'other', but what if someone 'chained' multiple postings readers,
    // this could cause problems?
    // TODO: we should consider nuking this map and just making it so if you do this,
    // you don't reuse? and maybe pulsingPostingsReader should throw an exc if it wraps
    // another pulsing, because this is just stupid and wasteful.
    // we still have to be careful in case someone does Pulsing(Stomping(Pulsing(...
    private final Map<PulsingPostingsReader,DocsEnum> enums =
      new IdentityHashMap<>();

    @Override
    public Map<PulsingPostingsReader,DocsEnum> enums() {
      return enums;
    }

    @Override
    public void clear() {
      // our state is per-docsenum, so this makes no sense.
      // its best not to clear, in case a wrapped enum has a per-doc attribute or something
      // and is calling clearAttributes(), so they don't nuke the reuse information!
    }

    @Override
    public void copyTo(AttributeImpl target) {
      // this makes no sense for us, because our state is per-docsenum.
      // we don't want to copy any stuff over to another docsenum ever!
    }
  }

  @Override
  public long ramBytesUsed() {
    return ((wrappedPostingsReader!=null) ? wrappedPostingsReader.ramBytesUsed(): 0);
  }

  @Override
  public void checkIntegrity() throws IOException {
    wrappedPostingsReader.checkIntegrity();
  }
}
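The getOther/setOther pair above keeps a two-way link between a pulsing enum and its wrapped enum, stored in a per-enum attribute keyed by reader instance so that chained pulsing readers do not collide. A conceptual, self-contained restatement of that linkage using plain objects; ReusableEnum is a stand-in for Lucene's DocsEnum, not a real API:

import java.util.IdentityHashMap;
import java.util.Map;

public class TwoWayReuse {
  static class ReusableEnum {
    // Mirrors PulsingEnumAttribute: one "other" slot per reader instance.
    final Map<Object, ReusableEnum> others = new IdentityHashMap<>();
  }

  public static void main(String[] args) {
    Object reader = new Object();           // stands in for the PulsingPostingsReader key
    ReusableEnum pulsed = new ReusableEnum();
    ReusableEnum wrapped = new ReusableEnum();

    pulsed.others.put(reader, wrapped);     // setOther(pulsed, wrapped)
    wrapped.others.put(reader, pulsed);     // setOther(wrapped, pulsed)

    // A term range query alternating between pulsed and non-pulsed terms
    // can now revive either enum without allocating a new one:
    System.out.println(pulsed.others.get(reader) == wrapped);  // true
    System.out.println(wrapped.others.get(reader) == pulsed);  // true
  }
}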
Deleted: org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
@@ -1,378 +0,0 @@
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;

// TODO: we now inline based on total TF of the term,
// but it might be better to inline by "net bytes used"
// so that a term that has only 1 posting but a huge
// payload would not be inlined.  Though this is
// presumably rare in practice...

/**
 * Writer for the pulsing format.
 * <p>
 * Wraps another postings implementation and decides
 * (based on total number of occurrences), whether a terms
 * postings should be inlined into the term dictionary,
 * or passed through to the wrapped writer.
 *
 * @lucene.experimental */
public final class PulsingPostingsWriter extends PostingsWriterBase {

  final static String CODEC = "PulsedPostingsWriter";

  // recording field summary
  final static String SUMMARY_EXTENSION = "smy";

  // To add a new version, increment from the last one, and
  // change VERSION_CURRENT to point to your new version:
  final static int VERSION_START = 0;

  final static int VERSION_META_ARRAY = 1;

  final static int VERSION_CURRENT = VERSION_META_ARRAY;

  private SegmentWriteState segmentState;

  private List<FieldMetaData> fields;

  // Reused by writeTerm:
  private DocsEnum docsEnum;
  private DocsAndPositionsEnum posEnum;
  private int enumFlags;

  private final RAMOutputStream buffer = new RAMOutputStream();

  private IndexOptions indexOptions;

  // information for wrapped PF, in current field
  private int longsSize;
  private long[] longs;
  private boolean fieldHasFreqs;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;
  boolean absolute;

  private static class PulsingTermState extends BlockTermState {
    private byte[] bytes;
    private BlockTermState wrappedState;

    @Override
    public String toString() {
      if (bytes != null) {
        return "inlined";
      } else {
        return "not inlined wrapped=" + wrappedState;
      }
    }
  }

  private static final class FieldMetaData {
    int fieldNumber;
    int longsSize;
    FieldMetaData(int number, int size) {
      fieldNumber = number;
      longsSize = size;
    }
  }

  // TODO: -- lazy init this?  ie, if every single term
  // was inlined (eg for a "primary key" field) then we
  // never need to use this fallback?  Fallback writer for
  // non-inlined terms:
  final PostingsWriterBase wrappedPostingsWriter;

  final int maxPositions;

  /** If the total number of positions (summed across all docs
   *  for this term) is <= maxPositions, then the postings are
   *  inlined into terms dict */
  public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
    fields = new ArrayList<>();
    this.maxPositions = maxPositions;
    // We simply wrap another postings writer, but only call
    // on it when tot positions is >= the cutoff:
    this.wrappedPostingsWriter = wrappedPostingsWriter;
    this.segmentState = state;
  }

  @Override
  public void init(IndexOutput termsOut) throws IOException {
    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.writeVInt(maxPositions); // encode maxPositions in header
    wrappedPostingsWriter.init(termsOut);
  }

  @Override
  public BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {

    // First pass: figure out whether we should pulse this term
    long posCount = 0;

    if (fieldHasPositions == false) {
      // No positions:
      docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
      assert docsEnum != null;
      while (posCount <= maxPositions) {
        if (docsEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
          break;
        }
        posCount++;
      }
    } else {
      posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
      assert posEnum != null;
      while (posCount <= maxPositions) {
        if (posEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
          break;
        }
        posCount += posEnum.freq();
      }
    }

    if (posCount == 0) {
      // All docs were deleted
      return null;
    }

    // Second pass: write postings
    if (posCount > maxPositions) {
      // Too many positions; do not pulse.  Just let
      // wrapped postingsWriter encode the postings:

      PulsingTermState state = new PulsingTermState();
      state.wrappedState = wrappedPostingsWriter.writeTerm(term, termsEnum, docsSeen);
      state.docFreq = state.wrappedState.docFreq;
      state.totalTermFreq = state.wrappedState.totalTermFreq;
      return state;
    } else {
      // Pulsed:
      if (fieldHasPositions == false) {
        docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
      } else {
        posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
        docsEnum = posEnum;
      }
      assert docsEnum != null;

      // There were few enough total occurrences for this
      // term, so we fully inline our postings data into
      // terms dict, now:

      // TODO: it'd be better to share this encoding logic
      // in some inner codec that knows how to write a
      // single doc / single position, etc.  This way if a
      // given codec wants to store other interesting
      // stuff, it could use this pulsing codec to do so

      int lastDocID = 0;
      int lastPayloadLength = -1;
      int lastOffsetLength = -1;

      int docFreq = 0;
      long totalTermFreq = 0;
      while (true) {
        int docID = docsEnum.nextDoc();
        if (docID == DocsEnum.NO_MORE_DOCS) {
          break;
        }
        docsSeen.set(docID);

        int delta = docID - lastDocID;
        lastDocID = docID;

        docFreq++;

        if (fieldHasFreqs) {
          int freq = docsEnum.freq();
          totalTermFreq += freq;

          if (freq == 1) {
            buffer.writeVInt((delta << 1) | 1);
          } else {
            buffer.writeVInt(delta << 1);
            buffer.writeVInt(freq);
          }

          if (fieldHasPositions) {
            int lastPos = 0;
            int lastOffset = 0;
            for(int posIDX=0;posIDX<freq;posIDX++) {
              int pos = posEnum.nextPosition();
              int posDelta = pos - lastPos;
              lastPos = pos;
              int payloadLength;
              BytesRef payload;
              if (fieldHasPayloads) {
                payload = posEnum.getPayload();
                payloadLength = payload == null ? 0 : payload.length;
                if (payloadLength != lastPayloadLength) {
                  buffer.writeVInt((posDelta << 1)|1);
                  buffer.writeVInt(payloadLength);
                  lastPayloadLength = payloadLength;
                } else {
                  buffer.writeVInt(posDelta << 1);
                }
              } else {
                payloadLength = 0;
                payload = null;
                buffer.writeVInt(posDelta);
              }

              if (fieldHasOffsets) {
                int startOffset = posEnum.startOffset();
                int endOffset = posEnum.endOffset();
                int offsetDelta = startOffset - lastOffset;
                int offsetLength = endOffset - startOffset;
                if (offsetLength != lastOffsetLength) {
                  buffer.writeVInt(offsetDelta << 1 | 1);
                  buffer.writeVInt(offsetLength);
                } else {
                  buffer.writeVInt(offsetDelta << 1);
                }
                lastOffset = startOffset;
                lastOffsetLength = offsetLength;
              }

              if (payloadLength > 0) {
                assert fieldHasPayloads;
                assert payload != null;
                buffer.writeBytes(payload.bytes, payload.offset, payload.length);
              }
            }
          }
        } else {
          buffer.writeVInt(delta);
        }
      }

      PulsingTermState state = new PulsingTermState();
      state.bytes = new byte[(int) buffer.getFilePointer()];
      state.docFreq = docFreq;
      state.totalTermFreq = fieldHasFreqs ? totalTermFreq : -1;
      buffer.writeTo(state.bytes, 0);
      buffer.reset();
      return state;
    }
  }

  // TODO: -- should we NOT reuse across fields?  would
  // be cleaner

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public int setField(FieldInfo fieldInfo) {
    this.indexOptions = fieldInfo.getIndexOptions();
    //if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
    fieldHasPayloads = fieldInfo.hasPayloads();
    absolute = false;
    longsSize = wrappedPostingsWriter.setField(fieldInfo);
    longs = new long[longsSize];
    fields.add(new FieldMetaData(fieldInfo.number, longsSize));

    fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

    if (fieldHasFreqs == false) {
      enumFlags = 0;
    } else if (fieldHasPositions == false) {
      enumFlags = DocsEnum.FLAG_FREQS;
    } else if (fieldHasOffsets == false) {
      if (fieldHasPayloads) {
        enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
      } else {
        enumFlags = 0;
      }
    } else {
      if (fieldHasPayloads) {
        enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
      } else {
        enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
      }
    }
    return 0;
    //DEBUG = BlockTreeTermsWriter.DEBUG;
  }

  @Override
  public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
    PulsingTermState state = (PulsingTermState)_state;
    assert empty.length == 0;
    this.absolute = this.absolute || absolute;
    if (state.bytes == null) {
      wrappedPostingsWriter.encodeTerm(longs, buffer, fieldInfo, state.wrappedState, this.absolute);
      for (int i = 0; i < longsSize; i++) {
        out.writeVLong(longs[i]);
      }
      buffer.writeTo(out);
      buffer.reset();
      this.absolute = false;
    } else {
      out.writeVInt(state.bytes.length);
      out.writeBytes(state.bytes, 0, state.bytes.length);
      this.absolute = this.absolute || absolute;
    }
  }

  @Override
  public void close() throws IOException {
    wrappedPostingsWriter.close();
    if (wrappedPostingsWriter instanceof PulsingPostingsWriter ||
        VERSION_CURRENT < VERSION_META_ARRAY) {
      return;
    }
    String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, SUMMARY_EXTENSION);
    IndexOutput out = null;
    try {
      out = segmentState.directory.createOutput(summaryFileName, segmentState.context);
      CodecUtil.writeHeader(out, CODEC, VERSION_CURRENT);
      out.writeVInt(fields.size());
      for (FieldMetaData field : fields) {
        out.writeVInt(field.fieldNumber);
        out.writeVInt(field.longsSize);
      }
      out.close();
    } finally {
      IOUtils.closeWhileHandlingException(out);
    }
  }
}
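To make the inlined encoding in writeTerm concrete, here is a runnable walk-through of the docs-and-freqs case (no positions) for a hypothetical term that appears in doc 5 with freq 1 and doc 9 with freq 3; writeVInt reimplements the standard 7-bit VInt so the example stands alone:

import java.io.ByteArrayOutputStream;

public class PulsingEncodingDemo {
  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {          // standard VInt: 7 data bits, high bit = continuation
      out.write((i & 0x7F) | 0x80);
      i >>>= 7;
    }
    out.write(i);
  }

  public static void main(String[] args) {
    int[] docs  = {5, 9};
    int[] freqs = {1, 3};
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    int lastDocID = 0;
    for (int i = 0; i < docs.length; i++) {
      int delta = docs[i] - lastDocID;           // doc IDs are delta-coded
      lastDocID = docs[i];
      if (freqs[i] == 1) {
        writeVInt(buffer, (delta << 1) | 1);     // low bit set: freq==1, no freq VInt follows
      } else {
        writeVInt(buffer, delta << 1);           // low bit clear: a freq VInt follows
        writeVInt(buffer, freqs[i]);
      }
    }
    // Prints "0x0B 0x08 0x03": three bytes for the whole posting list,
    // small enough to live inline in the terms dictionary.
    for (byte b : buffer.toByteArray()) {
      System.out.printf("0x%02X ", b);
    }
  }
}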
Deleted: org/apache/lucene/codecs/pulsing/package.html
@@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Pulsing Codec: inlines low frequency terms' postings into terms dictionary.
</body>
</html>
Modified: META-INF/services/org.apache.lucene.codecs.PostingsFormat
@@ -17,9 +17,6 @@ org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat
 org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
 org.apache.lucene.codecs.memory.DirectPostingsFormat
 org.apache.lucene.codecs.memory.FSTOrdPostingsFormat
-org.apache.lucene.codecs.memory.FSTOrdPulsing41PostingsFormat
 org.apache.lucene.codecs.memory.FSTPostingsFormat
-org.apache.lucene.codecs.memory.FSTPulsing41PostingsFormat
 org.apache.lucene.codecs.memory.MemoryPostingsFormat
-org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat
 org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
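Because the three pulsing entries above are deregistered from SPI, a by-name lookup through Lucene's real PostingsFormat.forName API stops resolving them after this commit; a sketch of the call that would now throw IllegalArgumentException instead of returning the format:

import org.apache.lucene.codecs.PostingsFormat;

public class PulsingLookup {
  public static void main(String[] args) {
    // Resolved via META-INF/services before this commit; throws afterwards:
    PostingsFormat pulsing = PostingsFormat.forName("Pulsing41");
    System.out.println(pulsing);
  }
}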
Deleted: org/apache/lucene/codecs/memory/TestFSTOrdPulsing41PostingsFormat.java
@@ -1,34 +0,0 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.util.TestUtil;

/**
 * Tests FSTOrdPulsing41PostingsFormat
 */
public class TestFSTOrdPulsing41PostingsFormat extends BasePostingsFormatTestCase {
  private final Codec codec = TestUtil.alwaysPostingsFormat(new FSTOrdPulsing41PostingsFormat());

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
@ -1,34 +0,0 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.util.TestUtil;

/**
 * Tests FSTPulsing41PostingsFormat
 */
public class TestFSTPulsing41PostingsFormat extends BasePostingsFormatTestCase {
  private final Codec codec = TestUtil.alwaysPostingsFormat(new FSTPulsing41PostingsFormat());

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
@ -1,156 +0,0 @@
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Locale;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

/**
 * Pulses 10k terms/docs,
 * originally designed to find JRE bugs (https://issues.apache.org/jira/browse/LUCENE-3335)
 *
 * @lucene.experimental
 */
@LuceneTestCase.Nightly
public class Test10KPulsings extends LuceneTestCase {
  public void test10kPulsed() throws Exception {
    // we always run this test with pulsing codec.
    Codec cp = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));

    File f = createTempDir("10kpulsed");
    BaseDirectoryWrapper dir = newFSDirectory(f);
    dir.setCheckIndexOnClose(false); // we do this ourselves explicitly
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));

    Document document = new Document();
    FieldType ft = new FieldType(TextField.TYPE_STORED);

    switch(TestUtil.nextInt(random(), 0, 2)) {
      case 0: ft.setIndexOptions(IndexOptions.DOCS_ONLY); break;
      case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
      default: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
    }

    Field field = newField("field", "", ft);
    document.add(field);

    NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

    for (int i = 0; i < 10050; i++) {
      field.setStringValue(df.format(i));
      iw.addDocument(document);
    }

    IndexReader ir = iw.getReader();
    iw.close();

    TermsEnum te = MultiFields.getTerms(ir, "field").iterator(null);
    DocsEnum de = null;

    for (int i = 0; i < 10050; i++) {
      String expected = df.format(i);
      assertEquals(expected, te.next().utf8ToString());
      de = TestUtil.docs(random(), te, null, de, DocsEnum.FLAG_NONE);
      assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());
    }
    ir.close();

    TestUtil.checkIndex(dir);
    dir.close();
  }

  /** a variant, that uses pulsing, but uses a high TF to force pass thru to the underlying codec
   */
  public void test10kNotPulsed() throws Exception {
    // we always run this test with pulsing codec.
    int freqCutoff = TestUtil.nextInt(random(), 1, 10);
    Codec cp = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(freqCutoff));

    File f = createTempDir("10knotpulsed");
    BaseDirectoryWrapper dir = newFSDirectory(f);
    dir.setCheckIndexOnClose(false); // we do this ourselves explicitly
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));

    Document document = new Document();
    FieldType ft = new FieldType(TextField.TYPE_STORED);

    switch(TestUtil.nextInt(random(), 0, 2)) {
      case 0: ft.setIndexOptions(IndexOptions.DOCS_ONLY); break;
      case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
      default: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
    }

    Field field = newField("field", "", ft);
    document.add(field);

    NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

    final int freq = freqCutoff + 1;

    for (int i = 0; i < 10050; i++) {
      StringBuilder sb = new StringBuilder();
      for (int j = 0; j < freq; j++) {
        sb.append(df.format(i));
        sb.append(' '); // whitespace
      }
      field.setStringValue(sb.toString());
      iw.addDocument(document);
    }

    IndexReader ir = iw.getReader();
    iw.close();

    TermsEnum te = MultiFields.getTerms(ir, "field").iterator(null);
    DocsEnum de = null;

    for (int i = 0; i < 10050; i++) {
      String expected = df.format(i);
      assertEquals(expected, te.next().utf8ToString());
      de = TestUtil.docs(random(), te, null, de, DocsEnum.FLAG_NONE);
      assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());
    }
    ir.close();

    TestUtil.checkIndex(dir);
    dir.close();
  }
}
@ -1,36 +0,0 @@
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.util.TestUtil;

/**
 * Tests PulsingPostingsFormat
 */
public class TestPulsingPostingsFormat extends BasePostingsFormatTestCase {
  // TODO: randomize cutoff
  private final Codec codec = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat());

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
@ -1,122 +0,0 @@
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.IdentityHashMap;
import java.util.Map;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

/**
 * Tests that pulsing codec reuses its enums and wrapped enums
 */
public class TestPulsingReuse extends LuceneTestCase {
  // TODO: this is a basic test. this thing is complicated, add more
  public void testSophisticatedReuse() throws Exception {
    // we always run this test with pulsing codec.
    Codec cp = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));
    Document doc = new Document();
    doc.add(new TextField("foo", "a b b c c c d e f g g h i i j j k", Field.Store.NO));
    iw.addDocument(doc);
    DirectoryReader ir = iw.getReader();
    iw.close();

    AtomicReader segment = getOnlySegmentReader(ir);
    DocsEnum reuse = null;
    Map<DocsEnum,Boolean> allEnums = new IdentityHashMap<>();
    TermsEnum te = segment.terms("foo").iterator(null);
    while (te.next() != null) {
      reuse = te.docs(null, reuse, DocsEnum.FLAG_NONE);
      allEnums.put(reuse, true);
    }

    assertEquals(2, allEnums.size());

    allEnums.clear();
    DocsAndPositionsEnum posReuse = null;
    te = segment.terms("foo").iterator(null);
    while (te.next() != null) {
      posReuse = te.docsAndPositions(null, posReuse);
      allEnums.put(posReuse, true);
    }

    assertEquals(2, allEnums.size());

    ir.close();
    dir.close();
  }

  /** tests reuse with Pulsing1(Pulsing2(Standard)) */
  public void testNestedPulsing() throws Exception {
    // we always run this test with pulsing codec.
    Codec cp = TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat());
    BaseDirectoryWrapper dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));
    Document doc = new Document();
    doc.add(new TextField("foo", "a b b c c c d e f g g g h i i j j k l l m m m", Field.Store.NO));
    // note: the reuse is imperfect, here we would have 4 enums (lost reuse when we get an enum for 'm')
    // this is because we only track the 'last' enum we reused (not all).
    // but this seems 'good enough' for now.
    iw.addDocument(doc);
    DirectoryReader ir = iw.getReader();
    iw.close();

    AtomicReader segment = getOnlySegmentReader(ir);
    DocsEnum reuse = null;
    Map<DocsEnum,Boolean> allEnums = new IdentityHashMap<>();
    TermsEnum te = segment.terms("foo").iterator(null);
    while (te.next() != null) {
      reuse = te.docs(null, reuse, DocsEnum.FLAG_NONE);
      allEnums.put(reuse, true);
    }

    assertEquals(4, allEnums.size());

    allEnums.clear();
    DocsAndPositionsEnum posReuse = null;
    te = segment.terms("foo").iterator(null);
    while (te.next() != null) {
      posReuse = te.docsAndPositions(null, posReuse);
      allEnums.put(posReuse, true);
    }

    assertEquals(4, allEnums.size());

    ir.close();
    dir.close();
  }
}
@ -41,12 +41,12 @@ public class TestExternalCodecs extends LuceneTestCase {

    private final PostingsFormat ramFormat = PostingsFormat.forName("RAMOnly");
    private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
    private final PostingsFormat pulsingFormat = PostingsFormat.forName("Pulsing41");
    private final PostingsFormat memoryFormat = PostingsFormat.forName("Memory");

    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      if (field.equals("field2") || field.equals("id")) {
        return pulsingFormat;
        return memoryFormat;
      } else if (field.equals("field1")) {
        return defaultFormat;
      } else {

@ -76,8 +76,8 @@ public class TestExternalCodecs extends LuceneTestCase {
    Document doc = new Document();
    // uses default codec:
    doc.add(newTextField("field1", "this field uses the standard codec as the test", Field.Store.NO));
    // uses pulsing codec:
    Field field2 = newTextField("field2", "this field uses the pulsing codec as the test", Field.Store.NO);
    // uses memory codec:
    Field field2 = newTextField("field2", "this field uses the memory codec as the test", Field.Store.NO);
    doc.add(field2);

    Field idField = newStringField("id", "", Field.Store.NO);

@ -100,7 +100,7 @@ public class TestExternalCodecs extends LuceneTestCase {
    assertEquals(NUM_DOCS-1, r.numDocs());
    IndexSearcher s = newSearcher(r);
    assertEquals(NUM_DOCS-1, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits);
    assertEquals(NUM_DOCS-1, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits);
    assertEquals(NUM_DOCS-1, s.search(new TermQuery(new Term("field2", "memory")), 1).totalHits);
    r.close();

    if (VERBOSE) {

@ -120,7 +120,7 @@ public class TestExternalCodecs extends LuceneTestCase {
    assertEquals(NUM_DOCS-2, r.numDocs());
    s = newSearcher(r);
    assertEquals(NUM_DOCS-2, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits);
    assertEquals(NUM_DOCS-2, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits);
    assertEquals(NUM_DOCS-2, s.search(new TermQuery(new Term("field2", "memory")), 1).totalHits);
    assertEquals(1, s.search(new TermQuery(new Term("id", "76")), 1).totalHits);
    assertEquals(0, s.search(new TermQuery(new Term("id", "77")), 1).totalHits);
    assertEquals(0, s.search(new TermQuery(new Term("id", "44")), 1).totalHits);
@ -28,7 +28,6 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat;
import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds;
import org.apache.lucene.codecs.memory.FSTOrdPostingsFormat;
import org.apache.lucene.codecs.memory.FSTOrdPulsing41PostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;

@ -131,8 +130,6 @@ public class TestLucene410DocValuesFormat extends BaseCompressingDocValuesFormat
        // TODO: these don't actually support ords!
        //case 2: pf = new FSTOrdPostingsFormat();
        //  break;
        //case 3: pf = new FSTOrdPulsing41PostingsFormat();
        //  break;
        default: throw new AssertionError();
      }
      final DocValuesFormat dv = new Lucene410DocValuesFormat();
@ -23,8 +23,8 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
import org.apache.lucene.codecs.lucene410.Lucene410Codec;
import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

@ -272,9 +272,9 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("id".equals(field)) {
          return new Pulsing41PostingsFormat(1);
          return new MemoryPostingsFormat();
        } else if ("date".equals(field)) {
          return new Pulsing41PostingsFormat(1);
          return new MemoryPostingsFormat();
        } else {
          return super.getPostingsFormatForField(field);
        }

@ -288,9 +288,9 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("id".equals(field)) {
          return new Pulsing41PostingsFormat(1);
          return new Lucene41VarGapFixedInterval(1);
        } else if ("date".equals(field)) {
          return new Pulsing41PostingsFormat(2);
          return new Lucene41VarGapFixedInterval(2);
        } else {
          return super.getPostingsFormatForField(field);
        }
@ -29,7 +29,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene410.Lucene410Codec;
import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;

@ -1149,7 +1149,7 @@ public class TestAddIndexes extends LuceneTestCase {
    {
      Directory dir = newDirectory();
      IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
      conf.setCodec(TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1 + random().nextInt(20))));
      conf.setCodec(TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
      IndexWriter w = new IndexWriter(dir, conf);
      try {
        w.addIndexes(toAdd);
@ -34,9 +34,6 @@ public class TestForTooMuchCloning extends LuceneTestCase {
  // Make sure we don't clone IndexInputs too frequently
  // during merging:
  public void test() throws Exception {
    // NOTE: if we see a fail on this test with "NestedPulsing" its because its
    // reuse isnt perfect (but reasonable). see TestPulsingReuse.testNestedPulsing
    // for more details
    final MockDirectoryWrapper dir = newMockDirectory();
    final TieredMergePolicy tmp = new TieredMergePolicy();
    tmp.setMaxMergeAtOnce(2);
@ -279,7 +279,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
   * @param openMode see {@link OpenMode}
   */
  protected IndexWriterConfig createIndexWriterConfig(OpenMode openMode) {
    // TODO: should we use a more optimized Codec, e.g. Pulsing (or write custom)?
    // TODO: should we use a more optimized Codec?
    // The taxonomy has a unique structure, where each term is associated with one document

    // Make sure we use a MergePolicy which always merges adjacent segments and thus
@ -44,8 +44,6 @@ import org.apache.lucene.codecs.memory.FSTOrdTermsReader;
import org.apache.lucene.codecs.memory.FSTOrdTermsWriter;
import org.apache.lucene.codecs.memory.FSTTermsReader;
import org.apache.lucene.codecs.memory.FSTTermsWriter;
import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;

@ -121,14 +119,6 @@ public final class MockRandomPostingsFormat extends PostingsFormat {

    PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state, skipInterval);

    if (random.nextBoolean()) {
      final int totTFCutoff = TestUtil.nextInt(random, 1, 20);
      if (LuceneTestCase.VERBOSE) {
        System.out.println("MockRandomCodec: writing pulsing postings with totTFCutoff=" + totTFCutoff);
      }
      postingsWriter = new PulsingPostingsWriter(state, totTFCutoff, postingsWriter);
    }

    final FieldsConsumer fields;
    final int t1 = random.nextInt(5);

@ -292,14 +282,6 @@ public final class MockRandomPostingsFormat extends PostingsFormat {

    PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);

    if (random.nextBoolean()) {
      final int totTFCutoff = TestUtil.nextInt(random, 1, 20);
      if (LuceneTestCase.VERBOSE) {
        System.out.println("MockRandomCodec: reading pulsing postings with totTFCutoff=" + totTFCutoff);
      }
      postingsReader = new PulsingPostingsReader(state, postingsReader);
    }

    final FieldsProducer fields;
    final int t1 = random.nextInt(5);
    if (t1 == 0) {
@ -1,96 +0,0 @@
package org.apache.lucene.codecs.nestedpulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/**
 * Pulsing(1, Pulsing(2, Lucene41))
 * @lucene.experimental
 */
// TODO: if we create PulsingPostingsBaseFormat then we
// can simplify this? note: I don't like the *BaseFormat
// hierarchy, maybe we can clean that up...
public final class NestedPulsingPostingsFormat extends PostingsFormat {
  public NestedPulsingPostingsFormat() {
    super("NestedPulsing");
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docsWriter = null;
    PostingsWriterBase pulsingWriterInner = null;
    PostingsWriterBase pulsingWriter = null;

    // Terms dict
    boolean success = false;
    try {
      docsWriter = new Lucene41PostingsWriter(state);

      pulsingWriterInner = new PulsingPostingsWriter(state, 2, docsWriter);
      pulsingWriter = new PulsingPostingsWriter(state, 1, pulsingWriterInner);
      FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter,
          BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriterInner, pulsingWriter);
      }
    }
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase docsReader = null;
    PostingsReaderBase pulsingReaderInner = null;
    PostingsReaderBase pulsingReader = null;
    boolean success = false;
    try {
      docsReader = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
      pulsingReaderInner = new PulsingPostingsReader(state, docsReader);
      pulsingReader = new PulsingPostingsReader(state, pulsingReaderInner);
      FieldsProducer ret = new BlockTreeTermsReader(
          state.directory, state.fieldInfos, state.segmentInfo,
          pulsingReader,
          state.context,
          state.segmentSuffix);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docsReader, pulsingReaderInner, pulsingReader);
      }
    }
  }
}
@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Codec for testing that wraps {@link org.apache.lucene.codecs.pulsing.PulsingPostingsFormat} with itself.
</body>
</html>
@ -40,14 +40,10 @@ import org.apache.lucene.codecs.lucene410.Lucene410Codec;
import org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.apache.lucene.codecs.memory.FSTOrdPostingsFormat;
import org.apache.lucene.codecs.memory.FSTOrdPulsing41PostingsFormat;
import org.apache.lucene.codecs.memory.FSTPostingsFormat;
import org.apache.lucene.codecs.memory.FSTPulsing41PostingsFormat;
import org.apache.lucene.codecs.memory.MemoryDocValuesFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
import org.apache.lucene.util.LuceneTestCase;

@ -130,19 +126,13 @@ public class RandomCodec extends Lucene410Codec {
        new Lucene41PostingsFormat(minItemsPerBlock, maxItemsPerBlock),
        new FSTPostingsFormat(),
        new FSTOrdPostingsFormat(),
        new FSTPulsing41PostingsFormat(1 + random.nextInt(20)),
        new FSTOrdPulsing41PostingsFormat(1 + random.nextInt(20)),
        new DirectPostingsFormat(LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? Integer.MAX_VALUE : maxItemsPerBlock),
            LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? Integer.MAX_VALUE : lowFreqCutoff)),
        new Pulsing41PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
        // add pulsing again with (usually) different parameters
        new Pulsing41PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
        //TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucene41Postings to be constructed
        //with a choice of concrete PostingsFormats. Maybe useful to have a generic means of marking and dealing
        //with such "wrapper" classes?
        new TestBloomFilteredLucene41Postings(),
        new MockRandomPostingsFormat(random),
        new NestedPulsingPostingsFormat(),
        new Lucene41WithOrds(TestUtil.nextInt(random, 1, 1000)),
        new Lucene41VarGapFixedInterval(TestUtil.nextInt(random, 1, 1000)),
        new Lucene41VarGapDocFreqInterval(TestUtil.nextInt(random, 1, 100), TestUtil.nextInt(random, 1, 1000)),
@ -14,7 +14,6 @@
# limitations under the License.

org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat
org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat
org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat
org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds
org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval
@ -36,9 +36,9 @@ public class SchemaCodecFactory extends CodecFactory implements SolrCoreAware {
  private volatile SolrCore core;

  // TODO: we need to change how solr does this?
  // rather than a string like "Pulsing" you need to be able to pass parameters
  // rather than a string like "Direct" you need to be able to pass parameters
  // and everything to a field in the schema, e.g. we should provide factories for
  // the Lucene's core formats (Memory, Pulsing, ...) and such.
  // the Lucene's core formats (Memory, Direct, ...) and such.
  //
  // So I think a FieldType should return PostingsFormat, not a String.
  // how it constructs this from the XML... i don't care.
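(Editor's note: the TODO above describes a direction, not an API that exists in this commit. Purely as a hypothetical sketch of "a FieldType should return PostingsFormat, not a String": the ParameterizedFieldType and DirectFieldType classes below are invented for this illustration; only DirectPostingsFormat and its two-int constructor, seen elsewhere in this diff, are real.)

    import org.apache.lucene.codecs.PostingsFormat;
    import org.apache.lucene.codecs.memory.DirectPostingsFormat;

    // Hypothetical only: a schema field type that hands back a configured
    // PostingsFormat instance, so per-format parameters can come from schema
    // attributes instead of a bare format name.
    abstract class ParameterizedFieldType {
      abstract PostingsFormat postingsFormat();
    }

    class DirectFieldType extends ParameterizedFieldType {
      private final int minSkipCount;
      private final int lowFreqCutoff;

      DirectFieldType(int minSkipCount, int lowFreqCutoff) {
        // in the imagined design these values would be parsed from schema XML
        this.minSkipCount = minSkipCount;
        this.lowFreqCutoff = lowFreqCutoff;
      }

      @Override
      PostingsFormat postingsFormat() {
        return new DirectPostingsFormat(minSkipCount, lowFreqCutoff);
      }
    }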
@ -19,7 +19,7 @@
<schema name="bad-schema-codec-global-vs-ft-mismatch" version="1.0">
  <types>
    <!-- BAD: postingsFormat here but no codec that allows it -->
    <fieldType name="pulsing1" class="solr.TextField" postingsFormat="Pulsing">
    <fieldType name="direct1" class="solr.TextField" postingsFormat="Direct">
      <analyzer>
        <tokenizer class="solr.MockTokenizerFactory"/>
      </analyzer>
@ -27,10 +27,10 @@
  </types>

  <fields>
    <field name="pulsing1text" type="pulsing1" indexed="true" stored="true"/>
    <dynamicField name="*" type="pulsing1" />
    <field name="direct1text" type="direct1" indexed="true" stored="true"/>
    <dynamicField name="*" type="direct1" />
  </fields>

  <defaultSearchField>pulsing1text</defaultSearchField>
  <defaultSearchField>direct1text</defaultSearchField>

</schema>
@ -17,7 +17,7 @@
-->
<schema name="codec" version="1.2">
  <types>
    <fieldType name="string_pulsing" class="solr.StrField" postingsFormat="Pulsing41"/>
    <fieldType name="string_direct" class="solr.StrField" postingsFormat="Direct"/>
    <fieldType name="string_simpletext" class="solr.StrField" postingsFormat="SimpleText"/>
    <fieldType name="string_standard" class="solr.StrField" postingsFormat="Lucene41"/>

@ -28,7 +28,7 @@

  </types>
  <fields>
    <field name="string_pulsing_f" type="string_pulsing" indexed="true" stored="true" />
    <field name="string_direct_f" type="string_direct" indexed="true" stored="true" />
    <field name="string_simpletext_f" type="string_simpletext" indexed="true" stored="true" />
    <field name="string_standard_f" type="string_standard" indexed="true" stored="true" />

@ -38,7 +38,7 @@
    <field name="string_f" type="string" indexed="true" stored="true" docValues="true" required="true"/>

    <dynamicField name="*_simple" type="string_simpletext" indexed="true" stored="true"/>
    <dynamicField name="*_pulsing" type="string_pulsing" indexed="true" stored="true"/>
    <dynamicField name="*_direct" type="string_direct" indexed="true" stored="true"/>
    <dynamicField name="*_standard" type="string_standard" indexed="true" stored="true"/>

    <dynamicField name="*_disk" type="string_disk" indexed="false" stored="false" docValues="true" />
@ -36,9 +36,9 @@ public class TestCodecSupport extends SolrTestCaseJ4 {
  public void testPostingsFormats() {
    Codec codec = h.getCore().getCodec();
    Map<String, SchemaField> fields = h.getCore().getLatestSchema().getFields();
    SchemaField schemaField = fields.get("string_pulsing_f");
    SchemaField schemaField = fields.get("string_direct_f");
    PerFieldPostingsFormat format = (PerFieldPostingsFormat) codec.postingsFormat();
    assertEquals("Pulsing41", format.getPostingsFormatForField(schemaField.getName()).getName());
    assertEquals("Direct", format.getPostingsFormatForField(schemaField.getName()).getName());
    schemaField = fields.get("string_simpletext_f");
    assertEquals("SimpleText",
        format.getPostingsFormatForField(schemaField.getName()).getName());

@ -68,8 +68,8 @@ public class TestCodecSupport extends SolrTestCaseJ4 {

    assertEquals("SimpleText", format.getPostingsFormatForField("foo_simple").getName());
    assertEquals("SimpleText", format.getPostingsFormatForField("bar_simple").getName());
    assertEquals("Pulsing41", format.getPostingsFormatForField("foo_pulsing").getName());
    assertEquals("Pulsing41", format.getPostingsFormatForField("bar_pulsing").getName());
    assertEquals("Direct", format.getPostingsFormatForField("foo_direct").getName());
    assertEquals("Direct", format.getPostingsFormatForField("bar_direct").getName());
    assertEquals("Lucene41", format.getPostingsFormatForField("foo_standard").getName());
    assertEquals("Lucene41", format.getPostingsFormatForField("bar_standard").getName());
  }