LUCENE-7626: IndexWriter no longer accepts broken offsets

Mike McCandless 2017-01-13 17:46:02 -05:00
parent 5b3565ed7e
commit 64b86331c2
19 changed files with 480 additions and 72 deletions

View File

@ -29,6 +29,9 @@ API Changes
Bug Fixes
* LUCENE-7626: IndexWriter will no longer accept broken token offsets
(Mike McCandless)
Improvements
* LUCENE-7489: Better storage of sparse doc-values fields with the default
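Concretely, the LUCENE-7626 entry above means a token stream whose offsets go backwards now fails at addDocument time with an IllegalArgumentException instead of being silently indexed. A minimal sketch using Lucene's test framework (the field name and setup are illustrative, mirroring the test removed from BaseTestCheckIndex below):

Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
Document doc = new Document();
Field field = new TextField("foo", "", Field.Store.NO);
// second token's startOffset (1) goes backwards past the first token's (5):
field.setTokenStream(new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4)));
doc.add(field);
expectThrows(IllegalArgumentException.class, () -> iw.addDocument(doc));
iw.close();
dir.close();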

View File

@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
* A filter to correct offsets that illegally go backwards.
*
* @deprecated Fix the token filters that create broken offsets in the first place.
*/
@Deprecated
public final class FixBrokenOffsetsFilter extends TokenFilter {
private int lastStartOffset;
private int lastEndOffset;
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public FixBrokenOffsetsFilter(TokenStream in) {
super(in);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken() == false) {
return false;
}
fixOffsets();
return true;
}
@Override
public void end() throws IOException {
super.end();
fixOffsets();
}
@Override
public void reset() throws IOException {
super.reset();
lastStartOffset = 0;
lastEndOffset = 0;
}
private void fixOffsets() {
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (startOffset < lastStartOffset) {
startOffset = lastStartOffset;
}
if (endOffset < startOffset) {
endOffset = startOffset;
}
offsetAtt.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
lastEndOffset = endOffset;
}
}
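A minimal usage sketch: the filter is meant to sit at the end of whichever analysis chain produces the broken offsets. SomeLegacyFilter is a placeholder, not a real Lucene class, and the tokenizer choice is arbitrary:

Analyzer analyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // SomeLegacyFilter stands in for whatever filter actually emits backwards offsets:
    TokenStream sink = new FixBrokenOffsetsFilter(new SomeLegacyFilter(source));
    return new TokenStreamComponents(source, sink);
  }
};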

View File

@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link FixBrokenOffsetsFilter}.
*/
public class FixBrokenOffsetsFilterFactory extends TokenFilterFactory {
/** Sole constructor */
public FixBrokenOffsetsFilterFactory(Map<String,String> args) {
super(args);
}
@Override
public TokenStream create(TokenStream input) {
return new FixBrokenOffsetsFilter(input);
}
}
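A sketch of wiring the factory via CustomAnalyzer; the SPI name "fixBrokenOffsets" is assumed here from TokenFilterFactory's usual naming convention (class name minus "FilterFactory", first letter lowercased):

Analyzer analyzer = CustomAnalyzer.builder()
    .withTokenizer("whitespace")
    .addTokenFilter("fixBrokenOffsets")  // keep it last so it sees the final offsets
    .build();                            // the builder methods declare IOException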

View File

@ -64,6 +64,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory

View File

@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
public class TestFixBrokenOffsetsFilter extends BaseTokenStreamTestCase {
public void testBogusTermVectors() throws IOException {
Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
Field field = new Field("foo", "", ft);
field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(
new Token("bar", 5, 10), new Token("bar", 1, 4)
)));
doc.add(field);
iw.addDocument(doc);
iw.close();
dir.close();
}
}

View File

@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.SuppressForbidden;
/**
* Command-line tool that reads from a source index and
* writes to a dest index, correcting any broken offsets
* in the process.
*
* @lucene.experimental
*/
public class FixBrokenOffsets {
public SegmentInfos infos;
FSDirectory fsDir;
Path dir;
@SuppressForbidden(reason = "System.out required: command line tool")
public static void main(String[] args) throws IOException {
if (args.length < 2) {
System.err.println("Usage: FixBrokenOffsetse <srcDir> <destDir>");
return;
}
Path srcPath = Paths.get(args[0]);
if (!Files.exists(srcPath)) {
throw new RuntimeException("srcPath " + srcPath.toAbsolutePath() + " doesn't exist");
}
Path destPath = Paths.get(args[1]);
if (Files.exists(destPath)) {
throw new RuntimeException("destPath " + destPath.toAbsolutePath() + " already exists; please remove it and re-run");
}
Directory srcDir = FSDirectory.open(srcPath);
DirectoryReader reader = DirectoryReader.open(srcDir);
List<LeafReaderContext> leaves = reader.leaves();
CodecReader[] filtered = new CodecReader[leaves.size()];
for(int i=0;i<leaves.size();i++) {
filtered[i] = SlowCodecReaderWrapper.wrap(new FilterLeafReader(leaves.get(i).reader()) {
@Override
public Fields getTermVectors(int docID) throws IOException {
Fields termVectors = in.getTermVectors(docID);
if (termVectors == null) {
return null;
}
return new FilterFields(termVectors) {
@Override
public Terms terms(String field) throws IOException {
return new FilterTerms(super.terms(field)) {
@Override
public TermsEnum iterator() throws IOException {
return new FilterTermsEnum(super.iterator()) {
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
return new FilterPostingsEnum(super.postings(reuse, flags)) {
int nextLastStartOffset = 0;
int lastStartOffset = 0;
@Override
public int nextPosition() throws IOException {
int pos = super.nextPosition();
lastStartOffset = nextLastStartOffset;
nextLastStartOffset = startOffset();
return pos;
}
@Override
public int startOffset() throws IOException {
int offset = super.startOffset();
if (offset < lastStartOffset) {
offset = lastStartOffset;
}
return offset;
}
@Override
public int endOffset() throws IOException {
int offset = super.endOffset();
if (offset < lastStartOffset) {
offset = lastStartOffset;
}
return offset;
}
};
}
};
}
};
}
};
}
});
}
Directory destDir = FSDirectory.open(destPath);
IndexWriter writer = new IndexWriter(destDir, new IndexWriterConfig());
writer.addIndexes(filtered);
IOUtils.close(writer, reader, srcDir, destDir);
}
}
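A sketch of invoking the tool; jar names and index paths are illustrative, and note that the destination directory must not already exist:

// From a shell, roughly:
//   java -cp lucene-core.jar:lucene-backward-codecs.jar \
//        org.apache.lucene.index.FixBrokenOffsets /path/to/brokenIndex /path/to/fixedIndex
// or programmatically (main declares IOException):
FixBrokenOffsets.main(new String[] { "/path/to/brokenIndex", "/path/to/fixedIndex" });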

View File

@ -0,0 +1,27 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- not a package-info.java, because we already defined this package in core/ -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>Tools for handling backwards compatibility issues with indices.</title>
</head>
<body>
Tools for handling backwards compatibility issues with indices.
</body>
</html>

View File

@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestFixBrokenOffsets extends LuceneTestCase {
// Run this in Lucene 6.x:
//
// ant test -Dtestcase=TestFixBrokenOffsets -Dtestmethod=testCreateBrokenOffsetsIndex -Dtests.codec=default -Dtests.useSecurityManager=false
/*
public void testCreateBrokenOffsetsIndex() throws IOException {
Path indexDir = Paths.get("/tmp/brokenoffsets");
Files.deleteIfExists(indexDir);
Directory dir = newFSDirectory(indexDir);
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_STORED);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
fieldType.setStoreTermVectorOffsets(true);
Field field = new Field("foo", "bar", fieldType);
field.setTokenStream(new CannedTokenStream(new Token("foo", 10, 13), new Token("foo", 7, 9)));
doc.add(field);
writer.addDocument(doc);
writer.commit();
// 2nd segment
doc = new Document();
field = new Field("foo", "bar", fieldType);
field.setTokenStream(new CannedTokenStream(new Token("bar", 15, 17), new Token("bar", 1, 5)));
doc.add(field);
writer.addDocument(doc);
writer.close();
dir.close();
}
*/
public void testFixBrokenOffsetsIndex() throws IOException {
InputStream resource = getClass().getResourceAsStream("index.630.brokenoffsets.zip");
assertNotNull("Broken offsets index not found", resource);
Path path = createTempDir("brokenoffsets");
TestUtil.unzip(resource, path);
Directory dir = FSDirectory.open(path);
// OK: index is 6.3.0 so offsets not checked:
TestUtil.checkIndex(dir);
MockDirectoryWrapper tmpDir = newMockDirectory();
tmpDir.setCheckIndexOnClose(false);
IndexWriter w = new IndexWriter(tmpDir, new IndexWriterConfig());
w.addIndexes(dir);
w.close();
// OK: addIndexes(Directory...) also keeps version as 6.3.0, so offsets not checked:
TestUtil.checkIndex(tmpDir);
tmpDir.close();
final MockDirectoryWrapper tmpDir2 = newMockDirectory();
tmpDir2.setCheckIndexOnClose(false);
w = new IndexWriter(tmpDir2, new IndexWriterConfig());
DirectoryReader reader = DirectoryReader.open(dir);
List<LeafReaderContext> leaves = reader.leaves();
CodecReader[] codecReaders = new CodecReader[leaves.size()];
for(int i=0;i<leaves.size();i++) {
codecReaders[i] = (CodecReader) leaves.get(i).reader();
}
w.addIndexes(codecReaders);
w.close();
// NOT OK: broken offsets were copied into a 7.0 segment:
ByteArrayOutputStream output = new ByteArrayOutputStream(1024);
RuntimeException re = expectThrows(RuntimeException.class, () -> {TestUtil.checkIndex(tmpDir2, false, true, output);});
assertEquals("term [66 6f 6f]: doc 0: pos 1: startOffset 7 < lastStartOffset 10; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index", re.getMessage());
tmpDir2.close();
// Now run the tool and confirm the broken offsets are fixed:
Path path2 = createTempDir("fixedbrokenoffsets").resolve("subdir");
FixBrokenOffsets.main(new String[] {path.toString(), path2.toString()});
Directory tmpDir3 = FSDirectory.open(path2);
TestUtil.checkIndex(tmpDir3);
tmpDir3.close();
dir.close();
}
}

View File

@ -740,13 +740,13 @@ public final class CheckIndex implements Closeable {
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
// Test the Term Index
segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast);
segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast, version);
// Test Stored Fields
segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
// Test Term Vectors
segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast);
segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast, version);
// Test Docvalues
segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
@ -1205,7 +1205,7 @@ public final class CheckIndex implements Closeable {
* checks Fields api is consistent with itself.
* searcher is optional, to verify with queries. Can be null.
*/
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose) throws IOException {
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, Version version) throws IOException {
// TODO: we should probably return our own stats thing...?!
long startNS;
if (doPrint) {
@ -1461,14 +1461,13 @@ public final class CheckIndex implements Closeable {
if (hasOffsets) {
int startOffset = postings.startOffset();
int endOffset = postings.endOffset();
// NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
// but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
if (!isVectors) {
// In Lucene 7 we fixed IndexWriter to also enforce term vector offsets
if (isVectors == false || version.onOrAfter(Version.LUCENE_7_0_0)) {
if (startOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
}
if (startOffset < lastOffset) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset + "; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index");
}
if (endOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
@ -1742,15 +1741,15 @@ public final class CheckIndex implements Closeable {
* Test the term index.
* @lucene.experimental
*/
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream) throws IOException {
return testPostings(reader, infoStream, false, false);
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
return testPostings(reader, infoStream, false, false, version);
}
/**
* Test the term index.
* @lucene.experimental
*/
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast) throws IOException {
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast, Version version) throws IOException {
// TODO: we should go and verify term vectors match, if
// crossCheckTermVectors is on...
@ -1765,7 +1764,7 @@ public final class CheckIndex implements Closeable {
final Fields fields = reader.getPostingsReader().getMergeInstance();
final FieldInfos fieldInfos = reader.getFieldInfos();
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose);
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, version);
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
@ -2339,15 +2338,15 @@ public final class CheckIndex implements Closeable {
* Test term vectors.
* @lucene.experimental
*/
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream) throws IOException {
return testTermVectors(reader, infoStream, false, false, false);
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
return testTermVectors(reader, infoStream, false, false, false, version);
}
/**
* Test term vectors.
* @lucene.experimental
*/
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast, Version version) throws IOException {
long startNS = System.nanoTime();
final Status.TermVectorStatus status = new Status.TermVectorStatus();
final FieldInfos fieldInfos = reader.getFieldInfos();
@ -2387,7 +2386,7 @@ public final class CheckIndex implements Closeable {
if (tfv != null) {
// First run with no deletions:
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, version);
// Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j);

View File

@ -27,6 +27,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsConsumer;
@ -728,10 +729,6 @@ final class DefaultIndexingChain extends DocConsumer {
final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;
// only bother checking offsets if something will consume them.
// TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
final boolean checkOffsets = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
/*
* To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
* when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
@ -743,6 +740,7 @@ final class DefaultIndexingChain extends DocConsumer {
stream.reset();
invertState.setAttributeSource(stream);
termsHashPerField.start(field, first);
CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
while (stream.incrementToken()) {
@ -771,15 +769,13 @@ final class DefaultIndexingChain extends DocConsumer {
invertState.numOverlap++;
}
if (checkOffsets) {
int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + field.name() + "'");
}
invertState.lastStartOffset = startOffset;
int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + field.name() + "'");
}
invertState.lastStartOffset = startOffset;
invertState.length++;
if (invertState.length < 0) {

View File

@ -42,11 +42,6 @@ public class TestCheckIndex extends BaseTestCheckIndex {
testDeletedDocs(directory);
}
@Test
public void testBogusTermVectors() throws IOException {
testBogusTermVectors(directory);
}
@Test
public void testChecksumsOnly() throws IOException {
testChecksumsOnly(directory);

View File

@ -377,7 +377,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
}
final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes, false);
new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes);
//check to see if the token streams might have non-deterministic testable result
final boolean storeTermVectorPositions = random().nextBoolean();
final int[] startOffsets = rTokenStream.getStartOffsets();

View File

@ -45,6 +45,7 @@ import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
@ -431,7 +432,9 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
@Override
public boolean incrementToken() throws IOException {
if (synNext) {
AttributeSource.State state = captureState();
clearAttributes();
restoreState(state);
posIncAtt.setPositionIncrement(0);
termAtt.append("" + ((char) (97 + random().nextInt(3))));
synNext = false;

View File

@ -200,10 +200,6 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
int i = 0;
public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
this(len, sampleTerms, sampleTermBytes, rarely());
}
public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
terms = new String[len];
termBytes = new BytesRef[len];
positionsIncrements = new int[len];
@ -216,17 +212,12 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
terms[i] = sampleTerms[o];
termBytes[i] = sampleTermBytes[o];
positionsIncrements[i] = TestUtil.nextInt(random(), i == 0 ? 1 : 0, 10);
if (offsetsGoBackwards) {
startOffsets[i] = random().nextInt();
endOffsets[i] = random().nextInt();
if (i == 0) {
startOffsets[i] = TestUtil.nextInt(random(), 0, 1 << 16);
} else {
if (i == 0) {
startOffsets[i] = TestUtil.nextInt(random(), 0, 1 << 16);
} else {
startOffsets[i] = startOffsets[i-1] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
}
endOffsets[i] = startOffsets[i] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
startOffsets[i] = startOffsets[i-1] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
}
endOffsets[i] = startOffsets[i] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
}
for (int i = 0; i < len; ++i) {

View File

@ -22,11 +22,8 @@ import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.Directory;
@ -105,22 +102,6 @@ public class BaseTestCheckIndex extends LuceneTestCase {
checker.close();
}
// LUCENE-4221: we have to let these thru, for now
public void testBogusTermVectors(Directory dir) throws IOException {
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
Field field = new Field("foo", "", ft);
field.setTokenStream(new CannedTokenStream(
new Token("bar", 5, 10), new Token("bar", 1, 4)
));
doc.add(field);
iw.addDocument(doc);
iw.close();
}
public void testChecksumsOnly(Directory dir) throws IOException {
LineFileDocs lf = new LineFileDocs(random());
MockAnalyzer analyzer = new MockAnalyzer(random());

View File

@ -334,9 +334,9 @@ public final class TestUtil {
CheckIndex.testLiveDocs(codecReader, infoStream, true);
CheckIndex.testFieldInfos(codecReader, infoStream, true);
CheckIndex.testFieldNorms(codecReader, infoStream, true);
CheckIndex.testPostings(codecReader, infoStream, false, true);
CheckIndex.testPostings(codecReader, infoStream, false, true, Version.LUCENE_7_0_0);
CheckIndex.testStoredFields(codecReader, infoStream, true);
CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true);
CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true, Version.LUCENE_7_0_0);
CheckIndex.testDocValues(codecReader, infoStream, true);
CheckIndex.testPoints(codecReader, infoStream, true);

View File

@ -27,6 +27,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
@ -284,6 +285,7 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
private byte[] binaryValue = null;
private PreAnalyzedParser parser;
private IOException readerConsumptionException;
private int lastEndOffset;
public PreAnalyzedTokenizer(PreAnalyzedParser parser) {
// we don't pack attributes, since we are used for (de)serialization and don't want bloat.
@ -311,6 +313,8 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
AttributeSource.State state = it.next();
restoreState(state.clone());
// TODO: why can't I look up the OffsetAttribute in the ctor instead?
lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
return true;
}
@ -329,6 +333,13 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
it = cachedStates.iterator();
}
@Override
public void end() throws IOException {
super.end();
// we must set the end offset correctly so multi-valued fields don't try to send offsets backwards:
addAttribute(OffsetAttribute.class).setOffset(lastEndOffset, lastEndOffset);
}
private void setReaderConsumptionException(IOException e) {
readerConsumptionException = e;
}

View File

@ -120,11 +120,6 @@ public class CheckHdfsIndexTest extends AbstractFullDistribZkTestBase {
testCheckIndex.testDeletedDocs(directory);
}
@Test
public void testBogusTermVectors() throws IOException {
testCheckIndex.testBogusTermVectors(directory);
}
@Test
public void testChecksumsOnly() throws IOException {
testCheckIndex.testChecksumsOnly(directory);